In [1]:
import great_expectations as gx
import pandas as pd
from datetime import datetime, timedelta

In [2]:
# Crear un DataFrame de ejemplo
data = {
    'patient_id': ['P001', 'P002', 'P003', 'P004', 'P005', 'P001'],
    'name': ['John Doe', 'Jane Smith', 'Bob Johnson', 'Alice Brown', 'Charlie Davis', 'John Doe'],
    'age': [35, 28, 52, 41, 39, 35],
    'gender': ['M', 'F', 'M', 'F', 'M', 'M'],
    'blood_type': ['A+', 'B-', 'O+', 'AB+', 'A-', 'A+'],
    'admission_date': ['2024-10-01', '2024-10-02', '2024-10-03', '2024-10-04', '2024-10-05', '2024-10-01'],
    'diagnosis': ['Hypertension', 'Diabetes', 'Asthma', 'Migraine', 'Arthritis', 'Hypertension'],
    'doctor_id': ['D1', 'D2', 'D3', 'D2', 'D1', 'D1']
}

In [3]:
df = pd.DataFrame(data)
df['admission_date'] = pd.to_datetime(df['admission_date'])

In [18]:
# Crear el contexto de Great Expectations
context = gx.get_context()

Crear Suite para luego agregar todas las Expectations

In [5]:
suite = context.suites.add(
    gx.core.expectation_suite.ExpectationSuite(name="expectations")
)

## 1. Exactitud (Accuracy):

* Verificamos que las edades estén entre 0 y 120 años.
* Aseguramos que los tipos de sangre sean válidos.

In [6]:
suite.add_expectation(
    gx.expectations.ExpectColumnValuesToBeBetween(column="age", min_value=0, max_value=120)
)

suite.add_expectation(
    gx.expectations.ExpectColumnValuesToBeInSet(column="blood_type", value_set=['A+', 'A-', 'B+', 'B-', 'AB+', 'AB-', 'O+', 'O-'])
)

ExpectColumnValuesToBeInSet(id='94ca4a60-0a47-452e-a8d6-19be967bce95', meta=None, notes=None, result_format=<ResultFormat.BASIC: 'BASIC'>, description=None, catch_exceptions=True, rendered_content=None, windows=None, batch_id=None, row_condition=None, condition_parser=None, column='blood_type', mostly=1.0, value_set=['A+', 'A-', 'B+', 'B-', 'AB+', 'AB-', 'O+', 'O-'])

## 2. Integridad (Completeness):

Comprobamos que no haya valores nulos en ninguna columna.

In [7]:
for column in df.columns:
    suite.add_expectation(
        gx.expectations.ExpectColumnValuesToNotBeNull(column=column)
    )

## 3.Consistencia (Consistency):

Verificamos que la combinación de patient_id y admission_date sea única.

In [8]:
suite.add_expectation(
    gx.expectations.ExpectCompoundColumnsToBeUnique(column_list=['patient_id', 'admission_date'])
)

ExpectCompoundColumnsToBeUnique(id='f1792281-63d6-4edd-9d05-d8ab8cf93e62', meta=None, notes=None, result_format=<ResultFormat.BASIC: 'BASIC'>, description=None, catch_exceptions=True, rendered_content=None, windows=None, batch_id=None, row_condition=None, condition_parser=None, column_list=['patient_id', 'admission_date'], mostly=1.0, ignore_row_if='all_values_are_missing')

## 4.Actualidad (Timeliness):

Aseguramos que todas las fechas de admisión estén dentro del último año.

In [9]:
current_date = datetime.now()
one_year_ago = current_date - timedelta(days=365)

suite.add_expectation(
    gx.expectations.ExpectColumnValuesToBeBetween(column="admission_date", min_value=one_year_ago, max_value=current_date)
)

ExpectColumnValuesToBeBetween(id='a75b39ea-7b04-4f61-8379-40ec40419be2', meta=None, notes=None, result_format=<ResultFormat.BASIC: 'BASIC'>, description=None, catch_exceptions=True, rendered_content=None, windows=None, batch_id=None, row_condition=None, condition_parser=None, column='admission_date', mostly=1.0, min_value=datetime.datetime(2023, 10, 9, 17, 14, 57, 581020), max_value=datetime.datetime(2024, 10, 8, 17, 14, 57, 581020), strict_min=False, strict_max=False)

## 5. Validez (Validity):

Verificamos que los IDs de paciente y doctor sigan el formato correcto.
Comprobamos que la edad sea de tipo entero.

In [10]:
suite.add_expectation(
    gx.expectations.ExpectColumnValuesToMatchRegex(column="patient_id", regex=r'^P\d{3}$')
)

suite.add_expectation(
    gx.expectations.ExpectColumnValuesToMatchRegex(column="doctor_id", regex=r'^D\d+$')
)

suite.add_expectation(
    gx.expectations.ExpectColumnValuesToBeOfType(column='age', type_='int64')
)

ExpectColumnValuesToBeOfType(id='ce1c4555-ef12-4496-aa62-0ff651940bb8', meta=None, notes=None, result_format=<ResultFormat.BASIC: 'BASIC'>, description=None, catch_exceptions=True, rendered_content=None, windows=None, batch_id=None, row_condition=None, condition_parser=None, column='age', mostly=1.0, type_='int64')

## 6. Unicidad (Uniqueness):

Aseguramos que los IDs de paciente sean únicos.

In [11]:
suite.add_expectation(
    gx.expectations.ExpectColumnValuesToBeUnique(column='patient_id')
)

ExpectColumnValuesToBeUnique(id='f04604cd-58dc-46e7-81b5-adf98ffb2650', meta=None, notes=None, result_format=<ResultFormat.BASIC: 'BASIC'>, description=None, catch_exceptions=True, rendered_content=None, windows=None, batch_id=None, row_condition=None, condition_parser=None, column='patient_id', mostly=1.0)

In [21]:
# Crear un batch con el DataFrame directamente
batch_request = {
    "datasource_name": "pandas_datasource",
    "data_connector_name": "default_runtime_data_connector_name",
    "data_asset_name": "patient_data_asset",
    "batch_data": df
}

In [26]:
datasource_config = {
    "name": "pandas_datasource",
    "class_name": "Datasource",
    "execution_engine": {
        "class_name": "PandasExecutionEngine"
    },
    "data_connectors": {
        "default_runtime_data_connector_name": {
            "class_name": "RuntimeDataConnector",
            "batch_identifiers": ["default_identifier_name"]
        }
    }
}

In [27]:
# Crear un RuntimeBatchRequest con el DataFrame directamente
batch_request = gx.core.batch.RuntimeBatchRequest(
    datasource_name="pandas_datasource",
    data_connector_name="default_runtime_data_connector_name",
    data_asset_name="patient_data_asset",  # Puedes nombrar este asset como quieras
    runtime_parameters={"batch_data": df},  # El DataFrame se pasa aquí
    batch_identifiers={"default_identifier_name": "batch_001"}
)

In [28]:
# Obtener el validador
validator = context.get_validator(
    batch_request=batch_request,
    expectation_suite=suite
)

DatasourceError: Cannot initialize datasource pandas_datasource, error: The given datasource could not be retrieved from the DataContext; please confirm that your configuration is accurate.

In [12]:
validation_definition = context.validation_definitions.add(
    gx.core.validation_definition.ValidationDefinition(
        name = "validation definition",
        data = batch_definition,
        suite = suite
    )
)

In [13]:
'''
Crea y ejecuta un punto de comprobación (Checkpoint) para validar los datos basándose en la definición de validación suministrada. 
.describe() es un método práctico para ver un resumen de los resultados del punto de comprobación.
'''

checkpoint = context.checkpoints.add(
    gx.checkpoint.checkpoint.Checkpoint(
        name="checkpoint", validation_definitions=[validation_definition]
    )
)

checkpoint_result = checkpoint.run()
print(checkpoint_result.describe())

BuildBatchRequestError: Bad input to build_batch_request: options must contain exactly 1 key, 'dataframe'.