In [22]:
import great_expectations as gx
from pathlib import Path
import pandas as pd

In [2]:
context = gx.get_context(
    context_root_dir="../gx-data-context"
)

In [4]:
print("Data Context cargado desde:", context.root_directory)

Data Context cargado desde: /Users/alexander.pelaez/Desktop/pragma-data-lake/notebooks/../gx-data-context


## Create Data Source

In [3]:
source_folder = "../data"

data_source_name = "my_filesystem_data_source_1"

In [4]:
data_source = context.data_sources.add_pandas_filesystem(
    name=data_source_name, base_directory=source_folder
)

## Create a Data Asset

In [5]:
asset_name = "pragmaticos_test_parquet_file"

In [7]:
file_parquet_asset = data_source.add_parquet_asset(name=asset_name)

# Create a Batch Definition

In [8]:
file_data_asset = context.data_sources.get(data_source_name).get_asset(asset_name)

In [9]:
print(file_data_asset)

id: 84a0b58b-4fb4-416c-ac5a-911c8a6aee0a
name: pragmaticos_test_parquet_file
type: parquet



In [12]:
batch_definition_name = "batch_definition_1"

batch_definition_path = "pragmaticos_test.parquet"

In [13]:
batch_definition = file_data_asset.add_batch_definition_path(
    name=batch_definition_name, path=batch_definition_path
)

In [14]:
print(batch_definition)

id=UUID('d1c539d2-6b4d-4d61-8c4f-b3ff33b53da7') name='batch_definition_1' partitioner=FileNamePartitionerPath(regex=re.compile('pragmaticos_test.parquet$'), param_names=(), sort_ascending=True)


In [15]:
batch = batch_definition.get_batch()
print(batch.head())

Calculating Metrics: 100%|██████████| 1/1 [00:00<00:00, 410.48it/s]

   sap_personal_id estado             correo_corporativo tipo_identificacion  \
0         10002910      t  nombre.apellido@pragma.com.co   CitizenshipCardID   
1         10002913      t  nombre.apellido@pragma.com.co   CitizenshipCardID   
2         10002916      t  nombre.apellido@pragma.com.co   CitizenshipCardID   
3         10002919      t  nombre.apellido@pragma.com.co               CI ID   
4         10002922      t  nombre.apellido@pragma.com.co   CitizenshipCardID   

  identificacion_nacional primer_nombre segundo_nombre primer_apellido  \
0             23546584654      Nombre 1       Nombre 2      Apellido 1   
1              4654654654      Nombre 1       Nombre 2      Apellido 1   
2                65464654      Nombre 1       Nombre 2      Apellido 1   
3             8-6548-6546      Nombre 1       Nombre 2      Apellido 1   
4                65464654      Nombre 1       Nombre 2      Apellido 1   

  segundo_apellido                  nombre_formal  ...         grupo_perso




## Create an expectation

In [16]:
from great_expectations import expectations as gxe

In [19]:
exp_1 = gxe.ExpectColumnValuesToNotBeNull(
    column="sap_personal_id"
)

In [20]:
validation_results_1 = batch.validate(exp_1)

Calculating Metrics: 100%|██████████| 8/8 [00:00<00:00, 516.19it/s]


In [21]:
print(validation_results_1)

{
  "success": true,
  "expectation_config": {
    "type": "expect_column_values_to_not_be_null",
    "kwargs": {
      "batch_id": "my_filesystem_data_source_1-pragmaticos_test_parquet_file",
      "column": "sap_personal_id"
    },
    "meta": {}
  },
  "result": {
    "element_count": 27,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "partial_unexpected_list": [],
    "partial_unexpected_counts": [],
    "partial_unexpected_index_list": []
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}


In [23]:
df = pd.read_parquet("/Users/alexander.pelaez/Desktop/pragma-data-lake/data/pragmaticos_test.parquet")

In [24]:
df.head()

Unnamed: 0,sap_personal_id,estado,correo_corporativo,tipo_identificacion,identificacion_nacional,primer_nombre,segundo_nombre,primer_apellido,segundo_apellido,nombre_formal,...,grupo_personal,lider_nombre,lider_id,division_personal,categoria_puesto,kc,chapter,especialidad,tecnologia,seniority
0,10002910,t,nombre.apellido@pragma.com.co,CitizenshipCardID,23546584654,Nombre 1,Nombre 2,Apellido 1,Apellido 2,Luis Fernando Rodríguez Pérez,...,Activos,Rosalba de Armas,10000187.0,Colombia - Pragma,Operativos Asignables,Powerful Teams,Liderazgo,Líder de Equipo,,Advanced L3
1,10002913,t,nombre.apellido@pragma.com.co,CitizenshipCardID,4654654654,Nombre 1,Nombre 2,Apellido 1,Apellido 2,María José Sánchez Ramírez,...,Aprendices y pasantes,Oscar Alexander Orozco,10000621.0,Colombia - Pragma,Practicantes,Computer Science,Backend,Desarrollador,Java,Trainee L3
2,10002916,t,nombre.apellido@pragma.com.co,CitizenshipCardID,65464654,Nombre 1,Nombre 2,Apellido 1,Apellido 2,Juan Sebastián Morales Díaz,...,Activos,Julián David Reyes,10000623.0,Colombia - Pragma,Operativos Asignables,Data Science,Ciencia de Datos,Científico de Datos,,Senior L2
3,10002919,t,nombre.apellido@pragma.com.co,CI ID,8-6548-6546,Nombre 1,Nombre 2,Apellido 1,Apellido 2,Ana Lucía Herrera Gómez,...,Activos,Diana del Pilar Ramirez,10000094.0,Costa Rica,Transversales Operativos,Business Development,Business Developer,Hunter,,Senior L1
4,10002922,t,nombre.apellido@pragma.com.co,CitizenshipCardID,65464654,Nombre 1,Nombre 2,Apellido 1,Apellido 2,Carlos Andrés Navarro León,...,Activos,Ana Paulina Echeverri,10000300.0,Colombia - Pragma,Operativos Asignables,Computer Science,Calidad de Software,Automatizador,,Senior L1


In [32]:
df.edad.max()

43

## Crear más expectativas y agregarlas a una suite

In [61]:
# Create expectation suite

suite_name = "my_expectation_suite_1"

suite = gx.ExpectationSuite(name=suite_name)

In [62]:
suite = context.suites.add(suite)

### Crear más expectativas

In [None]:
exp_2 = gxe.ExpectColumnValuesToBeUnique(
    column="sap_personal_id"
)

exp_3 = gxe.ExpectColumnValuesToNotBeNull(
    column="correo_corporativo",
)

exp_4 = gxe.ExpectColumnValuesToNotBeNull(
    column="nacionalidad",
)

exp_5 = gxe.ExpectColumnValuesToNotBeNull(
    column="nacionalidad",
)

exp_6 = gxe.ExpectColumnValuesToNotBeNull(
    column="fecha_nacimiento",
)

exp_7 = gxe.ExpectColumnValuesToBeBetween(
    column="test2",
    min_value=18,
    max_value=40,
    strict_min=True,
    strict_max=True
)

### Agregar las expectativas a la suite

In [65]:
expectativas = [exp_2, exp_3, exp_4, exp_5, exp_6, exp_7]

for expectativa in expectativas:
    suite.add_expectation(expectativa)
    
print(f"Se han agregado {len(expectativas)} expectativas a la suite.")

Se han agregado 6 expectativas a la suite.


In [67]:
exp_1.id = None

In [68]:
suite.add_expectation(exp_1)

ExpectColumnValuesToNotBeNull(id='b440c134-43cb-44be-9c2d-03d04d493e3c', meta=None, notes=None, result_format=<ResultFormat.BASIC: 'BASIC'>, description=None, catch_exceptions=True, rendered_content=None, windows=None, batch_id=None, column='sap_personal_id', mostly=1, row_condition=None, condition_parser=None)

In [70]:
print(suite)

{
  "name": "my_expectation_suite_1",
  "id": "10be34aa-102d-4436-a69a-53f535572186",
  "expectations": [
    {
      "type": "expect_column_values_to_be_unique",
      "kwargs": {
        "column": "sap_personal_id"
      },
      "meta": {},
      "id": "9c4f03e6-d20d-4907-9fa7-f32fca3ca4b0"
    },
    {
      "type": "expect_column_values_to_not_be_null",
      "kwargs": {
        "column": "correo_corporativo"
      },
      "meta": {},
      "id": "6e83979b-2348-43ca-8278-0d97e1b213e9"
    },
    {
      "type": "expect_column_values_to_not_be_null",
      "kwargs": {
        "column": "nacionalidad"
      },
      "meta": {},
      "id": "390a9650-502c-4cbc-93a5-174bd6b59edd"
    },
    {
      "type": "expect_column_values_to_not_be_null",
      "kwargs": {
        "column": "fecha_nacimiento"
      },
      "meta": {},
      "id": "b16be408-8d4f-4fa3-a881-1f476b6d3037"
    },
    {
      "type": "expect_column_values_to_be_between",
      "kwargs": {
        "column": "test2",


## Crear Validation Definition

In [72]:
definition_name = "my_validation_definition_1"

validation_definition = gx.ValidationDefinition(
    data=batch_definition, suite=suite, name=definition_name
)

In [73]:
validation_definition = context.validation_definitions.add(validation_definition)

In [74]:
validation_results = validation_definition.run()

Calculating Metrics:  80%|████████  | 28/35 [00:00<00:00, 1171.10it/s]


In [75]:
print(validation_results)

{
  "success": false,
  "results": [
    {
      "success": false,
      "expectation_config": {
        "type": "expect_column_values_to_be_between",
        "kwargs": {
          "column": "test2",
          "min_value": 18.0,
          "max_value": 40.0,
          "strict_min": true,
          "strict_max": true,
          "batch_id": "my_filesystem_data_source_1-pragmaticos_test_parquet_file"
        },
        "meta": {},
        "id": "6f86718d-ab3c-41ef-8139-222f569f4c6e"
      },
      "result": {},
      "meta": {},
      "exception_info": {
        "MetricConfigurationID(metric_name='column_values.nonnull.condition', metric_domain_kwargs_id='3182b2260ed7fe3ff61e87107236e3f1', metric_value_kwargs_id=())": {
          "exception_traceback": "Traceback (most recent call last):\n  File \"/Users/alexander.pelaez/Desktop/pragma-data-lake/.venv/lib/python3.12/site-packages/great_expectations/execution_engine/execution_engine.py\", line 534, in _process_direct_and_bundled_metric_comp

In [76]:
exp_7.column="edad"
exp_7.save()

In [77]:
validation_results = validation_definition.run()

Calculating Metrics: 100%|██████████| 35/35 [00:00<00:00, 5827.27it/s]


In [78]:
print(validation_results)

{
  "success": false,
  "results": [
    {
      "success": true,
      "expectation_config": {
        "type": "expect_column_values_to_be_unique",
        "kwargs": {
          "batch_id": "my_filesystem_data_source_1-pragmaticos_test_parquet_file",
          "column": "sap_personal_id"
        },
        "meta": {},
        "id": "9c4f03e6-d20d-4907-9fa7-f32fca3ca4b0"
      },
      "result": {
        "element_count": 27,
        "unexpected_count": 0,
        "unexpected_percent": 0.0,
        "partial_unexpected_list": [],
        "missing_count": 0,
        "missing_percent": 0.0,
        "unexpected_percent_total": 0.0,
        "unexpected_percent_nonmissing": 0.0,
        "partial_unexpected_counts": [],
        "partial_unexpected_index_list": []
      },
      "meta": {},
      "exception_info": {
        "raised_exception": false,
        "exception_traceback": null,
        "exception_message": null
      }
    },
    {
      "success": true,
      "expectation_config": {


## Crear un checkpoint con acciones

In [83]:
from great_expectations.checkpoint import UpdateDataDocsAction


In [90]:
validation_definitions_ = [
    context.validation_definitions.get(name="my_validation_definition_1")
] 

In [91]:
action_list = [
    UpdateDataDocsAction(
        name="update_all_data_docs"
    )
]

In [92]:
# Crear el checkpoint

checkpoint_name = "my_checkpoint_1"

checkpoint = gx.Checkpoint(
    name=checkpoint_name,
    validation_definitions=validation_definitions_,
    actions=action_list,
    result_format={"result_format": "COMPLETE"},
)

In [93]:
# Guardar el checkpoint en el data context

context.checkpoints.add(checkpoint)

Checkpoint(name='my_checkpoint_1', validation_definitions=[ValidationDefinition(name='my_validation_definition_1', data=BatchDefinition(id=UUID('d1c539d2-6b4d-4d61-8c4f-b3ff33b53da7'), name='batch_definition_1', partitioner=FileNamePartitionerPath(regex=re.compile('pragmaticos_test.parquet$'), param_names=(), sort_ascending=True)), suite={
  "name": "my_expectation_suite_1",
  "id": "10be34aa-102d-4436-a69a-53f535572186",
  "expectations": [
    {
      "type": "expect_column_values_to_be_unique",
      "kwargs": {
        "column": "sap_personal_id"
      },
      "meta": {},
      "id": "9c4f03e6-d20d-4907-9fa7-f32fca3ca4b0"
    },
    {
      "type": "expect_column_values_to_not_be_null",
      "kwargs": {
        "column": "correo_corporativo"
      },
      "meta": {},
      "id": "6e83979b-2348-43ca-8278-0d97e1b213e9"
    },
    {
      "type": "expect_column_values_to_not_be_null",
      "kwargs": {
        "column": "nacionalidad"
      },
      "meta": {},
      "id": "390a965

In [94]:
validation_results = checkpoint.run()

Calculating Metrics: 100%|██████████| 35/35 [00:00<00:00, 5702.32it/s]


In [95]:
print(validation_results)

run_id={"run_name": null, "run_time": "2025-05-03T20:55:55.959444-05:00"} run_results={ValidationResultIdentifier::my_expectation_suite_1/__none__/20250504T015555.959444Z/my_filesystem_data_source_1-pragmaticos_test_parquet_file: {
  "success": false,
  "results": [
    {
      "success": true,
      "expectation_config": {
        "type": "expect_column_values_to_be_unique",
        "kwargs": {
          "batch_id": "my_filesystem_data_source_1-pragmaticos_test_parquet_file",
          "column": "sap_personal_id"
        },
        "meta": {},
        "id": "9c4f03e6-d20d-4907-9fa7-f32fca3ca4b0"
      },
      "result": {
        "element_count": 27,
        "unexpected_count": 0,
        "unexpected_percent": 0.0,
        "partial_unexpected_list": [],
        "missing_count": 0,
        "missing_percent": 0.0,
        "unexpected_percent_total": 0.0,
        "unexpected_percent_nonmissing": 0.0,
        "partial_unexpected_counts": [],
        "partial_unexpected_index_list": [],
  

In [96]:
context.open_data_docs()

In [97]:
print(context.variables.config.stores["expectations_store"])
print(context.variables.config.stores["validation_definition_store"])
print(context.variables.config.stores["checkpoint_store"])
print(context.variables.config.stores["validation_results_store"])

{'class_name': 'ExpectationsStore', 'store_backend': {'class_name': 'TupleFilesystemStoreBackend', 'base_directory': 'expectations/'}}
{'class_name': 'ValidationDefinitionStore', 'store_backend': {'class_name': 'TupleFilesystemStoreBackend', 'base_directory': 'validation_definitions/'}}
{'class_name': 'CheckpointStore', 'store_backend': {'class_name': 'TupleFilesystemStoreBackend', 'suppress_store_backend_id': True, 'base_directory': 'checkpoints/'}}
{'class_name': 'ValidationResultsStore', 'store_backend': {'class_name': 'TupleFilesystemStoreBackend', 'base_directory': 'uncommitted/validations/'}}
