In [2]:
import pandas as pd
import great_expectations as gx

In [3]:
df_peserta = pd.read_csv("data/raw/peserta.csv")
df_fktp = pd.read_csv("data/raw/fktp.csv")
df_fkrtl = pd.read_csv("data/raw/fkrtl.csv")

  df_fktp = pd.read_csv("data/raw/fktp.csv")
  df_fkrtl = pd.read_csv("data/raw/fkrtl.csv")


# Data Validation using Great Expectations

In [31]:
context = gx.get_context(mode="file")
data_source = context.data_sources.add_or_update_pandas("bpjs_datasource")

## Kepesertaan Validation

In [32]:
suite = context.suites.add_or_update(gx.ExpectationSuite("peserta_validation"))

peserta_asset = data_source.add_dataframe_asset("peserta_asset")
batch_definition = peserta_asset.add_batch_definition_whole_dataframe(name="peserta_batch_definition")
peserta_batch = batch_definition.get_batch(batch_parameters={"dataframe": df_peserta})

validator = context.get_validator(datasource_name="bpjs_datasource", batch=peserta_batch, expectation_suite_name="peserta_validation")
validator.expect_column_values_to_not_be_null('PSTV01')
validator.expect_column_values_to_not_be_null('PSTV02')
validator.expect_column_values_to_be_in_set("PSTV05", ["LAKI-LAKI", "PEREMPUAN"])
validator.expect_column_values_to_be_of_type("PSTV15", "float")

context.suites.add_or_update(validator.expectation_suite)

Calculating Metrics: 100%|██████████| 6/6 [00:00<00:00, 1090.18it/s]
Calculating Metrics: 100%|██████████| 6/6 [00:00<00:00, 749.92it/s] 
Calculating Metrics: 100%|██████████| 8/8 [00:00<00:00, 160.00it/s] 
Calculating Metrics: 100%|██████████| 1/1 [00:00<00:00, 999.60it/s]


{
  "name": "peserta_validation",
  "id": "5053f7eb-a988-4e8e-a75d-e0a6f77fc1f5",
  "expectations": [
    {
      "type": "expect_column_values_to_not_be_null",
      "kwargs": {
        "column": "PSTV01"
      },
      "meta": {},
      "id": "e48218f3-01b7-4ffb-a882-70bf7cad6f77",
      "severity": "critical"
    },
    {
      "type": "expect_column_values_to_not_be_null",
      "kwargs": {
        "column": "PSTV02"
      },
      "meta": {},
      "id": "732d557a-574e-4844-95ae-b73438e85ebf",
      "severity": "critical"
    },
    {
      "type": "expect_column_values_to_be_in_set",
      "kwargs": {
        "column": "PSTV05",
        "value_set": [
          "LAKI-LAKI",
          "PEREMPUAN"
        ]
      },
      "meta": {},
      "id": "179a491d-6f5f-40b0-947f-4ddb78cdf367",
      "severity": "critical"
    },
    {
      "type": "expect_column_values_to_be_of_type",
      "kwargs": {
        "column": "PSTV15",
        "type_": "float"
      },
      "meta": {},
      "i

## FKTP Validation

In [33]:
suite = context.suites.add_or_update(gx.ExpectationSuite("fktp_validation"))

fktp_asset = data_source.add_dataframe_asset("fktp_asset")
batch_definition = fktp_asset.add_batch_definition_whole_dataframe(name="fktp_batch_definition")
fktp_batch = batch_definition.get_batch(batch_parameters={"dataframe": df_fktp})

validator_fktp = context.get_validator(datasource_name="bpjs_datasource", batch=fktp_batch, expectation_suite_name="fktp_validation")
validator_fktp.expect_column_values_to_not_be_null('PSTV01')
validator_fktp.expect_column_values_to_not_be_null('PSTV02')
validator_fktp.expect_column_values_to_not_be_null('FKP02')
validator_fktp.expect_column_values_to_be_of_type("PSTV15", "float")
context.suites.add_or_update(validator_fktp.expectation_suite)

Calculating Metrics: 100%|██████████| 6/6 [00:00<00:00, 753.31it/s] 
Calculating Metrics: 100%|██████████| 6/6 [00:00<00:00, 858.02it/s] 
Calculating Metrics: 100%|██████████| 6/6 [00:00<00:00, 171.42it/s] 
Calculating Metrics: 100%|██████████| 1/1 [00:00<00:00, 1011.65it/s]


{
  "name": "fktp_validation",
  "id": "2f3ad9b7-0b64-498f-b544-a652f0166ffb",
  "expectations": [
    {
      "type": "expect_column_values_to_not_be_null",
      "kwargs": {
        "column": "PSTV01"
      },
      "meta": {},
      "id": "efb57ba8-c38e-41e6-b1c5-fbb7124869eb",
      "severity": "critical"
    },
    {
      "type": "expect_column_values_to_not_be_null",
      "kwargs": {
        "column": "PSTV02"
      },
      "meta": {},
      "id": "de367c54-84f7-41fa-a32b-832ec52d3b3f",
      "severity": "critical"
    },
    {
      "type": "expect_column_values_to_not_be_null",
      "kwargs": {
        "column": "FKP02"
      },
      "meta": {},
      "id": "2d0fbd75-7072-4544-8e14-15f9805865b5",
      "severity": "critical"
    },
    {
      "type": "expect_column_values_to_be_of_type",
      "kwargs": {
        "column": "PSTV15",
        "type_": "float"
      },
      "meta": {},
      "id": "304519c9-de77-4407-aba9-f79f2e8209a0",
      "severity": "critical"
    }
  

## FKRTL Validation

In [34]:
suite = context.suites.add_or_update(gx.ExpectationSuite("fkrtl_validation"))

fkrtl_asset = data_source.add_dataframe_asset("fkrtl_asset")
batch_definition = fkrtl_asset.add_batch_definition_whole_dataframe(name="fkrtl_batch_definition")
fkrtl_batch = batch_definition.get_batch(batch_parameters={"dataframe": df_fkrtl})

validator_fkrtl = context.get_validator(datasource_name="bpjs_datasource", batch=fkrtl_batch, expectation_suite_name="fkrtl_validation")
validator_fkrtl.expect_column_values_to_not_be_null('PSTV01')
validator_fkrtl.expect_column_values_to_not_be_null('PSTV02')
validator_fkrtl.expect_column_values_to_not_be_null('FKL02')
validator_fkrtl.expect_column_values_to_be_of_type("PSTV15", "float")
validator_fkrtl.expect_column_values_to_not_be_null('FKL17A')
validator_fkrtl.expect_column_values_to_not_be_null('FKL30')
validator_fkrtl.expect_column_values_to_be_of_type("FKL47", "int")
validator_fkrtl.expect_column_values_to_be_of_type("FKL48", "int")

context.suites.add_or_update(validator_fkrtl.expectation_suite)

Calculating Metrics: 100%|██████████| 6/6 [00:00<00:00, 400.30it/s] 
Calculating Metrics: 100%|██████████| 6/6 [00:00<00:00, 399.99it/s] 
Calculating Metrics: 100%|██████████| 6/6 [00:00<00:00, 114.21it/s] 
Calculating Metrics: 100%|██████████| 1/1 [00:00<00:00, 142.86it/s]
Calculating Metrics: 100%|██████████| 6/6 [00:00<00:00, 79.37it/s] 
Calculating Metrics: 100%|██████████| 6/6 [00:00<00:00, 75.20it/s]  
Calculating Metrics: 100%|██████████| 1/1 [00:00<00:00, 166.65it/s]
Calculating Metrics: 100%|██████████| 1/1 [00:00<00:00, 142.92it/s]


{
  "name": "fkrtl_validation",
  "id": "ed756f36-9992-4190-bce1-6f5a3cb29b6e",
  "expectations": [
    {
      "type": "expect_column_values_to_not_be_null",
      "kwargs": {
        "column": "PSTV01"
      },
      "meta": {},
      "id": "717e4433-57c2-43ac-b5ab-23e65dd6298c",
      "severity": "critical"
    },
    {
      "type": "expect_column_values_to_not_be_null",
      "kwargs": {
        "column": "PSTV02"
      },
      "meta": {},
      "id": "c3a2fbc1-f3ec-4247-a2e3-beab3af715ba",
      "severity": "critical"
    },
    {
      "type": "expect_column_values_to_not_be_null",
      "kwargs": {
        "column": "FKL02"
      },
      "meta": {},
      "id": "77e2c842-cbf6-47b4-b0c4-b545c54eb8b1",
      "severity": "critical"
    },
    {
      "type": "expect_column_values_to_be_of_type",
      "kwargs": {
        "column": "PSTV15",
        "type_": "float"
      },
      "meta": {},
      "id": "47a7adb3-4915-4b5f-b9a2-76cbdec5972a",
      "severity": "critical"
    },


In [35]:
def get_failed_expectations(checkpoint_result):
    failures = []
    for validation_result in checkpoint_result.run_results.values():
        for exp_result in validation_result.results:
            if not exp_result.success:
                failures.append({
                    "expectation": exp_result.expectation_config.type,
                    "column": exp_result.expectation_config.kwargs.get("column"),
                    "unexpected_count": exp_result.result.get("unexpected_count"),
                    "unexpected_percent": exp_result.result.get("unexpected_percent"),
                })
    return pd.DataFrame(failures)

In [36]:
cp_peserta = context.checkpoints.add_or_update(
    gx.Checkpoint(
        name="checkpoint_peserta",
        validation_definitions=[
            gx.ValidationDefinition(
                name="peserta_audit",
                data=peserta_asset.get_batch_definition("peserta_batch_definition"),
                suite=context.suites.get("peserta_validation"),
            ),
        ],
    )
)
cp_fktp = context.checkpoints.add_or_update(
    gx.Checkpoint(
        name="checkpoint_fktp",
        validation_definitions=[
            gx.ValidationDefinition(
                name="fktp_audit",
                data=fktp_asset.get_batch_definition("fktp_batch_definition"),
                suite=context.suites.get("fktp_validation"),
            ),
        ],
    )
)

cp_fkrtl = context.checkpoints.add_or_update(
    gx.Checkpoint(
        name="checkpoint_fkrtl",
        validation_definitions=[
            gx.ValidationDefinition(
                name="fkrtl_audit",
                data=fkrtl_asset.get_batch_definition("fkrtl_batch_definition"),
                suite=context.suites.get("fkrtl_validation"),
            ),
        ],
    )
)

result_peserta = cp_peserta.run(batch_parameters={"dataframe": df_peserta})
result_fktp = cp_fktp.run(batch_parameters={"dataframe": df_fktp})
result_fkrtl = cp_fkrtl.run(batch_parameters={"dataframe": df_fkrtl})

failed_peserta_df = get_failed_expectations(result_peserta)
failed_fktp_df = get_failed_expectations(result_fktp)
failed_fkrtl_df = get_failed_expectations(result_fkrtl)

context.build_data_docs()
context.open_data_docs()

Calculating Metrics: 100%|██████████| 23/23 [00:00<00:00, 115.96it/s]
Calculating Metrics: 100%|██████████| 19/19 [00:00<00:00, 204.23it/s]
Calculating Metrics: 100%|██████████| 29/29 [00:01<00:00, 28.05it/s] 


In [None]:
def show_failed_validation(result_df: pd.DataFrame):
    if not result_df.empty:
        display(result_df)

In [52]:
show_failed_validation(failed_peserta_df)
show_failed_validation(failed_fktp_df)
show_failed_validation(failed_fkrtl_df)

Unnamed: 0,expectation,column,unexpected_count,unexpected_percent
0,expect_column_values_to_not_be_null,FKL30,405169,49.013901


In [38]:
context.suites.get("peserta_validation").save()
context.suites.get("fktp_validation").save()
context.suites.get("fkrtl_validation").save()