In [4]:
pip install great-expectations

Collecting great-expectations
  Using cached great_expectations-1.2.4-py3-none-any.whl (5.0 MB)
Collecting pydantic>=1.10.7
  Using cached pydantic-2.10.1-py3-none-any.whl (455 kB)
Collecting ruamel.yaml>=0.16
  Using cached ruamel.yaml-0.18.6-py3-none-any.whl (117 kB)
Collecting marshmallow<4.0.0,>=3.7.1
  Using cached marshmallow-3.23.1-py3-none-any.whl (49 kB)
Collecting altair<5.0.0,>=4.2.1
  Using cached altair-4.2.2-py3-none-any.whl (813 kB)
Collecting posthog<3,>=2.1.0
  Using cached posthog-2.5.0-py2.py3-none-any.whl (36 kB)
Collecting tzlocal>=1.2
  Using cached tzlocal-5.2-py3-none-any.whl (17 kB)
Collecting monotonic>=1.5
  Using cached monotonic-1.6-py2.py3-none-any.whl (8.2 kB)
Collecting backoff>=1.10.0
  Using cached backoff-2.2.1-py3-none-any.whl (15 kB)
Collecting pydantic-core==2.27.1
  Using cached pydantic_core-2.27.1-cp39-none-win_amd64.whl (2.0 MB)
Collecting annotated-types>=0.6.0
  Using cached annotated_types-0.7.0-py3-none-any.whl (13 kB)
Collecting ruamel.yam



In [2]:
from great_expectations.dataset import PandasDataset
import pandas as pd

# Loading the actual dataset
actual_data = PandasDataset(pd.read_csv("Clean_data.csv"))

# Basic Expectations
def test_actual_data_integrity():
    # Checking existence and null values for key columns
    assert actual_data.expect_column_to_exist("sideb").success
    assert actual_data.expect_column_to_exist("sidea").success
    assert actual_data.expect_column_values_to_not_be_null("country_primary").success
    assert actual_data.expect_column_values_to_not_be_null("NSAdyad_id").success

# Schema Validation
def test_actual_data_schema():
    schema = {
        "sideb": "string",
        "sideb_full": "string",
        "country_primary": "string",
        "sidea": "string",
        "sideb_id": "int",
        "NSAdyad_id": "int",
        "frontline_prev_best": "float",
        "frontline": "float",
        "lead": "float"
    }
    for col, dtype in schema.items():
        assert actual_data.expect_column_values_to_be_of_type(col, dtype).success

# Relationship Checks
def test_actual_data_relationships():
    # Example: frontline must always be greater than or equal to frontline_prev_best
    assert actual_data.expect_column_pair_values_a_to_be_greater_than_or_equal_to_b(
        "frontline", "frontline_prev_best"
    ).success

    # Example: lead values should be greater than zero
    assert actual_data.expect_column_values_to_be_greater_than(
        "lead", 0
    ).success

    # Example: Check correlation between columns, if applicable
    assert actual_data.expect_column_pair_values_to_be_in_set(
        "sideb", "sidea"
    ).success

if __name__ == "__main__":
    test_actual_data_integrity()
    test_actual_data_schema()
    test_actual_data_relationships()

    print("All tests passed successfully!")
