In [5]:
import pandas as pd
import os
import sys

sys.path.append(os.path.abspath("../"))

from src.tdq.tdq_checks import (
    schema_check, null_check, duplicate_check,
    freshness_check, datatype_validity
)

# Load every raw CSV once and register it under its table name. The names in
# this mapping are the keys used to look up each table's expected schema.
datasets = {
    "visitor_events": pd.read_csv("../data/visitor_events.csv"),
    "applications": pd.read_csv("../data/applications.csv"),
    "accounts": pd.read_csv("../data/accounts.csv"),
    "transactions": pd.read_csv("../data/transactions.csv"),
    "marketing_source": pd.read_csv("../data/marketing_source.csv"),
}

# Per-table aliases: each one is the very same DataFrame object that is stored
# in `datasets`, not a copy.
visitor_df = datasets["visitor_events"]
applications_df = datasets["applications"]
accounts_df = datasets["accounts"]
transactions_df = datasets["transactions"]
marketing_df = datasets["marketing_source"]


In [6]:
# Expected column -> pandas dtype name for every table, keyed by the same
# table names used in `datasets`. Timestamp/date columns are listed as
# "object" because that is how they are declared here for the raw CSV load.
expected_schema = dict(
    visitor_events=dict(
        event_id="object",
        visitor_id="int64",
        event_type="object",
        device_type="object",
        marketing_source="object",
        event_timestamp="object",
        session_id="object",
        geo_country="object",
        geo_city="object",
    ),
    applications=dict(
        application_id="object",
        visitor_id="int64",
        application_date="object",
        status="object",
        credit_score="float64",
        income="int64",
        loan_amount="int64",
        product_type="object",
        source_channel="object",
    ),
    accounts=dict(
        account_id="object",
        application_id="object",
        account_open_date="object",
        account_type="object",
        initial_deposit="int64",
        kyc_status="object",
    ),
    transactions=dict(
        transaction_id="object",
        account_id="object",
        transaction_timestamp="object",
        amount="float64",
        transaction_type="object",
        merchant_category="object",
        channel="object",
    ),
    marketing_source=dict(
        source="object",
        channel_cost="float64",
        target_demographic="object",
    ),
)


In [7]:
# Run each technical-data-quality (TDQ) check on every loaded table and collect
# one result record per table in `tdq_report`.
#
# The loaded frames in `datasets` are treated as read-only here: the timestamp
# column is parsed on a private copy, so re-running this cell gives the same
# schema/dtype results every time and other cells keep seeing the raw data.
tdq_report = []

for name, df in datasets.items():
    table_schema = expected_schema[name]

    # 1. Schema (runs against the raw, unparsed frame)
    missing, extra, dtype_mismatch = schema_check(df, table_schema)

    # 2. Null %
    nulls = null_check(df)

    # 3. Duplicates
    dups = duplicate_check(df)

    # 4. Freshness: only the first column whose name contains "timestamp" or
    #    "date" is evaluated.
    ts_cols = [c for c in df.columns if "timestamp" in c or "date" in c]
    checked_df = df
    if ts_cols:
        ts_col = ts_cols[0]
        # Parse on a copy instead of writing back into the shared frame.
        # Unparseable values become NaT because of errors="coerce".
        checked_df = df.copy()
        checked_df[ts_col] = pd.to_datetime(checked_df[ts_col], errors="coerce")
        freshness = freshness_check(checked_df, ts_col)
    else:
        # Same keys as read from the freshness result downstream, so tables
        # without a timestamp column can be summarised uniformly.
        freshness = {"null_timestamps": None, "max_timestamp": None}

    # 5. Data-type validation on the columns the schema declares as int/float.
    #    The frame with the parsed timestamp is passed so the helper receives
    #    the same column contents as before this cell was made side-effect free.
    numeric_cols = [
        col for col, typ in table_schema.items()
        if "int" in typ or "float" in typ
    ]
    type_issues = datatype_validity(checked_df, numeric_cols)

    # store
    tdq_report.append({
        "table": name,
        "schema_miss": missing,
        "schema_extra": extra,
        "dtype_mismatch": dtype_mismatch,
        "null_percent": nulls.to_dict(),
        "duplicates": dups,
        "freshness": freshness,
        "type_issues": type_issues,
    })


In [8]:
# Condense each per-table TDQ record into one row of issue counts.
summary_rows = []

for entry in tdq_report:
    freshness_info = entry["freshness"]

    # Only a dict-shaped freshness result carries a null-timestamp count.
    if isinstance(freshness_info, dict):
        null_ts = freshness_info["null_timestamps"]
    else:
        null_ts = None

    # Number of columns whose null percentage is above zero.
    columns_with_nulls = sum(pct > 0 for pct in entry["null_percent"].values())

    summary_rows.append(dict(
        table=entry["table"],
        missing_columns=len(entry["schema_miss"]),
        extra_columns=len(entry["schema_extra"]),
        dtype_mismatch=len(entry["dtype_mismatch"]),
        null_issues=columns_with_nulls,
        duplicate_rows=entry["duplicates"],
        null_timestamps=null_ts,
    ))

summary_df = pd.DataFrame(summary_rows)
summary_df


Unnamed: 0,table,missing_columns,extra_columns,dtype_mismatch,null_issues,duplicate_rows,null_timestamps
0,visitor_events,0,0,0,1,50,1001.0
1,applications,0,0,0,1,50,0.0
2,accounts,0,0,0,0,30,0.0
3,transactions,0,0,0,1,100,1997.0
4,marketing_source,0,0,0,1,0,
