In [None]:
!pip install great_expectations

In [None]:
# pyright: reportPrivateImportUsage=false

import json
from pathlib import Path
from typing import Dict, Any, List, Tuple

import pandas as pd
import great_expectations as gx

# ======================================================
# Google Drive
# ======================================================
# google.colab is only importable inside a Colab runtime.
from google.colab import drive
# Mount Drive first: every path below lives under /content/drive.
drive.mount("/content/drive")

# ======================================================
# Paths (Drive)
# ======================================================
# Project root inside the mounted Drive.
BASE = Path("/content/drive/MyDrive/dadosfera")

# RAW_DIR is where load_csv looks for the input CSVs; REPORT_DIR receives
# the JSON/Markdown reports written by the main section.
RAW_DIR = BASE / "cdm_bronze"
REPORT_DIR = BASE / "reports" / "data_quality_precleaning"
# Create the report folder (and missing parents) up front; a pre-existing
# folder is not an error.
REPORT_DIR.mkdir(parents=True, exist_ok=True)

# Echo the resolved locations so a wrong mount/path is visible immediately.
print("CDM Bronze:", RAW_DIR)
print("Reports:", REPORT_DIR)

# Logical dataset name -> CSV file name (resolved against RAW_DIR by load_csv).
DATASETS = {
    "orders": "order.csv",
    "order_items": "order_item.csv",
    "payments": "payment.csv",
    "reviews": "review.csv",
    "customers": "customer.csv",
    "sellers": "seller.csv",
    "products": "product.csv",
    "geolocation": "geo_zip.csv",
}

# ======================================================
# IO utils
# ======================================================
def load_csv(filename: str) -> pd.DataFrame:
    """Read one CSV from ``RAW_DIR`` into a DataFrame.

    Args:
        filename: File name relative to ``RAW_DIR``.

    Returns:
        The parsed DataFrame (read with ``low_memory=False``).

    Raises:
        FileNotFoundError: If the resolved path does not exist.
    """
    csv_path = RAW_DIR / filename
    if csv_path.exists():
        return pd.read_csv(csv_path, low_memory=False)
    raise FileNotFoundError(f"CSV não encontrado: {csv_path}")


def save_json(obj: Any, path: Path):
    """Serialize ``obj`` as indented UTF-8 JSON at ``path``.

    Values the ``json`` module cannot encode natively are converted by trying,
    in this order: ``to_json_dict()``, ``dict()``, the instance ``__dict__``,
    and finally ``str()``.

    Args:
        obj: Object to serialize.
        path: Destination file; overwritten if it already exists.
    """
    def _fallback(value):
        # Prefer an explicit conversion method when the object offers one.
        for converter_name in ("to_json_dict", "dict"):
            if hasattr(value, converter_name):
                return getattr(value, converter_name)()
        # Otherwise expose the attributes, or fall back to the text form.
        return value.__dict__ if hasattr(value, "__dict__") else str(value)

    with open(path, "w", encoding="utf-8") as handle:
        # ensure_ascii=False keeps accented characters readable in the file.
        json.dump(obj, handle, indent=2, ensure_ascii=False, default=_fallback)


def write_summary_md(summary_rows: List[Dict[str, Any]], path: Path):
    """Write the per-dataset summary as a Markdown table.

    Args:
        summary_rows: One dict per dataset with the keys ``dataset``,
            ``success``, ``success_percent``, ``evaluated_expectations``,
            ``successful_expectations`` and ``unsuccessful_expectations``.
        path: Destination ``.md`` file, written as UTF-8 (overwritten).
    """
    def _table_row(row):
        # Cells are read in column order and rendered with default formatting.
        cells = (
            row["dataset"],
            "Ok" if row["success"] else "Not Okay",
            f"{row['success_percent']}%",
            row["evaluated_expectations"],
            row["successful_expectations"],
            row["unsuccessful_expectations"],
        )
        return "| " + " | ".join(f"{cell}" for cell in cells) + " |"

    # The trailing "\n" on the first two entries yields a blank line after
    # each of them once everything is joined with "\n".
    preamble = [
        "# Relatório de Qualidade de Dados – Great Expectations (CDM Bronze)\n",
        "Validações executadas sobre os dados da camada CDM Bronze, antes do processo de limpeza e normalização (pre-cleaning).\n",
        "| Dataset | Sucesso | % Sucesso | Expectations | OK | Falhas |",
        "|---|---:|---:|---:|---:|---:|",
    ]
    table_rows = [_table_row(row) for row in summary_rows]
    path.write_text("\n".join(preamble + table_rows), encoding="utf-8")


# ======================================================
# Great Expectations
# ======================================================
def get_or_create_pandas_datasource(context, name: str = "pandas"):
    """Return the datasource called ``name``, registering it if the lookup fails.

    Args:
        context: Object exposing ``data_sources.get`` and
            ``data_sources.add_pandas`` (here, the Great Expectations context).
        name: Datasource name to look up or create.

    Returns:
        Whatever ``get`` returns, or the result of ``add_pandas`` when ``get``
        raises.
    """
    try:
        datasource = context.data_sources.get(name)
    except Exception:
        # Any lookup failure is treated as "not registered yet".
        datasource = context.data_sources.add_pandas(name)
    return datasource


def make_batch_from_df(context, df: pd.DataFrame, dataset_name: str):
    """Wrap ``df`` in a batch that expectations can be validated against.

    The datasource, the ``<dataset_name>_asset`` dataframe asset and the
    ``whole_dataframe`` batch definition are each looked up first and only
    created when the lookup raises.

    Args:
        context: Great Expectations data context.
        df: DataFrame to validate.
        dataset_name: Logical dataset name, used to derive the asset name.

    Returns:
        The batch obtained from the batch definition for ``df``.
    """
    def _get_or_add(fetch, create):
        # Any lookup failure is treated as "does not exist yet".
        try:
            return fetch()
        except Exception:
            return create()

    source = get_or_create_pandas_datasource(context, "pandas")

    asset_label = f"{dataset_name}_asset"
    dataframe_asset = _get_or_add(
        lambda: source.get_asset(asset_label),
        lambda: source.add_dataframe_asset(name=asset_label),
    )

    definition_label = "whole_dataframe"
    whole_frame_definition = _get_or_add(
        lambda: dataframe_asset.get_batch_definition(definition_label),
        lambda: dataframe_asset.add_batch_definition_whole_dataframe(name=definition_label),
    )

    # The DataFrame itself is supplied at batch-request time.
    return whole_frame_definition.get_batch(batch_parameters={"dataframe": df})


def run_expectations(batch, expectations: List[Any]) -> Dict[str, Any]:
    """Validate every expectation against ``batch`` and aggregate the outcome.

    Args:
        batch: Object exposing ``validate(expectation)``.
        expectations: Expectations to evaluate, in order.

    Returns:
        A dict with ``success`` (True only when nothing failed), a
        ``statistics`` dict (evaluated / successful / unsuccessful counts and
        ``success_percent`` rounded to two decimals, ``0.0`` when the list is
        empty) and ``expectation_results`` (one JSON-like dict per
        expectation).
    """
    def _as_json(outcome):
        # Normalise whatever validate() returned into a plain dict.
        if hasattr(outcome, "to_json_dict"):
            return outcome.to_json_dict()
        if isinstance(outcome, dict):
            return outcome
        return {"success": bool(getattr(outcome, "success", False)), "result": str(outcome)}

    outcomes = [_as_json(batch.validate(expectation)) for expectation in expectations]
    passed = sum(1 for outcome in outcomes if bool(outcome.get("success", False)))

    evaluated = len(expectations)
    failed = evaluated - passed
    if evaluated:
        percent_ok = round((passed / evaluated) * 100, 2)
    else:
        percent_ok = 0.0

    statistics = {
        "evaluated_expectations": evaluated,
        "successful_expectations": passed,
        "unsuccessful_expectations": failed,
        "success_percent": percent_ok,
    }
    return {
        "success": failed == 0,
        "statistics": statistics,
        "expectation_results": outcomes,
    }


def summarize_result(dataset_name: str, result_json: Dict[str, Any]) -> Dict[str, Any]:
    """Flatten one validation result into a single summary row.

    Args:
        dataset_name: Label stored under the ``dataset`` key.
        result_json: Mapping with an optional ``success`` flag and an optional
            ``statistics`` mapping, in the shape returned by
            ``run_expectations``.

    Returns:
        A dict with the keys ``dataset``, ``success``, ``success_percent``
        (float rounded to two decimals), ``evaluated_expectations``,
        ``successful_expectations`` and ``unsuccessful_expectations`` (ints).

    A statistics entry that is present but ``None`` (for example a
    ``success_percent`` that could not be computed) is treated like a missing
    one and reported as zero instead of raising ``TypeError``; the same holds
    when ``statistics`` itself is ``None``.
    """
    stats = result_json.get("statistics") or {}

    def _stat(key: str, fallback):
        # Only None is replaced, so every other stored value keeps its
        # original conversion path.
        value = stats.get(key)
        return fallback if value is None else value

    return {
        "dataset": dataset_name,
        "success": bool(result_json.get("success", False)),
        "success_percent": round(float(_stat("success_percent", 0.0)), 2),
        "evaluated_expectations": int(_stat("evaluated_expectations", 0)),
        "successful_expectations": int(_stat("successful_expectations", 0)),
        "unsuccessful_expectations": int(_stat("unsuccessful_expectations", 0)),
    }

# ======================================================
# Expectations (the same ones – CDM Bronze)
# ======================================================
def expectations_orders():
    """Build the expectation list for the orders dataset.

    Checks, in order: ``order_id`` is non-null and unique, ``customer_id`` and
    ``created_at`` are non-null, and ``status`` belongs to the allowed set.
    """
    exp = gx.expectations
    allowed_statuses = [
        "delivered",
        "shipped",
        "canceled",
        "invoiced",
        "processing",
        "unavailable",
        "approved",
    ]

    checks = [
        exp.ExpectColumnValuesToNotBeNull(column="order_id"),
        exp.ExpectColumnValuesToBeUnique(column="order_id"),
    ]
    checks += [
        exp.ExpectColumnValuesToNotBeNull(column=required)
        for required in ("customer_id", "created_at")
    ]
    checks.append(
        exp.ExpectColumnValuesToBeInSet(column="status", value_set=allowed_statuses)
    )
    return checks


def expectations_order_items():
    """Build the expectation list for the order items dataset.

    Checks, in order: the key columns ``order_id`` / ``order_item_id`` are
    non-null and unique as a pair, ``product_id`` and ``seller_id`` are
    non-null, and ``item_price`` / ``freight_value`` are not negative.
    """
    exp = gx.expectations
    key_columns = ("order_id", "order_item_id")

    checks = [exp.ExpectColumnValuesToNotBeNull(column=key) for key in key_columns]
    checks.append(exp.ExpectCompoundColumnsToBeUnique(column_list=list(key_columns)))
    checks += [
        exp.ExpectColumnValuesToNotBeNull(column=reference)
        for reference in ("product_id", "seller_id")
    ]
    checks += [
        exp.ExpectColumnValuesToBeBetween(column=amount, min_value=0)
        for amount in ("item_price", "freight_value")
    ]
    return checks


def expectations_payments():
    """Build the expectation list for the payments dataset.

    Checks, in order: ``order_id`` / ``payment_seq`` are non-null and unique
    as a pair, ``payment_type`` is non-null and ``installments`` is not
    negative (both with ``mostly=0.98``), and ``payment_value`` is not
    negative.
    """
    exp = gx.expectations
    key_columns = ("order_id", "payment_seq")
    tolerance = 0.98

    checks = [exp.ExpectColumnValuesToNotBeNull(column=key) for key in key_columns]
    checks.append(exp.ExpectCompoundColumnsToBeUnique(column_list=list(key_columns)))
    checks.append(exp.ExpectColumnValuesToNotBeNull(column="payment_type", mostly=tolerance))
    checks.append(
        exp.ExpectColumnValuesToBeBetween(column="installments", min_value=0, mostly=tolerance)
    )
    checks.append(exp.ExpectColumnValuesToBeBetween(column="payment_value", min_value=0))
    return checks


def expectations_reviews():
    """Build the expectation list for the reviews dataset.

    Checks, in order: ``order_id`` is non-null (``mostly=0.98``) and ``score``
    lies between 1 and 5 (``mostly=0.95``).
    """
    exp = gx.expectations
    order_id_present = exp.ExpectColumnValuesToNotBeNull(column="order_id", mostly=0.98)
    score_in_range = exp.ExpectColumnValuesToBeBetween(
        column="score",
        min_value=1,
        max_value=5,
        mostly=0.95,
    )
    return [order_id_present, score_in_range]


def expectations_customers():
    """Build the expectation list for the customers dataset.

    Checks, in order: ``customer_id`` is non-null and unique, ``state`` is
    non-null and exactly two characters long (both with ``mostly=0.98``).
    """
    exp = gx.expectations
    id_column, state_column = "customer_id", "state"
    tolerance = 0.98
    return [
        exp.ExpectColumnValuesToNotBeNull(column=id_column),
        exp.ExpectColumnValuesToBeUnique(column=id_column),
        exp.ExpectColumnValuesToNotBeNull(column=state_column, mostly=tolerance),
        exp.ExpectColumnValueLengthsToBeBetween(
            column=state_column,
            min_value=2,
            max_value=2,
            mostly=tolerance,
        ),
    ]


def expectations_sellers():
    """Build the expectation list for the sellers dataset.

    Checks, in order: ``seller_id`` is non-null and unique, ``state`` is
    non-null and exactly two characters long (both with ``mostly=0.98``).
    """
    exp = gx.expectations
    checks = []

    # Identifier integrity.
    checks.append(exp.ExpectColumnValuesToNotBeNull(column="seller_id"))
    checks.append(exp.ExpectColumnValuesToBeUnique(column="seller_id"))

    # State code presence and length.
    state_rule = {"column": "state", "mostly": 0.98}
    checks.append(exp.ExpectColumnValuesToNotBeNull(**state_rule))
    checks.append(
        exp.ExpectColumnValueLengthsToBeBetween(min_value=2, max_value=2, **state_rule)
    )
    return checks


def expectations_products():
    """Build the expectation list for the products dataset.

    Checks, in order: ``product_id`` is non-null and unique, ``category_pt``
    is non-null (``mostly=0.95``), and each of ``weight_g``, ``length_cm``,
    ``height_cm`` and ``width_cm`` is not negative (``mostly=0.98``).
    """
    exp = gx.expectations
    identity_checks = [
        exp.ExpectColumnValuesToNotBeNull(column="product_id"),
        exp.ExpectColumnValuesToBeUnique(column="product_id"),
        exp.ExpectColumnValuesToNotBeNull(column="category_pt", mostly=0.95),
    ]
    measurement_checks = [
        exp.ExpectColumnValuesToBeBetween(column=measurement, min_value=0, mostly=0.98)
        for measurement in ("weight_g", "length_cm", "height_cm", "width_cm")
    ]
    return identity_checks + measurement_checks


def expectations_geolocation():
    """Build the expectation list for the geolocation dataset.

    Checks, in order: ``zip_prefix``, ``lat_avg`` and ``lng_avg`` are
    non-null; ``lat_avg`` lies in [-35, 6] and ``lng_avg`` in [-75, -30]
    (both range checks with ``mostly=0.98``).
    """
    exp = gx.expectations
    coordinate_bounds = (
        ("lat_avg", -35, 6),
        ("lng_avg", -75, -30),
    )

    checks = [
        exp.ExpectColumnValuesToNotBeNull(column=required)
        for required in ("zip_prefix", "lat_avg", "lng_avg")
    ]
    for coordinate, lower, upper in coordinate_bounds:
        checks.append(
            exp.ExpectColumnValuesToBeBetween(
                column=coordinate,
                min_value=lower,
                max_value=upper,
                mostly=0.98,
            )
        )
    return checks

# ======================================================
# Main
# ======================================================
if __name__ == "__main__":
    # Driver: validate every dataset listed below, then write the full JSON
    # results, the JSON summary and the Markdown summary into REPORT_DIR.
    context = gx.get_context()

    # Dataset name -> factory that builds its expectations. Insertion order
    # is the processing order.
    factories = {
        "orders": expectations_orders,
        "order_items": expectations_order_items,
        "payments": expectations_payments,
        "reviews": expectations_reviews,
        "customers": expectations_customers,
        "sellers": expectations_sellers,
        "products": expectations_products,
        "geolocation": expectations_geolocation,
    }
    jobs = [(name, DATASETS[name], factory) for name, factory in factories.items()]

    full_results = {}
    summary_rows = []

    for dataset_name, filename, exp_factory in jobs:
        print(f"Validando: {dataset_name}")

        # Load -> wrap in a batch -> evaluate this dataset's expectations.
        df = load_csv(filename)
        batch = make_batch_from_df(context, df, dataset_name)
        result = run_expectations(batch, exp_factory())

        full_results[dataset_name] = result
        summary_rows.append(summarize_result(dataset_name, result))

    # Report destinations.
    full_json = REPORT_DIR / "ge_cdm_bronze_full_results.json"
    summary_json = REPORT_DIR / "ge_cdm_bronze_summary.json"
    summary_md = REPORT_DIR / "ge_cdm_bronze_summary.md"

    save_json(full_results, full_json)
    save_json(summary_rows, summary_json)
    write_summary_md(summary_rows, summary_md)

    print("\nRelatórios gerados:")
    for report_path in (full_json, summary_json, summary_md):
        print("-", report_path)
