In [1]:
from pathlib import Path
import pandas as pd, json, os, glob

# Project directories.
# When the working directory is named "notebooks" its parent is used as the
# project root; otherwise the working directory itself is the root.
_cwd = Path.cwd()
if _cwd.name == "notebooks":
    PROJ = _cwd.parent
else:
    PROJ = _cwd

PROC = PROJ.joinpath("data", "processed")
RAW = PROJ.joinpath("data", "raw")
MODELS = PROJ.joinpath("models")
TOOLS = PROJ.joinpath("tools")
FIGS = PROC.joinpath("figs")

def exists(p):
    """Return True when `p` is a regular file containing at least one byte.

    Parameters
    ----------
    p : str or os.PathLike
        Path to check.

    Returns
    -------
    bool
        False for missing paths, empty files, directories, and paths that
        cannot be stat'ed (e.g. removed between the two checks).
    """
    path = Path(p)
    try:
        # is_file() rejects directories, which can report a non-zero st_size
        # and would otherwise be counted as a present, non-empty file.
        return path.is_file() and path.stat().st_size > 0
    except OSError:
        # The path vanished or became unreadable mid-check: treat as absent.
        return False

# --- Required files checklist ---
# Maps a bucket label to the list of paths that must all be present.
required = {
    "models": [
        MODELS / name
        for name in (
            "preprocess.joblib",
            "rf_treated.joblib",
            "rf_control.joblib",
            "meta.json",
        )
    ],
    "tools": [TOOLS / "score_contacts.py"],
    "processed csvs": [
        PROC / name
        for name in (
            "hillstrom_uplift_curve.csv",
            "hillstrom_uplift_deciles.csv",
            "policy_curve_email.csv",
            "policy_contact_list.csv",
            "policy_sensitivity.csv",
            "hillstrom_uplift_scores_test.csv",
            "nb_scored_test.csv",                # from notebook scoring of test-only
            "cli_sanity_contacts_test.csv",      # from CLI scoring of test-only
            "required_features_schema.csv",
            "scored_contacts.csv",               # your new "real" scoring
            "scored_contacts_with_flag.csv",     # same with should_contact flag
        )
    ],
}

# Maps a bucket label to glob patterns (as strings); a pattern is satisfied
# by at least one match, and none of these are mandatory.
optional_globs = {
    "figures (optional)": [
        str(FIGS / name)
        for name in (
            "policy_sensitivity_heatmap.png",
            "campaign_economics.png",
        )
    ],
    "packs / reports (optional)": [
        str(PROC / pattern)
        for pattern in (
            "policy_pack.xlsx",
            "reports/*.html",
            "contact_pack_*.xlsx",
            "contact_pack_*.md",
        )
    ],
}

# Print one line per required path, marked ✓ (present and non-empty) or ✗.
print("# CHECKLIST")
missing = False
for bucket, files in required.items():
    print(f"\n[{bucket}]")
    for f in files:
        ok = exists(f)
        if ok:
            print("  ✓", f)
        else:
            print("  ✗", f)
            missing = True  # remember that at least one required file is absent

# Optional artefacts: a pattern passes when any non-empty file matches it.
for bucket, patterns in optional_globs.items():
    print(f"\n[{bucket}]")
    for pat in patterns:
        matches = list(filter(exists, glob.glob(pat)))
        ok = bool(matches)
        if ok:
            print("  ✓", pat, "")
        else:
            print("  –", pat, "(none found)")

# --- Parity sanity: notebook vs CLI on TEST-ONLY ---
# Joins the two scored files on row_id and reports (a) the Pearson correlation
# of their uplift_hat columns and (b) the share of row_ids that both rankings
# place in their top-k.
print("\n# PARITY (TEST-ONLY)")
try:
    nb  = pd.read_csv(PROC/"nb_scored_test.csv")[["row_id","uplift_hat"]]
    cli = pd.read_csv(PROC/"cli_sanity_contacts_test.csv")[["row_id","uplift_hat"]].rename(columns={"uplift_hat":"uplift_cli"})
    cmp = nb.merge(cli, on="row_id", how="inner")
    print(f"Rows in common: {len(cmp)} / {len(nb)}")
    k = 1000
    if cmp.empty:
        # Nothing to correlate or rank; say so instead of printing NaN / 0.000.
        print("Parity check skipped: no row_id values in common")
    else:
        corr = cmp[["uplift_hat","uplift_cli"]].corr().iloc[0,1]
        # With fewer than k joined rows each top set holds at most len(cmp)
        # ids, so dividing by k would understate the overlap.
        top_n = min(k, len(cmp))
        nb_top  = set(cmp.sort_values("uplift_hat", ascending=False).head(top_n)["row_id"])
        cli_top = set(cmp.sort_values("uplift_cli",  ascending=False).head(top_n)["row_id"])
        overlap = len(nb_top & cli_top)/top_n
        print(f"Pearson corr: {corr:.6f}  |  Top-{top_n} overlap: {overlap:.3f}")
except Exception as e:
    # Best-effort check: any failure (e.g. a missing file or column) is
    # reported on stdout rather than aborting the cell.
    print("Parity check skipped:", e)


# CHECKLIST

[models]
  ✓ C:\Users\balla\Downloads\clv-uplift-optimizer-starter\clv-uplift-optimizer\models\preprocess.joblib
  ✓ C:\Users\balla\Downloads\clv-uplift-optimizer-starter\clv-uplift-optimizer\models\rf_treated.joblib
  ✓ C:\Users\balla\Downloads\clv-uplift-optimizer-starter\clv-uplift-optimizer\models\rf_control.joblib
  ✓ C:\Users\balla\Downloads\clv-uplift-optimizer-starter\clv-uplift-optimizer\models\meta.json

[tools]
  ✓ C:\Users\balla\Downloads\clv-uplift-optimizer-starter\clv-uplift-optimizer\tools\score_contacts.py

[processed csvs]
  ✓ C:\Users\balla\Downloads\clv-uplift-optimizer-starter\clv-uplift-optimizer\data\processed\hillstrom_uplift_curve.csv
  ✓ C:\Users\balla\Downloads\clv-uplift-optimizer-starter\clv-uplift-optimizer\data\processed\hillstrom_uplift_deciles.csv
  ✓ C:\Users\balla\Downloads\clv-uplift-optimizer-starter\clv-uplift-optimizer\data\processed\policy_curve_email.csv
  ✓ C:\Users\balla\Downloads\clv-uplift-optimizer-starter\clv-uplift-optimizer\

In [2]:
from pathlib import Path
import pandas as pd, json, joblib

# Project root: the parent of the working directory when that directory is
# named "notebooks", otherwise the working directory itself.
_cwd = Path.cwd()
PROJ = _cwd.parent if _cwd.name == "notebooks" else _cwd
PROC = PROJ.joinpath("data", "processed")

# Compare the notebook-scored file with the CLI-scored file, row by row.
nb  = pd.read_csv(PROC / "nb_scored_test.csv")[["row_id","uplift_hat"]]
cli = pd.read_csv(PROC / "cli_all_scored_test.csv")[["row_id","uplift_hat"]].rename(columns={"uplift_hat":"uplift_cli"})

# Inner join keeps only row_ids present in both files.
cmp = nb.merge(cli, on="row_id", how="inner")
print("Rows in common:", len(cmp), "/", len(nb))
print("Pearson corr:  ", round(cmp["uplift_hat"].corr(cmp["uplift_cli"]), 6))

k = 1000
# With fewer than k joined rows each top set holds at most len(cmp) ids, so
# the overlap is normalised by the number of rows actually ranked.
top_n = min(k, len(cmp))
nb_top  = set(cmp.sort_values("uplift_hat", ascending=False).head(top_n)["row_id"])
cli_top = set(cmp.sort_values("uplift_cli",  ascending=False).head(top_n)["row_id"])
# An empty join has no ranking to compare; report NaN instead of dividing by 0.
overlap = len(nb_top & cli_top) / top_n if top_n else float("nan")
print(f"Top-{top_n} overlap:", round(overlap, 3))


Rows in common: 16000 / 16000
Pearson corr:   1.0
Top-1000 overlap: 1.0
