## Clean scaffold + docs + dataset bootstrap.

In [1]:
from pathlib import Path
import textwrap, json

BASE = Path.cwd() / "cardiometrix"

# Folders we’ll use throughout the project
for p in [
    BASE / "data" / "raw",
    BASE / "data" / "processed",
    BASE / "data" / "external",
    BASE / "indices",
    BASE / "registry",
    BASE / "src",
    BASE / "docs",
    BASE / "notebooks",
]:
    p.mkdir(parents=True, exist_ok=True)

# --- PRD (short + focused) ---
prd = textwrap.dedent("""\
# CardioMetrix — PRD (Week 1)

## Objective
Early triage for:
- **Diabetes** (binary) — risk-factor model (no fasting glucose used by the model).
- **Hypertension** (binary) — may use SBP/DBP (clinical signal).

Outputs: calibrated probabilities, thresholded decisions, explanations, fairness slices.

## Success Metrics
- AUROC (per label) ≥ 0.85 (stretch), strong PR-AUC.
- Calibration (ECE/Brier) acceptable.
- Honest generalization via stratified splits; document fairness gaps.

## Ethics & Guardrails
- Decision support only, not medical advice.
- No leakage (esp. diabetes: exclude fasting_glucose as a feature).
- Group-aware thresholds for diabetes (by sex) with recall floor; fallback to global.

## Data
Public tabular datasets (no PII). Reproducible pipeline: raw → harmonized → labeled → splits.
""").strip()
(BASE / "docs" / "PRD.md").write_text(prd, encoding="utf-8")

# --- Data Dictionary (we’ll evolve it) ---
data_dict = textwrap.dedent("""\
column,type,units,allowed_range,missing_policy,notes
age,float,years,0-120,impute_median,
sex,category,,{M,F,NaN},impute_mode,one-hot later
bmi,float,kg/m^2,10-60,impute_median,compute later if height/weight present
sbp,float,mmHg,70-250,impute_median,
dbp,float,mmHg,40-150,impute_median,
tc,float,mg/dL,80-400,impute_median,optional
fasting_glucose,float,mg/dL,50-400,impute_median,NOT used by diabetes model (avoid leakage)
extra__pregnancies,float,count,0-20,impute_median,Pima only
extra__diabetespedigreefunction,float,score,0-3,impute_median,Pima only
source_dataset,category,,{pima_diabetes,uci_heart,uci_ckd},required,provenance
label_diabetes,int,,{0,1},derived,threshold policy documented
label_hypertension,int,,{0,1},derived,threshold policy documented
""").strip()
(BASE / "docs" / "DATA_DICTIONARY.csv").write_text(data_dict, encoding="utf-8")

# --- Label policy (deterministic & reproducible) ---
label_policy = textwrap.dedent("""\
# LABEL_POLICY.md

## Diabetes (label_diabetes)
1 if (HbA1c ≥ 6.5%) OR (fasting_glucose ≥ 126 mg/dL) OR dataset diagnosis flag == 1; else 0.
*Model features exclude fasting_glucose to avoid leakage.*

## Hypertension (label_hypertension)
1 if (SBP ≥ 140) OR (DBP ≥ 90); else 0. (Standard adult threshold, documented.)

## ASCVD proxy (optional educational regression target)
Sigmoid combination of age/sbp/tc/hdl → calibrated; not a clinical calculator.

## Hygiene
- Drop physiologically impossible values.
- Persist indices; no patient overlap across splits.
""").strip()
(BASE / "docs" / "LABEL_POLICY.md").write_text(label_policy, encoding="utf-8")

# --- Split strategy ---
split_doc = textwrap.dedent("""\
# SPLIT_STRATEGY.md

- Stratify on joint label: "00","01","10","11".
- Train 70% / Val 15% / Test 15%, fixed seed.
- Persist row indices to /indices/{train,val,test}.csv
""").strip()
(BASE / "docs" / "SPLIT_STRATEGY.md").write_text(split_doc, encoding="utf-8")

print("✅ Scaffold ready at:", BASE.resolve())
print("Docs written: PRD.md, DATA_DICTIONARY.csv, LABEL_POLICY.md, SPLIT_STRATEGY.md")


✅ Scaffold ready at: C:\Users\hp\Desktop\Cardio Metrix\cardiometrix
Docs written: PRD.md, DATA_DICTIONARY.csv, LABEL_POLICY.md, SPLIT_STRATEGY.md


Dataset manifest (what we expect to download)

In [3]:
# Step-1 / Cell 2: dataset manifest for reproducibility (filenames & column mapping hints)
from pathlib import Path
import json

BASE = Path.cwd() / "cardiometrix"
manifest = {
  "sources": [
    {
      "slug": "uciml/pima-indians-diabetes-database",
      "raw_file_expected": "diabetes.csv",
      "mapping_hints": {
        "Age":"age",
        "BMI":"bmi",
        "BloodPressure":"dbp",
        "Glucose":"fasting_glucose",
        "Pregnancies":"extra__pregnancies",
        "DiabetesPedigreeFunction":"extra__diabetespedigreefunction",
        "Outcome":"diag_flag_diabetes"
      }
    },
    {
      "slug": "mragpavank/heart-diseaseuci",
      "raw_file_expected": "heart.csv",
      "mapping_hints": {
        "age":"age",
        "sex":"sex (0=F,1=M → map to F/M)",
        "trestbps":"sbp",
        "chol":"tc",
        "target|num":"heart_label_raw"
      }
    },
    {
      "slug": "mansoordaku/ckdisease",
      "raw_file_expected": "ckd.csv",
      "mapping_hints": {
        "age":"age",
        "bp":"sbp (note: may be mean BP; document assumption)",
        "bgr":"fasting_glucose (random proxy)",
        "sc":"extra__sc",
        "htn":"extra__htn (yes/no)",
        "dm":"extra__dm (yes/no)",
        "classification":"ckd_raw (ckd/notckd)"
      }
    }
  ]
}
(BASE / "docs" / "DATASET_MANIFEST.json").write_text(json.dumps(manifest, indent=2), encoding="utf-8")
print("✅ Wrote docs/DATASET_MANIFEST.json")


✅ Wrote docs/DATASET_MANIFEST.json


In [4]:
from pathlib import Path
import pandas as pd

RAW = Path.cwd() / "cardiometrix" / "data" / "raw"

def check(name):
    fp = RAW / name
    print(f"{name}: {'FOUND' if fp.exists() else 'MISSING'}")
    if fp.exists():
        try:
            display(pd.read_csv(fp).head())
        except Exception as e:
            print("Preview error:", e)

for name in ["diabetes.csv", "heart.csv", "ckd.csv"]:
    check(name)


diabetes.csv: FOUND


Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


heart.csv: FOUND


Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


ckd.csv: FOUND


Unnamed: 0,id,age,bp,sg,al,su,rbc,pc,pcc,ba,...,pcv,wc,rc,htn,dm,cad,appet,pe,ane,classification
0,0,48.0,80.0,1.02,1.0,0.0,,normal,notpresent,notpresent,...,44,7800,5.2,yes,yes,no,good,no,no,ckd
1,1,7.0,50.0,1.02,4.0,0.0,,normal,notpresent,notpresent,...,38,6000,,no,no,no,good,no,no,ckd
2,2,62.0,80.0,1.01,2.0,3.0,normal,normal,notpresent,notpresent,...,31,7500,,no,yes,no,poor,no,yes,ckd
3,3,48.0,70.0,1.005,4.0,0.0,normal,abnormal,present,notpresent,...,32,6700,3.9,yes,no,no,poor,yes,yes,ckd
4,4,51.0,80.0,1.01,2.0,0.0,normal,normal,notpresent,notpresent,...,35,7300,4.6,no,no,no,good,no,no,ckd


## Step 2

— Imports, folders, schema, helpers

In [None]:
from pathlib import Path
import pandas as pd
import numpy as np

# Project paths
BASE = Path.cwd() / "cardiometrix"
RAW  = BASE / "data" / "raw"
PROC = BASE / "data" / "processed"
PROC.mkdir(parents=True, exist_ok=True)

# Core schema for the harmonized table (features only; labels come in Step-3)
CORE_COLS = [
    "age","sex","height_cm","weight_kg","bmi",
    "sbp","dbp","hr",
    "tc","hdl","ldl","tg",
    "fasting_glucose","hba1c",
    "egfr","ckd",
    "smoker","activity_level","family_history_dm","pregnant",
    "source_dataset"
]

# Numeric columns where we'll add missingness indicators and apply sanity bounds
NUM_COLS = [
    "age","height_cm","weight_kg","bmi",
    "sbp","dbp","hr",
    "tc","hdl","ldl","tg",
    "fasting_glucose","hba1c","egfr"
]

# Conservative physiological bounds; out-of-range → set to NaN (we won't silently clip)
BOUNDS = {
    "age": (0, 120),
    "height_cm": (120, 220),
    "weight_kg": (30, 250),
    "bmi": (10, 60),
    "sbp": (70, 250),
    "dbp": (40, 150),
    "hr":  (30, 200),
    "tc":  (80, 400),
    "hdl": (10, 120),
    "ldl": (30, 300),
    "tg":  (30, 1000),
    "fasting_glucose": (50, 400),
    "hba1c": (3.5, 15),
    "egfr": (5, 120),
}

def coerce_numeric(df, cols):
    """Best-effort numeric cast; non-numeric → NaN."""
    for c in cols:
        if c in df.columns:
            df[c] = pd.to_numeric(df[c], errors="coerce")
    return df

def apply_bounds(df):
    """Out-of-range numeric values → NaN."""
    for c, (lo, hi) in BOUNDS.items():
        if c in df.columns:
            bad = (df[c] < lo) | (df[c] > hi)
            df.loc[bad, c] = np.nan
    return df

def add_missing_indicators(df, cols):
    """For each numeric feature, add is_missing__<col> (0/1)."""
    for c in cols:
        if c in df.columns:
            df[f"is_missing__{c}"] = df[c].isna().astype(int)
        else:
            df[f"is_missing__{c}"] = 1
    return df

def ensure_core_columns(df):
    """Guarantee CORE_COLS exist and order columns; keep extras with 'extra__' prefix."""
    for c in CORE_COLS:
        if c not in df.columns:
            df[c] = np.nan
    ordered = CORE_COLS + [c for c in df.columns if c not in CORE_COLS]
    return df[ordered]


Map Pima (diabetes.csv) → schema

In [None]:
pima_fp = RAW / "diabetes.csv"
assert pima_fp.exists(), "Missing data/raw/diabetes.csv — re-run Step-1 downloads."

pima = pd.read_csv(pima_fp)

# Build harmonized view:
# NOTE: We DO keep fasting_glucose here for labeling later, but we won't use it as a diabetes model feature.
pima_m = pd.DataFrame({
    "age": pima.get("Age"),
    "sex": np.nan,                         # not provided
    "height_cm": np.nan,
    "weight_kg": np.nan,
    "bmi": pima.get("BMI"),
    "sbp": np.nan,
    "dbp": pima.get("BloodPressure"),
    "hr": np.nan,
    "tc": np.nan, "hdl": np.nan, "ldl": np.nan, "tg": np.nan,
    "fasting_glucose": pima.get("Glucose"),  # kept for labeling (not a feature for diabetes model)
    "hba1c": np.nan,
    "egfr": np.nan,
    "ckd": np.nan,                        # unknown in Pima
    "smoker": np.nan, "activity_level": np.nan, "family_history_dm": np.nan, "pregnant": np.nan,
    "source_dataset": "pima_diabetes"
})

# Attach useful extras for later modeling/EDA (namespaced)
extras = pima[["Pregnancies","DiabetesPedigreeFunction","Outcome"]].copy()
extras.columns = ["extra__pregnancies","extra__diabetespedigreefunction","extra__outcome"]
pima_m = pd.concat([pima_m, extras], axis=1)

# Clean + QA
pima_m = coerce_numeric(pima_m, NUM_COLS)
pima_m = apply_bounds(pima_m)
pima_m = add_missing_indicators(pima_m, NUM_COLS)
pima_m = ensure_core_columns(pima_m)

print("PIMA mapped shape:", pima_m.shape)
pima_m.head()


PIMA mapped shape: (768, 38)


Unnamed: 0,age,sex,height_cm,weight_kg,bmi,sbp,dbp,hr,tc,hdl,...,is_missing__sbp,is_missing__dbp,is_missing__hr,is_missing__tc,is_missing__hdl,is_missing__ldl,is_missing__tg,is_missing__fasting_glucose,is_missing__hba1c,is_missing__egfr
0,50.0,,,,33.6,,72.0,,,,...,1,0,1,1,1,1,1,0,1,1
1,31.0,,,,26.6,,66.0,,,,...,1,0,1,1,1,1,1,0,1,1
2,32.0,,,,23.3,,64.0,,,,...,1,0,1,1,1,1,1,0,1,1
3,21.0,,,,28.1,,66.0,,,,...,1,0,1,1,1,1,1,0,1,1
4,33.0,,,,43.1,,40.0,,,,...,1,0,1,1,1,1,1,0,1,1


Map UCI Heart (heart.csv) → schema

In [None]:
heart_fp = RAW / "heart.csv"
assert heart_fp.exists(), "Missing data/raw/heart.csv — re-run Step-1 downloads."

heart = pd.read_csv(heart_fp)

# Sex: 1=male, 0=female (map to 'M'/'F')
def map_sex(v):
    try:
        v = int(v)
        return "M" if v == 1 else "F"
    except Exception:
        return np.nan

sex_col = heart["sex"].map(map_sex) if "sex" in heart.columns else np.nan

heart_m = pd.DataFrame({
    "age": heart.get("age"),
    "sex": sex_col,
    "height_cm": np.nan,
    "weight_kg": np.nan,
    "bmi": np.nan,
    "sbp": heart.get("sbp") if "sbp" in heart.columns else heart.get("trestbps"),
    "dbp": np.nan,                        # dataset lacks DBP
    "hr": np.nan,                         # 'thalach' = max HR on stress test, not resting HR
    "tc": heart.get("tc") if "tc" in heart.columns else heart.get("chol"),
    "hdl": np.nan, "ldl": np.nan, "tg": np.nan,
    "fasting_glucose": np.nan,           # 'fbs' is a boolean flag, not a value
    "hba1c": np.nan,
    "egfr": np.nan,
    "ckd": np.nan,
    "smoker": np.nan, "activity_level": np.nan, "family_history_dm": np.nan, "pregnant": np.nan,
    "source_dataset": "uci_heart"
})

# Keep a handful of raw columns as extras (namespaced)
keep_cols = [c for c in ["cp","fbs","thalach","oldpeak","slope","ca","thal","target","num"] if c in heart.columns]
if keep_cols:
    h_ex = heart[keep_cols].copy()
    h_ex.columns = [f"extra__{c}" for c in keep_cols]
    heart_m = pd.concat([heart_m, h_ex], axis=1)

# Clean + QA
heart_m = coerce_numeric(heart_m, NUM_COLS)
heart_m = apply_bounds(heart_m)
heart_m = add_missing_indicators(heart_m, NUM_COLS)
heart_m = ensure_core_columns(heart_m)

print("HEART mapped shape:", heart_m.shape)
heart_m.head()


HEART mapped shape: (303, 43)


Unnamed: 0,age,sex,height_cm,weight_kg,bmi,sbp,dbp,hr,tc,hdl,...,is_missing__sbp,is_missing__dbp,is_missing__hr,is_missing__tc,is_missing__hdl,is_missing__ldl,is_missing__tg,is_missing__fasting_glucose,is_missing__hba1c,is_missing__egfr
0,63.0,M,,,,145.0,,,233.0,,...,0,1,1,0,1,1,1,1,1,1
1,37.0,M,,,,130.0,,,250.0,,...,0,1,1,0,1,1,1,1,1,1
2,41.0,F,,,,130.0,,,204.0,,...,0,1,1,0,1,1,1,1,1,1
3,56.0,M,,,,120.0,,,236.0,,...,0,1,1,0,1,1,1,1,1,1
4,57.0,F,,,,120.0,,,354.0,,...,0,1,1,0,1,1,1,1,1,1


— Map CKD (ckd.csv) → schema

In [None]:
ckd_fp = RAW / "ckd.csv"
assert ckd_fp.exists(), "Missing data/raw/ckd.csv — re-run Step-1 downloads."

ckd = pd.read_csv(ckd_fp)

# Normalize categorical yes/no-ish strings
def norm_str(x):
    return str(x).strip().lower() if pd.notna(x) else x

ckd_norm = ckd.copy()
for c in ckd_norm.columns:
    if ckd_norm[c].dtype == object:
        ckd_norm[c] = ckd_norm[c].apply(norm_str)

# classification → ckd flag (1=ckd, 0=notckd)
ckd_flag = None
if "classification" in ckd_norm.columns:
    ckd_flag = ckd_norm["classification"].map(lambda s: 1 if s=="ckd" else (0 if s=="notckd" else np.nan))

# bp column: some CKD sources use 'mean BP'; we treat it as SBP proxy and document this assumption.
ckd_m = pd.DataFrame({
    "age": ckd_norm.get("age"),
    "sex": np.nan,                          # usually missing in this CSV
    "height_cm": np.nan,
    "weight_kg": np.nan,
    "bmi": np.nan,
    "sbp": ckd_norm.get("bp"),
    "dbp": np.nan,
    "hr": np.nan,
    "tc": np.nan, "hdl": np.nan, "ldl": np.nan, "tg": np.nan,
    "fasting_glucose": ckd_norm.get("bgr"),  # random glucose; kept as numeric for EDA/labels
    "hba1c": np.nan,
    "egfr": np.nan,                          # not reliably derivable here
    "ckd": ckd_flag,
    "smoker": np.nan, "activity_level": np.nan, "family_history_dm": np.nan, "pregnant": np.nan,
    "source_dataset": "uci_ckd"
})

# Useful extras from CKD
extra_cols = [c for c in ["sc","htn","dm","bgr"] if c in ckd_norm.columns]
if extra_cols:
    ex = ckd_norm[extra_cols].copy()
    ex.columns = [f"extra__{c}" for c in extra_cols]
    ckd_m = pd.concat([ckd_m, ex], axis=1)

# Clean + QA
ckd_m = coerce_numeric(ckd_m, NUM_COLS)
ckd_m = apply_bounds(ckd_m)
ckd_m = add_missing_indicators(ckd_m, NUM_COLS)
ckd_m = ensure_core_columns(ckd_m)

print("CKD mapped shape:", ckd_m.shape)
ckd_m.head()


CKD mapped shape: (400, 39)


Unnamed: 0,age,sex,height_cm,weight_kg,bmi,sbp,dbp,hr,tc,hdl,...,is_missing__sbp,is_missing__dbp,is_missing__hr,is_missing__tc,is_missing__hdl,is_missing__ldl,is_missing__tg,is_missing__fasting_glucose,is_missing__hba1c,is_missing__egfr
0,48.0,,,,,80.0,,,,,...,0,1,1,1,1,1,1,0,1,1
1,7.0,,,,,,,,,,...,1,1,1,1,1,1,1,1,1,1
2,62.0,,,,,80.0,,,,,...,0,1,1,1,1,1,1,1,1,1
3,48.0,,,,,70.0,,,,,...,0,1,1,1,1,1,1,0,1,1
4,51.0,,,,,80.0,,,,,...,0,1,1,1,1,1,1,0,1,1


Concatenate → save processed/harmonized.csv

In [None]:
harmonized = pd.concat([pima_m, heart_m, ckd_m], ignore_index=True)

# Normalize sex values to {M,F,NaN}
harmonized["sex"] = harmonized["sex"].map(lambda x: x if x in ["M","F"] else np.nan)

# Save
out_fp = PROC / "harmonized.csv"
harmonized.to_csv(out_fp, index=False)
print("✅ Wrote:", out_fp, "shape:", harmonized.shape)

# Quick peek
harmonized.sample(min(5, len(harmonized)), random_state=42)


✅ Wrote: c:\Users\hp\Desktop\Cardio Metrix\cardiometrix\data\processed\harmonized.csv shape: (1471, 50)


Unnamed: 0,age,sex,height_cm,weight_kg,bmi,sbp,dbp,hr,tc,hdl,...,extra__thalach,extra__oldpeak,extra__slope,extra__ca,extra__thal,extra__target,extra__sc,extra__htn,extra__dm,extra__bgr
852,42.0,F,,,,102.0,,,265.0,,...,122.0,0.6,1.0,0.0,2.0,1.0,,,,
184,40.0,,,,27.6,,74.0,,,,...,,,,,,,,,,
1223,39.0,,,,,70.0,,,,,...,,,,,,,0.8,no,yes,121.0
67,54.0,,,,42.7,,92.0,,,,...,,,,,,,,,,
220,21.0,,,,34.6,,60.0,,,,...,,,,,,,,,,


QA summary (missingness, sources, quick validity)

In [None]:
df = pd.read_csv(PROC / "harmonized.csv")

print("Rows:", len(df), "| Columns:", df.shape[1])
print("\nSource counts:\n", df["source_dataset"].value_counts(dropna=False))

# Numeric missingness report
num_cols_present = [c for c in NUM_COLS if c in df.columns]
miss = (df[num_cols_present].isna().mean()*100).round(1).sort_values(ascending=False)
print("\nNumeric missingness (%):\n", miss)

# Sex distribution
print("\nSex distribution:\n", df["sex"].value_counts(dropna=False))

# Sanity spot-check: extreme outliers removed?
for c in ["age","sbp","dbp","bmi","fasting_glucose","tc"]:
    if c in df.columns:
        q = df[c].quantile([0.01,0.5,0.99])
        print(f"\n{c} quantiles (1%,50%,99%):\n{q}")


Rows: 1471 | Columns: 50

Source counts:
 source_dataset
pima_diabetes    768
uci_ckd          400
uci_heart        303
Name: count, dtype: int64

Numeric missingness (%):
 height_cm          100.0
weight_kg          100.0
tg                 100.0
ldl                100.0
hdl                100.0
hr                 100.0
egfr               100.0
hba1c              100.0
tc                  79.7
sbp                 58.2
dbp                 50.4
bmi                 48.6
fasting_glucose     24.7
age                  0.6
dtype: float64

Sex distribution:
 sex
NaN    1168
M       207
F        96
Name: count, dtype: int64

age quantiles (1%,50%,99%):
0.01    15.0
0.50    42.0
0.99    76.0
Name: age, dtype: float64

sbp quantiles (1%,50%,99%):
0.01     70.0
0.50    100.0
0.99    178.0
Name: sbp, dtype: float64

dbp quantiles (1%,50%,99%):
0.01     48.0
0.50     72.0
0.99    106.0
Name: dbp, dtype: float64

bmi quantiles (1%,50%,99%):
0.01    19.500
0.50    32.300
0.99    49.835
Name: bmi, dty

— Labels + EDA + Leakage checks (clean v2)

In [11]:
from pathlib import Path
import pandas as pd
import numpy as np

PROC = Path.cwd() / "cardiometrix" / "data" / "processed"
df = pd.read_csv(PROC / "harmonized.csv")

# --- Helpers from extras ---
# Pima's Outcome (1/0) as diagnosis hint
if "extra__outcome" in df.columns:
    df["diag_flag_diabetes"] = pd.to_numeric(df["extra__outcome"], errors="coerce")
else:
    df["diag_flag_diabetes"] = np.nan

# CKD flags (yes/no -> 1/0)
for raw_flag in ["extra__htn", "extra__dm"]:
    if raw_flag in df.columns:
        df[raw_flag] = df[raw_flag].map({"yes":1, "no":0, "1":1, "0":0}).astype("float")
    else:
        df[raw_flag] = np.nan

# --- Label policies (documented in docs/LABEL_POLICY.md) ---
# Diabetes = 1 if (HbA1c >= 6.5) OR (fasting_glucose >= 126) OR (diag_flag_diabetes == 1)
diab_by_hba1c = (df["hba1c"] >= 6.5)
diab_by_fpg   = (df["fasting_glucose"] >= 126)
diab_by_diag  = (df["diag_flag_diabetes"] == 1)
df["label_diabetes"] = (
    diab_by_hba1c.fillna(False) |
    diab_by_fpg.fillna(False)   |
    diab_by_diag.fillna(False)
).astype(int)

# Hypertension = 1 if (SBP >= 140) OR (DBP >= 90) OR CKD raw 'htn' flag == 1
htn_by_sbp = (df["sbp"] >= 140)
htn_by_dbp = (df["dbp"] >= 90)
htn_by_flag = (df["extra__htn"] == 1)
df["label_hypertension"] = (
    htn_by_sbp.fillna(False) |
    htn_by_dbp.fillna(False) |
    htn_by_flag.fillna(False)
).astype(int)

# Optional educational target: ASCVD proxy in [0,1] (NOT a clinical calc)
age = df["age"].clip(0,120)
sbp = df["sbp"].clip(70,250)
tc  = df["tc"].clip(80,400)
hdl = df["hdl"].clip(10,120)
age_c = (age - 55.0).fillna(0)
sbp_c = (sbp - 130.0).fillna(0)
tc_c  = (tc  - 200.0).fillna(0)
hdl_c = (hdl - 50.0 ).fillna(0)
logit = 0.03*age_c + 0.02*sbp_c + 0.005*tc_c - 0.01*hdl_c
df["ascvd_risk"] = (1/(1+np.exp(-logit))).astype(float)

# Persist
out_fp = PROC / "harmonized_labeled.csv"
df.to_csv(out_fp, index=False)
print("✅ Wrote:", out_fp, "shape:", df.shape)
display(df[["label_diabetes","label_hypertension","ascvd_risk"]].head())


✅ Wrote: c:\Users\hp\Desktop\Cardio Metrix\cardiometrix\data\processed\harmonized_labeled.csv shape: (1471, 54)


Unnamed: 0,label_diabetes,label_hypertension,ascvd_risk
0,1,0,0.46257
1,0,0,0.327393
2,1,0,0.334033
3,0,0,0.265027
4,1,0,0.34074


— EDA: source distribution, class balance, availability

In [None]:
from pathlib import Path
import pandas as pd
import numpy as np

PROC = Path.cwd() / "cardiometrix" / "data" / "processed"
df = pd.read_csv(PROC / "harmonized_labeled.csv")

print("Rows:", len(df))
print("\nSource distribution:\n", df["source_dataset"].value_counts(dropna=False))

print("\nClass balance:")
for lab in ["label_diabetes","label_hypertension"]:
    vc = df[lab].value_counts(normalize=True).mul(100).round(1).to_dict()
    print(f"  {lab}: {vc}")

print("\nASCVD proxy stats:")
print(df["ascvd_risk"].describe(percentiles=[0.1,0.25,0.5,0.75,0.9]).round(3))

print("\nFeature availability (% non-missing):")
num_cols = ["age","sbp","dbp","bmi","tc","hdl","ldl","tg","fasting_glucose","hba1c","egfr"]
avail = (1 - df[num_cols].isna().mean()).mul(100).round(1)
print(avail.sort_values(ascending=False))


Rows: 1471

Source distribution:
 source_dataset
pima_diabetes    768
uci_ckd          400
uci_heart        303
Name: count, dtype: int64

Class balance:
  label_diabetes: {0: 63.6, 1: 36.4}
  label_hypertension: {0: 79.3, 1: 20.7}

ASCVD proxy stats:
count    1471.000
mean        0.378
std         0.140
min         0.088
10%         0.255
25%         0.283
50%         0.338
75%         0.455
90%         0.582
max         0.869
Name: ascvd_risk, dtype: float64

Feature availability (% non-missing):
age                99.4
fasting_glucose    75.3
bmi                51.4
dbp                49.6
sbp                41.8
tc                 20.3
hdl                 0.0
ldl                 0.0
tg                  0.0
hba1c               0.0
egfr                0.0
dtype: float64


— Leakage scan (simple heuristics)

In [13]:
import pandas as pd
from pathlib import Path

PROC = Path.cwd() / "cardiometrix" / "data" / "processed"
df = pd.read_csv(PROC / "harmonized_labeled.csv")

# Look for suspicious label-like columns present as features
sus = [c for c in df.columns if any(k in c.lower() for k in ["label","outcome","target"])]
print("Suspicious columns (inspect manually):", sus)

# Correlation of extras with labels (flags—not proof of leakage)
cand_extras = [c for c in df.columns if c.startswith("extra__")]
corrs = {}
for c in cand_extras:
    try:
        cor = df[[c,"label_diabetes","label_hypertension"]].corr(numeric_only=True)
        corrs[c] = cor.iloc[0,1:].abs().max()
    except Exception:
        pass
corrs = pd.Series(corrs).sort_values(ascending=False)
print("\nTop extras correlated with labels (flag only):")
print(corrs.head(12))


Suspicious columns (inspect manually): ['extra__outcome', 'extra__target', 'label_diabetes', 'label_hypertension']

Top extras correlated with labels (flag only):
extra__htn                         1.000000
extra__outcome                     0.722649
extra__dm                          0.607034
extra__bgr                         0.468643
extra__sc                          0.278817
extra__pregnancies                 0.205168
extra__oldpeak                     0.203584
extra__diabetespedigreefunction    0.163762
extra__target                      0.125314
extra__thal                        0.099384
extra__slope                       0.088956
extra__fbs                         0.071498
dtype: float64


— Acceptance checks

In [None]:
from pathlib import Path
import pandas as pd
import numpy as np

PROC = Path.cwd() / "cardiometrix" / "data" / "processed"
df = pd.read_csv(PROC / "harmonized_labeled.csv")

# Labels exist and are 0/1
assert set(df["label_diabetes"].unique()) <= {0,1}, "label_diabetes not binary"
assert set(df["label_hypertension"].unique()) <= {0,1}, "label_hypertension not binary"

# ASCVD proxy bounds
assert df["ascvd_risk"].between(0,1).all(), "ascvd_risk out of [0,1] bounds"

print("✅ Step-3 acceptance checks passed.")
display(df[["source_dataset","age","sbp","dbp","fasting_glucose","label_diabetes","label_hypertension","ascvd_risk"]].sample(8, random_state=7))


✅ Step-3 acceptance checks passed.


Unnamed: 0,source_dataset,age,sbp,dbp,fasting_glucose,label_diabetes,label_hypertension,ascvd_risk
71,pima_diabetes,26.0,,64.0,139.0,1,0,0.295254
259,pima_diabetes,51.0,,76.0,155.0,1,0,0.470036
363,pima_diabetes,67.0,,78.0,146.0,1,0,0.58904
185,pima_diabetes,41.0,,68.0,194.0,1,0,0.396517
1016,uci_heart,54.0,192.0,,,0,1,0.835484
412,pima_diabetes,22.0,,84.0,143.0,1,0,0.270912
1370,uci_ckd,73.0,,,127.0,1,0,0.631812
715,pima_diabetes,34.0,,50.0,187.0,1,0,0.347511


## Step-4

— Load labeled data & build stratify key

In [17]:
from pathlib import Path
import pandas as pd
import numpy as np

BASE = Path.cwd() / "cardiometrix"
PROC = BASE / "data" / "processed"
IDX  = BASE / "indices"
IDX.mkdir(parents=True, exist_ok=True)

df = pd.read_csv(PROC / "harmonized_labeled.csv")
print("Labeled shape:", df.shape)

# Joint label key: "00","01","10","11" for (diabetes, hypertension)
y_d = df["label_diabetes"].astype(int)
y_h = df["label_hypertension"].astype(int)
strat_key = (y_d.astype(str) + y_h.astype(str))
print("Stratify key counts:\n", strat_key.value_counts().sort_index())


Labeled shape: (1471, 54)
Stratify key counts:
 00    756
01    180
10    411
11    124
Name: count, dtype: int64


— Stratified split (70/15/15) & save indices


In [18]:
from sklearn.model_selection import train_test_split

SEED = 42
idx_all = np.arange(len(df))

# Train vs Temp (Val+Test)
idx_train, idx_temp, strat_train, strat_temp = train_test_split(
    idx_all, strat_key.values,
    test_size=0.30, random_state=SEED, stratify=strat_key.values
)

# Val vs Test (split temp in half -> 15%/15%)
idx_val, idx_test, strat_val, strat_test = train_test_split(
    idx_temp, strat_temp,
    test_size=0.50, random_state=SEED, stratify=strat_temp
)

# Persist to /indices
pd.Series(idx_train).to_csv(IDX / "train_idx.csv", index=False, header=False)
pd.Series(idx_val).to_csv(IDX / "val_idx.csv",   index=False, header=False)
pd.Series(idx_test).to_csv(IDX / "test_idx.csv", index=False, header=False)

print("✅ Saved indices to:", IDX.resolve())
print(f"Sizes → train: {len(idx_train)} | val: {len(idx_val)} | test: {len(idx_test)}")


✅ Saved indices to: C:\Users\hp\Desktop\Cardio Metrix\cardiometrix\indices
Sizes → train: 1029 | val: 221 | test: 221


— Sanity checks (distributions + overlap)

In [19]:
import pandas as pd

def read_idx(name): 
    return pd.read_csv(IDX / f"{name}_idx.csv", header=None)[0].tolist()

idx_train = read_idx("train")
idx_val   = read_idx("val")
idx_test  = read_idx("test")

splits = {
    "train": df.iloc[idx_train].copy(),
    "val":   df.iloc[idx_val].copy(),
    "test":  df.iloc[idx_test].copy(),
}

def summarize(name, d):
    print(f"\n=== {name.upper()} ===")
    print("Rows:", len(d))
    # joint label distribution
    jk = (d["label_diabetes"].astype(int).astype(str) + d["label_hypertension"].astype(int).astype(str))
    print("Joint label (%)", (jk.value_counts(normalize=True)*100).round(1).sort_index().to_dict())
    # individual label balance
    for col in ["label_diabetes","label_hypertension"]:
        print(f"{col} (%)", (d[col].value_counts(normalize=True)*100).round(1).to_dict())
    # source distribution
    print("Sources (%)", (d["source_dataset"].value_counts(normalize=True)*100).round(1).to_dict())

for k, d in splits.items():
    summarize(k, d)

# Overlap check
overlap_tv = set(idx_train) & set(idx_val)
overlap_tt = set(idx_train) & set(idx_test)
overlap_vt = set(idx_val) & set(idx_test)
print("\nOverlaps (should be 0 0 0):", len(overlap_tv), len(overlap_tt), len(overlap_vt))



=== TRAIN ===
Rows: 1029
Joint label (%) {'00': 51.4, '01': 12.2, '10': 27.9, '11': 8.5}
label_diabetes (%) {0: 63.7, 1: 36.3}
label_hypertension (%) {0: 79.3, 1: 20.7}
Sources (%) {'pima_diabetes': 53.4, 'uci_ckd': 26.9, 'uci_heart': 19.6}

=== VAL ===
Rows: 221
Joint label (%) {'00': 51.1, '01': 12.2, '10': 28.1, '11': 8.6}
label_diabetes (%) {0: 63.3, 1: 36.7}
label_hypertension (%) {0: 79.2, 1: 20.8}
Sources (%) {'pima_diabetes': 48.4, 'uci_ckd': 29.4, 'uci_heart': 22.2}

=== TEST ===
Rows: 221
Joint label (%) {'00': 51.6, '01': 12.2, '10': 28.1, '11': 8.1}
label_diabetes (%) {0: 63.8, 1: 36.2}
label_hypertension (%) {0: 79.6, 1: 20.4}
Sources (%) {'pima_diabetes': 50.2, 'uci_ckd': 26.2, 'uci_heart': 23.5}

Overlaps (should be 0 0 0): 0 0 0


— Materialize split CSVs for quick loading later

In [20]:
for name, idxs in [("train", idx_train), ("val", idx_val), ("test", idx_test)]:
    out = df.iloc[idxs].copy()
    out.to_csv(PROC / f"harmonized_labeled__{name}.csv", index=False)
    print(f"✅ Wrote {name} split:", (PROC / f"harmonized_labeled__{name}.csv").resolve(), "shape:", out.shape)


✅ Wrote train split: C:\Users\hp\Desktop\Cardio Metrix\cardiometrix\data\processed\harmonized_labeled__train.csv shape: (1029, 54)
✅ Wrote val split: C:\Users\hp\Desktop\Cardio Metrix\cardiometrix\data\processed\harmonized_labeled__val.csv shape: (221, 54)
✅ Wrote test split: C:\Users\hp\Desktop\Cardio Metrix\cardiometrix\data\processed\harmonized_labeled__test.csv shape: (221, 54)


### Rolling into Week-2. We’ll keep it laser-focused and clean:
* Step-1: Target-specific feature specs + preprocessors (no leakage).
* Step-2: Train LogReg (baseline) and XGBoost (primary) on train; eval on val.
* Step-3: Calibrate with isotonic (val) + pick thresholds (val) + evaluate on test.
* Step-4: Persist artifacts for the Flask app.

— Feature specs (no leakage) + preprocessors

In [21]:
from pathlib import Path
import pandas as pd, joblib

BASE = Path.cwd() / "cardiometrix"
PROC = BASE / "data" / "processed"
REG  = BASE / "registry"
REG.mkdir(parents=True, exist_ok=True)

df = pd.read_csv(PROC / "harmonized_labeled.csv")

# Diabetes: exclude fasting_glucose to avoid label leakage
spec = {
    "diabetes": {
        "NUM": ["age","bmi","dbp","extra__pregnancies","extra__diabetespedigreefunction"],
        "CAT": ["sex","source_dataset"]
    },
    "hypertension": {
        "NUM": ["age","bmi","sbp","dbp"],
        "CAT": ["sex","source_dataset"]
    }
}
joblib.dump(spec, REG / "feature_spec_per_target.joblib")
print("Saved feature spec:", (REG / "feature_spec_per_target.joblib").resolve())
spec


Saved feature spec: C:\Users\hp\Desktop\Cardio Metrix\cardiometrix\registry\feature_spec_per_target.joblib


{'diabetes': {'NUM': ['age',
   'bmi',
   'dbp',
   'extra__pregnancies',
   'extra__diabetespedigreefunction'],
  'CAT': ['sex', 'source_dataset']},
 'hypertension': {'NUM': ['age', 'bmi', 'sbp', 'dbp'],
  'CAT': ['sex', 'source_dataset']}}

— Build preprocessors (impute + onehot + scale numerics)

In [22]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
import joblib

def make_preprocessor(num_cols, cat_cols):
    num = Pipeline([("impute", SimpleImputer(strategy="median")),
                    ("scale",  StandardScaler())])
    cat = Pipeline([("impute", SimpleImputer(strategy="most_frequent")),
                    ("onehot", OneHotEncoder(handle_unknown="ignore", sparse_output=False))])
    return ColumnTransformer([("num", num, num_cols), ("cat", cat, cat_cols)], remainder="drop")

prep_diab = make_preprocessor(spec["diabetes"]["NUM"], spec["diabetes"]["CAT"])
prep_htn  = make_preprocessor(spec["hypertension"]["NUM"], spec["hypertension"]["CAT"])

joblib.dump(prep_diab, REG / "preprocessor__diabetes.joblib")
joblib.dump(prep_htn,  REG / "preprocessor__hypertension.joblib")
print("Saved preprocessors (unfitted).")


Saved preprocessors (unfitted).


— Materialize split frames & (re)fit preprocessors on TRAIN only

In [23]:
import pandas as pd

def read_idx(name):
    return pd.read_csv(BASE / "indices" / f"{name}_idx.csv", header=None)[0].tolist()

idx_tr, idx_va, idx_te = read_idx("train"), read_idx("val"), read_idx("test")

cols_d = spec["diabetes"]["NUM"] + spec["diabetes"]["CAT"]
cols_h = spec["hypertension"]["NUM"] + spec["hypertension"]["CAT"]

df_tr = pd.read_csv(PROC / "harmonized_labeled__train.csv")
df_va = pd.read_csv(PROC / "harmonized_labeled__val.csv")
df_te = pd.read_csv(PROC / "harmonized_labeled__test.csv")

# Fit preprocessors on train
prep_diab_f = joblib.load(REG / "preprocessor__diabetes.joblib")
prep_htn_f  = joblib.load(REG / "preprocessor__hypertension.joblib")

Xtr_d = prep_diab_f.fit_transform(df_tr[cols_d]); ytr_d = df_tr["label_diabetes"].astype(int).values
Xva_d = prep_diab_f.transform(df_va[cols_d]);      yva_d = df_va["label_diabetes"].astype(int).values
Xte_d = prep_diab_f.transform(df_te[cols_d]);      yte_d = df_te["label_diabetes"].astype(int).values

Xtr_h = prep_htn_f.fit_transform(df_tr[cols_h]);   ytr_h = df_tr["label_hypertension"].astype(int).values
Xva_h = prep_htn_f.transform(df_va[cols_h]);       yva_h = df_va["label_hypertension"].astype(int).values
Xte_h = prep_htn_f.transform(df_te[cols_h]);       yte_h = df_te["label_hypertension"].astype(int).values

# Save the *fitted* preprocessors we will ship with the app
joblib.dump(prep_diab_f, REG / "prod_preprocessor__diabetes.joblib")
joblib.dump(prep_htn_f,  REG / "prod_preprocessor__hypertension.joblib")
print("Saved fitted preprocessors for production.")


Saved fitted preprocessors for production.


— Train LR baseline + XGBoost primary; validate

In [24]:
# W2 / S2 / Cell 1 — Metrics helper
from sklearn.metrics import roc_auc_score, average_precision_score, brier_score_loss
from sklearn.calibration import calibration_curve
import numpy as np

def eval_binary(y_true, y_prob, name):
    y_prob = np.clip(y_prob, 1e-7, 1-1e-7)
    auroc = roc_auc_score(y_true, y_prob)
    ap    = average_precision_score(y_true, y_prob)
    brier = brier_score_loss(y_true, y_prob)
    pt, pp = calibration_curve(y_true, y_prob, n_bins=10, strategy="uniform")
    ece = np.abs(pp - pt).mean()
    print(f"{name} → AUROC={auroc:.3f} | PR-AUC={ap:.3f} | Brier={brier:.3f} | ECE≈{ece:.3f}")
    return dict(auroc=auroc, ap=ap, brier=brier, ece=ece)


In [25]:
# W2 / S2 / Cell 2 — Logistic Regression baselines
from sklearn.linear_model import LogisticRegression

lr_d = LogisticRegression(max_iter=500, class_weight="balanced").fit(Xtr_d, ytr_d)
lr_h = LogisticRegression(max_iter=500, class_weight="balanced").fit(Xtr_h, ytr_h)

eval_binary(yva_d, lr_d.predict_proba(Xva_d)[:,1], "LR[diabetes] (val)")
eval_binary(yva_h, lr_h.predict_proba(Xva_h)[:,1], "LR[hypertension] (val)")


LR[diabetes] (val) → AUROC=0.818 | PR-AUC=0.664 | Brier=0.176 | ECE≈0.116
LR[hypertension] (val) → AUROC=0.931 | PR-AUC=0.744 | Brier=0.123 | ECE≈0.200


{'auroc': 0.9314906832298137,
 'ap': 0.7441619096094235,
 'brier': 0.12312573482474788,
 'ece': np.float64(0.20007698555535242)}

In [26]:
# W2 / S2 / Cell 3 — XGBoost primary models
from xgboost import XGBClassifier

xgb_d = XGBClassifier(
    n_estimators=400, max_depth=3, learning_rate=0.07,
    subsample=0.9, colsample_bytree=0.9, reg_lambda=1.0,
    objective="binary:logistic", eval_metric="logloss",
    n_jobs=-1, tree_method="hist"
).fit(Xtr_d, ytr_d)

xgb_h = XGBClassifier(
    n_estimators=400, max_depth=4, learning_rate=0.05,
    subsample=0.9, colsample_bytree=0.8, reg_lambda=1.0,
    objective="binary:logistic", eval_metric="logloss",
    n_jobs=-1, tree_method="hist"
).fit(Xtr_h, ytr_h)

eval_binary(yva_d, xgb_d.predict_proba(Xva_d)[:,1], "XGB[diabetes] (val)")
eval_binary(yva_h, xgb_h.predict_proba(Xva_h)[:,1], "XGB[hypertension] (val)")


XGB[diabetes] (val) → AUROC=0.793 | PR-AUC=0.638 | Brier=0.179 | ECE≈0.089
XGB[hypertension] (val) → AUROC=0.965 | PR-AUC=0.881 | Brier=0.062 | ECE≈0.137


{'auroc': 0.9652795031055901,
 'ap': 0.881333206245507,
 'brier': 0.06191158009826589,
 'ece': np.float64(0.13656171593339445)}

— Calibration + thresholds + test evaluation

In [27]:
# W2 / S3 / Cell 1 — Isotonic calibration on validation
from sklearn.isotonic import IsotonicRegression

pva_d_raw = xgb_d.predict_proba(Xva_d)[:,1]
pva_h_raw = xgb_h.predict_proba(Xva_h)[:,1]

iso_d = IsotonicRegression(out_of_bounds="clip").fit(pva_d_raw, yva_d)
iso_h = IsotonicRegression(out_of_bounds="clip").fit(pva_h_raw, yva_h)

# Calibrated probs on val/test
pva_d = iso_d.predict(pva_d_raw)
pva_h = iso_h.predict(pva_h_raw)
pte_d = iso_d.predict(xgb_d.predict_proba(Xte_d)[:,1])
pte_h = iso_h.predict(xgb_h.predict_proba(Xte_h)[:,1])

print("Calibrated (val):")
_ = eval_binary(yva_d, pva_d, "XGB[diabetes] cal")
_ = eval_binary(yva_h, pva_h, "XGB[hypertension] cal")


Calibrated (val):
XGB[diabetes] cal → AUROC=0.811 | PR-AUC=0.648 | Brier=0.165 | ECE≈0.000
XGB[hypertension] cal → AUROC=0.970 | PR-AUC=0.864 | Brier=0.055 | ECE≈0.000


In [28]:
# W2 / S3 / Cell 2 — Pick thresholds on validation (maximize F1)
from sklearn.metrics import f1_score, classification_report, confusion_matrix
import numpy as np, json

def pick_thr_max_f1(y, p):
    best_t, best_f1 = 0.5, -1
    for t in np.linspace(0.05, 0.95, 19):
        f1 = f1_score(y, (p>=t).astype(int), zero_division=0)
        if f1 > best_f1:
            best_t, best_f1 = float(t), float(f1)
    return best_t, best_f1

thr_d, f1_d = pick_thr_max_f1(yva_d, pva_d)
thr_h, f1_h = pick_thr_max_f1(yva_h, pva_h)
print(f"Chosen thresholds (val): diabetes={thr_d:.2f} (F1={f1_d:.3f}) | htn={thr_h:.2f} (F1={f1_h:.3f})")

# Test evaluation
print("\nTEST — Diabetes")
yhat_d = (pte_d >= thr_d).astype(int)
print(confusion_matrix(yte_d, yhat_d))
print(classification_report(yte_d, yhat_d, digits=3, zero_division=0))

print("\nTEST — Hypertension")
yhat_h = (pte_h >= thr_h).astype(int)
print(confusion_matrix(yte_h, yhat_h))
print(classification_report(yte_h, yhat_h, digits=3, zero_division=0))


Chosen thresholds (val): diabetes=0.20 (F1=0.688) | htn=0.35 (F1=0.804)

TEST — Diabetes
[[81 60]
 [11 69]]
              precision    recall  f1-score   support

           0      0.880     0.574     0.695       141
           1      0.535     0.863     0.660        80

    accuracy                          0.679       221
   macro avg      0.708     0.718     0.678       221
weighted avg      0.755     0.679     0.683       221


TEST — Hypertension
[[164  12]
 [  9  36]]
              precision    recall  f1-score   support

           0      0.948     0.932     0.940       176
           1      0.750     0.800     0.774        45

    accuracy                          0.905       221
   macro avg      0.849     0.866     0.857       221
weighted avg      0.908     0.905     0.906       221



— Persist “production” artifacts (for Flask)

In [30]:
# W2 / S4 — Save everything we need to serve predictions
import joblib, json
joblib.dump(xgb_d, REG / "prod_xgb__diabetes.joblib")
joblib.dump(xgb_h, REG / "prod_xgb__hypertension.joblib")
joblib.dump(iso_d, REG / "prod_calibrator__diabetes.joblib")
joblib.dump(iso_h, REG / "prod_calibrator__hypertension.joblib")

policy = {
    "diabetes": {"threshold_global": float(thr_d)},
    "hypertension": {"threshold_global": float(thr_h)}
}
with open(REG / "prod_thresholds.json","w") as f:
    json.dump(policy, f, indent=2)

print("✅ Saved production artifacts to registry/:")
for p in ["prod_preprocessor__diabetes.joblib","prod_preprocessor__hypertension.joblib",
          "prod_xgb__diabetes.joblib","prod_xgb__hypertension.joblib",
          "prod_calibrator__diabetes.joblib","prod_calibrator__hypertension.joblib",
          "prod_thresholds.json"]:
    print(" -", (REG / p).name)


✅ Saved production artifacts to registry/:
 - prod_preprocessor__diabetes.joblib
 - prod_preprocessor__hypertension.joblib
 - prod_xgb__diabetes.joblib
 - prod_xgb__hypertension.joblib
 - prod_calibrator__diabetes.joblib
 - prod_calibrator__hypertension.joblib
 - prod_thresholds.json


In [32]:
# W4-1: create Flask API (predictor + app) under cardiometrix/app/
from pathlib import Path
import textwrap, json

BASE = Path.cwd() / "cardiometrix"
APP  = BASE / "app"
REG  = BASE / "registry"
APP.mkdir(parents=True, exist_ok=True)

# --- predictor.py: loads artifacts once; validates + predicts; optional SHAP ---
predictor_py = textwrap.dedent("""\
import json
from pathlib import Path
from typing import Dict, Any, List, Tuple

import numpy as np
import joblib

# Optional (SHAP): if not installed, we degrade gracefully
try:
    import shap
    HAS_SHAP = True
except Exception:
    HAS_SHAP = False

# ---------- Paths ----------
BASE = Path(__file__).resolve().parents[1]
REG  = BASE / "registry"

# ---------- Load artifacts (once) ----------
# Fitted preprocessors
PRE_D = joblib.load(REG / "prod_preprocessor__diabetes.joblib")
PRE_H = joblib.load(REG / "prod_preprocessor__hypertension.joblib")

# XGB models
XGB_D = joblib.load(REG / "prod_xgb__diabetes.joblib")
XGB_H = joblib.load(REG / "prod_xgb__hypertension.joblib")

# Isotonic calibrators
ISO_D = joblib.load(REG / "prod_calibrator__diabetes.joblib")
ISO_H = joblib.load(REG / "prod_calibrator__hypertension.joblib")

# Threshold policy
with open(REG / "prod_thresholds.json", "r") as f:
    THRESH = json.load(f)

# ---------- Feature specs (must match training) ----------
SPEC = joblib.load(REG / "feature_spec_per_target.joblib")
COLS_D = SPEC["diabetes"]["NUM"] + SPEC["diabetes"]["CAT"]
COLS_H = SPEC["hypertension"]["NUM"] + SPEC["hypertension"]["CAT"]

# ---------- Utilities ----------
def _as_float(x):
    try:
        return float(x)
    except Exception:
        return np.nan

def _norm_sex(x):
    if x is None: return np.nan
    s = str(x).strip().upper()
    if s in ["M","MALE","1"]: return "M"
    if s in ["F","FEMALE","0"]: return "F"
    return np.nan

def _default_source(x):
    s = "external"
    if x is None or str(x).strip() == "":
        return s
    return str(x)

def _build_row(payload: Dict[str, Any], target: str) -> Dict[str, Any]:
    # Minimal schema: we only request what each target needs (others are ignored)
    if target == "diabetes":
        row = {
            "age": _as_float(payload.get("age")),
            "bmi": _as_float(payload.get("bmi")),
            "dbp": _as_float(payload.get("dbp")),
            "extra__pregnancies": _as_float(payload.get("extra__pregnancies")),
            "extra__diabetespedigreefunction": _as_float(payload.get("extra__diabetespedigreefunction")),
            "sex": _norm_sex(payload.get("sex")),
            "source_dataset": _default_source(payload.get("source_dataset")),
        }
        # Fill any missing keys expected by preprocessor
        for k in COLS_D:
            if k not in row:
                row[k] = np.nan
        return row

    elif target == "hypertension":
        row = {
            "age": _as_float(payload.get("age")),
            "bmi": _as_float(payload.get("bmi")),
            "sbp": _as_float(payload.get("sbp")),
            "dbp": _as_float(payload.get("dbp")),
            "sex": _norm_sex(payload.get("sex")),
            "source_dataset": _default_source(payload.get("source_dataset")),
        }
        for k in COLS_H:
            if k not in row:
                row[k] = np.nan
        return row

    else:
        raise ValueError("Unknown target: " + target)

def _prep_matrix(row: Dict[str, Any], target: str) -> np.ndarray:
    import pandas as pd
    if target == "diabetes":
        df = pd.DataFrame([row], columns=COLS_D)
        X = PRE_D.transform(df)
    else:
        df = pd.DataFrame([row], columns=COLS_H)
        X = PRE_H.transform(df)
    return X

def _calibrated_prob(model, calibrator, X: np.ndarray) -> float:
    # XGBoost predict_proba → isotonic predict → scalar
    p_raw = model.predict_proba(X)[:, 1]
    p_cal = calibrator.predict(p_raw)
    return float(np.clip(p_cal[0], 1e-7, 1 - 1e-7))

def _feat_names(pre, num: List[str], cat: List[str]) -> List[str]:
    # Build post-transform feature names for SHAP mapping
    ohe = pre.named_transformers_["cat"].named_steps["onehot"]
    cat_names = list(ohe.get_feature_names_out(cat))
    return list(num) + cat_names

# Lazily build SHAP explainers (small inputs → fast)
_EXPLAINERS = {"diabetes": None, "hypertension": None}
_FEATNAMES  = {"diabetes": None, "hypertension": None}

def _get_explainer(target: str):
    if not HAS_SHAP:
        return None, None
    if target == "diabetes":
        if _EXPLAINERS["diabetes"] is None:
            _EXPLAINERS["diabetes"] = shap.TreeExplainer(XGB_D)
            _FEATNAMES["diabetes"]   = _feat_names(PRE_D, SPEC["diabetes"]["NUM"], SPEC["diabetes"]["CAT"])
        return _EXPLAINERS["diabetes"], _FEATNAMES["diabetes"]
    else:
        if _EXPLAINERS["hypertension"] is None:
            _EXPLAINERS["hypertension"] = shap.TreeExplainer(XGB_H)
            _FEATNAMES["hypertension"]  = _feat_names(PRE_H, SPEC["hypertension"]["NUM"], SPEC["hypertension"]["CAT"])
        return _EXPLAINERS["hypertension"], _FEATNAMES["hypertension"]

def _top_factors(target: str, X: np.ndarray, k: int = 5) -> List[Tuple[str, float]]:
    expl, names = _get_explainer(target)
    if expl is None:
        return []
    vals = expl.shap_values(X)  # shape (1, n_features)
    if isinstance(vals, list):  # safety for some xgb versions
        vals = vals[0]
    vals = np.abs(vals).reshape(-1)
    idx = np.argsort(-vals)[:k]
    out = []
    for i in idx:
        out.append((names[i], float(vals[i])))
    return out

def predict(payload: Dict[str, Any]) -> Dict[str, Any]:
    \"\"\"Run both targets (diabetes, hypertension) on one payload.
    Expects keys like: age, bmi, sbp, dbp, sex, extra__pregnancies, extra__diabetespedigreefunction, source_dataset(optional).
    Missing values allowed; preprocessors will impute.
    \"\"\"
    results = {}

    # ---- Diabetes ----
    row_d = _build_row(payload, "diabetes")
    X_d   = _prep_matrix(row_d, "diabetes")
    p_d   = _calibrated_prob(XGB_D, ISO_D, X_d)
    thr_d = float(THRESH["diabetes"]["threshold_global"])
    y_d   = int(p_d >= thr_d)
    results["diabetes"] = {
        "prob": p_d,
        "threshold": thr_d,
        "decision": y_d,
        "top_factors": _top_factors("diabetes", X_d, k=5)
    }

    # ---- Hypertension ----
    row_h = _build_row(payload, "hypertension")
    X_h   = _prep_matrix(row_h, "hypertension")
    p_h   = _calibrated_prob(XGB_H, ISO_H, X_h)
    thr_h = float(THRESH["hypertension"]["threshold_global"])
    y_h   = int(p_h >= thr_h)
    results["hypertension"] = {
        "prob": p_h,
        "threshold": thr_h,
        "decision": y_h,
        "top_factors": _top_factors("hypertension", X_h, k=5)
    }

    # Echo back sanitized/normalized inputs for traceability
    results["input_used"] = {
        "diabetes_cols": COLS_D,
        "hypertension_cols": COLS_H
    }
    return results
""")

# --- app_flask.py: Flask server wiring /predict endpoint ---
app_py = textwrap.dedent("""\
from flask import Flask, request, jsonify
from flask_cors import CORS

from .predictor import predict

app = Flask(__name__)
CORS(app)

@app.get("/health")
def health():
    return jsonify({"status":"ok"}), 200

@app.post("/predict")
def predict_endpoint():
    try:
        payload = request.get_json(force=True) or {}
    except Exception:
        return jsonify({"error":"Invalid JSON payload"}), 400

    # Minimal input validation: must include age and at least one of sbp/dbp for HTN; and bmi/dbp for diabetes
    missing = []
    if "age" not in payload: missing.append("age")
    if ("sbp" not in payload) and ("dbp" not in payload): 
        # allow it, but warn; hypertension will impute NaN → weaker prediction
        pass
    if "bmi" not in payload:
        # allow; imputed, but we note it
        pass

    if missing:
        return jsonify({"error":"Missing required fields", "missing":missing}), 400

    try:
        out = predict(payload)
        # Add a disclaimer
        out["disclaimer"] = (
            "Educational decision support. Not a diagnosis. "
            "Use clinical judgment and confirm with appropriate tests."
        )
        return jsonify(out), 200
    except Exception as e:
        return jsonify({"error": str(e)}), 500

if __name__ == "__main__":
    # Local dev run: python -m cardiometrix.app.app_flask
    app.run(host="0.0.0.0", port=8000, debug=True)
""")

# write files
(APP / "predictor.py").write_text(predictor_py, encoding="utf-8")
(APP / "app_flask.py").write_text(app_py, encoding="utf-8")

print("✅ Wrote:")
print(" -", (APP / "predictor.py").resolve())
print(" -", (APP / "app_flask.py").resolve())
print("\\nNext: run the server with `python -m cardiometrix.app.app_flask` from your project root.")


✅ Wrote:
 - C:\Users\hp\Desktop\Cardio Metrix\cardiometrix\app\predictor.py
 - C:\Users\hp\Desktop\Cardio Metrix\cardiometrix\app\app_flask.py
\nNext: run the server with `python -m cardiometrix.app.app_flask` from your project root.


In [33]:
# W4-2A: Create Flask templates (Bootstrap) + static assets (Chart.js wiring)
from pathlib import Path
import textwrap

BASE = Path.cwd() / "cardiometrix"
APP  = BASE / "app"
TPL  = APP / "templates"
STC  = APP / "static"
(TPL).mkdir(parents=True, exist_ok=True)
(STC / "css").mkdir(parents=True, exist_ok=True)
(STC / "js").mkdir(parents=True, exist_ok=True)

# ---------- base.html ----------
base_html = textwrap.dedent("""\
<!doctype html>
<html lang="en">
  <head>
    <meta charset="utf-8">
    <meta name="viewport" content="width=device-width, initial-scale=1">
    <title>CardioMetrix</title>
    <link href="https://cdn.jsdelivr.net/npm/bootstrap@5.3.3/dist/css/bootstrap.min.css" rel="stylesheet">
    <link href="{{ url_for('static', filename='css/app.css') }}" rel="stylesheet">
  </head>
  <body class="bg-light">
    <nav class="navbar navbar-expand-lg navbar-dark bg-primary">
      <div class="container">
        <a class="navbar-brand fw-semibold" href="/">CardioMetrix</a>
        <span class="navbar-text small">Decision support · Not medical advice</span>
      </div>
    </nav>
    <main class="container py-4">
      {% block content %}{% endblock %}
    </main>
    <footer class="container pb-4">
      <div class="text-muted small">
        <strong>Disclaimer:</strong> Educational decision support. Not a diagnosis. Use clinical judgment.
      </div>
    </footer>
    <script src="https://cdn.jsdelivr.net/npm/chart.js"></script>
    <script src="{{ url_for('static', filename='js/app.js') }}"></script>
  </body>
</html>
""")

# ---------- index.html ----------
index_html = textwrap.dedent("""\
{% extends "base.html" %}
{% block content %}
<div class="row g-4">
  <div class="col-lg-4">
    <div class="card shadow-sm">
      <div class="card-body">
        <h5 class="card-title mb-3">Input</h5>
        <form id="predict-form" class="vstack gap-3">
          <div class="row g-2">
            <div class="col">
              <label class="form-label">Age (years)</label>
              <input type="number" step="1" min="0" max="120" class="form-control" name="age" required>
            </div>
            <div class="col">
              <label class="form-label">Sex</label>
              <select class="form-select" name="sex">
                <option value="">Unknown</option>
                <option value="F">Female</option>
                <option value="M">Male</option>
              </select>
            </div>
          </div>

          <div class="row g-2">
            <div class="col">
              <label class="form-label">BMI</label>
              <input type="number" step="0.1" min="10" max="60" class="form-control" name="bmi">
            </div>
            <div class="col">
              <label class="form-label">SBP (mmHg)</label>
              <input type="number" step="1" min="70" max="250" class="form-control" name="sbp">
            </div>
            <div class="col">
              <label class="form-label">DBP (mmHg)</label>
              <input type="number" step="1" min="40" max="150" class="form-control" name="dbp">
            </div>
          </div>

          <div class="row g-2">
            <div class="col">
              <label class="form-label">Pregnancies (Pima)</label>
              <input type="number" step="1" min="0" max="20" class="form-control" name="extra__pregnancies">
            </div>
            <div class="col">
              <label class="form-label">Pedigree (Pima)</label>
              <input type="number" step="0.001" min="0" max="3" class="form-control" name="extra__diabetespedigreefunction">
            </div>
          </div>

          <button class="btn btn-primary mt-2" type="submit">Predict</button>
          <div id="form-hint" class="form-text">
            Age is required. Other fields are optional (imputed if missing).
          </div>
        </form>
      </div>
    </div>
  </div>

  <div class="col-lg-8">
    <div id="results" class="d-none">
      <div class="row g-3">
        <!-- Diabetes card -->
        <div class="col-md-6">
          <div class="card result-card shadow-sm">
            <div class="card-body">
              <div class="d-flex justify-content-between align-items-center">
                <h6 class="mb-0">Diabetes</h6>
                <span id="d-decision" class="badge text-bg-secondary">—</span>
              </div>
              <div class="chart-wrap my-3">
                <canvas id="d-chart"></canvas>
              </div>
              <div class="small text-muted">
                Prob: <span id="d-prob">—</span> · Thr: <span id="d-thr">—</span>
              </div>
              <hr>
              <div>
                <div class="small fw-semibold mb-1">Top factors</div>
                <div id="d-factors" class="factor-list"></div>
              </div>
            </div>
          </div>
        </div>

        <!-- Hypertension card -->
        <div class="col-md-6">
          <div class="card result-card shadow-sm">
            <div class="card-body">
              <div class="d-flex justify-content-between align-items-center">
                <h6 class="mb-0">Hypertension</h6>
                <span id="h-decision" class="badge text-bg-secondary">—</span>
              </div>
              <div class="chart-wrap my-3">
                <canvas id="h-chart"></canvas>
              </div>
              <div class="small text-muted">
                Prob: <span id="h-prob">—</span> · Thr: <span id="h-thr">—</span>
              </div>
              <hr>
              <div>
                <div class="small fw-semibold mb-1">Top factors</div>
                <div id="h-factors" class="factor-list"></div>
              </div>
            </div>
          </div>
        </div>

      </div>
      <div class="alert alert-secondary mt-3 small" id="disclaimer">—</div>
    </div>

    <div id="empty-state" class="empty-state text-muted">
      Enter values and click Predict to see calibrated risks and decisions.
    </div>
  </div>
</div>
{% endblock %}
""")

# ---------- app.css ----------
app_css = textwrap.dedent("""\
.result-card .badge { font-size: 0.8rem; }
.chart-wrap { width: 100%; height: 180px; }
.empty-state { padding: 2rem; text-align: center; border: 1px dashed #ddd; border-radius: .5rem; background: #fff; }
.factor-list .badge { margin: 0 .25rem .25rem 0; }
.factor-badge { font-size: .75rem; }
""")

# ---------- app.js ----------
app_js = textwrap.dedent("""\
const fmtPct = (x) => (x*100).toFixed(1) + "%";

let dChart = null, hChart = null;

function renderDonut(ctxId, prob) {
  const el = document.getElementById(ctxId);
  const data = [prob, 1-prob];
  const cfg = {
    type: 'doughnut',
    data: {
      labels: ['Risk', ''],
      datasets: [{ data, borderWidth: 0 }]
    },
    options: {
      cutout: '70%',
      plugins: { legend: { display:false } },
      responsive: true,
      maintainAspectRatio: false
    }
  };
  const chart = new Chart(el, cfg);
  return chart;
}

function updateBadge(elId, decision) {
  const el = document.getElementById(elId);
  if (decision === 1) {
    el.className = "badge text-bg-danger";
    el.innerText = "High";
  } else {
    el.className = "badge text-bg-success";
    el.innerText = "Low";
  }
}

function renderFactors(elId, items) {
  const el = document.getElementById(elId);
  el.innerHTML = "";
  if (!items || items.length === 0) {
    el.innerHTML = '<span class="text-muted small">Install SHAP for factor insights.</span>';
    return;
  }
  items.slice(0,5).forEach(([name, val]) => {
    const badge = document.createElement("span");
    badge.className = "badge rounded-pill text-bg-light factor-badge";
    badge.innerText = name.replace("extra__", "");
    el.appendChild(badge);
  });
}

async function submitForm(e) {
  e.preventDefault();
  const form = document.getElementById("predict-form");
  const fd = new FormData(form);
  const payload = {};
  fd.forEach((v,k) => { if (v !== "") payload[k] = isNaN(v) ? v : Number(v); });

  const resp = await fetch("/predict", {
    method: "POST",
    headers: { "Content-Type":"application/json" },
    body: JSON.stringify(payload)
  });

  const resultsWrap = document.getElementById("results");
  const emptyState = document.getElementById("empty-state");
  const disclaimer = document.getElementById("disclaimer");

  if (!resp.ok) {
    const err = await resp.json().catch(()=>({error:"Unknown error"}));
    emptyState.innerText = "Error: " + (err.error || resp.statusText);
    resultsWrap.classList.add("d-none");
    emptyState.classList.remove("d-none");
    return;
  }

  const data = await resp.json();

  // Numbers
  const dProb = data.diabetes.prob, dThr = data.diabetes.threshold, dDec = data.diabetes.decision;
  const hProb = data.hypertension.prob, hThr = data.hypertension.threshold, hDec = data.hypertension.decision;

  // Update donuts
  if (dChart) dChart.destroy();
  if (hChart) hChart.destroy();
  dChart = renderDonut("d-chart", dProb);
  hChart = renderDonut("h-chart", hProb);

  // Badges, text, factors
  updateBadge("d-decision", dDec);
  updateBadge("h-decision", hDec);
  document.getElementById("d-prob").innerText = fmtPct(dProb);
  document.getElementById("d-thr").innerText  = fmtPct(dThr);
  document.getElementById("h-prob").innerText = fmtPct(hProb);
  document.getElementById("h-thr").innerText  = fmtPct(hThr);

  renderFactors("d-factors", data.diabetes.top_factors);
  renderFactors("h-factors", data.hypertension.top_factors);

  // Show disclaimer
  disclaimer.innerText = data.disclaimer || "Educational decision support.";

  // Toggle visibility
  resultsWrap.classList.remove("d-none");
  emptyState.classList.add("d-none");
}

document.addEventListener("DOMContentLoaded", () => {
  const form = document.getElementById("predict-form");
  form.addEventListener("submit", submitForm);
});
""")

# write files
(TPL / "base.html").write_text(base_html, encoding="utf-8")
(TPL / "index.html").write_text(index_html, encoding="utf-8")
(STC / "css" / "app.css").write_text(app_css, encoding="utf-8")
(STC / "js" / "app.js").write_text(app_js, encoding="utf-8")

print("✅ Wrote templates and static assets:")
print(" -", (TPL / "base.html").resolve())
print(" -", (TPL / "index.html").resolve())
print(" -", (STC / "css" / "app.css").resolve())
print(" -", (STC / "js" / "app.js").resolve())


✅ Wrote templates and static assets:
 - C:\Users\hp\Desktop\Cardio Metrix\cardiometrix\app\templates\base.html
 - C:\Users\hp\Desktop\Cardio Metrix\cardiometrix\app\templates\index.html
 - C:\Users\hp\Desktop\Cardio Metrix\cardiometrix\app\static\css\app.css
 - C:\Users\hp\Desktop\Cardio Metrix\cardiometrix\app\static\js\app.js


In [34]:
# W4-2B: Update Flask app to render the dashboard
from pathlib import Path
import textwrap

APP = Path.cwd() / "cardiometrix" / "app"

app_flask_html = textwrap.dedent("""\
from flask import Flask, request, jsonify, render_template
from flask_cors import CORS
from .predictor import predict

app = Flask(__name__, template_folder="templates", static_folder="static")
CORS(app)

@app.get("/health")
def health():
    return jsonify({"status":"ok"}), 200

@app.get("/")
def index():
    return render_template("index.html")

@app.post("/predict")
def predict_endpoint():
    try:
        payload = request.get_json(force=True) or {}
    except Exception:
        return jsonify({"error":"Invalid JSON payload"}), 400

    if "age" not in payload:
        return jsonify({"error":"Missing required field: age"}), 400

    try:
        out = predict(payload)
        out["disclaimer"] = (
            "Educational decision support. Not a diagnosis. "
            "Use clinical judgment and confirm with appropriate tests."
        )
        return jsonify(out), 200
    except Exception as e:
        return jsonify({"error": str(e)}), 500

if __name__ == "__main__":
    app.run(host="0.0.0.0", port=8000, debug=True)
""")

(APP / "app_flask.py").write_text(app_flask_html, encoding="utf-8")
print("✅ Updated app_flask.py with dashboard routes.")


✅ Updated app_flask.py with dashboard routes.
