<a href="https://colab.research.google.com/github/24215011123/DL_Lab-session_may-june2025/blob/main/Untitled1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Section 0: Imports & config

In [1]:
# =========================
# Full pipeline
# 70/15/15 split, preprocess fitted on train only, SMOTE on train only
# Models: LogReg, RF, XGB, LightGBM, ANN, DeepMLP, TabTransformer
# Saves outputs to ./outputs
# =========================

# Section 0: Imports & config
import os, json
import numpy as np, pandas as pd
import matplotlib.pyplot as plt, seaborn as sns
sns.set(style="whitegrid")

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (accuracy_score, precision_score, recall_score, f1_score,
                             roc_auc_score, average_precision_score, brier_score_loss,
                             confusion_matrix, roc_curve, precision_recall_curve)
from sklearn.calibration import calibration_curve
from imblearn.over_sampling import SMOTE

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
import lightgbm as lgb

import tensorflow as tf
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization

import torch, torch.nn as nn
from torch.utils.data import Dataset, DataLoader

import shap
import warnings
warnings.filterwarnings("ignore")

# Seeds & directories
SEED = 42
np.random.seed(SEED)
tf.random.set_seed(SEED)
torch.manual_seed(SEED)
OUTDIR = "outputs"
os.makedirs(OUTDIR, exist_ok=True)

# Runtime knobs (lower for debug)
EPOCHS_TAB = 50
EPOCHS_DEEPMLP = 50
BATCH_TAB = 512
BATCH_DEEPMLP = 512

# Section 1: Load data

In [2]:
# Section 1: Load data
DATA_PATH = "/content/heart_disease_health_indicators_BRFSS2015.csv"
df = pd.read_csv(DATA_PATH)
print("Raw shape:", df.shape)

Raw shape: (253680, 22)


# Section 2–5: Features + Preprocessing + Split + SMOTE

In [3]:
# =========================
# Section 2–5: Features + Preprocessing + Split + SMOTE
# =========================
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

# --- Define feature groups ---
binary_cols = ["HighBP","HighChol","CholCheck","Smoker","Stroke","Diabetes",
               "PhysActivity","Fruits","Veggies","HvyAlcoholConsump",
               "AnyHealthcare","NoDocbcCost","DiffWalk","Sex"]

numeric_cols = ["BMI","MentHlth","PhysHlth","Age"]

categorical_cols = ["GenHlth","Education","Income"]

all_cols = binary_cols + numeric_cols + categorical_cols
print("Using features (count):", len(all_cols))

# --- Target & raw features ---
TARGET = "HeartDiseaseorAttack"
y_raw = df[TARGET].values.astype(int)   # ensure integers
X_raw = df[all_cols].copy()

print("Raw shape:", X_raw.shape)

# --- Split 70/15/15 ---
X_train_raw, X_temp_raw, y_train_raw, y_temp_raw = train_test_split(
    X_raw, y_raw, test_size=0.30, stratify=y_raw, random_state=SEED
)
X_val_raw, X_test_raw, y_val_raw, y_test_raw = train_test_split(
    X_temp_raw, y_temp_raw, test_size=0.50, stratify=y_temp_raw, random_state=SEED
)
print("Raw splits:", X_train_raw.shape, X_val_raw.shape, X_test_raw.shape)

# --- Preprocessing with Imputation ---
num_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])

cat_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent"))
])

preprocessor = ColumnTransformer(transformers=[
    ("num", num_transformer, numeric_cols),
    ("bin_cat", cat_transformer, binary_cols + categorical_cols)
], remainder="drop")

# Fit only on train
preprocessor.fit(X_train_raw)

X_train = preprocessor.transform(X_train_raw)
X_val   = preprocessor.transform(X_val_raw)
X_test  = preprocessor.transform(X_test_raw)

# Ensure dense
if hasattr(X_train, "toarray"):
    X_train = X_train.toarray()
    X_val   = X_val.toarray()
    X_test  = X_test.toarray()

print("Processed shapes:", X_train.shape, X_val.shape, X_test.shape)
print("NaN check:", np.isnan(X_train).sum(), np.isnan(X_val).sum(), np.isnan(X_test).sum())

# --- SMOTE on training only ---
sm = SMOTE(random_state=SEED)
X_train_res, y_train_res = sm.fit_resample(X_train, y_train_raw)
print("After SMOTE:", X_train_res.shape, np.bincount(y_train_res.astype(int)))

# --- Save feature names ---
FEAT_NAMES = numeric_cols + binary_cols + categorical_cols
pd.Series(FEAT_NAMES).to_csv(os.path.join(OUTDIR,"features_used.csv"), index=False)


Using features (count): 21
Raw shape: (253680, 21)
Raw splits: (177576, 21) (38052, 21) (38052, 21)
Processed shapes: (177576, 21) (38052, 21) (38052, 21)
NaN check: 0 0 0
After SMOTE: (321702, 21) [160851 160851]


#Section 6: Metric helpers

In [4]:
#Section 6: Metric helpers
def compute_metrics(y_true, probs, thresh=0.5):
    preds = (probs >= thresh).astype(int)
    return {
        "acc": float(accuracy_score(y_true, preds)),
        "prec": float(precision_score(y_true, preds, zero_division=0)),
        "rec": float(recall_score(y_true, preds, zero_division=0)),
        "f1": float(f1_score(y_true, preds, zero_division=0)),
        "auc": float(roc_auc_score(y_true, probs)),
        "ap": float(average_precision_score(y_true, probs)),
        "brier": float(brier_score_loss(y_true, probs))
    }

#Section 7: Train classical models on X_train_res / evaluate on X_test

In [5]:
#Section 7: Train classical models on X_train_res / evaluate on X_test
results = {}

# Logistic Regression
logreg = LogisticRegression(max_iter=1000, class_weight="balanced", random_state=SEED)
logreg.fit(X_train_res, y_train_res)
probs_log = logreg.predict_proba(X_test)[:,1]
metrics_log = compute_metrics(y_test_raw, probs_log)
cm_log = confusion_matrix(y_test_raw, (probs_log>=0.5).astype(int))
results['LogReg'] = metrics_log
print("LogReg metrics:", metrics_log)

# Random Forest
rf = RandomForestClassifier(n_estimators=300, class_weight="balanced", n_jobs=-1, random_state=SEED)
rf.fit(X_train_res, y_train_res)
probs_rf = rf.predict_proba(X_test)[:,1]
metrics_rf = compute_metrics(y_test_raw, probs_rf)
cm_rf = confusion_matrix(y_test_raw, (probs_rf>=0.5).astype(int))
results['RandomForest'] = metrics_rf
print("RF metrics:", metrics_rf)

# XGBoost
xgb_model = xgb.XGBClassifier(use_label_encoder=False, eval_metric="logloss", random_state=SEED, n_jobs=-1)
xgb_model.fit(X_train_res, y_train_res)
probs_xgb = xgb_model.predict_proba(X_test)[:,1]
metrics_xgb = compute_metrics(y_test_raw, probs_xgb)
cm_xgb = confusion_matrix(y_test_raw, (probs_xgb>=0.5).astype(int))
results['XGBoost'] = metrics_xgb
print("XGB metrics:", metrics_xgb)

# LightGBM
lgb_model = lgb.LGBMClassifier(random_state=SEED)
lgb_model.fit(X_train_res, y_train_res)
probs_lgb = lgb_model.predict_proba(X_test)[:,1]
metrics_lgb = compute_metrics(y_test_raw, probs_lgb)
cm_lgb = confusion_matrix(y_test_raw, (probs_lgb>=0.5).astype(int))
results['LightGBM'] = metrics_lgb
print("LGB metrics:", metrics_lgb)

LogReg metrics: {'acc': 0.7548617681068012, 'prec': 0.24731655815590356, 'rec': 0.7843191964285714, 'f1': 0.37605351170568563, 'auc': 0.8446944497043841, 'ap': 0.36197065181252813, 'brier': 0.1626198876815193}
RF metrics: {'acc': 0.8966151582045622, 'prec': 0.40179573512906847, 'rec': 0.19977678571428573, 'f1': 0.2668654491241148, 'auc': 0.8231896105885999, 'ap': 0.30583578408017187, 'brier': 0.0774669397596344}
XGB metrics: {'acc': 0.9059970566593083, 'prec': 0.5035246727089627, 'rec': 0.13950892857142858, 'f1': 0.2184837229626393, 'auc': 0.8436233216187065, 'ap': 0.3582914260042247, 'brier': 0.07134264188398001}
[LightGBM] [Info] Number of positive: 160851, number of negative: 160851
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.051730 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 5355
[LightGBM] [Info] Number of data points in th

# Section 8: Neural baselines (Keras)

In [12]:
# Section 8: Neural baselines (Keras)
input_dim = X_train_res.shape[1]

# Simple ANN
ann = Sequential([Dense(32, activation='relu', input_shape=(input_dim,)), Dense(1, activation='sigmoid')])
ann.compile(optimizer='adam', loss='binary_crossentropy')
ann.fit(X_train_res, y_train_res, validation_data=(X_val, y_val_raw), epochs=50, batch_size=512, verbose=1)
probs_ann = ann.predict(X_test).flatten()
metrics_ann = compute_metrics(y_test_raw, probs_ann)
cm_ann = confusion_matrix(y_test_raw, (probs_ann>=0.5).astype(int))
results['ANN'] = metrics_ann
print("ANN metrics:", metrics_ann)

# Deep MLP
deep_mlp = Sequential([
    Dense(256, activation='relu', input_shape=(input_dim,)),
    BatchNormalization(), Dropout(0.3),
    Dense(128, activation='relu'), BatchNormalization(), Dropout(0.3),
    Dense(64, activation='relu'), BatchNormalization(), Dropout(0.2),
    Dense(32, activation='relu'),
    Dense(1, activation='sigmoid')
])
deep_mlp.compile(optimizer='adam', loss='binary_crossentropy')
deep_mlp.fit(X_train_res, y_train_res, validation_data=(X_val, y_val_raw), epochs=EPOCHS_DEEPMLP, batch_size=BATCH_DEEPMLP, verbose=1)
probs_dmlp = deep_mlp.predict(X_test).flatten()
metrics_dmlp = compute_metrics(y_test_raw, probs_dmlp)
cm_dmlp = confusion_matrix(y_test_raw, (probs_dmlp>=0.5).astype(int))
results['DeepMLP'] = metrics_dmlp
print("DeepMLP metrics:", metrics_dmlp)

# Save intermediate results
pd.DataFrame(results).T.to_csv(os.path.join(OUTDIR, "results_before_tab.csv"))

Epoch 1/5
[1m629/629[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 6ms/step - loss: 0.5298 - val_loss: 0.4812
Epoch 2/5
[1m629/629[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 2ms/step - loss: 0.4644 - val_loss: 0.4750
Epoch 3/5
[1m629/629[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - loss: 0.4598 - val_loss: 0.4709
Epoch 4/5
[1m629/629[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - loss: 0.4560 - val_loss: 0.4670
Epoch 5/5
[1m629/629[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - loss: 0.4527 - val_loss: 0.4650
[1m1190/1190[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 1ms/step
ANN metrics: {'acc': 0.7617733627667402, 'prec': 0.2513383540513565, 'rec': 0.7728794642857143, 'f1': 0.3793221499486477, 'auc': 0.8439556125557452, 'ap': 0.3586664068354871, 'brier': 0.15612085764817457}
Epoch 1/50
[1m629/629[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 10ms/step - loss: 0.5033 - val_loss: 0.5491
E

# Section 9: TabTransformer (single split; robust attention)

In [11]:

# Section 9: TabTransformer (single split; robust attention)
class TabDatasetTorch(Dataset):
    def __init__(self, X_np, y_np):
        self.X = torch.tensor(X_np, dtype=torch.float32)
        self.y = torch.tensor(y_np, dtype=torch.float32).unsqueeze(1)
    def __len__(self): return len(self.y)
    def __getitem__(self, idx): return self.X[idx], self.y[idx]

class TabTransformerTorch(nn.Module):
    def __init__(self, input_dim, emb_dim=16, n_heads=2, mlp_hidden=64, dropout=0.1):
        super().__init__()
        self.proj = nn.Linear(1, emb_dim)
        self.mha = nn.MultiheadAttention(embed_dim=emb_dim, num_heads=n_heads, batch_first=True)
        self.ff = nn.Sequential(nn.Linear(emb_dim, emb_dim), nn.ReLU())
        self.classifier = nn.Sequential(
            nn.Linear(input_dim*emb_dim, mlp_hidden), nn.ReLU(), nn.Dropout(dropout),
            nn.Linear(mlp_hidden, 1)
        )
    def forward(self, x):
        b,f = x.shape
        x = x.unsqueeze(-1)            # (b,f,1)
        emb = self.proj(x)             # (b,f,emb_dim)
        attn_out, attn_weights = self.mha(emb, emb, emb, need_weights=True, average_attn_weights=False)
        out = self.ff(emb + attn_out)
        flat = out.reshape(b, -1)
        logits = self.classifier(flat)
        return logits, attn_weights, out

def evaluate_tabtransformer_single_split(X_train_res, y_train_res, X_val, y_val, X_test, y_test,
                                         n_epochs=EPOCHS_TAB, batch_size=BATCH_TAB, device=None):
    if device is None:
        device = "cuda" if torch.cuda.is_available() else "cpu"
    model = TabTransformerTorch(input_dim=X_train_res.shape[1]).to(device)
    opt = torch.optim.Adam(model.parameters(), lr=1e-3)
    loss_fn = nn.BCEWithLogitsLoss()

    train_ds = TabDatasetTorch(X_train_res, y_train_res)
    val_ds = TabDatasetTorch(X_val, y_val)
    test_ds = TabDatasetTorch(X_test, y_test)
    train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_ds, batch_size=batch_size, shuffle=False)
    test_loader = DataLoader(test_ds, batch_size=batch_size, shuffle=False)

    best_val_loss = np.inf
    best_state = None
    for epoch in range(n_epochs):
        model.train()
        total_loss = 0.0
        for xb, yb in train_loader:
            xb, yb = xb.to(device), yb.to(device)
            opt.zero_grad()
            logits, _, _ = model(xb)
            loss = loss_fn(logits, yb)
            loss.backward()
            opt.step()
            total_loss += loss.item()
        # validation loss
        model.eval()
        val_losses = []
        with torch.no_grad():
            for xb, yb in val_loader:
                xb, yb = xb.to(device), yb.to(device)
                logits, _, _ = model(xb)
                loss = loss_fn(logits, yb)
                val_losses.append(loss.item())
        mean_val_loss = np.mean(val_losses) if len(val_losses)>0 else np.nan
        print(f"TabTransformer epoch {epoch+1}/{n_epochs} train_loss={total_loss/len(train_loader):.4f} val_loss={mean_val_loss:.4f}")
        if mean_val_loss < best_val_loss:
            best_val_loss = mean_val_loss
            best_state = model.state_dict().copy()

    # load best
    if best_state is not None:
        model.load_state_dict(best_state)

    # evaluate on test
    model.eval()
    all_probs, all_labels, attn_list = [], [], []
    with torch.no_grad():
        for xb, yb in test_loader:
            xb = xb.to(device)
            logits, attn_weights, _ = model(xb)
            probs = torch.sigmoid(logits).cpu().numpy().flatten()
            all_probs.append(probs)
            all_labels.append(yb.numpy().flatten())

            # robust attention aggregation
            attn_np = attn_weights.cpu().numpy()
            if attn_np.ndim == 4:      # (batch, heads, f, f)
                attn_np2 = attn_np.mean(axis=1)        # -> (batch, f, f)
                attn_feature_importance = attn_np2.mean(axis=(0,2))  # -> (f,)
            elif attn_np.ndim == 3:    # (batch, f, f)
                attn_feature_importance = attn_np.mean(axis=(0,2))
            elif attn_np.ndim == 2:    # (batch, f)
                attn_feature_importance = attn_np.mean(axis=0)
            else:
                raise ValueError("Unexpected attn_weights shape: {}".format(attn_np.shape))
            attn_list.append(attn_feature_importance)

    probs = np.concatenate(all_probs)
    labels = np.concatenate(all_labels)
    metrics = compute_metrics(labels, probs)
    cm = confusion_matrix(labels, (probs >= 0.5).astype(int))
    attn_mean = np.mean(np.vstack(attn_list), axis=0)
    return metrics, cm, (labels, probs), attn_mean

metrics_tab, cm_tab, (labels_tab, probs_tab), attn_mean = evaluate_tabtransformer_single_split(
    X_train_res, y_train_res, X_val, y_val_raw, X_test, y_test_raw,
    n_epochs=EPOCHS_TAB, batch_size=BATCH_TAB
)
results['TabTransformer'] = metrics_tab
print("TabTransformer metrics:", metrics_tab)

TabTransformer epoch 1/50 train_loss=0.4559 val_loss=0.6062
TabTransformer epoch 2/50 train_loss=0.3221 val_loss=0.3518
TabTransformer epoch 3/50 train_loss=0.2757 val_loss=0.3180
TabTransformer epoch 4/50 train_loss=0.2668 val_loss=0.4317
TabTransformer epoch 5/50 train_loss=0.2638 val_loss=0.3209
TabTransformer epoch 6/50 train_loss=0.2615 val_loss=0.2847
TabTransformer epoch 7/50 train_loss=0.2593 val_loss=0.2977
TabTransformer epoch 8/50 train_loss=0.2562 val_loss=0.3342
TabTransformer epoch 9/50 train_loss=0.2550 val_loss=0.3216
TabTransformer epoch 10/50 train_loss=0.2656 val_loss=0.3478
TabTransformer epoch 11/50 train_loss=0.2533 val_loss=0.3243
TabTransformer epoch 12/50 train_loss=0.2662 val_loss=0.3142
TabTransformer epoch 13/50 train_loss=0.2576 val_loss=0.3029
TabTransformer epoch 14/50 train_loss=0.2630 val_loss=0.3445
TabTransformer epoch 15/50 train_loss=0.2505 val_loss=0.2752
TabTransformer epoch 16/50 train_loss=0.2527 val_loss=0.3209
TabTransformer epoch 17/50 train_

#Section 10: Explainability (SHAP for XGB, Attention for TabTransformer)
# SHAP (may be slow; sample test)

In [13]:
#Section 10: Explainability (SHAP for XGB, Attention for TabTransformer)
# SHAP (may be slow; sample test)
try:
    sample_n = min(2000, X_test.shape[0])
    explainer = shap.TreeExplainer(xgb_model)
    shap_values = explainer.shap_values(X_test[:sample_n])
    # Save SHAP plot
    plt.figure()
    shap.summary_plot(shap_values, X_test[:sample_n], feature_names=FEAT_NAMES, show=False)
    plt.tight_layout(); plt.savefig(os.path.join(OUTDIR,"shap_summary_xgb.png"), dpi=300); plt.close()
except Exception as e:
    print("SHAP failed or slow:", e)

# Tab attention
df_att = pd.DataFrame({"feature": FEAT_NAMES, "attn": attn_mean})
df_att = df_att.sort_values("attn", ascending=False)
df_att.to_csv(os.path.join(OUTDIR,"tab_attention.csv"), index=False)
plt.figure(figsize=(8,6))
sns.barplot(x="attn", y="feature", data=df_att.head(15))
plt.title("Top 15 Features by TabTransformer Attention")
plt.tight_layout(); plt.savefig(os.path.join(OUTDIR,"tab_attention_top15.png"), dpi=300); plt.close()

# SHAP vs Attention comparison (top 10)
try:
    shap_imp = np.abs(shap_values).mean(axis=0)
    df_shap = pd.DataFrame({"feature": FEAT_NAMES, "shap": shap_imp}).sort_values("shap", ascending=False)
    df_compare = pd.DataFrame({
        "SHAP_top10_feature": df_shap["feature"].head(10).values,
        "SHAP_value": df_shap["shap"].head(10).values,
        "ATTN_top10_feature": df_att["feature"].head(10).values,
        "ATTN_value": df_att["attn"].head(10).values
    })
    df_compare.to_csv(os.path.join(OUTDIR,"explain_compare_top10.csv"), index=False)
except Exception as e:
    print("SHAP vs Attn compare failed:", e)


# Section 11: Visualization & final table

In [14]:
# Section 11: Visualization & final table
probs_dict = {
    "LogReg": probs_log,
    "RandomForest": probs_rf,
    "XGBoost": probs_xgb,
    "LightGBM": probs_lgb,
    "ANN": probs_ann,
    "DeepMLP": probs_dmlp,
    "TabTransformer": probs_tab
}

# ROC
plt.figure(figsize=(7,6))
for name,p in probs_dict.items():
    fpr,tpr,_ = roc_curve(y_test_raw, p)
    aucv = roc_auc_score(y_test_raw, p)
    plt.plot(fpr,tpr,label=f"{name} (AUC={aucv:.3f})")
plt.plot([0,1],[0,1],'k--'); plt.xlabel("FPR"); plt.ylabel("TPR"); plt.title("ROC Curves (Test)"); plt.legend(bbox_to_anchor=(1.05,1)); plt.tight_layout()
plt.savefig(os.path.join(OUTDIR,"roc_all.png"), dpi=300); plt.close()

# PR
plt.figure(figsize=(7,6))
for name,p in probs_dict.items():
    prec,rec,_ = precision_recall_curve(y_test_raw, p)
    ap = average_precision_score(y_test_raw, p)
    plt.plot(rec,prec,label=f"{name} (AP={ap:.3f})")
plt.xlabel("Recall"); plt.ylabel("Precision"); plt.title("PR Curves (Test)"); plt.legend(bbox_to_anchor=(1.05,1)); plt.tight_layout()
plt.savefig(os.path.join(OUTDIR,"pr_all.png"), dpi=300); plt.close()

# Calibration (LogReg, XGB, Tab)
plt.figure(figsize=(6,6))
for name in ["LogReg","XGBoost","TabTransformer"]:
    p = probs_dict[name]
    frac_pos, mean_pred = calibration_curve(y_test_raw, p, n_bins=10)
    plt.plot(mean_pred, frac_pos, marker='o', label=name)
plt.plot([0,1],[0,1],'k--'); plt.xlabel("Mean predicted prob"); plt.ylabel("Fraction positives"); plt.title("Calibration"); plt.legend(); plt.tight_layout()
plt.savefig(os.path.join(OUTDIR,"calibration.png"), dpi=300); plt.close()

# Confusion matrices saved
cms = {"LogReg":cm_log,"RandomForest":cm_rf,"XGBoost":cm_xgb,"LightGBM":cm_lgb,"ANN":cm_ann,"DeepMLP":cm_dmlp,"TabTransformer":cm_tab}
for name,cm in cms.items():
    plt.figure(figsize=(4,3))
    sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", xticklabels=["Pred 0","Pred 1"], yticklabels=["True 0","True 1"])
    plt.title(f"{name} Confusion Matrix"); plt.tight_layout(); plt.savefig(os.path.join(OUTDIR,f"cm_{name}.png"), dpi=300); plt.close()

# Final table
df_final = pd.DataFrame(results).T
df_final.to_csv(os.path.join(OUTDIR,"final_results_table.csv"))
print("Final results table:\n", df_final)

# Bootstrap AUC CI for Tab and XGB (test set)
def bootstrap_auc_ci(y_true, probs, n_boot=2000):
    rng = np.random.RandomState(SEED)
    aucs = []
    n = len(y_true)
    for _ in range(n_boot):
        idx = rng.randint(0, n, n)
        if len(np.unique(y_true[idx]))<2:
            continue
        aucs.append(roc_auc_score(y_true[idx], probs[idx]))
    return np.mean(aucs), (np.percentile(aucs,2.5), np.percentile(aucs,97.5))

tab_mean_auc, tab_ci = bootstrap_auc_ci(y_test_raw, probs_tab)
xgb_mean_auc, xgb_ci = bootstrap_auc_ci(y_test_raw, probs_xgb)
print(f"TabTransformer AUC {tab_mean_auc:.4f}, 95% CI {tab_ci}")
print(f"XGBoost AUC       {xgb_mean_auc:.4f}, 95% CI {xgb_ci}")

# Save run metadata
meta = {"seed":SEED, "n_samples":X_train.shape[0]+X_val.shape[0]+X_test.shape[0], "n_features": X_train.shape[1],
        "train_shape": X_train_res.shape, "val_shape": X_val.shape, "test_shape": X_test.shape,
        "epochs_tab": EPOCHS_TAB, "epochs_deepmlp": EPOCHS_DEEPMLP}
with open(os.path.join(OUTDIR,"run_meta.json"), "w") as f:
    json.dump(meta, f, indent=2)

print("All outputs saved to", OUTDIR)

Final results table:
                      acc      prec       rec        f1       auc        ap  \
LogReg          0.754809  0.247406  0.785156  0.376254  0.844712  0.361958   
RandomForest    0.896300  0.397971  0.196987  0.263531  0.823814  0.305898   
XGBoost         0.905971  0.502959  0.142299  0.221836  0.844094  0.357719   
LightGBM        0.905051  0.488519  0.172154  0.254590  0.844993  0.360254   
ANN             0.761773  0.251338  0.772879  0.379322  0.843956  0.358666   
DeepMLP         0.861479  0.333531  0.471540  0.390706  0.828693  0.331464   
TabTransformer  0.882687  0.358974  0.312500  0.334129  0.822618  0.307228   

                   brier  
LogReg          0.162647  
RandomForest    0.077457  
XGBoost         0.071370  
LightGBM        0.071725  
ANN             0.156121  
DeepMLP         0.096169  
TabTransformer  0.084953  
TabTransformer AUC 0.8226, 95% CI (np.float64(0.8159097965962926), np.float64(0.8290183304208256))
XGBoost AUC       0.8441, 95% CI (np.f

In [6]:
"""
Final consolidated pipeline (single script)
- Preprocessing (impute+scale), 70/15/15 split
- SMOTE on training only
- Models: LogReg, RF, XGB, LGBM, ANN, DeepMLP (focal loss), TabTransformer (PyTorch, optional focal loss)
- Ensemble (soft voting)
- Threshold tuning (val set maximize F1)
- Save results, plots, explainability artifacts
"""

# =========================
# Section 0 — Imports & Config
# =========================
import os, json, warnings, time
import numpy as np, pandas as pd
import matplotlib.pyplot as plt, seaborn as sns
sns.set(style="whitegrid")

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (accuracy_score, precision_score, recall_score, f1_score,
                             roc_auc_score, average_precision_score, brier_score_loss,
                             confusion_matrix, roc_curve, precision_recall_curve)
from sklearn.calibration import calibration_curve
from imblearn.over_sampling import SMOTE

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
import xgboost as xgb
import lightgbm as lgb

import tensorflow as tf
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
import tensorflow.keras.backend as K

# PyTorch for TabTransformer
import torch, torch.nn as nn
from torch.utils.data import Dataset, DataLoader

# Explainability
import shap

warnings.filterwarnings("ignore")

# reproducibility
SEED = 42
np.random.seed(SEED)
tf.random.set_seed(SEED)
torch.manual_seed(SEED)

# I/O
OUTDIR = "outputs"
os.makedirs(OUTDIR, exist_ok=True)

# runtime knobs (change for final run)
EPOCHS_TAB = 50
EPOCHS_DEEPMLP = 50
BATCH_TAB = 512
BATCH_DEEPMLP = 512

# =========================
# Section 1 — Load dataset & features
# =========================
DATA_PATH = "/content/heart_disease_health_indicators_BRFSS2015.csv"
df = pd.read_csv(DATA_PATH)
print("Raw shape:", df.shape)

binary_cols = ["HighBP","HighChol","CholCheck","Smoker","Stroke","Diabetes",
               "PhysActivity","Fruits","Veggies","HvyAlcoholConsump",
               "AnyHealthcare","NoDocbcCost","DiffWalk","Sex"]

numeric_cols = ["BMI","MentHlth","PhysHlth","Age"]
categorical_cols = ["GenHlth","Education","Income"]   # keep as integer labels

ALL_FEATURES = binary_cols + numeric_cols + categorical_cols
TARGET = "HeartDiseaseorAttack"

X_raw = df[ALL_FEATURES].copy()
y_raw = df[TARGET].values.astype(int)
print("Using features (count):", len(ALL_FEATURES))

# =========================
# Section 2 — 70/15/15 Split (train/val/test)
# =========================
X_train_raw, X_temp_raw, y_train_raw, y_temp_raw = train_test_split(
    X_raw, y_raw, test_size=0.30, stratify=y_raw, random_state=SEED
)
X_val_raw, X_test_raw, y_val_raw, y_test_raw = train_test_split(
    X_temp_raw, y_temp_raw, test_size=0.50, stratify=y_temp_raw, random_state=SEED
)
print("Split sizes (train/val/test):", X_train_raw.shape, X_val_raw.shape, X_test_raw.shape)

# =========================
# Section 3 — Preprocessing with Imputation (fit on train only)
# =========================
num_transformer = Pipeline([("imputer", SimpleImputer(strategy="median")), ("scaler", StandardScaler())])
cat_transformer = Pipeline([("imputer", SimpleImputer(strategy="most_frequent"))])

preprocessor = ColumnTransformer(transformers=[
    ("num", num_transformer, numeric_cols),
    ("cat", cat_transformer, binary_cols + categorical_cols)
], remainder="drop")

preprocessor.fit(X_train_raw)

X_train = preprocessor.transform(X_train_raw)
X_val   = preprocessor.transform(X_val_raw)
X_test  = preprocessor.transform(X_test_raw)

# ensure dense arrays
if hasattr(X_train, "toarray"):
    X_train = X_train.toarray(); X_val = X_val.toarray(); X_test = X_test.toarray()

# save feature order (numeric scaled first, then passthrough)
FEAT_NAMES = numeric_cols + (binary_cols + categorical_cols)
pd.Series(FEAT_NAMES).to_csv(os.path.join(OUTDIR,"features_used.csv"), index=False)

print("Processed shapes:", X_train.shape, X_val.shape, X_test.shape)
print("NaNs after preprocess:", np.isnan(X_train).sum(), np.isnan(X_val).sum(), np.isnan(X_test).sum())

# =========================
# Section 4 — SMOTE on training set only
# =========================
sm = SMOTE(random_state=SEED)
X_train_res, y_train_res = sm.fit_resample(X_train, y_train_raw.astype(int))
print("After SMOTE:", X_train_res.shape, np.bincount(y_train_res.astype(int)))

# =========================
# Section 5 — helpers (metrics, threshold tuning)
# =========================
def compute_metrics(y_true, probs, thresh=0.5):
    preds = (probs >= thresh).astype(int)
    return {
        "acc": float(accuracy_score(y_true, preds)),
        "prec": float(precision_score(y_true, preds, zero_division=0)),
        "rec": float(recall_score(y_true, preds, zero_division=0)),
        "f1": float(f1_score(y_true, preds, zero_division=0)),
        "auc": float(roc_auc_score(y_true, probs)),
        "ap": float(average_precision_score(y_true, probs)),
        "brier": float(brier_score_loss(y_true, probs))
    }

from sklearn.metrics import f1_score
def find_best_threshold(y_true, y_probs):
    best_t, best_f1 = 0.5, -1
    for t in np.linspace(0.05, 0.95, 91):
        f1 = f1_score(y_true, (y_probs>=t).astype(int), zero_division=0)
        if f1 > best_f1:
            best_f1, best_t = f1, t
    return best_t, best_f1

# containers
results_baseline = {}
results_optimized = {}

def eval_and_store(name, probs_val, probs_test, y_val, y_test):
    # baseline (0.5)
    metrics_base = compute_metrics(y_test, probs_test, 0.5)
    results_baseline[name] = metrics_base
    # optimized threshold using val set
    best_t, best_f1 = find_best_threshold(y_val, probs_val)
    metrics_opt = compute_metrics(y_test, probs_test, best_t)
    results_optimized[name] = metrics_opt
    print(f"{name} | best_t={best_t:.3f} (val F1={best_f1:.3f}) | base_f1={metrics_base['f1']:.3f} opt_f1={metrics_opt['f1']:.3f}")
    return best_t

# =========================
# Section 6 — Classical ML: LogReg, RF, XGB, LGBM
# =========================
print("\nTraining classical models...")

# Logistic Regression
logreg = LogisticRegression(max_iter=1000, random_state=SEED, class_weight="balanced")
logreg.fit(X_train_res, y_train_res)
probs_log_test = logreg.predict_proba(X_test)[:,1]
probs_log_val  = logreg.predict_proba(X_val)[:,1]
eval_and_store("LogReg", probs_log_val, probs_log_test, y_val_raw, y_test_raw)

# Random Forest
rf = RandomForestClassifier(n_estimators=300, random_state=SEED, class_weight="balanced", n_jobs=-1)
rf.fit(X_train_res, y_train_res)
probs_rf_test = rf.predict_proba(X_test)[:,1]
probs_rf_val  = rf.predict_proba(X_val)[:,1]
eval_and_store("RandomForest", probs_rf_val, probs_rf_test, y_val_raw, y_test_raw)

# XGBoost
xgb_model = xgb.XGBClassifier(use_label_encoder=False, eval_metric="logloss", random_state=SEED, n_jobs=-1)
xgb_model.fit(X_train_res, y_train_res)
probs_xgb_test = xgb_model.predict_proba(X_test)[:,1]
probs_xgb_val  = xgb_model.predict_proba(X_val)[:,1]
eval_and_store("XGBoost", probs_xgb_val, probs_xgb_test, y_val_raw, y_test_raw)

# LightGBM
lgb_model = lgb.LGBMClassifier(random_state=SEED)
lgb_model.fit(X_train_res, y_train_res)
probs_lgb_test = lgb_model.predict_proba(X_test)[:,1]
probs_lgb_val  = lgb_model.predict_proba(X_val)[:,1]
eval_and_store("LightGBM", probs_lgb_val, probs_lgb_test, y_val_raw, y_test_raw)

# =========================
# Section 7 — Neural baselines (Keras): ANN and DeepMLP with focal loss
# =========================
print("\nTraining ANN + DeepMLP (Keras)...")

def focal_loss_keras(gamma=2., alpha=0.25):
    def loss(y_true, y_pred):
        y_true = K.cast(y_true, "float32")
        bce = K.binary_crossentropy(y_true, y_pred)
        p_t = y_true * y_pred + (1 - y_true) * (1 - y_pred)
        alpha_factor = y_true * alpha + (1 - y_true) * (1 - alpha)
        modulating = K.pow((1 - p_t), gamma)
        return K.mean(alpha_factor * modulating * bce)
    return loss

input_dim = X_train_res.shape[1]

# Simple ANN (baseline)
ann = Sequential([Dense(32, activation='relu', input_shape=(input_dim,)), Dense(1, activation='sigmoid')])
ann.compile(optimizer='adam', loss='binary_crossentropy')
ann.fit(X_train_res, y_train_res, validation_data=(X_val, y_val_raw), epochs=5, batch_size=512, verbose=1)
probs_ann_test = ann.predict(X_test).flatten()
probs_ann_val  = ann.predict(X_val).flatten()
eval_and_store("ANN", probs_ann_val, probs_ann_test, y_val_raw, y_test_raw)

# Deep MLP with focal loss
deep_mlp = Sequential([
    Dense(256, activation='relu', input_shape=(input_dim,)), BatchNormalization(), Dropout(0.3),
    Dense(128, activation='relu'), BatchNormalization(), Dropout(0.3),
    Dense(64, activation='relu'), BatchNormalization(), Dropout(0.2),
    Dense(32, activation='relu'),
    Dense(1, activation='sigmoid')
])
deep_mlp.compile(optimizer='adam', loss=focal_loss_keras(gamma=2., alpha=0.25))
deep_mlp.fit(X_train_res, y_train_res, validation_data=(X_val, y_val_raw), epochs=EPOCHS_DEEPMLP, batch_size=BATCH_DEEPMLP, verbose=1)
probs_dmlp_test = deep_mlp.predict(X_test).flatten()
probs_dmlp_val  = deep_mlp.predict(X_val).flatten()
eval_and_store("DeepMLP", probs_dmlp_val, probs_dmlp_test, y_val_raw, y_test_raw)

# =========================
# Section 8 — Ensemble (soft voting of XGB+RF+LGB)
# =========================
print("\nTraining Voting Ensemble (XGB+RF+LGB)...")
ensemble = VotingClassifier(
    estimators=[
        ("xgb", xgb.XGBClassifier(use_label_encoder=False, eval_metric="logloss", random_state=SEED)),
        ("rf", RandomForestClassifier(n_estimators=300, random_state=SEED, class_weight="balanced")),
        ("lgb", lgb.LGBMClassifier(random_state=SEED))
    ],
    voting="soft", n_jobs=-1
)
ensemble.fit(X_train_res, y_train_res)
probs_ens_test = ensemble.predict_proba(X_test)[:,1]
probs_ens_val  = ensemble.predict_proba(X_val)[:,1]
eval_and_store("Ensemble", probs_ens_val, probs_ens_test, y_val_raw, y_test_raw)

# =========================
# Section 9 — TabTransformer (PyTorch) — single split, focal loss optional
# =========================
print("\nTraining TabTransformer (PyTorch)...")

class TabDatasetTorch(Dataset):
    def __init__(self, X_np, y_np):
        self.X = torch.tensor(X_np, dtype=torch.float32)
        self.y = torch.tensor(y_np, dtype=torch.float32).unsqueeze(1)
    def __len__(self): return len(self.y)
    def __getitem__(self, idx): return self.X[idx], self.y[idx]

class TabTransformerTorch(nn.Module):
    def __init__(self, input_dim, emb_dim=16, n_heads=2, mlp_hidden=64, dropout=0.1):
        super().__init__()
        self.proj = nn.Linear(1, emb_dim)
        self.mha = nn.MultiheadAttention(embed_dim=emb_dim, num_heads=n_heads, batch_first=True)
        self.ff = nn.Sequential(nn.Linear(emb_dim, emb_dim), nn.ReLU())
        self.classifier = nn.Sequential(
            nn.Linear(input_dim*emb_dim, mlp_hidden), nn.ReLU(), nn.Dropout(dropout),
            nn.Linear(mlp_hidden, 1)
        )
    def forward(self, x):
        b,f = x.shape
        x = x.unsqueeze(-1)            # (b,f,1)
        emb = self.proj(x)             # (b,f,emb_dim)
        attn_out, attn_weights = self.mha(emb, emb, emb, need_weights=True, average_attn_weights=False)
        out = self.ff(emb + attn_out)
        flat = out.reshape(b, -1)
        logits = self.classifier(flat)
        return logits, attn_weights, out

# PyTorch focal loss
class FocalLossTorch(nn.Module):
    def __init__(self, alpha=0.25, gamma=2.0):
        super().__init__()
        self.alpha = alpha
        self.gamma = gamma
    def forward(self, logits, targets):
        bce = nn.BCEWithLogitsLoss(reduction='none')(logits, targets)
        probs = torch.sigmoid(logits)
        pt = torch.where(targets == 1, probs, 1 - probs)
        focal = self.alpha * (1 - pt) ** self.gamma * bce
        return focal.mean()

def train_tabtransformer(X_train_res, y_train_res, X_val, y_val, X_test, y_test,
                         n_epochs=EPOCHS_TAB, batch_size=BATCH_TAB, use_focal=False, device=None):
    if device is None:
        device = "cuda" if torch.cuda.is_available() else "cpu"
    model = TabTransformerTorch(input_dim=X_train_res.shape[1]).to(device)
    opt = torch.optim.Adam(model.parameters(), lr=1e-3)
    loss_fn = FocalLossTorch(alpha=0.25, gamma=2.0) if use_focal else nn.BCEWithLogitsLoss()

    train_ds = TabDatasetTorch(X_train_res, y_train_res)
    val_ds = TabDatasetTorch(X_val, y_val)
    test_ds = TabDatasetTorch(X_test, y_test)

    train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
    val_loader   = DataLoader(val_ds, batch_size=batch_size, shuffle=False)
    test_loader  = DataLoader(test_ds, batch_size=batch_size, shuffle=False)

    best_val_loss = np.inf; best_state = None
    for epoch in range(n_epochs):
        model.train(); total_loss = 0.0
        for xb, yb in train_loader:
            xb, yb = xb.to(device), yb.to(device)
            opt.zero_grad()
            logits, _, _ = model(xb)
            loss = loss_fn(logits, yb)
            loss.backward(); opt.step()
            total_loss += loss.item()
        # val
        model.eval(); val_losses = []
        with torch.no_grad():
            for xb, yb in val_loader:
                xb, yb = xb.to(device), yb.to(device)
                logits, _, _ = model(xb)
                val_losses.append(loss_fn(logits,yb).item())
        mean_val_loss = np.mean(val_losses) if len(val_losses)>0 else np.nan
        print(f"Tab epoch {epoch+1}/{n_epochs} train_loss={total_loss/len(train_loader):.4f} val_loss={mean_val_loss:.4f}")
        if mean_val_loss < best_val_loss:
            best_val_loss = mean_val_loss
            best_state = {k:v.cpu() for k,v in model.state_dict().items()}

    if best_state is not None:
        model.load_state_dict(best_state)

    # test inference
    model.eval()
    all_probs, all_labels, attn_maps = [], [], []
    with torch.no_grad():
        for xb, yb in test_loader:
            xb = xb.to(device)
            logits, attn_weights, _ = model(xb)
            probs = torch.sigmoid(logits).cpu().numpy().flatten()
            all_probs.append(probs)
            all_labels.append(yb.numpy().flatten())

            # robust attn aggregation
            attn_np = attn_weights.cpu().numpy()
            if attn_np.ndim == 4:
                attn_np2 = attn_np.mean(axis=1)     # (batch, f, f)
                attn_feat = attn_np2.mean(axis=(0,2))
            elif attn_np.ndim == 3:
                attn_feat = attn_np.mean(axis=(0,2))
            elif attn_np.ndim == 2:
                attn_feat = attn_np.mean(axis=0)
            else:
                raise ValueError("Unexpected attn shape: {}".format(attn_np.shape))
            attn_maps.append(attn_feat)

    probs_test = np.concatenate(all_probs)
    labels_test = np.concatenate(all_labels)
    metrics = compute_metrics(labels_test, probs_test)
    cm = confusion_matrix(labels_test, (probs_test>=0.5).astype(int))
    attn_mean = np.mean(np.vstack(attn_maps), axis=0) if len(attn_maps)>0 else np.zeros(X_train_res.shape[1])
    return metrics, cm, (labels_test, probs_test), attn_mean, model

metrics_tab, cm_tab, (labels_tab, probs_tab), attn_mean, tab_model = train_tabtransformer(
    X_train_res, y_train_res, X_val, y_val_raw, X_test, y_test_raw,
    n_epochs=EPOCHS_TAB, batch_size=BATCH_TAB, use_focal=True
)
eval_and_store("TabTransformer", probs_tab, probs_tab, labels_tab, labels_tab)  # use val/test= same for tab (we used val during training)

# save attention
df_att = pd.DataFrame({"feature": FEAT_NAMES, "attn": attn_mean})
df_att.sort_values("attn", ascending=False).to_csv(os.path.join(OUTDIR,"tab_attention.csv"), index=False)
plt.figure(figsize=(8,6)); sns.barplot(x="attn", y="feature", data=df_att.head(15)); plt.title("Top 15 Tab Attention"); plt.tight_layout(); plt.savefig(os.path.join(OUTDIR,"tab_attention_top15.png")); plt.close()

# =========================
# Section 10 — SHAP for XGB (optional, may be slow)
# =========================
try:
    sample_n = min(2000, X_test.shape[0])
    explainer = shap.TreeExplainer(xgb_model)
    shap_values = explainer.shap_values(X_test[:sample_n])
    plt.figure()
    shap.summary_plot(shap_values, X_test[:sample_n], feature_names=FEAT_NAMES, show=False)
    plt.tight_layout(); plt.savefig(os.path.join(OUTDIR,"shap_summary_xgb.png"), dpi=300); plt.close()
except Exception as e:
    print("SHAP skipped or failed:", e)

# =========================
# Section 11 — Plots & Final Tables (Baseline & Optimized)
# =========================
probs_dict_test = {
    "LogReg": probs_log_test,
    "RandomForest": probs_rf_test,
    "XGBoost": probs_xgb_test,
    "LightGBM": probs_lgb_test,
    "ANN": probs_ann_test,
    "DeepMLP": probs_dmlp_test,
    "Ensemble": probs_ens_test,
    "TabTransformer": probs_tab
}

# ROC plot
plt.figure(figsize=(8,6))
for name, p in probs_dict_test.items():
    fpr, tpr, _ = roc_curve(y_test_raw, p)
    aucv = roc_auc_score(y_test_raw, p)
    plt.plot(fpr, tpr, label=f"{name} AUC={aucv:.3f}")
plt.plot([0,1],[0,1],'k--'); plt.xlabel("FPR"); plt.ylabel("TPR"); plt.title("ROC (Test)"); plt.legend(bbox_to_anchor=(1.05,1)); plt.tight_layout(); plt.savefig(os.path.join(OUTDIR,"roc_all.png")); plt.close()

# PR plot
plt.figure(figsize=(8,6))
for name, p in probs_dict_test.items():
    prec, rec, _ = precision_recall_curve(y_test_raw, p)
    ap = average_precision_score(y_test_raw, p)
    plt.plot(rec, prec, label=f"{name} AP={ap:.3f}")
plt.xlabel("Recall"); plt.ylabel("Precision"); plt.title("PR (Test)"); plt.legend(bbox_to_anchor=(1.05,1)); plt.tight_layout(); plt.savefig(os.path.join(OUTDIR,"pr_all.png")); plt.close()

# Calibration (LogReg, XGB, Tab)
plt.figure(figsize=(6,6))
for name in ["LogReg","XGBoost","TabTransformer"]:
    p = probs_dict_test[name]
    frac_pos, mean_pred = calibration_curve(y_test_raw, p, n_bins=10)
    plt.plot(mean_pred, frac_pos, marker='o', label=name)
plt.plot([0,1],[0,1],'k--'); plt.xlabel("Mean pred prob"); plt.ylabel("Fraction positives"); plt.title("Calibration"); plt.legend(); plt.tight_layout(); plt.savefig(os.path.join(OUTDIR,"calibration.png")); plt.close()

# Confusion matrices
cms = {
    "LogReg": confusion_matrix(y_test_raw, (probs_log_test>=0.5).astype(int)),
    "RandomForest": confusion_matrix(y_test_raw, (probs_rf_test>=0.5).astype(int)),
    "XGBoost": confusion_matrix(y_test_raw, (probs_xgb_test>=0.5).astype(int)),
    "LightGBM": confusion_matrix(y_test_raw, (probs_lgb_test>=0.5).astype(int)),
    "ANN": confusion_matrix(y_test_raw, (probs_ann_test>=0.5).astype(int)),
    "DeepMLP": confusion_matrix(y_test_raw, (probs_dmlp_test>=0.5).astype(int)),
    "Ensemble": confusion_matrix(y_test_raw, (probs_ens_test>=0.5).astype(int)),
    "TabTransformer": cm_tab
}
for name, cm in cms.items():
    plt.figure(figsize=(4,3))
    sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", xticklabels=["Pred0","Pred1"], yticklabels=["True0","True1"])
    plt.title(f"{name} Confusion Matrix"); plt.tight_layout(); plt.savefig(os.path.join(OUTDIR,f"cm_{name}.png")); plt.close()

# Final CSVs for baseline & optimized results
df_base = pd.DataFrame(results_baseline).T
df_opt  = pd.DataFrame(results_optimized).T
df_base.to_csv(os.path.join(OUTDIR,"results_baseline.csv"))
df_opt.to_csv(os.path.join(OUTDIR,"results_optimized.csv"))
df_compare = df_base.add_suffix("_base").merge(df_opt.add_suffix("_opt"), left_index=True, right_index=True)
df_compare.to_csv(os.path.join(OUTDIR,"results_improved.csv"))

print("\nBaseline (threshold=0.5):\n", df_base)
print("\nOptimized (val-tuned threshold):\n", df_opt)
print("\nSide-by-side saved to outputs/results_improved.csv")

# =========================
# Section 12 — simple bootstrap for AUC CI (Tab vs XGB)
# =========================
def bootstrap_auc_ci(y, probs, n_boot=1000):
    rng = np.random.RandomState(SEED)
    aucs = []
    n = len(y)
    for i in range(n_boot):
        idx = rng.randint(0, n, n)
        if len(np.unique(y[idx])) < 2: continue
        aucs.append(roc_auc_score(y[idx], probs[idx]))
    return np.mean(aucs), (np.percentile(aucs, 2.5), np.percentile(aucs,97.5))

tab_mu, tab_ci = bootstrap_auc_ci(y_test_raw, probs_tab)
xgb_mu, xgb_ci = bootstrap_auc_ci(y_test_raw, probs_xgb_test)
print(f"TabTransformer AUC {tab_mu:.4f}, 95% CI {tab_ci}")
print(f"XGBoost AUC       {xgb_mu:.4f}, 95% CI {xgb_ci}")

# Save metadata
meta = {
    "seed": SEED, "n_samples": int(X_train.shape[0] + X_val.shape[0] + X_test.shape[0]),
    "n_features": int(X_train.shape[1]), "train_shape": X_train_res.shape, "val_shape": X_val.shape, "test_shape": X_test.shape,
    "epochs_tab": EPOCHS_TAB, "epochs_deepmlp": EPOCHS_DEEPMLP
}
with open(os.path.join(OUTDIR,"run_meta.json"), "w") as f:
    json.dump(meta, f, indent=2)

print("All outputs saved to", OUTDIR)


Raw shape: (253680, 22)
Using features (count): 21
Split sizes (train/val/test): (177576, 21) (38052, 21) (38052, 21)
Processed shapes: (177576, 21) (38052, 21) (38052, 21)
NaNs after preprocess: 0 0 0
After SMOTE: (321702, 21) [160851 160851]

Training classical models...
LogReg | best_t=0.690 (val F1=0.413) | base_f1=0.376 opt_f1=0.417
RandomForest | best_t=0.220 (val F1=0.378) | base_f1=0.267 opt_f1=0.390
XGBoost | best_t=0.210 (val F1=0.409) | base_f1=0.218 opt_f1=0.407
[LightGBM] [Info] Number of positive: 160851, number of negative: 160851
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.045731 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 5355
[LightGBM] [Info] Number of data points in the train set: 321702, number of used features: 21
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
LightGBM | best