In [None]:
# =============================================================================
# HEART DISEASE PREDICTION — POST-EDA MODELING PIPELINE
# =============================================================================
# Dataset   : Heart Disease (Clinical Features)
# Target    : Heart Disease (Absence = 0, Presence = 1)
# Objective : Predict coronary artery disease probability (ROC-AUC)
# Models    : LightGBM · XGBoost · CatBoost · Stacking Ensemble
# =============================================================================

import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold, cross_validate
from sklearn.metrics import roc_auc_score, classification_report
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import HistGradientBoostingClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier, Pool
import warnings
warnings.filterwarnings("ignore")

# =============================================================================
# SECTION 1 — DATA LOADING & PREPROCESSING
# =============================================================================
# Reads the train/test CSVs, turns the text target into 0/1 integers, and
# removes the identifier column from the training frame.
print("=" * 70, "SECTION 1: DATA LOADING & PREPROCESSING", "=" * 70, sep="\n")

df = pd.read_csv("/content/train.csv")
test = pd.read_csv("/content/test.csv")

# ── Target encoding ──────────────────────────────────────────────────────────
# Any label that is not a key of the mapping becomes NaN here.
label_to_int = {"Absence": 0, "Presence": 1}
df["Heart Disease"] = df["Heart Disease"].map(label_to_int)

# ── Clean-up chain ───────────────────────────────────────────────────────────
# 1. drop rows whose target is NaN (the original notebook notes this is 1 row)
# 2. cast the now NaN-free target to int
# 3. drop the ID column, which is not a feature
df = (
    df.dropna(subset=["Heart Disease"])
      .astype({"Heart Disease": int})
      .drop(columns=["id"])
)

print(f"Train shape : {df.shape}")
print(f"Test  shape : {test.shape}")
print(f"Target distribution:\n{df['Heart Disease'].value_counts()}\n")


# =============================================================================
# SECTION 2 — FEATURE GROUPS & PREPROCESSOR DEFINITIONS
# =============================================================================
# Declares which columns belong to which feature family and builds the two
# ColumnTransformers (tree-oriented and linear-oriented) from those lists.
print("=" * 70, "SECTION 2: FEATURE GROUPS & PREPROCESSORS", "=" * 70, sep="\n")

# ── Feature groups (based on EDA findings) ───────────────────────────────────
numeric_features = ["Age", "BP", "Cholesterol", "Max HR", "ST depression"]

binary_features = ["Sex", "FBS over 120", "Exercise angina"]

# All four are treated as nominal (rationale per the EDA notes):
#   Chest pain type — 4 unordered categories
#   EKG results     — values 0/1/2 are labels, not a scale
#   Slope of ST     — non-linear jump between levels
#   Thallium        — non-consecutive codes 3/6/7
categorical_features = ["Chest pain type", "EKG results", "Slope of ST", "Thallium"]

# Ordinal — monotonic risk increase 0→3 (per the EDA notes).
ordinal_features = ["Number of vessels fluro"]

TARGET = "Heart Disease"

# ── Preprocessor for tree-based models (no scaling needed) ───────────────────
# Only the nominal columns are transformed; everything else is passed through.
tree_transformers = [
    ("num", "passthrough", numeric_features),
    ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features),
    ("bin", "passthrough", binary_features),
    ("ord", "passthrough", ordinal_features),
]
preprocessor_tree = ColumnTransformer(transformers=tree_transformers)

# ── Preprocessor for linear models (StandardScaler + OneHot) ─────────────────
# Numeric columns are standardised and the first one-hot level is dropped.
# This transformer is not referenced by the sections visible in this file.
linear_transformers = [
    ("num", StandardScaler(), numeric_features),
    ("cat", OneHotEncoder(drop="first", handle_unknown="ignore"), categorical_features),
    ("bin", "passthrough", binary_features),
    ("ord", "passthrough", ordinal_features),
]
preprocessor_linear = ColumnTransformer(transformers=linear_transformers)

print("Feature groups defined.")
print(f"  Numeric     : {len(numeric_features)} features")
print(f"  Categorical : {len(categorical_features)} features (OneHot encoded)")
print(f"  Binary      : {len(binary_features)} features (pass-through)")
print(f"  Ordinal     : {len(ordinal_features)} feature  (pass-through)\n")


# =============================================================================
# SECTION 3 — CROSS-VALIDATION SETUP
# =============================================================================
# Splits the frames into features / target / ids and creates the shared
# stratified splitter used by every model section below.
print("=" * 70, "SECTION 3: CROSS-VALIDATION SETUP", "=" * 70, sep="\n")

# Training features and target.
y = df[TARGET]
X = df.drop(columns=[TARGET])

# Test ids are kept aside for the submission files; the rest are features.
test_ids = test["id"]
X_test = test.drop(columns=["id"])

# Fixed seed so every section iterates over the same five folds.
skf_settings = {"n_splits": 5, "shuffle": True, "random_state": 42}
SKF = StratifiedKFold(**skf_settings)

print("Strategy : 5-Fold Stratified K-Fold (maintains class ratio per fold)")
print(f"Train size: {len(X):,}  |  Positive rate: {y.mean():.4f}\n")


# =============================================================================
# SECTION 4 — MODEL 1: LightGBM (Optuna-Tuned Parameters)
# =============================================================================
# Builds the LightGBM pipeline, scores it with 5-fold out-of-fold predictions,
# then refits on the full training set to predict the test rows.
print("=" * 70, "SECTION 4: MODEL 1 — LightGBM Classifier", "=" * 70, sep="\n")
print(
    "  Best hyperparameters sourced from Optuna search (30 trials).",
    "  Scoring metric: ROC-AUC  |  Validation: 5-Fold OOF\n",
    sep="\n",
)

# NOTE(review): LightGBM documents row bagging as active only when the bagging
# frequency is non-zero; no subsample_freq is set here, so `subsample` may have
# no effect — confirm against how the Optuna search was run before changing it.
lgb_params = {
    "n_estimators": 1661,
    "learning_rate": 0.012503626241860565,
    "num_leaves": 31,
    "max_depth": 6,
    "min_child_samples": 43,
    "subsample": 0.643072395692159,
    "colsample_bytree": 0.7211161958467457,
    "reg_alpha": 1.0504857541588257,
    "reg_lambda": 0.11946226125639381,
    "random_state": 42,
    "n_jobs": -1,
    "verbosity": -1,
}
lgb_model = LGBMClassifier(**lgb_params)

lgb_pipeline = Pipeline(
    steps=[("preprocessor", preprocessor_tree), ("model", lgb_model)]
)

# OOF evaluation: each row is predicted by a pipeline fitted without it.
oof_lgb = np.zeros(len(X))
for fold_no, (tr_idx, val_idx) in enumerate(SKF.split(X, y), start=1):
    lgb_pipeline.fit(X.iloc[tr_idx], y.iloc[tr_idx])
    fold_scores = lgb_pipeline.predict_proba(X.iloc[val_idx])[:, 1]
    oof_lgb[val_idx] = fold_scores
    print(f"  Fold {fold_no} AUC: {roc_auc_score(y.iloc[val_idx], fold_scores):.5f}")

lgb_oof_auc = roc_auc_score(y, oof_lgb)
print(f"\n  LightGBM OOF ROC-AUC : {lgb_oof_auc:.5f}\n")

# Full refit on all training data, then score the test set.
lgb_pipeline.fit(X, y)
lgb_test_preds = lgb_pipeline.predict_proba(X_test)[:, 1]


# =============================================================================
# SECTION 5 — MODEL 2: XGBoost (Optuna-Tuned Parameters)
# =============================================================================
# Builds the XGBoost pipeline, scores it with 5-fold out-of-fold predictions,
# then refits on the full training set to predict the test rows.
print("=" * 70, "SECTION 5: MODEL 2 — XGBoost Classifier", "=" * 70, sep="\n")
print(
    "  Best hyperparameters sourced from Optuna search (Trial 0).",
    "  Scoring metric: ROC-AUC  |  Validation: 5-Fold OOF\n",
    sep="\n",
)

xgb_params = {
    "n_estimators": 972,
    "learning_rate": 0.08233334476657686,
    "max_depth": 3,
    "subsample": 0.6967792979720865,
    "colsample_bytree": 0.7773146292728021,
    "reg_alpha": 1.911349598671315,
    "reg_lambda": 0.6194119678307304,
    "tree_method": "hist",
    "eval_metric": "logloss",
    "random_state": 42,
    "n_jobs": -1,
}
xgb_model = XGBClassifier(**xgb_params)

# NOTE: shares the `preprocessor_tree` instance with the other tree pipeline;
# fitting this pipeline refits that shared transformer in place.
xgb_pipeline = Pipeline(
    steps=[("preprocessor", preprocessor_tree), ("model", xgb_model)]
)

# OOF evaluation: each row is predicted by a pipeline fitted without it.
oof_xgb = np.zeros(len(X))
for fold_no, (tr_idx, val_idx) in enumerate(SKF.split(X, y), start=1):
    xgb_pipeline.fit(X.iloc[tr_idx], y.iloc[tr_idx])
    fold_scores = xgb_pipeline.predict_proba(X.iloc[val_idx])[:, 1]
    oof_xgb[val_idx] = fold_scores
    print(f"  Fold {fold_no} AUC: {roc_auc_score(y.iloc[val_idx], fold_scores):.5f}")

xgb_oof_auc = roc_auc_score(y, oof_xgb)
print(f"\n  XGBoost OOF ROC-AUC  : {xgb_oof_auc:.5f}\n")

# Full refit on all training data, then score the test set.
xgb_pipeline.fit(X, y)
xgb_test_preds = xgb_pipeline.predict_proba(X_test)[:, 1]


# =============================================================================
# SECTION 6 — MODEL 3: CatBoost (Optuna-Tuned Parameters)
# =============================================================================
# Section banner followed by the three descriptive lines for this model.
print("=" * 70, "SECTION 6: MODEL 3 — CatBoost Classifier", "=" * 70, sep="\n")
print(
    "  Best hyperparameters sourced from Optuna search (Trial 3).",
    "  CatBoost handles categoricals natively — no OneHot encoding needed.",
    "  Scoring metric: ROC-AUC  |  Validation: 5-Fold OOF\n",
    sep="\n",
)

# Columns handed to CatBoost as categorical features; they are cast to strings
# by prepare_catboost_data before being placed in a Pool.
cat_cols_cb = [
    "Sex",
    "Chest pain type",
    "FBS over 120",
    "EKG results",
    "Exercise angina",
    "Slope of ST",
    "Number of vessels fluro",
    "Thallium",
]


def prepare_catboost_data(data: pd.DataFrame) -> pd.DataFrame:
    """Return a new frame with every ``cat_cols_cb`` column cast to ``str``.

    The input frame is not modified, and columns that are not listed in
    ``cat_cols_cb`` keep their original dtype.

    Raises:
        KeyError: if *data* lacks one of the ``cat_cols_cb`` columns.
    """
    string_dtypes = dict.fromkeys(cat_cols_cb, str)
    return data.astype(string_dtypes)

# String-cast copies of the feature frames for CatBoost's categorical handling.
X_cb      = prepare_catboost_data(X)
X_test_cb = prepare_catboost_data(X_test)

# Single source of truth for the tuned hyperparameters, shared by the per-fold
# models and the final full-data model so the two can never drift apart.
cat_params = dict(
    iterations          = 2443,
    learning_rate       = 0.028617286398439353,
    depth               = 6,
    l2_leaf_reg         = 3.5313325975665264,
    bagging_temperature = 0.5274409717782269,
    random_strength     = 0.03843459373261249,
    random_seed         = 42,
    verbose             = 0,
)

cat_model = CatBoostClassifier(**cat_params)

# OOF evaluation (using CatBoost Pool API for proper categorical handling).
#
# Each fold model is trained exactly like the final model: fixed iteration
# count, no eval_set, no early stopping. Previously every fold early-stopped
# on the very fold it was then scored on, which (a) lets the validation labels
# pick the iteration count and so inflates the OOF AUC, and (b) makes the OOF
# predictions come from a different training regime than `cat_test_preds`,
# even though both feed the same blend weights and meta-learner downstream.
oof_cat = np.zeros(len(X_cb))
for fold, (tr_idx, val_idx) in enumerate(SKF.split(X_cb, y)):
    X_tr, X_val = X_cb.iloc[tr_idx],  X_cb.iloc[val_idx]
    y_tr, y_val = y.iloc[tr_idx],     y.iloc[val_idx]

    train_pool = Pool(X_tr, y_tr, cat_features=cat_cols_cb)
    # Validation pool carries features only; its labels are used solely for
    # scoring below, never during fitting.
    val_pool   = Pool(X_val, cat_features=cat_cols_cb)

    fold_model = CatBoostClassifier(**cat_params)
    fold_model.fit(train_pool)
    oof_cat[val_idx] = fold_model.predict_proba(val_pool)[:, 1]
    print(f"  Fold {fold+1} AUC: {roc_auc_score(y_val, oof_cat[val_idx]):.5f}")

cat_oof_auc = roc_auc_score(y, oof_cat)
print(f"\n  CatBoost OOF ROC-AUC : {cat_oof_auc:.5f}\n")

# Full refit on all training rows, then score the test set.
full_pool       = Pool(X_cb, y, cat_features=cat_cols_cb)
test_pool       = Pool(X_test_cb, cat_features=cat_cols_cb)
cat_model.fit(full_pool)
cat_test_preds  = cat_model.predict_proba(test_pool)[:, 1]


# =============================================================================
# SECTION 7 — MODEL COMPARISON SUMMARY
# =============================================================================
# Prints the three base models ranked by their out-of-fold ROC-AUC.
print("=" * 70, "SECTION 7: INDIVIDUAL MODEL COMPARISON", "=" * 70, sep="\n")

base_model_names = ["LightGBM", "XGBoost", "CatBoost"]
base_model_aucs = [lgb_oof_auc, xgb_oof_auc, cat_oof_auc]

# Best model first; the index is reset so rows are numbered in ranked order.
model_summary = (
    pd.DataFrame({"Model": base_model_names, "OOF ROC-AUC": base_model_aucs})
    .sort_values("OOF ROC-AUC", ascending=False)
    .reset_index(drop=True)
)
print(model_summary.to_string(index=False), end="\n\n")


# =============================================================================
# SECTION 8 — BLENDING: WEIGHTED AVERAGE ENSEMBLE
# =============================================================================
# Grid-searches LightGBM/XGBoost weights (CatBoost takes the remainder), keeps
# the combination with the highest OOF ROC-AUC, and applies it to the test
# predictions.
print(
    "=" * 70,
    "SECTION 8: BLENDING — Weighted Average Ensemble (LGB + XGB + CAT)",
    "=" * 70,
    sep="\n",
)
print(
    "  Strategy: Sweep blend weights and pick best OOF AUC combination.",
    "  Only two weights needed — third is derived (sums to 1.0).\n",
    sep="\n",
)

# Fallback values, used only if no grid point passes the CatBoost-weight filter.
best_blend_auc    = 0
best_blend_weights = (0.33, 0.33, 0.34)

# Candidate weights 0.1 … 0.6 for each of the two free models.
weight_grid = np.arange(0.1, 0.7, 0.1)

for cand_lgb in weight_grid:
    for cand_xgb in weight_grid:
        # Remainder goes to CatBoost; rounding removes float noise from the
        # subtraction.
        cand_cat = round(1.0 - cand_lgb - cand_xgb, 2)
        if 0.05 <= cand_cat <= 0.8:
            blend = cand_lgb * oof_lgb + cand_xgb * oof_xgb + cand_cat * oof_cat
            auc = roc_auc_score(y, blend)
            # Strict comparison: on ties the earliest grid point is kept.
            if auc > best_blend_auc:
                best_blend_auc, best_blend_weights = auc, (cand_lgb, cand_xgb, cand_cat)

w_lgb, w_xgb, w_cat = best_blend_weights
print(f"  Best blend weights → LGB: {w_lgb:.1f} | XGB: {w_xgb:.1f} | CAT: {w_cat:.2f}")
print(f"  Blended OOF ROC-AUC     : {best_blend_auc:.5f}\n")

# Apply the selected weights to the full-refit test predictions.
blend_test_preds = w_lgb * lgb_test_preds + w_xgb * xgb_test_preds + w_cat * cat_test_preds


# =============================================================================
# SECTION 9 — STACKING: META-LEARNER ENSEMBLE
# =============================================================================
# Trains a HistGradientBoosting meta-learner on the base models' OOF
# predictions, reports a genuinely out-of-fold AUC for it, and predicts the
# test set from the base models' test predictions.
print("=" * 70)
print("SECTION 9: STACKING ENSEMBLE — HistGradientBoosting Meta-Learner")
print("=" * 70)
print("  Base learners : LightGBM · XGBoost · CatBoost (OOF predictions)")
print("  Meta-learner  : HistGradientBoostingClassifier")
print("  Note: Meta-learner is trained on OOF predictions to avoid leakage.\n")

# Stack base-model predictions column-wise as features for the meta-learner.
stacked_train = np.column_stack([oof_lgb, oof_xgb, oof_cat])
stacked_test  = np.column_stack([lgb_test_preds, xgb_test_preds, cat_test_preds])

# One parameter set shared by the per-fold and final meta-learners.
meta_params = dict(
    max_depth     = 3,
    learning_rate = 0.05,
    max_iter      = 300,
    random_state  = 42,
)

# Out-of-fold predictions for the meta-learner itself. Scoring the meta-model
# on the rows it was fitted on (as was done before) yields an in-sample AUC
# that is not comparable with the base models' OOF AUCs and would bias the
# "best approach" choice toward stacking.
stacked_oof_preds = np.zeros(len(stacked_train))
for tr_idx, val_idx in SKF.split(stacked_train, y):
    fold_meta = HistGradientBoostingClassifier(**meta_params)
    fold_meta.fit(stacked_train[tr_idx], y.iloc[tr_idx])
    stacked_oof_preds[val_idx] = fold_meta.predict_proba(stacked_train[val_idx])[:, 1]

stacked_oof_auc = roc_auc_score(y, stacked_oof_preds)

# Final meta-learner: fitted on every training row, used only for the test set.
meta_model = HistGradientBoostingClassifier(**meta_params)
meta_model.fit(stacked_train, y)
stacked_test_preds = meta_model.predict_proba(stacked_test)[:, 1]

print(f"  Stacked OOF ROC-AUC : {stacked_oof_auc:.5f}\n")


# =============================================================================
# SECTION 10 — FINAL RESULTS SUMMARY
# =============================================================================
# Ranks all five approaches by OOF ROC-AUC, prints the table with scores
# formatted to five decimals, and names the top approach.
print("=" * 70, "SECTION 10: FINAL RESULTS SUMMARY", "=" * 70, sep="\n")

approach_names = [
    "LightGBM",
    "XGBoost",
    "CatBoost",
    "Blend (LGB+XGB+CAT)",
    "Stacking Ensemble",
]
approach_aucs = [
    lgb_oof_auc,
    xgb_oof_auc,
    cat_oof_auc,
    best_blend_auc,
    stacked_oof_auc,
]

results_df = (
    pd.DataFrame({"Approach": approach_names, "OOF ROC-AUC": approach_aucs})
    .sort_values("OOF ROC-AUC", ascending=False)
    .reset_index(drop=True)
)
# From here on the score column holds formatted strings, not floats.
results_df["OOF ROC-AUC"] = results_df["OOF ROC-AUC"].map("{:.5f}".format)
print(results_df.to_string(index=False), end="\n\n")

# Identify best approach (scores are parsed back to float for the comparison).
auc_as_float  = results_df["OOF ROC-AUC"].astype(float)
best_idx      = auc_as_float.idxmax()
best_row      = results_df.loc[best_idx]
best_approach = best_row["Approach"]
best_auc      = best_row["OOF ROC-AUC"]
print(f"  ✓ Best approach : {best_approach}  (OOF AUC = {best_auc})\n")


# =============================================================================
# SECTION 11 — SUBMISSION FILES
# =============================================================================
# Section banner for the submission-writing step.
print("=" * 70, "SECTION 11: GENERATING SUBMISSION FILES", "=" * 70, sep="\n")

def save_submission(preds: np.ndarray, ids: pd.Series, filename: str) -> None:
    """Write a two-column submission CSV and report how many rows were saved.

    Args:
        preds: Predicted values, written under the ``Heart Disease`` column.
        ids: Row identifiers, written under the ``id`` column.
        filename: Destination path of the CSV file (written without the index).
    """
    columns = {"id": ids, "Heart Disease": preds}
    submission = pd.DataFrame(columns)
    submission.to_csv(filename, index=False)

    row_count = len(submission)
    print(f"  Saved: {filename}  ({row_count:,} rows)")

# One submission file per approach, written in the same order as before.
submission_outputs = [
    (lgb_test_preds,     "submission_lightgbm.csv"),
    (xgb_test_preds,     "submission_xgboost.csv"),
    (cat_test_preds,     "submission_catboost.csv"),
    (blend_test_preds,   "submission_blend.csv"),
    (stacked_test_preds, "submission_stacked.csv"),
]
for approach_preds, approach_file in submission_outputs:
    save_submission(approach_preds, test_ids, approach_file)

print("\nAll submissions generated successfully.", "=" * 70, sep="\n")