In [1]:
pip install optuna

Collecting optuna
  Downloading optuna-4.7.0-py3-none-any.whl.metadata (17 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.10.1-py3-none-any.whl.metadata (11 kB)
Downloading optuna-4.7.0-py3-none-any.whl (413 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m413.9/413.9 kB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorlog-6.10.1-py3-none-any.whl (11 kB)
Installing collected packages: colorlog, optuna
Successfully installed colorlog-6.10.1 optuna-4.7.0


In [2]:
pip install catboost

Collecting catboost
  Downloading catboost-1.2.10-cp312-cp312-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.10-cp312-cp312-manylinux2014_x86_64.whl (97.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m97.1/97.1 MB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.10


In [3]:
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold, cross_validate
from sklearn.metrics import roc_auc_score, classification_report
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import HistGradientBoostingClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier, Pool
import warnings
warnings.filterwarnings("ignore")

In [4]:
print("=" * 70)
print("SECTION 1: DATA LOADING & PREPROCESSING")
print("=" * 70)

df    = pd.read_csv("/content/train.csv")
test  = pd.read_csv("/content/test.csv")

# ── Target encoding ──────────────────────────────────────────────────────────
df["Heart Disease"] = df["Heart Disease"].map({"Absence": 0, "Presence": 1})

# ── Drop rows where target is NaN (1 row) ────────────────────────────────────
df.dropna(subset=["Heart Disease"], inplace=True)
df["Heart Disease"] = df["Heart Disease"].astype(int)

# ── Drop ID column ────────────────────────────────────────────────────────────
df.drop(columns=["id"], inplace=True)

print(f"Train shape : {df.shape}")
print(f"Test  shape : {test.shape}")
print(f"Target distribution:\n{df['Heart Disease'].value_counts()}\n")


SECTION 1: DATA LOADING & PREPROCESSING
Train shape : (630000, 14)
Test  shape : (270000, 14)
Target distribution:
Heart Disease
0    347546
1    282454
Name: count, dtype: int64



In [5]:
print("=" * 70)
print("SECTION 2: FEATURE GROUPS & PREPROCESSORS")
print("=" * 70)

# ── Feature groups (based on EDA findings) ───────────────────────────────────
numeric_features = [
    "Age", "BP", "Cholesterol", "Max HR", "ST depression"
]

binary_features = [
    "Sex", "FBS over 120", "Exercise angina"
]

categorical_features = [
    "Chest pain type",   # nominal — 4 unordered categories
    "EKG results",       # nominal — values 0/1/2 are labels, not a scale
    "Slope of ST",       # nominal — non-linear jump between levels (EDA)
    "Thallium"           # nominal — non-consecutive codes 3/6/7
]

ordinal_features = [
    "Number of vessels fluro"   # ordinal — monotonic risk increase 0→3
]

TARGET = "Heart Disease"

# ── Preprocessor for tree-based models (no scaling needed) ───────────────────
preprocessor_tree = ColumnTransformer(transformers=[
    ("num", "passthrough",                                  numeric_features),
    ("cat", OneHotEncoder(handle_unknown="ignore"),         categorical_features),
    ("bin", "passthrough",                                  binary_features),
    ("ord", "passthrough",                                  ordinal_features),
])

# ── Preprocessor for linear models (StandardScaler + OneHot) ─────────────────
preprocessor_linear = ColumnTransformer(transformers=[
    ("num", StandardScaler(),                               numeric_features),
    ("cat", OneHotEncoder(drop="first", handle_unknown="ignore"), categorical_features),
    ("bin", "passthrough",                                  binary_features),
    ("ord", "passthrough",                                  ordinal_features),
])

print("Feature groups defined.")
print(f"  Numeric     : {len(numeric_features)} features")
print(f"  Categorical : {len(categorical_features)} features (OneHot encoded)")
print(f"  Binary      : {len(binary_features)} features (pass-through)")
print(f"  Ordinal     : {len(ordinal_features)} feature  (pass-through)\n")


SECTION 2: FEATURE GROUPS & PREPROCESSORS
Feature groups defined.
  Numeric     : 5 features
  Categorical : 4 features (OneHot encoded)
  Binary      : 3 features (pass-through)
  Ordinal     : 1 feature  (pass-through)



In [6]:
print("=" * 70)
print("SECTION 3: CROSS-VALIDATION SETUP")
print("=" * 70)

X = df.drop(columns=[TARGET])
y = df[TARGET]

X_test    = test.drop(columns=["id"])
test_ids  = test["id"]

SKF = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

print("Strategy : 5-Fold Stratified K-Fold (maintains class ratio per fold)")
print(f"Train size: {len(X):,}  |  Positive rate: {y.mean():.4f}\n")


SECTION 3: CROSS-VALIDATION SETUP
Strategy : 5-Fold Stratified K-Fold (maintains class ratio per fold)
Train size: 630,000  |  Positive rate: 0.4483



In [8]:
pip install mlflow

Collecting mlflow
  Downloading mlflow-3.9.0-py3-none-any.whl.metadata (31 kB)
Collecting mlflow-skinny==3.9.0 (from mlflow)
  Downloading mlflow_skinny-3.9.0-py3-none-any.whl.metadata (32 kB)
Collecting mlflow-tracing==3.9.0 (from mlflow)
  Downloading mlflow_tracing-3.9.0-py3-none-any.whl.metadata (19 kB)
Collecting Flask-CORS<7 (from mlflow)
  Downloading flask_cors-6.0.2-py3-none-any.whl.metadata (5.3 kB)
Collecting docker<8,>=4.0.0 (from mlflow)
  Downloading docker-7.1.0-py3-none-any.whl.metadata (3.8 kB)
Collecting graphene<4 (from mlflow)
  Downloading graphene-3.4.3-py2.py3-none-any.whl.metadata (6.9 kB)
Collecting gunicorn<24 (from mlflow)
  Downloading gunicorn-23.0.0-py3-none-any.whl.metadata (4.4 kB)
Collecting huey<3,>=2.5.4 (from mlflow)
  Downloading huey-2.6.0-py3-none-any.whl.metadata (4.3 kB)
Collecting skops<1 (from mlflow)
  Downloading skops-0.13.0-py3-none-any.whl.metadata (5.6 kB)
Collecting cachetools<7,>=5.0.0 (from mlflow-skinny==3.9.0->mlflow)
  Downloading 

In [9]:
import mlflow

In [11]:
import os

In [12]:
print("=" * 70)
print("SECTION 4: MLFLOW & NPY SETUP")
print("=" * 70)

EXPERIMENT_NAME = "Heart_Disease_Prediction"
mlflow.set_experiment(EXPERIMENT_NAME)

# Change to /content/drive/MyDrive/npy_results to persist across Colab sessions
NPY_DIR = "/content/npy_results"
os.makedirs(NPY_DIR, exist_ok=True)

print(f"MLflow experiment : {EXPERIMENT_NAME}")
print(f"NPY save directory: {NPY_DIR}\n")


def save_npy(array: np.ndarray, name: str) -> str:
    """Save array as .npy file and return its path for MLflow logging."""
    path = os.path.join(NPY_DIR, f"{name}.npy")
    np.save(path, array)
    print(f"    [NPY] Saved → {name}.npy  shape={array.shape}")
    return path


SECTION 4: MLFLOW & NPY SETUP
MLflow experiment : Heart_Disease_Prediction
NPY save directory: /content/npy_results



In [15]:
# =============================================================================
# SECTION 5 — MODEL 1: LightGBM  (Optuna-Tuned)
# =============================================================================
print("=" * 70)
print("SECTION 5: MODEL 1 — LightGBM Classifier")
print("=" * 70)
print("  Hyperparameters: Optuna search, 30 trials, best Trial 0")
print("  Metric: ROC-AUC  |  CV: 5-Fold OOF\n")

lgb_params = dict(
    n_estimators      = 1661,
    learning_rate     = 0.012503626241860565,
    num_leaves        = 31,
    max_depth         = 6,
    min_child_samples = 43,
    subsample         = 0.643072395692159,
    colsample_bytree  = 0.7211161958467457,
    reg_alpha         = 1.0504857541588257,
    reg_lambda        = 0.11946226125639381,
    random_state      = 42,
    n_jobs            = -1,
    verbosity         = -1,  # GPU acceleration (Colab T4/A100)
    gpu_platform_id   = 0,
    gpu_device_id     = 0,
)

with mlflow.start_run(run_name="LightGBM") as run:
    lgb_run_id = run.info.run_id
    mlflow.log_params(lgb_params)
    mlflow.log_param("model_type", "LightGBM")
    mlflow.log_param("cv_folds",   5)

    lgb_pipeline = Pipeline([
        ("preprocessor", preprocessor_tree),
        ("model",        LGBMClassifier(**lgb_params)),
    ])

    oof_lgb = np.zeros(len(X))
    for fold, (tr_idx, val_idx) in enumerate(SKF.split(X, y)):
        X_tr, X_val = X.iloc[tr_idx],  X.iloc[val_idx]
        y_tr, y_val = y.iloc[tr_idx],  y.iloc[val_idx]
        lgb_pipeline.fit(X_tr, y_tr)
        oof_lgb[val_idx] = lgb_pipeline.predict_proba(X_val)[:, 1]
        fold_auc = roc_auc_score(y_val, oof_lgb[val_idx])
        mlflow.log_metric(f"fold_{fold+1}_auc", fold_auc)
        print(f"  Fold {fold+1} AUC: {fold_auc:.5f}")

    lgb_oof_auc = roc_auc_score(y, oof_lgb)
    mlflow.log_metric("oof_roc_auc", lgb_oof_auc)
    print(f"\n  LightGBM OOF ROC-AUC : {lgb_oof_auc:.5f}")

    # Full refit on all training data
    lgb_pipeline.fit(X, y)
    lgb_test_preds = lgb_pipeline.predict_proba(X_test)[:, 1]

    # Save .npy immediately after run — safe against Colab disconnects
    mlflow.log_artifact(save_npy(oof_lgb,        "oof_lgb"),        artifact_path="npy")
    mlflow.log_artifact(save_npy(lgb_test_preds, "test_preds_lgb"), artifact_path="npy")
    print(f"  MLflow Run ID : {lgb_run_id}\n")



SECTION 5: MODEL 1 — LightGBM Classifier
  Hyperparameters: Optuna search, 30 trials, best Trial 0
  Metric: ROC-AUC  |  CV: 5-Fold OOF

  Fold 1 AUC: 0.95563
  Fold 2 AUC: 0.95459
  Fold 3 AUC: 0.95539
  Fold 4 AUC: 0.95495
  Fold 5 AUC: 0.95579

  LightGBM OOF ROC-AUC : 0.95527
    [NPY] Saved → oof_lgb.npy  shape=(630000,)
    [NPY] Saved → test_preds_lgb.npy  shape=(270000,)
  MLflow Run ID : 10ec9ac61f3b484c92953673a40cb830



In [16]:
# =============================================================================
# SECTION 6 — MODEL 2: XGBoost  (Optuna-Tuned)
# =============================================================================
print("=" * 70)
print("SECTION 6: MODEL 2 — XGBoost Classifier")
print("=" * 70)
print("  Hyperparameters: Optuna search, best Trial 0")
print("  Metric: ROC-AUC  |  CV: 5-Fold OOF\n")

xgb_params = dict(
    n_estimators     = 972,
    learning_rate    = 0.08233334476657686,
    max_depth        = 3,
    subsample        = 0.6967792979720865,
    colsample_bytree = 0.7773146292728021,
    reg_alpha        = 1.911349598671315,
    reg_lambda       = 0.6194119678307304,
    tree_method      = "hist",
    eval_metric      = "logloss",
    device           = "cuda",          # GPU acceleration (Colab T4/A100)
    random_state     = 42,
    n_jobs           = -1,
)

with mlflow.start_run(run_name="XGBoost") as run:
    xgb_run_id = run.info.run_id
    mlflow.log_params(xgb_params)
    mlflow.log_param("model_type", "XGBoost")
    mlflow.log_param("cv_folds",   5)

    xgb_pipeline = Pipeline([
        ("preprocessor", preprocessor_tree),
        ("model",        XGBClassifier(**xgb_params)),
    ])

    oof_xgb = np.zeros(len(X))
    for fold, (tr_idx, val_idx) in enumerate(SKF.split(X, y)):
        X_tr, X_val = X.iloc[tr_idx],  X.iloc[val_idx]
        y_tr, y_val = y.iloc[tr_idx],  y.iloc[val_idx]
        xgb_pipeline.fit(X_tr, y_tr)
        oof_xgb[val_idx] = xgb_pipeline.predict_proba(X_val)[:, 1]
        fold_auc = roc_auc_score(y_val, oof_xgb[val_idx])
        mlflow.log_metric(f"fold_{fold+1}_auc", fold_auc)
        print(f"  Fold {fold+1} AUC: {fold_auc:.5f}")

    xgb_oof_auc = roc_auc_score(y, oof_xgb)
    mlflow.log_metric("oof_roc_auc", xgb_oof_auc)
    print(f"\n  XGBoost OOF ROC-AUC : {xgb_oof_auc:.5f}")

    # Full refit
    xgb_pipeline.fit(X, y)
    xgb_test_preds = xgb_pipeline.predict_proba(X_test)[:, 1]

    # Save .npy immediately after run
    mlflow.log_artifact(save_npy(oof_xgb,        "oof_xgb"),        artifact_path="npy")
    mlflow.log_artifact(save_npy(xgb_test_preds, "test_preds_xgb"), artifact_path="npy")
    print(f"  MLflow Run ID : {xgb_run_id}\n")



SECTION 6: MODEL 2 — XGBoost Classifier
  Hyperparameters: Optuna search, best Trial 0
  Metric: ROC-AUC  |  CV: 5-Fold OOF

  Fold 1 AUC: 0.95585
  Fold 2 AUC: 0.95480
  Fold 3 AUC: 0.95553
  Fold 4 AUC: 0.95516
  Fold 5 AUC: 0.95595

  XGBoost OOF ROC-AUC : 0.95545
    [NPY] Saved → oof_xgb.npy  shape=(630000,)
    [NPY] Saved → test_preds_xgb.npy  shape=(270000,)
  MLflow Run ID : 535c855619fe48bc9e5b4f2beb683cb6



In [18]:
# =============================================================================
# SECTION 7 — MODEL 3: CatBoost  (Optuna-Tuned)
# =============================================================================
print("=" * 70)
print("SECTION 7: MODEL 3 — CatBoost Classifier")
print("=" * 70)
print("  Hyperparameters: Optuna search, best Trial 3")
print("  Handles categoricals natively via Pool API — no OneHot needed")
print("  Metric: ROC-AUC  |  CV: 5-Fold OOF\n")

# CatBoost Pool API requires categorical columns as strings
cat_cols_cb = [
    "Sex", "Chest pain type", "FBS over 120", "EKG results",
    "Exercise angina", "Slope of ST", "Number of vessels fluro", "Thallium"
]

def prepare_catboost(data: pd.DataFrame) -> pd.DataFrame:
    data = data.copy()
    for col in cat_cols_cb:
        data[col] = data[col].astype(str)
    return data

X_cb      = prepare_catboost(X)
X_test_cb = prepare_catboost(X_test)

cat_params = dict(
    iterations          = 2443,
    learning_rate       = 0.028617286398439353,
    depth               = 6,
    l2_leaf_reg         = 3.5313325975665264,
    bagging_temperature = 0.5274409717782269,
    random_strength     = 0.03843459373261249,
          # GPU acceleration (Colab T4/A100)
    devices             = "0",
    random_seed         = 42,
    verbose             = 0,
)

with mlflow.start_run(run_name="CatBoost") as run:
    cat_run_id = run.info.run_id
    mlflow.log_params(cat_params)
    mlflow.log_param("model_type",   "CatBoost")
    mlflow.log_param("cv_folds",     5)
    mlflow.log_param("cat_features", str(cat_cols_cb))

    oof_cat = np.zeros(len(X_cb))
    for fold, (tr_idx, val_idx) in enumerate(SKF.split(X_cb, y)):
        X_tr, X_val = X_cb.iloc[tr_idx],  X_cb.iloc[val_idx]
        y_tr, y_val = y.iloc[tr_idx],     y.iloc[val_idx]

        train_pool = Pool(X_tr,  y_tr,  cat_features=cat_cols_cb)
        val_pool   = Pool(X_val, y_val, cat_features=cat_cols_cb)

        fold_model = CatBoostClassifier(**cat_params)
        fold_model.fit(train_pool, eval_set=val_pool, early_stopping_rounds=200)
        oof_cat[val_idx] = fold_model.predict_proba(val_pool)[:, 1]
        fold_auc = roc_auc_score(y_val, oof_cat[val_idx])
        mlflow.log_metric(f"fold_{fold+1}_auc", fold_auc)
        print(f"  Fold {fold+1} AUC: {fold_auc:.5f}")

    cat_oof_auc = roc_auc_score(y, oof_cat)
    mlflow.log_metric("oof_roc_auc", cat_oof_auc)
    print(f"\n  CatBoost OOF ROC-AUC : {cat_oof_auc:.5f}")

    # Full refit
    full_pool      = Pool(X_cb,      y,  cat_features=cat_cols_cb)
    test_pool      = Pool(X_test_cb,     cat_features=cat_cols_cb)
    cat_model_full = CatBoostClassifier(**cat_params)
    cat_model_full.fit(full_pool)
    cat_test_preds = cat_model_full.predict_proba(test_pool)[:, 1]

    # Save .npy immediately after run
    mlflow.log_artifact(save_npy(oof_cat,        "oof_cat"),        artifact_path="npy")
    mlflow.log_artifact(save_npy(cat_test_preds, "test_preds_cat"), artifact_path="npy")
    print(f"  MLflow Run ID : {cat_run_id}\n")


SECTION 7: MODEL 3 — CatBoost Classifier
  Hyperparameters: Optuna search, best Trial 3
  Handles categoricals natively via Pool API — no OneHot needed
  Metric: ROC-AUC  |  CV: 5-Fold OOF

  Fold 1 AUC: 0.95582


KeyboardInterrupt: 

In [19]:
# =============================================================================
# SECTION 8 — INDIVIDUAL MODEL COMPARISON
# =============================================================================
print("=" * 70)
print("SECTION 8: INDIVIDUAL MODEL COMPARISON")
print("=" * 70)

model_summary = pd.DataFrame({
    "Model"       : ["LightGBM", "XGBoost", ],
    "OOF ROC-AUC" : [lgb_oof_auc, xgb_oof_auc, ],
}).sort_values("OOF ROC-AUC", ascending=False).reset_index(drop=True)

print(model_summary.to_string(index=False))
print()

SECTION 8: INDIVIDUAL MODEL COMPARISON
   Model  OOF ROC-AUC
 XGBoost     0.955455
LightGBM     0.955269



In [23]:
# =============================================================================
# SECTION 9 — BLENDING: WEIGHTED AVERAGE ENSEMBLE
# =============================================================================
print("=" * 70)
print("SECTION 9: BLENDING — Weighted Average Ensemble (LGB + XGB + CAT)")
print("=" * 70)
print("  Sweep all weight combos (step 0.1), pick best OOF AUC\n")

best_blend_auc     = 0.0
best_blend_weights = (0.33, 0.33, 0.34)

for w_lgb in np.arange(0.1, 0.7, 0.1):
    for w_xgb in np.arange(0.1, 0.7, 0.1):

        blend = w_lgb * oof_lgb + w_xgb * oof_xgb
        auc   = roc_auc_score(y, blend)
        if auc > best_blend_auc:
            best_blend_auc     = auc
            best_blend_weights = (w_lgb, w_xgb)

w_lgb, w_xgb = best_blend_weights
blend_oof_preds  = w_lgb * oof_lgb        + w_xgb * oof_xgb
blend_test_preds = w_lgb * lgb_test_preds + w_xgb * xgb_test_preds

print(f"  Best weights → LGB: {w_lgb:.1f} | XGB: {w_xgb:.1f} ")
print(f"  Blended OOF ROC-AUC: {best_blend_auc:.5f}")

with mlflow.start_run(run_name="Blend_LGB_XGB") as run:
    blend_run_id = run.info.run_id
    mlflow.log_param("model_type", "WeightedBlend")
    mlflow.log_param("weight_lgb", round(float(w_lgb), 2))
    mlflow.log_param("weight_xgb", round(float(w_xgb), 2))
    mlflow.log_metric("oof_roc_auc", best_blend_auc)

    # Save .npy immediately after run
    mlflow.log_artifact(save_npy(blend_oof_preds,  "oof_blend"),        artifact_path="npy")
    mlflow.log_artifact(save_npy(blend_test_preds, "test_preds_blend"), artifact_path="npy")
    print(f"  MLflow Run ID : {blend_run_id}\n")

SECTION 9: BLENDING — Weighted Average Ensemble (LGB + XGB + CAT)
  Sweep all weight combos (step 0.1), pick best OOF AUC

  Best weights → LGB: 0.1 | XGB: 0.5 
  Blended OOF ROC-AUC: 0.95546
    [NPY] Saved → oof_blend.npy  shape=(630000,)
    [NPY] Saved → test_preds_blend.npy  shape=(270000,)
  MLflow Run ID : 5446fd3dbd5d432487235c5785914cfb



In [24]:
# =============================================================================
# SECTION 10 — STACKING: META-LEARNER ENSEMBLE
# =============================================================================
print("=" * 70)
print("SECTION 10: STACKING ENSEMBLE — HistGradientBoosting Meta-Learner")
print("=" * 70)
print("  Base learners : LightGBM · XGBoost · CatBoost OOF predictions")
print("  Meta-learner  : HistGradientBoostingClassifier")
print("  No leakage    : meta-learner trained only on OOF predictions\n")

stacked_train = np.column_stack([oof_lgb,        oof_xgb])
stacked_test  = np.column_stack([lgb_test_preds, xgb_test_preds])

meta_params = dict(max_depth=3, learning_rate=0.05, max_iter=300, random_state=42)
meta_model  = HistGradientBoostingClassifier(**meta_params)
meta_model.fit(stacked_train, y)

stacked_oof_preds  = meta_model.predict_proba(stacked_train)[:, 1]
stacked_oof_auc    = roc_auc_score(y, stacked_oof_preds)
stacked_test_preds = meta_model.predict_proba(stacked_test)[:, 1]

print(f"  Stacked OOF ROC-AUC : {stacked_oof_auc:.5f}")

with mlflow.start_run(run_name="Stacking_HistGB_Meta") as run:
    stack_run_id = run.info.run_id
    mlflow.log_param("model_type",    "StackingEnsemble")
    mlflow.log_param("meta_learner",  "HistGradientBoostingClassifier")
    mlflow.log_param("base_learners", "LightGBM, XGBoost")
    mlflow.log_params({f"meta_{k}": v for k, v in meta_params.items()})
    mlflow.log_metric("oof_roc_auc",  stacked_oof_auc)

    # Save .npy immediately after run
    mlflow.log_artifact(save_npy(stacked_oof_preds,  "oof_stacked"),        artifact_path="npy")
    mlflow.log_artifact(save_npy(stacked_test_preds, "test_preds_stacked"), artifact_path="npy")
    print(f"  MLflow Run ID : {stack_run_id}\n")


SECTION 10: STACKING ENSEMBLE — HistGradientBoosting Meta-Learner
  Base learners : LightGBM · XGBoost · CatBoost OOF predictions
  Meta-learner  : HistGradientBoostingClassifier
  No leakage    : meta-learner trained only on OOF predictions

  Stacked OOF ROC-AUC : 0.95551
    [NPY] Saved → oof_stacked.npy  shape=(630000,)
    [NPY] Saved → test_preds_stacked.npy  shape=(270000,)
  MLflow Run ID : 4a5a151107354381890b7a5f582a8c4a



In [25]:
# =============================================================================
# SECTION 11 — FINAL RESULTS SUMMARY
# =============================================================================
print("=" * 70)
print("SECTION 11: FINAL RESULTS SUMMARY")
print("=" * 70)

results_df = pd.DataFrame({
    "Approach"    : ["LightGBM", "XGBoost",
                     "Blend (LGB+XGB+CAT)", "Stacking Ensemble"],
    "OOF ROC-AUC" : [lgb_oof_auc, xgb_oof_auc,
                     best_blend_auc, stacked_oof_auc],
    "MLflow Run"  : [lgb_run_id[:8],   xgb_run_id[:8],
                     blend_run_id[:8], stack_run_id[:8]],
}).sort_values("OOF ROC-AUC", ascending=False).reset_index(drop=True)

results_df["OOF ROC-AUC"] = results_df["OOF ROC-AUC"].map("{:.5f}".format)
print(results_df.to_string(index=False))

best_row = results_df.iloc[0]
print(f"\n  ✓ Best approach : {best_row['Approach']}  (OOF AUC = {best_row['OOF ROC-AUC']})")

# List all saved .npy files with shapes
print(f"\nAll .npy files saved in: {NPY_DIR}")
print("-" * 50)
for fname in sorted(os.listdir(NPY_DIR)):
    if fname.endswith(".npy"):
        arr = np.load(os.path.join(NPY_DIR, fname))
        print(f"  {fname:<35} shape={arr.shape}  dtype={arr.dtype}")



SECTION 11: FINAL RESULTS SUMMARY
           Approach OOF ROC-AUC MLflow Run
  Stacking Ensemble     0.95551   4a5a1511
Blend (LGB+XGB+CAT)     0.95546   5446fd3d
            XGBoost     0.95545   535c8556
           LightGBM     0.95527   10ec9ac6

  ✓ Best approach : Stacking Ensemble  (OOF AUC = 0.95551)

All .npy files saved in: /content/npy_results
--------------------------------------------------
  oof_blend.npy                       shape=(630000,)  dtype=float64
  oof_lgb.npy                         shape=(630000,)  dtype=float64
  oof_stacked.npy                     shape=(630000,)  dtype=float64
  oof_xgb.npy                         shape=(630000,)  dtype=float64
  test_preds_blend.npy                shape=(270000,)  dtype=float64
  test_preds_lgb.npy                  shape=(270000,)  dtype=float64
  test_preds_stacked.npy              shape=(270000,)  dtype=float64
  test_preds_xgb.npy                  shape=(270000,)  dtype=float32


In [26]:
# =============================================================================
# SECTION 12 — SUBMISSION FILES
# =============================================================================
print("\n" + "=" * 70)
print("SECTION 12: GENERATING SUBMISSION FILES")
print("=" * 70)

def save_submission(preds: np.ndarray, ids: pd.Series, filename: str) -> None:
    pd.DataFrame({"id": ids, "Heart Disease": preds}).to_csv(filename, index=False)
    print(f"  Saved: {filename}  ({len(ids):,} rows)")

save_submission(lgb_test_preds,     test_ids, "submission_lightgbm.csv")
save_submission(xgb_test_preds,     test_ids, "submission_xgboost.csv")
save_submission(blend_test_preds,   test_ids, "submission_blend.csv")
save_submission(stacked_test_preds, test_ids, "submission_stacked.csv")

print("\nAll submissions generated successfully.")
print("=" * 70)


SECTION 12: GENERATING SUBMISSION FILES
  Saved: submission_lightgbm.csv  (270,000 rows)
  Saved: submission_xgboost.csv  (270,000 rows)
  Saved: submission_blend.csv  (270,000 rows)
  Saved: submission_stacked.csv  (270,000 rows)

All submissions generated successfully.


In [27]:
# create a zip file from folder you want to download
!zip -r /content/file.zip /content/mlruns

# download file recently created
from google.colab import files
files.download("/content/file.zip")


  adding: content/mlruns/ (stored 0%)
  adding: content/mlruns/1/ (stored 0%)
  adding: content/mlruns/1/535c855619fe48bc9e5b4f2beb683cb6/ (stored 0%)
  adding: content/mlruns/1/535c855619fe48bc9e5b4f2beb683cb6/artifacts/ (stored 0%)
  adding: content/mlruns/1/535c855619fe48bc9e5b4f2beb683cb6/artifacts/npy/ (stored 0%)
  adding: content/mlruns/1/535c855619fe48bc9e5b4f2beb683cb6/artifacts/npy/test_preds_xgb.npy (deflated 9%)
  adding: content/mlruns/1/535c855619fe48bc9e5b4f2beb683cb6/artifacts/npy/oof_xgb.npy (deflated 48%)
  adding: content/mlruns/1/4a5a151107354381890b7a5f582a8c4a/ (stored 0%)
  adding: content/mlruns/1/4a5a151107354381890b7a5f582a8c4a/artifacts/ (stored 0%)
  adding: content/mlruns/1/4a5a151107354381890b7a5f582a8c4a/artifacts/npy/ (stored 0%)
  adding: content/mlruns/1/4a5a151107354381890b7a5f582a8c4a/artifacts/npy/oof_stacked.npy (deflated 75%)
  adding: content/mlruns/1/4a5a151107354381890b7a5f582a8c4a/artifacts/npy/test_preds_stacked.npy (deflated 76%)
  adding: c

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [29]:
# create a zip file from folder you want to download
!zip -r /content/file1.zip /content/npy_results

# download file recently created
from google.colab import files
files.download("/content/file1.zip")


updating: content/npy_results/ (stored 0%)
updating: content/npy_results/test_preds_lgb.npy (deflated 6%)
updating: content/npy_results/oof_stacked.npy (deflated 75%)
updating: content/npy_results/test_preds_stacked.npy (deflated 76%)
updating: content/npy_results/oof_lgb.npy (deflated 6%)
updating: content/npy_results/test_preds_xgb.npy (deflated 9%)
updating: content/npy_results/oof_blend.npy (deflated 6%)
updating: content/npy_results/oof_xgb.npy (deflated 48%)
updating: content/npy_results/test_preds_blend.npy (deflated 6%)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>