In [5]:
import sys
!{sys.executable} -m pip install pytorch-tabnet

import pandas as pd
import numpy as np
import torch

from pytorch_tabnet.tab_model import TabNetClassifier
from sklearn.model_selection import GroupKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import (
    accuracy_score,
    roc_auc_score,
    f1_score,
    average_precision_score,
    precision_recall_curve,
)
from sklearn.utils.class_weight import compute_class_weight

from utils.diabetes_utils import clean_diabetes_data
from utils.diabetes_utils import plot_and_save_metrics


# Read the raw encounter table. patient_nbr has to be present at this point
# because it is extracted below as the grouping key.
df = pd.read_csv("data/diabetic_data.csv")
df_clean = clean_diabetes_data(df)

target_col = "readmit_30d"

# Pull the grouping key and the integer label out of a working copy before
# those columns are removed from the feature frame.
df_tab = df_clean.copy()
groups = df_tab["patient_nbr"].values
y = df_tab[target_col].values.astype(int)

# Everything that must not reach the model as a feature: the raw label,
# the derived target and the patient identifier.
non_feature_cols = ["readmitted", target_col, "patient_nbr"]
df_tab = df_tab.drop(columns=non_feature_cols)

# Object-dtype columns are treated as categoricals.
cat_cols = list(df_tab.select_dtypes(include="object").columns)

# Integer-encode each categorical in place (values are stringified first)
# and record its cardinality, i.e. the number of distinct encoded codes.
cat_dims = []
for cat_col in cat_cols:
    encoder = LabelEncoder()
    df_tab[cat_col] = encoder.fit_transform(df_tab[cat_col].astype(str))
    cat_dims.append(len(encoder.classes_))

# Positional index of every categorical column inside the feature matrix.
column_position = {name: pos for pos, name in enumerate(df_tab.columns)}
cat_idxs = [column_position[name] for name in cat_cols]

X = df_tab.values.astype(np.float32)

positive_rate = y.mean()
print("Feature matrix shape:", X.shape)
print("Positive rate:", round(positive_rate, 3))


def build_tabnet():
    """Create a new, unfitted TabNetClassifier with this notebook's settings.

    The categorical layout is read from the module-level ``cat_idxs`` and
    ``cat_dims`` lists, so those must be defined before this is called.
    A fresh instance is returned on every call.
    """
    # Network shape and attention-mask settings.
    architecture = {
        "n_d": 32,
        "n_a": 32,
        "n_steps": 3,
        "gamma": 1.3,
        "mask_type": "sparsemax",
    }

    # Which feature columns are categorical, their cardinalities, and the
    # embedding width used for each of them.
    categorical = {
        "cat_idxs": cat_idxs,
        "cat_dims": cat_dims,
        "cat_emb_dim": 1,
    }

    # Adam with a StepLR schedule (step_size=10, gamma=0.9).
    optimisation = {
        "optimizer_fn": torch.optim.Adam,
        "optimizer_params": {"lr": 2e-2},
        "scheduler_fn": torch.optim.lr_scheduler.StepLR,
        "scheduler_params": {"step_size": 10, "gamma": 0.9},
    }

    return TabNetClassifier(
        **architecture,
        **categorical,
        **optimisation,
        verbose=0,
    )


# Patient-level CV: the split is grouped by `groups` (patient_nbr), so rows
# of one patient are never divided between the train and validation side.
gkf = GroupKFold(n_splits=5)

# Allocate OOF storage; every row is filled exactly once, by the fold in
# which it is part of the validation set.
oof_probs = np.zeros(len(y))
oof_targets = np.zeros(len(y))

# One dict of metrics per fold, turned into a DataFrame after the loop.
cv_metrics = []

for fold, (train_idx, val_idx) in enumerate(
        gkf.split(X, y, groups=groups), start=1):

    X_tr, X_val = X[train_idx], X[val_idx]
    y_tr, y_val = y[train_idx], y[val_idx]

    # Balanced class weights are recomputed from this fold's training labels.
    class_weights_arr = compute_class_weight(
        class_weight="balanced",
        classes=np.array([0, 1]),
        y=y_tr
    )
    # y_tr is an integer array of 0/1 labels, so it can index the weight
    # array directly to give one weight per training row.
    sample_weights = class_weights_arr[y_tr]

    model = build_tabnet()

    # NOTE(review): the validation fold is also passed as eval_set with a
    # patience setting, so it takes part in choosing the stopping epoch.
    model.fit(
        X_tr, y_tr,
        eval_set=[(X_val, y_val)],
        eval_name=["valid"],
        eval_metric=["auc"],
        weights=sample_weights,
        max_epochs=50,
        patience=5,
        batch_size=1024,
        virtual_batch_size=128
    )

    y_val_prob = model.predict_proba(X_val)[:, 1]

    # Save OOF predictions
    oof_probs[val_idx] = y_val_prob
    oof_targets[val_idx] = y_val

    y_val_pred_default = (y_val_prob >= 0.5).astype(int)

    auprc = average_precision_score(y_val, y_val_prob)

    # Scan the precision-recall curve for the F1-maximising threshold.
    # precision/recall have one more entry than thresholds, hence [:-1];
    # the 1e-8 term guards against division by zero.
    precision, recall, thresholds = precision_recall_curve(y_val, y_val_prob)
    f1_scores = 2 * (precision * recall) / (precision + recall + 1e-8)
    best_idx = np.argmax(f1_scores[:-1])
    best_threshold = thresholds[best_idx]

    # The threshold is selected on the same fold it is scored on, so
    # f1_tuned is an optimistic estimate rather than a held-out one.
    y_val_pred_tuned = (y_val_prob >= best_threshold).astype(int)

    fold_result = {
        "fold": fold,
        "roc_auc": roc_auc_score(y_val, y_val_prob),
        "auprc": auprc,
        "f1_default": f1_score(y_val, y_val_pred_default, zero_division=0),
        "f1_tuned": f1_score(y_val, y_val_pred_tuned, zero_division=0),
        # Keep the tuned cut-off so it can be inspected or reused later
        # instead of being lost when the loop moves to the next fold.
        "best_threshold": float(best_threshold),
    }

    cv_metrics.append(fold_result)

    print(f"\nFold {fold}:")
    print(f"  AUC:        {fold_result['roc_auc']:.3f}")
    print(f"  AUPRC:      {fold_result['auprc']:.3f}")
    print(f"  F1 (0.5):   {fold_result['f1_default']:.3f}")
    print(f"  F1 (tuned): {fold_result['f1_tuned']:.3f}")

# Collect the per-fold metric dicts into one table for averaging.
cv_df = pd.DataFrame(cv_metrics)

print("\n5-fold CV summary (TabNet – patient split + tuned threshold)")
print("Mean AUC:", round(cv_df["roc_auc"].mean(), 3))
print("Mean AUPRC:", round(cv_df["auprc"].mean(), 3))
print("Mean F1 (0.5):", round(cv_df["f1_default"].mean(), 3))
print("Mean F1 (tuned):", round(cv_df["f1_tuned"].mean(), 3))

# Choose the operating threshold for the pooled OOF predictions.
# cv_df["f1_tuned"].idxmax() would give the row label of the best fold
# (0-4), not a probability cut-off, so the threshold is derived here from
# the OOF precision-recall curve using the same F1 rule as the per-fold
# tuning.
oof_precision, oof_recall, oof_thresholds = precision_recall_curve(
    oof_targets, oof_probs
)
oof_f1 = 2 * (oof_precision * oof_recall) / (oof_precision + oof_recall + 1e-8)
# precision/recall have one more entry than thresholds, hence [:-1].
oof_threshold = float(oof_thresholds[np.argmax(oof_f1[:-1])])
print("OOF F1-optimal threshold:", round(oof_threshold, 3))

# Plot OOF performance
# NOTE(review): `threshold` is presumably the probability cut-off used for
# the thresholded plots - confirm against utils.diabetes_utils.
plot_and_save_metrics(
    model_name="tabnet_oof",
    y_test=oof_targets,
    y_prob=oof_probs,
    threshold=oof_threshold,
)

# Save OOF predictions for stacking. The output directory is created first
# so a missing folder cannot discard the results of the whole CV run.
import os

os.makedirs("oof_predictions", exist_ok=True)
np.save("oof_predictions/oof_tabnet.npy", oof_probs)
np.save("oof_predictions/y_oof_tabnet.npy", oof_targets)

print("\nSaved oof_tabnet.npy and y_oof_tabnet.npy for stacking")

Feature matrix shape: (101766, 48)
Positive rate: 0.112

Early stopping occurred at epoch 8 with best_epoch = 3 and best_valid_auc = 0.63073





Fold 1:
  AUC:        0.631
  AUPRC:      0.196
  F1 (0.5):   0.251
  F1 (tuned): 0.263

Early stopping occurred at epoch 9 with best_epoch = 4 and best_valid_auc = 0.63536





Fold 2:
  AUC:        0.635
  AUPRC:      0.182
  F1 (0.5):   0.230
  F1 (tuned): 0.251

Early stopping occurred at epoch 11 with best_epoch = 6 and best_valid_auc = 0.63923





Fold 3:
  AUC:        0.639
  AUPRC:      0.200
  F1 (0.5):   0.249
  F1 (tuned): 0.264

Early stopping occurred at epoch 11 with best_epoch = 6 and best_valid_auc = 0.63673





Fold 4:
  AUC:        0.637
  AUPRC:      0.200
  F1 (0.5):   0.254
  F1 (tuned): 0.258

Early stopping occurred at epoch 11 with best_epoch = 6 and best_valid_auc = 0.64082





Fold 5:
  AUC:        0.641
  AUPRC:      0.188
  F1 (0.5):   0.258
  F1 (tuned): 0.260

5-fold CV summary (TabNet – patient split + tuned threshold)
Mean AUC: 0.637
Mean AUPRC: 0.193
Mean F1 (0.5): 0.248
Mean F1 (tuned): 0.259

Saved oof_tabnet.npy and y_oof.npy for stacking
