In [9]:
import pandas as pd
import numpy as np

from utils.diabetes_utils import clean_diabetes_data
from utils.diabetes_utils import plot_and_save_metrics

from sklearn.model_selection import GroupKFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (
    accuracy_score,
    roc_auc_score,
    f1_score,
    average_precision_score,
    precision_recall_curve,
)
from sklearn.utils.class_weight import compute_class_weight

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras import Input


# Load ORIGINAL CSV (patient_nbr exists)
df = pd.read_csv("data/diabetic_data.csv")

df_clean = clean_diabetes_data(df)

print("Cleaned shape:", df_clean.shape)


# Numeric + categorical features
numeric_cols = [
    "time_in_hospital",
    "num_lab_procedures",
    "num_procedures",
    "num_medications",
    "number_outpatient",
    "number_emergency",
    "number_inpatient",
    "number_diagnoses",
]

cat_cols = [
    "race",
    "gender",
    "age",
    "admission_type_id",
    "discharge_disposition_id",
    "admission_source_id",
    "diag_1_group",
    "diag_2_group",
    "diag_3_group",
    "insulin",
    "change",
    "diabetesMed",
]

# Keep required columns
lstm_df = df_clean[numeric_cols + cat_cols + ["readmit_30d", "patient_nbr"]].dropna()

groups = lstm_df["patient_nbr"]

# One-hot encode categoricals
lstm_df = pd.get_dummies(lstm_df, columns=cat_cols, drop_first=True)

X = lstm_df.drop(columns=["readmit_30d", "patient_nbr"]).astype(np.float32)
y = lstm_df["readmit_30d"].astype(int)

print("Feature matrix shape:", X.shape)
print("Positive rate:", round(y.mean(), 3))


# GroupKFold (patient-level split)
gkf = GroupKFold(n_splits=5)
cv_metrics = []

# OOF storage for stacking
oof_probs = np.zeros(len(y))

for fold, (train_idx, val_idx) in enumerate(
        gkf.split(X, y, groups=groups), start=1):

    X_tr = X.iloc[train_idx].copy()
    X_val = X.iloc[val_idx].copy()
    y_tr = y.iloc[train_idx]
    y_val = y.iloc[val_idx]

    # Scale numeric features within fold
    scaler = StandardScaler()
    X_tr[numeric_cols] = scaler.fit_transform(X_tr[numeric_cols])
    X_val[numeric_cols] = scaler.transform(X_val[numeric_cols])

    # Class weighting
    class_weights_arr = compute_class_weight(
        class_weight="balanced",
        classes=np.array([0, 1]),
        y=y_tr.values
    )
    class_weight_dict = {0: class_weights_arr[0], 1: class_weights_arr[1]}

    # Convert to LSTM input shape (samples, timesteps=1, features)
    X_tr_np = X_tr.values.reshape(-1, 1, X_tr.shape[1])
    X_val_np = X_val.values.reshape(-1, 1, X_val.shape[1])

    # Build LSTM
    model = Sequential([
        Input(shape=(1, X_tr.shape[1])),
        LSTM(64),
        Dropout(0.3),
        Dense(32, activation="relu"),
        Dense(1, activation="sigmoid"),
    ])

    model.compile(
        optimizer="adam",
        loss="binary_crossentropy",
        metrics=["accuracy", tf.keras.metrics.AUC(name="auc")],
    )

    early_stop = EarlyStopping(
        monitor="val_auc",
        patience=2,
        mode="max",
        restore_best_weights=True,
    )

    model.fit(
        X_tr_np,
        y_tr,
        validation_data=(X_val_np, y_val),
        epochs=10,
        batch_size=256,
        callbacks=[early_stop],
        class_weight=class_weight_dict,
        verbose=0,
    )

    y_val_prob = model.predict(X_val_np, verbose=0).ravel()

    # store OOF probabilities
    oof_probs[val_idx] = y_val_prob

    # Default threshold
    y_val_pred_default = (y_val_prob >= 0.5).astype(int)

    # AUPRC
    auprc = average_precision_score(y_val, y_val_prob)

    # Threshold tuning
    precision, recall, thresholds = precision_recall_curve(y_val, y_val_prob)
    f1_scores = 2 * (precision * recall) / (precision + recall + 1e-8)
    best_idx = np.argmax(f1_scores[:-1])
    best_threshold = thresholds[best_idx]

    y_val_pred_tuned = (y_val_prob >= best_threshold).astype(int)

    fold_result = {
        "fold": fold,
        "roc_auc": roc_auc_score(y_val, y_val_prob),
        "auprc": auprc,
        "f1_default": f1_score(y_val, y_val_pred_default, zero_division=0),
        "f1_tuned": f1_score(y_val, y_val_pred_tuned, zero_division=0),
        "best_threshold": best_threshold,
    }

    cv_metrics.append(fold_result)

    print(f"\nFold {fold}:")
    print(f"  AUC:        {fold_result['roc_auc']:.3f}")
    print(f"  AUPRC:      {fold_result['auprc']:.3f}")
    print(f"  F1 (0.5):   {fold_result['f1_default']:.3f}")
    print(f"  F1 (tuned): {fold_result['f1_tuned']:.3f}")
    print(f"  Best thr:   {best_threshold:.3f}")

cv_df = pd.DataFrame(cv_metrics)

print("\n5-fold CV summary (LSTM – patient split + tuned threshold)")
print("Mean AUC:", round(cv_df["roc_auc"].mean(), 3))
print("Mean AUPRC:", round(cv_df["auprc"].mean(), 3))
print("Mean F1 (0.5):", round(cv_df["f1_default"].mean(), 3))
print("Mean F1 (tuned):", round(cv_df["f1_tuned"].mean(), 3))

# Plot metrics using OOF predictions
plot_and_save_metrics(
    model_name="lstm_oof",
    y_test=y.values,
    y_prob=oof_probs,
    threshold=cv_df["best_threshold"].mean()
)

# Save OOF predictions for stacking
np.save("oof_predictions/oof_lstm.npy", oof_probs)
np.save("oof_predictions/y_oof_lstm.npy", y.values)

Cleaned shape: (101766, 51)
Feature matrix shape: (101766, 104)
Positive rate: 0.112

Fold 1:
  AUC:        0.687
  AUPRC:      0.236
  F1 (0.5):   0.280
  F1 (tuned): 0.300
  Best thr:   0.584

Fold 2:
  AUC:        0.678
  AUPRC:      0.217
  F1 (0.5):   0.271
  F1 (tuned): 0.281
  Best thr:   0.553

Fold 3:
  AUC:        0.677
  AUPRC:      0.225
  F1 (0.5):   0.276
  F1 (tuned): 0.285
  Best thr:   0.547

Fold 4:
  AUC:        0.690
  AUPRC:      0.237
  F1 (0.5):   0.287
  F1 (tuned): 0.295
  Best thr:   0.549

Fold 5:
  AUC:        0.674
  AUPRC:      0.222
  F1 (0.5):   0.271
  F1 (tuned): 0.277
  Best thr:   0.554

5-fold CV summary (LSTM – patient split + tuned threshold)
Mean AUC: 0.681
Mean AUPRC: 0.227
Mean F1 (0.5): 0.277
Mean F1 (tuned): 0.287
