In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from ucimlrepo import fetch_ucirepo
from diabetes_utils import clean_diabetes_data, plot_and_save_metrics

from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, roc_auc_score, f1_score

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping

# Load and clean dataset (same source as LACE)
diabetes_data = fetch_ucirepo(id=296)
X_raw = diabetes_data.data.features
y_raw = diabetes_data.data.targets

if "readmitted" not in y_raw.columns:
    y_raw.columns = ["readmitted"]

df = pd.concat([X_raw, y_raw], axis=1)
df_clean = clean_diabetes_data(df)  # reuse our cleaning pipeline

print("Cleaned shape:", df_clean.shape)

# Build feature set for LSTM (numeric + categorical)
numeric_cols = [
    "time_in_hospital",
    "num_lab_procedures",
    "num_procedures",
    "num_medications",
    "number_outpatient",
    "number_emergency",
    "number_inpatient",
    "number_diagnoses",
]

cat_cols = [
    "race",
    "gender",
    "age",
    "admission_type_id",
    "discharge_disposition_id",
    "admission_source_id",
    "diag_1_group",
    "diag_2_group",
    "diag_3_group",
    "insulin",
    "change",
    "diabetesMed",
]

# Keep only selected features + target, drop rows missing these
lstm_df = df_clean[numeric_cols + cat_cols + ["readmit_30d"]].dropna()

# One-hot encode categoricals
lstm_df = pd.get_dummies(lstm_df, columns=cat_cols, drop_first=True)

X = lstm_df.drop(columns=["readmit_30d"])
y = lstm_df["readmit_30d"].astype(int)

print("LSTM feature matrix shape:", X.shape)

# Stratified K-FOLD CV for LSTM
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_metrics = []

for fold, (train_idx, val_idx) in enumerate(skf.split(X, y), start=1):
    X_tr = X.iloc[train_idx].copy()
    X_val = X.iloc[val_idx].copy()
    y_tr = y.iloc[train_idx]
    y_val = y.iloc[val_idx]

    # Scale numeric features within this fold to avoid leakage
    scaler_cv = StandardScaler()
    X_tr[numeric_cols] = scaler_cv.fit_transform(X_tr[numeric_cols])
    X_val[numeric_cols] = scaler_cv.transform(X_val[numeric_cols])

    # Convert to numpy and reshape for LSTM: (samples, timesteps, features)
    X_tr_np = X_tr.values
    X_val_np = X_val.values

    n_features = X_tr_np.shape[1]
    X_tr_lstm = X_tr_np.reshape(-1, 1, n_features)
    X_val_lstm = X_val_np.reshape(-1, 1, n_features)

    # Build a fresh LSTM model for this fold
    model_cv = Sequential([
        LSTM(64, input_shape=(1, n_features)),
        Dropout(0.3),
        Dense(32, activation="relu"),
        Dense(1, activation="sigmoid"),  
    ])

    model_cv.compile(
        optimizer="adam",
        loss="binary_crossentropy",
        metrics=["accuracy", tf.keras.metrics.AUC(name="auc")],
    )

    early_stop_cv = EarlyStopping(
        monitor="val_auc",
        patience=2,
        mode="max",
        restore_best_weights=True,
    )

    history_cv = model_cv.fit(
        X_tr_lstm,
        y_tr,
        validation_data=(X_val_lstm, y_val),
        epochs=10,
        batch_size=256,
        callbacks=[early_stop_cv],
        verbose=0, 
    )

    # Evaluate on this fold's validation split
    y_val_prob = model_cv.predict(X_val_lstm).ravel()
    y_val_pred = (y_val_prob >= 0.5).astype(int)

    fold_result = {
        "fold": fold,
        "accuracy": accuracy_score(y_val, y_val_pred),
        "roc_auc": roc_auc_score(y_val, y_val_prob),
        "f1_pos":  f1_score(y_val, y_val_pred, zero_division=0),
    }
    cv_metrics.append(fold_result)

    print(f"\nFold {fold}:")
    print(f"  accuracy: {fold_result['accuracy']:.3f}")
    print(f"  roc_auc:  {fold_result['roc_auc']:.3f}")
    print(f"  f1_pos:   {fold_result['f1_pos']:.3f}")

cv_df = pd.DataFrame(cv_metrics)
print("\n5-fold CV summary (LSTM)")
print(cv_df[["accuracy", "roc_auc", "f1_pos"]].mean().round(3))


# trainâ€“test split + final LSTM
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

scaler = StandardScaler()
X_train[numeric_cols] = scaler.fit_transform(X_train[numeric_cols])
X_test[numeric_cols] = scaler.transform(X_test[numeric_cols])

X_train_np = X_train.values
X_test_np = X_test.values

n_features = X_train_np.shape[1]
X_train_lstm = X_train_np.reshape(-1, 1, n_features)
X_test_lstm = X_test_np.reshape(-1, 1, n_features)

print("LSTM input shape:", X_train_lstm.shape)

# Define final LSTM model (same architecture)
model = Sequential([
    LSTM(64, input_shape=(1, n_features)),
    Dropout(0.3),
    Dense(32, activation="relu"),
    Dense(1, activation="sigmoid"),  # P(readmit_30d = 1)
])

model.compile(
    optimizer="adam",
    loss="binary_crossentropy",
    metrics=["accuracy", tf.keras.metrics.AUC(name="auc")],
)

model.summary()

early_stop = EarlyStopping(
    monitor="val_auc",
    patience=2,
    mode="max",
    restore_best_weights=True,
)

history = model.fit(
    X_train_lstm,
    y_train,
    validation_split=0.2,
    epochs=10,
    batch_size=256,
    callbacks=[early_stop],
    verbose=1,
)

# Evaluate on test set
y_prob = model.predict(X_test_lstm).ravel()
y_pred = (y_prob >= 0.5).astype(int)

lstm_results = {
    "accuracy": round(accuracy_score(y_test, y_pred), 3),
    "roc_auc": round(roc_auc_score(y_test, y_prob), 3),
    "f1_pos":  round(f1_score(y_test, y_pred, zero_division=0), 3),
}

print("\nLSTM model results (no k fold):")
for k, v in lstm_results.items():
    print(f"  {k}: {v}")

# Save plots
plot_and_save_metrics("lstm", y_test, y_prob)

# Save probabilities
np.save("y_test_LSTM.npy", y_test)
np.save("probs_LSTM.npy", y_prob)

  df = pd.read_csv(data_url)


Cleaned shape: (101766, 49)
LSTM feature matrix shape: (101766, 104)

Fold 1:
  accuracy: 0.889
  roc_auc:  0.677
  f1_pos:   0.041

Fold 2:
  accuracy: 0.889
  roc_auc:  0.690
  f1_pos:   0.027

Fold 3:
  accuracy: 0.889
  roc_auc:  0.678
  f1_pos:   0.020

Fold 4:
  accuracy: 0.889
  roc_auc:  0.680
  f1_pos:   0.018

Fold 5:
  accuracy: 0.888
  roc_auc:  0.674
  f1_pos:   0.009

5-fold CV summary (LSTM)
accuracy    0.889
roc_auc     0.680
f1_pos      0.023
dtype: float64
LSTM input shape: (81412, 1, 104)
Model: "sequential_11"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm_11 (LSTM)              (None, 64)                43264     
                                                                 
 dropout_11 (Dropout)        (None, 64)                0         
                                                                 
 dense_22 (Dense)            (None, 32)                2080      
  