In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from ucimlrepo import fetch_ucirepo
from diabetes_utils import clean_diabetes_data, plot_and_save_metrics

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, roc_auc_score, f1_score

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping


# 1. Fetch the UCI dataset (id=296, same source as LACE) and clean it
diabetes_data = fetch_ucirepo(id=296)
X, y = diabetes_data.data.features, diabetes_data.data.targets

# Make sure the target frame's column is named "readmitted".
if "readmitted" not in y.columns:
    y.columns = ["readmitted"]

# Put features and target side by side, then run the shared cleaning
# pipeline from diabetes_utils over the combined frame.
df = pd.concat([X, y], axis="columns")
df_clean = clean_diabetes_data(df)

print(f"Cleaned shape: {df_clean.shape}")

# 2. Assemble the LSTM feature table (numeric + categorical columns)

# Columns used as-is (they are standardized later in the pipeline).
numeric_cols = [
    "time_in_hospital", "num_lab_procedures", "num_procedures",
    "num_medications", "number_outpatient", "number_emergency",
    "number_inpatient", "number_diagnoses",
]

# Columns that get expanded into one-hot indicator columns.
cat_cols = [
    "race", "gender", "age",
    "admission_type_id", "discharge_disposition_id", "admission_source_id",
    "diag_1_group", "diag_2_group", "diag_3_group",
    "insulin", "change", "diabetesMed",
]

# Restrict to the chosen columns plus the target and discard any row with a
# missing value in one of them.
lstm_df = df_clean[[*numeric_cols, *cat_cols, "readmit_30d"]].dropna()

# One-hot encode the categoricals; drop_first=True omits the first level of
# each column so the indicators are not perfectly collinear.
lstm_df = pd.get_dummies(lstm_df, columns=cat_cols, drop_first=True)

# Separate the target vector from the feature matrix.
y = lstm_df["readmit_30d"].astype(int)
X = lstm_df.drop(columns="readmit_30d")

print(f"LSTM feature matrix shape: {X.shape}")


# 3. Train–test split and scaling
#    Stratified 80/20 split with a fixed seed: class proportions are preserved
#    in both partitions and the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Work on explicit copies: the frames returned by train_test_split are row
# selections of X, and assigning columns into them can otherwise trigger
# pandas' SettingWithCopyWarning.
X_train = X_train.copy()
X_test = X_test.copy()

# Fit the scaler on the training rows only and reuse those statistics for the
# test rows, so nothing about the test set leaks into preprocessing. Only the
# numeric columns are standardized; the one-hot columns are left untouched.
scaler = StandardScaler()
X_train[numeric_cols] = scaler.fit_transform(X_train[numeric_cols])
X_test[numeric_cols] = scaler.transform(X_test[numeric_cols])

# Convert to numpy and reshape for LSTM: (samples, timesteps, features).
# The dtype is forced to float32 because pandas >= 2.0 emits bool dummy
# columns, and a plain `.values` on a mixed bool/float frame yields an
# object-dtype array that TensorFlow cannot convert to a tensor.
X_train_np = X_train.to_numpy(dtype="float32")
X_test_np = X_test.to_numpy(dtype="float32")

# Each row becomes a sequence of length 1 holding all of its features.
n_features = X_train_np.shape[1]
X_train_lstm = X_train_np.reshape(-1, 1, n_features)
X_test_lstm = X_test_np.reshape(-1, 1, n_features)

print("LSTM input shape:", X_train_lstm.shape)

# 4. Define LSTM model
#    Simple baseline, assembled layer by layer so individual pieces are easy
#    to swap out for other deep models later.
model = Sequential()
model.add(LSTM(64, input_shape=(1, n_features)))  # input: 1 timestep x n_features
model.add(Dropout(0.3))
model.add(Dense(32, activation="relu"))
model.add(Dense(1, activation="sigmoid"))  # output = P(readmit_30d = 1)

# Binary cross-entropy for the sigmoid output; accuracy and AUC are tracked.
# The AUC metric is named "auc" explicitly.
model.compile(
    loss="binary_crossentropy",
    optimizer="adam",
    metrics=["accuracy", tf.keras.metrics.AUC(name="auc")],
)

model.summary()


# 5. Fit the model, stopping early once validation AUC stops improving
#    patience=2 allows two epochs without a better val_auc before stopping;
#    restore_best_weights=True restores the weights from the best epoch.
early_stop = EarlyStopping(
    monitor="val_auc", mode="max", patience=2, restore_best_weights=True
)

# validation_split=0.2 holds out part of the training data for validation,
# which is what produces the monitored val_auc.
history = model.fit(
    x=X_train_lstm,
    y=y_train,
    batch_size=256,
    epochs=10,
    validation_split=0.2,
    callbacks=[early_stop],
    verbose=1,
)


# 6. Evaluate on test set
# Flatten the model output to a 1-D vector of predicted probabilities, then
# threshold at 0.5 to obtain hard class labels.
y_prob = model.predict(X_test_lstm).ravel()
y_pred = (y_prob >= 0.5).astype(int)

# Accuracy and F1 use the hard labels; ROC AUC uses the probabilities.
# Every score is rounded to three decimals.
lstm_results = {
    metric: round(score, 3)
    for metric, score in (
        ("accuracy", accuracy_score(y_test, y_pred)),
        ("roc_auc", roc_auc_score(y_test, y_prob)),
        ("f1_pos", f1_score(y_test, y_pred, zero_division=0)),
    )
}

print("\nLSTM model results:")
for metric_name, metric_value in lstm_results.items():
    print(f"  {metric_name}: {metric_value}")

# Save plots
plot_and_save_metrics("lstm", y_test, y_prob)

# Save test labels and predicted probabilities
np.save("y_test_LSTM.npy", y_test)
np.save("probs_LSTM.npy", y_prob)

  df = pd.read_csv(data_url)


Cleaned shape: (101766, 49)
LSTM feature matrix shape: (101766, 104)
LSTM input shape: (81412, 1, 104)
Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm_1 (LSTM)               (None, 64)                43264     
                                                                 
 dropout_1 (Dropout)         (None, 64)                0         
                                                                 
 dense_2 (Dense)             (None, 32)                2080      
                                                                 
 dense_3 (Dense)             (None, 1)                 33        
                                                                 
Total params: 45,377
Trainable params: 45,377
Non-trainable params: 0
_________________________________________________________________
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Ep