In [15]:
import pandas as pd
import numpy as np

from sklearn.model_selection import GroupKFold
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    roc_auc_score,
    f1_score,
    average_precision_score,
    precision_recall_curve,
)

from utils.diabetes_utils import clean_diabetes_data
from utils.diabetes_utils import plot_and_save_metrics



# Read the raw Diabetes 130 export. Identifier columns are still present at
# this stage (the downstream code relies on "patient_nbr" for grouping).
df = pd.read_csv("data/diabetic_data.csv")

# Quick sanity check of what was loaded: dimensions, first rows, column names.
raw_shape = df.shape
print("Shape:", raw_shape)
for preview in (df.head(), df.columns.tolist()):
    print(preview)


# Run the project's cleaning helper on the raw frame.
df_clean = clean_diabetes_data(df)


# Numeric LACE-style predictors; these are the columns standardised later.
num_cols = [
    "time_in_hospital",
    "number_diagnoses",
    "number_emergency",
    "number_inpatient",
]

# Full LACE-style feature set = numeric predictors + categorical admission type.
lace_features = num_cols + ["admission_type_id"]

# Keep only the predictors, the 30-day readmission label and the patient id,
# and discard any row with a missing value in one of those columns.
lace_df = df_clean[[*lace_features, "readmit_30d", "patient_nbr"]].dropna()

# Patient identifier per row, used as the grouping key for cross-validation.
groups = lace_df["patient_nbr"]

# Target vector.
y_lace = lace_df["readmit_30d"]

# Design matrix: select the predictors, then one-hot encode admission type.
# drop_first=True omits the first category so it acts as the reference level.
X_lace = pd.get_dummies(
    lace_df[lace_features],
    columns=["admission_type_id"],
    drop_first=True,
)

print("Feature matrix shape:", X_lace.shape)
print("Positive rate:", round(y_lace.mean(), 3))


# Patient-grouped 5-fold cross-validation of a logistic-regression baseline.
# GroupKFold keeps every row of a given patient_nbr in a single fold, so no
# patient contributes to both the training and the validation split.
gkf = GroupKFold(n_splits=5)
cv_metrics = []  # one dict of metrics per fold

for fold, (train_idx, val_idx) in enumerate(
        gkf.split(X_lace, y_lace, groups=groups), start=1):

    # .copy() so the in-place scaling below never touches X_lace itself.
    X_tr = X_lace.iloc[train_idx].copy()
    X_val = X_lace.iloc[val_idx].copy()
    y_tr = y_lace.iloc[train_idx]
    y_val = y_lace.iloc[val_idx]

    # Scale numeric features: statistics are fitted on the training fold only
    # and then applied unchanged to the validation fold.
    scaler = StandardScaler()
    X_tr[num_cols] = scaler.fit_transform(X_tr[num_cols])
    X_val[num_cols] = scaler.transform(X_val[num_cols])

    # Logistic regression baseline; class_weight="balanced" reweights the
    # classes inversely to their frequency in the training fold.
    model = LogisticRegression(max_iter=1000, class_weight="balanced")
    model.fit(X_tr, y_tr)

    # Positive-class probabilities for both splits.
    y_tr_prob = model.predict_proba(X_tr)[:, 1]
    y_val_prob = model.predict_proba(X_val)[:, 1]

    # Default threshold
    y_val_pred_default = (y_val_prob >= 0.5).astype(int)

    # AUPRC (threshold-free, computed on the held-out fold)
    auprc = average_precision_score(y_val, y_val_prob)

    # Threshold tuning.
    # The F1-maximising cut-off is selected on the TRAINING fold and only then
    # applied to the validation fold. Selecting it on y_val and scoring F1 on
    # the same y_val would leak the validation labels into the decision rule
    # and make "F1 (tuned)" optimistically biased.
    precision, recall, thresholds = precision_recall_curve(y_tr, y_tr_prob)
    # 1e-8 guards against 0/0 when precision and recall are both zero.
    f1_scores = 2 * (precision * recall) / (precision + recall + 1e-8)
    # precision/recall have one more entry than thresholds (the final
    # (precision=1, recall=0) point has no threshold), hence the [:-1].
    best_idx = np.argmax(f1_scores[:-1])
    best_threshold = thresholds[best_idx]

    y_val_pred_tuned = (y_val_prob >= best_threshold).astype(int)

    fold_result = {
        "fold": fold,
        "roc_auc": roc_auc_score(y_val, y_val_prob),
        "auprc": auprc,
        "f1_default": f1_score(y_val, y_val_pred_default, zero_division=0),
        "f1_tuned": f1_score(y_val, y_val_pred_tuned, zero_division=0),
        "best_threshold": best_threshold,
    }

    cv_metrics.append(fold_result)

    print(f"\nFold {fold}:")
    print(f"  AUC:        {fold_result['roc_auc']:.3f}")
    print(f"  AUPRC:      {fold_result['auprc']:.3f}")
    print(f"  F1 (0.5):   {fold_result['f1_default']:.3f}")
    print(f"  F1 (tuned): {fold_result['f1_tuned']:.3f}")
    print(f"  Best thr:   {best_threshold:.3f}")

# Aggregate the per-fold metrics.
cv_df = pd.DataFrame(cv_metrics)

print("\n5-fold CV summary (LACE – patient split + tuned threshold)")
print("Mean AUC:", round(cv_df["roc_auc"].mean(), 3))
print("Mean AUPRC:", round(cv_df["auprc"].mean(), 3))
print("Mean F1 (0.5):", round(cv_df["f1_default"].mean(), 3))
print("Mean F1 (tuned):", round(cv_df["f1_tuned"].mean(), 3))

# Save plots for last validation fold.
# y_val, y_val_prob and best_threshold are the values left over from the
# final loop iteration, so the figures describe that fold only.
plot_and_save_metrics(
    model_name="lace_lastfold",
    y_test=y_val,
    y_prob=y_val_prob,
    threshold=best_threshold
)

print("Plots saved to figures/ folder")

Shape: (101766, 50)
   encounter_id  patient_nbr             race  gender      age weight  \
0       2278392      8222157        Caucasian  Female   [0-10)      ?   
1        149190     55629189        Caucasian  Female  [10-20)      ?   
2         64410     86047875  AfricanAmerican  Female  [20-30)      ?   
3        500364     82442376        Caucasian    Male  [30-40)      ?   
4         16680     42519267        Caucasian    Male  [40-50)      ?   

   admission_type_id  discharge_disposition_id  admission_source_id  \
0                  6                        25                    1   
1                  1                         1                    7   
2                  1                         1                    7   
3                  1                         1                    7   
4                  1                         1                    7   

   time_in_hospital  ... citoglipton insulin  glyburide-metformin  \
0                 1  ...          No      No 