In [4]:
import sys
# Install ucimlrepo through the interpreter that is running this kernel
# (sys.executable), so the `from ucimlrepo import ...` in the following cell
# resolves against the same environment the package was installed into.
!{sys.executable} -m pip install ucimlrepo



In [6]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from ucimlrepo import fetch_ucirepo

# Download dataset 296 from the UCI ML Repository and pull out its two frames
diabetes_data = fetch_ucirepo(id=296)
X, y = diabetes_data.data.features, diabetes_data.data.targets

# Rename the target column to "readmitted" unless a column by that name exists
if "readmitted" not in y.columns:
    y.columns = ["readmitted"]

# Features and target side by side in one DataFrame
df = pd.concat((X, y), axis="columns")

# Quick look: overall shape, first rows, column index, first target values
print("Shape:", df.shape)
print(df.head(), df.columns, y.head(), sep="\n")

  df = pd.read_csv(data_url)


Shape: (101766, 48)
              race  gender      age weight  admission_type_id  \
0        Caucasian  Female   [0-10)    NaN                  6   
1        Caucasian  Female  [10-20)    NaN                  1   
2  AfricanAmerican  Female  [20-30)    NaN                  1   
3        Caucasian    Male  [30-40)    NaN                  1   
4        Caucasian    Male  [40-50)    NaN                  1   

   discharge_disposition_id  admission_source_id  time_in_hospital payer_code  \
0                        25                    1                 1        NaN   
1                         1                    7                 3        NaN   
2                         1                    7                 2        NaN   
3                         1                    7                 2        NaN   
4                         1                    7                 1        NaN   

          medical_specialty  ...  citoglipton  insulin  glyburide-metformin  \
0  Pediatrics-Endocrino

In [7]:
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score, f1_score
from diabetes_utils import clean_diabetes_data
from diabetes_utils import plot_and_save_metrics
import pandas as pd

# Run the project's cleaning routine over the raw frame
df_clean = clean_diabetes_data(df)

# Columns that make up the LACE-style feature set
lace_features = [
    "time_in_hospital",   # L: length of stay
    "number_diagnoses",   # C: comorbidity proxy
    "number_emergency",   # E: prior emergency visits
    "number_inpatient",   # extra utilization signal
    "admission_type_id",  # A: acuity of admission
]

# Restrict to the LACE columns plus the target and discard incomplete rows
lace_df = df_clean[[*lace_features, "readmit_30d"]].dropna()
y_lace = lace_df["readmit_30d"]

# Dummy-code admission_type_id (first level dropped); the remaining feature
# columns are carried over unchanged
X_lace = pd.get_dummies(
    lace_df[lace_features],
    columns=["admission_type_id"],
    drop_first=True,
)

print("LACE feature matrix shape:", X_lace.shape)
print("LACE columns:", list(X_lace.columns))

# Columns handed to the scaler: every LACE feature except the categorical
# admission type
num_cols = [col for col in lace_features if col != "admission_type_id"]


# Stratified 5-fold cross-validation. Stratifying on y_lace keeps the class
# ratio the same in every fold, which matters because early (<30 day)
# readmissions are the rare class; averaging over folds gives steadier metrics
# than a single split.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_metrics = []

for fold, (train_idx, val_idx) in enumerate(skf.split(X_lace, y_lace), start=1):
    # Independent copies, so the scaled columns can be written back in place
    X_tr, X_val = X_lace.iloc[train_idx].copy(), X_lace.iloc[val_idx].copy()
    y_tr, y_val = y_lace.iloc[train_idx], y_lace.iloc[val_idx]

    # Scaling statistics come from the training fold only (avoids leakage)
    scaler_cv = StandardScaler().fit(X_tr[num_cols])
    X_tr[num_cols] = scaler_cv.transform(X_tr[num_cols])
    X_val[num_cols] = scaler_cv.transform(X_val[num_cols])

    lace_cv = LogisticRegression(max_iter=1000).fit(X_tr, y_tr)

    # Positive-class probability for ROC AUC, hard labels for accuracy / F1
    y_val_prob = lace_cv.predict_proba(X_val)[:, 1]
    y_val_pred = lace_cv.predict(X_val)

    fold_result = dict(
        fold=fold,
        accuracy=accuracy_score(y_val, y_val_pred),
        roc_auc=roc_auc_score(y_val, y_val_prob),
        f1_pos=f1_score(y_val, y_val_pred, zero_division=0),
    )
    cv_metrics.append(fold_result)

    print(
        f"\nFold {fold}:",
        f"  accuracy: {fold_result['accuracy']:.3f}",
        f"  roc_auc:  {fold_result['roc_auc']:.3f}",
        f"  f1_pos:   {fold_result['f1_pos']:.3f}",
        sep="\n",
    )

# Mean of each metric across the folds (the fold counter itself is left out)
cv_df = pd.DataFrame(cv_metrics)
print("\n5-fold CV summary (LACE logistic)")
print(cv_df.drop(columns=["fold"]).mean().round(3))


# Original held-out train-test split (80/20, stratified on the target) + plotting
X_train, X_test, y_train, y_test = train_test_split(
    X_lace,
    y_lace,
    test_size=0.2,
    random_state=42,
    stratify=y_lace
)

# Take explicit copies before writing scaled columns back, exactly as the CV
# loop does with .copy(): the split frames are row subsets of X_lace, and
# assigning into such subsets can trigger pandas' SettingWithCopyWarning.
X_train = X_train.copy()
X_test = X_test.copy()

# Fit the scaler on the training rows only, then apply it to both partitions
scaler = StandardScaler()
X_train[num_cols] = scaler.fit_transform(X_train[num_cols])
X_test[num_cols] = scaler.transform(X_test[num_cols])

lace_logreg = LogisticRegression(max_iter=1000)
lace_logreg.fit(X_train, y_train)

y_prob = lace_logreg.predict_proba(X_test)[:, 1]  # P(readmit_30d = 1) = P(<30d)
y_pred = lace_logreg.predict(X_test)

# Hold-out metrics, rounded to three decimals for display
lace_results = {
    "accuracy": round(accuracy_score(y_test, y_pred), 3),
    "roc_auc": round(roc_auc_score(y_test, y_prob), 3),
    "f1_pos":  round(f1_score(y_test, y_pred, zero_division=0), 3),
}

print("\nLACE-style logistic regression baseline (no k fold):")
for k, v in lace_results.items():
    print(f"  {k}: {v}")

# Pass the held-out labels and predicted probabilities to the project plotting
# helper. NOTE(review): presumably it writes ROC / PR plots under the
# "lace_logreg" name -- confirm against diabetes_utils.
plot_and_save_metrics("lace_logreg", y_test, y_prob)

LACE feature matrix shape: (101766, 11)
LACE columns: ['time_in_hospital', 'number_diagnoses', 'number_emergency', 'number_inpatient', 'admission_type_id_2', 'admission_type_id_3', 'admission_type_id_4', 'admission_type_id_5', 'admission_type_id_6', 'admission_type_id_7', 'admission_type_id_8']

Fold 1:
  accuracy: 0.889
  roc_auc:  0.621
  f1_pos:   0.040

Fold 2:
  accuracy: 0.889
  roc_auc:  0.639
  f1_pos:   0.032

Fold 3:
  accuracy: 0.888
  roc_auc:  0.628
  f1_pos:   0.024

Fold 4:
  accuracy: 0.889
  roc_auc:  0.635
  f1_pos:   0.034

Fold 5:
  accuracy: 0.888
  roc_auc:  0.622
  f1_pos:   0.022

5-fold CV summary (LACE logistic)
accuracy    0.888
roc_auc     0.629
f1_pos      0.031
dtype: float64

LACE-style logistic regression baseline (no k fold):
  accuracy: 0.888
  roc_auc: 0.632
  f1_pos: 0.028
