In [2]:
import sys
!{sys.executable} -m pip install ucimlrepo



In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from ucimlrepo import fetch_ucirepo
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score, f1_score

# Fetch dataset from UCI ML Repository (ID 296)
diabetes_data = fetch_ucirepo(id=296)
X = diabetes_data.data.features
y = diabetes_data.data.targets

# Make sure target column has a nice name.
# Rename on a copy so the fetched dataset object is not mutated in place.
if "readmitted" not in y.columns:
    y = y.copy()
    y.columns = ["readmitted"]

# Combine features and target into a single raw DataFrame.
# This cell only loads and assembles the raw data; the LACE-style baseline is
# fitted in its own cell further down, once the cleaned frame exists (it needs
# `readmit_30d`, which is not available at this point).
df = pd.concat([X, y], axis=1)

print("Shape:", df.shape)
print(df.head())
print(df.columns)
print(y.head())

Shape: (101766, 48)
              race  gender      age weight  admission_type_id  \
0        Caucasian  Female   [0-10)    NaN                  6   
1        Caucasian  Female  [10-20)    NaN                  1   
2  AfricanAmerican  Female  [20-30)    NaN                  1   
3        Caucasian    Male  [30-40)    NaN                  1   
4        Caucasian    Male  [40-50)    NaN                  1   

   discharge_disposition_id  admission_source_id  time_in_hospital payer_code  \
0                        25                    1                 1        NaN   
1                         1                    7                 3        NaN   
2                         1                    7                 2        NaN   
3                         1                    7                 2        NaN   
4                         1                    7                 1        NaN   

          medical_specialty  ...  citoglipton  insulin  glyburide-metformin  \
0  Pediatrics-Endocrino

  df = pd.read_csv(data_url)


In [10]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score, f1_score
from diabetes_utils import clean_diabetes_data

# LACE-style baseline: logistic regression on a handful of utilisation
# features, evaluated on a stratified 80/20 hold-out split.

# LACE-style features
lace_features = [
    "time_in_hospital",   # L
    "number_diagnoses",   # C
    "number_emergency",   # E
    "number_inpatient",   # extra utilization
    "admission_type_id",  # A
]
target_col = "readmit_30d"
categorical_col = "admission_type_id"

# Use cleaned data and keep only the LACE columns + target, so the baseline
# is fitted on exactly these features and nothing else the cleaning helper
# may return. Rows with a missing value in any of them are dropped.
# NOTE(review): assumes clean_diabetes_data(df) returns a frame containing
# every LACE column and `readmit_30d` — confirm against diabetes_utils.
lace_df = clean_diabetes_data(df)[lace_features + [target_col]].dropna()

X_lace = lace_df.drop(columns=[target_col])
y_lace = lace_df[target_col]

# One-hot encode admission_type_id (first level dropped as the reference)
X_lace = pd.get_dummies(X_lace, columns=[categorical_col], drop_first=True)

# Train–test split, stratified so both parts keep the same positive rate
X_train, X_test, y_train, y_test = train_test_split(
    X_lace, y_lace, test_size=0.2, random_state=42, stratify=y_lace
)

# Work on explicit copies: the split frames are slices of X_lace, and writing
# scaled values into them directly can raise SettingWithCopyWarning.
X_train = X_train.copy()
X_test = X_test.copy()

# Scale numeric features for logistic regression (every LACE feature except
# the one-hot encoded admission type). The scaler is fitted on the training
# part only and then applied to the test part.
num_cols = [col for col in lace_features if col != categorical_col]

scaler = StandardScaler()
X_train[num_cols] = scaler.fit_transform(X_train[num_cols])
X_test[num_cols] = scaler.transform(X_test[num_cols])

# Fit logistic regression
lace_logreg = LogisticRegression(max_iter=1000)
lace_logreg.fit(X_train, y_train)

# Evaluate: probabilities feed ROC-AUC, hard labels (default 0.5 cut-off)
# feed accuracy and F1.
y_prob = lace_logreg.predict_proba(X_test)[:, 1]
y_pred = lace_logreg.predict(X_test)

# NOTE(review): if the positive class is rare, accuracy can look high while
# F1 at the default cut-off stays near zero — read the three numbers together.
print("LACE-style logistic regression baseline:")
print("  Accuracy:", round(accuracy_score(y_test, y_pred), 3))
print("  ROC-AUC :", round(roc_auc_score(y_test, y_prob), 3))
print("  F1 (pos):", round(f1_score(y_test, y_pred, zero_division=0), 3))

LACE-style logistic regression baseline:
  Accuracy: 0.888
  ROC-AUC : 0.632
  F1 (pos): 0.028
