In [2]:
import sys
!{sys.executable} -m pip install xgboost

import pandas as pd
import numpy as np
import xgboost as xgb

from ucimlrepo import fetch_ucirepo
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, roc_auc_score, f1_score

from diabetes_utils import clean_diabetes_data, plot_and_save_metrics

# Fetch the UCI dataset (repository id 296) and pull out features / targets
diabetes_data = fetch_ucirepo(id=296)
X_raw, y_raw = diabetes_data.data.features, diabetes_data.data.targets

# The label column must be called "readmitted"; rename it when it is not
if "readmitted" not in y_raw.columns:
    y_raw.columns = ["readmitted"]

# Features and label side by side, then through the project cleaning helper
df_raw = pd.concat([X_raw, y_raw], axis=1)
df_clean = clean_diabetes_data(df_raw)

print("Cleaned shape:", df_clean.shape)
print(df_clean.head())

# Assemble the XGBoost design matrix.
# The label is the binary 30-day readmission flag produced by the cleaner.
target_col = "readmit_30d"

df_xgb = df_clean.copy()
y = df_xgb[target_col].values
# neither the raw label nor the target may stay among the features
df_xgb = df_xgb.drop(columns=["readmitted", target_col])

# object-dtype columns are treated as categorical; all others as numeric
cat_cols = df_xgb.select_dtypes(include="object").columns.tolist()
cat_col_set = set(cat_cols)
num_cols = [name for name in df_xgb.columns if name not in cat_col_set]

print("Categorical columns:", cat_cols)
print("Numeric columns:", num_cols)

# one LabelEncoder per categorical column, fitted on the column's string
# form and stored in `encoders` under the column name
encoders = {col: LabelEncoder() for col in cat_cols}
for col, encoder in encoders.items():
    df_xgb[col] = encoder.fit_transform(df_xgb[col].astype(str))

X = df_xgb.values
print("Feature matrix shape:", X.shape)
print("Positive rate (overall):", y.mean().round(3))

# Stratified 5-fold CV for XGBoost
def build_xgb(scale_pos_weight):
    """Create an unfitted XGBoost binary classifier with fixed hyperparameters.

    Args:
        scale_pos_weight: Forwarded unchanged to the classifier's
            ``scale_pos_weight`` parameter.

    Returns:
        A new ``xgb.XGBClassifier``; each call builds a fresh instance.
    """
    hyperparams = {
        "objective": "binary:logistic",
        "eval_metric": "auc",
        "tree_method": "hist",
        "n_estimators": 500,
        "learning_rate": 0.05,
        "max_depth": 4,
        "subsample": 0.8,
        "colsample_bytree": 0.8,
        "scale_pos_weight": scale_pos_weight,
        "n_jobs": 4,
        # early stopping is enabled, so fit() is expected to get an eval_set
        "early_stopping_rounds": 20,
    }
    return xgb.XGBClassifier(**hyperparams)

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_metrics = []

for fold, (train_idx, val_idx) in enumerate(skf.split(X, y), start=1):
    X_tr, X_val = X[train_idx], X[val_idx]
    y_tr, y_val = y[train_idx], y[val_idx]

    # Early stopping chooses the number of trees from the LAST eval_set entry.
    # If that entry were the validation fold, the fold would both tune the
    # model and score it, inflating the CV metrics.  So a small stratified
    # slice of the training fold is set aside for early stopping, and the
    # validation fold is only ever used for scoring.
    X_fit, X_es, y_fit, y_es = train_test_split(
        X_tr,
        y_tr,
        test_size=0.1,
        random_state=42,
        stratify=y_tr,
    )

    # class-weighting for imbalance in THIS fold: negatives / positives,
    # computed on the rows the model is actually fitted on
    pos_weight_fold = (len(y_fit) - y_fit.sum()) / y_fit.sum()
    print(f"\nFold {fold} scale_pos_weight: {pos_weight_fold:.2f}")

    xgb_cv = build_xgb(scale_pos_weight=pos_weight_fold)

    xgb_cv.fit(
        X_fit,
        y_fit,
        # first entry: training curve; second (last) entry: early stopping
        eval_set=[(X_fit, y_fit), (X_es, y_es)],
        verbose=False,
    )

    # Evaluate on the untouched validation fold (0.5 probability threshold)
    y_val_prob = xgb_cv.predict_proba(X_val)[:, 1]
    y_val_pred = (y_val_prob >= 0.5).astype(int)

    fold_result = {
        "fold": fold,
        "accuracy": accuracy_score(y_val, y_val_pred),
        "roc_auc": roc_auc_score(y_val, y_val_prob),
        "f1_pos":  f1_score(y_val, y_val_pred, zero_division=0),
    }
    cv_metrics.append(fold_result)

    print(f"Fold {fold}:")
    print(f"  accuracy: {fold_result['accuracy']:.3f}")
    print(f"  roc_auc:  {fold_result['roc_auc']:.3f}")
    print(f"  f1_pos:   {fold_result['f1_pos']:.3f}")

# Per-fold metrics as a table, then the mean of each metric across folds
cv_df = pd.DataFrame(cv_metrics)
print("\n5-fold CV summary (XGBoost)")
print(cv_df[["accuracy", "roc_auc", "f1_pos"]].mean().round(3))

# Original single train–test split + final XGBoost
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

print("\nTrain shape:", X_train.shape)
print("Test shape:", X_test.shape)
print("Positive rate (train):", y_train.mean().round(3))

# Early stopping chooses the number of trees from the LAST eval_set entry.
# Using the test set for that would tune the model on the very data it is
# scored on, so the reported test metrics would be optimistic.  Instead a
# stratified slice of TRAIN is set aside for early stopping and the test set
# is only touched for the final evaluation below.
X_fit, X_es, y_fit, y_es = train_test_split(
    X_train,
    y_train,
    test_size=0.1,
    random_state=42,
    stratify=y_train,
)

# simple class-weighting for imbalance: negatives / positives, computed on
# the rows the model is actually fitted on
pos_weight = (len(y_fit) - y_fit.sum()) / y_fit.sum()
print("Final scale_pos_weight:", round(pos_weight, 2))

# Same hyperparameters as the cross-validated models, via the shared factory
xgb_clf = build_xgb(scale_pos_weight=pos_weight)

# Fit with early stopping monitored on the split carved out of TRAIN
xgb_clf.fit(
    X_fit,
    y_fit,
    # first entry: training curve; second (last) entry: early stopping
    eval_set=[(X_fit, y_fit), (X_es, y_es)],
    verbose=True,
)

print("Best iteration:", xgb_clf.best_iteration)

# Evaluate on the held-out test set (0.5 probability threshold)
y_prob = xgb_clf.predict_proba(X_test)[:, 1]
y_pred = (y_prob >= 0.5).astype(int)

xgb_results = {
    "accuracy": round(accuracy_score(y_test, y_pred), 3),
    "roc_auc":  round(roc_auc_score(y_test, y_prob), 3),
    "f1_pos":   round(f1_score(y_test, y_pred, zero_division=0), 3),
}

print("\nXGBoost model results (no k fold):")
for k, v in xgb_results.items():
    print(f"  {k}: {v}")

# Save plots
plot_and_save_metrics("xgboost", y_test, y_prob)

# Save test labels and predicted probabilities
np.save("y_test_xgb.npy", y_test)
np.save("probs_xgb.npy", y_prob)



  df = pd.read_csv(data_url)


Cleaned shape: (101766, 49)
              race  gender      age  admission_type_id  \
0        Caucasian  Female   [0-10)                  6   
1        Caucasian  Female  [10-20)                  1   
2  AfricanAmerican  Female  [20-30)                  1   
3        Caucasian    Male  [30-40)                  1   
4        Caucasian    Male  [40-50)                  1   

   discharge_disposition_id  admission_source_id  time_in_hospital  \
0                        25                    1                 1   
1                         1                    7                 3   
2                         1                    7                 2   
3                         1                    7                 2   
4                         1                    7                 1   

   num_lab_procedures  num_procedures  num_medications  ...  \
0                  41               0                1  ...   
1                  59               0               18  ...   
2            