In [6]:
import numpy as np
from sklearn.metrics import roc_auc_score, f1_score, average_precision_score
from utils.diabetes_utils import clean_diabetes_data
from utils.diabetes_utils import plot_and_save_metrics

# Grouped 5-fold cross-validation of the XGBoost model.
# Fold membership comes from GroupKFold driven by `groups`; every fold
# trains a fresh model from build_xgb and scores its held-out rows.
gkf = GroupKFold(n_splits=5)

# Out-of-fold (OOF) buffers, written at each fold's validation indices.
n_rows = len(y)
oof_probs = np.zeros(n_rows)
oof_labels = np.zeros(n_rows)

# One metrics dict per fold, appended in fold order.
cv_metrics = []

# Candidate decision thresholds: 81 evenly spaced values from 0.1 to 0.9.
thr_grid = np.linspace(0.1, 0.9, 81)

fold_splits = gkf.split(X, y, groups=groups)
for fold, (train_idx, val_idx) in enumerate(fold_splits, start=1):
    # Positional indexing with index arrays.
    # NOTE(review): presumably X and y are NumPy arrays -- confirm.
    X_tr, y_tr = X[train_idx], y[train_idx]
    X_val, y_val = X[val_idx], y[val_idx]

    # Per-fold imbalance weight: (rows - sum of labels) / sum of labels,
    # i.e. negatives / positives assuming y is coded 0/1 -- TODO confirm.
    n_pos = y_tr.sum()
    pos_weight_fold = (len(y_tr) - n_pos) / n_pos

    xgb_cv = build_xgb(scale_pos_weight=pos_weight_fold)

    # NOTE(review): the validation fold is passed as eval_set. If build_xgb
    # enables early stopping, the fold metrics below are optimistic because
    # the same rows pick the stopping round -- confirm.
    xgb_cv.fit(X_tr, y_tr, eval_set=[(X_val, y_val)], verbose=False)

    # Second predict_proba column, used as the positive-class score.
    y_val_prob = xgb_cv.predict_proba(X_val)[:, 1]

    # Store OOF predictions and their labels for this fold.
    oof_probs[val_idx] = y_val_prob
    oof_labels[val_idx] = y_val

    # Threshold tuning for reporting only: F1 at every grid threshold, then
    # take the first maximum. Defaults (F1 0, threshold 0.5) are kept when no
    # threshold yields a positive F1.
    f1_by_thr = [
        f1_score(y_val, (y_val_prob >= thr).astype(int), zero_division=0)
        for thr in thr_grid
    ]
    top = int(np.argmax(f1_by_thr))
    if f1_by_thr[top] > 0:
        best_f1, best_thr = f1_by_thr[top], thr_grid[top]
    else:
        best_f1, best_thr = 0, 0.5

    # Hard predictions at the conventional 0.5 cut-off.
    default_cut = (y_val_prob >= 0.5).astype(int)

    fold_result = {
        "fold": fold,
        "roc_auc": roc_auc_score(y_val, y_val_prob),
        "auprc": average_precision_score(y_val, y_val_prob),
        "f1_05": f1_score(y_val, default_cut, zero_division=0),
        "f1_tuned": best_f1,
    }
    cv_metrics.append(fold_result)

    # Per-fold report, rounded to three decimals.
    print(f"\nFold {fold}")
    for label, value in (
        ("  AUC:", fold_result["roc_auc"]),
        ("  AUPRC:", fold_result["auprc"]),
        ("  F1 (0.5):", fold_result["f1_05"]),
        ("  F1 (tuned):", best_f1),
        ("  Best thr:", best_thr),
    ):
        print(label, round(value, 3))

# Collect the per-fold metric dicts into a table (one row per fold).
cv_df = pd.DataFrame(cv_metrics)

# Report the across-fold mean of each metric, rounded to three decimals.
print("\n5-fold CV summary (XGBoost OOF)")
for label, column in (
    ("Mean AUC:", "roc_auc"),
    ("Mean AUPRC:", "auprc"),
    ("Mean F1 (0.5):", "f1_05"),
    ("Mean F1 (tuned):", "f1_tuned"),
):
    print(label, round(cv_df[column].mean(), 3))

# Use OOF predictions for plotting.
#
# The decision threshold is tuned on the pooled OOF predictions rather than
# taken from `best_thr`: after the CV loop that name only holds the threshold
# found on the last fold, which is not representative of all folds.
# NOTE: tuning and evaluating on the same OOF set is still slightly
# optimistic; treat the thresholded metrics as descriptive.
oof_thr_grid = np.linspace(0.1, 0.9, 81)
oof_y_int = oof_labels.astype(int)  # labels were stored in a float buffer
oof_f1_by_thr = [
    f1_score(oof_y_int, (oof_probs >= t).astype(int), zero_division=0)
    for t in oof_thr_grid
]
oof_best_idx = int(np.argmax(oof_f1_by_thr))  # first maximum on ties
# Fall back to the conventional 0.5 when no threshold gives a positive F1.
if oof_f1_by_thr[oof_best_idx] > 0:
    oof_best_thr = float(oof_thr_grid[oof_best_idx])
else:
    oof_best_thr = 0.5
print("OOF-tuned thr:", round(oof_best_thr, 3))

plot_and_save_metrics(
    model_name="xgboost_oof",
    y_test=oof_labels,
    y_prob=oof_probs,
    threshold=oof_best_thr,
)

# Save OOF predictions and their labels as .npy files.
# np.save does not create missing parent directories, so make sure the
# output folder exists first; otherwise a fresh checkout would fail here,
# after the whole cross-validation run has already completed.
import os

os.makedirs("oof_predictions", exist_ok=True)
np.save("oof_predictions/xgb_oof_probs.npy", oof_probs)
np.save("oof_predictions/y_oof_xgb.npy", oof_labels)


Fold 1
  AUC: 0.686
  AUPRC: 0.234
  F1 (0.5): 0.288
  F1 (tuned): 0.296
  Best thr: 0.58

Fold 2
  AUC: 0.663
  AUPRC: 0.198
  F1 (0.5): 0.26
  F1 (tuned): 0.272
  Best thr: 0.51

Fold 3
  AUC: 0.676
  AUPRC: 0.224
  F1 (0.5): 0.272
  F1 (tuned): 0.283
  Best thr: 0.57

Fold 4
  AUC: 0.678
  AUPRC: 0.221
  F1 (0.5): 0.274
  F1 (tuned): 0.285
  Best thr: 0.52

Fold 5
  AUC: 0.663
  AUPRC: 0.209
  F1 (0.5): 0.261
  F1 (tuned): 0.271
  Best thr: 0.51

5-fold CV summary (XGBoost OOF)
Mean AUC: 0.673
Mean AUPRC: 0.217
Mean F1 (0.5): 0.271
Mean F1 (tuned): 0.281
