In [7]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import (
    roc_auc_score,
    average_precision_score,
    roc_curve,
    precision_recall_curve,
    brier_score_loss
)
from sklearn.calibration import calibration_curve
import os

# Make sure the folder that receives the saved plots exists.
os.makedirs("figures", exist_ok=True)

# Stacking OOF results: both arrays are flattened to 1-D, and the labels are
# cast to int before use.
meta_oof = np.ravel(np.load("../oof_predictions/oof_meta.npy"))
y = np.ravel(np.load("../oof_predictions/y_oof_meta.npy").astype(int))

# Quick sanity summary: sample count and share of positive labels.
print("Samples:", len(y))
print("Positive rate:", round(y.mean(), 3))

# Bootstrap confidence intervals
def bootstrap_ci(y_true, y_prob, metric_fn, n_boot=1000, random_state=None,
                 alpha=0.05):
    """Return a percentile-bootstrap confidence interval for a metric.

    Rows are resampled with replacement ``n_boot`` times, ``metric_fn`` is
    evaluated on every resample, and the interval is read off the empirical
    ``alpha / 2`` and ``1 - alpha / 2`` quantiles of the collected scores.

    Args:
        y_true: 1-D array-like of labels.
        y_prob: 1-D array-like of predicted scores, aligned with ``y_true``.
        metric_fn: Callable invoked as ``metric_fn(y_true, y_prob)`` that
            returns a scalar score.
        n_boot: Number of bootstrap resamples to draw.
        random_state: Optional seed. ``None`` (the default) draws from NumPy's
            global random state, exactly as this function always did; any
            other value seeds a private generator so the interval is
            reproducible and the global state is left untouched.
        alpha: Total tail mass excluded from the interval; the default 0.05
            yields a 95% interval.

    Returns:
        A ``(lower, upper)`` tuple with the interval bounds.

    Raises:
        ValueError: If the inputs are empty, differ in length, or no resample
            produced a usable score.
    """
    # Accept lists/tuples as well as arrays; fancy indexing needs ndarrays.
    y_true = np.asarray(y_true)
    y_prob = np.asarray(y_prob)

    n = len(y_true)
    if n == 0:
        raise ValueError("bootstrap_ci requires at least one sample")
    if len(y_prob) != n:
        raise ValueError(
            f"y_true and y_prob differ in length: {n} vs {len(y_prob)}"
        )

    # The module-level functions share NumPy's global state, so a caller's
    # np.random.seed(...) keeps working when no explicit seed is given.
    rng = np.random if random_state is None else np.random.RandomState(random_state)

    # Only filter degenerate resamples when the full data holds more than one
    # label value; otherwise every resample would be discarded.
    has_multiple_labels = not (y_true == y_true[0]).all()

    scores = []
    for _ in range(n_boot):
        idx = rng.choice(n, n, replace=True)
        sample_true = y_true[idx]
        # Metrics such as ROC AUC are undefined on a single-class resample
        # (they raise or return NaN), so such draws are skipped rather than
        # allowed to abort or contaminate the interval.
        if has_multiple_labels and (sample_true == sample_true[0]).all():
            continue
        scores.append(metric_fn(sample_true, y_prob[idx]))

    if not scores:
        raise ValueError("no bootstrap resample produced a usable score")

    tail = 100.0 * alpha / 2.0
    lower = np.percentile(scores, tail)
    upper = np.percentile(scores, 100.0 - tail)
    return lower, upper

# Point estimates computed on the full set of OOF predictions.
auc = roc_auc_score(y, meta_oof)
auprc = average_precision_score(y, meta_oof)

# bootstrap_ci draws its resamples from NumPy's global random state. Seed it
# here so the reported intervals are identical on every run of this cell
# instead of drifting in the last digit.
np.random.seed(42)
auc_ci = bootstrap_ci(y, meta_oof, roc_auc_score)
auprc_ci = bootstrap_ci(y, meta_oof, average_precision_score)

# Report each metric together with its 95% percentile-bootstrap interval.
print("\nBOOTSTRAP CONFIDENCE INTERVALS")
print(f"AUC: {auc:.3f} (95% CI: {auc_ci[0]:.3f}–{auc_ci[1]:.3f})")
print(f"AUPRC: {auprc:.3f} (95% CI: {auprc_ci[0]:.3f}–{auprc_ci[1]:.3f})")

# ROC curve: true-positive rate against false-positive rate, with the AUC
# point estimate shown in the legend.
fpr, tpr, _ = roc_curve(y, meta_oof)

fig, ax = plt.subplots()
ax.plot(fpr, tpr, label=f"AUC = {auc:.3f}")
ax.plot([0, 1], [0, 1], "k--")  # diagonal reference line
ax.set_xlabel("False Positive Rate")
ax.set_ylabel("True Positive Rate")
ax.set_title("Stacking ROC")
ax.legend()
fig.tight_layout()
fig.savefig("figures/stacking_roc.png", dpi=300)
plt.close(fig)  # release the figure once it is on disk

# Precision-recall curve, with the AUPRC point estimate shown in the legend.
precision, recall, _ = precision_recall_curve(y, meta_oof)

fig, ax = plt.subplots()
ax.plot(recall, precision, label=f"AUPRC = {auprc:.3f}")
ax.set_xlabel("Recall")
ax.set_ylabel("Precision")
ax.set_title("Stacking Precision-Recall")
ax.legend()
fig.tight_layout()
fig.savefig("figures/stacking_pr.png", dpi=300)
plt.close(fig)  # release the figure once it is on disk

# Calibration curve: observed positive frequency against mean predicted
# probability, computed over 10 bins.
prob_true, prob_pred = calibration_curve(y, meta_oof, n_bins=10)

fig, ax = plt.subplots()
ax.plot(prob_pred, prob_true, marker="o")
ax.plot([0, 1], [0, 1], "k--")  # diagonal reference: predicted == observed
ax.set(
    xlabel="Predicted probability",
    ylabel="Observed frequency",
    title="Stacking Calibration",
)
fig.tight_layout()
fig.savefig("figures/stacking_calibration.png", dpi=300)
plt.close(fig)  # release the figure once it is on disk

# Brier score, reported to four decimals.
# NOTE(review): a constant prediction equal to the positive rate p scores
# p * (1 - p); compare the printed value against that reference to judge
# whether meta_oof behaves like a calibrated probability.
brier = brier_score_loss(y, meta_oof)
print("Brier score:", round(brier, 4))


Samples: 101766
Positive rate: 0.112

BOOTSTRAP CONFIDENCE INTERVALS
AUC: 0.682 (95% CI: 0.677–0.687)
AUPRC: 0.226 (95% CI: 0.220–0.233)
Brier score: 0.2241
