In [2]:
import numpy as np
from sklearn.metrics import accuracy_score, roc_auc_score, f1_score
from diabetes_utils import plot_and_save_metrics  

# Load the saved test labels and predicted probabilities for each model.
y_test_tarnn  = np.load("y_test_tarnn.npy")
y_test_tabnet = np.load("y_test_tabnet.npy")
y_test_xgb    = np.load("y_test_xgb.npy")

# NOTE(review): this file name is "prob_..." while the other two are
# "probs_..." -- confirm it matches what the TA-RNN notebook actually saved.
p_tarnn  = np.load("prob_tarnn.npy")
p_tabnet = np.load("probs_tabnet.npy")
p_xgb    = np.load("probs_xgb.npy")

# Sanity check that all models are evaluated on the SAME samples.
# Raise explicitly rather than `assert`: assertions are stripped under
# `python -O`, which would let mismatched test sets through silently.
for _name, _labels in (("TabNet", y_test_tabnet), ("XGBoost", y_test_xgb)):
    if not np.array_equal(y_test_tarnn, _labels):
        raise ValueError(
            f"{_name} test labels differ from the TA-RNN test labels; "
            "the models were not evaluated on the same samples."
        )

y_test = y_test_tarnn

print("y_test shape:", y_test.shape)
print("TA-RNN probs shape:",  p_tarnn.shape)
print("TabNet probs shape:",  p_tabnet.shape)
print("XGBoost probs shape:", p_xgb.shape)

# Each probability array must have exactly the label shape. Otherwise the
# element-wise average below could broadcast silently (for example
# (N, 1) + (N,) -> (N, N)) and produce meaningless ensemble scores.
# Checked after the prints so the offending shapes are visible above the error.
for _name, _probs in (
    ("TA-RNN", p_tarnn),
    ("TabNet", p_tabnet),
    ("XGBoost", p_xgb),
):
    if _probs.shape != y_test.shape:
        raise ValueError(
            f"{_name} probabilities have shape {_probs.shape}, "
            f"expected {y_test.shape} to match y_test."
        )

# Ensemble: unweighted mean of the three models' predicted probabilities.
p_ens = (p_tarnn + p_tabnet + p_xgb) / 3.0

# Hard labels: predict the positive class when the averaged probability
# reaches the 0.5 cut-off.
_is_positive = p_ens >= 0.5
y_pred_ens = _is_positive.astype(int)

# Compute the raw scores in a fixed order, then round each to 3 decimals.
# ROC AUC is scored on the averaged probabilities; accuracy and F1 on the
# thresholded labels.
_raw_scores = (
    ("accuracy", accuracy_score(y_test, y_pred_ens)),
    ("roc_auc", roc_auc_score(y_test, p_ens)),
    ("f1_pos", f1_score(y_test, y_pred_ens, zero_division=0)),
)
ens_results = {metric: round(score, 3) for metric, score in _raw_scores}

print("\nEnsemble TA-RNN + TabNet + XGBoost:")
for metric, score in ens_results.items():
    print(f"  {metric}: {score}")

# Hand the labels and ensemble probabilities to the project helper, tagged
# with this ensemble's name (presumably used for output file names -- see
# diabetes_utils.plot_and_save_metrics).
plot_and_save_metrics("ensemble_tarnn_tabnet_xgb", y_test, p_ens)

y_test shape: (20354,)
TA-RNN probs shape: (20354,)
TabNet probs shape: (20354,)
XGBoost probs shape: (20354,)

Ensemble TA-RNN + TabNet + XGBoost:
  accuracy: 0.889
  roc_auc: 0.684
  f1_pos: 0.043
