In [6]:
# %%
# 🧬 05_ensemble_stack.ipynb – Blending & Stacking Ensemble (Tuned Models)
# -----------------------------------------------------------------------

import sys
from pathlib import Path
import pandas as pd
import numpy as np
from sklearn.linear_model import RidgeCV

# Project root: the working directory itself, unless the notebook was started
# from inside a "notebooks" folder, in which case the root is one level up.
BASE = Path.cwd()
if BASE.name == "notebooks":
    BASE = BASE.parent

# Project-relative folders used by this notebook.
DATA_INTERIM = BASE.joinpath("data", "interim")
OUTPUTS = BASE.joinpath("outputs")

# Create the outputs folder if needed; a no-op when it already exists.
OUTPUTS.mkdir(exist_ok=True)

# --- Utility: rescale each probability column so it sums to 1 within every group
def normalize_probs_per_race(df, group_col, prob_cols):
    """Rescale probability columns so each sums to 1 within every group.

    Args:
        df: DataFrame containing ``group_col`` and every column named in
            ``prob_cols``. It is modified in place.
        group_col: Column (or list of columns) identifying the group.
        prob_cols: Iterable of column names; each is normalized independently.

    Returns:
        The same ``df`` object, with every listed column divided by its
        per-group total.

    Note:
        A group whose total is 0 produces NaN (0/0) or +/-inf for its rows,
        because no rescaling can make such a group sum to 1.
    """
    for col in prob_cols:
        # transform("sum") uses pandas' built-in grouped reduction and
        # broadcasts each group's total back onto its rows, so a single
        # vectorised division replaces a Python-level lambda call per group.
        group_totals = df.groupby(group_col)[col].transform("sum")
        df[col] = df[col] / group_totals
    return df

# %%
# === Load TEST predictions for each tuned model ===
# Short model tag -> CSV file (inside OUTPUTS) holding that model's test
# predictions. The tag becomes the suffix of the model's "Pred_<tag>" column.
test_files = {
    "LGBM": "test_preds_lgbm_tuned.csv",
    "RF": "test_preds_rf_tuned.csv",
    "XGB": "test_preds_xgb_tuned.csv",
    "CAT": "test_preds_cat_tuned.csv",
    "MLP": "test_preds_mlp_tuned.csv",
    # Add "Market": "test_preds_market.csv" if you have market odds predictions
}

# Reduce each file to the join keys plus one model-specific prediction column.
test_dfs = []
for name, fname in test_files.items():
    df = pd.read_csv(OUTPUTS / fname)
    df = df.rename(columns={"Predicted_Probability": f"Pred_{name}"})
    test_dfs.append(df[["Race_ID", "Horse", f"Pred_{name}"]])

# Merge all predictions.
# how="inner" keeps only the (Race_ID, Horse) pairs present in every file.
# validate="one_to_one" makes pandas raise a MergeError when either side has a
# duplicated (Race_ID, Horse) key, instead of silently multiplying rows.
ensemble_test = test_dfs[0]
for df in test_dfs[1:]:
    ensemble_test = ensemble_test.merge(
        df,
        on=["Race_ID", "Horse"],
        how="inner",
        validate="one_to_one",
    )

# === Simple Blend (mean) ===
# Every loaded model takes part in the plain average, except an optional
# "Market" entry, which is only mixed in by the separate blend below.
model_cols = [f"Pred_{tag}" for tag in test_files if tag != "Market"]

# Equal weights (1/n each): the matrix-vector product below is therefore the
# row-wise mean of the model columns.
blend_weights = np.ones(len(model_cols)) / len(model_cols)
ensemble_test["Pred_BlendAvg"] = ensemble_test[model_cols].to_numpy() @ blend_weights

# Optional: fixed 70/30 mix of the model average and the market prediction,
# computed only when a "Pred_Market" column was loaded.
if "Pred_Market" in ensemble_test:
    ensemble_test["Pred_BlendWithMarket"] = (
        ensemble_test["Pred_BlendAvg"].mul(0.7)
        + ensemble_test["Pred_Market"].mul(0.3)
    )

# === Stacking: Ridge Regression ===
# NOTE(review): this is not a conventional stacking fit. The meta-model is
# trained on the test-set predictions themselves, and its target
# ("Pred_BlendAvg") is the equal-weight mean of the very columns used as
# features, so the ridge is being asked to reproduce that mean rather than to
# learn weights from observed outcomes. A usual stack fits the meta-learner on
# out-of-fold / validation predictions against the true label -- TODO confirm
# whether such data is available and fit on it instead.
meta = RidgeCV(alphas=np.logspace(-3, 3, num=7), cv=3).fit(
    ensemble_test[model_cols],
    ensemble_test["Pred_BlendAvg"],
)
# Ridge output is unbounded, so clamp it into the [0, 1] range.
ensemble_test["Pred_StackRidge"] = np.clip(
    meta.predict(ensemble_test[model_cols]), 0, 1
)

# Second-level ridge over (stacked prediction, market prediction), built only
# when a "Pred_Market" column was loaded. The NOTE(review) above applies here
# as well: the target is again "Pred_BlendAvg".
if "Pred_Market" in ensemble_test:
    X_test2 = np.stack(
        [ensemble_test["Pred_StackRidge"], ensemble_test["Pred_Market"]],
        axis=1,
    )
    meta2 = RidgeCV(alphas=np.logspace(-3, 3, num=7), cv=3).fit(
        X_test2, ensemble_test["Pred_BlendAvg"]
    )
    ensemble_test["Pred_StackRidgeMarket"] = np.clip(meta2.predict(X_test2), 0, 1)

# FINAL normalization
# Rescale every ensemble column that was actually produced so that its values
# sum to 1 within each Race_ID. The two market-based columns exist only when a
# "Pred_Market" column was loaded, hence the membership filter.
ensemble_test = normalize_probs_per_race(
    df=ensemble_test,
    group_col="Race_ID",
    prob_cols=[
        name
        for name in (
            "Pred_BlendAvg",
            "Pred_StackRidge",
            "Pred_BlendWithMarket",
            "Pred_StackRidgeMarket",
        )
        if name in ensemble_test
    ],
)

# %%
# === Save Ensemble Test Predictions ===
# Write the merged per-model and ensemble predictions to OUTPUTS, omitting the
# DataFrame index.
ensemble_test.to_csv(OUTPUTS.joinpath("test_preds_ensemble.csv"), index=False)
print("✅ Saved: test_preds_ensemble.csv")

# Quick check: join keys, each model's column, then whichever ensemble columns
# exist, for the first 10 rows.
display_cols = ["Race_ID", "Horse", *model_cols]
display_cols += [
    name
    for name in (
        "Pred_BlendAvg",
        "Pred_StackRidge",
        "Pred_BlendWithMarket",
        "Pred_StackRidgeMarket",
    )
    if name in ensemble_test
]
print(ensemble_test[display_cols].head(10))




✅ Saved: test_preds_ensemble.csv
   Race_ID               Horse  Pred_LGBM   Pred_RF  Pred_XGB  Pred_CAT  \
0       58  Signora Bellissima   0.038500  0.037956  0.038943  0.037098   
1       58      Sonnerie Power   0.139506  0.138155  0.137068  0.152437   
2       58          Ribba Hill   0.082640  0.083920  0.086366  0.072798   
3       58       Orange N Blue   0.035574  0.033268  0.041209  0.035736   
4       58       Ever Hopefull   0.140203  0.127846  0.135066  0.136374   
5       58        Percy Willis   0.087102  0.082929  0.087095  0.085127   
6       58           Hartswood   0.071074  0.065759  0.074564  0.074041   
7       58          Urban Road   0.099671  0.079528  0.093573  0.095975   
8       58       Alpine Sierra   0.113218  0.109183  0.122059  0.113714   
9       58       Midnight Lion   0.144505  0.126711  0.127812  0.103214   

   Pred_MLP  Pred_BlendAvg  Pred_StackRidge  
0  0.097829       0.048724         0.048725  
1  0.106912       0.131204         0.131203  
2  