In [3]:
# %%
# 📟 06_metrics_evaluation.ipynb – Evaluation & Summary
# -----------------------------------------------------

import sys
from pathlib import Path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import yaml
from sklearn.calibration import calibration_curve
from sklearn.metrics import log_loss, brier_score_loss
from scipy.stats import entropy

# === Path Setup ===
# Resolve the project root whether the notebook is launched from the project
# root itself (a `src/` folder is present) or from a folder one level below it.
BASE = Path.cwd()
PROJECT_ROOT = BASE if (BASE / "src").exists() else BASE.parent

# Make `src/` importable. Guarded so that re-running this cell does not keep
# appending the same entry to sys.path.
SRC_DIR = str(PROJECT_ROOT / "src")
if SRC_DIR not in sys.path:
    sys.path.append(SRC_DIR)

# All exported summaries and metrics are written below OUT_DIR.
OUT_DIR = PROJECT_ROOT / "outputs"
OUT_DIR.mkdir(parents=True, exist_ok=True)
MARKET_PATH = PROJECT_ROOT / "data" / "market" / "betfair_sp.csv"

# === CONFIG: Load Model Selection ===
# config.yaml selects which model's predictions this notebook evaluates.
CONFIG_PATH = PROJECT_ROOT / "config.yaml"
# Explicit UTF-8: without it open() uses the locale encoding, which on Windows
# mis-decodes any non-ASCII characters in the YAML file.
with open(CONFIG_PATH, "r", encoding="utf-8") as f:
    config = yaml.safe_load(f)

default_model = config["default_model"]
prediction_files = config["prediction_files"]
# `or {}` also covers a `column_map:` key that is present but empty (None).
column_map = config.get("column_map") or {}

PRED_FILE = prediction_files[default_model]
PRED_PATH = OUT_DIR / PRED_FILE

# Decide OOF file for baseline/ensemble models.
# Only the LightGBM spelling variants need normalising to a fixed file name;
# every other model uses the oof_preds_<model>.csv naming scheme.
if default_model in ("lgbm", "LGBM"):
    OOF_FILE = "oof_preds_lgbm.csv"
else:
    OOF_FILE = f"oof_preds_{default_model}.csv"
OOF_PATH = OUT_DIR / OOF_FILE

print(f"🔍 Using model: {default_model} ({PRED_FILE})")

# === Pick column for prediction based on config ===
# Falls back to the generic column name when the model has no explicit mapping.
pred_col = column_map.get(default_model, "Predicted_Probability")

# %%
# === 1. Load OOF Predictions & Evaluate ===
# Scores the out-of-fold predictions stored at OOF_PATH: reliability curve,
# log loss, Brier score and a per-decile win-rate table. The whole section is
# skipped (with a notice) when the OOF file does not exist.
try:
    oof_df = pd.read_csv(OOF_PATH)
    _oof_missing = {'True_Label', 'Predicted_Probability', 'Race_ID'} - set(oof_df.columns)
    assert not _oof_missing, f"{OOF_PATH} is missing required columns: {sorted(_oof_missing)}"

    target = oof_df["True_Label"]
    oof_preds = oof_df["Predicted_Probability"]
    race_ids = oof_df["Race_ID"]

    print(f"✅ Loaded OOF predictions: {len(oof_df)} samples")

    # --- Calibration Curve ---
    prob_true, prob_pred = calibration_curve(target, oof_preds, n_bins=10, strategy='uniform')

    fig, ax = plt.subplots(figsize=(6, 6))
    ax.plot(prob_pred, prob_true, marker='o', label='Model Calibration')
    ax.plot([0, 1], [0, 1], linestyle='--', color='gray', label='Perfect Calibration')
    ax.set_title("Reliability Curve – Model Only")
    ax.set_xlabel("Predicted Probability")
    ax.set_ylabel("Empirical Win Rate")
    ax.legend()
    ax.grid(True)
    fig.tight_layout()
    plt.show()

    # --- Log Loss & Brier Score ---
    log = log_loss(target, oof_preds)
    brier = brier_score_loss(target, oof_preds)
    print(f"📉 Log Loss: {log:.5f}")
    print(f"🌧️  Brier Score: {brier:.5f}")

    # --- Confidence Bin Summary ---
    # include_lowest=True closes the first interval on the left; without it a
    # prediction of exactly 0.0 falls outside every bin and is silently dropped.
    oof_df["Prob_Bin"] = pd.cut(oof_preds, bins=np.linspace(0, 1, 11), include_lowest=True)
    # observed=False keeps empty deciles in the table and makes the behaviour
    # explicit instead of relying on a pandas default that is changing.
    summary = (
        oof_df.groupby("Prob_Bin", observed=False)["True_Label"]
        .agg(['count', 'mean'])
        .reset_index()
    )
    summary.columns = ["Bin", "Samples", "Empirical Win Rate"]
    print(summary)

    # --- Save ---
    eval_df = pd.DataFrame({
        "Log_Loss": [log],
        "Brier_Score": [brier]
    })
    eval_df.to_csv(OUT_DIR / "eval_metrics.csv", index=False)
    print("✅ Saved evaluation metrics")
except FileNotFoundError:
    # Not every model exports OOF predictions; treat absence as "nothing to score".
    print(f"⚠️ No OOF predictions found for '{default_model}'. Skipping OOF evaluation.")


# %%
# === 2. Load Test Predictions & Race-Level Metrics ===
df = pd.read_csv(PRED_PATH)
# Fail early, and name the offending columns, if the prediction file does not
# have the schema the rest of the notebook relies on.
_missing_cols = {'Race_ID', 'Horse', pred_col} - set(df.columns)
assert not _missing_cols, f"{PRED_PATH} is missing required columns: {sorted(_missing_cols)}"
print(f"✅ Loaded: {PRED_PATH} with {len(df)} rows")

# Race-level metrics
def top2_margin(group):
    """Return the gap between the two highest predicted probabilities in a race.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows belonging to one race. Must contain the column named by the
        module-level ``pred_col``.

    Returns
    -------
    float
        Highest probability minus second-highest probability. ``NaN`` when the
        race has fewer than two non-null probabilities (e.g. a single-runner
        race), since no margin is defined in that case.
    """
    # nlargest skips NaN, so missing predictions never count as a runner-up.
    top_two = group[pred_col].nlargest(2)
    if len(top_two) < 2:
        return np.nan
    return top_two.iloc[0] - top_two.iloc[1]

# Per-race aggregates of the predicted probabilities.
by_race = df.groupby('Race_ID')

# Selecting the prediction column before .apply keeps the grouping key out of
# the frame handed to top2_margin, which avoids the pandas warning about
# applying a function over the grouping columns.
race_margin = by_race[[pred_col]].apply(top2_margin)
# scipy's entropy normalises each race's probabilities to sum to 1 first.
race_entropy = by_race[pred_col].apply(entropy)
field_size = by_race.size()
# Top probability relative to the uniform baseline 1 / field size.
adj_confidence = by_race[pred_col].max() / (1 / field_size)

# Assemble by index (Race_ID) rather than by position so the columns stay
# aligned even if one aggregate ever comes back in a different order.
race_edge = (
    pd.DataFrame({
        'Margin_Top2': race_margin,
        'Entropy': race_entropy,
        'Field_Size': field_size,
        'Adj_Confidence': adj_confidence
    })
    .rename_axis('Race_ID')
    .reset_index()
)
race_edge.to_csv(OUT_DIR / "summary_race_edge.csv", index=False)
print("✅ Exported race-level edge metrics")

# %%
# === 3. Top Picks, Longshots, Confidence Bins ===
# Thresholds used in this section.
LONGSHOT_MIN_PROB = 0.10   # minimum model probability for a non-favourite to be listed
HIGH_CONF_THRESHOLD = 0.85
LOW_CONF_THRESHOLD = 0.02
N_TOP = 10                 # rows exported per summary table

df_sorted = df.sort_values(pred_col, ascending=False)
# Top pick per race = the complete row with the highest predicted probability.
# GroupBy.first() is deliberately not used: it returns the first NON-NULL value
# of each column independently, so a top pick with a missing field would have
# that field filled in from a different horse in the same race.
top_preds = (
    df_sorted.dropna(subset=["Race_ID"])
    .drop_duplicates(subset="Race_ID", keep="first")
    .sort_values("Race_ID")
    .reset_index(drop=True)
)
# Keep Race_ID as the leading column so the exported CSV layout is unchanged.
top_preds = top_preds[["Race_ID", *top_preds.columns.drop("Race_ID")]]
top_safe = top_preds.sort_values(pred_col, ascending=False).head(N_TOP)
top_safe.to_csv(OUT_DIR / "summary_top_safe.csv", index=False)
print("✅ Exported top safest picks")

# Rank 1 = model favourite within the race; ties are broken by row order.
df["Rank_in_Race"] = df.groupby("Race_ID")[pred_col].rank(ascending=False, method='first')
# "Longshots" here are non-favourites the model still rates at or above the threshold.
longshots = df[(df[pred_col] >= LONGSHOT_MIN_PROB) & (df["Rank_in_Race"] > 1)]
longshots = longshots.sort_values(pred_col, ascending=False).head(N_TOP)
longshots.to_csv(OUT_DIR / "summary_longshots.csv", index=False)
print("✅ Exported top longshots")

# Confidence bins
# include_lowest=True so a probability of exactly 0.0 is counted in the first
# bin instead of being dropped as out-of-range.
conf_bins = pd.cut(df[pred_col], bins=np.linspace(0, 1, 11), include_lowest=True)
conf_counts = conf_bins.value_counts().sort_index()
conf_summary = conf_counts.reset_index()
conf_summary.columns = ["Bin", "Count"]
conf_summary.to_csv(OUT_DIR / "summary_confidence_bins.csv", index=False)
print("✅ Exported confidence bin summary")

print(f"🎯 High-confidence horses (>85%): {int((df[pred_col] > HIGH_CONF_THRESHOLD).sum())}")
print(f"🧐 Low-confidence horses (<2%): {int((df[pred_col] < LOW_CONF_THRESHOLD).sum())}")

# %%
# === 4. Edge Score vs Market + Blended Probability ===
market_df = pd.read_csv(MARKET_PATH)
# Left join: every predicted runner is kept, with or without a market price.
df_edge = df.merge(market_df, on=["Race_ID", "Horse"], how="left")
if len(df_edge) != len(df):
    # More rows than predictions means the market file has repeated (Race_ID, Horse) keys.
    print(f"⚠️ Merge changed row count from {len(df)} to {len(df_edge)}; "
          "market file contains duplicate (Race_ID, Horse) rows")

# Implied probability = 1 / odds.
# NOTE(review): this presumes Market_Odds holds decimal odds — confirm against the data source.
# Non-numeric or non-positive odds are not valid prices; they are mapped to NaN
# rather than producing inf or negative "probabilities".
market_odds = pd.to_numeric(df_edge["Market_Odds"], errors="coerce")
df_edge["Market_Prob"] = 1 / market_odds.where(market_odds > 0)
df_edge["Edge_Score"] = df_edge[pred_col] - df_edge["Market_Prob"]

n_unpriced = int(df_edge["Market_Prob"].isna().sum())
if n_unpriced:
    print(f"⚠️ {n_unpriced} of {len(df_edge)} runners have no usable market price; "
          "their Market_Prob, Edge_Score and Blended_Probability are NaN")

# --- Compute Blended Probability ---
ALPHA = 0.7  # Static blending weight (share given to the model; 1 - ALPHA goes to the market)
df_edge["Blended_Probability"] = ALPHA * df_edge[pred_col] + (1 - ALPHA) * df_edge["Market_Prob"]

# Save blended predictions
df_edge.to_csv(OUT_DIR / "test_predictions_with_edge.csv", index=False)
print("✅ Exported test_predictions_with_edge.csv including Blended_Probability")

# Top mispriced
# Largest positive edge first: runners the model rates highest relative to the market.
mispriced_top = df_edge.sort_values("Edge_Score", ascending=False).head(10)
mispriced_top.to_csv(OUT_DIR / "summary_top_mispriced.csv", index=False)
print("✅ Exported top mispriced horses")

# %%
# === 5. Evaluate Blended Probability (Optional Section) ===
# Runs only when the prediction file carries ground-truth labels.
if 'True_Label' in df_edge.columns:
    # Runners without a market price have a NaN (or non-finite) blended
    # probability; the sklearn metrics reject such values, so score only the
    # rows where both the label and the blended probability are usable.
    _blend = df_edge["Blended_Probability"]
    _scorable = (
        df_edge["True_Label"].notna()
        & _blend.notna()
        & ~_blend.isin([np.inf, -np.inf])
    )
    n_excluded = int((~_scorable).sum())
    if n_excluded:
        print(f"⚠️ Excluding {n_excluded} of {len(df_edge)} rows without a usable "
              "label or blended probability from the blended evaluation")

    if not _scorable.any():
        print("⚠️ No rows available to evaluate the blended probability. Skipping.")
    else:
        y_true = df_edge.loc[_scorable, "True_Label"]
        y_blend = df_edge.loc[_scorable, "Blended_Probability"]

        log_blend = log_loss(y_true, y_blend)
        brier_blend = brier_score_loss(y_true, y_blend)

        # Calibration curve
        prob_true_blend, prob_pred_blend = calibration_curve(y_true, y_blend, n_bins=10)

        fig, ax = plt.subplots(figsize=(6, 6))
        ax.plot(prob_pred_blend, prob_true_blend, marker='o', label='Blended')
        ax.plot([0, 1], [0, 1], linestyle='--', color='gray', label='Perfect')
        ax.set_title("Reliability Curve – Blended Probability")
        ax.set_xlabel("Predicted Probability")
        ax.set_ylabel("Empirical Win Rate")
        ax.legend()
        ax.grid(True)
        fig.tight_layout()
        plt.show()

        print(f"\n🔀 Blended Log Loss: {log_blend:.5f}")
        print(f"🔀 Blended Brier Score: {brier_blend:.5f}")

        pd.DataFrame({
            "Log_Loss": [log_blend],
            "Brier_Score": [brier_blend],
            "Alpha": [ALPHA]
        }).to_csv(OUT_DIR / "eval_blended_metrics.csv", index=False)
        print("✅ Saved eval_blended_metrics.csv")

# %%
print("\n✅ Evaluation & summary complete. Ready for 07_visuals or 08_edge_analysis.")






🔍 Using model: stackridge (test_preds_ensemble.csv)
⚠️ No OOF predictions found for 'stackridge'. Skipping OOF evaluation.
✅ Loaded: c:\Users\dylan\Documents\Projects\horse_model_project\outputs\test_preds_ensemble.csv with 11275 rows


  race_margin = df.groupby('Race_ID').apply(top2_margin)


✅ Exported race-level edge metrics
✅ Exported top safest picks
✅ Exported top longshots
✅ Exported confidence bin summary
🎯 High-confidence horses (>85%): 0
🧐 Low-confidence horses (<2%): 0
✅ Exported test_predictions_with_edge.csv including Blended_Probability
✅ Exported top mispriced horses

✅ Evaluation & summary complete. Ready for 07_visuals or 08_edge_analysis.
