In [None]:
# %%
# 💸 08_edge_analysis.ipynb – Simulated Betting Performance + Threshold/Sharpe/Bootstrap
# -------------------------------------------------------------------------------

import sys
import pandas as pd
import numpy as np
import yaml
from pathlib import Path
from tqdm import tqdm

# Resolve the project root: when the working directory is notebooks/, step up
# one level; otherwise the working directory itself is the root.
BASE = Path.cwd()
if BASE.name == "notebooks":
    BASE = BASE.parent

# src/ must be on sys.path before the project-local import below can resolve.
sys.path.append(str(BASE / "src"))

from processing.combine import create_edge_backtest_file

# --- CONFIG: Model Selection ---
CONFIG_PATH = BASE / "config.yaml"
with CONFIG_PATH.open("r") as f:
    config = yaml.safe_load(f)

# The prediction file is looked up by the configured default model name.
PRED_FILE = config["prediction_files"][config["default_model"]]

# Input/output locations used by the rest of the script.
OUTPUT_DIR = BASE / "outputs"
DATA_DIR = BASE / "data"
PRED_PATH = OUTPUT_DIR / PRED_FILE
MARKET_PATH = DATA_DIR / "market" / "betfair_sp.csv"
LABEL_PATH = DATA_DIR / "raw" / "test.csv"
OUT_PATH = OUTPUT_DIR / "test_predictions_with_edge.csv"

print(f"🔍 Edge/backtest using model: {config['default_model']} ({PRED_FILE})")

# Ensure standard probability column for backtest.
# Ensemble columns are tried in priority order; the first one present wins.
preds = pd.read_csv(PRED_PATH)
ensemble_col = next(
    (name for name in ("Pred_StackRidge", "Pred_BlendAvg") if name in preds.columns),
    None,
)
if ensemble_col is None:
    # Neither ensemble column exists: fall back to the file's last column.
    last_col = preds.columns[-1]
    print(f"⚠️ No standard ensemble column found, using '{last_col}' as Predicted_Probability.")
    ensemble_col = last_col
preds["Predicted_Probability"] = preds[ensemble_col]

# The prediction file is rewritten in place so the backtest builder, which
# takes file paths, sees the Predicted_Probability column.
preds.to_csv(PRED_PATH, index=False)
create_edge_backtest_file(PRED_PATH, MARKET_PATH, LABEL_PATH, OUT_PATH)

# %%
# === Load Edge-annotated Predictions ===
preds = pd.read_csv(OUT_PATH)
required_cols = {'Edge_Score', 'Market_Odds', 'Predicted_Probability', 'True_Label'}
missing = required_cols - set(preds.columns)
if missing:
    raise ValueError(f"Missing required columns: {missing} in {OUT_PATH}")

# === Compute Return Columns ===
# Building blocks shared by every staking scheme below: a bet is placed only
# on rows with a positive edge, and a winning unit stake pays Market_Odds - 1.
has_edge = preds["Edge_Score"] > 0
won = preds["True_Label"] == 1
net_odds = preds["Market_Odds"] - 1

# Flat staking: one unit per bet; rows without an edge return 0.
flat_outcome = np.where(won, net_odds, -1)
preds["Flat_Return"] = np.where(has_edge, flat_outcome, 0)

# Kelly stake fraction, (p * odds - 1) / (odds - 1), limited to [0, 1] and
# zeroed on rows where no bet is placed.
kelly_frac = (preds["Predicted_Probability"] * preds["Market_Odds"] - 1) / net_odds
kelly_frac = kelly_frac.clip(lower=0, upper=1)
preds["Kelly_Frac"] = np.where(has_edge, kelly_frac, 0)

# Kelly staking: win the staked fraction times net odds, or lose the stake.
kelly_outcome = np.where(won, preds["Kelly_Frac"] * net_odds, -preds["Kelly_Frac"])
preds["Kelly_Return"] = np.where(has_edge, kelly_outcome, 0)

# Fractional Kelly scales the full-Kelly return by a fixed factor.
fraction = 0.5
preds["Frac_Kelly_Return"] = preds["Kelly_Return"] * fraction

# %%
# === Edge Threshold Sweep + Sharpe Optimization ===
# For each candidate threshold, keep only rows whose Edge_Score exceeds it and
# summarise the flat-stake returns of that subset.
print("\n🔎 Sweeping Edge Thresholds (Sharpe, ROI, Win Rate)...")

thresholds = np.linspace(0, 0.10, 21)  # 0.00, 0.005, ..., 0.10
sweep_columns = ["Threshold", "Bets", "ROI_per_Bet", "Sharpe", "Win_Rate"]
sweep_stats = []
for thresh in thresholds:
    bets = preds[preds["Edge_Score"] > thresh]
    if bets.empty:
        continue
    flat = bets["Flat_Return"]
    roi = flat.mean()
    win_rate = bets["True_Label"].mean()
    # Sample standard deviation (pandas default); NaN for a single bet, in
    # which case the comparison is False and Sharpe is recorded as NaN.
    flat_std = flat.std()
    sharpe = roi / flat_std if flat_std > 0 else np.nan
    sweep_stats.append({
        "Threshold": thresh,
        "Bets": len(bets),
        "ROI_per_Bet": roi,
        "Sharpe": sharpe,
        "Win_Rate": win_rate
    })

# Explicit columns keep the frame (and the CSV header) well-formed even when
# no threshold produced any bets.
sweep_df = pd.DataFrame(sweep_stats, columns=sweep_columns)
sweep_df.to_csv(BASE / "outputs" / "edge_threshold_sweep.csv", index=False)
print("\n🗂️ Saved edge threshold sweep stats.")

# --- Find the threshold with the best Sharpe ratio ---
# Only thresholds with a defined Sharpe are candidates; fail with a clear
# message instead of an opaque KeyError when there are none.
valid_sharpe = sweep_df["Sharpe"].dropna()
if valid_sharpe.empty:
    raise ValueError(
        "No edge threshold produced a defined Sharpe ratio "
        "(no bets placed, or returns have zero variance); "
        "cannot select a best threshold."
    )
best_row = sweep_df.loc[valid_sharpe.idxmax()]
print(f"\n🌟 Best Sharpe at threshold {best_row['Threshold']:.3f}: Sharpe={best_row['Sharpe']:.3f}, ROI={best_row['ROI_per_Bet']:.3f}, Bets={int(best_row['Bets'])}")

# %%
# === Bootstrapped Confidence Intervals ===
def bootstrap_ci(data, func, n_iter=1000, ci=0.95, seed=None):
    """Estimate a statistic and its percentile bootstrap confidence interval.

    Draws ``n_iter`` resamples of ``data`` (same size, with replacement),
    applies ``func`` to each, and summarises the resulting statistics.

    Args:
        data: One-dimensional array-like of observations.
        func: Callable mapping a resampled array to a scalar statistic.
        n_iter: Number of bootstrap resamples.
        ci: Confidence level in [0, 1], e.g. 0.95 for a 95% interval.
        seed: Optional seed for reproducible resampling. When None, NumPy's
            global random state is used, as before this parameter existed.

    Returns:
        Tuple ``(mean, lower, upper)``: the mean of the bootstrap statistics
        and the lower/upper percentile bounds of the interval.

    Raises:
        ValueError: If ``ci`` is outside [0, 1].
    """
    # Fail before doing any resampling rather than inside np.percentile after it.
    if not 0 <= ci <= 1:
        raise ValueError(f"ci must be between 0 and 1, got {ci}")

    # Convert once instead of on every resampling call.
    values = np.asarray(data)
    n = len(values)

    # Both the legacy module-level API and a seeded Generator expose .choice.
    sampler = np.random if seed is None else np.random.default_rng(seed)

    stats = [
        func(sampler.choice(values, size=n, replace=True))
        for _ in range(n_iter)
    ]
    lower = np.percentile(stats, (1 - ci) / 2 * 100)
    upper = np.percentile(stats, (1 + ci) / 2 * 100)
    return np.mean(stats), lower, upper

# Apply bootstrapping at best Sharpe threshold
bets = preds[preds["Edge_Score"] > best_row["Threshold"]]
flat = bets["Flat_Return"].values


def _resample_sharpe(returns):
    """Return mean / sample std of one resample, or 0 if the std is not positive.

    Uses ddof=1 so the bootstrapped Sharpe is computed with the same estimator
    as the pandas ``.std()`` used for the point estimate in the threshold sweep.
    """
    spread = np.std(returns, ddof=1)
    return np.mean(returns) / spread if spread > 0 else 0


roi_mean, roi_low, roi_high = bootstrap_ci(flat, np.mean)
sharpe_mean, sharpe_low, sharpe_high = bootstrap_ci(flat, _resample_sharpe)
print(f"\n🔬 Bootstrapped ROI: {roi_mean:.3f} (95% CI {roi_low:.3f} to {roi_high:.3f})")
print(f"🔬 Bootstrapped Sharpe: {sharpe_mean:.3f} (95% CI {sharpe_low:.3f} to {sharpe_high:.3f})")

# Save detailed summary for report: point estimates come from the sweep row,
# interval bounds from the bootstrap above.
report_summary = {
    "Best_Threshold": best_row["Threshold"],
    "ROI_per_Bet": best_row["ROI_per_Bet"],
    "Sharpe": best_row["Sharpe"],
    "Win_Rate": best_row["Win_Rate"],
    "Total_Bets": int(best_row["Bets"]),
    "ROI_CI_Low": roi_low,
    "ROI_CI_High": roi_high,
    "Sharpe_CI_Low": sharpe_low,
    "Sharpe_CI_High": sharpe_high
}
pd.DataFrame([report_summary]).to_csv(BASE / "outputs" / "best_edge_summary.csv", index=False)

# %%
# === Save Everything / Finalize ===
# Write the frame, now including the return columns, back to the edge file.
preds.to_csv(OUT_PATH, index=False)
print(f"\n✅ Updated test_predictions_with_edge.csv with all return columns for portfolio simulation.")
print(f"✅ Best Sharpe summary saved to outputs/best_edge_summary.csv")

# Print key takeaway for submission/report
print(f"\nSummary for report:")
for k, v in report_summary.items():
    # Floats are shown to four decimals; everything else is printed as-is.
    if isinstance(v, float):
        print(f"{k}: {v:.4f}")
    else:
        print(f"{k}: {v}")

# %%
# === ROI and Sharpe vs. Edge Threshold Plot ===
import matplotlib.pyplot as plt

# Re-read the sweep from disk so the plot reflects what was saved.
sweep_df = pd.read_csv(BASE / "outputs" / "edge_threshold_sweep.csv")
best_threshold = float(report_summary["Best_Threshold"])

plt.figure(figsize=(8, 5))
plt.plot(sweep_df["Threshold"], sweep_df["ROI_per_Bet"], marker="o", label="ROI per Bet")
plt.plot(sweep_df["Threshold"], sweep_df["Sharpe"], marker="s", label="Sharpe Ratio")
# Format the threshold to 3 decimals, matching the console report, so float
# round-off (e.g. 0.035000000000000003) does not leak into the legend.
plt.axvline(best_threshold, color='orange', linestyle='--', label=f"Best Threshold: {best_threshold:.3f}")
plt.xlabel("Edge Threshold")
plt.ylabel("Metric Value")
plt.title("ROI & Sharpe vs. Edge Threshold")
plt.legend()
plt.grid(True, linestyle='--', alpha=0.5)
plt.tight_layout()
plt.show()

print("\n🏁 Edge, ROI, and Sharpe optimization complete. Ready for portfolio simulation or dashboard export.")





🔍 Edge/backtest using model: stackridge (test_preds_ensemble.csv)
✅ Saved: c:\Users\dylan\Documents\Projects\horse_model_project\outputs\test_predictions_with_edge.csv

🔎 Sweeping Edge Thresholds (Sharpe, ROI, Win Rate)...

🗂️ Saved edge threshold sweep stats.


KeyError: 'ROI'