In [15]:
import os
import glob
import joblib
import numpy as np
import pandas as pd

# =========================
# CONFIG
# =========================
DATA_FILE = "post_info_2026_diff.csv"             # feature file to predict on
OUT_FILE  = "post_2026_predictions_main.csv"      # where the predictions are written

HOLDOUT_DIR = "models_holdout"   # scanned for *.joblib bundles, tagged "holdout"
RANDOM_DIR  = "models_random"    # scanned for *.joblib bundles, tagged "random"

# Optional: keep identifiers in the output (add/remove as you like).
# The header is parsed once (nrows=0 reads column names only) instead of
# re-reading the CSV for every candidate name inside the comprehension filter.
_header_cols = pd.read_csv(DATA_FILE, nrows=0).columns
ID_COLS = [c for c in ["home_team", "away_team"] if c in _header_cols]


# =========================
# HELPERS
# =========================
def predict_from_bundle(bundle, df_features: pd.DataFrame) -> np.ndarray:
    """
    Run one saved model bundle against a feature table.

    The bundle is a dict holding at least ``"model"`` (any object with a
    ``predict`` method) and ``"features"`` (column names in the order the
    model expects). ``"scaler"`` and ``"numeric_cols"`` are optional; when
    both are set, the scaler's ``transform`` is applied to those columns only.

    Each selected column is coerced to numeric. Values that cannot be parsed,
    and missing values, become 0 before scaling and prediction. The input
    frame itself is not modified.

    Raises
    ------
    ValueError
        If ``df_features`` lacks any column listed in ``bundle["features"]``.
    """
    estimator = bundle["model"]
    wanted = bundle["features"]
    fitted_scaler = bundle.get("scaler", None)
    scaled_cols = bundle.get("numeric_cols", [])

    # Fail fast, reporting every absent column in one message
    absent = [name for name in wanted if name not in df_features.columns]
    if absent:
        raise ValueError(f"Missing columns required by model {bundle.get('model_name','?')}: {absent}")

    # Select in training order, then sanitise one step at a time
    X = df_features[wanted].copy()
    X = X.apply(pd.to_numeric, errors="coerce")
    X = X.fillna(0)

    # Bundles without a scaler (or without columns to scale) predict directly
    if fitted_scaler is None or not scaled_cols:
        return estimator.predict(X)

    X[scaled_cols] = fitted_scaler.transform(X[scaled_cols])
    return estimator.predict(X)


def load_and_predict_dir(model_dir: str, tag: str, df: pd.DataFrame) -> pd.DataFrame:
    """
    Load every ``*.joblib`` bundle in ``model_dir`` and predict on ``df``.

    Column naming:
      prediction_spread_{tag}_{modelname}
    where modelname strips any leading 'holdout_' or 'random_'.

    Parameters
    ----------
    model_dir : directory searched (non-recursively) for ``*.joblib`` files.
    tag       : label embedded in every output column name.
    df        : feature frame handed to ``predict_from_bundle`` for each bundle.

    Returns
    -------
    DataFrame with one column per bundle, carrying ``df.index`` so that placing
    it next to ``df`` with ``pd.concat(..., axis=1)`` lines rows up even when
    ``df`` does not have a default 0..n-1 index. If the directory holds no
    bundles the frame has the same rows and no columns.
    """
    preds = {}

    for path in sorted(glob.glob(os.path.join(model_dir, "*.joblib"))):
        # NOTE: joblib.load unpickles the file, which can execute arbitrary
        # code. Only point this at bundles you produced yourself.
        bundle = joblib.load(path)

        # Determine a stable name. Fall back to the file stem when the bundle
        # has no model_name, or stores None / an empty one (which would
        # otherwise fail on .lower() or yield an empty column suffix).
        file_stem = os.path.splitext(os.path.basename(path))[0]
        raw_name = str(bundle.get("model_name") or file_stem)

        # Strip prefixes to keep the column readable
        base_name = raw_name
        if base_name.lower().startswith("holdout_"):
            base_name = base_name[len("holdout_"):]
        if base_name.lower().startswith("random_"):
            base_name = base_name[len("random_"):]

        col = f"prediction_spread_{tag}_{base_name}"
        preds[col] = predict_from_bundle(bundle, df)

    # Arrays are placed positionally under df's own row labels.
    return pd.DataFrame(preds, index=df.index)


# =========================
# MAIN
# =========================
# Feature table that every bundle predicts on
df = pd.read_csv(DATA_FILE)

# One prediction frame per model directory
pred_holdout = load_and_predict_dir(HOLDOUT_DIR, "holdout", df)
pred_random = load_and_predict_dir(RANDOM_DIR, "random", df)

# Identifier columns first (when configured), then both prediction frames
out = pd.DataFrame()
if ID_COLS:
    out[ID_COLS] = df[ID_COLS]
out = pd.concat([out, pred_holdout, pred_random], axis=1)

# Group the prediction columns by the tag embedded in their names
holdout_cols = [name for name in out.columns if name.startswith("prediction_spread_holdout_")]
random_cols = [name for name in out.columns if name.startswith("prediction_spread_random_")]
all_pred_cols = holdout_cols + random_cols

# Row-wise mean per group; an empty group yields a NaN column
for avg_name, member_cols in (
    ("avg_holdout", holdout_cols),
    ("avg_random", random_cols),
    ("avg_all", all_pred_cols),
):
    if member_cols:
        out[avg_name] = out[member_cols].mean(axis=1)
    else:
        out[avg_name] = np.nan

# Persist and report
out.to_csv(OUT_FILE, index=False)

print(f"✅ Saved: {OUT_FILE}")
print(f"Holdout models: {len(holdout_cols)} | Random models: {len(random_cols)} | Total: {len(all_pred_cols)}")
print("Columns:", list(out.columns))

✅ Saved: post_2026_predictions_main.csv
Holdout models: 7 | Random models: 7 | Total: 14
Columns: ['home_team', 'away_team', 'prediction_spread_holdout_ElasticNet', 'prediction_spread_holdout_GradientBoosting_TUNED', 'prediction_spread_holdout_HuberRegressor', 'prediction_spread_holdout_Lasso', 'prediction_spread_holdout_LinearRegression', 'prediction_spread_holdout_RandomForest', 'prediction_spread_holdout_Ridge', 'prediction_spread_random_ElasticNet', 'prediction_spread_random_GradientBoosting_TUNED', 'prediction_spread_random_HuberRegressor', 'prediction_spread_random_Lasso', 'prediction_spread_random_LinearRegression', 'prediction_spread_random_RandomForest', 'prediction_spread_random_Ridge', 'avg_holdout', 'avg_random', 'avg_all']


In [17]:
import os
import glob
import joblib
import numpy as np
import pandas as pd

# ======================================================
# CONFIG
# ======================================================
# Feature table loaded with pd.read_csv in the MAIN section of this cell.
DATA_FILE = "post_info_2026_diff.csv"
# Destination of out.to_csv() at the end of this cell.
# NOTE(review): this is the same path the In [15] cell writes to, so running
# this cell replaces that file. Confirm the overwrite is intended.
OUT_FILE  = "post_2026_predictions_main.csv"

# Directories searched for *.joblib bundles; their columns are labelled with
# the "holdout" and "random" prefixes respectively.
HOLDOUT_DIR = "models_holdout"
RANDOM_DIR  = "models_random"

# Candidate identifier columns; MAIN keeps only those found in the data file.
ID_COLS = ["home_team", "away_team"]  # kept if present

# ======================================================
# HELPERS
# ======================================================
def _get_base_name(raw_name: str) -> str:
    base = raw_name or ""
    low = base.lower()
    if low.startswith("holdout_"):
        base = base[len("holdout_"):]
    elif low.startswith("random_"):
        base = base[len("random_"):]
    return base


def predict_from_bundle(bundle: dict, df: pd.DataFrame) -> np.ndarray:
    """
    Produce predictions for ``df`` from one saved bundle.

    Keys this function reads from the bundle:
      - "model":        object exposing ``predict`` (required)
      - "features":     column names, in the order passed to the model (required)
      - "scaler":       object exposing ``transform``, or None (optional)
      - "numeric_cols": columns handed to the scaler (optional)
      - "model_name":   used only in the error message (optional)
    Other keys a bundle may carry (e.g. "binary_cols", "pi_halfwidth_q") are
    ignored here.

    The selected columns are coerced to numeric, with unparseable or missing
    values replaced by 0, before optional scaling and prediction.

    Raises
    ------
    ValueError
        If any name in ``bundle["features"]`` is not a column of ``df``.
    """
    model = bundle["model"]
    feature_order = bundle["features"]
    scaler = bundle.get("scaler", None)
    cols_to_scale = bundle.get("numeric_cols", [])

    known = df.columns
    not_found = [feat for feat in feature_order if feat not in known]
    if not_found:
        raise ValueError(f"Missing required features for {bundle.get('model_name','?')}: {not_found}")

    coerced = df[feature_order].copy().apply(pd.to_numeric, errors="coerce")
    X = coerced.fillna(0)

    # Linear/Huber bundles carry a scaler; tree bundles usually store None
    apply_scaler = scaler is not None and cols_to_scale
    if apply_scaler:
        scaled_values = scaler.transform(X[cols_to_scale])
        X[cols_to_scale] = scaled_values

    return model.predict(X)


def load_predictions_and_intervals(model_dir: str, prefix: str, df: pd.DataFrame) -> dict:
    """
    Predict with every ``*.joblib`` bundle in ``model_dir`` and attach intervals.

    Returns dict of columns -> arrays, three entries per bundle:
      - prediction_spread_{prefix}_{BaseModelName}
      - ci_lb_{prefix}_{BaseModelName}
      - ci_ub_{prefix}_{BaseModelName}

    ``BaseModelName`` is the bundle's ``model_name`` passed through
    ``_get_base_name``. When ``model_name`` is absent, None or empty, the file
    name without its extension is used instead, so unnamed bundles do not all
    collapse onto the same column names.

    The bounds are ``prediction - q`` and ``prediction + q`` with
    ``q = bundle["pi_halfwidth_q"]``. That key is optional: when it is absent,
    None or NaN, both bound columns are still created and filled with NaN.

    Raises
    ------
    FileNotFoundError
        If ``model_dir`` contains no ``*.joblib`` file.
    """
    paths = sorted(glob.glob(os.path.join(model_dir, "*.joblib")))
    if not paths:
        raise FileNotFoundError(f"No .joblib files found in: {model_dir}")

    out = {}
    for p in paths:
        # NOTE: joblib.load unpickles the file, which can execute arbitrary
        # code. Only load bundles you produced yourself.
        bundle = joblib.load(p)

        # `or` (rather than a .get default) also covers a stored None / "".
        file_stem = os.path.splitext(os.path.basename(p))[0]
        raw_name = str(bundle.get("model_name") or file_stem)
        base = _get_base_name(raw_name)

        pred_col = f"prediction_spread_{prefix}_{base}"
        lb_col   = f"ci_lb_{prefix}_{base}"
        ub_col   = f"ci_ub_{prefix}_{base}"

        preds = predict_from_bundle(bundle, df)

        q = bundle.get("pi_halfwidth_q", None)
        if q is None or (isinstance(q, float) and np.isnan(q)):
            # If PI wasn't stored, still create cols but leave NA
            lb = np.full_like(preds, np.nan, dtype=float)
            ub = np.full_like(preds, np.nan, dtype=float)
        else:
            q = float(q)
            lb = preds - q
            ub = preds + q

        out[pred_col] = preds
        out[lb_col]   = lb
        out[ub_col]   = ub

    return out


# ======================================================
# MAIN
# ======================================================
# Feature table that every bundle predicts on
df = pd.read_csv(DATA_FILE)

# Start the output from whichever identifier columns the file actually has
id_cols = [name for name in ID_COLS if name in df.columns]
if id_cols:
    out = df[id_cols].copy()
else:
    out = pd.DataFrame(index=df.index)

# 1) Per-model predictions and interval bounds, one dict per model directory
holdout_cols = load_predictions_and_intervals(HOLDOUT_DIR, "holdout", df)
random_cols  = load_predictions_and_intervals(RANDOM_DIR,  "random",  df)

out = pd.concat(
    [out, pd.DataFrame(holdout_cols), pd.DataFrame(random_cols)],
    axis=1,
)

# 2) Point-prediction columns intended for shipping

# MAIN: holdout Lasso; it must exist, otherwise stop here
MAIN_COL = "prediction_spread_holdout_Lasso"
if MAIN_COL not in out.columns:
    raise KeyError(f"Expected main prediction column not found: {MAIN_COL}")

out["pred_main_holdout_lasso"] = out[MAIN_COL]

# Safety ensemble: four holdout linear models, all of which are required
top4_cols = [
    "prediction_spread_holdout_Lasso",
    "prediction_spread_holdout_LinearRegression",
    "prediction_spread_holdout_Ridge",
    "prediction_spread_holdout_ElasticNet",
]
missing_top4 = [name for name in top4_cols if name not in out.columns]
if missing_top4:
    raise KeyError(f"Missing columns needed for avg_holdout_top4: {missing_top4}")

top4_frame = out[top4_cols]
out["avg_holdout_top4"] = top4_frame.mean(axis=1)
out["median_holdout_top4"] = top4_frame.median(axis=1)

# Alternative forecast: tuned gradient boosting from the random split.
# Unlike MAIN, a missing column degrades to NaN instead of raising.
ALT_COL = "prediction_spread_random_GradientBoosting_TUNED"
if ALT_COL in out.columns:
    out["pred_alt_random_gb"] = out[ALT_COL]
else:
    out["pred_alt_random_gb"] = np.nan

# Optional diagnostics: row-wise means over every model of each family
holdout_pred_cols_all = [name for name in out.columns if name.startswith("prediction_spread_holdout_")]
random_pred_cols_all  = [name for name in out.columns if name.startswith("prediction_spread_random_")]

for diag_name, diag_cols in (
    ("avg_holdout_all", holdout_pred_cols_all),
    ("avg_random_all", random_pred_cols_all),
    ("avg_all_models", holdout_pred_cols_all + random_pred_cols_all),
):
    out[diag_name] = out[diag_cols].mean(axis=1) if diag_cols else np.nan

# 3) Single final prediction (alternative choice: out["avg_holdout_top4"])
out["final_pred"] = out["pred_main_holdout_lasso"]

# 4) Final interval mirrors the holdout Lasso interval when both bounds exist
MAIN_LB = "ci_lb_holdout_Lasso"
MAIN_UB = "ci_ub_holdout_Lasso"
have_main_interval = MAIN_LB in out.columns and MAIN_UB in out.columns
out["final_ci_lb"] = out[MAIN_LB] if have_main_interval else np.nan
out["final_ci_ub"] = out[MAIN_UB] if have_main_interval else np.nan

# 5) Write out and summarise
out.to_csv(OUT_FILE, index=False)
print(f"✅ Saved: {OUT_FILE}")

print("\n".join([
    "Wrote these key columns:",
    " - pred_main_holdout_lasso (MAIN)",
    " - avg_holdout_top4 (recommended ensemble)",
    " - median_holdout_top4 (robust ensemble)",
    " - pred_alt_random_gb (alternative / market check)",
    " - avg_holdout_all, avg_random_all, avg_all_models (optional diagnostics)",
    " - final_pred (currently = pred_main_holdout_lasso)",
    " - final_ci_lb, final_ci_ub (currently = holdout_Lasso interval)",
]))

✅ Saved: post_2026_predictions_main.csv
Wrote these key columns:
 - pred_main_holdout_lasso (MAIN)
 - avg_holdout_top4 (recommended ensemble)
 - median_holdout_top4 (robust ensemble)
 - pred_alt_random_gb (alternative / market check)
 - avg_holdout_all, avg_random_all, avg_all_models (optional diagnostics)
 - final_pred (currently = pred_main_holdout_lasso)
 - final_ci_lb, final_ci_ub (currently = holdout_Lasso interval)


In [18]:
import os
import glob
import joblib
import numpy as np
import pandas as pd

# ======================================================
# CONFIG
# ======================================================
# Feature table loaded with pd.read_csv in the MAIN section of this cell.
DATA_FILE = "post_info_2026_diff.csv"
# Destination of out.to_csv() at the end of this cell.
OUT_FILE  = "post_2026_predictions_main_2.csv"

# Directories searched for *.joblib bundles; their columns are labelled with
# the "holdout" and "random" prefixes respectively.
HOLDOUT_DIR = "models_holdout"
RANDOM_DIR  = "models_random"

# Candidate identifier columns; MAIN keeps only those found in the data file.
ID_COLS = ["home_team", "away_team"]  # kept if present

# Blending weights (your requested logic)
# safe_mean applies them as given (wa * a + wb * b, no normalisation), so they
# should sum to 1 for the blend to stay on the scale of the inputs.
W_RANDOM_GB = 0.60
W_HOLDOUT_LASSO = 0.40

# Simple fallback halfwidth for final interval (if model-level PI not stored)
# Use 10 if you want a safer >=70% coverage baseline.
FALLBACK_HALF_WIDTH = 10.0


# ======================================================
# HELPERS
# ======================================================
def _get_base_name(raw_name: str) -> str:
    base = raw_name or ""
    low = base.lower()
    if low.startswith("holdout_"):
        base = base[len("holdout_"):]
    elif low.startswith("random_"):
        base = base[len("random_"):]
    return base


def predict_from_bundle(bundle: dict, df: pd.DataFrame) -> np.ndarray:
    """
    Apply one saved bundle to ``df`` and return the model's predictions.

    Keys read from the bundle:
      - "model":        object exposing ``predict`` (required)
      - "features":     ordered column names fed to the model (required)
      - "scaler":       object exposing ``transform``, or None (optional)
      - "numeric_cols": columns passed through the scaler (optional)
      - "model_name":   only used to label the error message (optional)
    Any further keys (e.g. "binary_cols", "pi_halfwidth_q") are not used here.

    Feature columns are converted with ``pd.to_numeric(errors="coerce")`` and
    the resulting NaNs are filled with 0; ``df`` itself is left untouched.

    Raises
    ------
    ValueError
        If ``df`` is missing one or more of the bundle's feature columns.
    """
    def _as_clean_numeric(frame: pd.DataFrame) -> pd.DataFrame:
        # Unparseable cells become NaN, then every NaN becomes 0
        return frame.apply(pd.to_numeric, errors="coerce").fillna(0)

    model = bundle["model"]
    feats = bundle["features"]
    scaler = bundle.get("scaler", None)
    numeric_cols = bundle.get("numeric_cols", [])

    gaps = [column for column in feats if column not in df.columns]
    if len(gaps) > 0:
        raise ValueError(f"Missing required features for {bundle.get('model_name','?')}: {gaps}")

    X = _as_clean_numeric(df[feats].copy())

    # Linear/Huber bundles carry a scaler; tree bundles usually store None
    if scaler is not None:
        if numeric_cols:
            X[numeric_cols] = scaler.transform(X[numeric_cols])

    return model.predict(X)


def load_predictions_and_intervals(model_dir: str, prefix: str, df: pd.DataFrame) -> dict:
    """
    Run every ``*.joblib`` bundle found in ``model_dir`` on ``df``.

    Returns dict of columns -> arrays, with three entries per bundle:
      - prediction_spread_{prefix}_{BaseModelName}
      - ci_lb_{prefix}_{BaseModelName}
      - ci_ub_{prefix}_{BaseModelName}

    ``BaseModelName`` comes from the bundle's ``model_name`` via
    ``_get_base_name``. If ``model_name`` is absent, None or empty, the file
    name without its extension is used, which keeps unnamed bundles from
    overwriting one another under an empty suffix.

    Each bundle optionally includes:
      - pi_halfwidth_q (half-width for target coverage)
    giving bounds ``prediction - q`` / ``prediction + q``. Without a usable
    value (absent, None or NaN) the two bound columns are all-NaN.

    Raises
    ------
    FileNotFoundError
        If no ``*.joblib`` file exists in ``model_dir``.
    """
    paths = sorted(glob.glob(os.path.join(model_dir, "*.joblib")))
    if not paths:
        raise FileNotFoundError(f"No .joblib files found in: {model_dir}")

    out = {}
    for p in paths:
        # NOTE: joblib.load unpickles the file, which can execute arbitrary
        # code. Only load bundles you produced yourself.
        bundle = joblib.load(p)

        # `or` (rather than a .get default) also covers a stored None / "".
        file_stem = os.path.splitext(os.path.basename(p))[0]
        raw_name = str(bundle.get("model_name") or file_stem)
        base = _get_base_name(raw_name)

        pred_col = f"prediction_spread_{prefix}_{base}"
        lb_col   = f"ci_lb_{prefix}_{base}"
        ub_col   = f"ci_ub_{prefix}_{base}"

        preds = predict_from_bundle(bundle, df)

        q = bundle.get("pi_halfwidth_q", None)
        if q is None or (isinstance(q, float) and np.isnan(q)):
            # If PI wasn't stored, still create cols but leave NA
            lb = np.full_like(preds, np.nan, dtype=float)
            ub = np.full_like(preds, np.nan, dtype=float)
        else:
            q = float(q)
            lb = preds - q
            ub = preds + q

        out[pred_col] = preds
        out[lb_col]   = lb
        out[ub_col]   = ub

    return out


def safe_mean(a: np.ndarray, b: np.ndarray, wa: float, wb: float) -> np.ndarray:
    """
    Element-wise weighted blend ``wa * a + wb * b`` that tolerates NaN.

    - both values present -> ``wa * a + wb * b`` (weights used as given)
    - exactly one present -> that value, unweighted
    - both NaN            -> NaN

    Inputs are cast to float; the result is a new float array.
    """
    left = a.astype(float)
    right = b.astype(float)

    left_ok = ~np.isnan(left)
    right_ok = ~np.isnan(right)

    # One-sided default: take `left` where it exists, otherwise `right`
    # (which is NaN exactly when both sides are missing).
    blended = np.where(left_ok, left, right)

    # Overwrite with the weighted sum only where both sides are present
    both = left_ok & right_ok
    blended[both] = wa * left[both] + wb * right[both]
    return blended


# ======================================================
# MAIN
# ======================================================
# Feature table that every bundle predicts on
df = pd.read_csv(DATA_FILE)

# Keep IDs (if present)
id_cols = [c for c in ID_COLS if c in df.columns]
out = df[id_cols].copy() if id_cols else pd.DataFrame(index=df.index)

# 1) Load all predictions + intervals (one dict of columns per model directory)
holdout_cols = load_predictions_and_intervals(HOLDOUT_DIR, "holdout", df)
random_cols  = load_predictions_and_intervals(RANDOM_DIR,  "random",  df)

out = pd.concat([out, pd.DataFrame(holdout_cols), pd.DataFrame(random_cols)], axis=1)

# 2) Point prediction columns (kept for diagnostics).
# Both inputs of the final blend are mandatory, so fail early if either is absent.
MAIN_COL = "prediction_spread_holdout_Lasso"
ALT_COL  = "prediction_spread_random_GradientBoosting_TUNED"

if MAIN_COL not in out.columns:
    raise KeyError(f"Expected holdout lasso prediction not found: {MAIN_COL}")
if ALT_COL not in out.columns:
    raise KeyError(f"Expected random GB prediction not found: {ALT_COL}")

out["pred_main_holdout_lasso"] = out[MAIN_COL]
out["pred_alt_random_gb"] = out[ALT_COL]

# Holdout Top-4 (optional): NaN columns when any of the four models is missing
top4_cols = [
    "prediction_spread_holdout_Lasso",
    "prediction_spread_holdout_LinearRegression",
    "prediction_spread_holdout_Ridge",
    "prediction_spread_holdout_ElasticNet",
]
missing_top4 = [c for c in top4_cols if c not in out.columns]
if not missing_top4:
    out["avg_holdout_top4"] = out[top4_cols].mean(axis=1)
    out["median_holdout_top4"] = out[top4_cols].median(axis=1)
else:
    out["avg_holdout_top4"] = np.nan
    out["median_holdout_top4"] = np.nan

# Optional: diagnostic averages over every model of each family
holdout_pred_cols_all = [c for c in out.columns if c.startswith("prediction_spread_holdout_")]
random_pred_cols_all  = [c for c in out.columns if c.startswith("prediction_spread_random_")]

out["avg_holdout_all"] = out[holdout_pred_cols_all].mean(axis=1) if holdout_pred_cols_all else np.nan
out["avg_random_all"]  = out[random_pred_cols_all].mean(axis=1)  if random_pred_cols_all  else np.nan
out["avg_all_models"]  = out[holdout_pred_cols_all + random_pred_cols_all].mean(axis=1) if (holdout_pred_cols_all or random_pred_cols_all) else np.nan

# 3) Choose final_pred = W_RANDOM_GB * random_GB + W_HOLDOUT_LASSO * holdout_Lasso
out["final_pred"] = safe_mean(
    out["pred_alt_random_gb"].to_numpy(),
    out["pred_main_holdout_lasso"].to_numpy(),
    W_RANDOM_GB,
    W_HOLDOUT_LASSO,
)

# 4) Final interval: blend model intervals if available; otherwise fall back
# to FALLBACK_HALF_WIDTH. The halfwidths (interval sizes) are blended, then
# the interval is centred on final_pred.

# Plain string literals: these names contain no placeholders, so no f-prefix.
lb_gb = "ci_lb_random_GradientBoosting_TUNED"
ub_gb = "ci_ub_random_GradientBoosting_TUNED"
lb_la = "ci_lb_holdout_Lasso"
ub_la = "ci_ub_holdout_Lasso"

half_gb = None
half_la = None

# Recover each model's halfwidth from its stored bounds: (ub - lb) / 2
if lb_gb in out.columns and ub_gb in out.columns:
    half_gb = (out[ub_gb] - out[lb_gb]) / 2.0
if lb_la in out.columns and ub_la in out.columns:
    half_la = (out[ub_la] - out[lb_la]) / 2.0

if half_gb is not None and half_la is not None:
    # Blend halfwidths with same weights (handles NaN safely)
    final_half = safe_mean(
        half_gb.to_numpy(),
        half_la.to_numpy(),
        W_RANDOM_GB,
        W_HOLDOUT_LASSO,
    )
elif half_gb is not None:
    final_half = half_gb.to_numpy().astype(float)
elif half_la is not None:
    final_half = half_la.to_numpy().astype(float)
else:
    final_half = np.full(len(out), FALLBACK_HALF_WIDTH, dtype=float)

# If any halfwidth is still NaN (neither model stored one for that row),
# fill with fallback
final_half = np.where(np.isnan(final_half), FALLBACK_HALF_WIDTH, final_half)

out["final_ci_lb"] = out["final_pred"] - final_half
out["final_ci_ub"] = out["final_pred"] + final_half

# 5) Write out
out.to_csv(OUT_FILE, index=False)
print(f"✅ Saved: {OUT_FILE}")

print("\nKey outputs:")
print(f" - final_pred = {W_RANDOM_GB:.2f}*random_GB + {W_HOLDOUT_LASSO:.2f}*holdout_Lasso")
print(" - final_ci_lb/final_ci_ub centered on final_pred")
print(f" - interval halfwidth uses blended model halfwidths if available, else ±{FALLBACK_HALF_WIDTH}")

✅ Saved: post_2026_predictions_main_2.csv

Key outputs:
 - final_pred = 0.60*random_GB + 0.40*holdout_Lasso
 - final_ci_lb/final_ci_ub centered on final_pred
 - interval halfwidth uses blended model halfwidths if available, else ±10.0
