In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_absolute_error, mean_squared_error

In [2]:
# Load the dataset used by every model in this notebook (path is relative to the working directory).
df = pd.read_parquet("final_first_model.parquet")

In [3]:
# Inspect column names, dtypes and non-null counts of the loaded table.
df.info()

<class 'pandas.DataFrame'>
RangeIndex: 2760 entries, 0 to 2759
Data columns (total 36 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   game_id             2760 non-null   str    
 1   season              2760 non-null   int32  
 2   game_type           2760 non-null   str    
 3   week                2760 non-null   int32  
 4   away_team           2760 non-null   str    
 5   away_score          2760 non-null   float64
 6   home_team           2760 non-null   str    
 7   home_score          2760 non-null   float64
 8   result              2760 non-null   float64
 9   total               2760 non-null   float64
 10  overtime            2760 non-null   float64
 11  away_rest           2760 non-null   int32  
 12  home_rest           2760 non-null   int32  
 13  div_game            2760 non-null   int32  
 14  roof                2760 non-null   str    
 15  surface             2760 non-null   str    
 16  r_away           

In [4]:
# Season-based hold-out: seasons up to and including 2023 are used for fitting,
# seasons from 2024 onward are kept aside for evaluation.
in_train = df["season"].le(2023)
in_test = df["season"].ge(2024)
train = df.loc[in_train].copy()
test = df.loc[in_test].copy()

In [5]:
# Predictors for the spread (home margin) model.
feat_spread = [
    "elo_diff",
    "off_diff",
    "def_allowed_diff",
    "to_diff",
    "rest_diff",
    "div_game",
]
# Feature matrices and "margin" targets for the train / test partitions.
Xtr_s, Xte_s = train[feat_spread], test[feat_spread]
ytr_s, yte_s = train["margin"], test["margin"]

In [6]:
# Spread model: standardize the features, then fit a ridge regression (alpha=1.0).
spread_steps = [
    ("scaler", StandardScaler()),
    ("ridge", Ridge(alpha=1.0)),
]
spread_model = Pipeline(steps=spread_steps)

In [7]:
# Fit on the training seasons and predict the held-out games in one chain
# (Pipeline.fit returns the fitted pipeline itself).
pred_s = spread_model.fit(Xtr_s, ytr_s).predict(Xte_s)

In [8]:
# Hold-out error of the spread model: MAE and RMSE (square root of the MSE).
mse_s = mean_squared_error(yte_s, pred_s)
rmse_s = mse_s ** 0.5
mae_s = mean_absolute_error(yte_s, pred_s)
print("SPREAD (margin)  MAE:", round(mae_s, 3), " RMSE:", round(rmse_s, 3))

SPREAD (margin)  MAE: 10.003  RMSE: 12.882


In [9]:
# Predictors for the total-points model: per-team offense/defense EPA and
# turnover rates, venue flags, and game context.
feat_total = [
    "off_epa_pp_home", "off_epa_pp_away",
    "def_epa_pp_home", "def_epa_pp_away",
    "turnover_rate_home", "turnover_rate_away",
    "is_dome", "is_outdoors", "is_retractable", "is_grass",
    "div_game", "rest_diff",
]

# Feature matrices and "total_points" targets for the train / test partitions.
Xtr_t, Xte_t = train[feat_total], test[feat_total]
ytr_t, yte_t = train["total_points"], test["total_points"]

# Same recipe as the spread model: scaling followed by ridge (alpha=1.0).
total_model = Pipeline(steps=[
    ("scaler", StandardScaler()),
    ("ridge", Ridge(alpha=1.0)),
])

pred_t = total_model.fit(Xtr_t, ytr_t).predict(Xte_t)

# Hold-out MAE / RMSE.
mse_t = mean_squared_error(yte_t, pred_t)
rmse_t = mse_t ** 0.5
mae_t = mean_absolute_error(yte_t, pred_t)
print("TOTAL (points)   MAE:", round(mae_t, 3), " RMSE:", round(rmse_t, 3))

TOTAL (points)   MAE: 10.375  RMSE: 13.169


In [10]:
from sklearn.model_selection import GridSearchCV

# Tune the ridge penalty of the spread model: 5-fold cross-validation on the
# training data, selecting the alpha with the best (negated) mean absolute error.
alpha_grid = [0.1, 0.3, 1, 3, 10, 30, 100]
pipe = Pipeline(steps=[
    ("scaler", StandardScaler()),
    ("ridge", Ridge()),
])
param_grid = {"ridge__alpha": alpha_grid}

gs = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    scoring="neg_mean_absolute_error",
    cv=5,
)
gs.fit(Xtr_s, ytr_s)

print("Best alpha:", gs.best_params_)

# Score the refitted best pipeline on the held-out seasons.
best = gs.best_estimator_
pred = best.predict(Xte_s)
tuned_mae = mean_absolute_error(yte_s, pred)
tuned_rmse = mean_squared_error(yte_s, pred) ** 0.5
print("Tuned Ridge  MAE:", tuned_mae, " RMSE:", tuned_rmse)

Best alpha: {'ridge__alpha': 100}
Tuned Ridge  MAE: 9.999684798961008  RMSE: 12.885383666423568


In [11]:
# Reference model: predict the margin from the Elo difference alone.
feat_elo = ["elo_diff"]
Xtr_e, Xte_e = train[feat_elo], test[feat_elo]
ytr_e, yte_e = train["margin"], test["margin"]

elo_model = Pipeline(steps=[
    ("scaler", StandardScaler()),
    ("ridge", Ridge(alpha=1.0)),
])

pred_e = elo_model.fit(Xtr_e, ytr_e).predict(Xte_e)

# Hold-out MAE / RMSE for the Elo-only model.
mse_e = mean_squared_error(yte_e, pred_e)
rmse_e = mse_e ** 0.5
mae_e = mean_absolute_error(yte_e, pred_e)
print("Elo-only  MAE:", round(mae_e, 3), " RMSE:", round(rmse_e, 3))


Elo-only  MAE: 10.134  RMSE: 13.034


In [12]:
# Coefficients of the tuned ridge (the grid search's refitted best pipeline),
# labelled by feature and sorted ascending. Features were standardized first,
# so the coefficients are on a per-standard-deviation scale.
best = gs.best_estimator_
ridge_step = best.named_steps["ridge"]
coefs = ridge_step.coef_
pd.Series(data=coefs, index=feat_spread).sort_values()

div_game           -0.233353
to_diff             0.197927
rest_diff           0.334000
def_allowed_diff    0.714115
off_diff            1.575992
elo_diff            3.669582
dtype: float64

In [13]:
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Gradient-boosted alternative for the spread, on the same predictors as the ridge.
feat_spread = [
    "elo_diff",
    "off_diff",
    "def_allowed_diff",
    "to_diff",
    "rest_diff",
    "div_game",
]

Xtr, Xte = train[feat_spread], test[feat_spread]
ytr, yte = train["margin"], test["margin"]

# Shallow trees, small learning rate and large leaves; fixed seed for reproducibility.
spread_hgb_params = {
    "loss": "squared_error",
    "max_depth": 3,
    "learning_rate": 0.05,
    "max_iter": 600,
    "min_samples_leaf": 40,
    "l2_regularization": 0.0,
    "random_state": 42,
}
spread_hgb = HistGradientBoostingRegressor(**spread_hgb_params)

spread_hgb.fit(Xtr, ytr)
pred = spread_hgb.predict(Xte)

# Hold-out MAE / RMSE.
mae = mean_absolute_error(yte, pred)
mse = mean_squared_error(yte, pred)
rmse = mse ** 0.5
print("SPREAD HGB  MAE:", round(mae, 3), " RMSE:", round(rmse, 3))

SPREAD HGB  MAE: 10.188  RMSE: 13.058


In [14]:
# Gradient-boosted alternative for total points, on the same predictors as the ridge.
feat_total = [
    "off_epa_pp_home", "off_epa_pp_away",
    "def_epa_pp_home", "def_epa_pp_away",
    "turnover_rate_home", "turnover_rate_away",
    "is_dome", "is_outdoors", "is_retractable", "is_grass",
    "div_game", "rest_diff",
]

Xtr, Xte = train[feat_total], test[feat_total]
ytr, yte = train["total_points"], test["total_points"]

# Identical hyper-parameters to the spread HGB.
total_hgb_params = {
    "loss": "squared_error",
    "max_depth": 3,
    "learning_rate": 0.05,
    "max_iter": 600,
    "min_samples_leaf": 40,
    "l2_regularization": 0.0,
    "random_state": 42,
}
total_hgb = HistGradientBoostingRegressor(**total_hgb_params)

total_hgb.fit(Xtr, ytr)
pred = total_hgb.predict(Xte)

# Hold-out MAE / RMSE.
mae = mean_absolute_error(yte, pred)
mse = mean_squared_error(yte, pred)
rmse = mse ** 0.5
print("TOTAL HGB   MAE:", round(mae, 3), " RMSE:", round(rmse, 3))

TOTAL HGB   MAE: 10.881  RMSE: 13.769


In [15]:
# Peek at the first five predictions; `pred` holds whatever the most recently
# executed cell assigned to it.
pred[0:5]

array([44.25570131, 41.51230843, 40.84931276, 52.73774075, 41.08578402])

In [16]:
import numpy as np
from sklearn.metrics import mean_absolute_error, mean_squared_error

n_test = len(test)

# Spread baseline: always predict a margin of 0.
pred0 = np.zeros(n_test)
spread_base_mae = mean_absolute_error(test["margin"], pred0)
spread_base_rmse = mean_squared_error(test["margin"], pred0) ** 0.5
print("SPREAD baseline(0) MAE:", spread_base_mae,
      "RMSE:", spread_base_rmse)

# Total baseline: always predict the mean total of the training seasons.
predm = np.full(n_test, train["total_points"].mean())
total_base_mae = mean_absolute_error(test["total_points"], predm)
total_base_rmse = mean_squared_error(test["total_points"], predm) ** 0.5
print("TOTAL baseline(mean) MAE:", total_base_mae,
      "RMSE:", total_base_rmse)


SPREAD baseline(0) MAE: 11.186291739894552 RMSE: 14.447225334277944
TOTAL baseline(mean) MAE: 10.5726157254594 RMSE: 13.523975586177393


In [17]:
# --- derived features (level / intensity) ---
# `assign` returns a new frame with the columns appended, so the previous
# object bound to `df` is left untouched.
df = df.assign(
    off_sum=df["off_epa_pp_home"] + df["off_epa_pp_away"],
    # allowed: higher = worse defense
    def_sum_allowed=df["def_epa_pp_home"] + df["def_epa_pp_away"],
    to_sum=df["turnover_rate_home"] + df["turnover_rate_away"],
    abs_elo_diff=df["elo_diff"].abs(),
)

# Optional: totals tend to dip a little when a blowout is expected (garbage time);
# abs_elo_diff captures part of that.

In [18]:
# Redo the season-based split so both partitions carry the columns added to `df`
# since the first split.
in_train = df["season"].le(2023)
in_test = df["season"].ge(2024)
train = df.loc[in_train].copy()
test = df.loc[in_test].copy()

In [19]:
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_absolute_error, mean_squared_error

level_feats = [
    "off_epa_pp_home", "off_epa_pp_away",
    "def_epa_pp_home", "def_epa_pp_away",
    "turnover_rate_home", "turnover_rate_away",
]
derived_feats = ["off_sum", "def_sum_allowed", "to_sum", "abs_elo_diff"]
context_feats = [
    "week", "div_game", "rest_diff",
    "is_dome", "is_outdoors", "is_retractable", "is_grass",
]
# v2 feature set for total points: levels, then derived sums, then context.
feat_total_v2 = level_feats + derived_feats + context_feats

Xtr, Xte = train[feat_total_v2], test[feat_total_v2]
ytr, yte = train["total_points"], test["total_points"]

total_ridge_v2 = Pipeline(steps=[
    ("scaler", StandardScaler()),
    # slightly larger alpha because of collinearity (sums vs. their components)
    ("ridge", Ridge(alpha=5.0)),
])

pred = total_ridge_v2.fit(Xtr, ytr).predict(Xte)

mae = mean_absolute_error(yte, pred)
mse = mean_squared_error(yte, pred)
rmse = mse ** 0.5
print("TOTAL Ridge v2  MAE:", round(mae, 3), " RMSE:", round(rmse, 3))

# Mean baseline, repeated here so both numbers can be compared in the same cell.
train_mean_total = train["total_points"].mean()
predm = np.full(len(test), train_mean_total)
mae_b = mean_absolute_error(yte, predm)
rmse_b = mean_squared_error(yte, predm) ** 0.5
print("TOTAL baseline  MAE:", round(mae_b, 3), " RMSE:", round(rmse_b, 3))

TOTAL Ridge v2  MAE: 10.369  RMSE: 13.16
TOTAL baseline  MAE: 10.573  RMSE: 13.524


In [20]:
from sklearn.ensemble import HistGradientBoostingRegressor

# Gradient-boosted model for total points on the v2 feature set.
#
# Build the design matrices here instead of reusing whatever Xtr/ytr/Xte/yte
# happen to hold: several other cells rebind those generic names (to spread
# features, and to home_win targets), so re-running this cell out of order
# would otherwise silently fit on the wrong data.
Xtr, ytr = train[feat_total_v2], train["total_points"]
Xte, yte = test[feat_total_v2],  test["total_points"]

# `loss` is not passed, so the estimator's default loss is used.
total_hgb_v2 = HistGradientBoostingRegressor(
    max_depth=4,
    learning_rate=0.05,
    max_iter=1200,
    min_samples_leaf=20,
    l2_regularization=0.1,
    random_state=42
)

total_hgb_v2.fit(Xtr, ytr)
pred = total_hgb_v2.predict(Xte)

# Hold-out MAE / RMSE.
mae = mean_absolute_error(yte, pred)
rmse = mean_squared_error(yte, pred) ** 0.5
print("TOTAL HGB v2    MAE:", round(mae,3), " RMSE:", round(rmse,3))

TOTAL HGB v2    MAE: 11.815  RMSE: 14.815


In [21]:
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, log_loss, accuracy_score

# --- temporal split ---
in_train = df["season"].le(2023)
in_test = df["season"].ge(2024)
train = df.loc[in_train].copy()
test = df.loc[in_test].copy()

# Integer-cast "home_win" targets for the classification models.
ytr, yte = (part["home_win"].astype(int) for part in (train, test))

def fit_eval_logit(Xtr, Xte, ytr, yte, name):
    """Fit a standardized logistic regression and print its hold-out metrics.

    Parameters
    ----------
    Xtr, Xte
        Feature tables used for fitting and for evaluation, respectively.
    ytr, yte
        Binary targets aligned with ``Xtr`` and ``Xte``.
    name : str
        Label printed in front of the metrics line.

    Returns
    -------
    tuple
        ``(model, p)``: the fitted pipeline and the second column of
        ``predict_proba`` on ``Xte`` (the probability of class 1 when the
        labels are 0/1).
    """
    model = Pipeline([
        ("scaler", StandardScaler()),
        # L2 is LogisticRegression's default penalty, so it is left implicit.
        # NOTE(review): recent scikit-learn releases deprecate passing `penalty`
        # explicitly -- confirm against the installed version.
        ("logit", LogisticRegression(
            C=1.0,
            solver="lbfgs",
            max_iter=2000
        ))
    ])
    model.fit(Xtr, ytr)
    p = model.predict_proba(Xte)[:, 1]
    # Hard picks at the 0.5 probability threshold.
    pred = (p >= 0.5).astype(int)

    auc = roc_auc_score(yte, p)
    ll = log_loss(yte, p)
    acc = accuracy_score(yte, pred)

    print(f"{name} | AUC={auc:.4f}  LogLoss={ll:.4f}  Acc={acc:.4f}")
    return model, p

# Nested feature sets: A = Elo only, B = A + EPA / turnover / context diffs,
# C = B + roof and surface flags (optional, basic).
feat_A = ["elo_diff"]
feat_B = feat_A + ["off_diff", "def_allowed_diff", "to_diff", "rest_diff", "div_game"]
feat_C = feat_B + ["is_dome", "is_outdoors", "is_retractable", "is_grass"]

# --- Model A: Elo only ---
model_A, pA = fit_eval_logit(
    train[feat_A], test[feat_A], ytr, yte, "Logit A (elo_diff)"
)

# --- Model B: Elo + EPA + TO + context ---
model_B, pB = fit_eval_logit(
    train[feat_B], test[feat_B], ytr, yte, "Logit B (+EPA/+TO)"
)

# --- Model C: B plus roof / surface flags ---
model_C, pC = fit_eval_logit(
    train[feat_C], test[feat_C], ytr, yte, "Logit C (+roof/surface flags)"
)

Logit A (elo_diff) | AUC=0.7113  LogLoss=0.6240  Acc=0.6555
Logit B (+EPA/+TO) | AUC=0.7125  LogLoss=0.6202  Acc=0.6643
Logit C (+roof/surface flags) | AUC=0.7074  LogLoss=0.6223  Acc=0.6661




In [22]:
# Coin-flip reference: a constant 0.5 home-win probability for every test game.
# The AUC shown in the message is a hard-coded constant, not a computed value.
p0 = np.full(shape=len(test), fill_value=0.5)
coinflip_ll = log_loss(yte, p0)
print("Baseline coinflip | LogLoss=", round(coinflip_ll, 4), " AUC=0.5")

Baseline coinflip | LogLoss= 0.6931  AUC=0.5


In [23]:
# Base-rate reference: predict the training-set mean of the target for every test game.
train_home_rate = ytr.mean()
p_base = np.full(shape=len(test), fill_value=train_home_rate)
base_rate_ll = log_loss(yte, p_base)
print("Baseline base-rate | LogLoss=", round(base_rate_ll, 4))


Baseline base-rate | LogLoss= 0.6899


In [24]:

# Coefficients of logit model B (fit on standardized features), labelled by
# feature and sorted ascending.
feat_B = [
    "elo_diff",
    "off_diff",
    "def_allowed_diff",
    "to_diff",
    "rest_diff",
    "div_game",
]
logit = model_B.named_steps["logit"]
weights = logit.coef_[0]  # single row of coefficients for the binary problem
coef = pd.Series(data=weights, index=feat_B).sort_values()
print(coef)

div_game           -0.015175
def_allowed_diff    0.023522
to_diff             0.023970
rest_diff           0.089460
off_diff            0.223438
elo_diff            0.544502
dtype: float64


In [29]:
# Load the schedule-level feature table (per its file name, covering 2016-2025)
# from which the game to predict is taken.
df_pred = pd.read_parquet('schedule_features_2016_2025.parquet')

In [30]:
# Keep only the last row of the table: the single game that is predicted below.
df_pred = df_pred.tail(1)

In [33]:
# Check what the remaining row contains (column dtypes and non-null counts).
df_pred.info()

<class 'pandas.DataFrame'>
RangeIndex: 1 entries, 2760 to 2760
Data columns (total 24 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   game_id             1 non-null      str    
 1   season              1 non-null      int32  
 2   game_type           1 non-null      str    
 3   week                1 non-null      int32  
 4   away_team           1 non-null      str    
 5   away_score          0 non-null      float64
 6   home_team           1 non-null      str    
 7   home_score          0 non-null      float64
 8   result              0 non-null      float64
 9   total               0 non-null      float64
 10  overtime            0 non-null      float64
 11  away_rest           1 non-null      int32  
 12  home_rest           1 non-null      int32  
 13  div_game            1 non-null      int32  
 14  roof                1 non-null      str    
 15  surface             1 non-null      str    
 16  r_away           

In [40]:
# --- 1) prepare df_pred ---
# Work on a copy so df_pred itself is not modified.
X = df_pred.copy()

# Placeholder target columns; none of the models below reads them.
X["margin"] = np.nan
X["total_points"] = np.nan

# Differences, oriented so that a larger value favours the home team.
# NOTE(review): these are meant to reproduce the formulas used to build the
# training table, which is not constructed in this notebook -- confirm they match.
X["elo_diff"] = X["r_home"] - X["r_away"]
X["off_diff"] = X["off_epa_pp_home"] - X["off_epa_pp_away"]
X["def_allowed_diff"] = X["def_epa_pp_away"] - X["def_epa_pp_home"]
X["to_diff"] = X["turnover_rate_away"] - X["turnover_rate_home"]
X["rest_diff"] = X["home_rest"] - X["away_rest"]

# Venue flags (only needed by the total-points model).
# Normalize the raw labels first so the comparisons are case/whitespace-insensitive.
X["roof"] = X["roof"].astype(str).str.strip().str.lower()
X["surface"] = X["surface"].astype(str).str.strip().str.lower()

X["is_grass"] = (X["surface"] == "grass").astype(int)
X["is_dome"] = (X["roof"] == "dome").astype(int)
X["is_outdoors"] = (X["roof"] == "outdoors").astype(int)
X["is_retractable"] = X["roof"].isin(["open","closed"]).astype(int)

# Derived features used by the total v2 model.
X["off_sum"] = X["off_epa_pp_home"] + X["off_epa_pp_away"]
X["def_sum_allowed"] = X["def_epa_pp_home"] + X["def_epa_pp_away"]
X["to_sum"] = X["turnover_rate_home"] + X["turnover_rate_away"]
X["abs_elo_diff"] = X["elo_diff"].abs()

# The feature lists below are restated so this cell documents exactly what each
# fitted model receives; they must match the columns the models were trained on.

# --- 2) predict SPREAD (margin) ---
feat_spread = ["elo_diff","off_diff","def_allowed_diff","to_diff","rest_diff","div_game"]
pred_margin = spread_model.predict(X[feat_spread])[0]

# --- 3) predict TOTAL (total_points) ---
feat_total_v2 = [
    "off_epa_pp_home","off_epa_pp_away",
    "def_epa_pp_home","def_epa_pp_away",
    "turnover_rate_home","turnover_rate_away",
    "off_sum","def_sum_allowed","to_sum","abs_elo_diff",
    "week","div_game","rest_diff",
    "is_dome","is_outdoors","is_retractable","is_grass",
]
pred_total = total_ridge_v2.predict(X[feat_total_v2])[0]

# --- 4) predict WIN (home-team probability) ---
feat_win = ["elo_diff","off_diff","def_allowed_diff","to_diff","rest_diff","div_game"]
p_home_win = model_B.predict_proba(X[feat_win])[:, 1][0]
pred_home_win = int(p_home_win >= 0.5)

# --- 5) pretty-print ---
# All values above come from the first row of X; take the team labels from the same row.
home = X["home_team"].iloc[0]
away = X["away_team"].iloc[0]

print(f"{away} @ {home}")
print(f"Pred home win prob: {p_home_win:.3f}  (pick: {'HOME' if pred_home_win else 'AWAY'})")
# The suggested line is the negated predicted home margin. It is printed with an
# explicit sign ("+0.94" / "-3.50") so an underdog line cannot be mistaken for a
# favourite line.
print(f"Pred spread (home margin): {pred_margin:.2f}  -> line sugerida: {home} {(-pred_margin):+.2f}")
print(f"Pred total points: {pred_total:.2f}")


SEA @ NE
Pred home win prob: 0.435  (pick: AWAY)
Pred spread (home margin): -0.94  -> line sugerida: NE 0.94
Pred total points: 43.64
