In [1]:
# =========================
# CELL 1 — Imports & global config
# =========================

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GroupKFold
from sklearn.metrics import mean_absolute_error

# Models
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.experimental import enable_hist_gradient_boosting  # noqa: F401  # enables HGBR backend
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.inspection import permutation_importance

# Optional SHAP (explainability)
try:
    import shap
    SHAP_AVAILABLE = True
except Exception:
    SHAP_AVAILABLE = False

# === EDIT THIS PATH IF NEEDED ===
CSV_PATH = r"C:\Users\Vex\Desktop\football\dataset\matchday\clean_matchday_stat.csv"

# Controls / run configuration
BASELINE_B  = 1.0       # neutral baseline points per match (used in skill/luck split)
N_SPLITS    = 5         # number of GroupKFold splits
RANDOM_SEED = 42        # reproducibility
TEAM_HINTS  = ["Leverkusen", "Bayern"]  # teams to highlight in printed tables

pd.set_option("display.max_columns", 200)  # show more columns when printing DataFrames
print("Using:", CSV_PATH)




Using: C:\Users\Vex\Desktop\football\dataset\matchday\clean_matchday_stat.csv


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# =========================
# CELL 2 — Load, resolve headers, build per-team long table + match_id
# =========================

df_raw = pd.read_csv(CSV_PATH)  # load raw CSV once

# Flexible header aliases (handles multiple naming schemes across datasets)
aliases = {
    "home_team": ["home_team","HomeTeam","home","Home","Home Team","home_side"],
    "away_team": ["away_team","AwayTeam","away","Away","Away Team","away_side"],
    "home_score": ["home_score","FTHG","HomeGoals","home_final_goal","home_fulltime_goals"],
    "away_score": ["away_score","FTAG","AwayGoals","away_final_goal","away_fulltime_goals"],
    "xg_home": ["expected_goals_xg_home","home_xg","xg_home","expected_goals_home","Home_xG","home_expected_goals"],
    "xg_away": ["expected_goals_xg_host","away_xg","xg_away","expected_goals_away","Away_Xg","away_expected_goals","Away_xG"],
    "date": ["Date_day","date","match_date","Date","MatchDate","matchday_date","kickoff_date"],
    "round": ["round","matchday","gw","GameWeek","match_week"],

    # match stats (optional; attached later if present)
    "shots_home": ["Total_Shots_Home","Shots_Home","shots_home"],
    "shots_away": ["Total_Shots_Host","Shots_Host","shots_away","Total_Shots_Away"],
    "sot_home": ["Shots_On_Target_Home","SOT_Home","OnTarget_Home"],
    "sot_away": ["Shots_On_Target_Host","SOT_Host","OnTarget_Away","Shots_On_Target_Away"],
    "corners_home": ["Corner_Kicks_Home","Corners_Home"],
    "corners_away": ["Corner_Kicks_Host","Corners_Away"],
    "fouls_home": ["Fouls_Home","Foul_Home"],
    "fouls_away": ["Fouls_Host","Foul_Away"],
    "poss_home": ["Ball_Possession_Home","Possession_Home","Home_Possession"],
    "poss_away": ["Ball_Possession_Host","Possession_Host","Away_Possession"],
    "danger_home": ["Dangerous_Attacks_Home","Danger_Attacks_Home"],
    "danger_away": ["Dangerous_Attacks_Host","Danger_Attacks_Away"],
    "distance_home": ["Distance_Covered_Km_Home","Distance_Home"],
    "distance_away": ["Distance_Covered_Km_Host","Distance_Away"],
    "yc_times_home": ["home_team_yellow_card_current_time","Yellow_Cards_Times_Home"],
    "yc_times_away": ["away_team_yellow_card_current_time","Yellow_Cards_Times_Host"],
}

# Resolve the actual columns present in this file
resolved = {}
for k, opts in aliases.items():
    for c in opts:
        if c in df_raw.columns:
            resolved[k] = c
            break

# Ensure all required core fields exist
required = ["home_team","away_team","home_score","away_score","xg_home","xg_away"]
missing = [k for k in required if k not in resolved]
if missing:
    raise ValueError(f"Missing required columns: {missing}\nResolved={resolved}")

# Short-hands for resolved column names
HOME, AWAY = resolved["home_team"], resolved["away_team"]
FTHG, FTAG = resolved["home_score"], resolved["away_score"]
XGH, XGA   = resolved["xg_home"],     resolved["xg_away"]
DATE_COL   = resolved.get("date")
ROUND_COL  = resolved.get("round")

# Coerce numeric essentials with safe parsing
df = df_raw.copy()
df[FTHG] = pd.to_numeric(df[FTHG], errors="coerce").fillna(0).astype(int)
df[FTAG] = pd.to_numeric(df[FTAG], errors="coerce").fillna(0).astype(int)
df[XGH]  = pd.to_numeric(df[XGH],  errors="coerce").astype(float)
df[XGA]  = pd.to_numeric(df[XGA],  errors="coerce").astype(float)

# Chronological ordering proxy + unique match id (keeps home/away paired)
if DATE_COL and DATE_COL in df.columns:
    df["_order"] = pd.to_datetime(df[DATE_COL], errors="coerce")
elif ROUND_COL and ROUND_COL in df.columns:
    df["_order"] = pd.to_numeric(df[ROUND_COL], errors="coerce")
else:
    df["_order"] = np.arange(len(df))  # fallback if no date/round available

df["_match_id"] = np.arange(len(df))  # one id per fixture (home & away rows share it later)

# Observed points by result, from each side's perspective
home_pts = np.select([df[FTHG] > df[FTAG], df[FTHG] == df[FTAG], df[FTHG] < df[FTAG]], [3,1,0])
away_pts = np.select([df[FTHG] < df[FTAG], df[FTHG] == df[FTAG], df[FTHG] > df[FTAG]], [3,1,0])

# Build long-format rows: once per team per match (home & away views)
home_long = pd.DataFrame({
    "team":           df[HOME].values,
    "opponent":       df[AWAY].values,
    "order":          df["_order"].values,
    "venue":          "Home",
    "goals_for":      df[FTHG].values,
    "goals_against":  df[FTAG].values,
    "xg_for":         df[XGH].values,
    "xg_against":     df[XGA].values,
    "obs_pts":        home_pts,
    "match_id":       df["_match_id"].values
})
away_long = pd.DataFrame({
    "team":           df[AWAY].values,
    "opponent":       df[HOME].values,
    "order":          df["_order"].values,
    "venue":          "Away",
    "goals_for":      df[FTAG].values,
    "goals_against":  df[FTHG].values,
    "xg_for":         df[XGA].values,
    "xg_against":     df[XGH].values,
    "obs_pts":        away_pts,
    "match_id":       df["_match_id"].values
})

# Final long table with a convenient home/away indicator
long = pd.concat([home_long, away_long], ignore_index=True)
long["venue_home"] = (long["venue"]=="Home").astype(int)
long.sort_values(["team","order","match_id"], inplace=True)

print("Long shape:", long.shape)
print(long.head(6).to_string(index=False))


Long shape: (612, 11)
    team            opponent                         order venue  goals_for  goals_against  xg_for  xg_against  obs_pts  match_id  venue_home
Augsburg            Freiburg 1970-01-01 00:00:00.000000001  Away          0              2    1.58        1.09        0       251           0
Augsburg           Darmstadt 1970-01-01 00:00:00.000000002  Away          6              0    3.31        1.22        3        93           0
Augsburg              Bochum 1970-01-01 00:00:00.000000002  Home          2              2    2.63        1.49        1       281           1
Augsburg              Bochum 1970-01-01 00:00:00.000000003  Away          1              1    1.69        0.99        1       130           0
Augsburg Eintracht Frankfurt 1970-01-01 00:00:00.000000003  Home          2              1    1.75        1.09        3       189           1
Augsburg            Dortmund 1970-01-01 00:00:00.000000004  Away          1              5    2.25        4.27        0       

In [3]:
# =========================
# CELL 3 — Attach optional match stats (robust numeric/pct parsing)
# =========================

def get_cols(alias_home, alias_away):
    """Return actual column names for home/away stats if they exist in df."""
    ch = resolved.get(alias_home)
    ca = resolved.get(alias_away)
    ch = ch if (ch in df.columns) else None
    ca = ca if (ca in df.columns) else None
    return ch, ca

def parse_pct_vec(v):
    """Parse percentage-like strings to floats (drop '%' and coerce)."""
    out = []
    for s in v:
        try:
            out.append(float(str(s).strip().replace('%','')))
        except Exception:
            out.append(np.nan)
    return np.array(out, dtype=float)

def as_numeric(arr):
    """Vectorized numeric coercion with NaN for non-numeric cells."""
    return np.asarray(pd.to_numeric(pd.Series(arr), errors="coerce"), dtype=float)

def attach_for_stat(alias_home, alias_away, out_colname, coerce_numeric=False, pct=False):
    """
    Attach a per-team stat to 'long' by concatenating home values with away values
    so the sequence aligns with [home rows ... then away rows].
    """
    ch, ca = get_cols(alias_home, alias_away)
    if ch is None or ca is None:
        return  # silently skip if this stat is absent

    vals_home = df[ch].to_numpy()
    vals_away = df[ca].to_numpy()

    if pct:
        vals_home = parse_pct_vec(vals_home)
        vals_away = parse_pct_vec(vals_away)
    elif coerce_numeric:
        vals_home = as_numeric(vals_home)
        vals_away = as_numeric(vals_away)

    vals = np.concatenate([vals_home, vals_away])  # order must match long (home block then away block)
    if len(vals) != len(long):
        raise ValueError(f"Length mismatch for '{out_colname}': {len(vals)} vs {len(long)}")

    long[out_colname] = vals  # attach as a new feature

# Attach if columns exist (keeps code robust across datasets)
attach_for_stat("shots_home","shots_away",           out_colname="shots_for",        coerce_numeric=True)
attach_for_stat("sot_home","sot_away",               out_colname="sot_for",          coerce_numeric=True)
attach_for_stat("corners_home","corners_away",       out_colname="corners_for",      coerce_numeric=True)
attach_for_stat("fouls_home","fouls_away",           out_colname="fouls_for",        coerce_numeric=True)
attach_for_stat("danger_home","danger_away",         out_colname="danger_for",       coerce_numeric=True)
attach_for_stat("distance_home","distance_away",     out_colname="distance_for",     coerce_numeric=True)
attach_for_stat("poss_home","poss_away",             out_colname="poss_for_pct",     pct=True)

attached = [c for c in [
    "shots_for","sot_for","corners_for","fouls_for","danger_for","distance_for","poss_for_pct"
] if c in long.columns]
print("Attached features:", attached)


Attached features: ['corners_for', 'fouls_for', 'danger_for', 'poss_for_pct']


In [4]:
# =========================
# CELL 4 — Helpers (team picking, calibration plots)
# =========================

def pick_team_name(names, hint):
    """Pick the first team name containing the hint (case-insensitive)."""
    m = [t for t in names if isinstance(t, str) and hint.lower() in t.lower()]
    return m[0] if m else None

# Choose canonical display names from what's present in the data
teams_available = sorted(long["team"].unique())
LEV = pick_team_name(teams_available, "Leverkusen") or "Bayer Leverkusen"
BAY = pick_team_name(teams_available, "Bayern")     or "Bayern München"
print("Selected focus:", LEV, "|", BAY)

# (Optional) Calibration plot kept commented; enable when needed.
# def calibration_plot(name, exp_col, floor=0.5, ceil=2.5):
#     """
#     Scatter of team-level mean Observed Pts vs Expected Pts.
#     Uses numeric-only mins/maxes and avoids empty plots.
#     """
#     if exp_col not in long.columns:
#         print(f"[calibration_plot] Column '{exp_col}' not found.")
#         return
#     tmp = (long[["team", "obs_pts", exp_col]]
#            .rename(columns={exp_col: "exp"})
#            .copy())
#     tmp["obs"] = pd.to_numeric(tmp["obs_pts"], errors="coerce")
#     tmp["exp"] = pd.to_numeric(tmp["exp"], errors="coerce")
#     tmp = (tmp.dropna(subset=["obs", "exp"])
#                .groupby("team", as_index=False)
#                .agg(obs=("obs", "mean"), exp=("exp", "mean")))
#     if tmp.empty:
#         print(f"[calibration_plot] No valid rows to plot for '{name}'.")
#         return
#     lo = min(tmp[["obs","exp"]].min().min(), floor)
#     hi = max(tmp[["obs","exp"]].max().max(), ceil)
#     fig, ax = plt.subplots(figsize=(5, 5))
#     ax.scatter(tmp["exp"], tmp["obs"], alpha=0.7)
#     ax.plot([lo, hi], [lo, hi], "--", color="gray", linewidth=1)
#     ax.set_xlim(lo, hi); ax.set_ylim(lo, hi)
#     ax.set_aspect("equal", adjustable="box")
#     ax.set_xlabel("Expected Pts / match")
#     ax.set_ylabel("Observed Pts / match")
#     ax.set_title(f"Calibration — {name}")
#     ax.grid(True, alpha=0.3, linestyle="--", linewidth=0.7)
#     plt.tight_layout()
#     plt.show()


Selected focus: Bayer Leverkusen | Bayern Munich


In [5]:
# =========================
# CELL 5 — Model 1: GBR EPts with GroupKFold OOF
# =========================

# Columns to exclude from features
id_cols = {"team","opponent","venue","order","obs_pts","match_id"}
exclude_exact = {"goals_for","goals_against"}  # avoid direct leakage of outcome

# Candidate numeric features = all numeric not in ids/exclusions
candidate_num = [c for c in long.columns
                 if (c not in id_cols)
                 and (c not in exclude_exact)
                 and pd.api.types.is_numeric_dtype(long[c])]

# Categorical features for team/opponent/venue effects
cat_features = ["team","opponent","venue"]

# Build design matrix and target
X_all = long[candidate_num + cat_features].copy()
y_all = long["obs_pts"].astype(float)
groups = long["match_id"].values  # ties home/away rows for CV grouping

# Preprocessing pipelines (numeric imputation + OHE for cats)
num_pre = Pipeline([("imp", SimpleImputer(strategy="median"))])
try:
    ohe = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
except TypeError:
    ohe = OneHotEncoder(handle_unknown="ignore", sparse=False)
cat_pre = Pipeline([("imp", SimpleImputer(strategy="most_frequent")), ("ohe", ohe)])

pre = ColumnTransformer(
    [("num", num_pre, candidate_num),
     ("cat", cat_pre, cat_features)],
    remainder="drop",
    verbose_feature_names_out=False
)

# Model: Gradient Boosting Regressor
gbr = GradientBoostingRegressor(random_state=RANDOM_SEED)
pipe_gbr = Pipeline([("prep", pre), ("gbm", gbr)])

# Out-of-fold predictions using grouped CV (prevents match leakage)
gkf = GroupKFold(n_splits=N_SPLITS)
oof_pred = np.zeros(len(long), dtype=float)
for tr, te in gkf.split(X_all, y_all, groups=groups):
    pipe_gbr.fit(X_all.iloc[tr], y_all.iloc[tr])
    oof_pred[te] = pipe_gbr.predict(X_all.iloc[te])

# Final fit on all data (for inference/SHAP if needed)
pipe_gbr.fit(X_all, y_all)
long["exp_pts_gbr"] = oof_pred  # store OOF expected points

# Skill/Luck decomposition vs baseline (vectorized)
def skill_luck_split_vector(obs, exp, b=BASELINE_B):
    S = np.abs(exp - b)         # "skill distance" from baseline
    U = np.abs(obs - exp)       # "luck distance" between obs and expected
    den = S + U
    Skill = np.where(den==0, 0.5, S/den)
    Luck  = 1.0 - Skill
    return Skill, Luck

long["Skill%_gbr"], long["Luck%_gbr"] = skill_luck_split_vector(
    long["obs_pts"].to_numpy(float),
    long["exp_pts_gbr"].to_numpy(float)
)

# Report out-of-fold accuracy
mae_gbr = mean_absolute_error(long["obs_pts"], long["exp_pts_gbr"])
print(f"[GBR] OOF MAE (points): {mae_gbr:.3f}")

# Convenience printer for a team's per-match expected points table
def team_table_gbr(team):
    sub = long[long["team"]==team].sort_values(["order","match_id"]).reset_index(drop=True)
    return pd.DataFrame({
        "team": team,
        "opponent": sub["opponent"],
        "venue": sub["venue"],
        "obs_pts": sub["obs_pts"].round(3),
        "EPts_gbr": sub["exp_pts_gbr"].round(3),
        "Skill%": (100*sub["Skill%_gbr"]).round(1),
        "Luck%": (100*sub["Luck%_gbr"]).round(1)
    })

print("\n=== GBR EPts — Leverkusen ===")
print(team_table_gbr(LEV).to_string(index=False))
print("\n=== GBR EPts — Bayern ===")
print(team_table_gbr(BAY).to_string(index=False))

# Optional diagnostics (kept commented)
# calibration_plot("GBR", "exp_pts_gbr")
# calibration_plot("Logit", "exp_pts_logit")
# calibration_plot("HGBR", "exp_pts_hgbr")


[GBR] OOF MAE (points): 0.952

=== GBR EPts — Leverkusen ===
            team            opponent venue  obs_pts  EPts_gbr  Skill%  Luck%
Bayer Leverkusen           Darmstadt  Home        3     2.728    86.4   13.6
Bayer Leverkusen             FC Koln  Away        3     2.622    81.1   18.9
Bayer Leverkusen           Darmstadt  Away        3     2.858    92.9    7.1
Bayer Leverkusen            Dortmund  Home        1     2.473    50.0   50.0
Bayer Leverkusen          Hoffenheim  Away        3     1.300    15.0   85.0
Bayer Leverkusen Eintracht Frankfurt  Away        3     2.575    78.7   21.3
Bayer Leverkusen        Union Berlin  Away        3     3.377    86.3   13.7
Bayer Leverkusen             FC Koln  Home        3     2.396    69.8   30.2
Bayer Leverkusen           Wolfsburg  Home        3     1.844    42.2   57.8
Bayer Leverkusen       Bayern Munich  Home        3     2.815    90.8    9.2
Bayer Leverkusen           Stuttgart  Away        1     2.117    50.0   50.0
Bayer Leverkuse

In [6]:
# =========================
# CELL 6 — Multinomial Logistic EPts with GroupKFold OOF
# =========================

id_cols = {"team","opponent","venue","order","obs_pts","match_id"}
exclude_exact = {"goals_for","goals_against"}
candidate_num = [c for c in long.columns
                 if (c not in id_cols)
                 and (c not in exclude_exact)
                 and pd.api.types.is_numeric_dtype(long[c])]
cat_features = ["team","opponent","venue"]

X_all = long[candidate_num + cat_features].copy()

# Map points to classes (0=loss, 1=draw, 2=win) for multinomial logit
def pts_to_class(p):
    if p == 3: return 2  # win
    if p == 1: return 1  # draw
    return 0             # loss

y_class = long["obs_pts"].map(pts_to_class).astype(int)
groups  = long["match_id"].values

# Preprocess: impute + **scale** numeric for logit stability; OHE cats
num_pre = Pipeline([("imp", SimpleImputer(strategy="median")),
                    ("scaler", StandardScaler())])
try:
    ohe = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
except TypeError:
    ohe = OneHotEncoder(handle_unknown="ignore", sparse=False)
cat_pre = Pipeline([("imp", SimpleImputer(strategy="most_frequent")),
                    ("ohe", ohe)])

pre = ColumnTransformer(
    [("num", num_pre, candidate_num),
     ("cat", cat_pre, cat_features)],
    remainder="drop",
    verbose_feature_names_out=False
)

# Multinomial logistic regression (lbfgs) for W/D/L probabilities
logit = LogisticRegression(multi_class="multinomial", solver="lbfgs", max_iter=2000)
pipe_logit = Pipeline([("prep", pre), ("clf", logit)])

# OOF class probabilities → expected points
gkf = GroupKFold(n_splits=N_SPLITS)
proba_oof = np.zeros((len(long), 3), dtype=float)
for tr, te in gkf.split(X_all, y_class, groups=groups):
    pipe_logit.fit(X_all.iloc[tr], y_class.iloc[tr])
    proba_oof[te] = pipe_logit.predict_proba(X_all.iloc[te])

EPts_oof = 3*proba_oof[:,2] + 1*proba_oof[:,1]  # EPts = 3*P(win) + 1*P(draw)
long["exp_pts_logit"] = EPts_oof

# Skill/Luck split for logit EPts
S = np.abs(long["exp_pts_logit"] - BASELINE_B)
U = np.abs(long["obs_pts"] - long["exp_pts_logit"])
den = (S + U).replace(0, np.nan)
long["Skill%_logit"] = (S/den).fillna(0.5)
long["Luck%_logit"]  = 1.0 - long["Skill%_logit"]

# Report OOF accuracy
mae_logit = mean_absolute_error(long["obs_pts"], long["exp_pts_logit"])
print(f"[Logit] OOF MAE (points): {mae_logit:.3f}")

# Convenience printer
def team_table_logit(team):
    sub = long[long["team"]==team].sort_values(["order","match_id"]).reset_index(drop=True)
    return pd.DataFrame({
        "team": team,
        "opponent": sub["opponent"],
        "venue": sub["venue"],
        "obs_pts": sub["obs_pts"].round(3),
        "EPts_logit": sub["exp_pts_logit"].round(3),
        "Skill%": (100*sub["Skill%_logit"]).round(1),
        "Luck%": (100*sub["Luck%_logit"]).round(1)
    })

print("\n=== Multinomial Logit EPts — Leverkusen ===")
print(team_table_logit(LEV).to_string(index=False))
print("\n=== Multinomial Logit EPts — Bayern ===")
print(team_table_logit(BAY).to_string(index=False))

# calibration_plot("Logit", "exp_pts_logit")  # optional




[Logit] OOF MAE (points): 0.856

=== Multinomial Logit EPts — Leverkusen ===
            team            opponent venue  obs_pts  EPts_logit  Skill%  Luck%
Bayer Leverkusen           Darmstadt  Home        3       2.919    95.9    4.1
Bayer Leverkusen             FC Koln  Away        3       2.144    57.2   42.8
Bayer Leverkusen           Darmstadt  Away        3       2.306    65.3   34.7
Bayer Leverkusen            Dortmund  Home        1       2.554    50.0   50.0
Bayer Leverkusen          Hoffenheim  Away        3       2.394    69.7   30.3
Bayer Leverkusen Eintracht Frankfurt  Away        3       2.187    59.4   40.6
Bayer Leverkusen        Union Berlin  Away        3       2.783    89.1   10.9
Bayer Leverkusen             FC Koln  Home        3       2.716    85.8   14.2
Bayer Leverkusen           Wolfsburg  Home        3       2.971    98.5    1.5
Bayer Leverkusen       Bayern Munich  Home        3       2.378    68.9   31.1
Bayer Leverkusen           Stuttgart  Away        1   

In [7]:
# =========================
# CELL 7 — HGBR EPts with GroupKFold OOF
# =========================

id_cols = {"team","opponent","venue","order","obs_pts","match_id"}
exclude_exact = {"goals_for","goals_against"}
candidate_num = [c for c in long.columns
                 if (c not in id_cols)
                 and (c not in exclude_exact)
                 and pd.api.types.is_numeric_dtype(long[c])]
cat_features = ["team","opponent","venue"]

X_all = long[candidate_num + cat_features].copy()
y_all = long["obs_pts"].astype(float)
groups = long["match_id"].values

# Preprocessing for HGBR (impute numerics; OHE cats)
num_pre = Pipeline([("imp", SimpleImputer(strategy="median"))])
try:
    ohe = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
except TypeError:
    ohe = OneHotEncoder(handle_unknown="ignore", sparse=False)
cat_pre = Pipeline([("imp", SimpleImputer(strategy="most_frequent")), ("ohe", ohe)])

pre = ColumnTransformer(
    [("num", num_pre, candidate_num),
     ("cat", cat_pre, cat_features)],
    remainder="drop",
    verbose_feature_names_out=False
)

# Model: HistGradientBoostingRegressor (fast, robust GBM variant)
hgb = HistGradientBoostingRegressor(random_state=RANDOM_SEED)
pipe_hgbr = Pipeline([("prep", pre), ("hgb", hgb)])

# OOF predictions with grouped CV
gkf = GroupKFold(n_splits=N_SPLITS)
oof_pred = np.zeros(len(long), dtype=float)
for tr, te in gkf.split(X_all, y_all, groups=groups):
    pipe_hgbr.fit(X_all.iloc[tr], y_all.iloc[tr])
    oof_pred[te] = pipe_hgbr.predict(X_all.iloc[te])

# Final fit on all data (optional for inference/importance)
pipe_hgbr.fit(X_all, y_all)
long["exp_pts_hgbr"] = oof_pred

# Skill/Luck for HGBR EPts
S = np.abs(long["exp_pts_hgbr"] - BASELINE_B)
U = np.abs(long["obs_pts"] - long["exp_pts_hgbr"])
den = (S + U).replace(0, np.nan)
long["Skill%_hgbr"] = (S/den).fillna(0.5)
long["Luck%_hgbr"]  = 1.0 - long["Skill%_hgbr"]

# Report OOF accuracy
mae_hgbr = mean_absolute_error(long["obs_pts"], long["exp_pts_hgbr"])
print(f"[HGBR] OOF MAE (points): {mae_hgbr:.3f}")

# Convenience printer
def team_table_hgbr(team):
    sub = long[long["team"]==team].sort_values(["order","match_id"]).reset_index(drop=True)
    return pd.DataFrame({
        "team": team,
        "opponent": sub["opponent"],
        "venue": sub["venue"],
        "obs_pts": sub["obs_pts"].round(3),
        "EPts_hgbr": sub["exp_pts_hgbr"].round(3),
        "Skill%": (100*sub["Skill%_hgbr"]).round(1),
        "Luck%": (100*sub["Luck%_hgbr"]).round(1)
    })

print("\n=== HGBR EPts — Leverkusen ===")
print(team_table_hgbr(LEV).to_string(index=False))
print("\n=== HGBR EPts — Bayern ===")
print(team_table_hgbr(BAY).to_string(index=False))


[HGBR] OOF MAE (points): 0.339

=== HGBR EPts — Leverkusen ===
            team            opponent venue  obs_pts  EPts_hgbr  Skill%  Luck%
Bayer Leverkusen           Darmstadt  Home        3      3.023    98.9    1.1
Bayer Leverkusen             FC Koln  Away        3      3.066    96.9    3.1
Bayer Leverkusen           Darmstadt  Away        3      3.037    98.2    1.8
Bayer Leverkusen            Dortmund  Home        1      0.958    50.0   50.0
Bayer Leverkusen          Hoffenheim  Away        3      3.260    89.7   10.3
Bayer Leverkusen Eintracht Frankfurt  Away        3      3.071    96.7    3.3
Bayer Leverkusen        Union Berlin  Away        3      2.970    98.5    1.5
Bayer Leverkusen             FC Koln  Home        3      2.965    98.2    1.8
Bayer Leverkusen           Wolfsburg  Home        3      3.044    97.9    2.1
Bayer Leverkusen       Bayern Munich  Home        3      3.000   100.0    0.0
Bayer Leverkusen           Stuttgart  Away        1      0.892    50.0   50.0
B