In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_absolute_error
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.experimental import enable_hist_gradient_boosting  # noqa: F401
from sklearn.ensemble import HistGradientBoostingRegressor




In [2]:
# Read the two pre-cleaned CSV tables: training_data drives the per-season
# folds, all_data supplies the 2024 rows used for the final fold.
training_data, all_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        "cleaned_datasets/training_data.csv",
        "cleaned_datasets/all_data.csv",
    )
)


In [3]:
# Columns handed to every model as predictors.
features = [
    "AGE",
    "GP",
    "MpG",
    "PpG",
    "RpG",
    "ApG",
    "SpG",
    "BpG",
    "TOpG",
]

# Distinct SEASON values in ascending order; the fold loop walks over these.
seasons = list(training_data["SEASON"].unique())
seasons.sort()




In [4]:
def mae_series(y_true, y_pred):
    """Return the mean absolute error of ``y_pred`` against ``y_true``.

    Thin wrapper around ``sklearn.metrics.mean_absolute_error`` that converts
    the result to a built-in ``float``.
    """
    error = mean_absolute_error(y_true, y_pred)
    return float(error)

In [5]:
# Walk-forward evaluation: one fold per season t (the last season is skipped),
# fitting on seasons <= t and scoring on the rows labelled season t + 1.
rows = []
for t in seasons[:-1]:
    # Naive reference: a player's FP_total from season t is used as the
    # prediction for FP_total_next on their season t + 1 row.
    # NOTE(review): the season t + 1 rows carry their own FP_total as well;
    # confirm the season-t value is the intended naive predictor.
    prev = (
        training_data.loc[training_data["SEASON"] == t, ["NAME", "FP_total"]]
        .rename(columns={"FP_total": "prev_FP"})
    )
    test = (
        training_data.loc[training_data["SEASON"] == t + 1]
        .copy()
        .merge(prev, on="NAME", how="left")
    )
    # The naive MAE is computed only on players that have a season-t match,
    # whereas the model MAEs below are computed on every test row.
    m = ~test["prev_FP"].isna()
    naive_mae = mae_series(
        y_true=test.loc[m, "FP_total_next"],
        y_pred=test.loc[m, "prev_FP"],
    )

    # Fit on everything up to and including season t, score on season t + 1.
    train = training_data.loc[training_data["SEASON"] <= t]
    Xtr = train[features]
    ytr = train["FP_total_next"]
    Xte = test[features]
    yte = test["FP_total_next"]

    # Fresh estimators for every fold.
    models = dict(
        Linear=LinearRegression(),
        Ridge=Ridge(alpha=1.0, random_state=42),
        RF=RandomForestRegressor(n_estimators=300, max_depth=None, random_state=42, n_jobs=-1),
        HGB=HistGradientBoostingRegressor(max_depth=None, learning_rate=0.05, max_iter=500, random_state=42),
    )
    maes = {}
    for name in models:
        mdl = models[name].fit(Xtr, ytr)  # fit() returns the estimator itself
        maes[name] = mae_series(yte, mdl.predict(Xte))

    rows.append(dict(train_upto=t, test_year=t + 1, Naive=naive_mae, **maes))

In [6]:
# Extra 2023 -> 2024 fold: fit on every training season up to 2023 and score
# on the 2024 rows taken from all_data.
t = 2023
train = training_data[training_data.SEASON <= t]
Xtr, ytr = train[features], train["FP_total_next"]
test24 = all_data[all_data.SEASON == 2024].copy()
# Indexing test24["FP_total"] raised KeyError: 'FP_total', which left Xte and
# yte_24 unbound. Score against "FP_total_next" instead: it is the target the
# models are trained on and the column the follow-up cell selects from all_data.
# NOTE(review): confirm all_data holds FP_total_next values for the 2024 rows.
Xte, yte_24 = test24[features], test24["FP_total_next"]

In [None]:
# Naive baseline for the 2023 -> 2024 fold: a player's FP_total from season
# 2023 is used as the prediction for the target on their 2024 row.
# Start clean and align indexes.
test24 = all_data[all_data.SEASON == 2024][["NAME","FP_total_next"] + features].copy().reset_index(drop=True)

prev23 = training_data[training_data.SEASON == 2023][["NAME","FP_total"]].rename(columns={"FP_total":"prev_FP"})
test24 = test24.merge(prev23, on="NAME", how="left")

# Only players with a 2023 row can be scored by the naive rule.
m = test24["prev_FP"].notna()
# test24 holds only NAME, FP_total_next, the feature columns and prev_FP, so
# looking up "FP_total" here would raise KeyError. Compare against
# "FP_total_next", as the per-fold loop does, and go through mae_series so the
# result is a plain float like the other folds' naive scores.
naive24 = mae_series(test24.loc[m, "FP_total_next"], test24.loc[m, "prev_FP"])


In [None]:
# Fresh, unfitted estimators for the 2023 -> 2024 fold, configured the same
# way as in the per-fold loop.
models = dict(
    Linear=LinearRegression(),
    Ridge=Ridge(alpha=1.0, random_state=42),
    RF=RandomForestRegressor(n_estimators=300, max_depth=None, random_state=42, n_jobs=-1),
    HGB=HistGradientBoostingRegressor(max_depth=None, learning_rate=0.05, max_iter=500, random_state=42),
)

In [None]:
# Fit every model on the <= 2023 training split and record its MAE on the
# 2024 rows. Xtr, ytr, Xte and yte_24 must already be bound by the cell that
# sets up the 2023 -> 2024 fold.
maes24 = {}
for name in models:
    mdl = models[name].fit(Xtr, ytr)  # fit() returns the estimator itself
    maes24[name] = mae_series(yte_24, mdl.predict(Xte))

In [None]:
# Record the 2023 -> 2024 fold. Any existing (2023, 2024) row - e.g. one left
# by an earlier run of this cell - is dropped first, so re-running the cell
# cannot duplicate the fold and skew the average below.
fold24 = {
    "train_upto": 2023,
    "test_year": 2024,
    "Naive": naive24,
    **maes24
}
rows = [r for r in rows if (r["train_upto"], r["test_year"]) != (2023, 2024)]
rows.append(fold24)

# One row per fold, one column per method.
results_df = pd.DataFrame(rows)

# Summary row: mean MAE of each method across all folds.
avg_row = {"train_upto":"Avg","test_year":"—"}
for col in ["Naive","Linear","Ridge","RF","HGB"]:
    avg_row[col] = results_df[col].mean()
results_df = pd.concat([results_df, pd.DataFrame([avg_row])], ignore_index=True)

results_df.round(3)

In [None]:
# TODO: summarise what we learn from the table above (e.g. which models beat the naive baseline, and by how much).