In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error

# Data
td = pd.read_csv("cleaned_datasets/training_data.csv")   # must contain NAME, SEASON, FPpG_next, GP_next and current-season stats used as features
ad = pd.read_csv("cleaned_datasets/all_data.csv")        # must contain NAME, SEASON, FPpG, GP at minimum

# Build naive targets: last season's FPpG and GP
last = ad[["NAME","SEASON","FPpG","GP"]].copy()
last["SEASON"] += 1
last = last.rename(columns={"FPpG":"FPpG_last","GP":"GP_last"})

td = td.merge(last, on=["NAME","SEASON"], how="left")

# Masks for players with real next-season outcomes
m = (td["FPpG_next"].notna()) & (td["FPpG_next"] != 0)

# Naive errors
mae_naive_fppg = mean_absolute_error(td.loc[m,"FPpG_next"], td.loc[m,"FPpG_last"])
mae_naive_gp   = mean_absolute_error(td.loc[m,"GP_next"],   td.loc[m,"GP_last"])

# Model errors using your trained preds already in memory? If not, refit quickly:
features = ["AGE","GP","MpG","PpG","RpG","ApG","SpG","BpG","TOpG"]
X = td[features]
y_fppg = td["FPpG_next"]
y_gp   = td["GP_next"]

fp = LinearRegression().fit(X, y_fppg)
gp = LinearRegression().fit(X, y_gp)

pred_fppg = fp.predict(X)
pred_gp   = gp.predict(X)

mae_model_fppg = mean_absolute_error(td.loc[m,"FPpG_next"], pred_fppg[m])
mae_model_gp   = mean_absolute_error(td.loc[m,"GP_next"],   pred_gp[m])

# Percent improvement
imp_fppg = 100*(mae_naive_fppg - mae_model_fppg)/mae_naive_fppg
imp_gp   = 100*(mae_naive_gp   - mae_model_gp)/mae_naive_gp

N = int(m.sum())
print(f"Error reduction vs last-season baseline: FP/game {imp_fppg:.1f}% (N={N}), Games {imp_gp:.1f}% (N={N})")


ValueError: Input contains NaN.