In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.api as sm
from sklearn.metrics import root_mean_squared_error as rmse

In [2]:
# Only these FPL columns are needed downstream: the join key plus the two
# scoring fields.
fpl_columns = ["code", "total_points", "clean_sheets"]

# Lookup table linking FBref player names to FPL player codes.
df_translate = pd.read_csv("clean/fbref_to_fpl_api_2425.csv")
# FPL API export, restricted to the columns listed above.
df_fpl = pd.read_csv("clean/fpl_api_2425.csv", usecols=fpl_columns)
# FBref goalkeeper statistics.
df_fbref = pd.read_csv("clean/fbref_keeper.csv")

In [3]:
# Prune FBref columns that the rest of the notebook does not use.
#
# The whole selection is computed from the columns that are actually present
# and removed in a single `drop`, so the cell can be re-run safely: a second
# run finds nothing left to drop instead of raising KeyError on the already
# removed "league"/"season" columns.
DROP_EXACT = {"league", "season"}
# `str.startswith` accepts a tuple and returns True if any prefix matches.
DROP_PREFIXES = ("passes", "crosses", "goal_kicks", "launched", "sweeper")

cols_to_drop = [
    col
    for col in df_fbref.columns
    if col in DROP_EXACT or col.startswith(DROP_PREFIXES)
]
df_fbref = df_fbref.drop(columns=cols_to_drop)

In [4]:
# Step 1: attach the FPL code to every FBref keeper via the name lookup table.
# Inner join, so keepers without a translation entry are dropped.
df_fbref = pd.merge(
    df_fbref,
    df_translate,
    how="inner",
    left_on="player",
    right_on="fbref_name",
)

# Step 2: bring in the FPL scoring columns using that code.
# Inner join again, so only keepers present in both sources remain.
df_all = pd.merge(
    df_fbref,
    df_fpl,
    how="inner",
    left_on="fpl_code",
    right_on="code",
)

In [5]:
# Inspect which columns survived the pruning and the two merges.
df_all.columns

Index(['team', 'player', 'nation', 'pos', 'age', 'born', 'playing_time_mp',
       'playing_time_starts', 'playing_time_min', 'playing_time_90s',
       'performance_ga', 'performance_ga90', 'performance_sota',
       'performance_saves', 'performance_save%', 'performance_w',
       'performance_d', 'performance_l', 'performance_cs', 'performance_cs%',
       'penalty_kicks_pkatt', 'penalty_kicks_pka', 'penalty_kicks_pksv',
       'penalty_kicks_pkm', 'penalty_kicks_save%', 'goals_fk', 'goals_ck',
       'goals_og', 'expected_psxg', 'expected_psxg/sot', 'expected_psxg+/-',
       'expected_/90', 'fbref_name', 'fpl_code', 'clean_sheets', 'code',
       'total_points'],
      dtype='object')

In [6]:
# Predictors for the points model; rows missing any of them are excluded.
feature_cols = ["performance_saves", "performance_save%", "expected_/90"]
X = df_all.loc[:, feature_cols].dropna()
# Keep the target aligned with the rows that survived the dropna above.
Y = df_all.loc[X.index, "total_points"]

# NOTE(review): no constant column is added to X, so this is a regression
# through the origin and the summary reports *uncentered* R-squared.
# Confirm that omitting the intercept is intentional.
results = sm.OLS(endog=Y, exog=X).fit()

# In-sample RMSE (computed on the same rows the model was fitted on).
print(rmse(Y, results.fittedvalues))
results.summary()

19.45829176783204


0,1,2,3
Dep. Variable:,total_points,R-squared (uncentered):,0.938
Model:,OLS,Adj. R-squared (uncentered):,0.933
Method:,Least Squares,F-statistic:,200.8
Date:,"Thu, 14 Aug 2025",Prob (F-statistic):,3.8199999999999995e-24
Time:,15:17:36,Log-Likelihood:,-188.65
No. Observations:,43,AIC:,383.3
Df Residuals:,40,BIC:,388.6
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
performance_saves,1.1335,0.077,14.689,0.000,0.978,1.289
performance_save%,-0.0065,0.076,-0.086,0.932,-0.160,0.147
expected_/90,3.5944,7.368,0.488,0.628,-11.298,18.486

0,1,2,3
Omnibus:,2.096,Durbin-Watson:,1.635
Prob(Omnibus):,0.351,Jarque-Bera (JB):,1.145
Skew:,0.19,Prob(JB):,0.564
Kurtosis:,3.703,Cond. No.,217.0


In [7]:
# One hand-entered stat line to score with the fitted model. The keys mirror
# the columns of X, in the same order.
stat_line = {
    "performance_saves": 116,
    "performance_save%": 74.4,
    "expected_/90": 0.12,
}
new_input = pd.DataFrame([stat_line])

print(results.predict(new_input))

0    131.426464
dtype: float64


In [8]:
# Model-implied points. Assignment aligns on the index, so any row that was
# dropped before fitting ends up as NaN here.
df_all["expected_points"] = results.fittedvalues

# Actual minus expected: positive means the keeper scored more than the model implies.
points_above_model = df_all["total_points"].sub(df_all["expected_points"])
df_all["residual_points"] = points_above_model

# Normalise by playing time: residual per minute, scaled to a 90-minute match.
df_all["residual_points_per_90"] = points_above_model.div(df_all["playing_time_min"]).mul(90)

In [9]:
# Rank keepers by per-90 over-performance against the model, best first.
# reset_index() keeps the original row label as an "index" column.
# NOTE(review): the per-90 value divides by minutes played, so keepers with
# very few minutes can dominate the top of this table.
ranking_cols = ["player", "total_points", "expected_points", "residual_points_per_90"]
(
    df_all
    .sort_values(by="residual_points_per_90", ascending=False)
    .loc[:, ranking_cols]
    .reset_index()
)

Unnamed: 0,index,player,total_points,expected_points,residual_points_per_90
0,18,Danny Ward,2,-3.311781,3.541187
1,9,Jason Steele,9,1.975503,3.512248
2,24,Ederson,111,60.303975,1.966656
3,22,Caoimhín Kelleher,45,26.617341,1.838266
4,0,David Raya,142,97.100425,1.181568
5,28,Martin Dúbravka,44,32.73165,1.126835
6,21,Alisson,112,82.739243,1.050027
7,11,Robert Sánchez,126,103.9587,0.688791
8,13,Jordan Pickford,158,132.605022,0.668289
9,27,André Onana,120,99.402,0.605824


In [10]:
# Playing time next to total points, for judging how much data backs each row.
df_all.loc[:, ["player", "playing_time_starts", "playing_time_min", "total_points"]]

Unnamed: 0,player,playing_time_starts,playing_time_min,total_points
0,David Raya,38,3420,142
1,Emiliano Martínez,37,3194,111
2,Robin Olsen,1,226,9
3,Kepa Arrizabalaga,31,2790,106
4,Mark Travers,5,450,19
5,Neto,2,180,7
6,Hákon Rafn Valdimarsson,1,145,2
7,Mark Flekken,37,3275,138
8,Bart Verbruggen,36,3240,103
9,Jason Steele,2,180,9
