In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.api as sm
from sklearn.metrics import root_mean_squared_error as rmse
from scipy.stats import poisson

In [2]:
def estimate_defensive_contribution_points(defensive_actions_per_90: float) -> float:
    """Estimate defensive contribution points per 90."""
    return 2 * poisson.sf(10, mu=defensive_actions_per_90)

In [3]:
df_translate = pd.read_csv("clean/fbref_to_fpl_api_2425.csv")

df_fpl = pd.read_csv(
    "clean/fpl_api_2425.csv",
    usecols=["code", "position", "total_points", "clean_sheets", "goals_conceded"],
)

df_fbref = pd.read_csv("clean/fbref_general.csv")
df_fbref = df_fbref.drop(columns=["league", "season", "pos"])

df_fpl_elo = pd.read_csv(
    "clean/fpl_elo.csv",
    usecols=[
        "player_code",
        "tackles_won",
        "interceptions",
        "blocks",
        "clearances",
        "tackles_won_pct",
        "recoveries",
        "dribbled_past",
        "ground_duels_won_pct",
        "aerial_duels_won_pct",
    ],
)


In [4]:
df_fbref = df_fbref.merge(
    df_translate, left_on="player", right_on="fbref_name", how="left"
)

df_all = df_fbref.merge(df_fpl, left_on="fpl_code", right_on="code", how="inner")
df_all = df_all.merge(df_fpl_elo, left_on="code", right_on="player_code", how="left")
df_all["defensive_actions"] = (
    df_all["tackles_won"]
    + df_all["interceptions"]
    + df_all["blocks"]
    + df_all["clearances"]
    + df_all["recoveries"]
)
df_all["defensive_actions_per_90"] = df_all["defensive_actions"] / (
    df_all["playing_time_min"] / 90
)

df_defense = df_all[(df_all["position"] == "DEF") | (df_all["player"] == "Myles Lewis-Skelly")]
df_defense["defensive_contribution_points"] = df_defense[
    "defensive_actions_per_90"
].apply(estimate_defensive_contribution_points) * (df_defense["playing_time_min"] / 90)

df_defense["observable_points"] = (
    (df_defense["playing_time_min"] / 90 * 2)
    + 3 * df_defense["performance_ast"]
    + 6 * df_defense["performance_gls"]
    + 4 * df_defense["clean_sheets"]
    + df_defense["defensive_contribution_points"]
    - df_defense["goals_conceded"]
)
df_defense["points_per_90"] = df_defense["total_points"] / (
    df_defense["playing_time_min"] / 90
)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_defense["defensive_contribution_points"] = df_defense[
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_defense["observable_points"] = (
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_defense["points_per_90"] = df_defense["total_points"] / (


In [5]:
df_defense_starter = df_defense[df_defense["playing_time_min"] >= 1500]

In [6]:
df_defense.columns

Index(['team', 'player', 'nation', 'age', 'born', 'playing_time_mp',
       'playing_time_starts', 'playing_time_min', 'playing_time_90s',
       'performance_gls', 'performance_ast', 'performance_g+a',
       'performance_g-pk', 'performance_pk', 'performance_pkatt',
       'performance_crdy', 'performance_crdr', 'expected_xg', 'expected_npxg',
       'expected_xag', 'expected_npxg+xag', 'progression_prgc',
       'progression_prgp', 'progression_prgr', 'per_90_minutes_gls',
       'per_90_minutes_ast', 'per_90_minutes_g+a', 'per_90_minutes_g-pk',
       'per_90_minutes_g+a-pk', 'per_90_minutes_xg', 'per_90_minutes_xag',
       'per_90_minutes_xg+xag', 'per_90_minutes_npxg',
       'per_90_minutes_npxg+xag', 'performance_recov', 'aerial_duels_won',
       'aerial_duels_lost', 'aerial_duels_won%', 'att', 'pass_types_live',
       'pass_types_dead', 'pass_types_fk', 'pass_types_tb', 'pass_types_sw',
       'pass_types_crs', 'pass_types_ti', 'pass_types_ck', 'touches_touches',
       'to

In [7]:
Y = df_defense_starter["observable_points"]
X = df_defense_starter[
    [
        "clean_sheets",
        "goals_conceded",
        "expected_npxg+xag",
        "touches_def_pen",
    ]
]
model = sm.OLS(Y, X).fit()
print(rmse(Y, model.fittedvalues))
print(model.summary())

13.423856228828754
                                 OLS Regression Results                                
Dep. Variable:      observable_points   R-squared (uncentered):                   0.976
Model:                            OLS   Adj. R-squared (uncentered):              0.975
Method:                 Least Squares   F-statistic:                              795.6
Date:                Thu, 14 Aug 2025   Prob (F-statistic):                    5.95e-63
Time:                        20:13:33   Log-Likelihood:                         -333.33
No. Observations:                  83   AIC:                                      674.7
Df Residuals:                      79   BIC:                                      684.3
Df Model:                           4                                                  
Covariance Type:            nonrobust                                                  
                        coef    std err          t      P>|t|      [0.025      0.975]
---------------

In [8]:
df_defense["predicted_points"] = model.predict(
    df_defense[
        [
            "clean_sheets",
            "goals_conceded",
            "expected_npxg+xag",
            "touches_def_pen",
        ]
    ]
)
df_defense["predicted_points_per_90"] = df_defense["predicted_points"] / (
    df_defense["playing_time_min"] / 90
)

df_defense["residual_points"] = (
    df_defense["total_points"] - df_defense["predicted_points"]
)
df_defense["residual_points_per_90"] = df_defense["residual_points"] / (
    df_defense["playing_time_min"] / 90
)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_defense["predicted_points"] = model.predict(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_defense["predicted_points_per_90"] = df_defense["predicted_points"] / (
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_defense["residual_points"] = (
A value is trying to be set on a copy of a slice

In [9]:
df_defense[df_defense["playing_time_starts"] >= 28].sort_values(by="predicted_points_per_90", ascending=False)[
    [
        "player",
        "team",
        "playing_time_starts",
        "playing_time_min",
        "points_per_90",
        "predicted_points_per_90",
        "residual_points_per_90",
    ]
].head(60)


Unnamed: 0,player,team,playing_time_starts,playing_time_min,points_per_90,predicted_points_per_90,residual_points_per_90
336,Trent Alexander-Arnold,Liverpool,28,2365,5.632135,5.398071,0.234064
337,Virgil van Dijk,Liverpool,37,3330,3.864865,4.211825,-0.34696
6,Gabriel Magalhães,Arsenal,28,2363,4.4562,4.064543,0.391657
179,Daniel Muñoz,Crystal Palace,37,3228,3.959108,3.952297,0.006811
328,Ibrahima Konaté,Liverpool,30,2560,3.726562,3.944086,-0.217524
318,Andrew Robertson,Liverpool,29,2482,3.299758,3.937187,-0.637428
198,Tyrick Mitchell,Crystal Palace,37,3092,3.580207,3.8894,-0.309193
438,Murillo,Nott'ham Forest,36,3188,3.670013,3.82829,-0.158278
213,James Tarkowski,Everton,33,2922,3.234086,3.822143,-0.588057
193,Maxence Lacroix,Crystal Palace,35,3116,3.292683,3.819216,-0.526533
