In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.api as sm
from sklearn.metrics import root_mean_squared_error as rmse
from scipy.stats import poisson

In [2]:
def estimate_defensive_contribution_points(defensive_actions_per_90: float) -> float:
    """Estimate defensive contribution points per 90."""
    return 2 * poisson.sf(10, mu=defensive_actions_per_90)

In [3]:
df_translate = pd.read_csv("clean/fbref_to_fpl_api_2425.csv")

df_fpl = pd.read_csv(
    "clean/fpl_api_2425.csv",
    usecols=[
        "code",
        "position",
        "total_points",
        "clean_sheets"
    ],
)

df_fbref = pd.read_csv("clean/fbref_general.csv")
df_fbref = df_fbref.drop(columns=["league", "season", "pos"])

df_fpl_elo = pd.read_csv(
    "clean/fpl_elo.csv",
    usecols=[
        "player_code",
        "tackles_won",
        "interceptions",
        "blocks",
        "clearances",
        "tackles_won_pct",
        "recoveries"
    ],
)

In [4]:
df_fbref = df_fbref.merge(
    df_translate, left_on="player", right_on="fbref_name", how="left"
)

df_all = df_fbref.merge(df_fpl, left_on="fpl_code", right_on="code", how="inner")
df_all = df_all.merge(df_fpl_elo, left_on="code", right_on="player_code", how="left")
df_all["defensive_actions"] = (
    df_all["tackles_won"]
    + df_all["interceptions"]
    + df_all["blocks"]
    + df_all["clearances"]
    + df_all["recoveries"]
)
df_all["defensive_actions_per_90"] = df_all["defensive_actions"] / (
    df_all["playing_time_min"] / 90
)

df_midfield = df_all[
    (df_all["position"] == "MID") | (df_all["player"].isin([
        "Iliman Ndiaye",
        "Matheus Cunha",
        "Cody Gakpo",
        "Omar Marmoush",
        "Ryan Sessegnon",
    ]))
]
df_midfield["defensive_contribution_points"] = df_midfield[
    "defensive_actions_per_90"
].apply(estimate_defensive_contribution_points) * (df_midfield["playing_time_min"] / 90)

df_midfield["observable_points"] = (
    (df_midfield["playing_time_min"] / 90 * 2)
    + 3 * df_midfield["performance_ast"]
    + 5 * df_midfield["performance_gls"]
    + df_midfield["clean_sheets"]
    + df_midfield["defensive_contribution_points"]
)
df_midfield["points_per_90"] = df_midfield["total_points"] / (df_midfield["playing_time_min"] / 90)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_midfield["defensive_contribution_points"] = df_midfield[
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_midfield["observable_points"] = (
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_midfield["points_per_90"] = df_midfield["total_points"] / (df_midfield["playing_time_min"] / 90)


In [5]:
df_midfield.sort_values(by="defensive_contribution_points", ascending=False).head(20)[["player", "defensive_contribution_points"]]

Unnamed: 0,player,defensive_contribution_points
84,Christian Nørgaard,40.710315
429,Elliot Anderson,39.837063
157,Moisés Caicedo,39.365115
114,Carlos Baleba,36.047068
79,Ryan Christie,32.87127
185,Jefferson Lerma,31.878004
208,Idrissa Gana Gueye,31.294717
335,Ryan Gravenberch,29.75249
69,Lewis Cook,29.121764
543,André,27.948351


In [6]:
df_midfield.columns

Index(['team', 'player', 'nation', 'age', 'born', 'playing_time_mp',
       'playing_time_starts', 'playing_time_min', 'playing_time_90s',
       'performance_gls', 'performance_ast', 'performance_g+a',
       'performance_g-pk', 'performance_pk', 'performance_pkatt',
       'performance_crdy', 'performance_crdr', 'expected_xg', 'expected_npxg',
       'expected_xag', 'expected_npxg+xag', 'progression_prgc',
       'progression_prgp', 'progression_prgr', 'per_90_minutes_gls',
       'per_90_minutes_ast', 'per_90_minutes_g+a', 'per_90_minutes_g-pk',
       'per_90_minutes_g+a-pk', 'per_90_minutes_xg', 'per_90_minutes_xag',
       'per_90_minutes_xg+xag', 'per_90_minutes_npxg',
       'per_90_minutes_npxg+xag', 'performance_recov', 'aerial_duels_won',
       'aerial_duels_lost', 'aerial_duels_won%', 'att', 'pass_types_live',
       'pass_types_dead', 'pass_types_fk', 'pass_types_tb', 'pass_types_sw',
       'pass_types_crs', 'pass_types_ti', 'pass_types_ck', 'touches_touches',
       'to

In [7]:
df_midfield_starter = df_midfield[df_midfield["playing_time_min"] >= 1500]

In [8]:
Y = df_midfield_starter["observable_points"]
X = df_midfield_starter[[
    "expected_npxg+xag",
    "progression_prgr",
    "touches_def_pen",
    "touches_mid_3rd",
    "touches_att_pen",
]]
model = sm.OLS(Y, X).fit()
print(rmse(Y, model.fittedvalues))
print(model.summary())

14.504552229273639
                                 OLS Regression Results                                
Dep. Variable:      observable_points   R-squared (uncentered):                   0.982
Model:                            OLS   Adj. R-squared (uncentered):              0.981
Method:                 Least Squares   F-statistic:                              1225.
Date:                Thu, 14 Aug 2025   Prob (F-statistic):                    5.49e-96
Time:                        17:30:53   Log-Likelihood:                         -478.93
No. Observations:                 117   AIC:                                      967.9
Df Residuals:                     112   BIC:                                      981.7
Df Model:                           5                                                  
Covariance Type:            nonrobust                                                  
                        coef    std err          t      P>|t|      [0.025      0.975]
---------------

In [9]:
df_midfield["predicted_points"] = model.predict(df_midfield[[
    "expected_npxg+xag",
    "progression_prgr",
    "touches_def_pen",
    "touches_mid_3rd",
    "touches_att_pen",
]])
df_midfield["predicted_points_per_90"] = df_midfield["predicted_points"] / (df_midfield["playing_time_min"] / 90)

df_midfield["residual_points"] = df_midfield["total_points"] - df_midfield["predicted_points"]
df_midfield["residual_points_per_90"] = df_midfield["residual_points"] / (df_midfield["playing_time_min"] / 90)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_midfield["predicted_points"] = model.predict(df_midfield[[
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_midfield["predicted_points_per_90"] = df_midfield["predicted_points"] / (df_midfield["playing_time_min"] / 90)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_midfield["residual_points"

In [13]:
# Zubimendi, Gittens, Wirtz, Reijnders, Cherki, Palhinha
new_input = pd.DataFrame(
    {
        "expected_npxg+xag": [8],
        "progression_prgr": [55],
        "touches_def_pen": [114],
        "touches_mid_3rd": [2337],
        "touches_att_pen": [80],
    }
)

print(model.predict(new_input))

0    203.459823
dtype: float64


In [11]:
df_midfield[df_midfield["playing_time_min"] >= 1000].sort_values(by="predicted_points", ascending=False)[[
    "player",
    "playing_time_starts",
    "playing_time_min",
    "points_per_90",
    "residual_points_per_90",
]].head(60)

Unnamed: 0,player,playing_time_starts,playing_time_min,points_per_90,residual_points_per_90
334,Mohamed Salah,38,3371,9.184218,2.214439
145,Cole Palmer,36,3191,6.035725,0.659449
376,Bruno Fernandes,35,3018,5.188867,-0.028359
83,Bryan Mbeumo,38,3414,6.221441,1.770285
55,Antoine Semenyo,36,3203,4.636278,0.08546
183,Ismaila Sarr,30,2711,4.946514,-0.144443
52,Youri Tielemans,35,3026,3.59881,-0.915689
333,Luis Díaz,28,2399,6.865361,1.306092
403,Bruno Guimarães,38,3271,3.71446,-0.343736
146,Enzo Fernández,32,2947,4.122837,-0.355503


In [12]:
df_midfield[df_midfield["player"] == "Donyell Malen"][
    [
        "player",
        "playing_time_starts",
        "playing_time_min",
        "points_per_90",
        "predicted_points",
        "residual_points_per_90",
    ]
]

Unnamed: 0,player,playing_time_starts,playing_time_min,points_per_90,predicted_points,residual_points_per_90
30,Donyell Malen,2,305,10.032787,18.799437,4.485412
