
# Imports

In [1]:
import os
os.chdir("D:\PulpitE\FPL_ML")

In [2]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import shap
import pickle

from sklearn.ensemble import GradientBoostingRegressor
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression

from sklearn.metrics import mean_absolute_error, mean_squared_error

from sklearn.preprocessing import LabelEncoder

from modeling_functions import *
from config import *

# Loading data
Data consists of all PL players logs for 2021-22 and 2022-23 seasons\
A single log is just a summary of player performance in a particular match

In [3]:
# loading csv
df_original = pd.read_csv('data/previous/FPL_logs.csv')

df = df_original.copy()

In [4]:
df_original.columns

Index(['Date', 'Day', 'Venue', 'Team', 'Opponent', 'Name', 'Start', 'Pos',
       'Min', 'Gls', 'Ast', 'PK', 'PKatt', 'Sh', 'SoT', 'CrdY', 'CrdR',
       'Touches', 'Tkl', 'Int', 'Blocks', 'xG', 'npxG', 'xA', 'SCA', 'GCA',
       'Cmp', 'Att', 'Cmp%', 'PrgP', 'Carries', 'PrgC', 'T_att', 'T_succ',
       'Season', 'WDL', 'Team_score', 'Opp_score', 'Team_CS', 'Was_home', 'GW',
       'Clean_name', 'FPL_name', 'FPL_team', 'FPL_opp_team', 'Name_original',
       'FPL_pos', 'FPL_ast', 'Bonus', 'Bps', 'Player_CS', 'Creativity',
       'Element', 'Fixture', 'Player_GC', 'Goals_scored', 'ICT_index',
       'Influence', 'Kickoff_time', 'FPL_min', 'Own_goals', 'Penalties_missed',
       'Penalties_saved', 'Saves', 'Selected', 'Threat', 'FPL_points',
       'Transfers_balance', 'Transfers_in', 'Transfers_out', 'Value', 'FPL_GW',
       'xP', 'FPL_xA', 'FPL_xGI', 'FPL_xG', 'FPL_xGC', 'Opp_rating',
       'Team_rating', 'Min_points', 'Avg_FPL_points'],
      dtype='object')

In [5]:
df_original["Name"].unique().size

484

In [6]:
df.shape

(19421, 81)

In [7]:
df["Season"].unique()

array(['2021-22', '2022-23', '2023-24'], dtype=object)

In [8]:
df = df[df["FPL_pos"].isin(["GK"])] # only goalkeepers

In [9]:
df["Season"].unique()

array(['2021-22', '2022-23', '2023-24'], dtype=object)

In [10]:
df = df[df["Season"].isin(['2021-22', '2022-23'])]
# df = df[df["FPL_pos"].isin(['GK'])]

In [11]:
df = df.sort_values(by=["Name", "Date"])

In [12]:
df.shape

(1740, 81)

# Feature selection
Model will use a set of standard features and moving averages for some other feature values. In this case we will use moving averages for last 2, 4 and 30 gameweeks.

In [13]:
rolling_gameweeks = [2, 4, 30]
# rolling_gameweeks = [5]

# to_predict = "FPL_points"
to_predict = "xP"
standard_features = [
             "Was_home", # Home/Away
             # "Team_rating", "Opp_rating", 
             "Rating_difference", # Team ratings
             "Value", # FPl price
             "Transfers_balance",
             # "Transfers_result"
             "Avg_FPL_points"
             ]
features_to_roll = [
                "Min", 
                # "Start", # time played
                # 'Gls', 
                # 'Sh', 'SoT', # Goals
                # 'Ast', # Assists
                # 'CrdY', 'CrdR', # Cards
                # "xG", 'xA', # Expected
                'Team_CS', # Defence
                "Player_GC",
                'Team_score', 'Opp_score', 'Team_result', # Team form
                "Saves",
                # "xGPoints", "CSPoints", # Position-scaled
                # "Cmp%", "PrgP", "PrgC", "T_succ",
                # 'bonus', 'bps', # Bonus
                # 'ICT_index', # ICT
                "FPL_points", 
                # "Baseline_points", "Bonus" # FPL points
            ]
info = ["Name_original", "GW", "Season", "Team", "Opponent", "Was_home", "Team_rating", "Opp_rating", "FPL_pos", "FPL_points"]

# Feature engineering

In [14]:
df["Rating_difference"] = df["Team_rating"] / df["Opp_rating"]
df["Baseline_points"] = df["FPL_points"] - df["Bonus"]
df["Transfers_result"] = df["Transfers_balance"] >= 0

In [15]:
df['Team_result'] = df.apply(calculate_team_points, axis=1)

In [16]:
df = label_encoding(df, "Start")

In [17]:
df, features = add_rolling_features(df, standard_features, features_to_roll, rolling_gameweeks)

In [18]:
df, features = ohe(df, ["FPL_pos"], features)
if "GK" in df.columns:
    df = df.drop("GK", axis=1)
    features.remove("GK")

In [19]:
# dropping unwanted columns
features = [col for col in features if not col.startswith('level')]
df = df[np.unique(features + info + [to_predict])]

In [20]:
df = df[pd.to_numeric(df["GW"], errors="coerce").notna()]
df["GW"] = df["GW"].astype("uint64")

In [21]:
# dropping NaNs
df = df.dropna(axis=0)

In [22]:
# df = df.set_index(["Name_original", "Season", "GW"], drop=False)

In [23]:
df.columns

Index(['Avg_FPL_points', 'FPL_points', 'FPL_points_2', 'FPL_points_30',
       'FPL_points_4', 'FPL_pos', 'GW', 'Min_2', 'Min_30', 'Min_4',
       'Name_original', 'Opp_rating', 'Opp_score_2', 'Opp_score_30',
       'Opp_score_4', 'Opponent', 'Player_GC_2', 'Player_GC_30', 'Player_GC_4',
       'Rating_difference', 'Saves_2', 'Saves_30', 'Saves_4', 'Season', 'Team',
       'Team_CS_2', 'Team_CS_30', 'Team_CS_4', 'Team_rating', 'Team_result_2',
       'Team_result_30', 'Team_result_4', 'Team_score_2', 'Team_score_30',
       'Team_score_4', 'Transfers_balance', 'Value', 'Was_home', 'xP'],
      dtype='object')

In [24]:
df.shape

(1740, 39)

# Data split into train and valid
Let's take entire 2021-22 season and 30 first gameweeks of 2022-23 season as training data and 8 last gameweeks of that season as valid data (~10% of all rows).

In [25]:
CUT_OFF_GAMEWEEK = 31
SEASON_TO_PREDICT = "2022-23"

In [26]:
df_train = shrink_df_to_top_players(df, 550, 10, to_predict)
df_train.shape

Unique players with min_fixtures: 37
Unique players left: 37


(1720, 39)

In [27]:
df = df.set_index(["Name_original", "Season", "GW"], drop=False)

In [28]:
# training data
X_train = df_train[((df_train["Season"] < SEASON_TO_PREDICT) | (df_train["GW"] <= CUT_OFF_GAMEWEEK - 1))][features].reset_index(drop=True)
y_train = df_train[((df_train["Season"] < SEASON_TO_PREDICT) | (df_train["GW"] <= CUT_OFF_GAMEWEEK - 1))][to_predict].reset_index(drop=True)

In [29]:
# only 31st gameweek
X_test = df[(df["GW"] == CUT_OFF_GAMEWEEK) & (df["Season"] == SEASON_TO_PREDICT) ][features].reset_index(drop=True)
y_test = df[(df["GW"] == CUT_OFF_GAMEWEEK) & (df["Season"] == SEASON_TO_PREDICT) ][to_predict].reset_index(drop=True)

In [30]:
# all remaining gameweeks
X_test_all_remaining = df[(df["GW"] >= CUT_OFF_GAMEWEEK) & (df["Season"] == SEASON_TO_PREDICT) ][features].reset_index(drop=True)
y_test_all_remaining = df[(df["GW"] >= CUT_OFF_GAMEWEEK) & (df["Season"] == SEASON_TO_PREDICT) ]["FPL_points"].reset_index(drop=True)

In [31]:
X_train.shape, X_test.shape, X_test_all_remaining.shape

((1520, 29), (24, 29), (206, 29))

# Model

In [32]:
# model = GradientBoostingRegressor(random_state=42)
model = XGBRegressor(
        random_state=42,
        n_estimators=500,
        early_stopping_rounds=5,
        learning_rate=0.2
    )
# model = LinearRegression()
# model = RandomForestRegressor(random_state=42)

In [33]:
%%time
model.fit(
        X_train, 
        y_train,
        eval_set=[(X_test_all_remaining, y_test_all_remaining)],
        verbose=False
    )

CPU times: total: 250 ms
Wall time: 272 ms


In [34]:
def new_predict_gk(X):
    XGB_COMPONENT = 0.3
    OVERALL_COMPONENT = 0.67
    FORM_COMPONENT = 0.2
    
    model_pred = np.array(model.predict2(X))
    overall_pred = np.array([row["Avg_FPL_points"] for i, row in X.iterrows()])
    form_pred = np.array([row["FPL_points_4"] for i, row in X.iterrows()])
    
    return np.add( model_pred * XGB_COMPONENT, overall_pred * OVERALL_COMPONENT, form_pred * FORM_COMPONENT )

model.predict2 = model.predict
# model.predict = model.predict2
model.predict = new_predict_gk

# Getting predictions

In [35]:
preds, predictions = get_predictions(model, df, X_test, to_predict, info, False)

In [36]:
preds_all_gameweeks, predictions_all = get_predictions(model, df, X_test_all_remaining, to_predict, info, True)

# Evaluation

In [37]:
# next gameweek
evaluate(model, df, predictions, y_test, to_predict, features, all_gw=False)

MAE: 1.2062915485774808
MSE: 4.023447803139751
Pairwise accuracy: 0.5978260869565217
Pairwise accuracy @TOP100: 0.5663272555055647
Pairwise accuracy @TOP20: 0.5222868217054264


In [38]:
# all remaining gameweeks
evaluate(model, df, predictions_all, y_test_all_remaining, to_predict, features, all_gw=True)

MAE: 1.3831586247270493
MSE: 4.630933134453064
Pairwise accuracy: 0.5663272555055647
Pairwise accuracy @TOP100: 0.5978260869565217
Pairwise accuracy @TOP20: 0.5428571428571428


In [39]:
pred_sum = preds_all_gameweeks["Pred"].sum()
fpl_sum = preds_all_gameweeks["FPL_points"].sum()
xp_sum = preds_all_gameweeks["xP"].sum()

pd.DataFrame([pred_sum, fpl_sum, xp_sum], index=["Pred", "FPL", "xP"], columns=["Sum"])

Unnamed: 0,Sum
Pred,358.309688
FPL,363.0
xP,363.0


In [40]:
features

['Was_home',
 'Rating_difference',
 'Value',
 'Transfers_balance',
 'Avg_FPL_points',
 'Min_2',
 'Team_CS_2',
 'Player_GC_2',
 'Team_score_2',
 'Opp_score_2',
 'Team_result_2',
 'Saves_2',
 'FPL_points_2',
 'Min_4',
 'Team_CS_4',
 'Player_GC_4',
 'Team_score_4',
 'Opp_score_4',
 'Team_result_4',
 'Saves_4',
 'FPL_points_4',
 'Min_30',
 'Team_CS_30',
 'Player_GC_30',
 'Team_score_30',
 'Opp_score_30',
 'Team_result_30',
 'Saves_30',
 'FPL_points_30']

In [41]:
len(features)

29

# Predictions - next gameweek only

In [42]:
preds.head(30)

Unnamed: 0,Name_original,GW,Season,Team,Opponent,Was_home,Team_rating,Opp_rating,FPL_pos,FPL_points,xP,Pred
9,Hugo Lloris,31,2022-23,Tottenham,Bournemouth,1.0,1851.721558,1665.276123,GK,2.0,2.0,3.869798
5,David Raya Martin,31,2022-23,Brentford,Wolves,0.0,1782.324097,1717.951904,GK,5.0,5.0,3.867118
18,Nick Pope,31,2022-23,Newcastle Utd,Aston Villa,0.0,1876.32251,1801.51355,GK,2.0,2.0,3.774895
11,José Malheiro de Sá,31,2022-23,Wolves,Brentford,1.0,1717.951904,1782.324097,GK,7.0,7.0,3.522179
0,Aaron Ramsdale,31,2022-23,Arsenal,West Ham,0.0,1946.8479,1751.608521,GK,1.0,1.0,3.446671
7,Emiliano Martínez Romero,31,2022-23,Aston Villa,Newcastle Utd,1.0,1801.51355,1876.32251,GK,6.0,6.0,3.401329
6,Ederson Santana de Moraes,31,2022-23,Manchester City,Leicester City,1.0,2060.531738,1722.042969,GK,2.0,2.0,3.34593
12,Lukasz Fabianski,31,2022-23,West Ham,Arsenal,1.0,1751.608521,1946.8479,GK,2.0,2.0,3.185506
10,Jordan Pickford,31,2022-23,Everton,Fulham,1.0,1687.977173,1710.906738,GK,2.0,2.0,2.991057
17,Norberto Murara Neto,31,2022-23,Bournemouth,Tottenham,0.0,1665.276123,1851.721558,GK,3.0,3.0,2.852714


In [43]:
preds[preds["Team"] == "Manchester City"].head(20)

Unnamed: 0,Name_original,GW,Season,Team,Opponent,Was_home,Team_rating,Opp_rating,FPL_pos,FPL_points,xP,Pred
6,Ederson Santana de Moraes,31,2022-23,Manchester City,Leicester City,1.0,2060.531738,1722.042969,GK,2.0,2.0,3.34593


# Predictions - all remaining gameweeks

In [44]:
preds_all_gameweeks.head(15)

Unnamed: 0,Name_original,GW,Season,Team,Opponent,Was_home,Team_rating,Opp_rating,FPL_pos,FPL_points,xP,Pred
9,Alisson Ramses Becker,33,2022-23,Liverpool,West Ham,0.0,1928.467651,1771.272461,GK,2.0,2.0,4.241102
10,Alisson Ramses Becker,34,2022-23,Liverpool,Tottenham,1.0,1933.697876,1817.128296,GK,2.0,2.0,4.18159
11,Alisson Ramses Becker,35,2022-23,Liverpool,Brentford,1.0,1940.118652,1782.052979,GK,5.0,5.0,4.18159
13,Alisson Ramses Becker,37,2022-23,Liverpool,Aston Villa,1.0,1955.095703,1812.286499,GK,2.0,2.0,4.18159
61,David Raya Martin,36,2022-23,Brentford,West Ham,1.0,1779.567871,1768.1698,GK,7.0,7.0,4.166154
8,Alisson Ramses Becker,32,2022-23,Liverpool,Nott'ham Forest,1.0,1926.870483,1623.869263,GK,2.0,2.0,4.121414
12,Alisson Ramses Becker,36,2022-23,Liverpool,Leicester City,0.0,1944.225098,1709.140503,GK,2.0,2.0,4.080839
60,David Raya Martin,35,2022-23,Brentford,Liverpool,0.0,1782.052979,1940.118652,GK,2.0,2.0,4.008748
58,David Raya Martin,33,2022-23,Brentford,Chelsea,0.0,1763.566284,1804.650879,GK,9.0,9.0,4.00539
59,David Raya Martin,34,2022-23,Brentford,Nott'ham Forest,1.0,1778.244751,1639.577637,GK,2.0,2.0,3.882068


In [45]:
preds_all_gameweeks[preds_all_gameweeks["Name_original"] == "Mohamed Salah"].sort_values(by=["GW"])

Unnamed: 0,Name_original,GW,Season,Team,Opponent,Was_home,Team_rating,Opp_rating,FPL_pos,FPL_points,xP,Pred


In [46]:
preds_all_gameweeks[preds_all_gameweeks["Name_original"] == "Erling Haaland"].sort_values(by=["GW"])

Unnamed: 0,Name_original,GW,Season,Team,Opponent,Was_home,Team_rating,Opp_rating,FPL_pos,FPL_points,xP,Pred


# Saving to file

In [47]:
preds_all_gameweeks.to_csv("predictions/all_predictions.csv", index=False)

In [48]:
pickle.dump(model, open("models/GBR_gk.pkl", 'wb'))

# Feature importance and influence

In [49]:
# explainer = shap.Explainer(model.predict, X_test_all_remaining)
# shap_values = explainer(X_test_all_remaining)

In [50]:
# shap.plots.bar(shap_values, max_display=15)

In [51]:
# shap.plots.beeswarm(shap_values, max_display=15)

In [52]:
# # explaining Erling Haaland`s score in gameweek 31
# shap.plots.bar(shap_values[69], max_display=15)

In [53]:
# # Salah vs Nottingham Forest
# shap.plots.waterfall(shap_values[1396])