
# Imports

In [1]:
import os
os.chdir("D:\PulpitE\FPL_ML")

In [2]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import shap
import pickle

from sklearn.preprocessing import LabelEncoder

from modeling_functions import *
from config import *

# Loading data
Data consists of all PL players logs for 2021-22 and 2022-23 seasons\
A single log is just a summary of player performance in a particular match

In [3]:
# loading csv
df_original = pd.read_csv('data/previous/FPL_logs.csv')

df = df_original.copy()

In [4]:
df_original.columns

Index(['Date', 'Day', 'Venue', 'Team', 'Opponent', 'Name', 'Start', 'Pos',
       'Min', 'Gls', 'Ast', 'PK', 'PKatt', 'Sh', 'SoT', 'CrdY', 'CrdR',
       'Touches', 'Tkl', 'Int', 'Blocks', 'xG', 'npxG', 'xA', 'SCA', 'GCA',
       'Cmp', 'Att', 'Cmp%', 'PrgP', 'Carries', 'PrgC', 'T_att', 'T_succ',
       'Season', 'WDL', 'Team_score', 'Opp_score', 'Team_CS', 'Was_home', 'GW',
       'Clean_name', 'FPL_name', 'FPL_team', 'FPL_opp_team', 'Name_original',
       'FPL_pos', 'FPL_ast', 'Bonus', 'Bps', 'Player_CS', 'Creativity',
       'Element', 'Fixture', 'Player_GC', 'Goals_scored', 'ICT_index',
       'Influence', 'Kickoff_time', 'FPL_min', 'Own_goals', 'Penalties_missed',
       'Penalties_saved', 'Saves', 'Selected', 'Threat', 'FPL_points',
       'Transfers_balance', 'Transfers_in', 'Transfers_out', 'Value', 'FPL_GW',
       'xP', 'FPL_xA', 'FPL_xGI', 'FPL_xG', 'FPL_xGC', 'Opp_rating',
       'Team_rating', 'Min_points', 'Avg_FPL_points'],
      dtype='object')

In [5]:
df.shape

(19421, 81)

In [6]:
df_original["Name"].unique().size

484

In [7]:
df["Season"].unique()

array(['2021-22', '2022-23', '2023-24'], dtype=object)

In [8]:
df = df.sort_values(by=["Name", "Date"])

# Feature selection
Model will use a set of standard features and moving averages for some other feature values. In this case we will use moving averages for last 2, 4 and 30 gameweeks.

In [9]:
rolling_gameweeks = [2, 4, 30]

to_predict = "xP"
standard_features = []
features_to_roll = [
       'Min', 'Gls', 'Ast', 'PK', 'PKatt', 'Sh', 'SoT', 'CrdY', 'CrdR',
       'Touches', 'Tkl', 'Int', 'Blocks', 'xG', 'npxG', 'xA', 'SCA', 'GCA',
       'Cmp', 'Att', 'Cmp%', 'PrgP', 'Carries', 'PrgC', 'T_att', 'T_succ',
       'Season', 'Team_score', 'Opp_score', 'Team_CS',
       'FPL_ast', 'Bonus', 'Bps', 'Player_CS', 'Creativity',
       'Element', 'Player_GC', 'Goals_scored', 'ICT_index',
       'Influence', 'FPL_min', 'Own_goals', 'Penalties_missed',
       'Penalties_saved', 'Saves', 'Selected', 'Threat', 'FPL_points',
       'Transfers_balance', 'Transfers_in', 'Transfers_out', 'Value',
       'xP', 'FPL_xA', 'FPL_xGI', 'FPL_xG', 'FPL_xGC', 'Opp_rating',
       'Team_rating', 'Min_points']
info = ["Name_original", "GW", "Season", "Team", "Opponent", "Was_home", "Team_rating", "Opp_rating", "FPL_pos", "FPL_points"]

# Feature engineering

In [10]:
df["Rating_difference"] = df["Team_rating"] / df["Opp_rating"]
df["Baseline_points"] = df["FPL_points"] - df["Bonus"]
df["Transfers_result"] = df["Transfers_balance"] >= 0

In [11]:
df['Team_result'] = df.apply(calculate_team_points, axis=1)

In [12]:
df = label_encoding(df, "Start")

In [13]:
df, features = add_rolling_features(df, standard_features, features_to_roll, rolling_gameweeks)

In [14]:
df, features = ohe(df, ["FPL_pos"], features)
if "GK" in df.columns:
    df = df.drop("GK", axis=1)
    features.remove("GK")

In [15]:
# dropping unwanted columns
features = [col for col in features if not col.startswith('level')]
df = df[np.unique(features + info + [to_predict])]

In [16]:
df = df[pd.to_numeric(df["GW"], errors="coerce").notna()]
df["GW"] = df["GW"].astype("uint64")

In [17]:
df.columns

Index(['Ast_2', 'Ast_30', 'Ast_4', 'Att_2', 'Att_30', 'Att_4', 'Blocks_2',
       'Blocks_30', 'Blocks_4', 'Bonus_2',
       ...
       'xA_2', 'xA_30', 'xA_4', 'xG_2', 'xG_30', 'xG_4', 'xP', 'xP_2', 'xP_30',
       'xP_4'],
      dtype='object', length=191)

In [18]:
df.shape

(19421, 191)

# Saving to file

In [19]:
# saving to file
df.set_index(["Name_original", "Season", "GW"], drop=False).to_csv("data/misc/df_features.csv", index=False) # keeping GKs on

In [20]:
CUT_OFF_GAMEWEEK = 31
SEASON_TO_PREDICT = "2022-23"

In [21]:
df_train = shrink_df_to_top_players(df, 550, 10, to_predict)
df_train.shape

Unique players with min_fixtures: 391
Unique players left: 391


(17349, 191)

In [22]:
df = df.set_index(["Name_original", "Season", "GW"], drop=False)

In [23]:
# training data
X_train = df_train[((df_train["Season"] < SEASON_TO_PREDICT) | (df_train["GW"] <= CUT_OFF_GAMEWEEK - 1))][features].reset_index(drop=True)
y_train = df_train[((df_train["Season"] < SEASON_TO_PREDICT) | (df_train["GW"] <= CUT_OFF_GAMEWEEK - 1))][to_predict].reset_index(drop=True)

In [24]:
# only 31st gameweek
X_test = df[(df["GW"] == CUT_OFF_GAMEWEEK) & (df["Season"] == SEASON_TO_PREDICT) ][features].reset_index(drop=True)
y_test = df[(df["GW"] == CUT_OFF_GAMEWEEK) & (df["Season"] == SEASON_TO_PREDICT) ][to_predict].reset_index(drop=True)

In [25]:
# all remaining gameweeks
X_test_all_remaining = df[(df["GW"] >= CUT_OFF_GAMEWEEK) & (df["Season"] == SEASON_TO_PREDICT) ][features].reset_index(drop=True)
y_test_all_remaining = df[(df["GW"] >= CUT_OFF_GAMEWEEK) & (df["Season"] == SEASON_TO_PREDICT) ]["FPL_points"].reset_index(drop=True)

In [26]:
X_train.shape, X_test.shape, X_test_all_remaining.shape

((15183, 180), (285, 180), (2296, 180))

In [27]:
preds.head(30)

NameError: name 'preds' is not defined

In [None]:
preds[preds["Team"] == "Manchester City"].head(20)

In [None]:
preds_all_gameweeks.head(15)

In [None]:
preds_all_gameweeks[preds_all_gameweeks["Name_original"] == "Mohamed Salah"].sort_values(by=["GW"])

In [None]:
preds_all_gameweeks[preds_all_gameweeks["Name_original"] == "Erling Haaland"].sort_values(by=["GW"])

In [None]:
preds_all_gameweeks.to_csv("predictions/all_predictions.csv", index=False)

In [None]:
pickle.dump(model, open("models/GBR_gk.pkl", 'wb'))

In [None]:
# explainer = shap.Explainer(model.predict, X_test_all_remaining)
# shap_values = explainer(X_test_all_remaining)

In [None]:
# shap.plots.bar(shap_values, max_display=15)

In [None]:
# shap.plots.beeswarm(shap_values, max_display=15)

In [None]:
# # explaining Erling Haaland`s score in gameweek 31
# shap.plots.bar(shap_values[69], max_display=15)

In [None]:
# # Salah vs Nottingham Forest
# shap.plots.waterfall(shap_values[1396])