# Imports

In [1]:
import os
os.chdir("D:\PulpitE\FPL_ML")

In [2]:
import xgboost as xgb
from xgboost import plot_importance
from sklearn.inspection import plot_partial_dependence
import matplotlib.pyplot as plt
import pandas as pd
import shap

# Reading data and pre-processing

In [3]:
# loading csv
df = pd.read_csv('predictions/train_predictions_fwd.csv')

In [4]:
# removing unnamed columns
df = df.loc[:, ~df.columns.str.contains('^Unnamed')]

In [5]:
df.columns

Index(['Date', 'Day', 'Comp', 'Round', 'Venue', 'Squad', 'Opponent', 'Start',
       'Pos', 'Min', 'Gls', 'Ast', 'PK', 'PKatt', 'Sh', 'SoT', 'CrdY', 'CrdR',
       'Touches', 'Tkl', 'Int', 'Blocks', 'xG', 'npxG', 'xAG', 'SCA', 'GCA',
       'Cmp', 'Att', 'Cmp%', 'PrgP', 'Carries', 'PrgC', 'Att.1', 'Succ',
       'Match Report', 'Name', 'Season', 'WDL', 'GoalsTeams', 'Team Score',
       'Opp Score', 'CS', 'Was Home', 'GW', 'FPL', 'neutral_name',
       'Total Points', 'FPL position', 'Opp rating', 'ScoreForLast5',
       'ScoreAgainstLast5', 'Points'],
      dtype='object')

In [6]:
df.shape

(326, 53)

In [7]:
# dropping NaNs
# df = df.dropna(axis=0)

In [8]:
df.shape

(326, 53)

In [9]:
# df[df["Name"] == "Mohamed-Salah"]

# Choosing features and splitting data

In [10]:
features = ["Points", "Was Home", "Opp rating", 'ScoreForLast5', 'ScoreAgainstLast5']
info = ["Name", "GW", "Squad", "Opponent", "Was Home", "Opp rating"]
to_predict = ["Total Points"]

In [11]:
GAMEWEEK_TO_PREDICT = 20

In [12]:
# gameweeks 9-14 is training data, gw 15 is test data
X_train = df[(df["GW"] >= 9) & (df["GW"] <= GAMEWEEK_TO_PREDICT - 1)][features]
y_train = df[(df["GW"] >= 9) & (df["GW"] <= GAMEWEEK_TO_PREDICT - 1)][to_predict]

In [13]:
X_test = df[df["GW"] == GAMEWEEK_TO_PREDICT][features]
y_test = df[df["GW"] == GAMEWEEK_TO_PREDICT][to_predict]

In [14]:
X_train.size, y_train.size, X_test.size, y_test.size

(580, 116, 0, 0)

# Model

In [15]:
model = xgb.XGBRegressor()

In [16]:
%%time
model.fit(X_train, y_train)

CPU times: total: 422 ms
Wall time: 101 ms


In [17]:
# make predictions on the test data
predictions = model.predict(X_test)

In [18]:
from sklearn.metrics import mean_absolute_error, mean_squared_error
# calculate the MAE and MSE scores
mae = mean_absolute_error(y_test, predictions)
mse = mean_squared_error(y_test, predictions)

ValueError: Found array with 0 sample(s) (shape=(0, 1)) while a minimum of 1 is required.

In [None]:
print("MAE: ", mae)
print("MSE: ", mse)

# Predictions

In [None]:
X_test = df[df["GW"] == GAMEWEEK_TO_PREDICT][features]
y_test = df[df["GW"] == GAMEWEEK_TO_PREDICT][to_predict]

In [None]:
df_predictions = df[df["GW"] == GAMEWEEK_TO_PREDICT]

In [None]:
df_predictions.shape

In [None]:
len(predictions)

In [None]:
df_predictions["pred"] = predictions

In [None]:
model.predict(df[df["GW"] == GAMEWEEK_TO_PREDICT][df["Name"] == "Mohamed-Salah"][features])

In [None]:
df_predictions[info + to_predict + ["pred"]].sort_values(by=["pred"], ascending = False).head(20)

# Feature importance and influence

In [None]:
plot_importance(model)

In [None]:
fig, ax = plt.subplots(figsize=(16, 8))
plot_partial_dependence(model, X_train, features, ax=ax)

In [None]:
explainer = shap.Explainer(model, X_train)
shap_values = explainer(X_train)

In [None]:
shap.plots.bar(shap_values)

In [None]:
shap.plots.beeswarm(shap_values)

In [None]:
shap.plots.bar(shap_values[3])