
# Imports

In [1]:
import os
os.chdir("D:\PulpitE\FPL_ML")

In [2]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import shap

from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import SGDRegressor
from xgboost import XGBRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.linear_model import ElasticNet
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from lightgbm import LGBMRegressor

from statistics import mean
from scipy.stats import tmean
from sklearn.metrics import mean_absolute_error, mean_squared_error

from sklearn.preprocessing import LabelEncoder
from xgboost import plot_importance
from sklearn.inspection import plot_partial_dependence

In [3]:
# loading csv
df_original = pd.read_csv('data/final_dataset.csv').drop("Unnamed: 0", axis=1)

df = df_original.copy()

Columns (53) have mixed types.Specify dtype option on import or set low_memory=False.


In [4]:
# df_original[df_original["Name"] == "Mohamed-Salah"].groupby(["Pos"]).mean()["Total Points"]

In [5]:
df = df.rename(columns={'GW_x': 'GW'})
# df = df[~(df['position'].isin(['GK']))]
# df = df[df['position'].isin(['MID'])]

In [6]:
# df_original[["expected_goals", "xG"]]

In [7]:
df_original["goals_conceded"].unique()

array([ 2.,  1.,  0., nan,  3.,  4.,  5.,  7.,  6.,  8.,  9.])

In [8]:
# df_original.columns.to_list()

In [9]:
df["position"].unique()

array(['DEF', nan, 'GK', 'MID', 'FWD'], dtype=object)

In [10]:
df.shape

(24683, 117)

# Features

In [11]:
# rolling_gameweeks = [2, 3, 5, 10, 20, 30]
rolling_gameweeks = [2, 4, 30]

to_predict = ["total_points"]
features1 = [
             "Was Home", # Home/Away
             "Team rating", "Opp rating", # Team ratings
             ]
features_to_roll = [
                "Min", "Start", # time played
                'Gls', 'Sh', 'SoT', # Goals
                'Ast', # Assists
    #             'CrdY', 'CrdR', # Cards
                "xG", 'xAG', # Expected
    #             'CS', # Defence
    #             'Team Score', 'Opp Score', # Team form
    #             # "xGPoints", "CSPoints", # Position-scaled
    #             # "Cmp%", "PrgP", "PrgC", "Succ", "PKwon",
    #             'bonus', 
    # # 'bps', # Bonus
                # 'influence', 'creativity', 'threat', 'ict_index', # ICT
                # "total_points" # FPL points
            ]
# features_to_roll = "Min"
info = ["Name", "GW", "Season", "Squad", "Opponent", "Was Home", "Team rating", "Opp rating", "position"]

In [12]:
# df[features_to_roll + info].isnull().sum(axis=0).sort_values(ascending=False).head(30)

# Preparing dataset

In [13]:
def calculate_xg_points(row):
    if row["position"] == "GK":
        return row["xG"] * 6
    elif row["position"] == "DEF":
        return row["xG"] * 6
    elif row["position"] == "MID":
        return row["xG"] * 5
    elif row["position"] == "FWD":
        return row["xG"] * 4
    else:
        return row["xG"] * 5

df["xGPoints"] = df.apply(calculate_xg_points, axis=1)

In [14]:
def calculate_cs_points(row):
    if row["position"] == "GK":
        return row["xG"] * 4
    elif row["position"] == "DEF":
        return row["xG"] * 4
    elif row["position"] == "MID":
        return row["xG"] * 1
    elif row["position"] == "FWD":
        return row["xG"] * 0
    else:
        return row["xG"] * 1

df["CSPoints"] = df.apply(calculate_cs_points, axis=1)

In [15]:
def add_rolling_features(df, features1, features_to_roll):
    features = features1
    
    for r in rolling_gameweeks:
        form_means = df.groupby(["Name"])[features_to_roll].rolling(r, min_periods=1).mean().groupby(["Name"]).shift(1).reset_index()
        form_means = form_means.fillna(method='bfill')
        
        form_means.columns = [f'{col}{"_"}{r}' for col in form_means.columns]
        features += form_means.columns.tolist()
        features = list(filter(lambda x: x not in ["Name_" + str(r)], features))
        df = pd.concat([df, form_means], axis=1)
        
    return df, features

In [16]:
def ohe(df, ohe_columns, features):
    # one hot encoding
    for c in ohe_columns:
        ohe_c = pd.get_dummies(df[c], dtype="int64")
        df = pd.concat([df, ohe_c], axis=1)

        features += ohe_c.columns.tolist()
        
    return df, features

In [17]:
def label_encoding(df, column_to_encode):
    
    mapping_dict = {
        'Y': 2,
        'Y*': 1,
        'N': 0,
    }
    
    df[column_to_encode] = df[column_to_encode].map(mapping_dict)
    
    return df

In [18]:
df = df.reset_index()

In [19]:
df, features = add_rolling_features(df, features1, features_to_roll)

In [20]:
# df, features = ohe(df, ["position"], features)

In [21]:
df = label_encoding(df, "Start")

In [22]:
# dropping unwanted columns
features = [col for col in features if not col.startswith('level')]
df = df[np.unique(features + info + to_predict)]

In [23]:
df = df[pd.to_numeric(df["GW"], errors="coerce").notna()]
df["GW"] = df["GW"].astype("uint64")

In [24]:
# dropping NaNs
df = df.dropna(axis=0)

In [25]:
df = df.set_index(["Name", "Season", "GW"], drop=False)

In [26]:
df.columns

Index(['Ast_2', 'Ast_30', 'Ast_4', 'GW', 'Gls_2', 'Gls_30', 'Gls_4', 'Min_2',
       'Min_30', 'Min_4', 'Name', 'Opp rating', 'Opponent', 'Season', 'Sh_2',
       'Sh_30', 'Sh_4', 'SoT_2', 'SoT_30', 'SoT_4', 'Squad', 'Team rating',
       'Was Home', 'position', 'total_points', 'xAG_2', 'xAG_30', 'xAG_4',
       'xG_2', 'xG_30', 'xG_4'],
      dtype='object')

In [27]:
df.shape

(22151, 31)

# Choosing features and splitting data

In [28]:
GAMEWEEK_TO_PREDICT = 32
SEASON_TO_PREDICT = "2022-23"

In [29]:
# gameweeks 6-14 is training data, gw 15 is test data
X_train = df[((df["Season"] <= SEASON_TO_PREDICT) & (df["GW"] >= 1) & (df["GW"] <= GAMEWEEK_TO_PREDICT - 1))][features].reset_index(drop=True)
y_train = df[((df["Season"] <= SEASON_TO_PREDICT) & (df["GW"] >= 1) & (df["GW"] <= GAMEWEEK_TO_PREDICT - 1))][to_predict].reset_index(drop=True)

In [30]:
X_test = df[(df["GW"] == GAMEWEEK_TO_PREDICT) & (df["Season"] == SEASON_TO_PREDICT) ][features].reset_index(drop=True)
y_test = df[(df["GW"] == GAMEWEEK_TO_PREDICT) & (df["Season"] == SEASON_TO_PREDICT) ][to_predict].reset_index(drop=True)

In [31]:
X_train.shape, X_test.shape

((18220, 24), (370, 24))

# Model

In [32]:
class HybridModel:
    def __init__(self, models, names, weights):
        self.models = models
        self.model_names = names
        self.weights = weights
    
    def fit(self, X, y):
        for model in self.models:
            model.fit(X, y)
        
    def predict(self, X):
        model_preds = [model.predict(X) for model in self.models]
        preds = []
        for i in range(len(model_preds[0])):
            preds.append(np.mean([x[i] for x in model_preds]))
        return preds
    
    def evaluate(self, X, y):
        ys = [np.expm1(model.predict(X)) for model in self.models]
        y_ensemble = self.predict(X)
        
        rmsle_ys = [(mean_squared_log_error(y, y_model) ** 0.5) for y_model in ys]
        rmsle_ensemble = mean_squared_log_error(y, y_ensemble) ** 0.5
        
        for i in range(0, len(self.models)):
            print(f"RMSLE on {self.model_names[i]} model: {rmsle_ys[i]}")
        print(f"RMSLE on ensemble model: {rmsle_ensemble}")

In [33]:
lgbm = LGBMRegressor(random_state=42)
xgb = XGBRegressor(random_state=42)
gbr = GradientBoostingRegressor(random_state=42)

names = [
    "LGBM",
    "XGB",
    "GBR"
]

weights = [
    1,
    1,
    1
]

# hybrid arguments
models = [
    lgbm, 
    xgb, 
    gbr
]

model = HybridModel(models, names, weights)

In [34]:
model = GradientBoostingRegressor(random_state=42)
# model = LGBMRegressor(random_state=42)
# model = XGBRegressor(random_state=42)
# model = RandomForestRegressor()

In [35]:
%%time
model.fit(X_train, np.array(y_train).ravel())

CPU times: total: 3.59 s
Wall time: 3.75 s


# Evaluation

In [36]:
# make predictions on the test data
predictions = model.predict(X_test)

In [37]:
def pairwise_accuracy(predicted_scores, true_scores):
    if len(predicted_scores) != len(true_scores):
        raise ValueError("The length of predicted_scores and true_scores must be the same.")

    num_pairs = 0
    num_correct_pairs = 0

    for i in range(len(predicted_scores)):
        for j in range(i + 1, len(predicted_scores)):
            # Check if the predicted order matches the true order
            if (predicted_scores[i] > predicted_scores[j] and true_scores[i] > true_scores[j]) or \
               (predicted_scores[i] < predicted_scores[j] and true_scores[i] < true_scores[j]):
                num_correct_pairs += 1
            num_pairs += 1

    pairwise_accuracy = num_correct_pairs / num_pairs
    return pairwise_accuracy

In [38]:
df_predictions = df[(df["GW"] == GAMEWEEK_TO_PREDICT) & (df["Season"] == SEASON_TO_PREDICT)].reset_index(drop=True)
df_predictions.loc[:, "pred"] = predictions

In [39]:
def get_top_performer_names(df, no_top, no_gws):
    return df.reset_index(drop=True).groupby("Name")["total_points"].mean().groupby("Name").tail(no_gws).sort_values(ascending=False).head(no_top).index.to_list()

In [40]:
def pairwise_accuracy_topX(model, df, top_x):
    top_performers = df[(df["GW"] == GAMEWEEK_TO_PREDICT) & (df["Season"] == SEASON_TO_PREDICT) & (df["Name"].isin(get_top_performer_names(df, top_x, 50)))]
    X = top_performers[features].reset_index(drop=True)
    y = np.array(top_performers[to_predict].reset_index(drop=True))
    preds = model.predict(X)
    
    return  pairwise_accuracy(preds, y)

In [41]:
def evaluate(model, df, predictions, y_true):
    mae = mean_absolute_error(y_true, predictions)
    mse = mean_squared_error(y_true, predictions)

    pairwise_acc = pairwise_accuracy(np.array(y_true), predictions)
    pairwise_accuracy_top20 = pairwise_accuracy_topX(model, df, 20)
    pairwise_accuracy_top100 = pairwise_accuracy_topX(model, df, 100)
    
    print("MAE:", mae)
    print("MSE:", mse)
    print("Pairwise accuracy:", pairwise_acc)
    print("Pairwise accuracy @TOP100:", pairwise_accuracy_top100)
    print("Pairwise accuracy @TOP20:", pairwise_accuracy_top20)

In [42]:
# df[(df["GW"] == GAMEWEEK_TO_PREDICT) & (df["Season"] == SEASON_TO_PREDICT) & (df["Name"].isin(get_top_performer_names(df, 20, 50)))].columns

In [43]:
evaluate(model, df, predictions, y_test)

MAE: 1.6498740115039152
MSE: 5.711435439640201
Pairwise accuracy: 0.5954735223027906
Pairwise accuracy @TOP100: 0.5482900136798906
Pairwise accuracy @TOP20: 0.6544117647058824


In [44]:
# df[df["Name"] == "Abdoulaye-Doucoure"][info]

In [45]:
# model.predict(df[df["GW"] == GAMEWEEK_TO_PREDICT][df["Name"] == "Mohamed-Salah"][features])

# Valid predictions

In [46]:
preds = df_predictions[info + to_predict + ["pred"]].sort_values(by=["pred"], ascending = False)

In [47]:
preds.head(30)

Unnamed: 0,Name,GW,Season,Squad,Opponent,Was Home,Team rating,Opp rating,position,total_points,pred
111,Erling-Haaland,32,2022-23,Manchester City,Brighton,0.0,2091.929932,1829.306519,FWD,5.0,6.200745
43,Bukayo-Saka,32,2022-23,Arsenal,Southampton,1.0,1946.556396,1638.623901,MID,13.0,5.740645
266,Mohamed-Salah,32,2022-23,Liverpool,Nott'ham Forest,1.0,1930.005737,1626.679321,MID,7.0,5.559678
129,Harry-Kane,32,2022-23,Tottenham,Newcastle Utd,0.0,1834.483765,1857.315674,FWD,6.0,5.531446
40,Bruno-Fernandes,32,2022-23,Manchester Utd,Chelsea,1.0,1859.799316,1795.493042,MID,8.0,5.473929
248,Martinelli,32,2022-23,Arsenal,Southampton,1.0,1946.556396,1638.623901,MID,9.0,5.459473
247,Martin-Odegaard,32,2022-23,Arsenal,Southampton,1.0,1946.556396,1638.623901,MID,7.0,5.414817
146,Ivan-Toney,32,2022-23,Brentford,Aston Villa,1.0,1766.344727,1812.850708,FWD,9.0,5.05144
94,Dominic-Solanke,32,2022-23,Bournemouth,West Ham,1.0,1674.964966,1759.928589,FWD,2.0,5.001139
241,Marcus-Rashford,32,2022-23,Manchester Utd,Chelsea,1.0,1859.799316,1795.493042,MID,7.0,4.730713


In [48]:
preds[preds["Squad"] == "Manchester City"].head(20)

Unnamed: 0,Name,GW,Season,Squad,Opponent,Was Home,Team rating,Opp rating,position,total_points,pred
111,Erling-Haaland,32,2022-23,Manchester City,Brighton,0.0,2091.929932,1829.306519,FWD,5.0,6.200745
308,Riyad-Mahrez,32,2022-23,Manchester City,Brighton,0.0,2091.929932,1829.306519,MID,2.0,3.633562
298,Phil-Foden,32,2022-23,Manchester City,Brighton,0.0,2091.929932,1829.306519,MID,9.0,3.111144
206,Kevin-De-Bruyne,32,2022-23,Manchester City,Brighton,0.0,2091.929932,1829.306519,MID,1.0,3.065194
195,Julian-Alvarez,32,2022-23,Manchester City,Brighton,0.0,2091.929932,1829.306519,FWD,1.0,2.769742
215,Kyle-Walker,32,2022-23,Manchester City,Brighton,0.0,2091.929932,1829.306519,DEF,2.0,2.122381
141,Ilkay-Gundogan,32,2022-23,Manchester City,Brighton,0.0,2091.929932,1829.306519,MID,2.0,2.07558
312,Rodri,32,2022-23,Manchester City,Brighton,0.0,2091.929932,1829.306519,MID,2.0,1.861889
33,Bernardo-Silva,32,2022-23,Manchester City,Brighton,0.0,2091.929932,1829.306519,MID,1.0,1.14828
100,Ederson,32,2022-23,Manchester City,Brighton,0.0,2091.929932,1829.306519,GK,0.0,1.125166


In [49]:
# preds[preds["position"] == "FWD"].head(15)

# Feature importance and influence

In [50]:
# plot_importance(model)

In [51]:
# fig, ax = plt.subplots(figsize=(16, 8))
# plot_partial_dependence(model, X_train, features, ax=ax)

In [None]:
explainer = shap.Explainer(model.predict, X_test)
shap_values = explainer(X_test)

Permutation explainer:  58%|███████████████████████████████████████████████████████████████████▋                                                 | 214/370 [00:16<00:09, 16.44it/s]

In [None]:
shap.plots.bar(shap_values, max_display=15)

In [None]:
shap.plots.beeswarm(shap_values)

In [None]:
shap.plots.bar(shap_values[285], max_display=15)