
# Imports

In [1]:
import os
os.chdir("D:\PulpitE\FPL_ML")

In [2]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import shap
import pickle

from PositionalModel import PositionalModel
from sklearn.ensemble import GradientBoostingRegressor
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor

from sklearn.metrics import mean_absolute_error, mean_squared_error

from sklearn.preprocessing import LabelEncoder

import warnings
warnings.filterwarnings("ignore", message="Empty dataset at worker")

# Loading data
Data consists of all PL players logs for 2021-22 and 2022-23 seasons\
A single log is just a summary of player performance in a particular match

In [3]:
# loading csv
df_original = pd.read_csv('data/previous/FPL_logs.csv')

df = df_original.copy()

In [4]:
df_original.columns

Index(['Date', 'Day', 'Venue', 'Team', 'Opponent', 'Name', 'Start', 'Pos',
       'Min', 'Gls', 'Ast', 'PK', 'PKatt', 'Sh', 'SoT', 'CrdY', 'CrdR',
       'Touches', 'Tkl', 'Int', 'Blocks', 'xG', 'npxG', 'xA', 'SCA', 'GCA',
       'Cmp', 'Att', 'Cmp%', 'PrgP', 'Carries', 'PrgC', 'T_att', 'T_succ',
       'Season', 'WDL', 'Team_score', 'Opp_score', 'Team_CS', 'Was_home', 'GW',
       'Clean_name', 'FPL_name', 'FPL_team', 'FPL_opp_team', 'Name_original',
       'FPL_pos', 'FPL_ast', 'Bonus', 'Bps', 'Player_CS', 'Creativity',
       'Element', 'Fixture', 'Player_GC', 'Goals_scored', 'ICT_index',
       'Influence', 'Kickoff_time', 'FPL_min', 'Own_goals', 'Penalties_missed',
       'Penalties_saved', 'Saves', 'Selected', 'Threat', 'FPL_points',
       'Transfers_balance', 'Transfers_in', 'Transfers_out', 'Price', 'FPL_GW',
       'xP', 'FPL_xA', 'FPL_xGI', 'FPL_xG', 'FPL_xGC', 'Opp_rating',
       'Team_rating', 'Min_points', 'Avg_FPL_points'],
      dtype='object')

In [5]:
df_original["Name"].unique().size

485

In [6]:
df.shape

(19726, 81)

In [7]:
df["Season"].unique()

array(['2021-22', '2022-23', '2023-24'], dtype=object)

In [8]:
df = df[~df["FPL_pos"].isin(["GK"])]

In [9]:
df["Season"].unique()

array(['2021-22', '2022-23', '2023-24'], dtype=object)

In [10]:
df = df[df["Season"].isin(['2021-22', '2022-23'])]

In [11]:
df = df.sort_values(by=["Name", "Date"])

# Feature selection
Model will use a set of standard features and moving averages for some other feature values. In this case we will use moving averages for last 2, 4 and 30 gameweeks.

In [12]:
rolling_gameweeks = [2, 4, 30]
# rolling_gameweeks = [5]

# to_predict = "FPL_points"
to_predict = "xP"
standard_features = [
             "Was_home", # Home/Away
             # "Team_rating", "Opp_rating", 
             "Rating_difference", # Team ratings
             "Price", # FPl price
             "Transfers_balance",
             # "Transfers_result"
             "Avg_FPL_points"
             ]
features_to_roll = [
                "Min", 
                # "Start", # time played
                'Gls', 
                # 'Sh', 'SoT', # Goals
                'Ast', # Assists
                # 'CrdY', 'CrdR', # Cards
                "xG", 'xA', # Expected
                'Team_CS', # Defence
                'Team_score', 'Opp_score', 'Team_result', # Team form
                # "xGPoints", "CSPoints", # Position-scaled
                # "Cmp%", "PrgP", "PrgC", "T_succ",
                # 'bonus', 'bps', # Bonus
                # 'ICT_index', # ICT
                "FPL_points", 
                # "xP"
                # "Baseline_points", "Bonus" # FPL points
            ]
info = ["Name_original", "GW", "Season", "Team", "Opponent", "Was_home", "Team_rating", "Opp_rating", "Price", "FPL_pos", "FPL_points"]

In [13]:
# df[df["Name"] == "Erling-Haaland"]

# Feature engineering

In [14]:
df["Rating_difference"] = df["Team_rating"] / df["Opp_rating"]
df["Baseline_points"] = df["FPL_points"] - df["Bonus"]

In [15]:
def calculate_team_points(row):
    # win - 3 points
    if row['Team_score'] > row['Opp_score']:
        return 3
    # draw - 1 point
    elif row['Team_score'] == row['Opp_score']:
        return 1
    # loss - 0 points
    else:
        return 0
    
df['Team_result'] = df.apply(calculate_team_points, axis=1)

In [16]:
def add_rolling_features(df, standard_features, features_to_roll):
    features = standard_features
    
    for r in rolling_gameweeks:
        form_means = df.groupby(["Name"])[features_to_roll].rolling(r, min_periods=1).mean().groupby(["Name"]).shift(1).reset_index()
        # print(form_means[form_means["Name"] == "Erling-Haaland"])
        form_means = form_means.fillna(method='bfill') # slightly incorrect, better to drop Nan
        form_means.columns = [f'{col}{"_"}{r}' for col in form_means.columns]
        features += form_means.columns.tolist()
        features = list(filter(lambda x: x not in ["Name_" + str(r)], features))
        df = pd.concat([df.reset_index(), form_means], axis=1)
        # df = df.merge(form_means, left_index=True, right_index=True)
        df = df.drop([col for col in df.columns if col.startswith('level')], axis=1)
        
    return df.reset_index(), features

In [17]:
def ohe(df, ohe_columns, features):
    # one hot encoding
    for c in ohe_columns:
        ohe_c = pd.get_dummies(df[c], dtype="int64")
        df = pd.concat([df, ohe_c], axis=1)

        features += ohe_c.columns.tolist()
        
    return df, features

In [18]:
def label_encoding(df, column_to_encode):
    
    mapping_dict = {
        'Y': 1, # Starting eleven
        'Y*': 1, # Starting eleven as captain
        'N': 0, # Not in starting eleven
    }
    
    df[column_to_encode] = df[column_to_encode].map(mapping_dict)
    
    return df

In [19]:
df = label_encoding(df, "Start")

In [20]:
df, features = add_rolling_features(df, standard_features, features_to_roll)

In [21]:
df, features = ohe(df, ["FPL_pos"], features)
if "GK" in df.columns:
    df = df.drop("GK", axis=1)
    features.remove("GK")

In [22]:
df.columns

Index(['level_0', 'index', 'Date', 'Day', 'Venue', 'Team', 'Opponent', 'Name',
       'Start', 'Pos',
       ...
       'xG_30', 'xA_30', 'Team_CS_30', 'Team_score_30', 'Opp_score_30',
       'Team_result_30', 'FPL_points_30', 'DEF', 'FWD', 'MID'],
      dtype='object', length=122)

In [23]:
# dropping unwanted columns
features = [col for col in features if not col.startswith('level')]
df = df[np.unique(features + info + [to_predict])]

In [24]:
df = df[pd.to_numeric(df["GW"], errors="coerce").notna()]
df["GW"] = df["GW"].astype("uint64")

In [25]:
# dropping NaNs
df = df.dropna(axis=0)

In [26]:
# df = df.set_index(["Name_original", "Season", "GW"], drop=False)

In [27]:
df.columns

Index(['Ast_2', 'Ast_30', 'Ast_4', 'Avg_FPL_points', 'DEF', 'FPL_points',
       'FPL_points_2', 'FPL_points_30', 'FPL_points_4', 'FPL_pos', 'FWD', 'GW',
       'Gls_2', 'Gls_30', 'Gls_4', 'MID', 'Min_2', 'Min_30', 'Min_4',
       'Name_original', 'Opp_rating', 'Opp_score_2', 'Opp_score_30',
       'Opp_score_4', 'Opponent', 'Price', 'Rating_difference', 'Season',
       'Team', 'Team_CS_2', 'Team_CS_30', 'Team_CS_4', 'Team_rating',
       'Team_result_2', 'Team_result_30', 'Team_result_4', 'Team_score_2',
       'Team_score_30', 'Team_score_4', 'Transfers_balance', 'Was_home',
       'xA_2', 'xA_30', 'xA_4', 'xG_2', 'xG_30', 'xG_4', 'xP'],
      dtype='object')

In [28]:
df.shape

(15408, 48)

In [29]:
# df = df[~df["FPL_pos"].isin(["GK"])] # only outfield players

In [30]:
features

['Was_home',
 'Rating_difference',
 'Price',
 'Transfers_balance',
 'Avg_FPL_points',
 'Min_2',
 'Gls_2',
 'Ast_2',
 'xG_2',
 'xA_2',
 'Team_CS_2',
 'Team_score_2',
 'Opp_score_2',
 'Team_result_2',
 'FPL_points_2',
 'Min_4',
 'Gls_4',
 'Ast_4',
 'xG_4',
 'xA_4',
 'Team_CS_4',
 'Team_score_4',
 'Opp_score_4',
 'Team_result_4',
 'FPL_points_4',
 'Min_30',
 'Gls_30',
 'Ast_30',
 'xG_30',
 'xA_30',
 'Team_CS_30',
 'Team_score_30',
 'Opp_score_30',
 'Team_result_30',
 'FPL_points_30',
 'DEF',
 'FWD',
 'MID']

# Data split into train and valid
Let's take entire 2021-22 season and 30 first gameweeks of 2022-23 season as training data and 8 last gameweeks of that season as valid data (~10% of all rows).

In [31]:
def shrink_df_to_top_players(df, n_players, min_fixtures):
    # Calculate average scores
    average_scores = df.groupby('Name_original')[to_predict].mean()
    
    # Calculate player counts
    player_counts = df['Name_original'].value_counts()
    
    # Filter out players with less than min_fixtures occurrences
    popular_players = player_counts[player_counts >= min_fixtures].index
    
    print("Unique players with min_fixtures:", popular_players.size)

    # Sort players by average scores
    sorted_players = average_scores.sort_values(ascending=False)

    # Get the top n_players players
    top_popular_players = sorted_players[sorted_players.index.isin(popular_players)].head(n_players)

    # Filter the original DataFrame
    top_players_df = df[df['Name_original'].isin(top_popular_players.index)]
    
    print("Unique players left:", top_players_df['Name_original'].unique().size)
    
    return top_players_df

In [32]:
CUT_OFF_GAMEWEEK = 31
SEASON_TO_PREDICT = "2022-23"

In [33]:
df_train = shrink_df_to_top_players(df, 550, 10)
# df_train["Name_original"].unique()

Unique players with min_fixtures: 354
Unique players left: 354


In [34]:
df = df.set_index(["Name_original", "Season", "GW"], drop=False)

In [35]:
# training data
X_train = df_train[((df_train["Season"] < SEASON_TO_PREDICT) | (df_train["GW"] <= CUT_OFF_GAMEWEEK - 1))].reset_index(drop=True)
y_train = df_train[((df_train["Season"] < SEASON_TO_PREDICT) | (df_train["GW"] <= CUT_OFF_GAMEWEEK - 1))].reset_index(drop=True)

In [36]:
# only 31st gameweek
X_test = df[(df["GW"] == CUT_OFF_GAMEWEEK) & (df["Season"] == SEASON_TO_PREDICT) ].reset_index(drop=True)
y_test = df[(df["GW"] == CUT_OFF_GAMEWEEK) & (df["Season"] == SEASON_TO_PREDICT) ].reset_index(drop=True)

In [37]:
# all remaining gameweeks
X_test_all_remaining = df[(df["GW"] >= CUT_OFF_GAMEWEEK) & (df["Season"] == SEASON_TO_PREDICT) ].reset_index(drop=True)
y_test_all_remaining = df[(df["GW"] >= CUT_OFF_GAMEWEEK) & (df["Season"] == SEASON_TO_PREDICT) ].reset_index(drop=True)

In [38]:
X_train.shape, X_test.shape, X_test_all_remaining.shape

((13389, 48), (231, 48), (1962, 48))

# Model

In [39]:
# model = GradientBoostingRegressor(random_state=42)
model = PositionalModel(features, to_predict)
# model = RandomForestRegressor(random_state=42)

In [40]:
%%time
model.fit(
    X_train,
    y_train,
    X_test_all_remaining,
    y_test_all_remaining
)

CPU times: total: 6.02 s
Wall time: 813 ms


# Getting predictions

In [41]:
def get_predictions(model, df, X, all_remaining=False):
    # make predictions on the test data and glues them to the rest of the dataframe
    predictions = model.predict(X)
    
    if all_remaining:
        df_predictions = df[(df["GW"] >= CUT_OFF_GAMEWEEK) & (df["Season"] == SEASON_TO_PREDICT)].reset_index(drop=True)
    else:
        df_predictions = df[(df["GW"] == CUT_OFF_GAMEWEEK) & (df["Season"] == SEASON_TO_PREDICT)].reset_index(drop=True)
    
    df_predictions.loc[:, "Pred"] = predictions
    
    preds = df_predictions[info + [to_predict] + ["Pred"]].sort_values(by=["Pred"], ascending = False)
    
    return preds, predictions

In [42]:
X_test.columns

Index(['Ast_2', 'Ast_30', 'Ast_4', 'Avg_FPL_points', 'DEF', 'FPL_points',
       'FPL_points_2', 'FPL_points_30', 'FPL_points_4', 'FPL_pos', 'FWD', 'GW',
       'Gls_2', 'Gls_30', 'Gls_4', 'MID', 'Min_2', 'Min_30', 'Min_4',
       'Name_original', 'Opp_rating', 'Opp_score_2', 'Opp_score_30',
       'Opp_score_4', 'Opponent', 'Price', 'Rating_difference', 'Season',
       'Team', 'Team_CS_2', 'Team_CS_30', 'Team_CS_4', 'Team_rating',
       'Team_result_2', 'Team_result_30', 'Team_result_4', 'Team_score_2',
       'Team_score_30', 'Team_score_4', 'Transfers_balance', 'Was_home',
       'xA_2', 'xA_30', 'xA_4', 'xG_2', 'xG_30', 'xG_4', 'xP'],
      dtype='object')

In [43]:
preds, predictions = get_predictions(model, df, X_test)

In [44]:
preds_all_gameweeks, predictions_all = get_predictions(model, df, X_test_all_remaining, True)

In [45]:
df.columns

Index(['Ast_2', 'Ast_30', 'Ast_4', 'Avg_FPL_points', 'DEF', 'FPL_points',
       'FPL_points_2', 'FPL_points_30', 'FPL_points_4', 'FPL_pos', 'FWD', 'GW',
       'Gls_2', 'Gls_30', 'Gls_4', 'MID', 'Min_2', 'Min_30', 'Min_4',
       'Name_original', 'Opp_rating', 'Opp_score_2', 'Opp_score_30',
       'Opp_score_4', 'Opponent', 'Price', 'Rating_difference', 'Season',
       'Team', 'Team_CS_2', 'Team_CS_30', 'Team_CS_4', 'Team_rating',
       'Team_result_2', 'Team_result_30', 'Team_result_4', 'Team_score_2',
       'Team_score_30', 'Team_score_4', 'Transfers_balance', 'Was_home',
       'xA_2', 'xA_30', 'xA_4', 'xG_2', 'xG_30', 'xG_4', 'xP'],
      dtype='object')

# Custom metric
Such models are usually used to compare players and decide who to pick for upcoming gameweek(s). For each player pair in a subset of players model can either predict the outcome well (e.g. player A scores more than player B and model predicts exactly that) or predict wrongly (e.g. player A scores more than player B but model predicts player B > player A). Pairwise_accuracy returns the percentage of corrected predicted pairs. Pairwise_accuracy_topX is a variation of this metric calculated only for X highest scoring players of last two seasons.

In [46]:
def get_top_performer_names(df, no_top, no_gws):
    # takes no_top players that recorded highest average FPL points in no_gws last gameweeks
    return df.reset_index(drop=True).groupby("Name_original")[to_predict].mean().groupby("Name_original").tail(no_gws).sort_values(ascending=False).head(no_top).index.to_list()

In [47]:
def pairwise_accuracy(true_scores, predicted_scores):
    if len(predicted_scores) != len(true_scores):
        raise ValueError("The length of predicted_scores and true_scores must be the same.")

    num_pairs = 0
    num_correct_pairs = 0

    for i in range(len(predicted_scores)):
        for j in range(i + 1, len(predicted_scores)):
            # print(i)
            # print(predicted_scores)
            # print(predicted_scores[i])
            # Check if the predicted order matches the true order
            if (predicted_scores[i] > predicted_scores[j] and true_scores[i] >= true_scores[j]) or \
               (predicted_scores[i] < predicted_scores[j] and true_scores[i] <= true_scores[j]):
                num_correct_pairs += 1
            num_pairs += 1

    pairwise_accuracy = num_correct_pairs / num_pairs
    return pairwise_accuracy

In [48]:
def pairwise_accuracy_topX(model, df, top_x, all_gw=False):
    # pairwise_accuracy for top_X players
    if all_gw:
        top_performers = df[(df["GW"] == CUT_OFF_GAMEWEEK) & (df["Season"] == SEASON_TO_PREDICT) & (df["Name_original"].isin(get_top_performer_names(df, top_x, 50)))]
    else:
        top_performers = df[(df["GW"] >= CUT_OFF_GAMEWEEK) & (df["Season"] == SEASON_TO_PREDICT) & (df["Name_original"].isin(get_top_performer_names(df, top_x, 50)))]

    X = top_performers.reset_index(drop=True)
    y = np.array(top_performers[to_predict].reset_index(drop=True))
    preds = model.predict(X)
    
    return  pairwise_accuracy(y, preds)

# Evaluation

In [49]:
def evaluate(model, df, predictions, y_true, all_gw=False):
    # function to calculate different metrics for a given model
    mae = mean_absolute_error(y_true["FPL_points"], predictions)
    mse = mean_squared_error(y_true["FPL_points"], predictions)

    pairwise_acc = pairwise_accuracy(y_true["FPL_points"], predictions)
    pairwise_accuracy_top20 = pairwise_accuracy_topX(model, df, 20, all_gw)
    pairwise_accuracy_top100 = pairwise_accuracy_topX(model, df, 100, all_gw)
    
    print("MAE:", mae)
    print("MSE:", mse)
    print("Pairwise accuracy:", pairwise_acc)
    print("Pairwise accuracy @TOP100:", pairwise_accuracy_top100)
    print("Pairwise accuracy @TOP20:", pairwise_accuracy_top20)
    print("Avg pred vs avg true:", np.mean(predictions).round(2), "vs", np.mean(y_true[to_predict]).round(2).item())

In [50]:
# df[(df["GW"] == GAMEWEEK_TO_PREDICT) & (df["Season"] == SEASON_TO_PREDICT) & (df["Name"].isin(get_top_performer_names(df, 20, 50)))].columns

In [51]:
y_test

Unnamed: 0,Ast_2,Ast_30,Ast_4,Avg_FPL_points,DEF,FPL_points,FPL_points_2,FPL_points_30,FPL_points_4,FPL_pos,...,Team_score_4,Transfers_balance,Was_home,xA_2,xA_30,xA_4,xG_2,xG_30,xG_4,xP
0,0.0,0.000000,0.00,2.681818,1,0.0,4.0,1.827586,2.25,DEF,...,1.00,1240.0,1.0,5.000000e-02,0.110000,2.500000e-02,0.000000e+00,0.006667,0.000000e+00,1.9
1,0.0,0.000000,0.00,2.133333,1,1.0,1.0,1.800000,0.50,DEF,...,1.25,139.0,0.0,0.000000e+00,0.005000,0.000000e+00,0.000000e+00,0.000000,0.000000e+00,1.0
2,0.0,0.000000,0.00,2.087719,1,6.0,3.0,1.724138,3.00,DEF,...,0.75,8919.0,0.0,5.000000e-02,0.033333,2.500000e-02,4.000000e-01,0.036667,2.000000e-01,5.6
3,0.0,0.034483,0.00,2.000000,1,1.0,0.5,2.068966,1.25,DEF,...,0.75,-51.0,0.0,0.000000e+00,0.024138,2.500000e-02,0.000000e+00,0.020690,0.000000e+00,1.0
4,0.0,0.000000,0.00,2.327586,1,2.0,5.0,2.200000,2.75,DEF,...,1.75,-168.0,0.0,0.000000e+00,0.000000,0.000000e+00,0.000000e+00,0.053333,0.000000e+00,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
226,0.5,0.033333,0.25,1.016129,0,1.0,1.0,0.733333,1.00,MID,...,2.00,51.0,0.0,3.000000e-01,0.023333,1.750000e-01,1.665335e-15,0.003333,6.661338e-16,1.0
227,0.0,0.100000,0.00,3.642857,0,5.0,1.5,3.550000,1.75,MID,...,1.00,1726.0,0.0,5.000000e-02,0.130000,2.500000e-02,1.665335e-15,0.130000,2.500000e-02,3.2
228,0.0,0.133333,0.00,1.918919,0,0.0,0.5,2.333333,1.00,FWD,...,0.75,-5317.0,0.0,5.000000e-02,0.080000,2.500000e-02,1.665335e-15,0.203333,5.000000e-02,1.0
229,0.0,0.066667,0.00,3.083333,0,1.0,1.0,2.700000,2.25,MID,...,1.25,-1566.0,0.0,2.109424e-15,0.073333,4.440892e-16,1.609823e-15,0.120000,6.661338e-16,1.8


In [52]:
# next gameweek
evaluate(model, df, predictions, y_test, all_gw=False)

MAE: 1.7593723631149467
MSE: 7.91061761300065
Pairwise accuracy: 0.7797101449275362
Pairwise accuracy @TOP100: 0.6212731957374606
Pairwise accuracy @TOP20: 0.5719045715047314
Avg pred vs avg true: 2.1 vs 2.05


In [53]:
# all remaining gameweeks
evaluate(model, df, predictions_all, y_test_all_remaining, all_gw=True)

MAE: 1.8135652952160894
MSE: 7.683204160286226
Pairwise accuracy: 0.7489625682459333
Pairwise accuracy @TOP100: 0.6133333333333333
Pairwise accuracy @TOP20: 0.7582417582417582
Avg pred vs avg true: 2.15 vs 2.05


In [54]:
pred_sum = preds_all_gameweeks["Pred"].sum()
fpl_sum = preds_all_gameweeks["FPL_points"].sum()
xp_sum = preds_all_gameweeks["xP"].sum()

pd.DataFrame([pred_sum, fpl_sum, xp_sum], index=["Pred", "FPL", "xP"], columns=["Sum"])

Unnamed: 0,Sum
Pred,4223.513067
FPL,4636.0
xP,4027.2


In [55]:
features

['Was_home',
 'Rating_difference',
 'Price',
 'Transfers_balance',
 'Avg_FPL_points',
 'Min_2',
 'Gls_2',
 'Ast_2',
 'xG_2',
 'xA_2',
 'Team_CS_2',
 'Team_score_2',
 'Opp_score_2',
 'Team_result_2',
 'FPL_points_2',
 'Min_4',
 'Gls_4',
 'Ast_4',
 'xG_4',
 'xA_4',
 'Team_CS_4',
 'Team_score_4',
 'Opp_score_4',
 'Team_result_4',
 'FPL_points_4',
 'Min_30',
 'Gls_30',
 'Ast_30',
 'xG_30',
 'xA_30',
 'Team_CS_30',
 'Team_score_30',
 'Opp_score_30',
 'Team_result_30',
 'FPL_points_30',
 'DEF',
 'FWD',
 'MID']

In [56]:
len(features)

38

# Predictions - next gameweek only

In [57]:
preds.head(30)

Unnamed: 0,Name_original,GW,Season,Team,Opponent,Was_home,Team_rating,Opp_rating,Price,FPL_pos,FPL_points,xP,Pred
68,Erling Haaland,31,2022-23,Manchester City,Leicester City,1.0,2060.531738,1722.042969,12.4,FWD,2.0,6.8,6.08425
90,Ivan Toney,31,2022-23,Brentford,Wolves,0.0,1782.324219,1717.951782,7.8,FWD,2.0,3.0,5.406989
31,Bukayo Saka,31,2022-23,Arsenal,West Ham,0.0,1946.8479,1751.608521,8.6,MID,0.0,6.0,5.265573
150,Gabriel Martinelli Silva,31,2022-23,Arsenal,West Ham,0.0,1946.8479,1751.608521,6.8,MID,5.0,3.6,5.065613
22,Benjamin White,31,2022-23,Arsenal,West Ham,0.0,1946.8479,1751.608521,4.9,DEF,6.0,5.4,4.249402
128,Kevin De Bruyne,31,2022-23,Manchester City,Leicester City,1.0,2060.531738,1722.042969,12.2,MID,3.0,3.5,4.247989
149,Martin Ødegaard,31,2022-23,Arsenal,West Ham,0.0,1946.8479,1751.608521,6.7,MID,10.0,6.5,4.213922
78,Gabriel dos Santos Magalhães,31,2022-23,Arsenal,West Ham,0.0,1946.8479,1751.608521,5.3,DEF,1.0,2.2,4.089115
79,Gabriel Fernando de Jesus,31,2022-23,Arsenal,West Ham,0.0,1946.8479,1751.608521,8.1,FWD,7.0,6.2,4.057181
207,Solly March,31,2022-23,Brighton,Chelsea,0.0,1826.634155,1827.184204,5.2,MID,5.0,1.8,3.959532


In [58]:
preds[preds["FPL_pos"] == "GK"].head(20)

Unnamed: 0,Name_original,GW,Season,Team,Opponent,Was_home,Team_rating,Opp_rating,Price,FPL_pos,FPL_points,xP,Pred


In [59]:
preds[preds["Team"] == "Manchester City"].head(20)

Unnamed: 0,Name_original,GW,Season,Team,Opponent,Was_home,Team_rating,Opp_rating,Price,FPL_pos,FPL_points,xP,Pred
68,Erling Haaland,31,2022-23,Manchester City,Leicester City,1.0,2060.531738,1722.042969,12.4,FWD,2.0,6.8,6.08425
128,Kevin De Bruyne,31,2022-23,Manchester City,Leicester City,1.0,2060.531738,1722.042969,12.2,MID,3.0,3.5,4.247989
118,John Stones,31,2022-23,Manchester City,Leicester City,1.0,2060.531738,1722.042969,5.6,DEF,0.0,1.0,3.149311
197,Rodrigo Hernandez,31,2022-23,Manchester City,Leicester City,1.0,2060.531738,1722.042969,5.6,MID,1.0,2.0,3.025287
123,Julián Álvarez,31,2022-23,Manchester City,Leicester City,1.0,2060.531738,1722.042969,6.0,FWD,2.0,1.0,2.945723
195,Riyad Mahrez,31,2022-23,Manchester City,Leicester City,1.0,2060.531738,1722.042969,7.3,MID,10.0,4.9,2.936882
143,Manuel Akanji,31,2022-23,Manchester City,Leicester City,1.0,2060.531738,1722.042969,5.0,DEF,2.0,1.0,2.846375
91,Jack Grealish,31,2022-23,Manchester City,Leicester City,1.0,2060.531738,1722.042969,7.3,MID,0.0,1.5,2.630676
198,Rúben Gato Alves Dias,31,2022-23,Manchester City,Leicester City,1.0,2060.531738,1722.042969,6.0,DEF,0.0,1.0,2.254357
24,Bernardo Veiga de Carvalho e Silva,31,2022-23,Manchester City,Leicester City,1.0,2060.531738,1722.042969,6.7,MID,1.0,1.0,2.211246


# Predictions - all remaining gameweeks

In [60]:
preds_all_gameweeks.head(15)

Unnamed: 0,Name_original,GW,Season,Team,Opponent,Was_home,Team_rating,Opp_rating,Price,FPL_pos,FPL_points,xP,Pred
1394,Mohamed Salah,34,2022-23,Liverpool,Tottenham,1.0,1933.697876,1817.128296,13.0,MID,7.0,5.5,7.782479
591,Erling Haaland,35,2022-23,Manchester City,Leeds United,1.0,2069.102051,1662.209717,12.0,FWD,0.0,5.0,7.092627
593,Erling Haaland,37,2022-23,Manchester City,Chelsea,1.0,2087.472656,1794.31189,12.4,FWD,1.0,1.4,6.928568
1397,Mohamed Salah,37,2022-23,Liverpool,Aston Villa,1.0,1955.095703,1812.286499,13.1,MID,5.0,2.7,6.704248
1395,Mohamed Salah,35,2022-23,Liverpool,Brentford,1.0,1940.118774,1782.053101,13.0,MID,10.0,9.6,6.473321
1082,Kevin De Bruyne,35,2022-23,Manchester City,Leeds United,1.0,2069.102051,1662.209717,12.0,MID,13.0,5.7,6.187939
1396,Mohamed Salah,36,2022-23,Liverpool,Leicester City,0.0,1944.225098,1709.140381,12.8,MID,14.0,10.2,6.098482
588,Erling Haaland,31,2022-23,Manchester City,Leicester City,1.0,2060.531738,1722.042969,12.4,FWD,2.0,6.8,6.08425
589,Erling Haaland,33,2022-23,Manchester City,Arsenal,1.0,2057.953369,1928.582397,12.3,FWD,14.0,10.6,5.948582
244,Bruno Borges Fernandes,36,2022-23,Manchester Utd,Wolves,1.0,1841.589111,1725.502563,9.4,MID,6.0,7.0,5.792518


In [61]:
preds_all_gameweeks[preds_all_gameweeks["Name_original"] == "Mohamed Salah"].sort_values(by=["GW"])

Unnamed: 0,Name_original,GW,Season,Team,Opponent,Was_home,Team_rating,Opp_rating,Price,FPL_pos,FPL_points,xP,Pred
1392,Mohamed Salah,32,2022-23,Liverpool,Nott'ham Forest,1.0,1926.870483,1623.869141,12.9,MID,7.0,3.3,5.372824
1393,Mohamed Salah,33,2022-23,Liverpool,West Ham,0.0,1928.467651,1771.272461,13.0,MID,2.0,1.5,5.483853
1394,Mohamed Salah,34,2022-23,Liverpool,Tottenham,1.0,1933.697876,1817.128296,13.0,MID,7.0,5.5,7.782479
1395,Mohamed Salah,35,2022-23,Liverpool,Brentford,1.0,1940.118774,1782.053101,13.0,MID,10.0,9.6,6.473321
1396,Mohamed Salah,36,2022-23,Liverpool,Leicester City,0.0,1944.225098,1709.140381,12.8,MID,14.0,10.2,6.098482
1397,Mohamed Salah,37,2022-23,Liverpool,Aston Villa,1.0,1955.095703,1812.286499,13.1,MID,5.0,2.7,6.704248
1398,Mohamed Salah,38,2022-23,Liverpool,Southampton,0.0,1950.095581,1616.916626,13.1,MID,5.0,7.6,5.592773


In [62]:
preds_all_gameweeks[preds_all_gameweeks["Name_original"] == "Erling Haaland"].sort_values(by=["GW"])

Unnamed: 0,Name_original,GW,Season,Team,Opponent,Was_home,Team_rating,Opp_rating,Price,FPL_pos,FPL_points,xP,Pred
588,Erling Haaland,31,2022-23,Manchester City,Leicester City,1.0,2060.531738,1722.042969,12.4,FWD,2.0,6.8,6.08425
594,Erling Haaland,32,2022-23,Manchester City,Brighton,0.0,2089.150146,1826.267822,12.4,FWD,5.0,6.6,5.139112
589,Erling Haaland,33,2022-23,Manchester City,Arsenal,1.0,2057.953369,1928.582397,12.3,FWD,14.0,10.6,5.948582
590,Erling Haaland,34,2022-23,Manchester City,Fulham,0.0,2064.563232,1717.714233,12.4,FWD,8.0,8.0,5.153257
591,Erling Haaland,35,2022-23,Manchester City,Leeds United,1.0,2069.102051,1662.209717,12.0,FWD,0.0,5.0,7.092627
592,Erling Haaland,36,2022-23,Manchester City,Everton,0.0,2071.091064,1694.975464,12.4,FWD,7.0,3.6,5.540182
593,Erling Haaland,37,2022-23,Manchester City,Chelsea,1.0,2087.472656,1794.31189,12.4,FWD,1.0,1.4,6.928568
595,Erling Haaland,38,2022-23,Manchester City,Brentford,0.0,2083.640137,1807.119507,12.4,FWD,0.0,0.0,4.554991


# Saving to file

In [63]:
preds_all_gameweeks.to_csv("predictions/all_predictions.csv", index=False)

In [64]:
pickle.dump(model, open("models/GBR_positional.pkl", 'wb'))

# Feature importance and influence

In [65]:
# explainer = shap.Explainer(model.predict, X_test_all_remaining)
# shap_values = explainer(X_test_all_remaining)

In [66]:
# shap.plots.bar(shap_values, max_display=15)

In [67]:
# shap.plots.beeswarm(shap_values, max_display=15)

In [68]:
# # explaining Erling Haaland`s score in gameweek 31
# shap.plots.bar(shap_values[69], max_display=15)

In [69]:
# # Salah vs Nottingham Forest
# shap.plots.waterfall(shap_values[1396])