
# Imports

In [1]:
import os
os.chdir("D:\PulpitE\FPL_ML")

In [2]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import shap
import pickle

from PositionalModel import PositionalModel
from sklearn.ensemble import GradientBoostingRegressor
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor

from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.metrics import r2_score

from sklearn.preprocessing import LabelEncoder
from sklearn.neighbors import LocalOutlierFactor
import copy
from datetime import datetime

from config import *
pd.set_option('display.max_columns', 100)

import warnings
warnings.filterwarnings("ignore", message="Empty dataset at worker")

# Loading data
Data consists of all PL players logs for 2021-22 and 2022-23 seasons\
A single log is just a summary of player performance in a particular match

In [3]:
# loading csv
df_original = pd.read_csv('data/previous/FPL_logs.csv')

df = df_original.copy()

In [4]:
df["Team"].unique()

array(['Brighton', 'West Ham', 'Brentford', 'Arsenal', 'Manchester Utd',
       'Everton', 'Sheffield Utd', 'Bournemouth', 'Crystal Palace',
       'Fulham', 'Newcastle Utd', 'Liverpool', 'Luton Town', 'Burnley',
       "Nott'ham Forest", 'Southampton', 'Chelsea', 'Tottenham',
       'Manchester City', 'Aston Villa', 'Norwich City', 'Wolves',
       'Watford', 'Leicester City', 'Leeds United'], dtype=object)

In [5]:
df = df.drop(columns=["FPL_GW",
    "Min_points",
    "Clean_name",
    "FPL_xA",
    "FPL_xGC",
    "FPL_xG",
    "FPL_xGI"]
    )
df = df.dropna()

In [6]:
df.columns

Index(['Date', 'Day', 'Venue', 'Team', 'Opponent', 'Name', 'Start', 'Pos',
       'Min', 'Gls', 'Ast', 'PK', 'PKatt', 'Sh', 'SoT', 'CrdY', 'CrdR',
       'Touches', 'Tkl', 'Int', 'Blocks', 'xG', 'npxG', 'xA', 'SCA', 'GCA',
       'Cmp', 'Att', 'Cmp%', 'PrgP', 'Carries', 'PrgC', 'T_att', 'T_succ',
       'Season', 'WDL', 'Team_score', 'Opp_score', 'Team_CS', 'Was_home', 'GW',
       'FPL_name', 'FPL_team', 'FPL_opp_team', 'Name_original', 'FPL_pos',
       'FPL_ast', 'Bonus', 'Bps', 'Player_CS', 'Creativity', 'Element',
       'Fixture', 'Player_GC', 'Goals_scored', 'ICT_index', 'Influence',
       'Kickoff_time', 'FPL_min', 'Own_goals', 'Penalties_missed',
       'Penalties_saved', 'Saves', 'Selected', 'Threat', 'FPL_points',
       'Transfers_balance', 'Transfers_in', 'Transfers_out', 'Price', 'xP',
       'Opp_rating', 'Team_rating', 'Avg_FPL_points'],
      dtype='object')

In [7]:
df_original["Name"].unique().size

461

In [8]:
df.shape

(18206, 74)

In [9]:
df["Season"].unique()

array(['2021-22', '2022-23', '2023-24'], dtype=object)

In [10]:
# df = df[~df["FPL_pos"].isin(["GK"])]

In [11]:
df["Season"].unique()

array(['2021-22', '2022-23', '2023-24'], dtype=object)

In [12]:
df = df[df["Season"].isin(['2021-22', '2022-23'])]

In [13]:
df = df.sort_values(by=["Name", "Date"])

In [14]:
# df = df.reset_index()
# lof = LocalOutlierFactor(n_neighbors=3, contamination=0.01)
# outliers = lof.fit_predict(df.select_dtypes(include=np.number))
# outliers_indices = np.where(outliers == -1)[0]
# df = df.drop(outliers_indices, axis=0)

# Feature selection
Model will use a set of standard features and moving averages for some other feature values. In this case we will use moving averages for last 2, 4 and 30 gameweeks.

In [15]:
rolling_gameweeks = [2, 4, 30]
# rolling_gameweeks = [5]

# to_predict = "FPL_points"
to_predict = "xP"
standard_features = [
             "Was_home", # Home/Away
             # "Team_rating", 
             "Opp_rating", 
             "Rating_difference", # Team ratings
             "Price", # FPl price
             "Transfers_balance",
             # "Transfers_result"
             "Avg_FPL_points",
             "Selected"
             ]
cat_features = ["FPL_pos", 
                "Pos",
                # "Team",
                # "Opponent"
               ]
features_to_roll_outfield = [
                "Min", 
                # "Start", # time played
                'Gls', 
                'Sh', 'SoT', # Goals
                'Ast', # Assists
                # 'CrdY', 'CrdR', # Cards
                "xG", 'xA', # Expected
                'Team_CS', # Defence
                'Team_score', 'Opp_score', 'Team_result', # Team form
                "Player_GC",
                # "xGPoints", "CSPoints", # Position-scaled
                # "Cmp%", "PrgP", "PrgC", "T_succ",
                # 'bonus', 'bps', # Bonus
                # 'ICT_index', # ICT
                "FPL_points", 
                "xP"
                # "Baseline_points", "Bonus" # FPL points
            ]
features_to_roll_gk = [
                "Min", 
                # "Start", # time played
                # 'Gls', 
                # 'Sh', 'SoT', # Goals
                # 'Ast', # Assists
                # 'CrdY', 'CrdR', # Cards
                # "xG", 'xA', # Expected
                'Team_CS', # Defence
                "Player_GC",
                'Team_score', 'Opp_score', 'Team_result', # Team form
                "Saves",
                # "xGPoints", "CSPoints", # Position-scaled
                # "Cmp%", "PrgP", "PrgC", "T_succ",
                # 'bonus', 'bps', # Bonus
                # 'ICT_index', # ICT
                "FPL_points", 
                # "Baseline_points", "Bonus" # FPL points
            ]
features_to_roll = np.unique([features_to_roll_gk + features_to_roll_outfield])
info = ["Name_original", "GW", "Season", "Team", "Opponent", "Was_home", "Team_rating", "Opp_rating", "Price", "FPL_pos", "FPL_points", "Avg_FPL_points"]

In [16]:
features_to_roll

array(['Ast', 'FPL_points', 'Gls', 'Min', 'Opp_score', 'Player_GC',
       'Saves', 'Sh', 'SoT', 'Team_CS', 'Team_result', 'Team_score', 'xA',
       'xG', 'xP'], dtype='<U11')

In [17]:
# df[df["Name"] == "Erling-Haaland"]

# Feature engineering

In [18]:
df["Rating_difference"] = df["Team_rating"] / df["Opp_rating"]
df["Baseline_points"] = df["FPL_points"] - df["Bonus"]

In [19]:
def calculate_team_points(row):
    # win - 3 points
    if row['Team_score'] > row['Opp_score']:
        return 3
    # draw - 1 point
    elif row['Team_score'] == row['Opp_score']:
        return 1
    # loss - 0 points
    else:
        return 0
    
df['Team_result'] = df.apply(calculate_team_points, axis=1)

In [20]:
def add_rolling_features(df, standard_features, features_to_roll_outfield, features_to_roll_gk):
    features_outfield = copy.copy(standard_features)
    features_gk = copy.copy(standard_features)
    
    for r in rolling_gameweeks:
        form_means = df.groupby(["Name"])[features_to_roll].rolling(r, min_periods=1).mean().groupby(["Name"]).shift(1).reset_index()
        # print(form_means[form_means["Name"] == "Erling-Haaland"])
        form_means = form_means.fillna(method='bfill') # slightly incorrect, better to drop Nan
        form_means.columns = [f'{col}{"_"}{r}' for col in form_means.columns]
        # features += form_means.columns.tolist()
        features_outfield += [f'{col}{"_"}{r}' for col in features_to_roll_outfield]
        features_gk += [f'{col}{"_"}{r}' for col in features_to_roll_gk]
        # features = list(filter(lambda x: x not in ["Name_" + str(r)], features))
        df = pd.concat([df.reset_index(), form_means], axis=1)
        # df = df.merge(form_means, left_index=True, right_index=True)
        df = df.drop([col for col in df.columns if col.startswith('level')], axis=1)
        
    return df.reset_index(), features_outfield, features_gk

In [21]:
def ohe(df, ohe_columns, features_outfield, features_gk):
    # one hot encoding
    for c in ohe_columns:
        ohe_c = pd.get_dummies(df[c], dtype="int64")
        df = pd.concat([df, ohe_c], axis=1)

        features_outfield += ohe_c.columns.tolist()
        features_gk += ohe_c.columns.tolist()
        
    return df, features_outfield, features_gk

In [22]:
def label_encoding(df, column_to_encode):
    
    mapping_dict = {
        'Y': 1, # Starting eleven
        'Y*': 1, # Starting eleven as captain
        'N': 0, # Not in starting eleven
    }
    
    df[column_to_encode] = df[column_to_encode].map(mapping_dict)
    
    return df

In [23]:
df = label_encoding(df, "Start")

In [24]:
df, features_outfield, features_gk = add_rolling_features(df, standard_features, features_to_roll_outfield, features_to_roll_gk)

In [25]:
# df, features_outfield, features_gk = ohe(df, ["FPL_pos"], features_outfield, features_gk)

In [26]:
# dropping unwanted columns
# features = [col for col in features if not col.startswith('level')]
df = df[np.unique(cat_features + features_outfield + features_gk + info + [to_predict])]

In [27]:
df = df[pd.to_numeric(df["GW"], errors="coerce").notna()]
df["GW"] = df["GW"].astype("uint64")

In [28]:
# dropping NaNs
df = df.dropna(axis=0)

In [29]:
# df = df.set_index(["Name_original", "Season", "GW"], drop=False)

In [30]:
df.columns

Index(['Ast_2', 'Ast_30', 'Ast_4', 'Avg_FPL_points', 'FPL_points',
       'FPL_points_2', 'FPL_points_30', 'FPL_points_4', 'FPL_pos', 'GW',
       'Gls_2', 'Gls_30', 'Gls_4', 'Min_2', 'Min_30', 'Min_4', 'Name_original',
       'Opp_rating', 'Opp_score_2', 'Opp_score_30', 'Opp_score_4', 'Opponent',
       'Player_GC_2', 'Player_GC_30', 'Player_GC_4', 'Pos', 'Price',
       'Rating_difference', 'Saves_2', 'Saves_30', 'Saves_4', 'Season',
       'Selected', 'Sh_2', 'Sh_30', 'Sh_4', 'SoT_2', 'SoT_30', 'SoT_4', 'Team',
       'Team_CS_2', 'Team_CS_30', 'Team_CS_4', 'Team_rating', 'Team_result_2',
       'Team_result_30', 'Team_result_4', 'Team_score_2', 'Team_score_30',
       'Team_score_4', 'Transfers_balance', 'Was_home', 'xA_2', 'xA_30',
       'xA_4', 'xG_2', 'xG_30', 'xG_4', 'xP', 'xP_2', 'xP_30', 'xP_4'],
      dtype='object')

In [31]:
df.shape

(17422, 62)

In [32]:
# df = df[~df["FPL_pos"].isin(["GK"])] # only outfield players

# Data split into train and valid
Let's take entire 2021-22 season and 30 first gameweeks of 2022-23 season as training data and 8 last gameweeks of that season as valid data (~10% of all rows).

In [33]:
def shrink_df_to_top_players(df, n_players, min_fixtures):
    # Calculate average scores
    average_scores = df.groupby('Name_original')[to_predict].mean()
    
    # Calculate player counts
    player_counts = df['Name_original'].value_counts()
    
    # Filter out players with less than min_fixtures occurrences
    popular_players = player_counts[player_counts >= min_fixtures].index
    
    print("Unique players with min_fixtures:", popular_players.size)

    # Sort players by average scores
    sorted_players = average_scores.sort_values(ascending=False)

    # Get the top n_players players
    top_popular_players = sorted_players[sorted_players.index.isin(popular_players)].head(n_players)

    # Filter the original DataFrame
    top_players_df = df[df['Name_original'].isin(top_popular_players.index)]
    
    print("Unique players left:", top_players_df['Name_original'].unique().size)
    
    return top_players_df

In [34]:
CUT_OFF_GAMEWEEK = 31
SEASON_TO_PREDICT = "2022-23"

In [35]:
df_train = shrink_df_to_top_players(df, 550, 10)
# df_train["Name_original"].unique()

Unique players with min_fixtures: 385
Unique players left: 385


In [36]:
df = df.set_index(["Name_original", "Season", "GW"], drop=False)

In [37]:
# training data
X_train = df_train[((df_train["Season"] < SEASON_TO_PREDICT) | (df_train["GW"] <= CUT_OFF_GAMEWEEK - 1))].reset_index(drop=True)
y_train = df_train[((df_train["Season"] < SEASON_TO_PREDICT) | (df_train["GW"] <= CUT_OFF_GAMEWEEK - 1))].reset_index(drop=True)

In [38]:
# only 31st gameweek
X_test = df[(df["GW"] == CUT_OFF_GAMEWEEK) & (df["Season"] == SEASON_TO_PREDICT) ].reset_index(drop=True)
y_test = df[(df["GW"] == CUT_OFF_GAMEWEEK) & (df["Season"] == SEASON_TO_PREDICT) ].reset_index(drop=True)

In [39]:
# all remaining gameweeks
X_test_all_remaining = df[(df["GW"] >= CUT_OFF_GAMEWEEK) & (df["Season"] == SEASON_TO_PREDICT) ].reset_index(drop=True)
y_test_all_remaining = df[(df["GW"] >= CUT_OFF_GAMEWEEK) & (df["Season"] == SEASON_TO_PREDICT) ].reset_index(drop=True)

In [40]:
X_train.shape, X_test.shape, X_test_all_remaining.shape

((15187, 62), (264, 62), (2170, 62))

# Model

In [41]:
# model = GradientBoostingRegressor(random_state=42)
model = PositionalModel(features_gk, features_outfield, cat_features, to_predict)
# model = RandomForestRegressor(random_state=42)

In [42]:
%%time
model.fit(
    X_train,
    y_train,
    X_test_all_remaining,
    y_test_all_remaining
)

CPU times: total: 7min 17s
Wall time: 1min 42s


# Getting predictions

In [43]:
def get_predictions(model, df, X, all_remaining=False):
    # make predictions on the test data and glues them to the rest of the dataframe
    predictions = model.predict(X)
    
    if all_remaining:
        df_predictions = df[(df["GW"] >= CUT_OFF_GAMEWEEK) & (df["Season"] == SEASON_TO_PREDICT)].reset_index(drop=True)
    else:
        df_predictions = df[(df["GW"] == CUT_OFF_GAMEWEEK) & (df["Season"] == SEASON_TO_PREDICT)].reset_index(drop=True)
    
    df_predictions.loc[:, "Pred"] = predictions
    preds = df_predictions[info + [to_predict] + ["Pred"]].sort_values(by=["Pred"], ascending = False)
    
    return preds, predictions

In [44]:
X_test.columns

Index(['Ast_2', 'Ast_30', 'Ast_4', 'Avg_FPL_points', 'FPL_points',
       'FPL_points_2', 'FPL_points_30', 'FPL_points_4', 'FPL_pos', 'GW',
       'Gls_2', 'Gls_30', 'Gls_4', 'Min_2', 'Min_30', 'Min_4', 'Name_original',
       'Opp_rating', 'Opp_score_2', 'Opp_score_30', 'Opp_score_4', 'Opponent',
       'Player_GC_2', 'Player_GC_30', 'Player_GC_4', 'Pos', 'Price',
       'Rating_difference', 'Saves_2', 'Saves_30', 'Saves_4', 'Season',
       'Selected', 'Sh_2', 'Sh_30', 'Sh_4', 'SoT_2', 'SoT_30', 'SoT_4', 'Team',
       'Team_CS_2', 'Team_CS_30', 'Team_CS_4', 'Team_rating', 'Team_result_2',
       'Team_result_30', 'Team_result_4', 'Team_score_2', 'Team_score_30',
       'Team_score_4', 'Transfers_balance', 'Was_home', 'xA_2', 'xA_30',
       'xA_4', 'xG_2', 'xG_30', 'xG_4', 'xP', 'xP_2', 'xP_30', 'xP_4'],
      dtype='object')

In [45]:
predictions_train = model.predict(X_train)

In [46]:
preds, predictions = get_predictions(model, df, X_test)

In [47]:
preds_all_gameweeks, predictions_all = get_predictions(model, df, X_test_all_remaining, True)

In [48]:
df.columns

Index(['Ast_2', 'Ast_30', 'Ast_4', 'Avg_FPL_points', 'FPL_points',
       'FPL_points_2', 'FPL_points_30', 'FPL_points_4', 'FPL_pos', 'GW',
       'Gls_2', 'Gls_30', 'Gls_4', 'Min_2', 'Min_30', 'Min_4', 'Name_original',
       'Opp_rating', 'Opp_score_2', 'Opp_score_30', 'Opp_score_4', 'Opponent',
       'Player_GC_2', 'Player_GC_30', 'Player_GC_4', 'Pos', 'Price',
       'Rating_difference', 'Saves_2', 'Saves_30', 'Saves_4', 'Season',
       'Selected', 'Sh_2', 'Sh_30', 'Sh_4', 'SoT_2', 'SoT_30', 'SoT_4', 'Team',
       'Team_CS_2', 'Team_CS_30', 'Team_CS_4', 'Team_rating', 'Team_result_2',
       'Team_result_30', 'Team_result_4', 'Team_score_2', 'Team_score_30',
       'Team_score_4', 'Transfers_balance', 'Was_home', 'xA_2', 'xA_30',
       'xA_4', 'xG_2', 'xG_30', 'xG_4', 'xP', 'xP_2', 'xP_30', 'xP_4'],
      dtype='object')

# Custom metric
Such models are usually used to compare players and decide who to pick for upcoming gameweek(s). For each player pair in a subset of players model can either predict the outcome well (e.g. player A scores more than player B and model predicts exactly that) or predict wrongly (e.g. player A scores more than player B but model predicts player B > player A). Pairwise_accuracy returns the percentage of corrected predicted pairs. Pairwise_accuracy_topX is a variation of this metric calculated only for X highest scoring players of last two seasons.

In [49]:
def get_top_performer_names(df, no_top, no_gws):
    # takes no_top players that recorded highest average FPL points in no_gws last gameweeks
    return df.reset_index(drop=True).groupby("Name_original")[to_predict].mean().groupby("Name_original").tail(no_gws).sort_values(ascending=False).head(no_top).index.to_list()

In [50]:
def pairwise_accuracy(true_scores, predicted_scores):
    if len(predicted_scores) != len(true_scores):
        raise ValueError("The length of predicted_scores and true_scores must be the same.")

    num_pairs = 0
    num_correct_pairs = 0

    for i in range(len(predicted_scores)):
        for j in range(i + 1, len(predicted_scores)):
            # print(i)
            # print(predicted_scores)
            # print(predicted_scores[i])
            # Check if the predicted order matches the true order
            if (predicted_scores[i] > predicted_scores[j] and true_scores[i] >= true_scores[j]) or \
               (predicted_scores[i] < predicted_scores[j] and true_scores[i] <= true_scores[j]):
                num_correct_pairs += 1
            num_pairs += 1

    pairwise_accuracy = num_correct_pairs / num_pairs
    return pairwise_accuracy

In [51]:
def get_top_performers(df, top_x, all_gw=False):
    if all_gw:
        top_performers = df[(df["GW"] == CUT_OFF_GAMEWEEK) & (df["Season"] == SEASON_TO_PREDICT) & (df["Name_original"].isin(get_top_performer_names(df, top_x, 50)))]
    else:
        top_performers = df[(df["GW"] >= CUT_OFF_GAMEWEEK) & (df["Season"] == SEASON_TO_PREDICT) & (df["Name_original"].isin(get_top_performer_names(df, top_x, 50)))]

    X = top_performers.reset_index(drop=True)
    y = np.array(top_performers[to_predict].reset_index(drop=True))
    
    return X, y

In [52]:
def pairwise_accuracy_topX(model, df, top_x, all_gw=False):
    # pairwise_accuracy for top_X players
    if all_gw:
        top_performers = df[(df["GW"] == CUT_OFF_GAMEWEEK) & (df["Season"] == SEASON_TO_PREDICT) & (df["Name_original"].isin(get_top_performer_names(df, top_x, 50)))]
    else:
        top_performers = df[(df["GW"] >= CUT_OFF_GAMEWEEK) & (df["Season"] == SEASON_TO_PREDICT) & (df["Name_original"].isin(get_top_performer_names(df, top_x, 50)))]

    X = top_performers.reset_index(drop=True)
    y = np.array(top_performers[to_predict].reset_index(drop=True))
    preds = model.predict(X)
    
    return  pairwise_accuracy(y, preds)

In [53]:
def mae_topX(model, df, top_x, all_gw=False):
    X, y = get_top_performers(df, top_x, all_gw)
    preds = model.predict(X)
    
    return mean_absolute_error(y, preds)

In [54]:
def mse_topX(model, df, top_x, all_gw=False):
    X, y = get_top_performers(df, top_x, all_gw)
    preds = model.predict(X)
    
    return mean_squared_error(y, preds)

In [55]:
def r2_topX(model, df, top_x, all_gw=False):
    X, y = get_top_performers(df, top_x, all_gw)
    preds = model.predict(X)
    
    return r2_score(y, preds)

# Evaluation

In [56]:
def evaluate(model, df, predictions, y_true, all_gw=False):
    # function to calculate different metrics for a given model
    mae = mean_absolute_error(y_true["FPL_points"], predictions)
    mae_top100 = mae_topX(model, df, 100, all_gw)
    mae_top20 = mae_topX(model, df, 20, all_gw)
    
    mse = mean_squared_error(y_true["FPL_points"], predictions)
    mse_top100 = mse_topX(model, df, 100, all_gw)
    mse_top20 = mse_topX(model, df, 20, all_gw)

    pairwise_acc = pairwise_accuracy(y_true["FPL_points"], predictions)
    pairwise_accuracy_top100 = pairwise_accuracy_topX(model, df, 100, all_gw)
    pairwise_accuracy_top20 = pairwise_accuracy_topX(model, df, 20, all_gw)
    
    r2 = r2_score(y_true["FPL_points"], predictions)
    r2_top100 = r2_topX(model, df, 100, all_gw)
    r2_top20 = r2_topX(model, df, 20, all_gw)
    
    return pd.DataFrame( {'MAE': [mae, mae_top100, mae_top20], 
                          'MSE': [mse, mse_top100, mse_top20], 
                          'PWA': [pairwise_acc, pairwise_accuracy_top100, pairwise_accuracy_top20],
                          'R2': [r2, r2_top100, r2_top20]},
                        index=["All", "Top100", "Top20"])

In [57]:
# next gameweek
# evaluate(model, df, predictions, y_test, all_gw=False)

In [78]:
%%time
# training dataset
evaluation = evaluate(model, df, predictions_train, y_train, all_gw=False)
evaluation

CPU times: total: 8min 49s
Wall time: 9min 10s


Unnamed: 0,MAE,MSE,PWA,R2
All,1.432736,4.552215,0.893029,0.501491
Top100,2.055498,6.957531,0.623291,0.095703
Top20,2.500011,8.477576,0.620952,0.097543


In [59]:
%%time
# all remaining gameweeks
evaluation = evaluate(model, df, predictions_all, y_test_all_remaining, all_gw=True)
evaluation

CPU times: total: 10.5 s
Wall time: 10.9 s


Unnamed: 0,MAE,MSE,PWA,R2
All,1.833332,7.310155,0.791357,0.193995
Top100,2.07474,7.222351,0.627975,0.07489
Top20,2.118241,5.325067,0.705882,0.24222


In [60]:
pred_sum = preds_all_gameweeks["Pred"].sum()
fpl_sum = preds_all_gameweeks["FPL_points"].sum()
xp_sum = preds_all_gameweeks["xP"].sum()

pd.DataFrame([pred_sum, fpl_sum, xp_sum], index=["Pred", "FPL", "xP"], columns=["Sum"])

Unnamed: 0,Sum
Pred,5254.625008
FPL,5242.0
xP,5640.7


In [61]:
model.model_outfield.get_all_params()

{'nan_mode': 'Min',
 'eval_metric': 'RMSE',
 'combinations_ctr': ['Borders:CtrBorderCount=15:CtrBorderType=Uniform:TargetBorderCount=1:TargetBorderType=MinEntropy:Prior=0/1:Prior=0.5/1:Prior=1/1',
  'Counter:CtrBorderCount=15:CtrBorderType=Uniform:Prior=0/1'],
 'iterations': 1000,
 'sampling_frequency': 'PerTree',
 'fold_permutation_block': 0,
 'leaf_estimation_method': 'Newton',
 'od_pval': 0,
 'counter_calc_method': 'SkipTest',
 'grow_policy': 'SymmetricTree',
 'penalties_coefficient': 1,
 'boosting_type': 'Plain',
 'model_shrink_mode': 'Constant',
 'feature_border_type': 'GreedyLogSum',
 'ctr_leaf_count_limit': 18446744073709551615,
 'bayesian_matrix_reg': 0.10000000149011612,
 'one_hot_max_size': 2,
 'eval_fraction': 0,
 'force_unit_auto_pair_weights': False,
 'l2_leaf_reg': 5,
 'random_strength': 1,
 'od_type': 'Iter',
 'rsm': 1,
 'boost_from_average': True,
 'max_ctr_complexity': 4,
 'model_size_reg': 0.5,
 'simple_ctr': ['Borders:CtrBorderCount=15:CtrBorderType=Uniform:TargetBor

In [62]:
features_outfield

['Was_home',
 'Opp_rating',
 'Rating_difference',
 'Price',
 'Transfers_balance',
 'Avg_FPL_points',
 'Selected',
 'Min_2',
 'Gls_2',
 'Sh_2',
 'SoT_2',
 'Ast_2',
 'xG_2',
 'xA_2',
 'Team_CS_2',
 'Team_score_2',
 'Opp_score_2',
 'Team_result_2',
 'Player_GC_2',
 'FPL_points_2',
 'xP_2',
 'Min_4',
 'Gls_4',
 'Sh_4',
 'SoT_4',
 'Ast_4',
 'xG_4',
 'xA_4',
 'Team_CS_4',
 'Team_score_4',
 'Opp_score_4',
 'Team_result_4',
 'Player_GC_4',
 'FPL_points_4',
 'xP_4',
 'Min_30',
 'Gls_30',
 'Sh_30',
 'SoT_30',
 'Ast_30',
 'xG_30',
 'xA_30',
 'Team_CS_30',
 'Team_score_30',
 'Opp_score_30',
 'Team_result_30',
 'Player_GC_30',
 'FPL_points_30',
 'xP_30']

In [63]:
features_gk

['Was_home',
 'Opp_rating',
 'Rating_difference',
 'Price',
 'Transfers_balance',
 'Avg_FPL_points',
 'Selected',
 'Min_2',
 'Team_CS_2',
 'Player_GC_2',
 'Team_score_2',
 'Opp_score_2',
 'Team_result_2',
 'Saves_2',
 'FPL_points_2',
 'Min_4',
 'Team_CS_4',
 'Player_GC_4',
 'Team_score_4',
 'Opp_score_4',
 'Team_result_4',
 'Saves_4',
 'FPL_points_4',
 'Min_30',
 'Team_CS_30',
 'Player_GC_30',
 'Team_score_30',
 'Opp_score_30',
 'Team_result_30',
 'Saves_30',
 'FPL_points_30']

# Predictions - next gameweek only

In [64]:
preds.head(30)

Unnamed: 0,Name_original,GW,Season,Team,Opponent,Was_home,Team_rating,Opp_rating,Price,FPL_pos,FPL_points,Avg_FPL_points,xP,Pred
80,Erling Haaland,31,2022-23,Manchester City,Leicester City,1.0,2060.531738,1722.042847,12.2,FWD,12,7.461538,9.8,6.529261
191,Mohamed Salah,31,2022-23,Liverpool,Leeds United,0.0,1925.248169,1694.963013,12.8,MID,14,6.828947,7.5,5.526532
107,Ivan Toney,31,2022-23,Brentford,Wolves,0.0,1782.324097,1717.951782,7.8,FWD,2,4.863636,4.0,5.199889
33,Bruno Borges Fernandes,31,2022-23,Manchester Utd,Nott'ham Forest,0.0,1878.940308,1635.660278,9.6,MID,3,4.8,8.1,5.018323
146,Kevin De Bruyne,31,2022-23,Manchester City,Leicester City,1.0,2060.531738,1722.042847,12.1,MID,8,5.507246,6.5,4.820758
202,Nick Pope,31,2022-23,Newcastle Utd,Aston Villa,0.0,1876.32251,1801.51355,5.4,GK,2,3.868421,2.0,4.748034
13,Andrew Robertson,31,2022-23,Liverpool,Leeds United,0.0,1925.248169,1694.963013,6.8,DEF,2,4.408451,2.0,4.603553
174,Gabriel Martinelli Silva,31,2022-23,Arsenal,West Ham,0.0,1946.8479,1751.608521,6.8,MID,5,5.333333,4.6,4.463053
120,Jarrod Bowen,31,2022-23,West Ham,Arsenal,1.0,1751.608521,1946.8479,8.0,MID,7,4.857143,3.0,4.427487
150,Kieran Trippier,31,2022-23,Newcastle Utd,Aston Villa,0.0,1876.32251,1801.51355,6.2,DEF,1,4.791667,2.6,4.388555


In [65]:
preds[preds["FPL_pos"] == "GK"].head(20)

Unnamed: 0,Name_original,GW,Season,Team,Opponent,Was_home,Team_rating,Opp_rating,Price,FPL_pos,FPL_points,Avg_FPL_points,xP,Pred
202,Nick Pope,31,2022-23,Newcastle Utd,Aston Villa,0.0,1876.32251,1801.51355,5.4,GK,2,3.868421,2.0,4.748034
10,Alisson Ramses Becker,31,2022-23,Liverpool,Leeds United,0.0,1925.248169,1694.963013,5.4,GK,2,4.565789,2.0,4.102623
101,Hugo Lloris,31,2022-23,Tottenham,Bournemouth,1.0,1851.721558,1665.276123,5.4,GK,2,3.859375,2.0,3.913668
60,David Raya Martin,31,2022-23,Brentford,Wolves,0.0,1782.324097,1717.951782,4.9,GK,5,4.078125,5.0,3.840729
139,José Malheiro de Sá,31,2022-23,Wolves,Brentford,1.0,1717.951782,1782.324097,5.0,GK,7,3.898734,7.0,3.806143
77,Emiliano Martínez Romero,31,2022-23,Aston Villa,Newcastle Utd,1.0,1801.51355,1876.32251,5.0,GK,6,3.692308,6.0,3.721304
2,Aaron Ramsdale,31,2022-23,Arsenal,West Ham,0.0,1946.8479,1751.608521,4.9,GK,1,3.727273,1.0,3.709931
162,Lukasz Fabianski,31,2022-23,West Ham,Arsenal,1.0,1751.608521,1946.8479,4.9,GK,2,3.460526,2.0,3.167967
137,Jordan Pickford,31,2022-23,Everton,Fulham,1.0,1687.977173,1710.906738,4.4,GK,2,3.253333,2.0,3.166197
72,Ederson Santana de Moraes,31,2022-23,Manchester City,Leicester City,1.0,2060.531738,1722.042847,5.4,GK,3,3.717949,3.0,3.105434


In [66]:
preds[preds["Team"] == "Manchester City"].head(20)

Unnamed: 0,Name_original,GW,Season,Team,Opponent,Was_home,Team_rating,Opp_rating,Price,FPL_pos,FPL_points,Avg_FPL_points,xP,Pred
80,Erling Haaland,31,2022-23,Manchester City,Leicester City,1.0,2060.531738,1722.042847,12.2,FWD,12,7.461538,9.8,6.529261
146,Kevin De Bruyne,31,2022-23,Manchester City,Leicester City,1.0,2060.531738,1722.042847,12.1,MID,8,5.507246,6.5,4.820758
108,Jack Grealish,31,2022-23,Manchester City,Leicester City,1.0,2060.531738,1722.042847,7.2,MID,7,3.071429,4.5,4.014416
225,Rodrigo Hernandez,31,2022-23,Manchester City,Leicester City,1.0,2060.531738,1722.042847,5.6,MID,4,3.44,2.0,3.50706
222,Riyad Mahrez,31,2022-23,Manchester City,Leicester City,1.0,2060.531738,1722.042847,7.3,MID,2,3.527027,3.9,3.404324
164,Manuel Akanji,31,2022-23,Manchester City,Leicester City,1.0,2060.531738,1722.042847,5.0,DEF,1,2.96875,1.0,3.379599
133,John Stones,31,2022-23,Manchester City,Leicester City,1.0,2060.531738,1722.042847,5.4,DEF,8,2.910714,2.0,3.295887
21,Aymeric Laporte,31,2022-23,Manchester City,Leicester City,1.0,2060.531738,1722.042847,5.6,DEF,2,3.225806,2.0,3.117026
72,Ederson Santana de Moraes,31,2022-23,Manchester City,Leicester City,1.0,2060.531738,1722.042847,5.4,GK,3,3.717949,3.0,3.105434
226,Rúben Gato Alves Dias,31,2022-23,Manchester City,Leicester City,1.0,2060.531738,1722.042847,6.0,DEF,2,2.617647,2.0,2.989073


# Predictions - all remaining gameweeks

In [67]:
preds_all_gameweeks.head(15)

Unnamed: 0,Name_original,GW,Season,Team,Opponent,Was_home,Team_rating,Opp_rating,Price,FPL_pos,FPL_points,Avg_FPL_points,xP,Pred
1557,Mohamed Salah,34,2022-23,Liverpool,Tottenham,1.0,1933.697876,1817.128296,13.0,MID,7,6.828947,6.5,6.946256
668,Erling Haaland,35,2022-23,Manchester City,Leeds United,1.0,2069.102051,1662.209595,12.4,FWD,2,7.461538,6.0,6.720062
669,Erling Haaland,36,2022-23,Manchester City,Everton,0.0,2071.091064,1694.975464,12.4,FWD,7,7.461538,4.6,6.678877
665,Erling Haaland,31,2022-23,Manchester City,Leicester City,1.0,2060.531738,1722.042847,12.2,FWD,12,7.461538,9.8,6.529261
1560,Mohamed Salah,37,2022-23,Liverpool,Aston Villa,1.0,1955.095703,1812.286377,13.1,MID,5,6.828947,3.7,6.519778
1555,Mohamed Salah,32,2022-23,Liverpool,Nott'ham Forest,1.0,1926.870483,1623.869141,12.9,MID,7,6.828947,4.3,6.463896
1558,Mohamed Salah,35,2022-23,Liverpool,Brentford,1.0,1940.118652,1782.053101,13.0,MID,10,6.828947,10.6,6.39548
2025,Trent Alexander-Arnold,32,2022-23,Liverpool,Nott'ham Forest,1.0,1926.870483,1623.869141,7.4,DEF,5,4.906667,6.3,6.302125
667,Erling Haaland,34,2022-23,Manchester City,Fulham,0.0,2064.563232,1717.714233,12.4,FWD,8,7.461538,9.0,6.24773
670,Erling Haaland,37,2022-23,Manchester City,Chelsea,1.0,2087.472656,1794.31189,12.4,FWD,1,7.461538,1.4,6.186495


In [68]:
preds_all_gameweeks[preds_all_gameweeks["Name_original"] == "Mohamed Salah"].sort_values(by=["GW"])

Unnamed: 0,Name_original,GW,Season,Team,Opponent,Was_home,Team_rating,Opp_rating,Price,FPL_pos,FPL_points,Avg_FPL_points,xP,Pred
1554,Mohamed Salah,31,2022-23,Liverpool,Leeds United,0.0,1925.248169,1694.963013,12.8,MID,14,6.828947,7.5,5.526532
1555,Mohamed Salah,32,2022-23,Liverpool,Nott'ham Forest,1.0,1926.870483,1623.869141,12.9,MID,7,6.828947,4.3,6.463896
1556,Mohamed Salah,33,2022-23,Liverpool,West Ham,0.0,1928.467651,1771.272461,13.0,MID,2,6.828947,2.5,5.697387
1557,Mohamed Salah,34,2022-23,Liverpool,Tottenham,1.0,1933.697876,1817.128296,13.0,MID,7,6.828947,6.5,6.946256
1558,Mohamed Salah,35,2022-23,Liverpool,Brentford,1.0,1940.118652,1782.053101,13.0,MID,10,6.828947,10.6,6.39548
1559,Mohamed Salah,36,2022-23,Liverpool,Leicester City,0.0,1944.225098,1709.140259,13.1,MID,12,6.828947,10.2,5.703812
1560,Mohamed Salah,37,2022-23,Liverpool,Aston Villa,1.0,1955.095703,1812.286377,13.1,MID,5,6.828947,3.7,6.519778
1561,Mohamed Salah,38,2022-23,Liverpool,Southampton,0.0,1950.095459,1616.916626,13.1,MID,5,6.828947,8.6,5.999616


In [69]:
preds_all_gameweeks[preds_all_gameweeks["Name_original"] == "Erling Haaland"].sort_values(by=["GW"])

Unnamed: 0,Name_original,GW,Season,Team,Opponent,Was_home,Team_rating,Opp_rating,Price,FPL_pos,FPL_points,Avg_FPL_points,xP,Pred
665,Erling Haaland,31,2022-23,Manchester City,Leicester City,1.0,2060.531738,1722.042847,12.2,FWD,12,7.461538,9.8,6.529261
671,Erling Haaland,32,2022-23,Manchester City,Brighton,0.0,2089.150146,1826.267822,12.4,FWD,5,7.461538,7.6,5.848953
666,Erling Haaland,33,2022-23,Manchester City,Arsenal,1.0,2057.953369,1928.582397,12.3,FWD,14,7.461538,11.6,5.965395
667,Erling Haaland,34,2022-23,Manchester City,Fulham,0.0,2064.563232,1717.714233,12.4,FWD,8,7.461538,9.0,6.24773
668,Erling Haaland,35,2022-23,Manchester City,Leeds United,1.0,2069.102051,1662.209595,12.4,FWD,2,7.461538,6.0,6.720062
669,Erling Haaland,36,2022-23,Manchester City,Everton,0.0,2071.091064,1694.975464,12.4,FWD,7,7.461538,4.6,6.678877
670,Erling Haaland,37,2022-23,Manchester City,Chelsea,1.0,2087.472656,1794.31189,12.4,FWD,1,7.461538,1.4,6.186495
672,Erling Haaland,38,2022-23,Manchester City,Brentford,0.0,2083.640137,1807.119507,12.4,FWD,0,7.461538,0.0,4.124507


# Saving to file

In [70]:
preds_all_gameweeks.to_csv("predictions/all_predictions.csv", index=False)

In [71]:
pickle.dump(model, open("models/GBR_positional.pkl", 'wb'))

In [72]:
current_time = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
evaluation.to_csv(f"evaluations/eval_{current_time}.csv", index=False)

# Feature importance and influence

In [73]:
# explainer = shap.Explainer(model.model_outfield.predict, X_test_all_remaining)
# shap_values = explainer(X_test_all_remaining)

In [74]:
# shap.plots.bar(shap_values, max_display=15)

In [75]:
# shap.plots.beeswarm(shap_values, max_display=15)

In [76]:
# # explaining Erling Haaland`s score in gameweek 31
# shap.plots.bar(shap_values[69], max_display=15)

In [77]:
# # Salah vs Nottingham Forest
# shap.plots.waterfall(shap_values[1396])