
# Imports

In [1]:
import os
os.chdir("D:\PulpitE\FPL_ML")

In [2]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import shap
import pickle

from sklearn.ensemble import GradientBoostingRegressor
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression

from sklearn.metrics import mean_absolute_error, mean_squared_error

from sklearn.preprocessing import LabelEncoder

from PositionalModelLinear import PositionalModelLinear

from config import *
from src.match_names import name_fbref_to_fpl, neutralize_name

# Loading data
Data consists of all PL players logs for 2021-22 and 2022-23 seasons\
A single log is just a summary of player performance in a particular match

In [3]:
# loading csv
df_original = pd.read_csv('data/previous/FPL_logs.csv')

df = df_original.copy()

In [4]:
df_original.columns

Index(['Date', 'Day', 'Venue', 'Team', 'Opponent', 'Name', 'Start', 'Pos',
       'Min', 'Gls',
       ...
       'xG_team_15', 'xGA_team_15', 'xG_opp_15', 'xGA_opp_15', 'xG_team_30',
       'xGA_team_30', 'xG_opp_30', 'xGA_opp_30', 'xG_diff', 'Avg_FPL_points'],
      dtype='object', length=114)

In [5]:
df_original["Name"].unique().size

533

In [6]:
df.shape

(18835, 114)

In [7]:
df["Season"].unique()

array(['2021-22', '2022-23', '2023-24'], dtype=object)

In [8]:
# df = df[~df["FPL_pos"].isin(["GK"])]

In [9]:
df["Season"].unique()

array(['2021-22', '2022-23', '2023-24'], dtype=object)

In [10]:
# df = df[df["Season"].isin(['2021-22', '2022-23'])]
allowed_pos = ['FWD', "MID", "DEF", "GK"]
df = df[df["FPL_pos"].isin(allowed_pos)]

In [11]:
df = df.sort_values(by=["Name", "Date"])

# Feature selection
Model will use a set of standard features and moving averages for some other feature values. In this case we will use moving averages for last 2, 4 and 30 gameweeks.

In [12]:
rolling_gameweeks = [4]
# rolling_gameweeks = [5]

# to_predict = "FPL_points"
to_predict = "xP"
standard_features = [
             "Was_home", # Home/Away
             # "Team_rating", "Opp_rating", 
             # "Rating_difference", # Team ratings
             # "Price", # FPl price
             # "Transfers_balance",
             # "Transfers_result"
             # "Avg_FPL_points",
             # "Avg_xP",
             # 'Avg_FPL_points_venue', 
             # "Was_home_xP",
             "RD_xP",
             # "Team_xP",
             # "Opp_xP"
             "xG_team_4",
             # "xGA_team_4",
             # "xG_opp_4",
             # "xGA_opp_4"
             ]
features_to_roll = [
                # "Min", 
                # "Start", # time played
                # 'Gls', 
                # 'Sh', 
                # 'SoT', # Goals
                # 'Ast', # Assists
                # 'CrdY', 'CrdR', # Cards
                # "xG", 'xA', # Expected
                # 'Team_CS', # Defence
                # 'Team_score', 'Opp_score', 'Team_result', # Team form
                # "xGPoints", "CSPoints", # Position-scaled
                # "Cmp%", "PrgP", "PrgC", "T_succ",
                # 'bonus', 'bps', # Bonus
                # 'ICT_index', # ICT
                # "FPL_points", 
                "xP"
                # "Baseline_points", "Bonus" # FPL points
            ]
info = ["Name_original", "GW", "Season", "Team", "Opponent", "Was_home", "Team_rating", "Opp_rating", "Price", "FPL_pos", "FPL_points"]

In [13]:
# df[df["Name"] == "Erling-Haaland"]

# Feature engineering

In [14]:
df["Rating_difference"] = df["Team_rating"] / df["Opp_rating"]
df["Baseline_points"] = df["FPL_points"] - df["Bonus"]
df["Transfers_result"] = df["Transfers_balance"] >= 0

In [15]:
def add_linear_fixtures(df):
    RD_coef_gk = 0.08
    RD_coef_outfield = 0.37
    df["Was_home_xP"] = df["Avg_FPL_points"] * df["Was_home"]
    df.loc[df["FPL_pos"] == "GK", "RD_xP"] = df["Avg_FPL_points"] * ( 1 + (df["Rating_difference"] - 1) * RD_coef_gk )
    df.loc[df["FPL_pos"] != "GK", "RD_xP"] = df["Avg_FPL_points"] * ( 1 + (df["Rating_difference"] - 1) * RD_coef_outfield )
    df["Team_xP"] = df["Avg_FPL_points"] * df["Team_rating"]
    df["Opp_xP"] = df["Avg_FPL_points"] * df["Opp_rating"]
    df['Avg_xP'] = df.groupby('Name_original')['xP'].transform('mean')
    df.loc[df["Was_home"] == True, 'Avg_FPL_points_venue'] = df[df["Was_home"] == True].groupby('Name_original')['FPL_points'].transform('mean')
    df.loc[df["Was_home"] == False, 'Avg_FPL_points_venue'] = df[df["Was_home"] == False].groupby('Name_original')['FPL_points'].transform('mean')
    
    return df

In [16]:
df = add_linear_fixtures(df)

In [17]:
def calculate_team_points(row):
    # win - 3 points
    if row['Team_score'] > row['Opp_score']:
        return 3
    # draw - 1 point
    elif row['Team_score'] == row['Opp_score']:
        return 1
    # loss - 0 points
    else:
        return 0
    
df['Team_result'] = df.apply(calculate_team_points, axis=1)

In [18]:
def add_rolling_features(df, standard_features, features_to_roll):
    features = standard_features
    
    for r in rolling_gameweeks:
        form_means = df.groupby(["Name"])[features_to_roll].rolling(r, min_periods=1).mean().groupby(["Name"]).shift(1).reset_index()
        # print(form_means[form_means["Name"] == "Erling-Haaland"])
        form_means = form_means.fillna(method='bfill') # slightly incorrect, better to drop Nan
        form_means.columns = [f'{col}{"_"}{r}' for col in form_means.columns]
        features += form_means.columns.tolist()
        features = list(filter(lambda x: x not in ["Name_" + str(r)], features))
        df = pd.concat([df.reset_index(), form_means], axis=1)
        # df = df.merge(form_means, left_index=True, right_index=True)
        df = df.drop([col for col in df.columns if col.startswith('level')], axis=1)
        
    return df.reset_index(), features

In [19]:
def ohe(df, ohe_columns, features):
    # one hot encoding
    for c in ohe_columns:
        ohe_c = pd.get_dummies(df[c], dtype="int64")
        df = pd.concat([df, ohe_c], axis=1)

        features += ohe_c.columns.tolist()
        
    return df, features

In [20]:
def label_encoding(df, column_to_encode):
    
    mapping_dict = {
        'Y': 1, # Starting eleven
        'Y*': 1, # Starting eleven as captain
        'N': 0, # Not in starting eleven
    }
    
    df[column_to_encode] = df[column_to_encode].map(mapping_dict)
    
    return df

In [21]:
df = label_encoding(df, "Start")

In [22]:
df, features = add_rolling_features(df, standard_features, features_to_roll)

In [23]:
# df, features = ohe(df, ["FPL_pos"], features)
# if "GK" in df.columns:
#     df = df.drop("GK", axis=1)
#     features.remove("GK")

In [24]:
# dropping unwanted columns
features = [col for col in features if not col.startswith('level')]
df = df[np.unique(features + info + [to_predict])]

In [25]:
df = df[pd.to_numeric(df["GW"], errors="coerce").notna()]
df["GW"] = df["GW"].astype("uint64")

In [26]:
# dropping NaNs
df = df.dropna(axis=0)

In [27]:
# df = df.set_index(["Name_original", "Season", "GW"], drop=False)

In [28]:
df.columns

Index(['FPL_points', 'FPL_pos', 'GW', 'Name_original', 'Opp_rating',
       'Opponent', 'Price', 'RD_xP', 'Season', 'Team', 'Team_rating',
       'Was_home', 'xG_team_10', 'xP', 'xP_4'],
      dtype='object')

In [29]:
df.shape

(18432, 15)

In [30]:
# df = df[~df["FPL_pos"].isin(["GK"])] # only outfield players

In [31]:
df.shape

(18432, 15)

# Data split into train and valid
Let's take entire 2021-22 season and 30 first gameweeks of 2022-23 season as training data and 8 last gameweeks of that season as valid data (~10% of all rows).

In [32]:
def shrink_df_to_top_players(df, n_players, min_fixtures):
    # Calculate average scores
    average_scores = df.groupby('Name_original')[to_predict].mean()
    
    # Calculate player counts
    player_counts = df['Name_original'].value_counts()
    
    # Filter out players with less than min_fixtures occurrences
    popular_players = player_counts[player_counts >= min_fixtures].index
    
    print("Unique players with min_fixtures:", popular_players.size)

    # Sort players by average scores
    sorted_players = average_scores.sort_values(ascending=False)

    # Get the top n_players players
    top_popular_players = sorted_players[sorted_players.index.isin(popular_players)].head(n_players)

    # Filter the original DataFrame
    top_players_df = df[df['Name_original'].isin(top_popular_players.index)]
    
    print("Unique players left:", top_players_df['Name_original'].unique().size)
    
    return top_players_df

In [33]:
CUT_OFF_GAMEWEEK = 31
SEASON_TO_PREDICT = "2022-23"

In [34]:
df_train = shrink_df_to_top_players(df, 550, 10)
# df_train["Name_original"].unique()

Unique players with min_fixtures: 391
Unique players left: 391


In [35]:
df = df.set_index(["Name_original", "Season", "GW"], drop=False)

In [36]:
# training data
X_train = df_train[((df_train["Season"] < SEASON_TO_PREDICT) | (df_train["GW"] <= CUT_OFF_GAMEWEEK - 1))].reset_index(drop=True)
y_train = df_train[((df_train["Season"] < SEASON_TO_PREDICT) | (df_train["GW"] <= CUT_OFF_GAMEWEEK - 1))].reset_index(drop=True)

In [37]:
# only 31st gameweek
X_test = df[(df["GW"] == CUT_OFF_GAMEWEEK) & (df["Season"] == SEASON_TO_PREDICT) ].reset_index(drop=True)
y_test = df[(df["GW"] == CUT_OFF_GAMEWEEK) & (df["Season"] == SEASON_TO_PREDICT) ].reset_index(drop=True)

In [38]:
# all remaining gameweeks
X_test_all_remaining = df[(df["GW"] >= CUT_OFF_GAMEWEEK) & (df["Season"] == SEASON_TO_PREDICT) ].reset_index(drop=True)
y_test_all_remaining = df[(df["GW"] >= CUT_OFF_GAMEWEEK) & (df["Season"] == SEASON_TO_PREDICT) ].reset_index(drop=True)

In [39]:
X_train.shape, X_test.shape, X_test_all_remaining.shape

((15916, 15), (273, 15), (2213, 15))

# Model

In [40]:
# model = GradientBoostingRegressor(random_state=42)
# model = XGBRegressor(
#         random_state=42,
#         n_estimators=500,
#         # early_stopping_rounds=5,
#         learning_rate=0.2
#     )
model = PositionalModelLinear(features, features, [], to_predict)
# model = RandomForestRegressor(random_state=42)

In [41]:
%%time
model.fit(
        X_train, 
        y_train,
        # eval_set=[(X_test_all_remaining, y_test_all_remaining)],
        # verbose=False
    )

CPU times: total: 15.6 ms
Wall time: 18.9 ms


In [42]:
features

['Was_home', 'RD_xP', 'xG_team_10', 'xP_4']

In [43]:
# model.model_GK.feature_names_in_

In [44]:
# model.model_GK.coef_

In [45]:
# model.model_outfield.coef_

# Getting predictions

In [46]:
def get_predictions(model, df, X, all_remaining=False):
    # make predictions on the test data and glues them to the rest of the dataframe
    predictions = model.predict(X)
    if all_remaining:
        df_predictions = df[(df["GW"] >= CUT_OFF_GAMEWEEK) & (df["Season"] == SEASON_TO_PREDICT)].reset_index(drop=True)
    else:
        df_predictions = df[(df["GW"] == CUT_OFF_GAMEWEEK) & (df["Season"] == SEASON_TO_PREDICT)].reset_index(drop=True)
        
    df_predictions.loc[:, "Pred"] = predictions
    
    preds = df_predictions[info + [to_predict] + ["Pred"]].sort_values(by=["Pred"], ascending = False)
    
    return preds, predictions

In [47]:
preds, predictions = get_predictions(model, df, X_test)

In [48]:
preds_all_gameweeks, predictions_all = get_predictions(model, df, X_test_all_remaining, True)

# Custom metric
Such models are usually used to compare players and decide who to pick for upcoming gameweek(s). For each player pair in a subset of players model can either predict the outcome well (e.g. player A scores more than player B and model predicts exactly that) or predict wrongly (e.g. player A scores more than player B but model predicts player B > player A). Pairwise_accuracy returns the percentage of corrected predicted pairs. Pairwise_accuracy_topX is a variation of this metric calculated only for X highest scoring players of last two seasons.

In [49]:
def get_top_performer_names(df, no_top, no_gws):
    # takes no_top players that recorded highest average FPL points in no_gws last gameweeks
    return df.reset_index(drop=True).groupby("Name_original")[to_predict].mean().groupby("Name_original").tail(no_gws).sort_values(ascending=False).head(no_top).index.to_list()

In [50]:
def pairwise_accuracy(predicted_scores, true_scores):
    if len(predicted_scores) != len(true_scores):
        raise ValueError("The length of predicted_scores and true_scores must be the same.")

    num_pairs = 0
    num_correct_pairs = 0

    for i in range(len(predicted_scores)):
        for j in range(i + 1, len(predicted_scores)):
            # Check if the predicted order matches the true order
            if (predicted_scores[i] > predicted_scores[j] and true_scores[i] > true_scores[j]) or \
               (predicted_scores[i] < predicted_scores[j] and true_scores[i] < true_scores[j]):
                num_correct_pairs += 1
            num_pairs += 1

    pairwise_accuracy = num_correct_pairs / num_pairs
    return pairwise_accuracy

In [51]:
def pairwise_accuracy_topX(model, df, top_x, all_gw=False):
    # pairwise_accuracy for top_X players
    if all_gw:
        top_performers = df[(df["GW"] == CUT_OFF_GAMEWEEK) & (df["Season"] == SEASON_TO_PREDICT) & (df["Name_original"].isin(get_top_performer_names(df, top_x, 50)))]
    else:
        top_performers = df[(df["GW"] >= CUT_OFF_GAMEWEEK) & (df["Season"] == SEASON_TO_PREDICT) & (df["Name_original"].isin(get_top_performer_names(df, top_x, 50)))]

    X = top_performers.reset_index(drop=True)
    y = np.array(top_performers[to_predict].reset_index(drop=True))
    preds = model.predict(X)
    
    return  pairwise_accuracy(preds, y)

# Evaluation

In [52]:
def evaluate(model, df, predictions, y_true, all_gw=False):
    # function to calculate different metrics for a given model
    mae = mean_absolute_error(y_true[to_predict], predictions)
    mse = mean_squared_error(y_true[to_predict], predictions)

    pairwise_acc = pairwise_accuracy(np.array(y_true[to_predict]), predictions)
    pairwise_accuracy_top20 = pairwise_accuracy_topX(model, df, 20, all_gw)
    pairwise_accuracy_top100 = pairwise_accuracy_topX(model, df, 100, all_gw)
    
    print("MAE:", mae)
    print("MSE:", mse)
    print("Pairwise accuracy:", pairwise_acc)
    print("Pairwise accuracy @TOP100:", pairwise_accuracy_top100)
    print("Pairwise accuracy @TOP20:", pairwise_accuracy_top20)

In [53]:
# df[(df["GW"] == GAMEWEEK_TO_PREDICT) & (df["Season"] == SEASON_TO_PREDICT) & (df["Name"].isin(get_top_performer_names(df, 20, 50)))].columns

In [54]:
# next gameweek
# evaluate(model, df, predictions, y_test, all_gw=False)

In [55]:
# all remaining gameweeks
evaluate(model, df, predictions_all, y_test_all_remaining, all_gw=True)

MAE: 1.5348241180503477
MSE: 4.548110658307953
Pairwise accuracy: 0.6419178469491065
Pairwise accuracy @TOP100: 0.54337899543379
Pairwise accuracy @TOP20: 0.6888888888888889


In [56]:
pred_sum = preds_all_gameweeks["Pred"].sum()
fpl_sum = preds_all_gameweeks["FPL_points"].sum()
xp_sum = preds_all_gameweeks["xP"].sum()

pd.DataFrame([pred_sum, fpl_sum, xp_sum], index=["Pred", "FPL", "xP"], columns=["Sum"])

Unnamed: 0,Sum
Pred,5654.818709
FPL,5261.0
xP,5660.7


In [57]:
# model.predict(df[df["GW"] == GAMEWEEK_TO_PREDICT][df["Name"] == "Mohamed-Salah"][features])

In [58]:
features

['Was_home', 'RD_xP', 'xG_team_10', 'xP_4']

In [59]:
len(features)

4

# Predictions - next gameweek only

In [60]:
preds.head(30)

Unnamed: 0,Name_original,GW,Season,Team,Opponent,Was_home,Team_rating,Opp_rating,Price,FPL_pos,FPL_points,xP,Pred
83,Erling Haaland,31,2022-23,Manchester City,Leicester City,1.0,2060.531738,1722.042847,12.2,FWD,12,9.8,7.424335
197,Mohamed Salah,31,2022-23,Liverpool,Leeds United,0.0,1925.248047,1694.963013,12.8,MID,14,7.5,6.588426
152,Kevin De Bruyne,31,2022-23,Manchester City,Leicester City,1.0,2060.531738,1722.042847,12.1,MID,8,6.5,5.888141
180,Gabriel Martinelli Silva,31,2022-23,Arsenal,West Ham,0.0,1946.8479,1751.608521,6.8,MID,5,4.6,5.501109
110,Ivan Toney,31,2022-23,Brentford,Wolves,0.0,1782.324097,1717.951782,7.8,FWD,2,4.0,5.227188
211,Ollie Watkins,31,2022-23,Aston Villa,Newcastle Utd,1.0,1801.51355,1876.32251,7.6,FWD,16,10.5,5.208167
243,Solly March,31,2022-23,Brighton,Chelsea,0.0,1826.634033,1827.184204,5.2,MID,5,2.8,5.007136
260,Tyrone Mings,31,2022-23,Aston Villa,Newcastle Utd,1.0,1801.51355,1876.32251,4.5,DEF,6,6.0,4.842957
80,Emiliano Martínez Romero,31,2022-23,Aston Villa,Newcastle Utd,1.0,1801.51355,1876.32251,5.0,GK,6,6.0,4.774498
124,Jarrod Bowen,31,2022-23,West Ham,Arsenal,1.0,1751.608521,1946.8479,8.0,MID,7,3.0,4.766228


In [61]:
# preds[preds["Team"] == "Manchester City"].head(20)

# Predictions - all remaining gameweeks

In [62]:
preds_all_gameweeks.head(15)

Unnamed: 0,Name_original,GW,Season,Team,Opponent,Was_home,Team_rating,Opp_rating,Price,FPL_pos,FPL_points,xP,Pred
690,Erling Haaland,35,2022-23,Manchester City,Leeds United,1.0,2069.102051,1662.209717,12.4,FWD,2,6.0,8.496247
689,Erling Haaland,34,2022-23,Manchester City,Fulham,0.0,2064.563232,1717.714233,12.4,FWD,8,9.0,8.023803
1589,Mohamed Salah,37,2022-23,Liverpool,Aston Villa,1.0,1955.095703,1812.286499,13.1,MID,5,3.7,7.942524
691,Erling Haaland,36,2022-23,Manchester City,Everton,0.0,2071.091064,1694.975464,12.4,FWD,7,4.6,7.829776
692,Erling Haaland,37,2022-23,Manchester City,Chelsea,1.0,2087.472656,1794.31189,12.4,FWD,1,1.4,7.60542
1590,Mohamed Salah,38,2022-23,Liverpool,Southampton,0.0,1950.095459,1616.916626,13.1,MID,5,8.6,7.599337
688,Erling Haaland,33,2022-23,Manchester City,Arsenal,1.0,2057.953369,1928.582397,12.3,FWD,14,11.6,7.561245
687,Erling Haaland,31,2022-23,Manchester City,Leicester City,1.0,2060.531738,1722.042847,12.2,FWD,12,9.8,7.424335
1588,Mohamed Salah,36,2022-23,Liverpool,Leicester City,0.0,1944.225098,1709.140381,13.1,MID,12,10.2,7.13315
1584,Mohamed Salah,32,2022-23,Liverpool,Nott'ham Forest,1.0,1926.870483,1623.869141,12.9,MID,7,4.3,7.068867


In [63]:
# preds_all_gameweeks[preds_all_gameweeks["Name_original"] == "Mohamed Salah"].sort_values(by=["GW"])

In [64]:
# preds_all_gameweeks[preds_all_gameweeks["Name_original"] == "Erling Haaland"].sort_values(by=["GW"])

# Saving to file

In [65]:
preds_all_gameweeks.to_csv("predictions/all_predictions.csv", index=False)

In [66]:
# pickle.dump(model, open("models/GBR.pkl", 'wb'))

# Future preds

In [67]:
df_upcoming = pd.read_csv("data/upcoming/upcoming_fixtures.csv")

Columns (2,3,6,36,43,44,45,59,84,86,147,208) have mixed types.Specify dtype option on import or set low_memory=False.


In [68]:
df_upcoming.shape

(37191, 278)

In [69]:
df_upcoming = add_linear_fixtures(df_upcoming)

In [70]:
df_upcoming.shape

(37191, 284)

In [71]:
def adjust_goalkeepers(df, df_upcoming):
    treshold = 30
    gk_last_season = pd.read_csv('data/misc/gk_last_season.csv')
    for gk in df_upcoming[df_upcoming["FPL_pos"] == "GK"]["Name_original"].unique():
        gk_games = df[df["Name_original"] == gk].shape[0]
        if gk_games < treshold:
            current_avg = df_upcoming[df_upcoming["Name_original"] == gk].tail(1)["Avg_FPL_points"].item()
            gk_team = df_upcoming[df_upcoming["Name_original"] == gk].tail(1)["Team"].item()
            if gk_last_season[gk_last_season["team"] == current_teams_to_fpl(gk_team)].shape[0] > 0:
                last_season_avg = gk_last_season[gk_last_season["team"] == current_teams_to_fpl(gk_team)]["total_points"].item()
            else:
                last_season_avg = 3.2 # default for newly-promoted teams
            df_upcoming.loc[df_upcoming["Name_original"] == gk, "Avg_FPL_points"] = ( current_avg * gk_games + last_season_avg * (30 - gk_games) ) / 30
            # print(gk, gk_games, gk_team, current_avg, last_season_avg, current_avg * gk_games)
    return df_upcoming

In [72]:
df_upcoming = adjust_goalkeepers(df, df_upcoming)

In [73]:
# df_upcoming[df_upcoming["FPL_pos"] == "GK"].groupby(["Name_original"]).tail(1)[features + ["Name_original"]]

In [74]:
df_upcoming = df_upcoming[~df_upcoming[features].isnull().any(axis=1)] # for some reason 45 rows are missing some feature values

KeyError: "['xG_team_10'] not in index"

In [None]:
df_upcoming = df_upcoming[(df_upcoming["Season"] == CURRENT_SEASON) & (df_upcoming["GW"] >= NEXT_GAMEWEEK)].reset_index(drop=True)

In [None]:
df_upcoming

In [None]:
X_test = df_upcoming

In [None]:
def adjust_for_injuries(df):
    active_players = pd.read_csv("data/misc/active_players.csv")
    injured_players = active_players[active_players["chance_of_playing_next_round"] == 0]["name"].to_list()
    df.loc[df["Name_original"].isin(injured_players),"Pred"] = 0
    return df

In [None]:
def adjust_first_choice_goalkeepers(df):
    gks = pd.read_csv("data/misc/goalkeepers.csv")
    gk_names = gks["Name"].to_list()
    gk_names = [name_fbref_to_fpl(neutralize_name(n)) for n in gk_names]
    # print(gk_names)
    df.loc[(df["FPL_pos"] == "GK") & ~(df["Name_original"].apply(neutralize_name).isin(gk_names)), "Pred"] = 0
    return df

In [None]:
def get_predictions(model, df, X, all_remaining=False):
    # make predictions on the test data and glues them to the rest of the dataframe
    predictions = model.predict(X)
    df_predictions = df[(df["Season"] == CURRENT_SEASON)].reset_index(drop=True)
        
    df_predictions.loc[:, "Pred"] = predictions
    preds = df_predictions[info + ["Pred"]]
    
    preds = adjust_first_choice_goalkeepers(preds)
    preds = adjust_for_injuries(preds).sort_values(by=["Pred"], ascending = False)
    
    return preds

In [None]:
# df_upcoming[df_upcoming["Name_original"] == "Erling Haaland"][["Name_original", "Was_home", 'Avg_FPL_points_venue']]

In [None]:
preds = get_predictions(model, df_upcoming[df_upcoming["FPL_pos"].isin(allowed_pos)], X_test[X_test["FPL_pos"].isin(allowed_pos)])

In [None]:
# X_test[(X_test["GW"] == NEXT_GAMEWEEK) & (X_test["FPL_pos"] == "GK")][features + ["Name_original"]]

In [None]:
# next gameweek
preds[preds["GW"] == NEXT_GAMEWEEK].head(30)

In [None]:
# preds[(preds["GW"] == NEXT_GAMEWEEK) & (preds["FPL_pos"] == "GK")].head(21)

In [None]:
n_gameweeks = 5
pivot = pd.pivot_table(preds[preds["GW"] < NEXT_GAMEWEEK + n_gameweeks], values='Pred', index=['Name_original', "FPL_pos"],
                       columns=['GW'], aggfunc=np.sum)
pivot['Summary'] = pivot.sum(axis=1)
pivot.columns = ['GW' + str(col).split(".")[0] for col in pivot.columns[:-1]] + ['Summary']
pivot = pivot.sort_values(by=["Summary"], ascending = False)
pivot.head(20)

In [None]:
pivot[pivot.index.get_level_values('FPL_pos') == 'GK']

In [None]:
# points all season
pd.DataFrame(preds.groupby(["Name_original"])["Pred"].sum().sort_values(ascending=False)).head(30)

In [None]:
preds.to_csv("predictions/preds_next_season.csv", index=False)

# Feature importance and influence

In [None]:
# explainer = shap.Explainer(model.model_outfield.predict, X_test[2000:3000])
# shap_values = explainer(X_test[2000:3000])

In [None]:
# shap.plots.bar(shap_values, max_display=15)

In [None]:
# shap.plots.beeswarm(shap_values, max_display=15)

In [None]:
# # explaining Erling Haaland`s score in gameweek 31
# shap.plots.bar(shap_values[69], max_display=15)

In [None]:
# shap.plots.waterfall(shap_values[410])

In [None]:
# shap.plots.waterfall(shap_values[26])

In [None]:
# shap.plots.waterfall(shap_values[96])