
# Imports

In [1]:
import os
os.chdir("D:\PulpitE\FPL_ML")

In [2]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import shap
import pickle

from sklearn.ensemble import GradientBoostingRegressor
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression

from sklearn.metrics import mean_absolute_error, mean_squared_error

from sklearn.preprocessing import LabelEncoder

from PositionalModelLinear import PositionalModelLinear

from config import *
from src.match_names import name_fbref_to_fpl, neutralize_name

# Loading data
Data consists of all PL players logs for 2021-22 and 2022-23 seasons\
A single log is just a summary of player performance in a particular match

In [3]:
# loading csv
df_original = pd.read_csv('data/previous/FPL_logs.csv')

df = df_original.copy()

In [4]:
df_original.columns

Index(['Date', 'Day', 'Venue', 'Team', 'Opponent', 'Name', 'Start', 'Pos',
       'Min', 'Gls', 'Ast', 'PK', 'PKatt', 'Sh', 'SoT', 'CrdY', 'CrdR',
       'Touches', 'Tkl', 'Int', 'Blocks', 'xG', 'npxG', 'xA', 'SCA', 'GCA',
       'Cmp', 'Att', 'Cmp%', 'PrgP', 'Carries', 'PrgC', 'T_att', 'T_succ',
       'Season', 'WDL', 'Team_score', 'Opp_score', 'Team_CS', 'Was_home', 'GW',
       'Clean_name', 'FPL_name', 'FPL_team', 'FPL_opp_team', 'Name_original',
       'FPL_pos', 'FPL_ast', 'Bonus', 'Bps', 'Player_CS', 'Creativity',
       'Element', 'Fixture', 'Player_GC', 'Goals_scored', 'ICT_index',
       'Influence', 'Kickoff_time', 'FPL_min', 'Own_goals', 'Penalties_missed',
       'Penalties_saved', 'Saves', 'Selected', 'Threat', 'FPL_points',
       'Transfers_balance', 'Transfers_in', 'Transfers_out', 'Price', 'FPL_GW',
       'xP', 'FPL_xA', 'FPL_xGI', 'FPL_xG', 'FPL_xGC', 'Opp_rating',
       'Team_rating', 'Min_points', 'Avg_FPL_points'],
      dtype='object')

In [5]:
df_original["Name"].unique().size

461

In [6]:
df.shape

(18674, 81)

In [7]:
df["Season"].unique()

array(['2021-22', '2022-23', '2023-24'], dtype=object)

In [8]:
# df = df[~df["FPL_pos"].isin(["GK"])]

In [9]:
df["Season"].unique()

array(['2021-22', '2022-23', '2023-24'], dtype=object)

In [10]:
# df = df[df["Season"].isin(['2021-22', '2022-23'])]
# df = df[df["FPL_pos"].isin(['GK'])]

In [11]:
df = df.sort_values(by=["Name", "Date"])

# Feature selection
Model will use a set of standard features and moving averages for some other feature values. In this case we will use moving averages for last 2, 4 and 30 gameweeks.

In [12]:
rolling_gameweeks = [4]
# rolling_gameweeks = [5]

# to_predict = "FPL_points"
to_predict = "xP"
standard_features = [
             "Was_home", # Home/Away
             # "Team_rating", "Opp_rating", 
             # "Rating_difference", # Team ratings
             # "Price", # FPl price
             # "Transfers_balance",
             # "Transfers_result"
             "Avg_FPL_points",
             # "Avg_xP",
             # 'Avg_FPL_points_venue', 
             # "Was_home_xP",
             "RD_xP",
             # "Team_xP",
             # "Opp_xP"
             ]
features_to_roll = [
                # "Min", 
                # "Start", # time played
                # 'Gls', 
                # 'Sh', 'SoT', # Goals
                # 'Ast', # Assists
                # 'CrdY', 'CrdR', # Cards
                # "xG", 'xA', # Expected
                # 'Team_CS', # Defence
                # 'Team_score', 'Opp_score', 'Team_result', # Team form
                # "xGPoints", "CSPoints", # Position-scaled
                # "Cmp%", "PrgP", "PrgC", "T_succ",
                # 'bonus', 'bps', # Bonus
                # 'ICT_index', # ICT
                # "FPL_points", 
                "xP"
                # "Baseline_points", "Bonus" # FPL points
            ]
info = ["Name_original", "GW", "Season", "Team", "Opponent", "Was_home", "Team_rating", "Opp_rating", "FPL_pos", "FPL_points"]

In [13]:
# df[df["Name"] == "Erling-Haaland"]

# Feature engineering

In [14]:
df["Rating_difference"] = df["Team_rating"] / df["Opp_rating"]
df["Baseline_points"] = df["FPL_points"] - df["Bonus"]
df["Transfers_result"] = df["Transfers_balance"] >= 0

In [15]:
def add_linear_fixtures(df):
    df["Was_home_xP"] = df["Avg_FPL_points"] * df["Was_home"]
    df["RD_xP"] = df["Avg_FPL_points"] * df["Rating_difference"]
    df["Team_xP"] = df["Avg_FPL_points"] * df["Team_rating"]
    df["Opp_xP"] = df["Avg_FPL_points"] * df["Opp_rating"]
    df['Avg_xP'] = df.groupby('Name_original')['xP'].transform('mean')
    df.loc[df["Was_home"] == True, 'Avg_FPL_points_venue'] = df[df["Was_home"] == True].groupby('Name_original')['FPL_points'].transform('mean')
    df.loc[df["Was_home"] == False, 'Avg_FPL_points_venue'] = df[df["Was_home"] == False].groupby('Name_original')['FPL_points'].transform('mean')
    
    return df

In [16]:
df = add_linear_fixtures(df)

In [17]:
def calculate_team_points(row):
    # win - 3 points
    if row['Team_score'] > row['Opp_score']:
        return 3
    # draw - 1 point
    elif row['Team_score'] == row['Opp_score']:
        return 1
    # loss - 0 points
    else:
        return 0
    
df['Team_result'] = df.apply(calculate_team_points, axis=1)

In [18]:
def add_rolling_features(df, standard_features, features_to_roll):
    features = standard_features
    
    for r in rolling_gameweeks:
        form_means = df.groupby(["Name"])[features_to_roll].rolling(r, min_periods=1).mean().groupby(["Name"]).shift(1).reset_index()
        # print(form_means[form_means["Name"] == "Erling-Haaland"])
        form_means = form_means.fillna(method='bfill') # slightly incorrect, better to drop Nan
        form_means.columns = [f'{col}{"_"}{r}' for col in form_means.columns]
        features += form_means.columns.tolist()
        features = list(filter(lambda x: x not in ["Name_" + str(r)], features))
        df = pd.concat([df.reset_index(), form_means], axis=1)
        # df = df.merge(form_means, left_index=True, right_index=True)
        df = df.drop([col for col in df.columns if col.startswith('level')], axis=1)
        
    return df.reset_index(), features

In [19]:
def ohe(df, ohe_columns, features):
    # one hot encoding
    for c in ohe_columns:
        ohe_c = pd.get_dummies(df[c], dtype="int64")
        df = pd.concat([df, ohe_c], axis=1)

        features += ohe_c.columns.tolist()
        
    return df, features

In [20]:
def label_encoding(df, column_to_encode):
    
    mapping_dict = {
        'Y': 1, # Starting eleven
        'Y*': 1, # Starting eleven as captain
        'N': 0, # Not in starting eleven
    }
    
    df[column_to_encode] = df[column_to_encode].map(mapping_dict)
    
    return df

In [21]:
df = label_encoding(df, "Start")

In [22]:
df, features = add_rolling_features(df, standard_features, features_to_roll)

In [23]:
# df, features = ohe(df, ["FPL_pos"], features)
# if "GK" in df.columns:
#     df = df.drop("GK", axis=1)
#     features.remove("GK")

In [24]:
# dropping unwanted columns
features = [col for col in features if not col.startswith('level')]
df = df[np.unique(features + info + [to_predict])]

In [25]:
df = df[pd.to_numeric(df["GW"], errors="coerce").notna()]
df["GW"] = df["GW"].astype("uint64")

In [26]:
# dropping NaNs
df = df.dropna(axis=0)

In [27]:
# df = df.set_index(["Name_original", "Season", "GW"], drop=False)

In [28]:
df.columns

Index(['Avg_FPL_points', 'FPL_points', 'FPL_pos', 'GW', 'Name_original',
       'Opp_rating', 'Opponent', 'RD_xP', 'Season', 'Team', 'Team_rating',
       'Was_home', 'xP', 'xP_4'],
      dtype='object')

In [29]:
df.shape

(18550, 14)

In [30]:
# df = df[~df["FPL_pos"].isin(["GK"])] # only outfield players

In [31]:
df.shape

(18550, 14)

# Data split into train and valid
Let's take entire 2021-22 season and 30 first gameweeks of 2022-23 season as training data and 8 last gameweeks of that season as valid data (~10% of all rows).

In [32]:
def shrink_df_to_top_players(df, n_players, min_fixtures):
    # Calculate average scores
    average_scores = df.groupby('Name_original')[to_predict].mean()
    
    # Calculate player counts
    player_counts = df['Name_original'].value_counts()
    
    # Filter out players with less than min_fixtures occurrences
    popular_players = player_counts[player_counts >= min_fixtures].index
    
    print("Unique players with min_fixtures:", popular_players.size)

    # Sort players by average scores
    sorted_players = average_scores.sort_values(ascending=False)

    # Get the top n_players players
    top_popular_players = sorted_players[sorted_players.index.isin(popular_players)].head(n_players)

    # Filter the original DataFrame
    top_players_df = df[df['Name_original'].isin(top_popular_players.index)]
    
    print("Unique players left:", top_players_df['Name_original'].unique().size)
    
    return top_players_df

In [33]:
CUT_OFF_GAMEWEEK = 31
SEASON_TO_PREDICT = "2022-23"

In [34]:
df_train = shrink_df_to_top_players(df, 550, 10)
# df_train["Name_original"].unique()

Unique players with min_fixtures: 393
Unique players left: 393


In [35]:
df = df.set_index(["Name_original", "Season", "GW"], drop=False)

In [36]:
# training data
X_train = df_train[((df_train["Season"] < SEASON_TO_PREDICT) | (df_train["GW"] <= CUT_OFF_GAMEWEEK - 1))].reset_index(drop=True)
y_train = df_train[((df_train["Season"] < SEASON_TO_PREDICT) | (df_train["GW"] <= CUT_OFF_GAMEWEEK - 1))].reset_index(drop=True)

In [37]:
# only 31st gameweek
X_test = df[(df["GW"] == CUT_OFF_GAMEWEEK) & (df["Season"] == SEASON_TO_PREDICT) ].reset_index(drop=True)
y_test = df[(df["GW"] == CUT_OFF_GAMEWEEK) & (df["Season"] == SEASON_TO_PREDICT) ].reset_index(drop=True)

In [38]:
# all remaining gameweeks
X_test_all_remaining = df[(df["GW"] >= CUT_OFF_GAMEWEEK) & (df["Season"] == SEASON_TO_PREDICT) ].reset_index(drop=True)
y_test_all_remaining = df[(df["GW"] >= CUT_OFF_GAMEWEEK) & (df["Season"] == SEASON_TO_PREDICT) ].reset_index(drop=True)

In [39]:
X_train.shape, X_test.shape, X_test_all_remaining.shape

((16151, 14), (273, 14), (2213, 14))

# Model

In [40]:
# model = GradientBoostingRegressor(random_state=42)
# model = XGBRegressor(
#         random_state=42,
#         n_estimators=500,
#         # early_stopping_rounds=5,
#         learning_rate=0.2
#     )
model = PositionalModelLinear(features, features, [], to_predict)
# model = RandomForestRegressor(random_state=42)

In [41]:
%%time
model.fit(
        X_train, 
        y_train,
        # eval_set=[(X_test_all_remaining, y_test_all_remaining)],
        # verbose=False
    )

CPU times: total: 15.6 ms
Wall time: 26.9 ms


In [42]:
features

['Was_home', 'Avg_FPL_points', 'RD_xP', 'xP_4']

In [43]:
model.model_GK.feature_names_in_

array(['Was_home', 'Avg_FPL_points', 'RD_xP', 'xP_4'], dtype=object)

In [44]:
model.model_GK.coef_

array([0.21035593, 0.32848757, 0.40240087, 0.27307862])

In [45]:
model.model_outfield.coef_

array([ 3.53329939e-01, -5.70087331e-05,  6.03362854e-01,  3.16152434e-01])

# Getting predictions

In [46]:
def get_predictions(model, df, X, all_remaining=False):
    # make predictions on the test data and glues them to the rest of the dataframe
    predictions = model.predict(X)
    if all_remaining:
        df_predictions = df[(df["GW"] >= CUT_OFF_GAMEWEEK) & (df["Season"] == SEASON_TO_PREDICT)].reset_index(drop=True)
    else:
        df_predictions = df[(df["GW"] == CUT_OFF_GAMEWEEK) & (df["Season"] == SEASON_TO_PREDICT)].reset_index(drop=True)
        
    df_predictions.loc[:, "Pred"] = predictions
    
    preds = df_predictions[info + [to_predict] + ["Pred"]].sort_values(by=["Pred"], ascending = False)
    
    return preds, predictions

In [47]:
preds, predictions = get_predictions(model, df, X_test)

In [48]:
preds_all_gameweeks, predictions_all = get_predictions(model, df, X_test_all_remaining, True)

# Custom metric
Such models are usually used to compare players and decide who to pick for upcoming gameweek(s). For each player pair in a subset of players model can either predict the outcome well (e.g. player A scores more than player B and model predicts exactly that) or predict wrongly (e.g. player A scores more than player B but model predicts player B > player A). Pairwise_accuracy returns the percentage of corrected predicted pairs. Pairwise_accuracy_topX is a variation of this metric calculated only for X highest scoring players of last two seasons.

In [49]:
def get_top_performer_names(df, no_top, no_gws):
    # takes no_top players that recorded highest average FPL points in no_gws last gameweeks
    return df.reset_index(drop=True).groupby("Name_original")[to_predict].mean().groupby("Name_original").tail(no_gws).sort_values(ascending=False).head(no_top).index.to_list()

In [50]:
def pairwise_accuracy(predicted_scores, true_scores):
    if len(predicted_scores) != len(true_scores):
        raise ValueError("The length of predicted_scores and true_scores must be the same.")

    num_pairs = 0
    num_correct_pairs = 0

    for i in range(len(predicted_scores)):
        for j in range(i + 1, len(predicted_scores)):
            # Check if the predicted order matches the true order
            if (predicted_scores[i] > predicted_scores[j] and true_scores[i] > true_scores[j]) or \
               (predicted_scores[i] < predicted_scores[j] and true_scores[i] < true_scores[j]):
                num_correct_pairs += 1
            num_pairs += 1

    pairwise_accuracy = num_correct_pairs / num_pairs
    return pairwise_accuracy

In [51]:
def pairwise_accuracy_topX(model, df, top_x, all_gw=False):
    # pairwise_accuracy for top_X players
    if all_gw:
        top_performers = df[(df["GW"] == CUT_OFF_GAMEWEEK) & (df["Season"] == SEASON_TO_PREDICT) & (df["Name_original"].isin(get_top_performer_names(df, top_x, 50)))]
    else:
        top_performers = df[(df["GW"] >= CUT_OFF_GAMEWEEK) & (df["Season"] == SEASON_TO_PREDICT) & (df["Name_original"].isin(get_top_performer_names(df, top_x, 50)))]

    X = top_performers.reset_index(drop=True)
    y = np.array(top_performers[to_predict].reset_index(drop=True))
    preds = model.predict(X)
    
    return  pairwise_accuracy(preds, y)

# Evaluation

In [52]:
def evaluate(model, df, predictions, y_true, all_gw=False):
    # function to calculate different metrics for a given model
    mae = mean_absolute_error(y_true[to_predict], predictions)
    mse = mean_squared_error(y_true[to_predict], predictions)

    pairwise_acc = pairwise_accuracy(np.array(y_true[to_predict]), predictions)
    pairwise_accuracy_top20 = pairwise_accuracy_topX(model, df, 20, all_gw)
    pairwise_accuracy_top100 = pairwise_accuracy_topX(model, df, 100, all_gw)
    
    print("MAE:", mae)
    print("MSE:", mse)
    print("Pairwise accuracy:", pairwise_acc)
    print("Pairwise accuracy @TOP100:", pairwise_accuracy_top100)
    print("Pairwise accuracy @TOP20:", pairwise_accuracy_top20)

In [53]:
# df[(df["GW"] == GAMEWEEK_TO_PREDICT) & (df["Season"] == SEASON_TO_PREDICT) & (df["Name"].isin(get_top_performer_names(df, 20, 50)))].columns

In [54]:
# next gameweek
evaluate(model, df, predictions, y_test, all_gw=False)

MAE: 1.442827538400599
MSE: 4.089328291913736
Pairwise accuracy: 0.6667205343675932
Pairwise accuracy @TOP100: 0.5529604163924134
Pairwise accuracy @TOP20: 0.5660182729148246


In [55]:
# all remaining gameweeks
evaluate(model, df, predictions_all, y_test_all_remaining, all_gw=True)

MAE: 1.5439999084159353
MSE: 4.570458707145452
Pairwise accuracy: 0.6402929753413374
Pairwise accuracy @TOP100: 0.5478740668614086
Pairwise accuracy @TOP20: 0.7166666666666667


In [56]:
pred_sum = preds_all_gameweeks["Pred"].sum()
fpl_sum = preds_all_gameweeks["FPL_points"].sum()
xp_sum = preds_all_gameweeks["xP"].sum()

pd.DataFrame([pred_sum, fpl_sum, xp_sum], index=["Pred", "FPL", "xP"], columns=["Sum"])

Unnamed: 0,Sum
Pred,5726.208765
FPL,5261.0
xP,5660.7


In [57]:
# model.predict(df[df["GW"] == GAMEWEEK_TO_PREDICT][df["Name"] == "Mohamed-Salah"][features])

In [58]:
features

['Was_home', 'Avg_FPL_points', 'RD_xP', 'xP_4']

In [59]:
len(features)

4

# Predictions - next gameweek only

In [60]:
preds.head(30)

Unnamed: 0,Name_original,GW,Season,Team,Opponent,Was_home,Team_rating,Opp_rating,FPL_pos,FPL_points,xP,Pred
83,Erling Haaland,31,2022-23,Manchester City,Leicester City,1.0,2060.531738,1722.042847,FWD,12,9.8,7.822601
197,Mohamed Salah,31,2022-23,Liverpool,Leeds United,0.0,1925.248169,1694.963013,MID,14,7.5,6.746698
152,Kevin De Bruyne,31,2022-23,Manchester City,Leicester City,1.0,2060.531738,1722.042847,MID,8,6.5,6.301136
180,Gabriel Martinelli Silva,31,2022-23,Arsenal,West Ham,0.0,1946.8479,1751.608521,MID,5,4.6,5.682778
110,Ivan Toney,31,2022-23,Brentford,Wolves,0.0,1782.324097,1717.951782,FWD,2,4.0,5.190204
243,Solly March,31,2022-23,Brighton,Chelsea,0.0,1826.634155,1827.184204,MID,5,2.8,5.075947
211,Ollie Watkins,31,2022-23,Aston Villa,Newcastle Utd,1.0,1801.51355,1876.32251,FWD,16,10.5,5.065446
37,Bukayo Saka,31,2022-23,Arsenal,West Ham,0.0,1946.8479,1751.608521,MID,0,7.0,4.878466
34,Bruno Borges Fernandes,31,2022-23,Manchester Utd,Nott'ham Forest,0.0,1878.940308,1635.660278,MID,3,8.1,4.816595
260,Tyrone Mings,31,2022-23,Aston Villa,Newcastle Utd,1.0,1801.51355,1876.32251,DEF,6,6.0,4.807041


In [61]:
# preds[preds["Team"] == "Manchester City"].head(20)

# Predictions - all remaining gameweeks

In [62]:
preds_all_gameweeks.head(15)

Unnamed: 0,Name_original,GW,Season,Team,Opponent,Was_home,Team_rating,Opp_rating,FPL_pos,FPL_points,xP,Pred
690,Erling Haaland,35,2022-23,Manchester City,Leeds United,1.0,2069.102051,1662.209595,FWD,2,6.0,9.098834
689,Erling Haaland,34,2022-23,Manchester City,Fulham,0.0,2064.563232,1717.714233,FWD,8,9.0,8.513004
691,Erling Haaland,36,2022-23,Manchester City,Everton,0.0,2071.091064,1694.975464,FWD,7,4.6,8.342109
1589,Mohamed Salah,37,2022-23,Liverpool,Aston Villa,1.0,1955.095703,1812.286377,MID,5,3.7,8.082093
1590,Mohamed Salah,38,2022-23,Liverpool,Southampton,0.0,1950.095459,1616.916626,MID,5,8.6,8.031799
692,Erling Haaland,37,2022-23,Manchester City,Chelsea,1.0,2087.472656,1794.31189,FWD,1,1.4,7.878728
687,Erling Haaland,31,2022-23,Manchester City,Leicester City,1.0,2060.531738,1722.042847,FWD,12,9.8,7.822601
688,Erling Haaland,33,2022-23,Manchester City,Arsenal,1.0,2057.953369,1928.582397,FWD,14,11.6,7.619056
1584,Mohamed Salah,32,2022-23,Liverpool,Nott'ham Forest,1.0,1926.870483,1623.869141,MID,7,4.3,7.388081
1588,Mohamed Salah,36,2022-23,Liverpool,Leicester City,0.0,1944.225098,1709.140259,MID,12,10.2,7.362218


In [63]:
# preds_all_gameweeks[preds_all_gameweeks["Name_original"] == "Mohamed Salah"].sort_values(by=["GW"])

In [64]:
# preds_all_gameweeks[preds_all_gameweeks["Name_original"] == "Erling Haaland"].sort_values(by=["GW"])

# Saving to file

In [65]:
preds_all_gameweeks.to_csv("predictions/all_predictions.csv", index=False)

In [66]:
pickle.dump(model, open("models/GBR.pkl", 'wb'))

# Future preds

In [67]:
df_upcoming = pd.read_csv("data/upcoming/upcoming_fixtures.csv")

Columns (2,3,6,36,43,44,45,59,84,86,147,208) have mixed types.Specify dtype option on import or set low_memory=False.


In [68]:
df_upcoming = add_linear_fixtures(df_upcoming)

In [69]:
def adjust_goalkeepers(df, df_upcoming):
    treshold = 30
    gk_last_season = pd.read_csv('data/misc/gk_last_season.csv')
    for gk in df_upcoming[df_upcoming["FPL_pos"] == "GK"]["Name_original"].unique():
        gk_games = df[df["Name_original"] == gk].shape[0]
        if gk_games < treshold:
            current_avg = df_upcoming[df_upcoming["Name_original"] == gk].tail(1)["Avg_FPL_points"].item()
            gk_team = df_upcoming[df_upcoming["Name_original"] == gk].tail(1)["Team"].item()
            if gk_last_season[gk_last_season["team"] == current_teams_to_fpl(gk_team)].shape[0] > 0:
                last_season_avg = gk_last_season[gk_last_season["team"] == current_teams_to_fpl(gk_team)]["total_points"].item()
            else:
                last_season_avg = 3.2 # default for newly-promoted teams
            df_upcoming.loc[df_upcoming["Name_original"] == gk, "Avg_FPL_points"] = ( current_avg * gk_games + last_season_avg * (30 - gk_games) ) / 30
            # print(gk, gk_games, gk_team, current_avg, last_season_avg, current_avg * gk_games)
    return df_upcoming

In [70]:
df_upcoming = adjust_goalkeepers(df, df_upcoming)

In [93]:
# df_upcoming[df_upcoming["FPL_pos"] == "GK"].groupby(["Name_original"]).tail(1)[features + ["Name_original"]]

In [72]:
df_upcoming = df_upcoming[~df_upcoming[features].isnull().any(axis=1)] # for some reason 45 rows are missing some feature values

In [73]:
df_upcoming = df_upcoming[(df_upcoming["Season"] == CURRENT_SEASON) & (df_upcoming["GW"] >= NEXT_GAMEWEEK)].reset_index(drop=True)

In [74]:
X_test = df_upcoming

In [75]:
def adjust_for_injuries(df):
    active_players = pd.read_csv("data/misc/active_players.csv")
    injured_players = active_players[active_players["chance_of_playing_next_round"] == 0]["name"].to_list()
    df.loc[df["Name_original"].isin(injured_players),"Pred"] = 0
    return df

In [76]:
def adjust_goalkeepers(df):
    gks = pd.read_csv("data/misc/goalkeepers.csv")
    gk_names = gks["Name"].to_list()
    gk_names = [name_fbref_to_fpl(neutralize_name(n)) for n in gk_names]
    # print(gk_names)
    df.loc[(df["FPL_pos"] == "GK") & ~(df["Name_original"].apply(neutralize_name).isin(gk_names)), "Pred"] = 0
    return df

In [77]:
def get_predictions(model, df, X, all_remaining=False):
    # make predictions on the test data and glues them to the rest of the dataframe
    predictions = model.predict(X)
    df_predictions = df[(df["Season"] == CURRENT_SEASON)].reset_index(drop=True)
        
    df_predictions.loc[:, "Pred"] = predictions
    preds = df_predictions[info + ["Pred"]]
    
    preds = adjust_goalkeepers(preds)
    preds = adjust_for_injuries(preds).sort_values(by=["Pred"], ascending = False)
    
    return preds

In [78]:
# df_upcoming[df_upcoming["Name_original"] == "Erling Haaland"][["Name_original", "Was_home", 'Avg_FPL_points_venue']]

In [79]:
preds = get_predictions(model, df_upcoming, X_test)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [80]:
# next gameweek
preds[preds["GW"] == NEXT_GAMEWEEK].head(30)

Unnamed: 0,Name_original,GW,Season,Team,Opponent,Was_home,Team_rating,Opp_rating,FPL_pos,FPL_points,Pred
4610,Erling Haaland,4.0,2023-24,Manchester City,Fulham,1.0,2083.702148,1732.639404,FWD,,7.462574
11201,Mohamed Salah,4.0,2023-24,Liverpool,Aston Villa,1.0,1946.521606,1825.741455,MID,,6.876028
2096,Bryan Mbeumo,4.0,2023-24,Brentford,Bournemouth,1.0,1835.573242,1660.290894,MID,,6.034037
2410,Carlton Morris,4.0,2023-24,Luton,West Ham,1.0,1602.477417,1787.825562,FWD,,5.843293
12459,Phil Foden,4.0,2023-24,Manchester City,Fulham,1.0,2083.702148,1732.639404,MID,,5.609241
2026,Bruno Borges Fernandes,4.0,2023-24,Manchester Utd,Arsenal,0.0,1865.755249,1927.032104,MID,,5.495123
5412,Guglielmo Vicario,4.0,2023-24,Tottenham,Burnley,0.0,1829.239014,1722.952881,GK,,5.464136
6880,Jarrod Bowen,4.0,2023-24,West Ham,Luton,0.0,1787.825562,1602.477417,MID,,5.217698
2130,Bukayo Saka,4.0,2023-24,Arsenal,Manchester Utd,1.0,1927.032104,1865.755249,MID,,5.006219
14059,Solly March,4.0,2023-24,Brighton,Newcastle Utd,1.0,1843.758179,1884.512817,MID,,4.94064


In [94]:
preds[(preds["GW"] == NEXT_GAMEWEEK) & (preds["FPL_pos"] == "GK")]

Unnamed: 0,Name_original,GW,Season,Team,Opponent,Was_home,Team_rating,Opp_rating,FPL_pos,FPL_points,Pred
5412,Guglielmo Vicario,4.0,2023-24,Tottenham,Burnley,0.0,1829.239014,1722.952881,GK,,5.464136
594,Alisson Ramses Becker,4.0,2023-24,Liverpool,Aston Villa,1.0,1946.521606,1825.741455,GK,,4.642424
1014,André Onana,4.0,2023-24,Manchester Utd,Arsenal,0.0,1865.755249,1927.032104,GK,,4.184632
4122,Ederson Santana de Moraes,4.0,2023-24,Manchester City,Fulham,1.0,2083.702148,1732.639404,GK,,4.136071
104,Aaron Ramsdale,4.0,2023-24,Arsenal,Manchester Utd,1.0,1927.032104,1865.755249,GK,,3.957278
11724,Nick Pope,4.0,2023-24,Newcastle Utd,Brighton,0.0,1884.512817,1843.758179,GK,,3.698695
13085,Robert Sánchez,4.0,2023-24,Chelsea,Nott'ham Forest,1.0,1781.043579,1677.87793,GK,,3.289579
11829,Norberto Murara Neto,4.0,2023-24,Bournemouth,Brentford,0.0,1660.290894,1835.573242,GK,,3.286968
4470,Emiliano Martínez Romero,4.0,2023-24,Aston Villa,Liverpool,0.0,1825.741455,1946.521606,GK,,3.033757
7859,Jordan Pickford,4.0,2023-24,Everton,Sheffield United,0.0,1691.463989,1631.764282,GK,,2.989544


In [82]:
n_gameweeks = 5
pivot = pd.pivot_table(preds[preds["GW"] < NEXT_GAMEWEEK + n_gameweeks], values='Pred', index=['Name_original', "FPL_pos"],
                       columns=['GW'], aggfunc=np.sum)
pivot['Summary'] = pivot.sum(axis=1)
pivot.columns = ['GW' + str(col).split(".")[0] for col in pivot.columns[:-1]] + ['Summary']
pivot = pivot.sort_values(by=["Summary"], ascending = False)
pivot.head(20)

Unnamed: 0_level_0,Unnamed: 1_level_0,GW4,GW5,GW6,GW7,GW8,Summary
Name_original,FPL_pos,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Erling Haaland,FWD,7.462574,6.94212,7.639279,7.198507,6.563077,35.805557
Mohamed Salah,MID,6.876028,6.835064,6.969192,6.514299,6.479772,33.674355
Bruno Borges Fernandes,MID,5.495123,5.975099,5.827255,6.116795,5.988167,29.402439
Carlton Morris,FWD,5.843293,5.576089,5.97542,5.644011,5.782074,28.820887
Bryan Mbeumo,MID,6.034037,5.377666,5.987098,5.654011,5.400226,28.453038
Guglielmo Vicario,GK,5.464136,5.825699,5.177587,5.363721,5.667556,27.498699
Phil Foden,MID,5.609241,5.160151,5.71049,5.307057,4.942964,26.729903
Jarrod Bowen,MID,5.217698,4.815927,4.639805,5.512346,5.081704,25.26748
Bukayo Saka,MID,5.006219,4.979646,5.069522,5.04532,4.674533,24.77524
Solly March,MID,4.94064,4.61583,5.323747,4.678627,4.85027,24.409115


In [83]:
pivot[pivot.index.get_level_values('FPL_pos') == 'GK']

Unnamed: 0_level_0,Unnamed: 1_level_0,GW4,GW5,GW6,GW7,GW8,Summary
Name_original,FPL_pos,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Guglielmo Vicario,GK,5.464136,5.825699,5.177587,5.363721,5.667556,27.498699
Alisson Ramses Becker,GK,4.642424,4.571353,4.683966,4.428322,4.412927,22.738992
André Onana,GK,4.184632,4.459509,4.35384,4.531698,4.466167,21.995846
Ederson Santana de Moraes,GK,4.136071,3.870176,4.194793,3.955378,3.744213,19.900631
Aaron Ramsdale,GK,3.957278,3.906546,3.988202,3.938628,3.795246,19.5859
Nick Pope,GK,3.698695,3.916146,3.905401,4.020609,3.748472,19.289323
Norberto Murara Neto,GK,3.286968,3.533457,3.281729,3.441313,3.387516,16.930983
Emiliano Martínez Romero,GK,3.033757,3.392921,3.163237,3.321787,3.231599,16.143302
Robert Sánchez,GK,3.289579,3.094305,3.174267,3.034222,3.041974,15.634348
Jordan Pickford,GK,2.989544,2.991968,2.838868,3.224701,3.176583,15.221664


In [84]:
# points all season
pd.DataFrame(preds.groupby(["Name_original"])["Pred"].sum().sort_values(ascending=False)).head(30)

Unnamed: 0_level_0,Pred
Name_original,Unnamed: 1_level_1
Erling Haaland,243.872545
Mohamed Salah,239.060067
Bruno Borges Fernandes,206.319929
Carlton Morris,198.181536
Bryan Mbeumo,193.658605
Guglielmo Vicario,192.006134
Phil Foden,182.502514
Jarrod Bowen,177.324234
Bukayo Saka,174.198763
Solly March,171.984733


In [85]:
preds.to_csv("predictions/preds_next_season.csv", index=False)

# Feature importance and influence

In [86]:
# explainer = shap.Explainer(model.model_outfield.predict, X_test[2000:3000])
# shap_values = explainer(X_test[2000:3000])

In [87]:
# shap.plots.bar(shap_values, max_display=15)

In [88]:
# shap.plots.beeswarm(shap_values, max_display=15)

In [89]:
# # explaining Erling Haaland`s score in gameweek 31
# shap.plots.bar(shap_values[69], max_display=15)

In [90]:
# shap.plots.waterfall(shap_values[410])

In [91]:
# shap.plots.waterfall(shap_values[26])

In [92]:
# shap.plots.waterfall(shap_values[96])