
# Imports

In [1]:
import os
os.chdir("D:\PulpitE\FPL_ML")

In [2]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import shap
import pickle

from sklearn.ensemble import GradientBoostingRegressor
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression

from sklearn.metrics import mean_absolute_error, mean_squared_error

from sklearn.preprocessing import LabelEncoder

from PositionalModelLinear import PositionalModelLinear

from config import *
from src.match_names import name_fbref_to_fpl, neutralize_name

# Loading data
Data consists of all PL players logs for 2021-22 and 2022-23 seasons\
A single log is just a summary of player performance in a particular match

In [3]:
# loading csv
df_original = pd.read_csv('data/previous/FPL_logs.csv')

df = df_original.copy()

In [4]:
df_original.columns

Index(['Date', 'Day', 'Venue', 'Team', 'Opponent', 'Name', 'Start', 'Pos',
       'Min', 'Gls',
       ...
       'xG_team_15', 'xGA_team_15', 'xG_opp_15', 'xGA_opp_15', 'xG_team_30',
       'xGA_team_30', 'xG_opp_30', 'xGA_opp_30', 'xG_diff', 'Avg_FPL_points'],
      dtype='object', length=114)

In [5]:
df_original["Name"].unique().size

544

In [6]:
df.shape

(19208, 114)

In [7]:
df["Season"].unique()

array(['2021-22', '2022-23', '2023-24'], dtype=object)

In [8]:
# df = df[~df["FPL_pos"].isin(["GK"])]

In [9]:
df["Season"].unique()

array(['2021-22', '2022-23', '2023-24'], dtype=object)

In [10]:
# df = df[df["Season"].isin(['2021-22', '2022-23'])]
allowed_pos = ['FWD', "MID", "DEF", "GK"]
df = df[df["FPL_pos"].isin(allowed_pos)]

In [11]:
df = df.sort_values(by=["Name", "Date"])

# Feature selection
Model will use a set of standard features and moving averages for some other feature values. In this case we will use moving averages for last 2, 4 and 30 gameweeks.

In [12]:
rolling_gameweeks = [4]
# rolling_gameweeks = [5]

# to_predict = "FPL_points"
to_predict = "xP"
standard_features = [
             "Was_home", # Home/Away
             # "Team_rating", "Opp_rating", 
             # "Rating_difference", # Team ratings
             # "Price", # FPl price
             # "Transfers_balance",
             # "Transfers_result"
             # "Avg_FPL_points",
             # "Avg_xP",
             # 'Avg_FPL_points_venue', 
             # "Was_home_xP",
             "RD_xP",
             # "Team_xP",
             # "Opp_xP"
             "xG_team_4",
             # "xGA_team_4",
             # "xG_opp_4",
             # "xGA_opp_4"
             ]
features_to_roll = [
                # "Min", 
                # "Start", # time played
                # 'Gls', 
                # 'Sh', 
                # 'SoT', # Goals
                # 'Ast', # Assists
                # 'CrdY', 'CrdR', # Cards
                # "xG", 'xA', # Expected
                # 'Team_CS', # Defence
                # 'Team_score', 'Opp_score', 'Team_result', # Team form
                # "xGPoints", "CSPoints", # Position-scaled
                # "Cmp%", "PrgP", "PrgC", "T_succ",
                # 'bonus', 'bps', # Bonus
                # 'ICT_index', # ICT
                # "FPL_points", 
                "xP"
                # "Baseline_points", "Bonus" # FPL points
            ]
info = ["Name_original", "GW", "Season", "Team", "Opponent", "Was_home", "Team_rating", "Opp_rating", "Price", "FPL_pos", "FPL_points"]

In [13]:
# df[df["Name"] == "Erling-Haaland"]

# Feature engineering

In [14]:
df["Rating_difference"] = df["Team_rating"] / df["Opp_rating"]
df["Baseline_points"] = df["FPL_points"] - df["Bonus"]
df["Transfers_result"] = df["Transfers_balance"] >= 0

In [15]:
def add_linear_fixtures(df):
    RD_coef_gk = 0.08
    RD_coef_outfield = 0.37
    df["Was_home_xP"] = df["Avg_FPL_points"] * df["Was_home"]
    df.loc[df["FPL_pos"] == "GK", "RD_xP"] = df["Avg_FPL_points"] * ( 1 + (df["Rating_difference"] - 1) * RD_coef_gk )
    df.loc[df["FPL_pos"] != "GK", "RD_xP"] = df["Avg_FPL_points"] * ( 1 + (df["Rating_difference"] - 1) * RD_coef_outfield )
    df["Team_xP"] = df["Avg_FPL_points"] * df["Team_rating"]
    df["Opp_xP"] = df["Avg_FPL_points"] * df["Opp_rating"]
    df['Avg_xP'] = df.groupby('Name_original')['xP'].transform('mean')
    df.loc[df["Was_home"] == True, 'Avg_FPL_points_venue'] = df[df["Was_home"] == True].groupby('Name_original')['FPL_points'].transform('mean')
    df.loc[df["Was_home"] == False, 'Avg_FPL_points_venue'] = df[df["Was_home"] == False].groupby('Name_original')['FPL_points'].transform('mean')
    
    return df

In [16]:
df = add_linear_fixtures(df)

In [17]:
def calculate_team_points(row):
    # win - 3 points
    if row['Team_score'] > row['Opp_score']:
        return 3
    # draw - 1 point
    elif row['Team_score'] == row['Opp_score']:
        return 1
    # loss - 0 points
    else:
        return 0
    
df['Team_result'] = df.apply(calculate_team_points, axis=1)

In [18]:
def add_rolling_features(df, standard_features, features_to_roll):
    features = standard_features
    
    for r in rolling_gameweeks:
        form_means = df.groupby(["Name"])[features_to_roll].rolling(r, min_periods=1).mean().groupby(["Name"]).shift(1).reset_index()
        # print(form_means[form_means["Name"] == "Erling-Haaland"])
        form_means = form_means.fillna(method='bfill') # slightly incorrect, better to drop Nan
        form_means.columns = [f'{col}{"_"}{r}' for col in form_means.columns]
        features += form_means.columns.tolist()
        features = list(filter(lambda x: x not in ["Name_" + str(r)], features))
        df = pd.concat([df.reset_index(), form_means], axis=1)
        # df = df.merge(form_means, left_index=True, right_index=True)
        df = df.drop([col for col in df.columns if col.startswith('level')], axis=1)
        
    return df.reset_index(), features

In [19]:
def ohe(df, ohe_columns, features):
    # one hot encoding
    for c in ohe_columns:
        ohe_c = pd.get_dummies(df[c], dtype="int64")
        df = pd.concat([df, ohe_c], axis=1)

        features += ohe_c.columns.tolist()
        
    return df, features

In [20]:
def label_encoding(df, column_to_encode):
    
    mapping_dict = {
        'Y': 1, # Starting eleven
        'Y*': 1, # Starting eleven as captain
        'N': 0, # Not in starting eleven
    }
    
    df[column_to_encode] = df[column_to_encode].map(mapping_dict)
    
    return df

In [21]:
df = label_encoding(df, "Start")

In [22]:
df, features = add_rolling_features(df, standard_features, features_to_roll)

In [23]:
# df, features = ohe(df, ["FPL_pos"], features)
# if "GK" in df.columns:
#     df = df.drop("GK", axis=1)
#     features.remove("GK")

In [24]:
# dropping unwanted columns
features = [col for col in features if not col.startswith('level')]
df = df[np.unique(features + info + [to_predict])]

In [25]:
df = df[pd.to_numeric(df["GW"], errors="coerce").notna()]
df["GW"] = df["GW"].astype("uint64")

In [26]:
# dropping NaNs
df = df.dropna(axis=0)

In [27]:
# df = df.set_index(["Name_original", "Season", "GW"], drop=False)

In [28]:
df.columns

Index(['FPL_points', 'FPL_pos', 'GW', 'Name_original', 'Opp_rating',
       'Opponent', 'Price', 'RD_xP', 'Season', 'Team', 'Team_rating',
       'Was_home', 'xG_team_4', 'xP', 'xP_4'],
      dtype='object')

In [29]:
df.shape

(18803, 15)

In [30]:
# df = df[~df["FPL_pos"].isin(["GK"])] # only outfield players

In [31]:
df.shape

(18803, 15)

# Data split into train and valid
Let's take entire 2021-22 season and 30 first gameweeks of 2022-23 season as training data and 8 last gameweeks of that season as valid data (~10% of all rows).

In [32]:
def shrink_df_to_top_players(df, n_players, min_fixtures):
    # Calculate average scores
    average_scores = df.groupby('Name_original')[to_predict].mean()
    
    # Calculate player counts
    player_counts = df['Name_original'].value_counts()
    
    # Filter out players with less than min_fixtures occurrences
    popular_players = player_counts[player_counts >= min_fixtures].index
    
    print("Unique players with min_fixtures:", popular_players.size)

    # Sort players by average scores
    sorted_players = average_scores.sort_values(ascending=False)

    # Get the top n_players players
    top_popular_players = sorted_players[sorted_players.index.isin(popular_players)].head(n_players)

    # Filter the original DataFrame
    top_players_df = df[df['Name_original'].isin(top_popular_players.index)]
    
    print("Unique players left:", top_players_df['Name_original'].unique().size)
    
    return top_players_df

In [33]:
CUT_OFF_GAMEWEEK = 31
SEASON_TO_PREDICT = "2022-23"

In [34]:
df_train = shrink_df_to_top_players(df, 550, 10)
# df_train["Name_original"].unique()

Unique players with min_fixtures: 391
Unique players left: 391


In [35]:
df = df.set_index(["Name_original", "Season", "GW"], drop=False)

In [36]:
# training data
X_train = df_train[((df_train["Season"] < SEASON_TO_PREDICT) | (df_train["GW"] <= CUT_OFF_GAMEWEEK - 1))].reset_index(drop=True)
y_train = df_train[((df_train["Season"] < SEASON_TO_PREDICT) | (df_train["GW"] <= CUT_OFF_GAMEWEEK - 1))].reset_index(drop=True)

In [37]:
# only 31st gameweek
X_test = df[(df["GW"] == CUT_OFF_GAMEWEEK) & (df["Season"] == SEASON_TO_PREDICT) ].reset_index(drop=True)
y_test = df[(df["GW"] == CUT_OFF_GAMEWEEK) & (df["Season"] == SEASON_TO_PREDICT) ].reset_index(drop=True)

In [38]:
# all remaining gameweeks
X_test_all_remaining = df[(df["GW"] >= CUT_OFF_GAMEWEEK) & (df["Season"] == SEASON_TO_PREDICT) ].reset_index(drop=True)
y_test_all_remaining = df[(df["GW"] >= CUT_OFF_GAMEWEEK) & (df["Season"] == SEASON_TO_PREDICT) ].reset_index(drop=True)

In [39]:
X_train.shape, X_test.shape, X_test_all_remaining.shape

((16189, 15), (273, 15), (2213, 15))

# Model

In [40]:
# model = GradientBoostingRegressor(random_state=42)
# model = XGBRegressor(
#         random_state=42,
#         n_estimators=500,
#         # early_stopping_rounds=5,
#         learning_rate=0.2
#     )
model = PositionalModelLinear(features, features, [], to_predict)
# model = RandomForestRegressor(random_state=42)

In [41]:
%%time
model.fit(
        X_train, 
        y_train,
        # eval_set=[(X_test_all_remaining, y_test_all_remaining)],
        # verbose=False
    )

CPU times: total: 31.2 ms
Wall time: 26.9 ms


In [42]:
features

['Was_home', 'RD_xP', 'xG_team_4', 'xP_4']

In [43]:
# model.model_GK.feature_names_in_

In [44]:
# model.model_GK.coef_

In [45]:
# model.model_outfield.coef_

# Getting predictions

In [46]:
def get_predictions(model, df, X, all_remaining=False):
    # make predictions on the test data and glues them to the rest of the dataframe
    predictions = model.predict(X)
    if all_remaining:
        df_predictions = df[(df["GW"] >= CUT_OFF_GAMEWEEK) & (df["Season"] == SEASON_TO_PREDICT)].reset_index(drop=True)
    else:
        df_predictions = df[(df["GW"] == CUT_OFF_GAMEWEEK) & (df["Season"] == SEASON_TO_PREDICT)].reset_index(drop=True)
        
    df_predictions.loc[:, "Pred"] = predictions
    
    preds = df_predictions[info + [to_predict] + ["Pred"]].sort_values(by=["Pred"], ascending = False)
    
    return preds, predictions

In [47]:
preds, predictions = get_predictions(model, df, X_test)

In [48]:
preds_all_gameweeks, predictions_all = get_predictions(model, df, X_test_all_remaining, True)

# Custom metric
Such models are usually used to compare players and decide who to pick for upcoming gameweek(s). For each player pair in a subset of players model can either predict the outcome well (e.g. player A scores more than player B and model predicts exactly that) or predict wrongly (e.g. player A scores more than player B but model predicts player B > player A). Pairwise_accuracy returns the percentage of corrected predicted pairs. Pairwise_accuracy_topX is a variation of this metric calculated only for X highest scoring players of last two seasons.

In [49]:
def get_top_performer_names(df, no_top, no_gws):
    # takes no_top players that recorded highest average FPL points in no_gws last gameweeks
    return df.reset_index(drop=True).groupby("Name_original")[to_predict].mean().groupby("Name_original").tail(no_gws).sort_values(ascending=False).head(no_top).index.to_list()

In [50]:
def pairwise_accuracy(predicted_scores, true_scores):
    if len(predicted_scores) != len(true_scores):
        raise ValueError("The length of predicted_scores and true_scores must be the same.")

    num_pairs = 0
    num_correct_pairs = 0

    for i in range(len(predicted_scores)):
        for j in range(i + 1, len(predicted_scores)):
            # Check if the predicted order matches the true order
            if (predicted_scores[i] > predicted_scores[j] and true_scores[i] > true_scores[j]) or \
               (predicted_scores[i] < predicted_scores[j] and true_scores[i] < true_scores[j]):
                num_correct_pairs += 1
            num_pairs += 1

    pairwise_accuracy = num_correct_pairs / num_pairs
    return pairwise_accuracy

In [51]:
def pairwise_accuracy_topX(model, df, top_x, all_gw=False):
    # pairwise_accuracy for top_X players
    if all_gw:
        top_performers = df[(df["GW"] == CUT_OFF_GAMEWEEK) & (df["Season"] == SEASON_TO_PREDICT) & (df["Name_original"].isin(get_top_performer_names(df, top_x, 50)))]
    else:
        top_performers = df[(df["GW"] >= CUT_OFF_GAMEWEEK) & (df["Season"] == SEASON_TO_PREDICT) & (df["Name_original"].isin(get_top_performer_names(df, top_x, 50)))]

    X = top_performers.reset_index(drop=True)
    y = np.array(top_performers[to_predict].reset_index(drop=True))
    preds = model.predict(X)
    
    return  pairwise_accuracy(preds, y)

# Evaluation

In [52]:
def evaluate(model, df, predictions, y_true, all_gw=False):
    # function to calculate different metrics for a given model
    mae = mean_absolute_error(y_true[to_predict], predictions)
    mse = mean_squared_error(y_true[to_predict], predictions)

    pairwise_acc = pairwise_accuracy(np.array(y_true[to_predict]), predictions)
    pairwise_accuracy_top20 = pairwise_accuracy_topX(model, df, 20, all_gw)
    pairwise_accuracy_top100 = pairwise_accuracy_topX(model, df, 100, all_gw)
    
    print("MAE:", mae)
    print("MSE:", mse)
    print("Pairwise accuracy:", pairwise_acc)
    print("Pairwise accuracy @TOP100:", pairwise_accuracy_top100)
    print("Pairwise accuracy @TOP20:", pairwise_accuracy_top20)

In [53]:
# df[(df["GW"] == GAMEWEEK_TO_PREDICT) & (df["Season"] == SEASON_TO_PREDICT) & (df["Name"].isin(get_top_performer_names(df, 20, 50)))].columns

In [54]:
# next gameweek
# evaluate(model, df, predictions, y_test, all_gw=False)

In [55]:
# all remaining gameweeks
evaluate(model, df, predictions_all, y_test_all_remaining, all_gw=True)

MAE: 1.532341090639905
MSE: 4.550491202979937
Pairwise accuracy: 0.6420367399935774
Pairwise accuracy @TOP100: 0.5459459459459459
Pairwise accuracy @TOP20: 0.7272727272727273


In [56]:
pred_sum = preds_all_gameweeks["Pred"].sum()
fpl_sum = preds_all_gameweeks["FPL_points"].sum()
xp_sum = preds_all_gameweeks["xP"].sum()

pd.DataFrame([pred_sum, fpl_sum, xp_sum], index=["Pred", "FPL", "xP"], columns=["Sum"])

Unnamed: 0,Sum
Pred,5651.22661
FPL,5261.0
xP,5660.7


In [57]:
# model.predict(df[df["GW"] == GAMEWEEK_TO_PREDICT][df["Name"] == "Mohamed-Salah"][features])

In [58]:
features

['Was_home', 'RD_xP', 'xG_team_4', 'xP_4']

In [59]:
len(features)

4

# Predictions - next gameweek only

In [60]:
preds.head(30)

Unnamed: 0,Name_original,GW,Season,Team,Opponent,Was_home,Team_rating,Opp_rating,Price,FPL_pos,FPL_points,xP,Pred
83,Erling Haaland,31,2022-23,Manchester City,Leicester City,1.0,2060.531738,1722.042847,12.2,FWD,12,9.8,7.640441
197,Mohamed Salah,31,2022-23,Liverpool,Leeds United,0.0,1925.248169,1694.963013,12.8,MID,14,7.5,6.633067
152,Kevin De Bruyne,31,2022-23,Manchester City,Leicester City,1.0,2060.531738,1722.042847,12.1,MID,8,6.5,5.899091
180,Gabriel Martinelli Silva,31,2022-23,Arsenal,West Ham,0.0,1946.8479,1751.608398,6.8,MID,5,4.6,5.47378
110,Ivan Toney,31,2022-23,Brentford,Wolves,0.0,1782.324097,1717.951904,7.8,FWD,2,4.0,5.337934
211,Ollie Watkins,31,2022-23,Aston Villa,Newcastle Utd,1.0,1801.513672,1876.32251,7.6,FWD,16,10.5,5.142981
243,Solly March,31,2022-23,Brighton,Chelsea,0.0,1826.634033,1827.184204,5.2,MID,5,2.8,5.036337
244,Son Heung-min,31,2022-23,Tottenham,Bournemouth,1.0,1851.721558,1665.276123,11.6,MID,9,8.2,4.930364
260,Tyrone Mings,31,2022-23,Aston Villa,Newcastle Utd,1.0,1801.513672,1876.32251,4.5,DEF,6,6.0,4.816047
124,Jarrod Bowen,31,2022-23,West Ham,Arsenal,1.0,1751.608398,1946.8479,8.0,MID,7,3.0,4.769841


In [61]:
# preds[preds["Team"] == "Manchester City"].head(20)

# Predictions - all remaining gameweeks

In [62]:
preds_all_gameweeks.head(15)

Unnamed: 0,Name_original,GW,Season,Team,Opponent,Was_home,Team_rating,Opp_rating,Price,FPL_pos,FPL_points,xP,Pred
690,Erling Haaland,35,2022-23,Manchester City,Leeds United,1.0,2069.102051,1662.209595,12.4,FWD,2,6.0,8.767133
689,Erling Haaland,34,2022-23,Manchester City,Fulham,0.0,2064.563232,1717.714355,12.4,FWD,8,9.0,8.268556
691,Erling Haaland,36,2022-23,Manchester City,Everton,0.0,2071.091064,1694.975342,12.4,FWD,7,4.6,8.036137
1589,Mohamed Salah,37,2022-23,Liverpool,Aston Villa,1.0,1955.095703,1812.286499,13.1,MID,5,3.7,7.991993
692,Erling Haaland,37,2022-23,Manchester City,Chelsea,1.0,2087.472656,1794.312012,12.4,FWD,1,1.4,7.79113
1590,Mohamed Salah,38,2022-23,Liverpool,Southampton,0.0,1950.095581,1616.916504,13.1,MID,5,8.6,7.741121
688,Erling Haaland,33,2022-23,Manchester City,Arsenal,1.0,2057.953369,1928.582397,12.3,FWD,14,11.6,7.73343
687,Erling Haaland,31,2022-23,Manchester City,Leicester City,1.0,2060.531738,1722.042847,12.2,FWD,12,9.8,7.640441
1588,Mohamed Salah,36,2022-23,Liverpool,Leicester City,0.0,1944.225098,1709.140259,13.1,MID,12,10.2,7.183079
1584,Mohamed Salah,32,2022-23,Liverpool,Nott'ham Forest,1.0,1926.870483,1623.869141,12.9,MID,7,4.3,7.105369


In [63]:
# preds_all_gameweeks[preds_all_gameweeks["Name_original"] == "Mohamed Salah"].sort_values(by=["GW"])

In [64]:
# preds_all_gameweeks[preds_all_gameweeks["Name_original"] == "Erling Haaland"].sort_values(by=["GW"])

# Saving to file

In [65]:
preds_all_gameweeks.to_csv("predictions/all_predictions.csv", index=False)

In [66]:
# pickle.dump(model, open("models/GBR.pkl", 'wb'))

# Future preds

In [67]:
df_upcoming = pd.read_csv("data/upcoming/upcoming_fixtures.csv")

Columns (2,3,6,36,43,44,45,59,117,119,180,241) have mixed types.Specify dtype option on import or set low_memory=False.


In [68]:
df_upcoming.shape

(37410, 307)

In [69]:
df_upcoming = add_linear_fixtures(df_upcoming)

In [70]:
df_upcoming.shape

(37410, 313)

In [71]:
def adjust_goalkeepers(df, df_upcoming):
    treshold = 30
    gk_last_season = pd.read_csv('data/misc/gk_last_season.csv')
    for gk in df_upcoming[df_upcoming["FPL_pos"] == "GK"]["Name_original"].unique():
        gk_games = df[df["Name_original"] == gk].shape[0]
        if gk_games < treshold:
            current_avg = df_upcoming[df_upcoming["Name_original"] == gk].tail(1)["Avg_FPL_points"].item()
            gk_team = df_upcoming[df_upcoming["Name_original"] == gk].tail(1)["Team"].item()
            if gk_last_season[gk_last_season["team"] == current_teams_to_fpl(gk_team)].shape[0] > 0:
                last_season_avg = gk_last_season[gk_last_season["team"] == current_teams_to_fpl(gk_team)]["total_points"].item()
            else:
                last_season_avg = 3.2 # default for newly-promoted teams
            df_upcoming.loc[df_upcoming["Name_original"] == gk, "Avg_FPL_points"] = ( current_avg * gk_games + last_season_avg * (30 - gk_games) ) / 30
            # print(gk, gk_games, gk_team, current_avg, last_season_avg, current_avg * gk_games)
    return df_upcoming

In [72]:
df_upcoming = adjust_goalkeepers(df, df_upcoming)

In [73]:
# df_upcoming[df_upcoming["FPL_pos"] == "GK"].groupby(["Name_original"]).tail(1)[features + ["Name_original"]]

In [74]:
df_upcoming = df_upcoming[~df_upcoming[features].isnull().any(axis=1)] # for some reason 45 rows are missing some feature values

In [75]:
df_upcoming.tail(5)

Unnamed: 0.1,Unnamed: 0,Date,Day,Venue,Team,Opponent,Name,Start,Pos,Min,...,FWD,GK,MID,Finished,Was_home_xP,RD_xP,Team_xP,Opp_xP,Avg_xP,Avg_FPL_points_venue
37405,2886,2024-04-20,,,Burnley,Sheffield United,,,FW,,...,1.0,0.0,0.0,False,0.0,1.693973,2838.312988,2717.960612,2.0,
37406,2887,2024-04-27,,,Burnley,Manchester Utd,,,FW,,...,1.0,0.0,0.0,False,0.0,1.614323,2838.312988,3101.582845,2.0,
37407,2758,2024-05-04,,,Burnley,Newcastle Utd,,,FW,,...,1.0,0.0,0.0,False,1.666667,1.612885,2838.312988,3109.506226,2.0,1.666667
37408,2888,2024-05-11,,,Burnley,Tottenham,,,FW,,...,1.0,0.0,0.0,False,0.0,1.618602,2838.312988,3078.240763,2.0,
37409,2759,2024-05-19,,,Burnley,Nott'ham Forest,,,FW,,...,1.0,0.0,0.0,False,1.666667,1.672378,2838.312988,2812.266032,2.0,1.666667


In [76]:
df_upcoming = df_upcoming[(df_upcoming["Season"] == CURRENT_SEASON) & (df_upcoming["GW"] >= NEXT_GAMEWEEK)].reset_index(drop=True)

In [77]:
df_upcoming.shape

(16946, 313)

In [78]:
df_upcoming

Unnamed: 0.1,Unnamed: 0,Date,Day,Venue,Team,Opponent,Name,Start,Pos,Min,...,FWD,GK,MID,Finished,Was_home_xP,RD_xP,Team_xP,Opp_xP,Avg_xP,Avg_FPL_points_venue
0,1953,2023-09-16,,,Brighton,Manchester Utd,,,FW,,...,1.0,0.0,0.0,False,0.000000,0.553218,1022.104763,1033.860948,0.722222,0.666667
1,1928,2023-09-24,,,Brighton,Bournemouth,,,FW,,...,1.0,0.0,0.0,False,0.555556,0.577955,1022.104763,921.671482,0.722222,0.500000
2,1954,2023-09-30,,,Brighton,Aston Villa,,,FW,,...,1.0,0.0,0.0,False,0.000000,0.556809,1022.104763,1015.911933,0.722222,0.666667
3,1929,2023-10-08,,,Brighton,Liverpool,,,FW,,...,1.0,0.0,0.0,False,0.555556,0.542817,1022.104763,1089.631890,0.722222,0.500000
4,1955,2023-10-21,,,Brighton,Manchester City,,,FW,,...,1.0,0.0,0.0,False,0.000000,0.531176,1022.104763,1159.640978,0.722222,0.666667
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16941,2886,2024-04-20,,,Burnley,Sheffield United,,,FW,,...,1.0,0.0,0.0,False,0.000000,1.693973,2838.312988,2717.960612,2.000000,
16942,2887,2024-04-27,,,Burnley,Manchester Utd,,,FW,,...,1.0,0.0,0.0,False,0.000000,1.614323,2838.312988,3101.582845,2.000000,
16943,2758,2024-05-04,,,Burnley,Newcastle Utd,,,FW,,...,1.0,0.0,0.0,False,1.666667,1.612885,2838.312988,3109.506226,2.000000,1.666667
16944,2888,2024-05-11,,,Burnley,Tottenham,,,FW,,...,1.0,0.0,0.0,False,0.000000,1.618602,2838.312988,3078.240763,2.000000,


In [79]:
X_test = df_upcoming

In [80]:
def adjust_for_injuries(df):
    active_players = pd.read_csv("data/misc/active_players.csv")
    injured_players = active_players[active_players["chance_of_playing_next_round"] == 0]["name"].to_list()
    df.loc[df["Name_original"].isin(injured_players),"Pred"] = 0
    return df

In [81]:
def adjust_first_choice_goalkeepers(df):
    gks = pd.read_csv("data/misc/goalkeepers.csv")
    gk_names = gks["Name"].to_list()
    gk_names = [name_fbref_to_fpl(neutralize_name(n)) for n in gk_names]
    # print(gk_names)
    df.loc[(df["FPL_pos"] == "GK") & ~(df["Name_original"].apply(neutralize_name).isin(gk_names)), "Pred"] = 0
    return df

In [82]:
def get_predictions(model, df, X, all_remaining=False):
    # make predictions on the test data and glues them to the rest of the dataframe
    predictions = model.predict(X)
    df_predictions = df[(df["Season"] == CURRENT_SEASON)].reset_index(drop=True)
        
    df_predictions.loc[:, "Pred"] = predictions
    preds = df_predictions[info + ["Pred"]]
    
    preds = adjust_first_choice_goalkeepers(preds)
    preds = adjust_for_injuries(preds).sort_values(by=["Pred"], ascending = False)
    
    return preds

In [83]:
# df_upcoming[df_upcoming["Name_original"] == "Erling Haaland"][["Name_original", "Was_home", 'Avg_FPL_points_venue']]

In [84]:
preds = get_predictions(model, df_upcoming[df_upcoming["FPL_pos"].isin(allowed_pos)], X_test[X_test["FPL_pos"].isin(allowed_pos)])


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [85]:
# X_test[(X_test["GW"] == NEXT_GAMEWEEK) & (X_test["FPL_pos"] == "GK")][features + ["Name_original"]]

In [86]:
# next gameweek
preds[preds["GW"] == NEXT_GAMEWEEK].head(30)

Unnamed: 0,Name_original,GW,Season,Team,Opponent,Was_home,Team_rating,Opp_rating,Price,FPL_pos,FPL_points,Pred
4785,Erling Haaland,5.0,2023-24,Manchester City,West Ham,0.0,2087.35376,1807.522339,14.1,FWD,,8.099314
6922,James Maddison,5.0,2023-24,Tottenham,Sheffield United,1.0,1846.944458,1630.776367,7.8,MID,,7.278045
11933,Mohamed Salah,5.0,2023-24,Liverpool,Wolves,0.0,1961.337402,1708.177246,12.5,MID,,6.676491
7126,James Ward-Prowse,5.0,2023-24,West Ham,Manchester City,1.0,1807.522339,2087.35376,6.1,MID,,6.405799
3869,Destiny Udogie,5.0,2023-24,Tottenham,Sheffield United,1.0,1846.944458,1630.776367,4.7,DEF,,6.225135
2206,Bruno Borges Fernandes,5.0,2023-24,Manchester Utd,Brighton,1.0,1860.949707,1839.788574,8.4,MID,,5.58988
2307,Bukayo Saka,5.0,2023-24,Arsenal,Everton,0.0,1928.752808,1682.138794,8.7,MID,,5.5801
7262,Jarrod Bowen,5.0,2023-24,West Ham,Manchester City,1.0,1807.522339,2087.35376,7.1,MID,,5.388615
5564,Guglielmo Vicario,5.0,2023-24,Tottenham,Sheffield United,1.0,1846.944458,1630.776367,5.1,GK,,5.358993
12069,Moussa Diaby,5.0,2023-24,Aston Villa,Crystal Palace,1.0,1828.641479,1768.190308,6.6,MID,,5.351518


In [87]:
# preds[(preds["GW"] == NEXT_GAMEWEEK) & (preds["FPL_pos"] == "GK")].head(21)

In [88]:
n_gameweeks = 5
pivot = pd.pivot_table(preds[preds["GW"] < NEXT_GAMEWEEK + n_gameweeks], values='Pred', index=['Name_original', "FPL_pos"],
                       columns=['GW'], aggfunc=np.sum)
pivot['Summary'] = pivot.sum(axis=1)
pivot.columns = ['GW' + str(col).split(".")[0] for col in pivot.columns[:-1]] + ['Summary']
pivot = pivot.sort_values(by=["Summary"], ascending = False)
pivot.head(20)

Unnamed: 0_level_0,Unnamed: 1_level_0,GW5,GW6,GW7,GW8,GW9,Summary
Name_original,FPL_pos,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Erling Haaland,FWD,8.099314,8.789482,8.229331,7.958799,8.591072,41.667997
James Maddison,MID,7.278045,6.623418,6.93348,6.987837,7.153617,34.976397
Mohamed Salah,MID,6.676491,6.90773,6.529482,6.53652,7.045559,33.695782
James Ward-Prowse,MID,6.405799,6.168605,6.848472,6.593639,6.290711,32.307226
Destiny Udogie,DEF,6.225135,5.354165,5.95186,5.643188,6.126451,29.300799
Bukayo Saka,MID,5.5801,5.789597,5.6003,5.637617,5.5049,28.112514
Bruno Borges Fernandes,MID,5.58988,5.40298,5.638134,5.598659,5.459987,27.68964
Jarrod Bowen,MID,5.388615,5.1175,5.683472,5.513732,5.198833,26.902153
Bryan Mbeumo,MID,5.074894,5.516654,5.174636,5.077305,5.503841,26.34733
Guglielmo Vicario,GK,5.358993,5.108756,5.297262,5.174045,5.336701,26.275757


In [89]:
pivot[pivot.index.get_level_values('FPL_pos') == 'GK']

Unnamed: 0_level_0,Unnamed: 1_level_0,GW5,GW6,GW7,GW8,GW9,Summary
Name_original,FPL_pos,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Guglielmo Vicario,GK,5.358993,5.108756,5.297262,5.174045,5.336701,26.275757
Alisson Ramses Becker,GK,4.44135,4.767251,4.416849,4.418022,4.790222,22.833695
José Malheiro de Sá,GK,4.051828,3.906413,4.039237,4.066962,3.896185,19.960625
Ederson Santana de Moraes,GK,3.688374,4.043059,3.703719,3.671791,4.019643,19.126585
Emiliano Martínez Romero,GK,3.589637,3.3952,3.580649,3.404099,3.584611,17.554197
Nick Pope,GK,3.547332,3.382691,3.564854,3.356199,3.555287,17.406363
Aaron Ramsdale,GK,3.357929,3.528228,3.361578,3.500776,3.344346,17.092856
Mark Flekken,GK,3.302781,3.519463,3.325088,3.30332,3.516597,16.967251
Norberto Murara Neto,GK,3.454284,3.254113,3.439625,3.27065,3.461358,16.88003
Bernd Leno,GK,2.944002,2.877783,2.928357,2.940449,2.871902,14.562492


In [90]:
# points all season
pd.DataFrame(preds.groupby(["Name_original"])["Pred"].sum().sort_values(ascending=False)).head(30)

Unnamed: 0_level_0,Pred
Name_original,Unnamed: 1_level_1
Erling Haaland,277.263199
James Maddison,236.155647
Mohamed Salah,230.322216
James Ward-Prowse,221.32418
Destiny Udogie,197.065683
Bukayo Saka,192.519317
Bruno Borges Fernandes,186.678046
Jarrod Bowen,183.75222
Guglielmo Vicario,178.054381
Moussa Diaby,176.265443


In [91]:
preds.to_csv("predictions/preds_next_season.csv", index=False)

# Feature importance and influence

In [92]:
# explainer = shap.Explainer(model.model_outfield.predict, X_test[2000:3000])
# shap_values = explainer(X_test[2000:3000])

In [93]:
# shap.plots.bar(shap_values, max_display=15)

In [94]:
# shap.plots.beeswarm(shap_values, max_display=15)

In [95]:
# # explaining Erling Haaland`s score in gameweek 31
# shap.plots.bar(shap_values[69], max_display=15)

In [96]:
# shap.plots.waterfall(shap_values[410])

In [97]:
# shap.plots.waterfall(shap_values[26])

In [98]:
# shap.plots.waterfall(shap_values[96])