
# Imports

In [None]:
import os
# Raw string: the backslashes are Windows path separators, not escape
# sequences ("\P" and "\F" are invalid escapes in a normal string literal).
os.chdir(r"D:\PulpitE\FPL_ML")

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import shap

from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import SGDRegressor
from xgboost import XGBRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.linear_model import ElasticNet
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from lightgbm import LGBMRegressor

from statistics import mean
from scipy.stats import tmean
from sklearn.metrics import mean_absolute_error, mean_squared_error

from sklearn.preprocessing import LabelEncoder
from xgboost import plot_importance
from sklearn.inspection import plot_partial_dependence

In [None]:
# Read the prepared dataset, then discard the "Unnamed: 0" column.
df_original = pd.read_csv('data/final_dataset.csv')
df_original = df_original.drop(columns="Unnamed: 0")

# Work on a copy so the frame as loaded stays available for inspection.
df = df_original.copy()

In [None]:
# df_original[df_original["Name"] == "Mohamed-Salah"].groupby(["Pos"]).mean()["Total Points"]

In [None]:
# Show the column names of the working frame (notebook cell output).
df.columns

# Preparing dataset

In [None]:
# features = ["Was Home", "Team rating", "Opp rating", 'ScoreForLast5', 'ScoreAgainstLast5', 
#         'MinAvgOverall', 'GlsAvgOverall',
#        'AstAvgOverall', 'CrdYAvgOverall', 'CrdRAvgOverall', 'xGAvgOverall',
#        'xAGAvgOverall', 'CSAvgOverall', 'Team ScoreAvgOverall',
#        'Opp ScoreAvgOverall']

In [None]:
def calculate_xg_points(row):
    """Scale a row's expected goals by a position-dependent factor.

    Args:
        row: Mapping-like row providing "FPL position" and "xG".

    Returns:
        ``row["xG"]`` times 6 for "GK" and "DEF", times 5 for "MID",
        times 4 for "FWD", and times 5 for any other position value.
    """
    factor_by_position = (("GK", 6), ("DEF", 6), ("MID", 5), ("FWD", 4))
    for position, factor in factor_by_position:
        if row["FPL position"] == position:
            return row["xG"] * factor
    # Unrecognised position: fall back to the same factor as "MID".
    return row["xG"] * 5

# Row-wise apply: one calculate_xg_points value per row, stored as "xGPoints".
df["xGPoints"] = df.apply(calculate_xg_points, axis=1)

In [None]:
def calculate_cs_points(row):
    """Scale a row's clean-sheet value by a position-dependent factor.

    The factor is applied to ``row["CS"]``. The previous implementation
    multiplied ``row["xG"]`` instead, which made this column a rescaled
    copy of the expected-goals feature rather than a clean-sheet feature.

    Args:
        row: Mapping-like row providing "FPL position" and "CS".

    Returns:
        ``row["CS"]`` times 4 for "GK" and "DEF", times 1 for "MID",
        times 0 for "FWD", and times 1 for any other position value.
    """
    factor_by_position = {"GK": 4, "DEF": 4, "MID": 1, "FWD": 0}
    # Unrecognised position: fall back to the same factor as "MID".
    factor = factor_by_position.get(row["FPL position"], 1)
    return row["CS"] * factor

# Row-wise apply: one calculate_cs_points value per row, stored as "CSPoints".
df["CSPoints"] = df.apply(calculate_cs_points, axis=1)

In [None]:
# Target column.
to_predict = ["Total Points"]

# Columns used as features directly, without rolling.
# Currently switched off: 'ScoreForLast5', 'ScoreAgainstLast5'.
features1 = ["Was Home", "Team rating", "Opp rating"]

# Per-match columns handed to add_rolling_features.
# Currently switched off: 'CS', "xG", "Cmp%", "PrgP", "PrgC", "Succ", "PKwon".
features_to_roll = [
    "Min",
    "Start",
    "Gls",
    "Ast",
    "Sh",
    "SoT",
    "CrdY",
    "CrdR",
    "xAG",
    "Team Score",
    "Opp Score",
    "xGPoints",
    "CSPoints",
    "Total Points",
]

# Identifying / context columns kept alongside the features.
info = [
    "Name",
    "GW",
    "Season",
    "Squad",
    "Opponent",
    "Was Home",
    "Team rating",
    "Opp rating",
    "FPL position",
]

In [None]:
def add_rolling_features(df, features1, features_to_roll):
    """Add per-player rolling-mean "form" columns that only use past rows.

    For every window size in ``[2, 5]`` and every numeric column in
    ``features_to_roll``, a column named ``"<col>_<window>"`` is appended.
    Its value on a given row is the mean of that player's preceding rows
    (up to ``window`` of them), so the row's own match is never included
    and a player's first row is NaN.

    Compared with the previous implementation this version:
      * shifts inside each "Name" group, so a player's first row no longer
        receives the last value of the player listed before them;
      * keeps the original row labels, so the new columns are aligned to
        the rows they were computed from instead of by position;
      * no longer emits the helper columns ``Name_<window>`` /
        ``level_1_<window>`` (the latter used to end up in the feature list);
      * does not extend the caller's ``features1`` list in place.

    Args:
        df: Frame with a "Name" column and the columns in
            ``features_to_roll``. Its index must be unique, because the
            new columns are joined back on the index.
        features1: Feature names to start the returned list with.
        features_to_roll: Columns to average. Non-numeric columns are
            skipped (rolling means are undefined for them).

    Returns:
        Tuple ``(df, features)``: the frame with the new columns appended
        and a new list holding ``features1`` plus the new column names.
    """
    rolling_gameweeks = [2, 5]
    # Copy so the caller's list is left untouched.
    features = list(features1)

    # Rolling means need numeric input; anything else is left out up front.
    numeric_cols = (
        df[features_to_roll]
        .select_dtypes(include=["number", "bool"])
        .columns.tolist()
    )

    for r in rolling_gameweeks:
        form_means = (
            df.groupby("Name")[numeric_cols]
            .rolling(r, min_periods=1)
            .mean()
            # Index is now (Name, original row label): shift within each
            # player so only that player's earlier rows are used.
            .groupby(level=0)
            .shift(1)
            # Drop the Name level to get back to df's own row labels.
            .droplevel(0)
        )
        form_means.columns = [f"{col}_{r}" for col in form_means.columns]
        features += form_means.columns.tolist()
        df = pd.concat([df, form_means], axis=1)

    return df, features

In [None]:
def ohe(df, ohe_columns, features):
    """One-hot encode the given columns and register the indicator columns.

    For each column in ``ohe_columns`` one int64 indicator column per
    distinct value is appended to ``df``; the indicator columns are named
    after the values themselves.

    The source column is kept in the frame: later cells of this notebook
    still select and filter on "FPL position" (it is listed in ``info``),
    which fails with a KeyError if the column is removed here. It is not
    added to the feature list, so the model input is unaffected.

    Args:
        df: Input frame.
        ohe_columns: Names of the columns to encode.
        features: Feature names collected so far; not modified.

    Returns:
        Tuple ``(df, features)``: the frame with indicator columns appended
        and a new list holding ``features`` plus the indicator column names.
    """
    # Copy so the caller's list is not extended in place.
    features = list(features)

    for c in ohe_columns:
        indicators = pd.get_dummies(df[c], dtype="int64")
        df = pd.concat([df, indicators], axis=1)
        features.extend(indicators.columns.tolist())

    return df, features

In [None]:
def label_encoding(df, column_to_encode):
    """Overwrite one column with ordinal codes for its 'Y' / 'Y*' / 'N' values.

    'Y' is mapped to 2, 'Y*' to 1 and 'N' to 0. Values without an entry in
    the mapping come out of ``Series.map`` as NaN.

    Args:
        df: Frame to modify; the column is replaced on this very object.
        column_to_encode: Name of the column to encode.

    Returns:
        The same frame object that was passed in.
    """
    codes = {'N': 0, 'Y*': 1, 'Y': 2}
    encoded = df[column_to_encode].map(codes)
    df[column_to_encode] = encoded
    return df

In [None]:
# Append the rolling-mean columns to df and start the model feature list.
df, features = add_rolling_features(df, features1, features_to_roll)

In [None]:
# One-hot encode the position column and add the indicator columns to the features.
df, features = ohe(df, ["FPL position"], features)

In [None]:
# Map the 'Y' / 'Y*' / 'N' values of "Start" to 2 / 1 / 0.
# NOTE(review): this runs after add_rolling_features was called above, so
# "Start" had not yet been mapped to numbers when the rolling means were
# computed — confirm the intended order.
df = label_encoding(df, "Start")

In [None]:
# Keep only feature, info and target columns; np.unique also sorts the
# names and removes duplicates (several names appear in more than one list).
df = df[np.unique(features + info + to_predict)]

In [None]:
# Keep only rows whose gameweek parses as a number, then store it as an
# unsigned integer. The .copy() makes the filtered frame independent, so the
# assignment below does not write into a slice of the previous frame
# (which triggers pandas' SettingWithCopyWarning).
df = df[pd.to_numeric(df["GW"], errors="coerce").notna()].copy()
df["GW"] = df["GW"].astype("uint64")

In [None]:
# Drop every row that has a NaN in any remaining column.
df = df.dropna(axis=0)

In [None]:
# Index by (Name, Season, GW); drop=False keeps the three as regular columns too.
df = df.set_index(["Name", "Season", "GW"], drop=False)

In [None]:
# Show the columns that remain after preparation (notebook cell output).
df.columns

# Choosing features and splitting data

In [None]:
# Gameweek and season used as the held-out test set in the cells below.
GAMEWEEK_TO_PREDICT = 31
SEASON_TO_PREDICT = "2022-23"

In [None]:
# Training data: every gameweek of the seasons before SEASON_TO_PREDICT, plus
# the gameweeks of SEASON_TO_PREDICT that come before GAMEWEEK_TO_PREDICT.
# (The previous condition applied the gameweek cut-off to earlier seasons as
# well, which silently discarded their later gameweeks.)
train_mask = (df["GW"] >= 1) & (
    (df["Season"] < SEASON_TO_PREDICT)
    | ((df["Season"] == SEASON_TO_PREDICT) & (df["GW"] <= GAMEWEEK_TO_PREDICT - 1))
)
X_train = df[train_mask][features].reset_index(drop=True)
y_train = df[train_mask][to_predict].reset_index(drop=True)

In [None]:
# Held-out set: the rows of GAMEWEEK_TO_PREDICT in SEASON_TO_PREDICT.
X_test = df.loc[
    (df["GW"] == GAMEWEEK_TO_PREDICT) & (df["Season"] == SEASON_TO_PREDICT),
    features,
].reset_index(drop=True)
y_test = df.loc[
    (df["GW"] == GAMEWEEK_TO_PREDICT) & (df["Season"] == SEASON_TO_PREDICT),
    to_predict,
].reset_index(drop=True)

In [None]:
# Show (rows, columns) of the training and test feature frames.
X_train.shape, X_test.shape

# Model

In [None]:
class HybridModel:
    """Weighted-average ensemble over several regressors.

    Every member only needs ``fit(X, y)`` and ``predict(X)``. All members
    are trained on the same data and their predictions are averaged.
    """

    def __init__(self, models, names, weights):
        """Store the ensemble members.

        Args:
            models: Estimator objects, one per ensemble member.
            names: Display names, parallel to ``models`` (used by ``evaluate``).
            weights: Averaging weights, parallel to ``models``. ``None``
                means a plain unweighted mean.
        """
        self.models = models
        self.model_names = names
        self.weights = weights

    def fit(self, X, y):
        """Fit every member on the same ``X`` and ``y``."""
        for model in self.models:
            model.fit(X, y)

    def predict(self, X):
        """Return the weighted mean of the members' predictions.

        Previously ``weights`` was stored but ignored; it is now applied.
        Equal weights give the same result as the former plain mean.

        Returns:
            A list with one averaged prediction per row of ``X``.
        """
        # Shape (n_models, n_rows); averaging over axis 0 combines the members.
        model_preds = np.asarray([model.predict(X) for model in self.models])
        return list(np.average(model_preds, axis=0, weights=self.weights))

    def evaluate(self, X, y):
        """Print the RMSLE of every member and of the ensemble against ``y``.

        Member predictions are compared with ``y`` exactly as the ensemble
        prediction is (no ``expm1`` back-transform), so all printed numbers
        are on the same scale.

        Note: ``mean_squared_log_error`` rejects negative targets or
        predictions with a ValueError.
        """
        # Imported here because the notebook's import cell does not bring
        # this metric into scope.
        from sklearn.metrics import mean_squared_log_error

        member_preds = [model.predict(X) for model in self.models]
        ensemble_preds = self.predict(X)

        rmsle_members = [
            mean_squared_log_error(y, preds) ** 0.5 for preds in member_preds
        ]
        rmsle_ensemble = mean_squared_log_error(y, ensemble_preds) ** 0.5

        for name, rmsle in zip(self.model_names, rmsle_members):
            print(f"RMSLE on {name} model: {rmsle}")
        print(f"RMSLE on ensemble model: {rmsle_ensemble}")

In [None]:
# The three ensemble members, each created with the same fixed random_state.
lgbm = LGBMRegressor(random_state=42)
xgb = XGBRegressor(random_state=42)
gbr = GradientBoostingRegressor(random_state=42)

# hybrid arguments: parallel lists with one entry per member
models = [lgbm, xgb, gbr]
names = ["LGBM", "XGB", "GBR"]
weights = [1, 1, 1]

model = HybridModel(models, names, weights)

In [None]:
# model = GradientBoostingRegressor(random_state=42)
# model = LGBMRegressor(random_state=42)
# model = XGBRegressor(random_state=42)
# model = RandomForestRegressor()

In [None]:
%%time
# Train the model; the single-column target frame is flattened to a 1-D array.
model.fit(X_train, np.array(y_train).ravel())

# Evaluation

In [None]:
# Predict the target for every row of the held-out test features.
predictions = model.predict(X_test)

In [None]:
def pairwise_accuracy(predicted_scores, true_scores):
    """Return the share of item pairs whose predicted order matches the true order.

    A pair ``(i, j)`` counts as correct only when both score lists order it
    strictly the same way; a tie in either list counts as incorrect.

    Args:
        predicted_scores: Sequence or array of predicted scores. Column
            vectors of shape ``(n, 1)`` are flattened.
        true_scores: Sequence or array of true scores, same length.

    Returns:
        Fraction of correctly ordered pairs as a float in ``[0, 1]``, or
        NaN when there are fewer than two items (no pair exists).

    Raises:
        ValueError: If the two inputs do not hold the same number of scores.
    """
    if len(predicted_scores) != len(true_scores):
        raise ValueError("The length of predicted_scores and true_scores must be the same.")

    predicted = np.asarray(predicted_scores).ravel()
    actual = np.asarray(true_scores).ravel()
    if predicted.size != actual.size:
        raise ValueError("The length of predicted_scores and true_scores must be the same.")

    n_items = predicted.size
    if n_items < 2:
        # No pairs to compare; the ratio is undefined.
        return float("nan")

    # All index pairs with first < second. These arrays hold n*(n-1)/2 entries.
    first, second = np.triu_indices(n_items, k=1)

    both_greater = (predicted[first] > predicted[second]) & (actual[first] > actual[second])
    both_smaller = (predicted[first] < predicted[second]) & (actual[first] < actual[second])
    correct = both_greater | both_smaller

    return float(correct.sum() / correct.size)

In [None]:
# Rows of the predicted gameweek, selected with the same filter as X_test,
# with the model output attached as a "pred" column.
df_predictions = df[(df["GW"] == GAMEWEEK_TO_PREDICT) & (df["Season"] == SEASON_TO_PREDICT)].reset_index(drop=True)
df_predictions.loc[:, "pred"] = predictions

In [None]:
def get_top_performer_names(df, no_top, no_gws):
    """Return the names of the players with the best recent average score.

    For every player the last ``no_gws`` rows (in the frame's row order)
    are averaged on "Total Points"; the ``no_top`` highest averages win.

    The previous implementation averaged over all rows first and applied
    ``tail(no_gws)`` afterwards, which left ``no_gws`` without any effect.

    Args:
        df: Frame with "Name" and "Total Points" columns.
            NOTE(review): assumes each player's rows are in chronological
            order — confirm against the dataset.
        no_top: Number of player names to return.
        no_gws: Number of most recent rows per player to average.

    Returns:
        List of player names, best average first.
    """
    # Discard the index first: in this notebook "Name" is both an index level
    # and a column, which makes grouping by "Name" ambiguous otherwise.
    recent_rows = df.reset_index(drop=True).groupby("Name").tail(no_gws)
    mean_points = recent_rows.groupby("Name")["Total Points"].mean()
    return mean_points.sort_values(ascending=False).head(no_top).index.to_list()

In [None]:
def pairwise_accuracy_topX(model, df, top_x):
    """Pairwise accuracy of ``model`` on the target gameweek, top performers only.

    The rows of GAMEWEEK_TO_PREDICT / SEASON_TO_PREDICT are restricted to the
    players returned by ``get_top_performer_names(df, top_x, 50)``; the model
    predicts on their feature columns and the predictions are scored against
    the target column with ``pairwise_accuracy``.

    Args:
        model: Object with a ``predict(X)`` method.
        df: Prepared frame holding the feature, target, "GW", "Season" and
            "Name" columns.
        top_x: Number of top performers to keep.

    Returns:
        The value returned by ``pairwise_accuracy``.
    """
    in_target_gw = (df["GW"] == GAMEWEEK_TO_PREDICT) & (df["Season"] == SEASON_TO_PREDICT)
    top_names = get_top_performer_names(df, top_x, 50)
    selected = df[in_target_gw & (df["Name"].isin(top_names))]

    X_top = selected[features].reset_index(drop=True)
    y_top = np.array(selected[to_predict].reset_index(drop=True))

    return pairwise_accuracy(model.predict(X_top), y_top)

In [None]:
# df[(df["GW"] == GAMEWEEK_TO_PREDICT) & (df["Season"] == SEASON_TO_PREDICT) & (df["Name"].isin(get_top_performer_names(df, 20, 50)))].columns

In [None]:
# Point-prediction errors on the held-out gameweek.
mae = mean_absolute_error(y_test, predictions)
mse = mean_squared_error(y_test, predictions)

# Ranking quality over all test rows, then restricted to the top 20 / top 100
# performers. Note: the true scores are passed first in the call below,
# although pairwise_accuracy's signature lists the predicted scores first.
pairwise_acc = pairwise_accuracy(np.array(y_test), predictions)
pairwise_accuracy_top20 = pairwise_accuracy_topX(model, df, 20)
pairwise_accuracy_top100 = pairwise_accuracy_topX(model, df, 100)

In [None]:
# Report the metrics computed in the previous cell.
print("MAE:", mae)
print("MSE:", mse)
print("Pairwise accuracy:", pairwise_acc)
print("Pairwise accuracy @TOP100:", pairwise_accuracy_top100)
print("Pairwise accuracy @TOP20:", pairwise_accuracy_top20)

In [None]:
# df[df["Name"] == "Abdoulaye-Doucoure"][info]

In [None]:
# model.predict(df[df["GW"] == GAMEWEEK_TO_PREDICT][df["Name"] == "Mohamed-Salah"][features])

In [None]:
# Info columns, true target and prediction, ordered by prediction (highest first).
preds = df_predictions[info + to_predict + ["pred"]].sort_values(by=["pred"], ascending = False)

In [None]:
# The 30 rows with the highest predictions.
preds.head(30)

In [None]:
# Highest predictions among rows whose "Squad" is Manchester City.
preds[preds["Squad"] == "Manchester City"].head(20)

In [None]:
# Highest predictions among rows whose "FPL position" is FWD.
preds[preds["FPL position"] == "FWD"].head(10)

# Feature importance and influence

In [None]:
# plot_importance(model)

In [None]:
# fig, ax = plt.subplots(figsize=(16, 8))
# plot_partial_dependence(model, X_train, features, ax=ax)

In [None]:
# Build a SHAP explainer around the model's predict function and compute
# SHAP values for the test rows.
# NOTE(review): X_test is passed as shap.Explainer's second argument —
# presumably used as the background/masker data; confirm against the shap docs.
explainer = shap.Explainer(model.predict, X_test)
shap_values = explainer(X_test)

In [None]:
# Bar plot over the SHAP values of all test rows.
shap.plots.bar(shap_values)

In [None]:
# Beeswarm plot over the SHAP values of all test rows.
shap.plots.beeswarm(shap_values)

In [None]:
# Bar plot for a single test row, limited to 15 displayed features.
# NOTE(review): 218 is a hard-coded position in X_test; which player it refers
# to (and whether it exists) depends on the current test-set filtering — confirm.
shap.plots.bar(shap_values[218], max_display=15)

In [None]:
# get_top_performer_names(df, 20, 50)

In [None]:
# df[(df["GW"] == GAMEWEEK_TO_PREDICT) & (df["Season"] == SEASON_TO_PREDICT) & (df["Name"].isin(get_top_performer_names(df, 50, 50)))]