In [1]:
import pandas as pd
import scipy
import glob
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import scipy
import os
from sklearn.metrics import r2_score
from scipy.stats import binom, poisson, nbinom, expon, gamma
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
import warnings
import statsmodels.api as sm
from sklearn import linear_model
import sys
from catboost import CatBoostRegressor
pd.set_option('display.max_colwidth', None)

sys.path.insert(0,'C:/MyDevelopment/Goalscorers')
from helper_functions import *
import data_cleaning as dc
import feature_engineering as fe

In [2]:
# Goal-expectancy table; it is handed to dc.add_goal_expectancies through get_pre_processed_data.
# NOTE(review): absolute, machine-specific path — the notebook cannot run elsewhere without editing it.
goal_exps = pd.read_csv("C:/MyDevelopment/Goalscorers/goal_expectancies/fbref_matched_expectancies.csv")

In [3]:
def get_pre_processed_data(seasons_to_load, leagues_to_load, data_exp):
    """Load the raw data and push it through the data_cleaning (dc) pipeline.

    Args:
        seasons_to_load: forwarded unchanged to dc.load_data.
        leagues_to_load: forwarded unchanged to dc.load_data.
        data_exp: goal-expectancy table forwarded to dc.add_goal_expectancies.

    Returns:
        The result of the final pipeline step (dc.drop_NAs).
    """
    # Single-argument steps that run before the goal expectancies are attached.
    steps_before_expectancies = (
        dc.add_datetime,
        dc.add_opposite_team,
        dc.map_position,
        dc.add_npg,
        dc.add_year_week,
    )
    # Single-argument steps that run after the goal expectancies are attached.
    steps_after_expectancies = (
        dc.add_supremacy,
        dc.add_npxg_per_minute,
        dc.add_team_scored_and_conceded_npxg,
        dc.add_solo_striker_position,
        dc.add_main_opposing_gk,
        dc.remove_gk,
        dc.drop_NAs,
    )

    frame = dc.load_data(seasons_to_load, leagues_to_load)
    for step in steps_before_expectancies:
        frame = step(frame)

    frame = dc.add_goal_expectancies(frame, data_exp)
    for step in steps_after_expectancies:
        frame = step(frame)

    return frame

In [4]:
# Build the cleaned data set. None is passed for both the season and the league selector
# (presumably meaning "load everything" — confirm in dc.load_data).
preprocessed_data = get_pre_processed_data(None, None, goal_exps)

In [5]:
## feature engineering
# Start from a deep copy so re-running this cell never mutates preprocessed_data.
data = preprocessed_data.copy(deep=True)

# Player averages of npxg per minute (windows of 5 and 10).
data = fe.add_player_avg_feature(data, "npxg_per_min", 5)
data = fe.add_player_avg_feature(data, "npxg_per_min", 10)

# Player npxg per minute relative to the 5-window team average of conceded npxg,
# then averaged per player over windows of 5 and 10.
# NOTE(review): the column name says "over", but the value is a difference, not a ratio — confirm intent.
# team_data is returned by the helper but is not used anywhere in this notebook.
data, team_data = fe.add_team_avg_feature(data, "team_conceded_npxg", 5)
data["npxg_per_min_over_squad_opp_avg"] = data.npxg_per_min - data.avg_team_conceded_npxg_l5
data = fe.add_player_avg_feature(data, "npxg_per_min_over_squad_opp_avg", 5)
data = fe.add_player_avg_feature(data, "npxg_per_min_over_squad_opp_avg", 10)

# Per-minute touches in the attacking third / penalty area, averaged over a window of 5.
# NOTE(review): rows with minutes == 0 would produce inf here and -inf in ln_frac_90 below —
# confirm they are removed upstream.
data["touches_att_3rd_per_min"] = data.touches_att_3rd / data.minutes
data["touches_att_pen_area_per_min"] = data.touches_att_pen_area / data.minutes

data = fe.add_player_avg_feature(data, "touches_att_3rd_per_min", 5)
data = fe.add_player_avg_feature(data, "touches_att_pen_area_per_min", 5)

# Map exact-zero npxg to a small positive value (vectorised; NaN stays NaN).
# This only touches the npxg column; the averages above are built from npxg_per_min.
data["npxg"] = np.where(data.npxg == 0, 0.0001, data.npxg)

# Encode `start` as 0/1 BEFORE it is used in the interaction terms below, so those
# terms do not depend on whatever dtype the raw column happened to have.
data["start"] = np.where(data.start == True, 1, 0)

# Playing-time features: fraction of a 90-minute match, its log, and the log split by starter status.
data["frac_90"] = data.minutes / 90
data["ln_frac_90"] = np.log(data.frac_90)
data["ln_frac_90_start"] = data.ln_frac_90 * data.start
data["ln_frac_90_not_start"] = data.ln_frac_90 * (1.0 - data.start)

# Quadratic terms and the home indicator.
data["goal_exp_2"] = data.goal_exp ** 2
data["supremacy_2"] = data.supremacy ** 2
data["is_home"] = np.where(data.squad == data.home_team, 1, 0)

In [6]:
# Sanity check: feature engineering should only add columns, so the row counts must match.
print(preprocessed_data.shape)
print(data.shape)

(442937, 72)
(442937, 89)


In [None]:
def get_week_difference(data, current_year, current_week):
    """Return, for every row, the number of weeks between the row and a reference week.

    The difference is ``(current_year - year) * 52 + (current_week - week)``, i.e.
    every year is counted as exactly 52 weeks.

    Args:
        data: DataFrame with numeric ``year`` and ``week`` columns.
        current_year: reference year.
        current_week: reference week number.

    Returns:
        pandas Series aligned with ``data.index`` holding the week differences
        (an empty Series for an empty frame).

    Raises:
        AssertionError: if any row is later than the reference week (negative
            difference) or has a missing year/week.
    """
    # Vectorised column arithmetic instead of a Python-level call per row.
    diff = (current_year - data["year"]) * 52 + (current_week - data["week"])
    # Same guard as the former per-row assert, evaluated once for the whole column.
    assert (diff >= 0).all(), "found rows dated after the reference year/week"
    return diff

def get_weights(data, year, week, decay_factor):
    """Return training weights that decay exponentially with a row's age in weeks.

    Args:
        data: rows to weight; when ``decay_factor`` is given it must be accepted
            by ``get_week_difference`` (``year`` and ``week`` columns).
        year: reference year the ages are measured from.
        week: reference week the ages are measured from.
        decay_factor: decay rate per week, or None for uniform weights.

    Returns:
        ``exp(-decay_factor * weeks_old)`` per row when ``decay_factor`` is given,
        otherwise a numpy array of ones with ``len(data)`` entries.
    """
    # Identity check: None is a singleton, and `!=` would defer to the operand's own __ne__.
    if decay_factor is None:
        return np.full(len(data), 1.0)

    week_diff = get_week_difference(data, year, week)
    return np.exp(-decay_factor * week_diff)

def standardize(X, cols_to_standardize, scaler=None):
    """Scale the selected columns of ``X`` and return the frame with the scaler used.

    Args:
        X: DataFrame holding the columns to scale. It is modified IN PLACE, so
            pass a copy if the original must stay untouched.
        cols_to_standardize: column names to scale.
        scaler: an already fitted scaler exposing ``transform``; when None a new
            StandardScaler is fitted on ``X[cols_to_standardize]``.

    Returns:
        Tuple ``(X, scaler)``: the same frame object with the scaled columns, and
        the scaler that was applied (for reuse on other data).
    """
    # Identity check so a scaler with a custom __eq__ can never be mistaken for "no scaler".
    if scaler is None:
        scaler = StandardScaler()
        scaler.fit(X[cols_to_standardize])

    X[cols_to_standardize] = scaler.transform(X[cols_to_standardize])
    return X, scaler

def get_gamma_parameters(y_train, train_preds, test_preds, n_features):
    """Turn predicted means into per-observation Gamma ``(scale, shape)`` parameters.

    One common shape is estimated from the training residuals; each test mean is
    then divided by that shape to obtain its scale.

    Args:
        y_train: observed training targets.
        train_preds: predicted means for the training rows.
        test_preds: predicted means for the rows that need parameters.
        n_features: degrees of freedom subtracted from the training sample size.

    Returns:
        Tuple ``(scales, shapes)``: a list with one scale per entry of
        ``test_preds`` and a numpy array of the same length filled with the
        common shape.
    """
    # Dispersion estimate: sum of squared residuals over mu^2, divided by the residual dof.
    squared_errors = np.power(y_train - train_preds, 2)
    variance_fn = train_preds ** 2
    residual_dof = len(y_train) - n_features
    inv_shape_param = np.sum(squared_errors / variance_fn) / residual_dof

    # The shape is the reciprocal of the estimated dispersion.
    shape_from_model = 1 / inv_shape_param

    scales = []
    for mean_i in test_preds:
        scales.append(mean_i / shape_from_model)
    shapes = np.full(len(scales), shape_from_model)

    return scales, shapes

def calculate_log_likelihood(y_true, parameters, distribution='Poisson',individual_scores=False):
    """Score observations under a Poisson, Exponential or Gamma model.

    Args:
        y_true: observed values.
        parameters: list of per-observation parameter arrays.
            'Poisson':     ``[expectancies]`` (used as the Poisson mean).
            'Exponential': ``[expectancies]`` (used as ``scale = 1 / expectancies``).
            'Gamma':       ``[scales, shapes]``.
        distribution: one of 'Poisson', 'Exponential', 'Gamma'.
        individual_scores: when true, return the per-observation log-likelihoods
            instead of the totals.

    Returns:
        ``(log_ll, avg_log_ll)`` — the summed and the mean log-likelihood — or the
        array of per-observation log-likelihoods when ``individual_scores`` is true.

    Raises:
        AssertionError: if the parameter list has the wrong length for the
            distribution, or its first entry does not match ``len(y_true)``.
        ValueError: if ``distribution`` is not one of the supported names.
    """
    assert len(y_true) == len(parameters[0])
    if distribution in ('Poisson', 'Exponential'):
        assert len(parameters) == 1
    elif distribution == 'Gamma':
        assert len(parameters) == 2

    # The log-densities are evaluated directly (logpmf/logpdf) rather than as
    # log(pmf/pdf): the probability of an unlikely observation underflows to 0.0
    # and its log would then be -inf even though the true value is finite.
    if distribution == 'Poisson':
        expectancies = parameters[0]
        ind_log_ll = poisson.logpmf(y_true, expectancies)

    elif distribution == 'Exponential':
        expectancies = parameters[0]
        # Floor at -10000 so zero-density points cannot drive the total to -inf.
        ind_log_ll = np.maximum(-10000, expon.logpdf(y_true, scale=1 / expectancies))

    elif distribution == 'Gamma':
        scales = parameters[0]
        shapes = parameters[1]
        # Same floor as for the Exponential case.
        ind_log_ll = np.maximum(-10000, gamma.logpdf(y_true, a=shapes, scale=scales))

    else:
        raise ValueError('Invalid distribution argument passed.')

    if individual_scores:
        return ind_log_ll

    log_ll = np.sum(ind_log_ll)
    avg_log_ll = log_ll / len(ind_log_ll)
    return log_ll, avg_log_ll

In [None]:
# List the columns available after feature engineering.
data.columns

In [None]:
def ohe(data, categorical_columns) -> pd.DataFrame:
    """Return a copy of ``data`` with one-hot indicator columns appended.

    The categorical columns are cast to ``str`` before encoding and are kept in
    the result; the indicators are integer 0/1 columns named ``<column>_<value>``.
    The input frame is not modified.
    """
    as_text = data[categorical_columns].astype(str)
    indicator_cols = pd.get_dummies(as_text, dtype=int)
    return data.copy(deep=True).join(indicator_cols)

def get_features(data, cols_to_ohe):
    """Build the feature-name list: fixed numeric features plus one-hot column names.

    For every column in ``cols_to_ohe`` one name ``<column>_<value>`` is appended
    per distinct value in ``data[column]``, in order of first appearance.
    """
    features = ['goal_exp', 'supremacy', 'frac_90']

    for col in cols_to_ohe:
        features.extend(f"{col}_{val}" for val in data[col].unique())

    return features

# Target for the (currently disabled) GLM path. While this stays commented out, the
# cells that pass `target_variable` to constant_train_predict raise NameError.
#target_variable = 'npxg'
# Target for the CatBoost model.
target_variable_cat = 'npg'

# Numeric features passed through standardize() before fitting.
cols_to_standardize = [
    'goal_exp',
    'supremacy',
    'ln_frac_90_start',
    'ln_frac_90_not_start',
    'avg_npxg_per_min_l5',
    'avg_npxg_per_min_l10',
    'avg_team_conceded_npxg_l5',
    'avg_npxg_per_min_over_squad_opp_avg_l5',
    'avg_npxg_per_min_over_squad_opp_avg_l10',
    'avg_touches_att_3rd_per_min_l5',
    'avg_touches_att_pen_area_per_min_l5',
]

# Categorical columns: one-hot encoded on the GLM path, passed to CatBoost as cat_features.
cols_to_ohe = [
    'position',
    'squad_opp',
    'player',
    'gk_opp'
]

# 0/1 indicator columns used as they are.
cols_other = [
    'start',
    'is_home'
]

# One-hot encoded frame for the GLM path (disabled).
#data_model = ohe(data, cols_to_ohe)
data_catboost = data.copy()

# GLM feature list (disabled); `model_features` is therefore undefined for the GLM cells.
#model_features = get_features(data_model, cols_to_ohe)
# CatBoost consumes the raw categorical columns directly, so no one-hot names are needed.
catboost_features = cols_other + cols_to_standardize + cols_to_ohe

In [None]:
# Preview the first three rows of the CatBoost model inputs.
data_catboost[catboost_features].head(3)

In [None]:
# GLM split, disabled together with the one-hot encoded frame `data_model`:
# train_data = data_model[(data.season == '2017-2018') | (data.season == '2018-2019') | (data.season == '2019-2020')]
# test_data = data_model[(data.season == '2020-2021')]

# Seasons used for fitting and for the hold-out evaluation.
train_seasons = ['2017-2018', '2018-2019', '2019-2020']
test_seasons = ['2020-2021']

# Build the masks from the frame being filtered (not from `data`), so the split
# stays correct even if data_catboost is ever re-indexed or filtered independently.
train_data_catboost = data_catboost[data_catboost.season.isin(train_seasons)]
test_data_catboost = data_catboost[data_catboost.season.isin(test_seasons)]

In [None]:
def constant_train_predict(train_data, test_data, reg_parameter, features, target_variable, cols_to_standardize):
    """Fit a regularised Gamma GLM with an explicit constant column and predict train and test.

    Args:
        train_data: DataFrame used for fitting.
        test_data: DataFrame to predict on.
        reg_parameter: ``alpha`` passed to ``GammaRegressor``.
        features: model input columns (must include ``cols_to_standardize``).
        target_variable: name of the target column.
        cols_to_standardize: columns scaled with a scaler fitted on the training rows only.

    Returns:
        Tuple ``(train_pred_df, test_pred_df)``: the identifying columns plus the
        observed ``npxg`` / ``npg`` of each split (index reset), with the model
        output added as ``npxg_pred``.
    """
    id_cols = ["date", "position", "player_id", "player", "npxg", "npg"]

    # Use the `features` argument (previously an undefined global was read instead).
    X_train = train_data[features].copy(deep=True)
    X_test = test_data[features].copy(deep=True)
    y_train = train_data[target_variable]

    # Fit the scaler on train only and reuse it for test.
    X_train, scaler = standardize(X_train, cols_to_standardize=cols_to_standardize)
    X_test, _ = standardize(X_test, cols_to_standardize=cols_to_standardize, scaler=scaler)

    # has_constant="add" always adds the column; the default ("skip") silently omits it
    # when a split already contains a constant column, leaving train and test with
    # different shapes.
    X_train = sm.add_constant(X_train, has_constant="add")
    X_test = sm.add_constant(X_test, has_constant="add")

    # The intercept is the explicit `const` column, hence fit_intercept=False.
    # NOTE(review): this presumably makes the constant subject to the alpha penalty too — confirm intended.
    model = linear_model.GammaRegressor(fit_intercept=False, alpha=reg_parameter, max_iter=20_000)
    model.fit(X_train, y_train)

    train_pred_df = train_data[id_cols].reset_index(drop=True).copy(deep=True)
    train_pred_df["npxg_pred"] = model.predict(X_train)

    test_pred_df = test_data[id_cols].reset_index(drop=True).copy(deep=True)
    test_pred_df["npxg_pred"] = model.predict(X_test)

    return train_pred_df, test_pred_df

def catboost_train_predict(train_data, test_data, features, target_variable, cols_to_standardize, distribution="Poisson", cat_features=None):
    """Fit a CatBoost regressor on ``train_data`` and predict both splits.

    Args:
        train_data: DataFrame used for fitting.
        test_data: DataFrame to predict on; also passed as ``eval_set``.
        features: model input columns.
        target_variable: name of the target column.
        cols_to_standardize: columns scaled with a scaler fitted on the training rows only.
        distribution: "Poisson" selects the Poisson loss; any other value selects
            ``Tweedie:variance_power=1.99``.
        cat_features: categorical columns handed to CatBoost; defaults to the
            notebook-level ``cols_to_ohe`` list.

    Returns:
        Tuple ``(train_pred_df, test_pred_df, model)``: the identifying columns plus
        the observed ``npxg`` / ``npg`` of each split (index reset) with the
        model output added as ``npg_pred``, and the fitted model.
    """
    if cat_features is None:
        cat_features = cols_to_ohe

    id_cols = ["date", "position", "player_id", "player", "npxg", "npg"]

    # Read everything from the arguments (previously the training rows and both output
    # frames came from notebook globals, so the function ignored what it was given).
    X_train = train_data[features].copy(deep=True)
    X_test = test_data[features].copy(deep=True)
    y_train = train_data[target_variable]
    y_test = test_data[target_variable]

    # Fit the scaler on train only and reuse it for test.
    X_train, scaler = standardize(X_train, cols_to_standardize=cols_to_standardize)
    X_test, _ = standardize(X_test, cols_to_standardize=cols_to_standardize, scaler=scaler)

    # Only the loss differs between the two variants, so build the model once.
    if distribution == "Poisson":
        loss_function = "Poisson"
    else:
        loss_function = 'Tweedie:variance_power=1.99'

    model = CatBoostRegressor(task_type="GPU",
                              devices='0:1',
                              early_stopping_rounds=50,
                              loss_function=loss_function)

    # NOTE(review): the test split doubles as the early-stopping eval_set, so test
    # metrics are optimistic — consider a separate validation split.
    model.fit(X_train, y=y_train, eval_set=(X_test, y_test), cat_features=cat_features, verbose=100)

    train_pred_df = train_data[id_cols].reset_index(drop=True).copy(deep=True)
    train_pred_df["npg_pred"] = model.predict(X_train)

    test_pred_df = test_data[id_cols].reset_index(drop=True).copy(deep=True)
    test_pred_df["npg_pred"] = model.predict(X_test)

    return train_pred_df, test_pred_df, model

In [None]:
#tune hyperparameters
# Grid search over the GLM regularisation strength.
# NOTE: train_data, test_data, model_features and target_variable belong to the GLM
# path, whose definitions are commented out earlier in the notebook; they must be
# restored before this cell can run.
reg_grid = [0.00001, 0.0001,0.0005, 0.001,0.005, 0.01,0.05,0.1]
grid_rows = []

for reg_param in reg_grid:
    print(f"Tryin reg = {reg_param}")
    # Pass every required argument (the call previously supplied only three of six).
    train_preds, test_preds = constant_train_predict(train_data,
                                                     test_data,
                                                     reg_parameter=reg_param,
                                                     features=model_features,
                                                     target_variable=target_variable,
                                                     cols_to_standardize=cols_to_standardize)

    # Average Poisson log-likelihood of the observed goals given the predicted npxg.
    _,train_poisson_ll = calculate_log_likelihood(train_preds.npg,
                                                  [train_preds.npxg_pred],
                                                  distribution='Poisson',
                                                  individual_scores=False)

    _,test_poisson_ll = calculate_log_likelihood(test_preds.npg,
                                                 [test_preds.npxg_pred],
                                                 distribution='Poisson',
                                                 individual_scores=False)

    r2score_train = r2_score(y_true=train_preds.npg, y_pred=train_preds.npxg_pred)
    r2score_test = r2_score(y_true=test_preds.npg, y_pred=test_preds.npxg_pred)

    grid_rows.append((reg_param, train_poisson_ll, test_poisson_ll, r2score_train, r2score_test))

grid_scores = pd.DataFrame(grid_rows, columns=["reg_parameter","train_ll", "test_ll", "r2score_train", "r2score_test"])
# Reassign rather than sort in place so re-running the display is side-effect free.
# NOTE(review): rows are ordered by the TRAIN log-likelihood; for model selection the
# test/validation column is usually the one to sort on.
grid_scores = grid_scores.sort_values("train_ll")
grid_scores

In [None]:
#final model and validation predictions
# Fits the GLM with alpha = 0.0005.
# NOTE(review): train_data, test_data, model_features and target_variable are only
# defined in commented-out lines earlier in the notebook, so this cell raises
# NameError on a fresh kernel until they are restored.
train_preds, test_preds = constant_train_predict(train_data, test_data, 0.0005, model_features, target_variable, cols_to_standardize)

#get ll metrics for test
# Per-row Poisson log-likelihood of the observed npg given the predicted npxg.
test_preds['poisson_ll'] = calculate_log_likelihood(test_preds.npg,
                                                           [test_preds.npxg_pred],
                                                           distribution='Poisson',
                                                           individual_scores=True)

In [None]:
#Poisson catboost
# `distribution` is left at its default ("Poisson"), so the Poisson loss is used.
cat_train_preds, cat_test_preds, catboost_model = catboost_train_predict(train_data = train_data_catboost,
                                                         test_data = test_data_catboost,
                                                         features = catboost_features,
                                                         target_variable = target_variable_cat,
                                                         cols_to_standardize = cols_to_standardize)

# Per-row Poisson log-likelihood of the observed npg given the predicted npg.
cat_test_preds['poisson_ll'] = calculate_log_likelihood(cat_test_preds.npg,
                                                           [cat_test_preds.npg_pred],
                                                           distribution='Poisson',
                                                           individual_scores=True)

In [None]:
# Mean per-row Poisson log-likelihood on the test predictions.
cat_test_preds.poisson_ll.mean()

In [None]:
# Display the test-set prediction frame.
cat_test_preds

In [None]:
# R^2 of the predicted npg against the observed npg on the test set.
r2_score(y_pred=cat_test_preds.npg_pred, y_true=cat_test_preds.npg)

In [None]:
# Per-position means of the log-likelihood, the predicted npg and the observed npg.
cat_test_preds.groupby(["position"],as_index=False)[["poisson_ll", "npg_pred", "npg"]].mean()

In [None]:
# Compare a player's observed goal-count distribution with the average Poisson
# distribution implied by the CatBoost predictions.
player_name = 'Harry Kane'
data_to_check = cat_train_preds

# Filter once and reuse.
player_rows = data_to_check[data_to_check.player == player_name]
print(f"N obs = {len(player_rows)}")

# Common x grid for both series: 0..7 goals.
_x = np.arange(0, 8)

# Observed relative frequencies, reindexed on the grid. Without the reindex the bars
# sit at categorical positions 0..k-1, so a goal count that never occurred shifts
# every later bar away from the matching point of the Poisson line.
# (Counts above 7 would be dropped by the reindex.)
observed = player_rows.npg.value_counts(normalize=True).reindex(_x, fill_value=0.0)

fig, ax = plt.subplots()
ax.bar(_x, observed.values, alpha=0.5, label="observed")

# #regression
# exps = test_preds[test_preds.player == 'Mohamed Salah'].npxg_pred.values
# probs = poisson.pmf(_x,exps[:,np.newaxis])
# ax.plot(_x, probs.mean(axis=0), label="regression")

#catboost
# One Poisson pmf per appearance (rows) over the grid (columns), averaged over appearances.
exps = player_rows.npg_pred.values
probs = poisson.pmf(_x,exps[:,np.newaxis])
ax.plot(_x, probs.mean(axis=0), label="catboost")

ax.set_xticks(_x)
ax.set_xlabel("non-penalty goals in a match")
ax.set_ylabel("probability / relative frequency")
ax.set_title(player_name)
ax.legend()
plt.show()

In [None]:
# Feature importances reported by the fitted CatBoost model.
feature_importances = catboost_model.get_feature_importance()

# Pair with feature names for clearer understanding
# (assumes catboost_features is, in order, the column list the model was fitted on).
features = catboost_features
importance_dict = dict(zip(features, feature_importances))
plt.bar(importance_dict.keys(), importance_dict.values())
plt.xticks(rotation=90)