In [57]:
import os
import numpy as np
import pandas as pd
import seaborn as sns
import scipy

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_absolute_error
from sklearn import ensemble
from sklearn.metrics import mean_absolute_error
import joblib


import matplotlib.pyplot as plt
import matplotlib.ticker as ticker

import plotly.express as px

from datetime import datetime
from scipy import stats

#Set Display options
pd.set_option('display.max_rows', 50)
pd.set_option('display.max_columns', None)

pd.options.display.float_format = '{:.2f}'.format


In [58]:
data_orig = pd.read_csv(r"../data/2020-21/ML_dataset/ML_player_gameweek_history.csv" , parse_dates = ["kickoff_time"]) 
all_players = pd.read_csv(r"../data/2020-21/ML_dataset/ML_all_players.csv" , parse_dates= ["news_added"]) 


total_players = len(all_players)
data = data_orig.copy()

data = data[data.minutes_played > 59]
data['ekstra_points'] =data['total_points'] - 2



In [59]:
data.shape

(1643, 42)

In [60]:
data.head()

Unnamed: 0,player_id,fixture_id,opponent_team_id,total_points,home_game,kickoff_time,round,minutes_played,goals_scored,assists,clean_sheets,goals_conceded,own_goals,penalties_saved,penalties_missed,yellow_cards,red_cards,saves,bonus,bps,influence,creativity,threat,ict_index,player_price,transfers_balance,selected,transfers_in,transfers_out,team_score,opponent_team_score,team_id,corners_and_indirect_freekicks_order,direct_freekicks_order,penalties_order,team_short_name,position_name_short,team_strength,full_name,opponent_team,opponent_team_strength,ekstra_points
1,4,2,8,7,False,2020-09-12 11:30:00,1,90,1,0,1,0,0,0,0,1,0,0,0,19,36.6,15.3,54.0,10.6,12.0,0,2823465,0,0,3,0,1,0.0,0.0,1.0,ARS,MID,4,Pierre-Emerick Aubameyang,FUL,2,5
2,6,2,8,7,False,2020-09-12 11:30:00,1,86,1,0,1,0,0,0,0,0,0,0,1,29,38.6,12.7,48.0,9.9,8.5,0,196064,0,0,3,0,1,0.0,0.0,0.0,ARS,FWD,4,Alexandre Lacazette,FUL,2,5
4,8,2,8,7,False,2020-09-12 11:30:00,1,90,0,0,1,0,0,0,0,0,0,2,1,29,14.0,0.0,0.0,1.4,5.0,0,400285,0,0,3,0,1,0.0,0.0,0.0,ARS,GKP,4,Bernd Leno,FUL,2,5
5,9,2,8,3,False,2020-09-12 11:30:00,1,77,0,0,1,0,0,0,0,0,0,0,0,11,6.8,17.2,21.0,4.5,5.5,0,42909,0,0,3,0,1,0.0,2.0,0.0,ARS,MID,4,Granit Xhaka,FUL,2,1
6,11,2,8,5,False,2020-09-12 11:30:00,1,90,0,0,1,0,0,0,0,1,0,0,0,23,9.8,26.4,4.0,4.0,5.0,0,240975,0,0,3,0,1,0.0,0.0,0.0,ARS,DEF,4,Héctor Bellerín,FUL,2,3


In [61]:
data.columns

Index(['player_id', 'fixture_id', 'opponent_team_id', 'total_points',
       'home_game', 'kickoff_time', 'round', 'minutes_played', 'goals_scored',
       'assists', 'clean_sheets', 'goals_conceded', 'own_goals',
       'penalties_saved', 'penalties_missed', 'yellow_cards', 'red_cards',
       'saves', 'bonus', 'bps', 'influence', 'creativity', 'threat',
       'ict_index', 'player_price', 'transfers_balance', 'selected',
       'transfers_in', 'transfers_out', 'team_score', 'opponent_team_score',
       'team_id', 'corners_and_indirect_freekicks_order',
       'direct_freekicks_order', 'penalties_order', 'team_short_name',
       'position_name_short', 'team_strength', 'full_name', 'opponent_team',
       'opponent_team_strength', 'ekstra_points'],
      dtype='object')

In [62]:
data = data[[
    'ekstra_points',
    'home_game',
    'minutes_played',
    'corners_and_indirect_freekicks_order',
    'direct_freekicks_order',
    'penalties_order',
    'team_short_name',
    'position_name_short',
    'team_strength',
    'full_name',
    'opponent_team',
    'opponent_team_strength'
    ]]

In [63]:
data.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 1643 entries, 1 to 3129
Data columns (total 12 columns):
 #   Column                                Non-Null Count  Dtype  
---  ------                                --------------  -----  
 0   ekstra_points                         1643 non-null   int64  
 1   home_game                             1643 non-null   bool   
 2   minutes_played                        1643 non-null   int64  
 3   corners_and_indirect_freekicks_order  1643 non-null   float64
 4   direct_freekicks_order                1643 non-null   float64
 5   penalties_order                       1643 non-null   float64
 6   team_short_name                       1643 non-null   object 
 7   position_name_short                   1643 non-null   object 
 8   team_strength                         1643 non-null   int64  
 9   full_name                             1643 non-null   object 
 10  opponent_team                         1643 non-null   object 
 11  opponent_team_str

In [64]:
corr_matrix = data.corr()
corr_matrix['ekstra_points'].sort_values(ascending=False)

ekstra_points                           1.00
penalties_order                         0.17
team_strength                           0.09
corners_and_indirect_freekicks_order    0.05
minutes_played                          0.04
direct_freekicks_order                  0.04
home_game                              -0.07
opponent_team_strength                 -0.15
Name: ekstra_points, dtype: float64

In [65]:
# Replace categorical data with one-hot encoded data
features_data = pd.get_dummies(
    data, 
    columns=[
        'corners_and_indirect_freekicks_order' , 
        'direct_freekicks_order' , 
        'penalties_order' , 
        'team_short_name' , 
        'position_name_short',
        #'team_strength',
        'full_name',
        'opponent_team'
        #,'opponent_team_strength'
        ]
)
#features_df = pd.get_dummies(df)

features_data = features_data.drop('ekstra_points', axis=1)

list_features = list(features_data.columns.values)
print(list_features)

['home_game', 'minutes_played', 'team_strength', 'opponent_team_strength', 'corners_and_indirect_freekicks_order_0.0', 'corners_and_indirect_freekicks_order_1.0', 'corners_and_indirect_freekicks_order_2.0', 'direct_freekicks_order_0.0', 'direct_freekicks_order_1.0', 'direct_freekicks_order_2.0', 'penalties_order_0.0', 'penalties_order_1.0', 'penalties_order_2.0', 'team_short_name_ARS', 'team_short_name_AVL', 'team_short_name_BHA', 'team_short_name_BUR', 'team_short_name_CHE', 'team_short_name_CRY', 'team_short_name_EVE', 'team_short_name_FUL', 'team_short_name_LEE', 'team_short_name_LEI', 'team_short_name_LIV', 'team_short_name_MCI', 'team_short_name_MUN', 'team_short_name_NEW', 'team_short_name_SHU', 'team_short_name_SOU', 'team_short_name_TOT', 'team_short_name_WBA', 'team_short_name_WHU', 'team_short_name_WOL', 'position_name_short_DEF', 'position_name_short_FWD', 'position_name_short_GKP', 'position_name_short_MID', 'full_name_Aaron Connolly', 'full_name_Aaron Cresswell', 'full_nam

In [66]:
for i in list(data):
    print(i, data[i].values[0],type(data[i].values[0]))

# Create the X and y arrays
X = features_data.values
y = data['ekstra_points'].values

# Split the data set in a training set (70%) and a test set (30%)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)


ekstra_points 5 <class 'numpy.int64'>
home_game False <class 'numpy.bool_'>
minutes_played 90 <class 'numpy.int64'>
corners_and_indirect_freekicks_order 0.0 <class 'numpy.float64'>
direct_freekicks_order 0.0 <class 'numpy.float64'>
penalties_order 1.0 <class 'numpy.float64'>
team_short_name ARS <class 'str'>
position_name_short MID <class 'str'>
team_strength 4 <class 'numpy.int64'>
full_name Pierre-Emerick Aubameyang <class 'str'>
opponent_team FUL <class 'str'>
opponent_team_strength 2 <class 'numpy.int64'>


In [67]:
# Create the model
model = ensemble.GradientBoostingRegressor()

In [68]:
#Parameters we want to try
param_grid = {
    'n_estimators': [500, 1000, 3000],
    'max_depth': [4, 6],
    'min_samples_leaf': [3, 5, 9, 17],
    'learning_rate': [0.1, 0.05, 0.02, 0.01],
    'max_features': [1.0, 0.3, 0.1],
    'loss': ['ls', 'lad', 'huber']
    }

## THIS MODULE TAKES A LONG TIME TO RUN

In [69]:
#Define the grid search we want to run. Run it with four cpus in parallel.
#gs_cv = GridSearchCV(model, param_grid, n_jobs=4)

#Run the grid search - on only the training data!
#gs_cv.fit(X_train, y_train)

#Print the parameters that gave us the best result!
#print(gs_cv.best_params_)

# After running a .....long..... time, the output will be something like
# {'loss': 'huber', 'learning_rate': 0.1, 'min_samples_leaf': 9, 'n_estimators': 3000, 'max_features': 0.1, 'max_depth': 6}
# That is the combination that worked best.

### Fit regression model


In [70]:
model = ensemble.GradientBoostingRegressor(
    n_estimators=3000,
    learning_rate=0.1,
    max_depth=4,
    min_samples_leaf=3,
    max_features=1.0,
    loss='huber',
    random_state=0
)

model.fit(X_train, y_train)

GradientBoostingRegressor(loss='huber', max_depth=4, max_features=1.0,
                          min_samples_leaf=3, n_estimators=3000,
                          random_state=0)

### Save the trained model to a file so we can use it in other programs


In [71]:
joblib.dump(model, 'Trained_PlayerAnalysis.pkl')

['Trained_PlayerAnalysis.pkl']

### Find the error rate on the training set


In [72]:
mse = mean_absolute_error(y_train, model.predict(X_train))
print("Training Set Mean Absolute Error: %.4f" % mse)

Training Set Mean Absolute Error: 1.6432


### Find the error rate on the test set


In [73]:
mse = mean_absolute_error(y_test, model.predict(X_test))
print("Test Set Mean Absolute Error: %.4f" % mse)

Test Set Mean Absolute Error: 2.1971


In [74]:
# Load the trained model created with train_model.py
model = joblib.load('Trained_PlayerAnalysis.pkl')

# Create a numpy array based on the model's feature importances
importance = model.feature_importances_

# Sort the feature labels based on the feature importance rankings from the model
feature_indexes_by_importance = importance.argsort()


In [75]:
### Print each feature label, from most important to least important (reverse order)


In [76]:
for index in feature_indexes_by_importance:
    print("{} - {:.2f}%".format(list_features[index], (importance[index] * 100.0)))

full_name_John Fleck - 0.00%
full_name_Marc Albrighton - 0.00%
full_name_Mamadou Sakho - 0.00%
full_name_Luke Thomas - 0.00%
full_name_Lukasz Fabianski - 0.00%
full_name_Luka Milivojevic - 0.00%
full_name_Lucas Digne - 0.00%
full_name_Liam Cooper - 0.00%
full_name_Leandro Trossard - 0.00%
full_name_Leander Dendoncker - 0.00%
full_name_Kyle Walker-Peters - 0.00%
full_name_Kyle Walker - 0.00%
full_name_Kyle Bartley - 0.00%
full_name_Kieran Tierney - 0.00%
full_name_Kieran Gibbs - 0.00%
full_name_Kevin Long - 0.00%
full_name_Marcos Alonso - 0.00%
full_name_Marek Rodák - 0.00%
full_name_Mario Lemina - 0.00%
full_name_Mark Noble - 0.00%
full_name_Mohamed Naser El Sayed Elneny - 0.00%
full_name_Miguel Almirón - 0.00%
full_name_Michy Batshuayi - 0.00%
full_name_Michael Hector - 0.00%
full_name_Maxime Le Marchand - 0.00%
full_name_Max Lowe - 0.00%
full_name_Matthew Lowton - 0.00%
full_name_Kepa Arrizabalaga - 0.00%
full_name_Matthew Cash - 0.00%
full_name_Matt Ritchie - 0.00%
full_name_Matt Do