In [1]:
import os
import numpy as np
import pandas as pd
import seaborn as sns
import scipy

from tqdm import tqdm

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_absolute_error
from sklearn import ensemble
from sklearn.metrics import mean_absolute_error
import joblib


import matplotlib.pyplot as plt
import matplotlib.ticker as ticker

import plotly.express as px

from datetime import datetime
from scipy import stats

# Notebook-wide pandas display settings: cap the rendered rows/columns at 50
# and print floats with two decimal places.
for option_name, option_value in (
    ('display.max_rows', 50),
    ('display.max_columns', 50),
    ('display.float_format', '{:.2f}'.format),
):
    pd.set_option(option_name, option_value)


In [2]:
# Raw merged player-history table; kickoff_time is parsed as a datetime column.
orig_df = pd.read_csv(
    r"./data/2019-20/player_past_history/merged_data.csv",
    parse_dates=["kickoff_time"],
)

# Player master list; news_added is parsed as a datetime column.
all_players = pd.read_csv(
    r"./data/2019-20/all_players.csv",
    parse_dates=["news_added"],
)
total_players = len(all_players)

# Working frame: keep only the rows where minutes is non-zero.
# orig_df keeps the unfiltered rows for reference.
data = orig_df.loc[orig_df["minutes"] != 0]



In [3]:
# (rows, columns) of the unfiltered frame.
orig_df.shape

(22495, 37)

In [4]:
# (rows, columns) after dropping the rows with zero minutes.
data.shape

(10614, 37)

In [5]:
# Preview the first rows of the filtered frame.
data.head()

Unnamed: 0,fixture_id,opponent_team_id,total_points,was_home,kickoff_time,team_h_score,team_a_score,gameweek_id,minutes,goals_scored,assists,clean_sheets,goals_conceded,own_goals,penalties_saved,penalties_missed,yellow_cards,red_cards,saves,bonus,bps,influence,creativity,threat,ict_index,player_price,transfers_balance,selected,transfers_in,transfers_out,team_id,opponent_strength,opponent_team_name,player_id,web_name,position_id,player_price.1
2,4,8,2,True,2019-08-10 14:00:00,0,0,1,90,0,0,1,0,0,0,0,1,0,0,0,-2,1.6,14.8,0.0,1.6,70,0,495302,0,0,8,3,EVE,134,Milivojevic,3,6.7
3,4,8,1,True,2019-08-10 14:00:00,0,0,1,25,0,0,0,0,0,0,0,0,0,0,0,1,4.4,0.8,32.0,3.7,70,0,1082439,0,0,8,3,EVE,133,Zaha,3,6.6
6,3,16,14,True,2019-08-10 14:00:00,3,0,1,90,0,2,1,0,0,0,0,0,0,0,2,43,56.8,25.1,1.0,8.3,45,0,9332,0,0,16,3,SOU,447,Pieters,2,4.3
7,4,8,6,True,2019-08-10 14:00:00,0,0,1,90,0,0,1,0,0,0,0,0,0,2,0,24,14.0,0.0,0.0,1.4,50,0,22626,0,0,8,3,EVE,131,Guaita,1,4.9
8,4,8,1,True,2019-08-10 14:00:00,0,0,1,7,0,0,0,0,0,0,0,0,0,0,0,1,0.0,0.5,2.0,0.2,45,0,280588,0,0,8,3,EVE,130,Wickham,4,4.3


In [6]:
# List every available column before choosing the ones used for modelling.
data.columns

Index(['fixture_id', 'opponent_team_id', 'total_points', 'was_home',
       'kickoff_time', 'team_h_score', 'team_a_score', 'gameweek_id',
       'minutes', 'goals_scored', 'assists', 'clean_sheets', 'goals_conceded',
       'own_goals', 'penalties_saved', 'penalties_missed', 'yellow_cards',
       'red_cards', 'saves', 'bonus', 'bps', 'influence', 'creativity',
       'threat', 'ict_index', 'player_price', 'transfers_balance', 'selected',
       'transfers_in', 'transfers_out', 'team_id', 'opponent_strength',
       'opponent_team_name', 'player_id', 'web_name', 'position_id',
       'player_price.1'],
      dtype='object')

In [7]:
# Narrow the frame to the target (total_points) plus the columns used as predictors.
model_columns = [
    'player_id',
    'total_points',
    'was_home',
    'minutes',
    'team_id',
    'opponent_strength',
    'position_id',
]
data = data[model_columns]

In [8]:
# Check dtypes and non-null counts of the selected columns.
data.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 10614 entries, 2 to 22493
Data columns (total 7 columns):
player_id            10614 non-null int64
total_points         10614 non-null int64
was_home             10614 non-null bool
minutes              10614 non-null int64
team_id              10614 non-null int64
opponent_strength    10614 non-null int64
position_id          10614 non-null int64
dtypes: bool(1), int64(6)
memory usage: 590.8 KB


In [9]:
# Cast team_id to str so it is handled as a categorical label rather than a
# number (it is one-hot encoded further down).
# `assign` returns a new frame instead of writing a column into `data`, which at
# this point is a selection taken from another frame; in-place column assignment
# on such a selection triggers pandas' SettingWithCopyWarning.
data = data.assign(team_id=data['team_id'].astype(str))


In [10]:
# Pairwise correlations between the numeric/boolean columns.
# team_id is a string column here, so it is excluded explicitly: older pandas
# dropped non-numeric columns from corr() silently, while pandas >= 2.0 raises
# on them. Selecting the dtypes up front gives the same matrix on both.
corr_matrix = data.select_dtypes(include=['number', 'bool']).corr()

# Correlation of each column with the target, strongest positive first.
corr_matrix['total_points'].sort_values(ascending=False)

total_points         1.00
minutes              0.31
was_home             0.08
position_id         -0.02
player_id           -0.05
opponent_strength   -0.13
Name: total_points, dtype: float64

In [11]:
# One-hot encode team_id, then remove the target column so that only
# predictor columns remain.
features_data = (
    pd.get_dummies(data, columns=['team_id'])
    .drop(columns='total_points')
)

# Feature names in column order; the same order applies to the columns of the
# array taken from features_data.
list_features = features_data.columns.tolist()
print(list_features)

['player_id', 'was_home', 'minutes', 'opponent_strength', 'position_id', 'team_id_1', 'team_id_10', 'team_id_11', 'team_id_12', 'team_id_13', 'team_id_14', 'team_id_15', 'team_id_16', 'team_id_17', 'team_id_18', 'team_id_19', 'team_id_2', 'team_id_20', 'team_id_3', 'team_id_4', 'team_id_5', 'team_id_6', 'team_id_7', 'team_id_8', 'team_id_9']


In [12]:
# Sanity check: print each column's first value together with its runtime type.
for column in data.columns:
    first_value = data[column].values[0]
    print(column, first_value, type(first_value))

# Target vector and feature matrix as NumPy arrays.
y = data['total_points'].values
X = features_data.values

# Hold out 30% of the rows as a test set; random_state fixes the shuffle so the
# split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=0
)


player_id134<class 'numpy.int64'>
total_points2<class 'numpy.int64'>
was_homeTrue<class 'numpy.bool_'>
minutes90<class 'numpy.int64'>
team_id8<class 'str'>
opponent_strength3<class 'numpy.int64'>
position_id3<class 'numpy.int64'>


In [13]:
# Create the model
# Default-parameter estimator: it is only referenced by the commented-out grid
# search below and is replaced by a configured estimator before fitting.
model = ensemble.GradientBoostingRegressor()

In [14]:
# Search space for the optional grid search (3 * 2 * 4 * 4 * 3 * 3 = 864
# parameter combinations).
# NOTE(review): scikit-learn 1.0 renamed the losses 'ls' -> 'squared_error' and
# 'lad' -> 'absolute_error' and later removed the old names; update the 'loss'
# entry if this is run on a newer scikit-learn.
param_grid = dict(
    n_estimators=[500, 1000, 3000],
    max_depth=[4, 6],
    min_samples_leaf=[3, 5, 9, 17],
    learning_rate=[0.1, 0.05, 0.02, 0.01],
    max_features=[1.0, 0.3, 0.1],
    loss=['ls', 'lad', 'huber'],
)

## THE GRID SEARCH BELOW TAKES A LONG TIME TO RUN

In [15]:
#Define the grid search we want to run. Run it with four cpus in parallel.
#gs_cv = GridSearchCV(model, param_grid, n_jobs=4)

#Run the grid search - on only the training data!
#gs_cv.fit(X_train, y_train)

#Print the parameters that gave us the best result!
#print(gs_cv.best_params_)

# After running for a long time, the output will be something like
# {'loss': 'huber', 'learning_rate': 0.1, 'min_samples_leaf': 9, 'n_estimators': 3000, 'max_features': 0.1, 'max_depth': 6}
# That is the parameter combination that scored best in the search.

### Fit regression model


In [16]:
# Hyper-parameters of the final model; random_state is pinned so repeated fits
# give the same model.
# NOTE(review): max_depth, min_samples_leaf and max_features differ from the
# example grid-search result quoted in the commented-out search cell — confirm
# which values are intended.
gbr_params = {
    'n_estimators': 3000,
    'learning_rate': 0.1,
    'max_depth': 4,
    'min_samples_leaf': 3,
    'max_features': 1.0,
    'loss': 'huber',
    'random_state': 0,
}
model = ensemble.GradientBoostingRegressor(**gbr_params)

# Fit on the training split only. fit() returns the estimator, so its repr is
# what the cell displays.
model.fit(X_train, y_train)

GradientBoostingRegressor(alpha=0.9, ccp_alpha=0.0, criterion='friedman_mse',
                          init=None, learning_rate=0.1, loss='huber',
                          max_depth=4, max_features=1.0, max_leaf_nodes=None,
                          min_impurity_decrease=0.0, min_impurity_split=None,
                          min_samples_leaf=3, min_samples_split=2,
                          min_weight_fraction_leaf=0.0, n_estimators=3000,
                          n_iter_no_change=None, presort='deprecated',
                          random_state=0, subsample=1.0, tol=0.0001,
                          validation_fraction=0.1, verbose=0, warm_start=False)

### Save the trained model to a file so we can use it in other programs


In [17]:
# Persist the fitted model to disk; the list of written file names is the cell output.
joblib.dump(model, 'Trained_PlayerAnalysis.pkl')

['Trained_PlayerAnalysis.pkl']

### Find the mean absolute error on the training set


In [18]:
# Mean absolute error on the rows the model was fitted on.
# The value is a mean *absolute* error, so it is stored under `train_mae` rather
# than the misleading `mse`; a distinct name also keeps it from being
# overwritten when the test-set error is computed.
train_mae = mean_absolute_error(y_train, model.predict(X_train))
print("Training Set Mean Absolute Error: %.4f" % train_mae)

Training Set Mean Absolute Error: 1.2709


### Find the mean absolute error on the test set


In [19]:
# Mean absolute error on the held-out test rows.
# The value is a mean *absolute* error, so it is stored under `test_mae` rather
# than the misleading `mse`; a distinct name also leaves any previously computed
# training error intact for comparison.
test_mae = mean_absolute_error(y_test, model.predict(X_test))
print("Test Set Mean Absolute Error: %.4f" % test_mae)

Test Set Mean Absolute Error: 1.8239


In [20]:
# Reload the model file written by the joblib.dump cell earlier in this notebook.
# NOTE: joblib.load unpickles the file, so only load model files you trust.
model = joblib.load('Trained_PlayerAnalysis.pkl')

# Per-feature importance scores of the fitted model, as a NumPy array.
importance = model.feature_importances_

# Feature positions ordered by importance. argsort is ascending, so the least
# important feature comes first and the most important one last.
feature_indexes_by_importance = np.argsort(importance)


In [21]:
### Print each feature label, from least important to most important (ascending order)


In [22]:
# Walk the features in the stored order (ascending importance) and print each
# label with its importance as a percentage.
for feature_idx in feature_indexes_by_importance:
    label = list_features[feature_idx]
    share = importance[feature_idx] * 100.0
    print("{} - {:.2f}%".format(label, share))

team_id_10 - 0.22%
team_id_11 - 0.38%
team_id_1 - 0.44%
team_id_17 - 0.55%
team_id_20 - 0.57%
team_id_4 - 0.60%
team_id_3 - 0.65%
team_id_15 - 0.67%
team_id_7 - 0.70%
team_id_5 - 0.72%
team_id_8 - 0.74%
team_id_12 - 0.76%
team_id_19 - 0.78%
team_id_6 - 0.79%
team_id_9 - 0.79%
team_id_18 - 1.00%
team_id_2 - 1.01%
team_id_16 - 1.06%
team_id_13 - 1.09%
team_id_14 - 1.34%
was_home - 4.12%
position_id - 5.39%
opponent_strength - 6.78%
player_id - 33.29%
minutes - 35.56%
