# Imports

In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import math

from sklearn.neural_network import MLPClassifier

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from sklearn.metrics import plot_roc_curve
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import accuracy_score

# Loads Data

In [3]:
df = pd.read_csv(r"League_Result_Data/Encoded_PremierLeague_Stats_From_2014to2021.csv")
df.dropna()
df

Unnamed: 0.1,Unnamed: 0,Season,Season Encoding,Date,YearOfSeason,HomeTeam,HomeTeam Encoding,AwayTeam,AwayTeam Encoding,FTHG,...,HST,AST,HF,AF,HC,AC,HY,AY,HR,AR
0,0,Fall,0,12/09/2020,2020/21,Fulham,8,Arsenal,0,0.0,...,2.0,6.0,12.0,12.0,2.0,3.0,2.0,2.0,0.0,0.0
1,1,Fall,0,12/09/2020,2020/21,Crystal Palace,6,Southampton,21,1.0,...,3.0,5.0,14.0,11.0,7.0,3.0,2.0,1.0,0.0,0.0
2,2,Fall,0,12/09/2020,2020/21,Liverpool,13,Leeds,11,4.0,...,6.0,3.0,9.0,6.0,9.0,0.0,1.0,0.0,0.0,0.0
3,3,Fall,0,12/09/2020,2020/21,West Ham,28,Newcastle,17,0.0,...,3.0,2.0,13.0,7.0,8.0,7.0,2.0,2.0,0.0,0.0
4,4,Fall,0,13/09/2020,2020/21,West Brom,27,Leicester,12,0.0,...,1.0,7.0,12.0,9.0,2.0,5.0,1.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2564,2564,Spring,2,24/05/15,2014/15,Everton,7,Tottenham,25,0.0,...,1.0,3.0,12.0,8.0,3.0,5.0,1.0,2.0,0.0,0.0
2565,2565,Spring,2,24/05/15,2014/15,Hull,10,Man United,15,0.0,...,6.0,1.0,12.0,15.0,8.0,1.0,2.0,2.0,0.0,1.0
2566,2566,Spring,2,24/05/15,2014/15,Leicester,12,QPR,19,5.0,...,7.0,2.0,7.0,6.0,5.0,6.0,0.0,0.0,0.0,0.0
2567,2567,Spring,2,24/05/15,2014/15,Man City,14,Southampton,21,2.0,...,6.0,4.0,13.0,8.0,8.0,4.0,1.0,1.0,0.0,0.0


## # Encodes the YearOfSeason values

In [4]:
df = df.dropna()

In [5]:
seasons = set(df['YearOfSeason'].values)
seasons = list(seasons)
seasons

['2019/20', '2015/16', '2016/17', '2017/18', '2020/21', '2014/15', '2018/19']

In [6]:
# re-orders the list 
seasons = ['2014/15', '2015/16', '2016/17', '2017/18', '2018/19', '2019/20', '2020/21']
seasons

['2014/15', '2015/16', '2016/17', '2017/18', '2018/19', '2019/20', '2020/21']

In [7]:
df['YearOfSeason Encoding'] = -9

for index, value in df['YearOfSeason'].iteritems():
    
    # 2014/15: 0
    if value == seasons[0]:
        df.at[index, 'YearOfSeason Encoding'] = 0
    
    # 2015/16: 1
    if value == seasons[1]:
        df.at[index, 'YearOfSeason Encoding'] = 1
        
    # 2016/17: 2
    if value == seasons[2]:
        df.at[index, 'YearOfSeason Encoding'] = 2
    
    # 2017/18: 3
    if value == seasons[3]:
        df.at[index, 'YearOfSeason Encoding'] = 3
        
    # 2018/19: 4
    if value == seasons[4]:
        df.at[index, 'YearOfSeason Encoding'] = 4
        
    # 2019/20: 5
    if value == seasons[5]:
        df.at[index, 'YearOfSeason Encoding'] = 5
    
    # 2020/21: 6
    if value == seasons[6]:
        df.at[index, 'YearOfSeason Encoding'] = 6 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [8]:
df

Unnamed: 0.1,Unnamed: 0,Season,Season Encoding,Date,YearOfSeason,HomeTeam,HomeTeam Encoding,AwayTeam,AwayTeam Encoding,FTHG,...,AST,HF,AF,HC,AC,HY,AY,HR,AR,YearOfSeason Encoding
0,0,Fall,0,12/09/2020,2020/21,Fulham,8,Arsenal,0,0.0,...,6.0,12.0,12.0,2.0,3.0,2.0,2.0,0.0,0.0,6
1,1,Fall,0,12/09/2020,2020/21,Crystal Palace,6,Southampton,21,1.0,...,5.0,14.0,11.0,7.0,3.0,2.0,1.0,0.0,0.0,6
2,2,Fall,0,12/09/2020,2020/21,Liverpool,13,Leeds,11,4.0,...,3.0,9.0,6.0,9.0,0.0,1.0,0.0,0.0,0.0,6
3,3,Fall,0,12/09/2020,2020/21,West Ham,28,Newcastle,17,0.0,...,2.0,13.0,7.0,8.0,7.0,2.0,2.0,0.0,0.0,6
4,4,Fall,0,13/09/2020,2020/21,West Brom,27,Leicester,12,0.0,...,7.0,12.0,9.0,2.0,5.0,1.0,1.0,0.0,0.0,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2564,2564,Spring,2,24/05/15,2014/15,Everton,7,Tottenham,25,0.0,...,3.0,12.0,8.0,3.0,5.0,1.0,2.0,0.0,0.0,0
2565,2565,Spring,2,24/05/15,2014/15,Hull,10,Man United,15,0.0,...,1.0,12.0,15.0,8.0,1.0,2.0,2.0,0.0,1.0,0
2566,2566,Spring,2,24/05/15,2014/15,Leicester,12,QPR,19,5.0,...,2.0,7.0,6.0,5.0,6.0,0.0,0.0,0.0,0.0,0
2567,2567,Spring,2,24/05/15,2014/15,Man City,14,Southampton,21,2.0,...,4.0,13.0,8.0,8.0,4.0,1.0,1.0,0.0,0.0,0


In [9]:
# Features -- Drops FTR and any categorical value 
X = df.drop(columns=["Season", "Unnamed: 0", "YearOfSeason", "Date", "HomeTeam", "FTHG", "FTAG", "AwayTeam", "Referee", "FTR", "FTR Encoding", "HTR", "Referee"])
# Labels
y = df["FTR Encoding"]

X

Unnamed: 0,Season Encoding,HomeTeam Encoding,AwayTeam Encoding,HTHG,HTAG,HTR Encoding,Referee Encoding,Fouls Called Per Game,HS,AS,...,AST,HF,AF,HC,AC,HY,AY,HR,AR,YearOfSeason Encoding
0,0,8,0,0.0,1.0,1,7,14.0,5.0,13.0,...,6.0,12.0,12.0,2.0,3.0,2.0,2.0,0.0,0.0,6
1,0,6,21,1.0,0.0,0,27,14.0,5.0,9.0,...,5.0,14.0,11.0,7.0,3.0,2.0,1.0,0.0,0.0,6
2,0,13,11,3.0,2.0,0,28,14.0,22.0,6.0,...,3.0,9.0,6.0,9.0,0.0,1.0,0.0,0.0,0.0,6
3,0,28,17,0.0,0.0,2,11,13.0,15.0,15.0,...,2.0,13.0,7.0,8.0,7.0,2.0,2.0,0.0,0.0,6
4,0,27,12,0.0,0.0,2,10,15.0,7.0,13.0,...,7.0,12.0,9.0,2.0,5.0,1.0,1.0,0.0,0.0,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2564,2,7,25,0.0,1.0,1,27,14.0,9.0,16.0,...,3.0,12.0,8.0,3.0,5.0,1.0,2.0,0.0,0.0,0
2565,2,10,15,0.0,0.0,2,15,13.0,16.0,7.0,...,1.0,12.0,15.0,8.0,1.0,2.0,2.0,0.0,1.0,0
2566,2,12,19,2.0,0.0,0,28,14.0,22.0,18.0,...,2.0,7.0,6.0,5.0,6.0,0.0,0.0,0.0,0.0,0
2567,2,14,21,1.0,0.0,0,2,14.0,15.0,13.0,...,4.0,13.0,8.0,8.0,4.0,1.0,1.0,0.0,0.0,0


In [10]:
X.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2414 entries, 0 to 2568
Data columns (total 21 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Season Encoding        2414 non-null   int64  
 1   HomeTeam Encoding      2414 non-null   int64  
 2   AwayTeam Encoding      2414 non-null   int64  
 3   HTHG                   2414 non-null   float64
 4   HTAG                   2414 non-null   float64
 5   HTR Encoding           2414 non-null   int64  
 6   Referee Encoding       2414 non-null   int64  
 7   Fouls Called Per Game  2414 non-null   float64
 8   HS                     2414 non-null   float64
 9   AS                     2414 non-null   float64
 10  HST                    2414 non-null   float64
 11  AST                    2414 non-null   float64
 12  HF                     2414 non-null   float64
 13  AF                     2414 non-null   float64
 14  HC                     2414 non-null   float64
 15  AC  

In [11]:
"""
Target Varible - Full Time Results
0 - Home Team Win
1 - Away Team Win
2 - Draw
"""

y

0       1
1       0
2       0
3       1
4       1
       ..
2564    1
2565    2
2566    0
2567    0
2568    0
Name: FTR Encoding, Length: 2414, dtype: int64

# Split the Data

In [12]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.5,random_state=0)

# Scale the Data

In [13]:
scaler = StandardScaler()

X_train = scaler.fit_transform(X_train)
X_test = scaler.fit_transform(X_test)

# Fitting & Evaluating the Model

In [14]:
nn_clf = MLPClassifier(solver='adam', alpha=1e-3,
                     hidden_layer_sizes=(5, 2), random_state=1,max_iter=500)

nn_clf.fit(X_train, y_train)

score = nn_clf.score(X_test, y_test)

print(score)

0.6404308202154101




In [15]:
results = nn_clf.predict(X_test)

In [16]:
# Precision_Recall_F-Score
p_r_f = precision_recall_fscore_support(y_test, results, average='macro')
p_r_f

(0.5376839989779585, 0.5619476815878256, 0.5036511846218643, None)

In [17]:
test_acc = accuracy_score(y_test, results)
test_acc

0.6404308202154101

In [18]:
cv_dict = cross_validate(nn_clf, X, y, return_train_score=True)
cv_dict

{'fit_time': array([0.83802557, 0.87206841, 0.84600043, 0.83200002, 0.8204627 ]),
 'score_time': array([0.00197482, 0.00197387, 0.00099993, 0.00200272, 0.0010004 ]),
 'test_score': array([0.44927536, 0.44927536, 0.44720497, 0.45134576, 0.45228216]),
 'train_score': array([0.45157949, 0.45106163, 0.45106163, 0.45106163, 0.45031056])}

In [19]:
# Confusion Matrix

cv_score = cross_val_score(nn_clf, X_train, y_train, cv=3, scoring='accuracy')
print('Accuracy:',cv_score, '\n')

y_train_pred = cross_val_predict(nn_clf, X_train, y_train, cv=3)
print(confusion_matrix(y_train, y_train_pred))



Accuracy: [0.60794045 0.61940299 0.61940299] 





[[413 114   4]
 [ 46 325   8]
 [110 182   5]]




# Evaluating the Model

In [20]:
nn_clf.classes_

array([0, 1, 2], dtype=int64)

In [21]:
nn_clf.loss_

0.7308440140952004

In [22]:
nn_clf.best_loss_

0.7308440140952004

In [23]:
nn_clf.loss_curve_

[1.4046995401066922,
 1.381461836074015,
 1.3610159575355978,
 1.3446645136568038,
 1.3311792027036684,
 1.3186569953653573,
 1.3090154115161399,
 1.3000961267670568,
 1.2922345270883828,
 1.284842640772582,
 1.2775008205701355,
 1.2711078352518654,
 1.2652345313842528,
 1.2600864595501304,
 1.254884212423295,
 1.250099101236511,
 1.2456553796725915,
 1.241438811939722,
 1.2373649770249684,
 1.2335134187328685,
 1.2296759971655933,
 1.2261153122674289,
 1.222642464269718,
 1.2192524529875524,
 1.2161338948837468,
 1.2130338048198757,
 1.2099995746159629,
 1.2069879821964782,
 1.2039511458608267,
 1.2010490754789782,
 1.1983796199511936,
 1.1958774483834997,
 1.1932082066940344,
 1.1908154436513425,
 1.1883449848756968,
 1.1857000545593546,
 1.1830711859774954,
 1.180544791436748,
 1.1783015305157265,
 1.1761204724834144,
 1.1739392548860348,
 1.171727657692443,
 1.1696805039193785,
 1.1677341224991338,
 1.1657124736684399,
 1.1636630967950308,
 1.1619399204254994,
 1.1602498853068388,


In [24]:
str(nn_clf.t_) + " training instances seen during fitting"

'603500 training instances seen during fitting'

In [25]:
str(nn_clf.n_iter_) + "  iterations"

'500  iterations'

In [26]:
str(nn_clf.n_layers_) + " layers"

'4 layers'

In [27]:
str(nn_clf.n_outputs_) + " outputs"

'3 outputs'

In [28]:
"Output Activation: " + str(nn_clf.out_activation_)

'Output Activation: softmax'

In [29]:
nn_clf.get_params(nn_clf)

{'activation': 'relu',
 'alpha': 0.001,
 'batch_size': 'auto',
 'beta_1': 0.9,
 'beta_2': 0.999,
 'early_stopping': False,
 'epsilon': 1e-08,
 'hidden_layer_sizes': (5, 2),
 'learning_rate': 'constant',
 'learning_rate_init': 0.001,
 'max_fun': 15000,
 'max_iter': 500,
 'momentum': 0.9,
 'n_iter_no_change': 10,
 'nesterovs_momentum': True,
 'power_t': 0.5,
 'random_state': 1,
 'shuffle': True,
 'solver': 'adam',
 'tol': 0.0001,
 'validation_fraction': 0.1,
 'verbose': False,
 'warm_start': False}

In [30]:
nn_clf.coefs_

[array([[-0.01309971,  0.0455154 , -0.36603558,  0.07386936, -0.52947412],
        [-0.34137391, -0.43623716,  0.04258909, -0.18604306, -0.20583808],
        [ 0.05162558,  0.27696539, -0.35390468,  0.19501419, -0.09265584],
        [-0.28096055,  0.38287092,  0.59904437,  0.14368949, -0.30204271],
        [ 0.83012038,  0.22175208, -0.38930599, -0.48470932,  0.66197671],
        [ 0.01198219, -0.27750905, -0.20086156, -0.21933722,  0.06070314],
        [-0.12386292, -0.11009635,  0.0848654 , -0.0090255 ,  0.43882793],
        [-0.11609048, -0.17955703,  0.58587353, -0.60884508,  0.17737169],
        [ 0.74772707,  0.30140045, -0.20694746,  0.19033131, -0.16580849],
        [-0.01577331,  0.33782054, -0.15421824, -0.39399371, -0.21909558],
        [-0.72135003,  0.16605963,  0.37758369,  0.41985536, -0.28077922],
        [-0.08749846, -0.00111418, -0.55837874, -0.52339412,  0.48670022],
        [-0.17992381, -0.20588374, -0.08459855, -0.31295308, -0.23404143],
        [ 0.2579329 ,  0.

In [31]:
len(nn_clf.coefs_)

3

In [32]:
len(nn_clf.coefs_[0])

21

In [33]:
len(nn_clf.coefs_[1])

5

In [34]:
len(nn_clf.coefs_[2])

2