# Imports

In [4]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import math

from sklearn.neural_network import MLPClassifier

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from sklearn.metrics import plot_roc_curve
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import accuracy_score

# Loads Data

In [5]:
df = pd.read_csv(r"League_Result_Data/TimeEncoded_PremierLeague_Stats_From_2014to2021.csv")
df.dropna()
df

Unnamed: 0.1,Unnamed: 0,Season,Season Encoding,Date,YearOfSeason,Time,Time Encoding,HomeTeam,HomeTeam Encoding,AwayTeam,...,HST,AST,HF,AF,HC,AC,HY,AY,HR,AR
0,0,Fall,0,12/09/2020,2020/21,Afternoon,0,Fulham,8,Arsenal,...,2.0,6.0,12.0,12.0,2.0,3.0,2.0,2.0,0.0,0.0
1,1,Fall,0,12/09/2020,2020/21,Late-Day,2,Crystal Palace,6,Southampton,...,3.0,5.0,14.0,11.0,7.0,3.0,2.0,1.0,0.0,0.0
2,2,Fall,0,12/09/2020,2020/21,Late-Day,2,Liverpool,13,Leeds,...,6.0,3.0,9.0,6.0,9.0,0.0,1.0,0.0,0.0,0.0
3,3,Fall,0,12/09/2020,2020/21,Late-Day,2,West Ham,28,Newcastle,...,3.0,2.0,13.0,7.0,8.0,7.0,2.0,2.0,0.0,0.0
4,4,Fall,0,13/09/2020,2020/21,Mid-Day,1,West Brom,27,Leicester,...,1.0,7.0,12.0,9.0,2.0,5.0,1.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
665,665,Summer,3,26/07/2020,2019/20,Mid-Day,1,Leicester,12,Man United,...,3.0,3.0,12.0,11.0,3.0,3.0,1.0,4.0,1.0,0.0
666,666,Summer,3,26/07/2020,2019/20,Mid-Day,1,Man City,14,Norwich,...,10.0,4.0,7.0,4.0,9.0,0.0,1.0,1.0,0.0,0.0
667,667,Summer,3,26/07/2020,2019/20,Mid-Day,1,Newcastle,17,Liverpool,...,2.0,6.0,11.0,5.0,2.0,4.0,1.0,0.0,0.0,0.0
668,668,Summer,3,26/07/2020,2019/20,Mid-Day,1,Southampton,21,Sheffield United,...,4.0,3.0,9.0,16.0,9.0,1.0,0.0,1.0,0.0,0.0


## Encodes the YearOfSeason values

In [6]:
df = df.dropna()

In [7]:
seasons = set(df['YearOfSeason'].values)
seasons = list(seasons)
seasons

['2020/21', '2018/19', '2019/20']

In [8]:
# re-orders the list 
seasons = ['2014/15', '2015/16', '2016/17', '2017/18', '2018/19', '2019/20', '2020/21']
seasons

['2014/15', '2015/16', '2016/17', '2017/18', '2018/19', '2019/20', '2020/21']

In [9]:
df['YearOfSeason Encoding'] = -1

for index, value in df['YearOfSeason'].iteritems():
    
    # 2014/15: 0
    if value == seasons[0]:
        df.at[index, 'YearOfSeason Encoding'] = 0
    
    # 2015/16: 1
    if value == seasons[1]:
        df.at[index, 'YearOfSeason Encoding'] = 1
        
    # 2016/17: 2
    if value == seasons[2]:
        df.at[index, 'YearOfSeason Encoding'] = 2
    
    # 2017/18: 3
    if value == seasons[3]:
        df.at[index, 'YearOfSeason Encoding'] = 3
        
    # 2018/19: 4
    if value == seasons[4]:
        df.at[index, 'YearOfSeason Encoding'] = 4
        
    # 2019/20: 5
    if value == seasons[5]:
        df.at[index, 'YearOfSeason Encoding'] = 5
    
    # 2020/21: 6
    if value == seasons[6]:
        df.at[index, 'YearOfSeason Encoding'] = 6 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [10]:
df

Unnamed: 0.1,Unnamed: 0,Season,Season Encoding,Date,YearOfSeason,Time,Time Encoding,HomeTeam,HomeTeam Encoding,AwayTeam,...,AST,HF,AF,HC,AC,HY,AY,HR,AR,YearOfSeason Encoding
0,0,Fall,0,12/09/2020,2020/21,Afternoon,0,Fulham,8,Arsenal,...,6.0,12.0,12.0,2.0,3.0,2.0,2.0,0.0,0.0,6
1,1,Fall,0,12/09/2020,2020/21,Late-Day,2,Crystal Palace,6,Southampton,...,5.0,14.0,11.0,7.0,3.0,2.0,1.0,0.0,0.0,6
2,2,Fall,0,12/09/2020,2020/21,Late-Day,2,Liverpool,13,Leeds,...,3.0,9.0,6.0,9.0,0.0,1.0,0.0,0.0,0.0,6
3,3,Fall,0,12/09/2020,2020/21,Late-Day,2,West Ham,28,Newcastle,...,2.0,13.0,7.0,8.0,7.0,2.0,2.0,0.0,0.0,6
4,4,Fall,0,13/09/2020,2020/21,Mid-Day,1,West Brom,27,Leicester,...,7.0,12.0,9.0,2.0,5.0,1.0,1.0,0.0,0.0,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
665,665,Summer,3,26/07/2020,2019/20,Mid-Day,1,Leicester,12,Man United,...,3.0,12.0,11.0,3.0,3.0,1.0,4.0,1.0,0.0,5
666,666,Summer,3,26/07/2020,2019/20,Mid-Day,1,Man City,14,Norwich,...,4.0,7.0,4.0,9.0,0.0,1.0,1.0,0.0,0.0,5
667,667,Summer,3,26/07/2020,2019/20,Mid-Day,1,Newcastle,17,Liverpool,...,6.0,11.0,5.0,2.0,4.0,1.0,0.0,0.0,0.0,5
668,668,Summer,3,26/07/2020,2019/20,Mid-Day,1,Southampton,21,Sheffield United,...,3.0,9.0,16.0,9.0,1.0,0.0,1.0,0.0,0.0,5


In [11]:
# Features -- Drops FTR and any categorical value 
X = df.drop(columns=["Season", "Unnamed: 0", "YearOfSeason", "Time", "Date", "HomeTeam", "AwayTeam", "Referee", "FTR", "FTR Encoding", "HTR", "Referee"])
# Labels
y = df["FTR Encoding"]

X

Unnamed: 0,Season Encoding,Time Encoding,HomeTeam Encoding,AwayTeam Encoding,FTHG,FTAG,HTHG,HTAG,Referee Encoding,Fouls Called Per Game,...,AST,HF,AF,HC,AC,HY,AY,HR,AR,YearOfSeason Encoding
0,0,0,8,0,0.0,3.0,0.0,1.0,7,14.0,...,6.0,12.0,12.0,2.0,3.0,2.0,2.0,0.0,0.0,6
1,0,2,6,21,1.0,0.0,1.0,0.0,27,14.0,...,5.0,14.0,11.0,7.0,3.0,2.0,1.0,0.0,0.0,6
2,0,2,13,11,4.0,3.0,3.0,2.0,28,14.0,...,3.0,9.0,6.0,9.0,0.0,1.0,0.0,0.0,0.0,6
3,0,2,28,17,0.0,2.0,0.0,0.0,11,13.0,...,2.0,13.0,7.0,8.0,7.0,2.0,2.0,0.0,0.0,6
4,0,1,27,12,0.0,3.0,0.0,0.0,10,15.0,...,7.0,12.0,9.0,2.0,5.0,1.0,1.0,0.0,0.0,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
665,3,1,12,15,0.0,2.0,0.0,0.0,16,14.0,...,3.0,12.0,11.0,3.0,3.0,1.0,4.0,1.0,0.0,5
666,3,1,14,18,5.0,0.0,2.0,0.0,25,15.0,...,4.0,7.0,4.0,9.0,0.0,1.0,1.0,0.0,0.0,5
667,3,1,17,13,1.0,3.0,1.0,1.0,10,15.0,...,6.0,11.0,5.0,2.0,4.0,1.0,0.0,0.0,0.0,5
668,3,1,21,20,3.0,1.0,0.0,1.0,3,13.0,...,3.0,9.0,16.0,9.0,1.0,0.0,1.0,0.0,0.0,5


In [12]:
X.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 632 entries, 0 to 669
Data columns (total 23 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Season Encoding        632 non-null    int64  
 1   Time Encoding          632 non-null    int64  
 2   HomeTeam Encoding      632 non-null    int64  
 3   AwayTeam Encoding      632 non-null    int64  
 4   FTHG                   632 non-null    float64
 5   FTAG                   632 non-null    float64
 6   HTHG                   632 non-null    float64
 7   HTAG                   632 non-null    float64
 8   Referee Encoding       632 non-null    int64  
 9   Fouls Called Per Game  632 non-null    float64
 10  HS                     632 non-null    float64
 11  AS                     632 non-null    float64
 12  HST                    632 non-null    float64
 13  AST                    632 non-null    float64
 14  HF                     632 non-null    float64
 15  AF    

In [13]:
"""
Target Varible - Full Time Results
0 - Home Team Win
1 - Away Team Win
2 - Draw
"""

y

0      1
1      0
2      0
3      1
4      1
      ..
665    1
666    0
667    1
668    0
669    2
Name: FTR Encoding, Length: 632, dtype: int64

# Split the Data

In [60]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.4,random_state=0)

# Scale the Data

In [61]:
scaler = StandardScaler()

X_train = scaler.fit_transform(X_train)
X_test = scaler.fit_transform(X_test)

# Fitting & Evaluating the Model

In [62]:
nn_clf = MLPClassifier(solver='adam', alpha=1e-3,
                     hidden_layer_sizes=(5, 2), random_state=1,max_iter=500)

nn_clf.fit(X_train, y_train)

score = nn_clf.score(X_test, y_test)

print(score)

0.9960474308300395




In [63]:
results = nn_clf.predict(X_test)

In [64]:
# Precision_Recall_F-Score
p_r_f = precision_recall_fscore_support(y_test, results, average='macro')
p_r_f

(0.9934640522875817, 0.996415770609319, 0.9948978681651949, None)

In [65]:
test_acc = accuracy_score(y_test, results)
test_acc

0.9960474308300395

In [66]:
cv_dict = cross_validate(nn_clf, X, y, return_train_score=True)
cv_dict



{'fit_time': array([0.07100248, 0.72799826, 0.11300516, 0.13199639, 0.74597573]),
 'score_time': array([0.00199628, 0.00200033, 0.00197315, 0.00200295, 0.00200033]),
 'test_score': array([0.43307087, 0.96062992, 0.42063492, 0.42063492, 0.98412698]),
 'train_score': array([0.42772277, 0.98811881, 0.4229249 , 0.4229249 , 0.98023715])}

In [67]:
# Confusion Matrix

cv_score = cross_val_score(nn_clf, X_train, y_train, cv=3, scoring='accuracy')
print('Accuracy:',cv_score, '\n')

y_train_pred = cross_val_predict(nn_clf, X_train, y_train, cv=3)
print(confusion_matrix(y_train, y_train_pred))



Accuracy: [0.96062992 0.93650794 0.97619048] 





[[155   0   2]
 [  0 117   4]
 [  4   6  91]]




# Evaluating the Model

In [68]:
nn_clf.classes_

array([0, 1, 2], dtype=int64)

In [69]:
nn_clf.loss_

0.08359802915583088

In [70]:
nn_clf.best_loss_

0.08359802915583088

In [71]:
nn_clf.loss_curve_

[1.170467319169526,
 1.1648521420978595,
 1.158900215555357,
 1.1533503753694236,
 1.1482189523675046,
 1.1430542739840657,
 1.1380548346191834,
 1.1336262856969377,
 1.1286869640741155,
 1.124464942039189,
 1.1202748442630888,
 1.1161274280757374,
 1.1124335088483153,
 1.1085978364990527,
 1.1049753231972195,
 1.1015174214429522,
 1.0982099552259685,
 1.0950717737207127,
 1.09193560928337,
 1.0889314173831552,
 1.0860739292850206,
 1.0832838458309924,
 1.0806985011964132,
 1.0780345664696283,
 1.075469741145233,
 1.0730529135935594,
 1.0704776213144593,
 1.0681975160706836,
 1.0659719081336037,
 1.0636727039008516,
 1.0615351179903405,
 1.0594020504607689,
 1.057243734404299,
 1.0551423726389817,
 1.0529685968269602,
 1.0508534534713132,
 1.04879431048776,
 1.0468971326078345,
 1.0450529833649354,
 1.0430365082790605,
 1.040979875532637,
 1.039136875887016,
 1.0373155810388126,
 1.035406816881198,
 1.0336048659527484,
 1.031793968909752,
 1.0297628373908663,
 1.0278085555635712,
 1.02

In [72]:
str(nn_clf.t_) + " training instances seen during fitting"

'189500 training instances seen during fitting'

In [73]:
str(nn_clf.n_iter_) + "  iterations"

'500  iterations'

In [74]:
str(nn_clf.n_layers_) + " layers"

'4 layers'

In [75]:
str(nn_clf.n_outputs_) + " outputs"

'3 outputs'

In [76]:
"Output Activation: " + str(nn_clf.out_activation_)

'Output Activation: softmax'

In [77]:
nn_clf.get_params(nn_clf)

{'activation': 'relu',
 'alpha': 0.001,
 'batch_size': 'auto',
 'beta_1': 0.9,
 'beta_2': 0.999,
 'early_stopping': False,
 'epsilon': 1e-08,
 'hidden_layer_sizes': (5, 2),
 'learning_rate': 'constant',
 'learning_rate_init': 0.001,
 'max_fun': 15000,
 'max_iter': 500,
 'momentum': 0.9,
 'n_iter_no_change': 10,
 'nesterovs_momentum': True,
 'power_t': 0.5,
 'random_state': 1,
 'shuffle': True,
 'solver': 'adam',
 'tol': 0.0001,
 'validation_fraction': 0.1,
 'verbose': False,
 'warm_start': False}

In [78]:
nn_clf.coefs_

[array([[-0.24505352, -0.08257378, -0.10829888, -0.3941678 , -0.19954641],
        [-0.12532929, -0.09372308, -0.06555577, -0.27284143,  0.42423816],
        [ 0.01083201,  0.05567132, -0.01594233,  0.0455147 , -0.13620696],
        [ 0.06478646, -0.03284948, -0.09293345, -0.22143896, -0.20883359],
        [ 0.57637998,  1.31697219,  0.87865828, -0.71456848,  0.0555449 ],
        [-0.00402713, -1.12218532, -1.36111898,  0.40432745,  0.26297675],
        [-0.21764079, -0.27017111,  0.28474488, -0.37385138,  0.26346725],
        [-0.2357354 , -0.05307773, -0.01242591,  0.01096512, -0.09698311],
        [ 0.27210979,  0.11492504,  0.02651523,  0.24389317, -0.00588916],
        [ 0.03611792,  0.03920265, -0.01547776, -0.02140146, -0.1913633 ],
        [-0.23131919, -0.12281899,  0.02961174, -0.11307245,  0.01136641],
        [-0.2640734 ,  0.1013774 , -0.06639943,  0.05337812, -0.08504994],
        [-0.13046581, -0.12593821, -0.01166054, -0.53756202, -0.20816115],
        [ 0.20015002,  0.

In [79]:
len(nn_clf.coefs_)

3

In [80]:
len(nn_clf.coefs_[0])

23

In [81]:
len(nn_clf.coefs_[1])

5

In [82]:
len(nn_clf.coefs_[2])

2