# Imports

In [39]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import math

from sklearn.neural_network import MLPClassifier

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from sklearn.metrics import plot_roc_curve
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import accuracy_score

# Loads Data

In [40]:
df = pd.read_csv(r"League_Result_Data/TimeEncoded_PremierLeague_Stats_From_2014to2021.csv")
df.dropna()
df

Unnamed: 0.1,Unnamed: 0,Season,Season Encoding,Date,YearOfSeason,Time,Time Encoding,HomeTeam,HomeTeam Encoding,AwayTeam,...,HST,AST,HF,AF,HC,AC,HY,AY,HR,AR
0,0,Fall,0,12/09/2020,2020/21,Afternoon,0,Fulham,8,Arsenal,...,2.0,6.0,12.0,12.0,2.0,3.0,2.0,2.0,0.0,0.0
1,1,Fall,0,12/09/2020,2020/21,Late-Day,2,Crystal Palace,6,Southampton,...,3.0,5.0,14.0,11.0,7.0,3.0,2.0,1.0,0.0,0.0
2,2,Fall,0,12/09/2020,2020/21,Late-Day,2,Liverpool,13,Leeds,...,6.0,3.0,9.0,6.0,9.0,0.0,1.0,0.0,0.0,0.0
3,3,Fall,0,12/09/2020,2020/21,Late-Day,2,West Ham,28,Newcastle,...,3.0,2.0,13.0,7.0,8.0,7.0,2.0,2.0,0.0,0.0
4,4,Fall,0,13/09/2020,2020/21,Mid-Day,1,West Brom,27,Leicester,...,1.0,7.0,12.0,9.0,2.0,5.0,1.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
665,665,Summer,3,26/07/2020,2019/20,Mid-Day,1,Leicester,12,Man United,...,3.0,3.0,12.0,11.0,3.0,3.0,1.0,4.0,1.0,0.0
666,666,Summer,3,26/07/2020,2019/20,Mid-Day,1,Man City,14,Norwich,...,10.0,4.0,7.0,4.0,9.0,0.0,1.0,1.0,0.0,0.0
667,667,Summer,3,26/07/2020,2019/20,Mid-Day,1,Newcastle,17,Liverpool,...,2.0,6.0,11.0,5.0,2.0,4.0,1.0,0.0,0.0,0.0
668,668,Summer,3,26/07/2020,2019/20,Mid-Day,1,Southampton,21,Sheffield United,...,4.0,3.0,9.0,16.0,9.0,1.0,0.0,1.0,0.0,0.0


## Encodes the YearOfSeason values

In [41]:
df = df.dropna()

In [42]:
seasons = set(df['YearOfSeason'].values)
seasons = list(seasons)
seasons

['2019/20', '2018/19', '2020/21']

In [43]:
# re-orders the list 
seasons = ['2014/15', '2015/16', '2016/17', '2017/18', '2018/19', '2019/20', '2020/21']
seasons

['2014/15', '2015/16', '2016/17', '2017/18', '2018/19', '2019/20', '2020/21']

In [44]:
df['YearOfSeason Encoding'] = -1

for index, value in df['YearOfSeason'].iteritems():
    
    # 2014/15: 0
    if value == seasons[0]:
        df.at[index, 'YearOfSeason Encoding'] = 0
    
    # 2015/16: 1
    if value == seasons[1]:
        df.at[index, 'YearOfSeason Encoding'] = 1
        
    # 2016/17: 2
    if value == seasons[2]:
        df.at[index, 'YearOfSeason Encoding'] = 2
    
    # 2017/18: 3
    if value == seasons[3]:
        df.at[index, 'YearOfSeason Encoding'] = 3
        
    # 2018/19: 4
    if value == seasons[4]:
        df.at[index, 'YearOfSeason Encoding'] = 4
        
    # 2019/20: 5
    if value == seasons[5]:
        df.at[index, 'YearOfSeason Encoding'] = 5
    
    # 2020/21: 6
    if value == seasons[6]:
        df.at[index, 'YearOfSeason Encoding'] = 6 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [45]:
df

Unnamed: 0.1,Unnamed: 0,Season,Season Encoding,Date,YearOfSeason,Time,Time Encoding,HomeTeam,HomeTeam Encoding,AwayTeam,...,AST,HF,AF,HC,AC,HY,AY,HR,AR,YearOfSeason Encoding
0,0,Fall,0,12/09/2020,2020/21,Afternoon,0,Fulham,8,Arsenal,...,6.0,12.0,12.0,2.0,3.0,2.0,2.0,0.0,0.0,6
1,1,Fall,0,12/09/2020,2020/21,Late-Day,2,Crystal Palace,6,Southampton,...,5.0,14.0,11.0,7.0,3.0,2.0,1.0,0.0,0.0,6
2,2,Fall,0,12/09/2020,2020/21,Late-Day,2,Liverpool,13,Leeds,...,3.0,9.0,6.0,9.0,0.0,1.0,0.0,0.0,0.0,6
3,3,Fall,0,12/09/2020,2020/21,Late-Day,2,West Ham,28,Newcastle,...,2.0,13.0,7.0,8.0,7.0,2.0,2.0,0.0,0.0,6
4,4,Fall,0,13/09/2020,2020/21,Mid-Day,1,West Brom,27,Leicester,...,7.0,12.0,9.0,2.0,5.0,1.0,1.0,0.0,0.0,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
665,665,Summer,3,26/07/2020,2019/20,Mid-Day,1,Leicester,12,Man United,...,3.0,12.0,11.0,3.0,3.0,1.0,4.0,1.0,0.0,5
666,666,Summer,3,26/07/2020,2019/20,Mid-Day,1,Man City,14,Norwich,...,4.0,7.0,4.0,9.0,0.0,1.0,1.0,0.0,0.0,5
667,667,Summer,3,26/07/2020,2019/20,Mid-Day,1,Newcastle,17,Liverpool,...,6.0,11.0,5.0,2.0,4.0,1.0,0.0,0.0,0.0,5
668,668,Summer,3,26/07/2020,2019/20,Mid-Day,1,Southampton,21,Sheffield United,...,3.0,9.0,16.0,9.0,1.0,0.0,1.0,0.0,0.0,5


In [46]:
# Features -- Drops FTR and any categorical value 
X = df.drop(columns=["Season", "Unnamed: 0", "YearOfSeason", "Time", "Date", "FTHG", "FTAG", "HomeTeam", "AwayTeam", "Referee", "FTR", "FTR Encoding", "HTR", "Referee"])
# Labels
y = df["FTR Encoding"]

X

Unnamed: 0,Season Encoding,Time Encoding,HomeTeam Encoding,AwayTeam Encoding,HTHG,HTAG,Referee Encoding,Fouls Called Per Game,HS,AS,...,AST,HF,AF,HC,AC,HY,AY,HR,AR,YearOfSeason Encoding
0,0,0,8,0,0.0,1.0,7,14.0,5.0,13.0,...,6.0,12.0,12.0,2.0,3.0,2.0,2.0,0.0,0.0,6
1,0,2,6,21,1.0,0.0,27,14.0,5.0,9.0,...,5.0,14.0,11.0,7.0,3.0,2.0,1.0,0.0,0.0,6
2,0,2,13,11,3.0,2.0,28,14.0,22.0,6.0,...,3.0,9.0,6.0,9.0,0.0,1.0,0.0,0.0,0.0,6
3,0,2,28,17,0.0,0.0,11,13.0,15.0,15.0,...,2.0,13.0,7.0,8.0,7.0,2.0,2.0,0.0,0.0,6
4,0,1,27,12,0.0,0.0,10,15.0,7.0,13.0,...,7.0,12.0,9.0,2.0,5.0,1.0,1.0,0.0,0.0,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
665,3,1,12,15,0.0,0.0,16,14.0,14.0,7.0,...,3.0,12.0,11.0,3.0,3.0,1.0,4.0,1.0,0.0,5
666,3,1,14,18,2.0,0.0,25,15.0,31.0,5.0,...,4.0,7.0,4.0,9.0,0.0,1.0,1.0,0.0,0.0,5
667,3,1,17,13,1.0,1.0,10,15.0,3.0,14.0,...,6.0,11.0,5.0,2.0,4.0,1.0,0.0,0.0,0.0,5
668,3,1,21,20,0.0,1.0,3,13.0,13.0,5.0,...,3.0,9.0,16.0,9.0,1.0,0.0,1.0,0.0,0.0,5


In [47]:
X.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 632 entries, 0 to 669
Data columns (total 21 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Season Encoding        632 non-null    int64  
 1   Time Encoding          632 non-null    int64  
 2   HomeTeam Encoding      632 non-null    int64  
 3   AwayTeam Encoding      632 non-null    int64  
 4   HTHG                   632 non-null    float64
 5   HTAG                   632 non-null    float64
 6   Referee Encoding       632 non-null    int64  
 7   Fouls Called Per Game  632 non-null    float64
 8   HS                     632 non-null    float64
 9   AS                     632 non-null    float64
 10  HST                    632 non-null    float64
 11  AST                    632 non-null    float64
 12  HF                     632 non-null    float64
 13  AF                     632 non-null    float64
 14  HC                     632 non-null    float64
 15  AC    

In [48]:
"""
Target Varible - Full Time Results
0 - Home Team Win
1 - Away Team Win
2 - Draw
"""

y

0      1
1      0
2      0
3      1
4      1
      ..
665    1
666    0
667    1
668    0
669    2
Name: FTR Encoding, Length: 632, dtype: int64

# Split the Data

In [49]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.4,random_state=0)

# Scale the Data

In [50]:
scaler = StandardScaler()

X_train = scaler.fit_transform(X_train)
X_test = scaler.fit_transform(X_test)

# Fitting & Evaluating the Model

In [51]:
nn_clf = MLPClassifier(solver='adam', alpha=1e-3,
                     hidden_layer_sizes=(5, 2), random_state=1,max_iter=500)

nn_clf.fit(X_train, y_train)

score = nn_clf.score(X_test, y_test)

print(score)

0.6482213438735178




In [52]:
results = nn_clf.predict(X_test)

In [53]:
# Precision_Recall_F-Score
p_r_f = precision_recall_fscore_support(y_test, results, average='macro')
p_r_f

(0.43549618320610683, 0.5401759530791789, 0.4820291040850915, None)

In [54]:
test_acc = accuracy_score(y_test, results)
test_acc

0.6482213438735178

In [55]:
cv_dict = cross_validate(nn_clf, X, y, return_train_score=True)
cv_dict

{'fit_time': array([0.64307022, 0.64998746, 0.65997314, 0.68497562, 0.63500285]),
 'score_time': array([0.00199103, 0.00199747, 0.00203037, 0.00199962, 0.00199986]),
 'test_score': array([0.42519685, 0.40944882, 0.41269841, 0.42857143, 0.44444444]),
 'train_score': array([0.43168317, 0.43366337, 0.42885375, 0.43280632, 0.42687747])}

In [56]:
# Confusion Matrix

cv_score = cross_val_score(nn_clf, X_train, y_train, cv=3, scoring='accuracy')
print('Accuracy:',cv_score, '\n')

y_train_pred = cross_val_predict(nn_clf, X_train, y_train, cv=3)
print(confusion_matrix(y_train, y_train_pred))



Accuracy: [0.4015748  0.4047619  0.53968254] 





[[125  13  19]
 [ 83  29   9]
 [ 72  13  16]]




# Evaluating the Model

In [57]:
nn_clf.classes_

array([0, 1, 2], dtype=int64)

In [58]:
nn_clf.loss_

0.7732943820362741

In [59]:
nn_clf.best_loss_

0.7732943820362741

In [60]:
nn_clf.loss_curve_

[1.416865809540699,
 1.4090824453686281,
 1.400896860939403,
 1.3932365544194503,
 1.3856142996212668,
 1.3781946398696796,
 1.3714581632076994,
 1.365112201454452,
 1.358512814724069,
 1.3525533569626194,
 1.3470186875083519,
 1.3410384268328563,
 1.33568632142165,
 1.3306314538068986,
 1.3259802787828827,
 1.321553319967767,
 1.3170230243051333,
 1.3125106129952409,
 1.3083990419155025,
 1.3043808057486912,
 1.3009897471847975,
 1.2975969087938166,
 1.294220376750645,
 1.2907603260630816,
 1.2875647522394433,
 1.284468250385249,
 1.2817736692405923,
 1.2788498788148364,
 1.2760809033479277,
 1.2734661908164815,
 1.2707753940000883,
 1.2682587951867166,
 1.2658588002414122,
 1.2633750017049656,
 1.2610725866593062,
 1.2589780793326202,
 1.256619589525791,
 1.2545282820915793,
 1.2525090083626917,
 1.2505927550266809,
 1.2486013140138084,
 1.246702014659719,
 1.2451113236660054,
 1.243160438660174,
 1.2415977373860079,
 1.2398440544779448,
 1.2382217306465872,
 1.2368393199598762,
 1.2

In [61]:
str(nn_clf.t_) + " training instances seen during fitting"

'189500 training instances seen during fitting'

In [62]:
str(nn_clf.n_iter_) + "  iterations"

'500  iterations'

In [63]:
str(nn_clf.n_layers_) + " layers"

'4 layers'

In [64]:
str(nn_clf.n_outputs_) + " outputs"

'3 outputs'

In [65]:
"Output Activation: " + str(nn_clf.out_activation_)

'Output Activation: softmax'

In [66]:
nn_clf.get_params(nn_clf)

{'activation': 'relu',
 'alpha': 0.001,
 'batch_size': 'auto',
 'beta_1': 0.9,
 'beta_2': 0.999,
 'early_stopping': False,
 'epsilon': 1e-08,
 'hidden_layer_sizes': (5, 2),
 'learning_rate': 'constant',
 'learning_rate_init': 0.001,
 'max_fun': 15000,
 'max_iter': 500,
 'momentum': 0.9,
 'n_iter_no_change': 10,
 'nesterovs_momentum': True,
 'power_t': 0.5,
 'random_state': 1,
 'shuffle': True,
 'solver': 'adam',
 'tol': 0.0001,
 'validation_fraction': 0.1,
 'verbose': False,
 'warm_start': False}

In [67]:
nn_clf.coefs_

[array([[-0.00342578,  0.11906684,  0.02936149, -0.02390014, -0.50906831],
        [-0.22492729, -0.5814839 ,  0.02488756,  0.13037801, -0.26178003],
        [-0.3274317 , -0.03240194, -0.4277957 ,  0.15210817, -0.26327825],
        [ 0.136645  , -0.07906998,  0.23189814, -0.07351488, -0.21620567],
        [-0.1297299 ,  0.20602445,  0.32882475,  0.63492124, -0.14047743],
        [ 0.33096938, -0.2879996 , -1.05710249, -0.40357064,  0.21520204],
        [-0.3063154 ,  0.15898909, -0.02127809, -0.15235514,  0.0691022 ],
        [-0.09195276,  0.05683317,  0.44295356, -0.07625012,  0.1813732 ],
        [ 0.26970585,  0.24571905, -0.05179169,  0.2187666 , -0.27093319],
        [ 0.02847707,  0.26790772, -0.28324359, -0.44860893, -0.43663894],
        [-0.80411006,  0.02129114,  0.19469727,  0.17711766, -0.33307495],
        [-0.18240257, -0.04640479, -0.92406689, -0.66025087,  0.33804829],
        [-0.26987145, -0.11064541,  0.04319262, -0.26800872, -0.34786445],
        [ 0.47304931, -0.

In [68]:
len(nn_clf.coefs_)

3

In [69]:
len(nn_clf.coefs_[0])

21

In [70]:
len(nn_clf.coefs_[1])

5

In [71]:
len(nn_clf.coefs_[2])

2

# Test With a Recent Game -- Work In Progress

Game Used: Man City vs Chelsea FA Cup (4/17/2021) -- Chelsea won (ie the Home Team won)

In [72]:
list(X.columns) # The values we need to make a prediction 

['Season Encoding',
 'Time Encoding',
 'HomeTeam Encoding',
 'AwayTeam Encoding',
 'HTHG',
 'HTAG',
 'Referee Encoding',
 'Fouls Called Per Game',
 'HS',
 'AS',
 'HST',
 'AST',
 'HF',
 'AF',
 'HC',
 'AC',
 'HY',
 'AY',
 'HR',
 'AR',
 'YearOfSeason Encoding']

In [73]:
"""
test_game = [
    2, # 2 bc it took place in Spring
    0, # 0 bc kickoff was at 12:32 (rounds to 12:30)
    5, # Chelsea: 5 in the Team encoding
    14, # Man City: 14 in the Team encoding
    
            ]
"""

'\ntest_game = [\n    2, # 2 bc it took place in Spring\n    0, # 0 bc kickoff was at 12:32 (rounds to 12:30)\n    5, # Chelsea: 5 in the Team encoding\n    14, # Man City: 14 in the Team encoding\n    \n            ]\n'