# Imports

In [552]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import math

from sklearn.neural_network import MLPClassifier

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from sklearn.metrics import plot_roc_curve
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import accuracy_score

In [553]:
from warnings import filterwarnings
filterwarnings('ignore')

# Loads Data

In [554]:
df = pd.read_csv(r"League_Result_Data/TimeEncoded_PremierLeague_Stats_From_2014to2021.csv")
df.dropna()
df

Unnamed: 0.1,Unnamed: 0,Season,Season Encoding,Date,YearOfSeason,Time,Time Encoding,HomeTeam,HomeTeam Encoding,AwayTeam,...,HST,AST,HF,AF,HC,AC,HY,AY,HR,AR
0,0,Fall,0,12/09/2020,2020/21,Afternoon,0,Fulham,8,Arsenal,...,2.0,6.0,12.0,12.0,2.0,3.0,2.0,2.0,0.0,0.0
1,1,Fall,0,12/09/2020,2020/21,Late-Day,2,Crystal Palace,6,Southampton,...,3.0,5.0,14.0,11.0,7.0,3.0,2.0,1.0,0.0,0.0
2,2,Fall,0,12/09/2020,2020/21,Late-Day,2,Liverpool,13,Leeds,...,6.0,3.0,9.0,6.0,9.0,0.0,1.0,0.0,0.0,0.0
3,3,Fall,0,12/09/2020,2020/21,Late-Day,2,West Ham,28,Newcastle,...,3.0,2.0,13.0,7.0,8.0,7.0,2.0,2.0,0.0,0.0
4,4,Fall,0,13/09/2020,2020/21,Mid-Day,1,West Brom,27,Leicester,...,1.0,7.0,12.0,9.0,2.0,5.0,1.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
665,665,Summer,3,26/07/2020,2019/20,Mid-Day,1,Leicester,12,Man United,...,3.0,3.0,12.0,11.0,3.0,3.0,1.0,4.0,1.0,0.0
666,666,Summer,3,26/07/2020,2019/20,Mid-Day,1,Man City,14,Norwich,...,10.0,4.0,7.0,4.0,9.0,0.0,1.0,1.0,0.0,0.0
667,667,Summer,3,26/07/2020,2019/20,Mid-Day,1,Newcastle,17,Liverpool,...,2.0,6.0,11.0,5.0,2.0,4.0,1.0,0.0,0.0,0.0
668,668,Summer,3,26/07/2020,2019/20,Mid-Day,1,Southampton,21,Sheffield United,...,4.0,3.0,9.0,16.0,9.0,1.0,0.0,1.0,0.0,0.0


## Encodes the YearOfSeason values

In [555]:
df = df.dropna()

In [556]:
seasons = set(df['YearOfSeason'].values)
seasons = list(seasons)
seasons

['2019/20', '2018/19', '2020/21']

In [557]:
# re-orders the list 
seasons = ['2014/15', '2015/16', '2016/17', '2017/18', '2018/19', '2019/20', '2020/21']
seasons

['2014/15', '2015/16', '2016/17', '2017/18', '2018/19', '2019/20', '2020/21']

In [558]:
df['YearOfSeason Encoding'] = -1

for index, value in df['YearOfSeason'].iteritems():
    
    # 2014/15: 0
    if value == seasons[0]:
        df.at[index, 'YearOfSeason Encoding'] = 0
    
    # 2015/16: 1
    if value == seasons[1]:
        df.at[index, 'YearOfSeason Encoding'] = 1
        
    # 2016/17: 2
    if value == seasons[2]:
        df.at[index, 'YearOfSeason Encoding'] = 2
    
    # 2017/18: 3
    if value == seasons[3]:
        df.at[index, 'YearOfSeason Encoding'] = 3
        
    # 2018/19: 4
    if value == seasons[4]:
        df.at[index, 'YearOfSeason Encoding'] = 4
        
    # 2019/20: 5
    if value == seasons[5]:
        df.at[index, 'YearOfSeason Encoding'] = 5
    
    # 2020/21: 6
    if value == seasons[6]:
        df.at[index, 'YearOfSeason Encoding'] = 6 

In [559]:
df

Unnamed: 0.1,Unnamed: 0,Season,Season Encoding,Date,YearOfSeason,Time,Time Encoding,HomeTeam,HomeTeam Encoding,AwayTeam,...,AST,HF,AF,HC,AC,HY,AY,HR,AR,YearOfSeason Encoding
0,0,Fall,0,12/09/2020,2020/21,Afternoon,0,Fulham,8,Arsenal,...,6.0,12.0,12.0,2.0,3.0,2.0,2.0,0.0,0.0,6
1,1,Fall,0,12/09/2020,2020/21,Late-Day,2,Crystal Palace,6,Southampton,...,5.0,14.0,11.0,7.0,3.0,2.0,1.0,0.0,0.0,6
2,2,Fall,0,12/09/2020,2020/21,Late-Day,2,Liverpool,13,Leeds,...,3.0,9.0,6.0,9.0,0.0,1.0,0.0,0.0,0.0,6
3,3,Fall,0,12/09/2020,2020/21,Late-Day,2,West Ham,28,Newcastle,...,2.0,13.0,7.0,8.0,7.0,2.0,2.0,0.0,0.0,6
4,4,Fall,0,13/09/2020,2020/21,Mid-Day,1,West Brom,27,Leicester,...,7.0,12.0,9.0,2.0,5.0,1.0,1.0,0.0,0.0,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
665,665,Summer,3,26/07/2020,2019/20,Mid-Day,1,Leicester,12,Man United,...,3.0,12.0,11.0,3.0,3.0,1.0,4.0,1.0,0.0,5
666,666,Summer,3,26/07/2020,2019/20,Mid-Day,1,Man City,14,Norwich,...,4.0,7.0,4.0,9.0,0.0,1.0,1.0,0.0,0.0,5
667,667,Summer,3,26/07/2020,2019/20,Mid-Day,1,Newcastle,17,Liverpool,...,6.0,11.0,5.0,2.0,4.0,1.0,0.0,0.0,0.0,5
668,668,Summer,3,26/07/2020,2019/20,Mid-Day,1,Southampton,21,Sheffield United,...,3.0,9.0,16.0,9.0,1.0,0.0,1.0,0.0,0.0,5


In [560]:
# Features -- Drops FTR and any categorical value 
X = df.drop(columns=["Season", "Unnamed: 0", "YearOfSeason", "Time", "Date", "FTHG", "FTAG", "HTHG", "HTAG", "HomeTeam", "AwayTeam", "Referee", "FTR", "FTR Encoding", "HTR", "Referee"])
# Labels
y = df["FTR Encoding"]

X

Unnamed: 0,Season Encoding,Time Encoding,HomeTeam Encoding,AwayTeam Encoding,Referee Encoding,Fouls Called Per Game,HS,AS,HST,AST,HF,AF,HC,AC,HY,AY,HR,AR,YearOfSeason Encoding
0,0,0,8,0,7,14.0,5.0,13.0,2.0,6.0,12.0,12.0,2.0,3.0,2.0,2.0,0.0,0.0,6
1,0,2,6,21,27,14.0,5.0,9.0,3.0,5.0,14.0,11.0,7.0,3.0,2.0,1.0,0.0,0.0,6
2,0,2,13,11,28,14.0,22.0,6.0,6.0,3.0,9.0,6.0,9.0,0.0,1.0,0.0,0.0,0.0,6
3,0,2,28,17,11,13.0,15.0,15.0,3.0,2.0,13.0,7.0,8.0,7.0,2.0,2.0,0.0,0.0,6
4,0,1,27,12,10,15.0,7.0,13.0,1.0,7.0,12.0,9.0,2.0,5.0,1.0,1.0,0.0,0.0,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
665,3,1,12,15,16,14.0,14.0,7.0,3.0,3.0,12.0,11.0,3.0,3.0,1.0,4.0,1.0,0.0,5
666,3,1,14,18,25,15.0,31.0,5.0,10.0,4.0,7.0,4.0,9.0,0.0,1.0,1.0,0.0,0.0,5
667,3,1,17,13,10,15.0,3.0,14.0,2.0,6.0,11.0,5.0,2.0,4.0,1.0,0.0,0.0,0.0,5
668,3,1,21,20,3,13.0,13.0,5.0,4.0,3.0,9.0,16.0,9.0,1.0,0.0,1.0,0.0,0.0,5


In [561]:
X.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 632 entries, 0 to 669
Data columns (total 19 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Season Encoding        632 non-null    int64  
 1   Time Encoding          632 non-null    int64  
 2   HomeTeam Encoding      632 non-null    int64  
 3   AwayTeam Encoding      632 non-null    int64  
 4   Referee Encoding       632 non-null    int64  
 5   Fouls Called Per Game  632 non-null    float64
 6   HS                     632 non-null    float64
 7   AS                     632 non-null    float64
 8   HST                    632 non-null    float64
 9   AST                    632 non-null    float64
 10  HF                     632 non-null    float64
 11  AF                     632 non-null    float64
 12  HC                     632 non-null    float64
 13  AC                     632 non-null    float64
 14  HY                     632 non-null    float64
 15  AY    

In [562]:
"""
Target Varible - Full Time Results
0 - Home Team Win
1 - Away Team Win
2 - Draw
"""

y

0      1
1      0
2      0
3      1
4      1
      ..
665    1
666    0
667    1
668    0
669    2
Name: FTR Encoding, Length: 632, dtype: int64

# Split the Data

In [563]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=0)

# Scale the Data

In [564]:
scaler = StandardScaler()

X_train = scaler.fit_transform(X_train)
X_test = scaler.fit_transform(X_test)

# Fitting & Evaluating the Model

In [565]:
nn_clf = MLPClassifier(solver='adam', alpha=1e-4,
                     hidden_layer_sizes=(5, 4), random_state=42,
                       max_iter=500, learning_rate='adaptive',
                       activation='logistic'
                      )

nn_clf.fit(X_train, y_train)

score = nn_clf.score(X_test, y_test)

print(score)

0.6929133858267716


In [566]:
results = nn_clf.predict(X_test)

In [567]:
# Precision_Recall_F-Score
p_r_f = precision_recall_fscore_support(y_test, results, average='macro')
p_r_f

(0.4668192219679634, 0.5376235399820306, 0.4991515151515151, None)

In [568]:
test_acc = accuracy_score(y_test, results)
test_acc

0.6929133858267716

In [569]:
cv_dict = cross_validate(nn_clf, X, y, return_train_score=True)
cv_dict

{'fit_time': array([0.70010519, 0.67504334, 0.65106034, 0.67004371, 0.6650064 ]),
 'score_time': array([0.0020082 , 0.00199914, 0.0020051 , 0.00197697, 0.0010078 ]),
 'test_score': array([0.61417323, 0.57480315, 0.52380952, 0.58730159, 0.61904762]),
 'train_score': array([0.63168317, 0.63960396, 0.64229249, 0.6541502 , 0.63636364])}

In [570]:
# Confusion Matrix

cv_score = cross_val_score(nn_clf, X_train, y_train, cv=3, scoring='accuracy')
print('Accuracy:',cv_score, '\n')

y_train_pred = cross_val_predict(nn_clf, X_train, y_train, cv=3)
print(confusion_matrix(y_train, y_train_pred))

Accuracy: [0.56213018 0.60119048 0.5297619 ] 

[[165  46   0]
 [ 39 120   2]
 [ 73  60   0]]


# Evaluating the Model

In [571]:
nn_clf.classes_

array([0, 1, 2], dtype=int64)

In [572]:
nn_clf.loss_

0.8589605798848229

In [573]:
nn_clf.best_loss_

0.8589605798848229

In [574]:
nn_clf.loss_curve_

[1.0940765516191997,
 1.092518255564824,
 1.0911041008478772,
 1.0896267051711503,
 1.08850691846869,
 1.0874188749750466,
 1.0863268295017117,
 1.0854447082545855,
 1.0846138078760768,
 1.083959896960626,
 1.0832437658130312,
 1.0825393955765186,
 1.0820515334916034,
 1.0815942084777423,
 1.0811468189206546,
 1.0806830589762983,
 1.0804066774111445,
 1.0800970510718757,
 1.0798531560476872,
 1.0794878789053926,
 1.0792819153865454,
 1.0789983356241657,
 1.078809943669554,
 1.078627279479161,
 1.0784450955384504,
 1.0781711154785827,
 1.0779698788172942,
 1.0777944587005708,
 1.0776237155585093,
 1.0773928658493566,
 1.0771964281332282,
 1.076978192101889,
 1.0768025298216128,
 1.0765560776153595,
 1.0763610426116925,
 1.0761323399942226,
 1.0759010017037844,
 1.0756660942388645,
 1.0754777013633845,
 1.075219381611569,
 1.0749386692399079,
 1.0746804167155057,
 1.0744363785933038,
 1.0741350904106055,
 1.0738610551577314,
 1.073580013192168,
 1.0733047200288521,
 1.0730011425971953,
 

In [575]:
str(nn_clf.t_) + " training instances seen during fitting"

'252500 training instances seen during fitting'

In [576]:
str(nn_clf.n_iter_) + "  iterations"

'500  iterations'

In [577]:
str(nn_clf.n_layers_) + " layers"

'4 layers'

In [578]:
str(nn_clf.n_outputs_) + " outputs"

'3 outputs'

In [579]:
"Output Activation: " + str(nn_clf.out_activation_)

'Output Activation: softmax'

In [580]:
nn_clf.get_params(nn_clf)

{'activation': 'logistic',
 'alpha': 0.0001,
 'batch_size': 'auto',
 'beta_1': 0.9,
 'beta_2': 0.999,
 'early_stopping': False,
 'epsilon': 1e-08,
 'hidden_layer_sizes': (5, 4),
 'learning_rate': 'adaptive',
 'learning_rate_init': 0.001,
 'max_fun': 15000,
 'max_iter': 500,
 'momentum': 0.9,
 'n_iter_no_change': 10,
 'nesterovs_momentum': True,
 'power_t': 0.5,
 'random_state': 42,
 'shuffle': True,
 'solver': 'adam',
 'tol': 0.0001,
 'validation_fraction': 0.1,
 'verbose': False,
 'warm_start': False}

In [581]:
nn_clf.coefs_

[array([[-0.02839718,  0.04134188, -0.17796999,  0.06379125, -0.07358444],
        [ 0.44680163, -0.31051758,  0.11204019,  0.10028047, -0.13222903],
        [ 0.16592473,  0.03590658, -0.03623863, -0.06146196, -0.38530398],
        [-0.43573026,  0.15482099, -0.17547308,  0.14643719,  0.20117194],
        [ 0.53164711,  0.16428464,  0.15261765, -0.17600605,  0.05444625],
        [ 0.57006878, -0.13658701,  0.02275098,  0.1903485 , -0.28927559],
        [ 0.58508816,  0.0880922 ,  0.23847837, -0.51829196,  0.64383027],
        [ 0.10709958,  0.17514893, -0.2153068 , -0.16905981,  0.03085644],
        [-0.99996784, -1.03488049, -1.40089913,  1.20975723, -1.27999612],
        [ 1.4497328 ,  0.85603898,  1.08041424, -1.1628947 ,  0.76491099],
        [-0.4089677 , -0.00149554,  0.11351201,  0.10238725, -0.30230584],
        [-0.09754095, -0.0478014 , -0.18758292, -0.11134427,  0.06874157],
        [-0.41773866,  0.01427902,  0.3890264 , -0.5491451 ,  0.22965582],
        [-0.53738814, -0.

In [582]:
len(nn_clf.coefs_)

3

In [583]:
len(nn_clf.coefs_[0])

19

In [584]:
len(nn_clf.coefs_[1])

5

In [585]:
len(nn_clf.coefs_[2])

4

# Test With a Recent Game

Game Used: Man City vs Chelsea FA Cup (4/17/2021) -- Chelsea won (ie the Home Team won)

Data came from these links and our data
- https://www.espn.com/soccer/matchstats?gameId=597190
- https://www.si.com/soccer/manchestercity/match-coverage/manchester-city-vs-chelsea-where-to-watch-team-news-referees-everything-you-need-to-know

---------------------------
Values Needed:

    'Season Encoding'

     'Time Encoding'

     'HomeTeam Encoding'

     'AwayTeam Encoding'

     'Referee Encoding'

     'Fouls Called Per Game'

     'HS'

     'AS'

     'HST'

     'AST'

     'HF'

     'AF'

     'HC'

     'AC'

     'HY'

     'AY'

     'HR'

     'AR'

     'YearOfSeason Encoding'

In [586]:
# Data came from these links and our data

test_game = np.array([
    2, # 2 bc it took place in Spring
    0, # 0 bc kickoff was at 12:32 (rounds to 12:30)
    5, # Chelsea: 5 in the Team encoding
    14, # Man City: 14 in the Team encoding
    #0,
    #0,
    23, # Mike Dean was the referee
    14, # got from our data
    5,
    11,
    3,
    3,
    8,
    12,
    3,
    8,
    2,
    3,
    0,
    0,
    6 # 6 bc its the 2020/2021 season
            ]).reshape(1, -1)

result = nn_clf.predict(test_game)
print(result)

if result[0] == 0:
    print("Correct!",end=' ')
else:
    print("Incorrect!",end=' ')
    
print("Chelsea, the Home Team, won.")

[1]
Incorrect! Chelsea, the Home Team, won.
