# Imports

In [110]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import math

from sklearn.neural_network import MLPClassifier

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from sklearn.metrics import plot_roc_curve
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import accuracy_score

In [111]:
from warnings import filterwarnings
filterwarnings('ignore')

# Loads Data

In [112]:
df = pd.read_csv(r"League_Result_Data/Encoded_PremierLeague_Stats_From_2014to2021.csv")
df.dropna()
df

Unnamed: 0.1,Unnamed: 0,Season,Season Encoding,Date,YearOfSeason,HomeTeam,HomeTeam Encoding,AwayTeam,AwayTeam Encoding,FTHG,...,HST,AST,HF,AF,HC,AC,HY,AY,HR,AR
0,0,Fall,0,12/09/2020,2020/21,Fulham,8,Arsenal,0,0.0,...,2.0,6.0,12.0,12.0,2.0,3.0,2.0,2.0,0.0,0.0
1,1,Fall,0,12/09/2020,2020/21,Crystal Palace,6,Southampton,21,1.0,...,3.0,5.0,14.0,11.0,7.0,3.0,2.0,1.0,0.0,0.0
2,2,Fall,0,12/09/2020,2020/21,Liverpool,13,Leeds,11,4.0,...,6.0,3.0,9.0,6.0,9.0,0.0,1.0,0.0,0.0,0.0
3,3,Fall,0,12/09/2020,2020/21,West Ham,28,Newcastle,17,0.0,...,3.0,2.0,13.0,7.0,8.0,7.0,2.0,2.0,0.0,0.0
4,4,Fall,0,13/09/2020,2020/21,West Brom,27,Leicester,12,0.0,...,1.0,7.0,12.0,9.0,2.0,5.0,1.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2564,2564,Spring,2,24/05/15,2014/15,Everton,7,Tottenham,25,0.0,...,1.0,3.0,12.0,8.0,3.0,5.0,1.0,2.0,0.0,0.0
2565,2565,Spring,2,24/05/15,2014/15,Hull,10,Man United,15,0.0,...,6.0,1.0,12.0,15.0,8.0,1.0,2.0,2.0,0.0,1.0
2566,2566,Spring,2,24/05/15,2014/15,Leicester,12,QPR,19,5.0,...,7.0,2.0,7.0,6.0,5.0,6.0,0.0,0.0,0.0,0.0
2567,2567,Spring,2,24/05/15,2014/15,Man City,14,Southampton,21,2.0,...,6.0,4.0,13.0,8.0,8.0,4.0,1.0,1.0,0.0,0.0


## # Encodes the YearOfSeason values

In [113]:
df = df.dropna()

In [114]:
seasons = set(df['YearOfSeason'].values)
seasons = list(seasons)
seasons

['2019/20', '2015/16', '2016/17', '2017/18', '2020/21', '2014/15', '2018/19']

In [115]:
# re-orders the list 
seasons = ['2014/15', '2015/16', '2016/17', '2017/18', '2018/19', '2019/20', '2020/21']
seasons

['2014/15', '2015/16', '2016/17', '2017/18', '2018/19', '2019/20', '2020/21']

In [116]:
df['YearOfSeason Encoding'] = -9

for index, value in df['YearOfSeason'].iteritems():
    
    # 2014/15: 0
    if value == seasons[0]:
        df.at[index, 'YearOfSeason Encoding'] = 0
    
    # 2015/16: 1
    if value == seasons[1]:
        df.at[index, 'YearOfSeason Encoding'] = 1
        
    # 2016/17: 2
    if value == seasons[2]:
        df.at[index, 'YearOfSeason Encoding'] = 2
    
    # 2017/18: 3
    if value == seasons[3]:
        df.at[index, 'YearOfSeason Encoding'] = 3
        
    # 2018/19: 4
    if value == seasons[4]:
        df.at[index, 'YearOfSeason Encoding'] = 4
        
    # 2019/20: 5
    if value == seasons[5]:
        df.at[index, 'YearOfSeason Encoding'] = 5
    
    # 2020/21: 6
    if value == seasons[6]:
        df.at[index, 'YearOfSeason Encoding'] = 6 

In [117]:
df

Unnamed: 0.1,Unnamed: 0,Season,Season Encoding,Date,YearOfSeason,HomeTeam,HomeTeam Encoding,AwayTeam,AwayTeam Encoding,FTHG,...,AST,HF,AF,HC,AC,HY,AY,HR,AR,YearOfSeason Encoding
0,0,Fall,0,12/09/2020,2020/21,Fulham,8,Arsenal,0,0.0,...,6.0,12.0,12.0,2.0,3.0,2.0,2.0,0.0,0.0,6
1,1,Fall,0,12/09/2020,2020/21,Crystal Palace,6,Southampton,21,1.0,...,5.0,14.0,11.0,7.0,3.0,2.0,1.0,0.0,0.0,6
2,2,Fall,0,12/09/2020,2020/21,Liverpool,13,Leeds,11,4.0,...,3.0,9.0,6.0,9.0,0.0,1.0,0.0,0.0,0.0,6
3,3,Fall,0,12/09/2020,2020/21,West Ham,28,Newcastle,17,0.0,...,2.0,13.0,7.0,8.0,7.0,2.0,2.0,0.0,0.0,6
4,4,Fall,0,13/09/2020,2020/21,West Brom,27,Leicester,12,0.0,...,7.0,12.0,9.0,2.0,5.0,1.0,1.0,0.0,0.0,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2564,2564,Spring,2,24/05/15,2014/15,Everton,7,Tottenham,25,0.0,...,3.0,12.0,8.0,3.0,5.0,1.0,2.0,0.0,0.0,0
2565,2565,Spring,2,24/05/15,2014/15,Hull,10,Man United,15,0.0,...,1.0,12.0,15.0,8.0,1.0,2.0,2.0,0.0,1.0,0
2566,2566,Spring,2,24/05/15,2014/15,Leicester,12,QPR,19,5.0,...,2.0,7.0,6.0,5.0,6.0,0.0,0.0,0.0,0.0,0
2567,2567,Spring,2,24/05/15,2014/15,Man City,14,Southampton,21,2.0,...,4.0,13.0,8.0,8.0,4.0,1.0,1.0,0.0,0.0,0


In [118]:
# Features -- Drops FTR and any categorical value 
X = df.drop(columns=["Season", "Unnamed: 0", "YearOfSeason", "Date", "FTHG", "FTAG", "HTHG", "HTAG", "HomeTeam", "AwayTeam", "Referee", "FTR", "FTR Encoding", "HTR", "Referee"])
# Labels
y = df["FTR Encoding"]

X

Unnamed: 0,Season Encoding,HomeTeam Encoding,AwayTeam Encoding,HTR Encoding,Referee Encoding,Fouls Called Per Game,HS,AS,HST,AST,HF,AF,HC,AC,HY,AY,HR,AR,YearOfSeason Encoding
0,0,8,0,1,7,14.0,5.0,13.0,2.0,6.0,12.0,12.0,2.0,3.0,2.0,2.0,0.0,0.0,6
1,0,6,21,0,27,14.0,5.0,9.0,3.0,5.0,14.0,11.0,7.0,3.0,2.0,1.0,0.0,0.0,6
2,0,13,11,0,28,14.0,22.0,6.0,6.0,3.0,9.0,6.0,9.0,0.0,1.0,0.0,0.0,0.0,6
3,0,28,17,2,11,13.0,15.0,15.0,3.0,2.0,13.0,7.0,8.0,7.0,2.0,2.0,0.0,0.0,6
4,0,27,12,2,10,15.0,7.0,13.0,1.0,7.0,12.0,9.0,2.0,5.0,1.0,1.0,0.0,0.0,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2564,2,7,25,1,27,14.0,9.0,16.0,1.0,3.0,12.0,8.0,3.0,5.0,1.0,2.0,0.0,0.0,0
2565,2,10,15,2,15,13.0,16.0,7.0,6.0,1.0,12.0,15.0,8.0,1.0,2.0,2.0,0.0,1.0,0
2566,2,12,19,0,28,14.0,22.0,18.0,7.0,2.0,7.0,6.0,5.0,6.0,0.0,0.0,0.0,0.0,0
2567,2,14,21,0,2,14.0,15.0,13.0,6.0,4.0,13.0,8.0,8.0,4.0,1.0,1.0,0.0,0.0,0


In [119]:
X.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2414 entries, 0 to 2568
Data columns (total 19 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Season Encoding        2414 non-null   int64  
 1   HomeTeam Encoding      2414 non-null   int64  
 2   AwayTeam Encoding      2414 non-null   int64  
 3   HTR Encoding           2414 non-null   int64  
 4   Referee Encoding       2414 non-null   int64  
 5   Fouls Called Per Game  2414 non-null   float64
 6   HS                     2414 non-null   float64
 7   AS                     2414 non-null   float64
 8   HST                    2414 non-null   float64
 9   AST                    2414 non-null   float64
 10  HF                     2414 non-null   float64
 11  AF                     2414 non-null   float64
 12  HC                     2414 non-null   float64
 13  AC                     2414 non-null   float64
 14  HY                     2414 non-null   float64
 15  AY  

In [120]:
"""
Target Varible - Full Time Results
0 - Home Team Win
1 - Away Team Win
2 - Draw
"""

y

0       1
1       0
2       0
3       1
4       1
       ..
2564    1
2565    2
2566    0
2567    0
2568    0
Name: FTR Encoding, Length: 2414, dtype: int64

# Split the Data

In [121]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=0)

# Scale the Data

In [122]:
scaler = StandardScaler()

X_train = scaler.fit_transform(X_train)
X_test = scaler.fit_transform(X_test)

# Fitting & Evaluating the Model

In [152]:
nn_clf = MLPClassifier(solver='adam', alpha=1e-3,
                     hidden_layer_sizes=(5, 4), random_state=42,
                       max_iter=500, learning_rate='adaptive',
                       activation='logistic',
                       power_t=0.75,
                       warm_start=False,
                       beta_2=0.75,
                       beta_1=0.95
                      )

nn_clf.fit(X_train, y_train)

score = nn_clf.score(X_test, y_test)

print(score)

0.6521739130434783


In [153]:
results = nn_clf.predict(X_test)

In [154]:
# Precision_Recall_F-Score
p_r_f = precision_recall_fscore_support(y_test, results, average='macro')
p_r_f

(0.601903464078278, 0.5958236319587459, 0.5922537431048069, None)

In [155]:
test_acc = accuracy_score(y_test, results)
test_acc

0.6521739130434783

In [156]:
cv_dict = cross_validate(nn_clf, X, y, return_train_score=True)
cv_dict

{'fit_time': array([2.24300027, 2.27899909, 2.23701334, 2.20202708, 2.21003842]),
 'score_time': array([0.00203276, 0.00103331, 0.00198817, 0.00197268, 0.00199986]),
 'test_score': array([0.61904762, 0.63146998, 0.66252588, 0.6563147 , 0.61825726]),
 'train_score': array([0.64111859, 0.64992232, 0.65251165, 0.64370792, 0.67546584])}

In [157]:
# Confusion Matrix

cv_score = cross_val_score(nn_clf, X_train, y_train, cv=3, scoring='accuracy')
print('Accuracy:',cv_score, '\n')

y_train_pred = cross_val_predict(nn_clf, X_train, y_train, cv=3)
print(confusion_matrix(y_train, y_train_pred))

Accuracy: [0.6257764  0.6568323  0.65474339] 

[[722 115  50]
 [ 99 430  52]
 [194 174  95]]


# Evaluating the Model

In [158]:
nn_clf.classes_

array([0, 1, 2], dtype=int64)

In [159]:
nn_clf.loss_

0.7130879073088664

In [160]:
nn_clf.best_loss_

0.7130879073088664

In [161]:
nn_clf.loss_curve_

[1.0886809158651405,
 1.0798342440106639,
 1.0738691061580095,
 1.0676499184316317,
 1.0631772889536981,
 1.0606235456043527,
 1.0593589752686576,
 1.0581449763438835,
 1.0576916443110242,
 1.0570567698429956,
 1.0563854819876286,
 1.055430984294561,
 1.0546199874348527,
 1.053575939214619,
 1.0526422104744209,
 1.0517451896499705,
 1.0507061470285404,
 1.0496717813911547,
 1.0485211660888045,
 1.0472755765330368,
 1.0460054391640257,
 1.044626859975289,
 1.0432472911128374,
 1.041821543503642,
 1.0403002032950177,
 1.0386938493712303,
 1.0369723238158928,
 1.03513119350918,
 1.0333619370275517,
 1.0314270164758224,
 1.029509506371735,
 1.0274333872370307,
 1.0251804353284169,
 1.023101389775676,
 1.0208312072362338,
 1.018449302486715,
 1.0158663336946334,
 1.013449914204491,
 1.010822503232399,
 1.0080546265833745,
 1.0053573502116566,
 1.002343715095784,
 0.9995377306818135,
 0.9966676261773644,
 0.9937457843710853,
 0.990588062706752,
 0.9875594528425903,
 0.9844320242040675,
 0.98

In [162]:
str(nn_clf.t_) + " training instances seen during fitting"

'965500 training instances seen during fitting'

In [163]:
str(nn_clf.n_iter_) + "  iterations"

'500  iterations'

In [164]:
str(nn_clf.n_layers_) + " layers"

'4 layers'

In [165]:
str(nn_clf.n_outputs_) + " outputs"

'3 outputs'

In [166]:
"Output Activation: " + str(nn_clf.out_activation_)

'Output Activation: softmax'

In [167]:
nn_clf.get_params(nn_clf)

{'activation': 'logistic',
 'alpha': 0.001,
 'batch_size': 'auto',
 'beta_1': 0.95,
 'beta_2': 0.75,
 'early_stopping': False,
 'epsilon': 1e-08,
 'hidden_layer_sizes': (5, 4),
 'learning_rate': 'adaptive',
 'learning_rate_init': 0.001,
 'max_fun': 15000,
 'max_iter': 500,
 'momentum': 0.9,
 'n_iter_no_change': 10,
 'nesterovs_momentum': True,
 'power_t': 0.75,
 'random_state': 42,
 'shuffle': True,
 'solver': 'adam',
 'tol': 0.0001,
 'validation_fraction': 0.1,
 'verbose': False,
 'warm_start': False}

In [168]:
nn_clf.coefs_

[array([[ 0.04249882, -0.04996862,  0.23135228, -0.03900881, -0.15499338],
        [-0.08374523, -0.17896359,  0.02551801,  0.20325901,  0.40328323],
        [ 0.07913872,  0.37288758,  0.14181727,  0.13857043, -0.62941187],
        [-3.45449329, -0.94218778,  0.0965622 ,  1.20512021, -0.94038824],
        [ 0.00775039, -0.29047133,  0.60110945, -0.0917333 , -0.42356661],
        [ 0.05820743, -0.23264952, -0.0552907 , -0.12609863,  0.25194799],
        [-0.14617068, -0.2860386 ,  0.07519356, -0.32962011, -0.10652687],
        [ 0.10341227,  0.26280615, -0.37343329,  0.20680469,  0.30357   ],
        [ 0.23149208, -0.44041728, -1.24233494,  0.4881045 , -0.68388772],
        [-0.19857543,  0.60513324,  0.16394541, -0.85219154,  0.74414697],
        [ 0.16171261,  0.52360103,  0.45083877,  0.53781852, -0.01186428],
        [ 0.07916793, -0.1928601 ,  0.26176198, -0.18452274, -0.08940233],
        [-0.26695269, -0.11871414,  0.31150474, -0.59536511, -0.2137354 ],
        [ 0.11627595, -0.

In [169]:
len(nn_clf.coefs_)

3

In [170]:
len(nn_clf.coefs_[0])

19

In [171]:
len(nn_clf.coefs_[1])

5

In [172]:
len(nn_clf.coefs_[2])

4

# Test With a Recent Game

Game Used: Man City vs Chelsea FA Cup (4/17/2021) -- Chelsea won (ie the Home Team won)

Data came from these links and our data
- https://www.espn.com/soccer/matchstats?gameId=597190
- https://www.si.com/soccer/manchestercity/match-coverage/manchester-city-vs-chelsea-where-to-watch-team-news-referees-everything-you-need-to-know

In [173]:
list(X.columns) # The values we need to make a prediction 

['Season Encoding',
 'HomeTeam Encoding',
 'AwayTeam Encoding',
 'HTR Encoding',
 'Referee Encoding',
 'Fouls Called Per Game',
 'HS',
 'AS',
 'HST',
 'AST',
 'HF',
 'AF',
 'HC',
 'AC',
 'HY',
 'AY',
 'HR',
 'AR',
 'YearOfSeason Encoding']

In [174]:
# Data came from these links and our data

test_game = np.array([
    2, # 2 bc it took place in Spring
    0, # 0 bc kickoff was at 12:32 (rounds to 12:30)
    5, # Chelsea: 5 in the Team encoding
    14, # Man City: 14 in the Team encoding
    #0,
    #0,
    23, # Mike Dean was the referee
    14, # got from our data
    5,
    11,
    3,
    3,
    8,
    12,
    3,
    8,
    2,
    3,
    0,
    0,
    6 # 6 bc its the 2020/2021 season
            ]).reshape(1, -1)

result = nn_clf.predict(test_game)
print(result)

if result[0] == 0:
    print("Correct!",end=' ')
else:
    print("Incorrect!",end=' ')
    
print("Chelsea, the Home Team, won.")

[0]
Correct! Chelsea, the Home Team, won.
