# COMP0036 Group Assignment: BEAT THE BOOKIE

### Group N

In [135]:
import os
import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelEncoder
from sklearn import model_selection

from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

In [136]:
cwd = os.path.join(os.getcwd(), 'Group Coursework Brief-20221106', 'Data_Files', 'Data_Files')
dirName_trainData = os.path.join(cwd, 'epl-training.csv')

In [137]:
df_epl_train = pd.read_csv(dirName_trainData)

# NOTE: I have manually removed the (one) line in csv file with blanks

# This is code to automatically remove rows with blanks cells - not currently working
# # Remove any rows with blank cells - Clean Data
# df_epl_train_3 = df_epl_train_2.copy()
# df_epl_train_3 = df_epl_train3.replace(r'^\s*$', float('NaN'), regex = True)

# df_epl_train = df_epl_train_3.copy()
# df_epl_train = df_epl_train.dropna(inplace = True)

In [138]:
df_epl_train.head()

Unnamed: 0,Date,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,HTR,Referee,...,HST,AST,HC,AC,HF,AF,HY,AY,HR,AR
0,19/08/00,Charlton,Man City,4.0,0.0,H,2.0,0.0,H,Rob Harris,...,14.0,4.0,6.0,6.0,13.0,12.0,1.0,2.0,0.0,0.0
1,19/08/00,Chelsea,West Ham,4.0,2.0,H,1.0,0.0,H,Graham Barber,...,10.0,5.0,7.0,7.0,19.0,14.0,1.0,2.0,0.0,0.0
2,19/08/00,Coventry,Middlesbrough,1.0,3.0,A,1.0,1.0,D,Barry Knight,...,3.0,9.0,8.0,4.0,15.0,21.0,5.0,3.0,1.0,0.0
3,19/08/00,Derby,Southampton,2.0,2.0,D,1.0,2.0,A,Andy D'Urso,...,4.0,6.0,5.0,8.0,11.0,13.0,1.0,1.0,0.0,0.0
4,19/08/00,Leeds,Everton,2.0,0.0,H,2.0,0.0,H,Dermot Gallagher,...,8.0,6.0,6.0,4.0,21.0,20.0,1.0,3.0,0.0,0.0


## Model 1: Decision Tree & Nearest Neighbour Classifiers (Basic Model)

In [139]:
# Encode some of the columns in the dataframe
# NOTE: Using Label_Encoder to encode the (catergorical) values in the dataframe, this works by taking a list of values and mapping them to numbers - e.g. each referee in list will get a number
# See the link: https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.LabelEncoder.html

df_epl_train["Referee_Enc"] = df_epl_train["Referee"].astype("category").cat.codes
df_epl_train["AwayTeam_Enc"] = df_epl_train["AwayTeam"].astype("category").cat.codes
df_epl_train["HomeTeam_Enc"] = df_epl_train["HomeTeam"].astype("category").cat.codes

# Transform the date column into day and month columns and Add into dataframe (Extract days & months from date)
temp = df_epl_train.iloc[:,0].values
print(temp)
days = list()
months = list()
for item in temp:
    item = str(item)
    if item[0] == "0":
        days.append(int(item[1]))
    else:
        days.append(int(item[0]+item[1]))
    if item[3] == '0':
        months.append(int(item[4]))
    else:
        months.append(int(item[3]+item[4]))

df_epl_train.insert(0, "Month", months, allow_duplicates=True)
df_epl_train.insert(0, "Day", days, allow_duplicates=True)


# This (below) is the transformed dataframe with new columns (at front) with categorical values converted into labels
df_epl_train.head()

['19/08/00' '19/08/00' '19/08/00' ... '29/10/2022' '30/10/2022'
 '30/10/2022']


Unnamed: 0,Day,Month,Date,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,...,AC,HF,AF,HY,AY,HR,AR,Referee_Enc,AwayTeam_Enc,HomeTeam_Enc
0,19,8,19/08/00,Charlton,Man City,4.0,0.0,H,2.0,0.0,...,6.0,13.0,12.0,1.0,2.0,0.0,0.0,136,25,12
1,19,8,19/08/00,Chelsea,West Ham,4.0,2.0,H,1.0,0.0,...,7.0,19.0,14.0,1.0,2.0,0.0,0.0,63,42,13
2,19,8,19/08/00,Coventry,Middlesbrough,1.0,3.0,A,1.0,1.0,...,4.0,15.0,21.0,5.0,3.0,1.0,0.0,19,27,14
3,19,8,19/08/00,Derby,Southampton,2.0,2.0,D,1.0,2.0,...,8.0,11.0,13.0,1.0,1.0,0.0,0.0,12,35,16
4,19,8,19/08/00,Leeds,Everton,2.0,0.0,H,2.0,0.0,...,4.0,21.0,20.0,1.0,3.0,0.0,0.0,44,17,22


In [147]:
# Create the input features matrix X (made of day, month, HomeTeam, AwayTeam, Referee)
# Create the output values y vector (made of FTR)
# Take these values from the transformed dataframe

X = df_epl_train.iloc[:,[0,1,-1,-2,-3]].values
y = df_epl_train.iloc[:,7].values

In [148]:
print(X)

[[ 19   8  12  25 136]
 [ 19   8  13  42  63]
 [ 19   8  14  27  19]
 ...
 [ 29  10  24  22  89]
 [ 30  10   0  30 142]
 [ 30  10  26  42  23]]


In [149]:
# Encode the y output values as well
print(y)

print("---")

FTR_encoder = LabelEncoder()
y_transformed = FTR_encoder.fit_transform(y)
print(y_transformed)

['H' 'H' 'A' ... 'A' 'H' 'H']
---
[2 2 0 ... 0 2 2]


In [150]:
# Split the training data in a 80-20 split and keep 20% for final testing
X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y_transformed, test_size=0.2, random_state=22)
print("80% Training Data (INPUT): \n", X_train)
print("\n")
print("80% Training Data (OUTPUT): \n", y_train)

print("\n")

print("20% Testing Data (INPUT): \n", X_test)
print("\n")
print("20% Testing Data (OUTPUT): \n", y_test)

80% Training Data (INPUT): 
 [[ 26  12  20  26  89]
 [ 21   2  13   0  90]
 [ 21   8   9  40   4]
 ...
 [ 23  12  35  19  82]
 [ 14  12  38  39 130]
 [  3   1   6   0  89]]


80% Training Data (OUTPUT): 
 [0 0 2 ... 1 0 1]


20% Testing Data (INPUT): 
 [[ 14   9  18  41  82]
 [ 18   5  35  22 110]
 [ 26  12  13   9  85]
 ...
 [ 14   9  26  32 112]
 [ 29   4  23  13  84]
 [ 23   8  25  24 112]]


20% Testing Data (OUTPUT): 
 [1 0 2 ... 2 0 2]


Now that we have our input attributes and outputs we can create and test different prediction models:

Using a Decision Tree Classifier with our training data:

In [151]:
# Create an empty Tree model
DT_Model = DecisionTreeClassifier()
# Fit the model using training data
DT_Model.fit(X_train, y_train)
# Make predictions using the model we have created
DT_Preds = DT_Model.predict(X_test)
# NOTE: can convert prediction values (i.e. 0, 1 or 2) back into (H, D or A) using the FTR_encoder defined in earlier cell

In [152]:
# https://scikit-learn.org/stable/modules/generated/sklearn.metrics.accuracy_score.html
# Returns fraction of correctly classified samples

print(accuracy_score(DT_Preds, y_test))

0.395170789163722


In [153]:
# https://scikit-learn.org/stable/modules/generated/sklearn.metrics.classification_report.html

print(classification_report(DT_Preds, y_test))

              precision    recall  f1-score   support

           0       0.32      0.33      0.32       459
           1       0.27      0.25      0.26       450
           2       0.51      0.52      0.51       789

    accuracy                           0.40      1698
   macro avg       0.36      0.37      0.36      1698
weighted avg       0.39      0.40      0.39      1698



Using a Nearest Neighbor Classifier with our training data:

In [154]:
# Create an empty KNN model
KNN_Model = KNeighborsClassifier(n_neighbors=6)
# Fit the model using training data
KNN_Model.fit(X_train, y_train)
# Make predictions using the model we have created
KNN_Preds = KNN_Model.predict(X_test)

In [155]:
print(accuracy_score(KNN_Preds, y_test))

0.4004711425206125


In [156]:
print(classification_report(KNN_Preds, y_test))

              precision    recall  f1-score   support

           0       0.40      0.32      0.36       600
           1       0.17      0.25      0.20       274
           2       0.52      0.51      0.51       824

    accuracy                           0.40      1698
   macro avg       0.36      0.36      0.36      1698
weighted avg       0.42      0.40      0.41      1698

