# COMP0036 Group Assignment: BEAT THE BOOKIE

### Group N

In [157]:
import os
import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelEncoder
from sklearn import model_selection

from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

In [158]:
cwd = os.path.join(os.getcwd(), 'Group Coursework Brief-20221106', 'Data_Files', 'Data_Files')
dirName_trainData = os.path.join(cwd, 'epl-training.csv')

In [159]:
df_epl_train = pd.read_csv(dirName_trainData)

# NOTE: I have manually removed the (one) line in csv file with blanks

# This is code to automatically remove rows with blanks cells - not currently working
# # Remove any rows with blank cells - Clean Data
# df_epl_train_3 = df_epl_train_2.copy()
# df_epl_train_3 = df_epl_train3.replace(r'^\s*$', float('NaN'), regex = True)

# df_epl_train = df_epl_train_3.copy()
# df_epl_train = df_epl_train.dropna(inplace = True)

In [160]:
df_epl_train.head()

Unnamed: 0,Date,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,HTR,Referee,...,HST,AST,HC,AC,HF,AF,HY,AY,HR,AR
0,19/08/00,Charlton,Man City,4.0,0.0,H,2.0,0.0,H,Rob Harris,...,14.0,4.0,6.0,6.0,13.0,12.0,1.0,2.0,0.0,0.0
1,19/08/00,Chelsea,West Ham,4.0,2.0,H,1.0,0.0,H,Graham Barber,...,10.0,5.0,7.0,7.0,19.0,14.0,1.0,2.0,0.0,0.0
2,19/08/00,Coventry,Middlesbrough,1.0,3.0,A,1.0,1.0,D,Barry Knight,...,3.0,9.0,8.0,4.0,15.0,21.0,5.0,3.0,1.0,0.0
3,19/08/00,Derby,Southampton,2.0,2.0,D,1.0,2.0,A,Andy D'Urso,...,4.0,6.0,5.0,8.0,11.0,13.0,1.0,1.0,0.0,0.0
4,19/08/00,Leeds,Everton,2.0,0.0,H,2.0,0.0,H,Dermot Gallagher,...,8.0,6.0,6.0,4.0,21.0,20.0,1.0,3.0,0.0,0.0


## Model 1 (Supervised Learning): Decision Tree, Nearest Neighbour Classifiers, Random Forest
This models use the features Day, Month, HomeTeam and AwayTeam to predict the FTR result of a test match given its Date (Day, Month), HomeTeam and AwayTeam. This is a fully supervised learning approach. We would like to expand this by shifting away from supervised learning, as the test data only has Date, HomeTeam and AwayTeam. 

In [161]:
# Encode some of the columns in the dataframe
# NOTE: Using Label_Encoder to encode the (catergorical) values in the dataframe, this works by taking a list of values and mapping them to numbers - e.g. each referee in list will get a number
# See the link: https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.LabelEncoder.html

df_epl_train["Referee_Enc"] = df_epl_train["Referee"].astype("category").cat.codes
df_epl_train["AwayTeam_Enc"] = df_epl_train["AwayTeam"].astype("category").cat.codes
df_epl_train["HomeTeam_Enc"] = df_epl_train["HomeTeam"].astype("category").cat.codes

# Transform the date column into day and month columns and Add into dataframe (Extract days & months from date)
df_epl_train["Date"] = pd.to_datetime(df_epl_train["Date"])
df_epl_train["Day"] = df_epl_train["Date"].dt.day
df_epl_train["Month"] = df_epl_train["Date"].dt.month 
df_epl_train["Year"] = df_epl_train["Date"].dt.year


# This (below) is the transformed dataframe with new columns (at front) with categorical values converted into labels
df_epl_train.head()

Unnamed: 0,Date,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,HTR,Referee,...,HY,AY,HR,AR,Referee_Enc,AwayTeam_Enc,HomeTeam_Enc,Day,Month,Year
0,2000-08-19,Charlton,Man City,4.0,0.0,H,2.0,0.0,H,Rob Harris,...,1.0,2.0,0.0,0.0,136,25,12,19,8,2000
1,2000-08-19,Chelsea,West Ham,4.0,2.0,H,1.0,0.0,H,Graham Barber,...,1.0,2.0,0.0,0.0,63,42,13,19,8,2000
2,2000-08-19,Coventry,Middlesbrough,1.0,3.0,A,1.0,1.0,D,Barry Knight,...,5.0,3.0,1.0,0.0,19,27,14,19,8,2000
3,2000-08-19,Derby,Southampton,2.0,2.0,D,1.0,2.0,A,Andy D'Urso,...,1.0,1.0,0.0,0.0,12,35,16,19,8,2000
4,2000-08-19,Leeds,Everton,2.0,0.0,H,2.0,0.0,H,Dermot Gallagher,...,1.0,3.0,0.0,0.0,44,17,22,19,8,2000


In [162]:
# Create the input features matrix X (made of day, month, HomeTeam, AwayTeam, Referee)
# Create the output values y vector (made of FTR)
# Take these values from the transformed dataframe

X = df_epl_train.loc[:,['Day', 'Month', 'HomeTeam_Enc', 'AwayTeam_Enc']].values
y = df_epl_train.loc[:,'FTR'].values

In [163]:
# Split the training data in a 80-20 split and keep 20% for final testing
X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, test_size=0.2, random_state=22)

# Encode the y output values as well
FTR_encoder = LabelEncoder()
y_train = FTR_encoder.fit_transform(y_train);

Now that we have our input attributes and outputs we can create and test different prediction models:

Using a Decision Tree Classifier with our training data:

In [164]:
# Create an empty Tree model
DT_Model = DecisionTreeClassifier(random_state=42)
# Fit the model using training data
DT_Model.fit(X_train, y_train)
# Make predictions using the model we have created
DT_predictions_test = DT_Model.predict(X_test)
# Reconverting prediction values (i.e. 0, 1 or 2) back into (H, D or A) using the FTR_encoder defined in earlier cell
DT_predictions_test = FTR_encoder.inverse_transform(DT_predictions_test)

In [165]:
# https://scikit-learn.org/stable/modules/generated/sklearn.metrics.accuracy_score.html
# Returns fraction of correctly classified samples

print(accuracy_score(DT_predictions_test, y_test))

0.3934040047114252


In [166]:
# https://scikit-learn.org/stable/modules/generated/sklearn.metrics.classification_report.html

print(classification_report(DT_predictions_test, y_test))

              precision    recall  f1-score   support

           A       0.33      0.34      0.34       460
           D       0.25      0.24      0.24       433
           H       0.51      0.51      0.51       805

    accuracy                           0.39      1698
   macro avg       0.36      0.36      0.36      1698
weighted avg       0.39      0.39      0.39      1698



Using a Nearest Neighbor Classifier with our training data:

In [145]:
# Create an empty KNN model
KNN_Model = KNeighborsClassifier(n_neighbors=6)
# Fit the model using training data
KNN_Model.fit(X_train, y_train)
# Make predictions using the model we have created
KNN_predictions_test = KNN_Model.predict(X_test)
KNN_predictions_test = FTR_encoder.inverse_transform(KNN_predictions_test)

In [146]:
print(accuracy_score(KNN_predictions_test, y_test))

0.4122497055359246


In [147]:
print(classification_report(KNN_predictions_test, y_test))

              precision    recall  f1-score   support

           A       0.41      0.34      0.37       577
           D       0.18      0.26      0.21       278
           H       0.53      0.51      0.52       843

    accuracy                           0.41      1698
   macro avg       0.37      0.37      0.37      1698
weighted avg       0.43      0.41      0.42      1698



In [153]:
# Create an empty KNN model
RF_Model = RandomForestClassifier(n_estimators=50, random_state=42)
# Fit the model using training data
RF_Model.fit(X_train, y_train)
# Make predictions using the model we have created
RF_predictions_test = RF_Model.predict(X_test)
RF_predictions_test = FTR_encoder.inverse_transform(RF_predictions_test)

In [154]:
print(accuracy_score(RF_predictions_test, y_test))

0.45288574793875147


In [155]:
print(classification_report(RF_predictions_test, y_test))

              precision    recall  f1-score   support

           A       0.34      0.39      0.36       418
           D       0.19      0.27      0.22       297
           H       0.66      0.54      0.59       983

    accuracy                           0.45      1698
   macro avg       0.39      0.40      0.39      1698
weighted avg       0.50      0.45      0.47      1698

