# COMP0036 Group Assignment: BEAT THE BOOKIE

### Group N

In [238]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.preprocessing import LabelEncoder
from sklearn import model_selection

from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

In [210]:
cwd = os.path.join(os.getcwd(), 'Group Coursework Brief-20221106', 'Data_Files', 'Data_Files')
dirName_trainData = os.path.join(cwd, 'epl-training.csv')

In [211]:
df_epl_train = pd.read_csv(dirName_trainData)

# NOTE: I have manually removed the (one) line in csv file with blanks

# This is code to automatically remove rows with blanks cells - not currently working
# # Remove any rows with blank cells - Clean Data
# df_epl_train_3 = df_epl_train_2.copy()
# df_epl_train_3 = df_epl_train3.replace(r'^\s*$', float('NaN'), regex = True)

# df_epl_train = df_epl_train_3.copy()
# df_epl_train = df_epl_train.dropna(inplace = True)

In [212]:
df_epl_train.head()

Unnamed: 0,Date,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,HTR,Referee,...,HST,AST,HC,AC,HF,AF,HY,AY,HR,AR
0,19/08/00,Charlton,Man City,4.0,0.0,H,2.0,0.0,H,Rob Harris,...,14.0,4.0,6.0,6.0,13.0,12.0,1.0,2.0,0.0,0.0
1,19/08/00,Chelsea,West Ham,4.0,2.0,H,1.0,0.0,H,Graham Barber,...,10.0,5.0,7.0,7.0,19.0,14.0,1.0,2.0,0.0,0.0
2,19/08/00,Coventry,Middlesbrough,1.0,3.0,A,1.0,1.0,D,Barry Knight,...,3.0,9.0,8.0,4.0,15.0,21.0,5.0,3.0,1.0,0.0
3,19/08/00,Derby,Southampton,2.0,2.0,D,1.0,2.0,A,Andy D'Urso,...,4.0,6.0,5.0,8.0,11.0,13.0,1.0,1.0,0.0,0.0
4,19/08/00,Leeds,Everton,2.0,0.0,H,2.0,0.0,H,Dermot Gallagher,...,8.0,6.0,6.0,4.0,21.0,20.0,1.0,3.0,0.0,0.0


## Model 1 (Supervised Learning): Decision Tree, Nearest Neighbour Classifiers, Random Forest
This models use the features Day, Month, HomeTeam and AwayTeam to predict the FTR result of a test match given its Date (Day, Month), HomeTeam and AwayTeam. This is a fully supervised learning approach. We would like to expand this by shifting away from supervised learning, as the test data only has Date, HomeTeam and AwayTeam. 

In [213]:
# Encode some of the columns in the dataframe
# NOTE: Using Label_Encoder to encode the (catergorical) values in the dataframe, this works by taking a list of values and mapping them to numbers - e.g. each referee in list will get a number
# See the link: https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.LabelEncoder.html

df_epl_train["Referee_Enc"] = df_epl_train["Referee"].astype("category").cat.codes
df_epl_train["AwayTeam_Enc"] = df_epl_train["AwayTeam"].astype("category").cat.codes
df_epl_train["HomeTeam_Enc"] = df_epl_train["HomeTeam"].astype("category").cat.codes

# Transform the date column into day and month columns and Add into dataframe (Extract days & months from date)
df_epl_train["Date"] = pd.to_datetime(df_epl_train["Date"])
df_epl_train["Day"] = df_epl_train["Date"].dt.day
df_epl_train["Month"] = df_epl_train["Date"].dt.month 
df_epl_train["Year"] = df_epl_train["Date"].dt.year


# This (below) is the transformed dataframe with new columns (at front) with categorical values converted into labels
df_epl_train.head()

Unnamed: 0,Date,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,HTR,Referee,...,HY,AY,HR,AR,Referee_Enc,AwayTeam_Enc,HomeTeam_Enc,Day,Month,Year
0,2000-08-19,Charlton,Man City,4.0,0.0,H,2.0,0.0,H,Rob Harris,...,1.0,2.0,0.0,0.0,136,25,12,19,8,2000
1,2000-08-19,Chelsea,West Ham,4.0,2.0,H,1.0,0.0,H,Graham Barber,...,1.0,2.0,0.0,0.0,63,42,13,19,8,2000
2,2000-08-19,Coventry,Middlesbrough,1.0,3.0,A,1.0,1.0,D,Barry Knight,...,5.0,3.0,1.0,0.0,19,27,14,19,8,2000
3,2000-08-19,Derby,Southampton,2.0,2.0,D,1.0,2.0,A,Andy D'Urso,...,1.0,1.0,0.0,0.0,12,35,16,19,8,2000
4,2000-08-19,Leeds,Everton,2.0,0.0,H,2.0,0.0,H,Dermot Gallagher,...,1.0,3.0,0.0,0.0,44,17,22,19,8,2000


In [214]:
# Create the input features matrix X (made of day, month, HomeTeam, AwayTeam, Referee)
# Create the output values y vector (made of FTR)
# Take these values from the transformed dataframe

X = df_epl_train.loc[:,['Day', 'Month', 'HomeTeam_Enc', 'AwayTeam_Enc']].values
y = df_epl_train.loc[:,'FTR'].values

In [215]:
# Split the training data in a 80-20 split and keep 20% for final testing
X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, test_size=0.2, random_state=22)

# Encode the y output values as well
FTR_encoder = LabelEncoder()
y_train = FTR_encoder.fit_transform(y_train);

Now that we have our input attributes and outputs we can create and test different prediction models:

Using a Decision Tree Classifier with our training data:

In [216]:
# Create an empty Tree model
DT_Model = DecisionTreeClassifier(random_state=42)
# Fit the model using training data
DT_Model.fit(X_train, y_train)
# Make predictions using the model we have created
DT_predictions_test = DT_Model.predict(X_test)
# Reconverting prediction values (i.e. 0, 1 or 2) back into (H, D or A) using the FTR_encoder defined in earlier cell
DT_predictions_test = FTR_encoder.inverse_transform(DT_predictions_test)

In [217]:
# https://scikit-learn.org/stable/modules/generated/sklearn.metrics.accuracy_score.html
# Returns fraction of correctly classified samples

print(accuracy_score(DT_predictions_test, y_test))

0.3934040047114252


In [218]:
# https://scikit-learn.org/stable/modules/generated/sklearn.metrics.classification_report.html

print(classification_report(DT_predictions_test, y_test))

              precision    recall  f1-score   support

           A       0.33      0.34      0.34       460
           D       0.25      0.24      0.24       433
           H       0.51      0.51      0.51       805

    accuracy                           0.39      1698
   macro avg       0.36      0.36      0.36      1698
weighted avg       0.39      0.39      0.39      1698



Using a Nearest Neighbor Classifier with our training data:

In [219]:
# Create an empty KNN model
KNN_Model = KNeighborsClassifier(n_neighbors=6)
# Fit the model using training data
KNN_Model.fit(X_train, y_train)
# Make predictions using the model we have created
KNN_predictions_test = KNN_Model.predict(X_test)
KNN_predictions_test = FTR_encoder.inverse_transform(KNN_predictions_test)

In [220]:
print(accuracy_score(KNN_predictions_test, y_test))

0.4128386336866902


In [221]:
print(classification_report(KNN_predictions_test, y_test))

              precision    recall  f1-score   support

           A       0.42      0.35      0.38       582
           D       0.17      0.24      0.20       285
           H       0.54      0.52      0.53       831

    accuracy                           0.41      1698
   macro avg       0.37      0.37      0.37      1698
weighted avg       0.43      0.41      0.42      1698



In [222]:
# Create an empty KNN model
RF_Model = RandomForestClassifier(n_estimators=50, random_state=42)
# Fit the model using training data
RF_Model.fit(X_train, y_train)
# Make predictions using the model we have created
RF_predictions_test = RF_Model.predict(X_test)
RF_predictions_test = FTR_encoder.inverse_transform(RF_predictions_test)

In [223]:
print(accuracy_score(RF_predictions_test, y_test))

0.45229681978798586


In [224]:
print(classification_report(RF_predictions_test, y_test))

              precision    recall  f1-score   support

           A       0.33      0.38      0.36       420
           D       0.19      0.27      0.22       287
           H       0.66      0.53      0.59       991

    accuracy                           0.45      1698
   macro avg       0.39      0.40      0.39      1698
weighted avg       0.50      0.45      0.47      1698



# Predicting Premier League Game Results using historic data

## 1. Introduction
Recently, bookmakes have started applying Machine Learning in the context of predicting Football match outcomes. This was motivated to reduce risk exposure and maximise profit. Strategies include the collection of huge datasets (including real-time information such as injuries, fouls, substitutions, etc.).

Our project will focus on predicting match scores for the English Premier League, one of the most unpredictable and competitive footbal leagues, using historic data of the last 

## 2. Problem
We have been assigned to build model(s) that predict the FTR value, which can be Home Win (H), Draw (D) and Away Win (A). The general steps we will be taking to build the model(s) begins with finding a suitable dataset and performing feature engineering on the selected features to be used in the model. This entails creating functions or classes to convert the raw data and transforms it into a format where every match has that historic feature. Then, we perform feature selection to filter out uni

Div = League Division

Date = Match Date (dd/mm/yy)

HomeTeam = Home Team

AwayTeam = Away Team

FTHG = Full Time Home Team Goals

FTAG = Full Time Away Team Goals

FTR = Full Time Result (H=Home Win, D=Draw, A=Away Win)

HTHG = Half Time Home Team Goals

HTAG = Half Time Away Team Goals

HTR = Half Time Result (H=Home Win, D=Draw, A=Away Win)

Referee = Match Referee

HS = Home Team Shots

AS = Away Team Shots

HST = Home Team Shots on Target

AST = Away Team Shots on Target

HF = Home Team Fouls Committed

AF = Away Team Fouls Committed

HC = Home Team Corners

AC = Away Team Corners

HY = Home Team Yellow Cards

AY = Away Team Yellow Cards

HR = Home Team Red Cards

AR = Away Team Red Cards

In [251]:
# The following section will present an analysis of the dataset.
# The result of this will be used during feature engineering and feature selection

dirName_trainFullData = os.path.join(cwd, 'epl-full-training.csv')
df_epl_train = pd.read_csv(dirName_trainFullData)
cols = ["Div","Date","HomeTeam","AwayTeam","FTHG", "FTAG","FTR","HTHG","HTAG","HTR","Referee","HS","AS", "HST","AST","HF","AF","HC","AC","HY","AY","HR","AR"]

df_epl_train = df_epl_train.loc[:, cols]
df_epl_train = df_epl_train.reset_index(drop=True)

with pd.option_context('display.max_rows', None, 'display.max_columns', None):  # more options can be specified also
    display(df_epl_train.head())

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


Unnamed: 0,Div,Date,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,HTR,Referee,HS,AS,HST,AST,HF,AF,HC,AC,HY,AY,HR,AR
0,E0,17/08/2002,Blackburn,Sunderland,0.0,0.0,D,0.0,0.0,D,D Elleray,15.0,7.0,5.0,3.0,14.0,11.0,9.0,1.0,1.0,2.0,0.0,0.0
1,E0,17/08/2002,Charlton,Chelsea,2.0,3.0,A,2.0,1.0,H,G Barber,5.0,21.0,5.0,12.0,10.0,12.0,3.0,6.0,0.0,3.0,1.0,0.0
2,E0,17/08/2002,Everton,Tottenham,2.0,2.0,D,1.0,0.0,H,N Barry,13.0,10.0,9.0,5.0,18.0,4.0,10.0,5.0,1.0,1.0,0.0,0.0
3,E0,17/08/2002,Fulham,Bolton,4.0,1.0,H,3.0,1.0,H,A Wiley,13.0,3.0,6.0,1.0,16.0,12.0,7.0,4.0,1.0,2.0,0.0,0.0
4,E0,17/08/2002,Leeds,Man City,3.0,0.0,H,2.0,0.0,H,G Poll,13.0,18.0,8.0,10.0,13.0,13.0,2.0,7.0,1.0,1.0,0.0,0.0


In [249]:
describe = (df_epl_train.describe())

with pd.option_context('display.max_rows', None, 'display.max_columns', None):  # more options can be specified also
    display((df_epl_train.describe()))

Unnamed: 0,FTHG,FTAG,HTHG,HTAG,HS,AS,HST,AST,HF,AF,HC,AC,HY,AY,HR,AR
count,7736.0,7736.0,7736.0,7736.0,7736.0,7736.0,7736.0,7736.0,7736.0,7736.0,7736.0,7736.0,7736.0,7736.0,7736.0,7736.0
mean,1.525853,1.159902,0.683428,0.509566,13.611556,10.802353,6.098759,4.77999,11.194157,11.631463,6.046406,4.791236,1.432911,1.740176,0.060496,0.087771
std,1.306086,1.147505,0.834433,0.725498,5.318015,4.659593,3.356911,2.827868,3.686632,3.84688,3.098894,2.746281,1.188024,1.264285,0.249547,0.29418
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,1.0,0.0,0.0,0.0,10.0,7.0,4.0,3.0,9.0,9.0,4.0,3.0,1.0,1.0,0.0,0.0
50%,1.0,1.0,0.0,0.0,13.0,10.0,6.0,4.0,11.0,11.0,6.0,4.0,1.0,2.0,0.0,0.0
75%,2.0,2.0,1.0,1.0,17.0,14.0,8.0,6.0,13.0,14.0,8.0,6.0,2.0,3.0,0.0,0.0
max,9.0,9.0,5.0,5.0,43.0,31.0,24.0,20.0,33.0,28.0,20.0,19.0,7.0,9.0,3.0,2.0


In [250]:
%matplotlib inline
fig=plt.figure(figsize=(18, 8), dpi= 80, facecolor='w', edgecolor='k')

# width of the bars
barWidth = 0.3
# Choose the height of the blue bars
bars1 = np.array(describe[1:2][(['FTHG','HTHG','HS','HST','HHW','HC','HF','HO','HY','HR'])]).flatten()
# Choose the height of the cyan bars
bars2 = np.array(describe[1:2][(['FTAG','HTAG','AS','AST','AHW','AC','AF','AO','AY','AR'])]).flatten()
# Choose the height of the error bars (bars1)
yer1 = np.array(describe[2:3][(['FTHG','HTHG','HS','HST','HHW','HC','HF','HO','HY','HR'])]).flatten()
# Choose the height of the error bars (bars2)
yer2 = np.array(describe[2:3][(['FTAG','HTAG','AS','AST','AHW','AC','AF','AO','AY','AR'])]).flatten()
# The x position of bars
r1 = np.arange(len(bars1.flatten()))
r2 = [x + barWidth for x in r1]
# Create blue bars
plt.bar(r1, bars1, width = barWidth, color = 'blue', edgecolor = 'black', yerr=yer1, capsize=7, label='Home_Side')
# Create cyan bars
plt.bar(r2, bars2, width = barWidth, color = 'red', edgecolor = 'black', yerr=yer2, capsize=7, label='Away_Side')
# general layout
plt.xticks([r + barWidth for r in range(len(bars1))], ['Goals Scored', 'Half-time Gaols Scored', 'Shots','Shots on target','Hit Woodwork', 'Corners','Fouls','Offsides','Yellow Cards','Red Cards','Booking points'])
plt.ylabel('Average Value')
plt.ylim(0)
plt.legend()
# Show graphic
plt.show()

KeyError: "['HO', 'HHW'] not in index"

<Figure size 1440x640 with 0 Axes>