# COMP0036 Group Assignment: BEAT THE BOOKIE

## Group N

In [186]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

First we set the path for the dataset folder, to access the dataset.

In [187]:
cwd = os.getcwd() + '\\Data_Files\\Data_Files'
dirName_trainData = cwd + '\\epl-training.csv'
dirName_testData = cwd + '\\epl-test.csv'

Load the data into a pandas dataframe

In [188]:
df_epl_train = pd.read_csv(dirName_trainData)
df_epl_test = pd.read_csv(dirName_testData)

In [189]:
df_epl_train.head()

Unnamed: 0,Date,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,HTR,Referee,...,HST,AST,HC,AC,HF,AF,HY,AY,HR,AR
0,19/08/00,Charlton,Man City,4.0,0.0,H,2.0,0.0,H,Rob Harris,...,14.0,4.0,6.0,6.0,13.0,12.0,1.0,2.0,0.0,0.0
1,19/08/00,Chelsea,West Ham,4.0,2.0,H,1.0,0.0,H,Graham Barber,...,10.0,5.0,7.0,7.0,19.0,14.0,1.0,2.0,0.0,0.0
2,19/08/00,Coventry,Middlesbrough,1.0,3.0,A,1.0,1.0,D,Barry Knight,...,3.0,9.0,8.0,4.0,15.0,21.0,5.0,3.0,1.0,0.0
3,19/08/00,Derby,Southampton,2.0,2.0,D,1.0,2.0,A,Andy D'Urso,...,4.0,6.0,5.0,8.0,11.0,13.0,1.0,1.0,0.0,0.0
4,19/08/00,Leeds,Everton,2.0,0.0,H,2.0,0.0,H,Dermot Gallagher,...,8.0,6.0,6.0,4.0,21.0,20.0,1.0,3.0,0.0,0.0


Now, let's create some basic predictors for a simple ML model. These predictors will consist of Date, HomeTeam and AwayTeam.

We need to convert the data types of data columns that aren't numeric data such as Date, as ML algorithms only work with numeric data.

In [190]:
df_epl_train.dtypes

Date         object
HomeTeam     object
AwayTeam     object
FTHG        float64
FTAG        float64
FTR          object
HTHG        float64
HTAG        float64
HTR          object
Referee      object
HS          float64
AS          float64
HST         float64
AST         float64
HC          float64
AC          float64
HF          float64
AF          float64
HY          float64
AY          float64
HR          float64
AR          float64
dtype: object

The following code overwrites the existing Date column and converts it into the datetime data type. (https://stackoverflow.com/questions/41495580/python-pandas-dataframe-predict-values-based-on-date)

In [191]:
df_epl_train['Date'] = pd.to_datetime(df_epl_train['Date'])  
df_epl_train['date_delta'] = (df_epl_train['Date'] - df_epl_train['Date'].min())  / np.timedelta64(1,'D')

In [192]:
df_epl_train["ht_code"] = df_epl_train["HomeTeam"].astype("category").cat.codes

In [193]:
df_epl_train["at_code"] = df_epl_train["AwayTeam"].astype("category").cat.codes

Our target for the ML model is what we are trying to predict thus the FTR, so we convert it into codes: H = 2, A = 1 & D = 0.

In [194]:
df_epl_train["target"] = df_epl_train["FTR"].astype("category").cat.codes

In [195]:
df_epl_train.head()

Unnamed: 0,Date,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,HTR,Referee,...,HF,AF,HY,AY,HR,AR,date_delta,ht_code,at_code,target
0,2000-08-19,Charlton,Man City,4.0,0.0,H,2.0,0.0,H,Rob Harris,...,13.0,12.0,1.0,2.0,0.0,0.0,222.0,12,25,2
1,2000-08-19,Chelsea,West Ham,4.0,2.0,H,1.0,0.0,H,Graham Barber,...,19.0,14.0,1.0,2.0,0.0,0.0,222.0,13,42,2
2,2000-08-19,Coventry,Middlesbrough,1.0,3.0,A,1.0,1.0,D,Barry Knight,...,15.0,21.0,5.0,3.0,1.0,0.0,222.0,14,27,0
3,2000-08-19,Derby,Southampton,2.0,2.0,D,1.0,2.0,A,Andy D'Urso,...,11.0,13.0,1.0,1.0,0.0,0.0,222.0,16,35,1
4,2000-08-19,Leeds,Everton,2.0,0.0,H,2.0,0.0,H,Dermot Gallagher,...,21.0,20.0,1.0,3.0,0.0,0.0,222.0,22,17,2


In [196]:
from sklearn.ensemble import RandomForestClassifier

In [197]:
rf = RandomForestClassifier(n_estimators=50, min_samples_split=10, random_state=1)

In [223]:
count_nan = df_epl_train['date_delta'].isnull().sum()
 
# printing the number of values present
# in the column
print('Number of NaN values present: ' + str(count_nan))

Number of NaN values present: 1


In [224]:
df_epl_train[df_epl_train['Date'].isnull()]

Unnamed: 0,Date,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,HTR,Referee,...,HF,AF,HY,AY,HR,AR,date_delta,ht_code,at_code,target
5700,NaT,,,,,,,,,,...,,,,,,,,-1,-1,-1
