In [1]:
import os
import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelEncoder
from sklearn import model_selection

from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

In [2]:
# Folder holding the coursework data files, resolved relative to the directory
# the notebook is run from (note: despite its name, `cwd` is the data folder).
cwd = os.path.join(
    os.getcwd(),
    'Group Coursework Brief-20221106',
    'Data_Files',
    'Data_Files',
)

# Full path of the training CSV inside that folder
dirName_trainData = os.path.join(cwd, 'epl-training.csv')

In [3]:
# Load the training CSV into the notebook-level DataFrame used by the cells below
df_epl_train = pd.read_csv(dirName_trainData)

In [4]:
# Preview the first rows of the raw data (Date is still a DD/MM/YY string here)
df_epl_train.head()

Unnamed: 0,Date,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,HTR,Referee,...,HST,AST,HC,AC,HF,AF,HY,AY,HR,AR
0,19/08/00,Charlton,Man City,4.0,0.0,H,2.0,0.0,H,Rob Harris,...,14.0,4.0,6.0,6.0,13.0,12.0,1.0,2.0,0.0,0.0
1,19/08/00,Chelsea,West Ham,4.0,2.0,H,1.0,0.0,H,Graham Barber,...,10.0,5.0,7.0,7.0,19.0,14.0,1.0,2.0,0.0,0.0
2,19/08/00,Coventry,Middlesbrough,1.0,3.0,A,1.0,1.0,D,Barry Knight,...,3.0,9.0,8.0,4.0,15.0,21.0,5.0,3.0,1.0,0.0
3,19/08/00,Derby,Southampton,2.0,2.0,D,1.0,2.0,A,Andy D'Urso,...,4.0,6.0,5.0,8.0,11.0,13.0,1.0,1.0,0.0,0.0
4,19/08/00,Leeds,Everton,2.0,0.0,H,2.0,0.0,H,Dermot Gallagher,...,8.0,6.0,6.0,4.0,21.0,20.0,1.0,3.0,0.0,0.0


In [5]:
# Transform the Date column from strings into datetime objects.
# dayfirst=True because the raw values are day/month/year (e.g. "19/08/00" is 19 August 2000).
# NOTE: this overwrites the column in place on the shared df_epl_train frame.
df_epl_train["Date"] = pd.to_datetime(df_epl_train["Date"], dayfirst=True)

In [7]:
# Preview again to confirm Date is now parsed (shown as YYYY-MM-DD)
df_epl_train.head()

Unnamed: 0,Date,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,HTR,Referee,...,HST,AST,HC,AC,HF,AF,HY,AY,HR,AR
0,2000-08-19,Charlton,Man City,4.0,0.0,H,2.0,0.0,H,Rob Harris,...,14.0,4.0,6.0,6.0,13.0,12.0,1.0,2.0,0.0,0.0
1,2000-08-19,Chelsea,West Ham,4.0,2.0,H,1.0,0.0,H,Graham Barber,...,10.0,5.0,7.0,7.0,19.0,14.0,1.0,2.0,0.0,0.0
2,2000-08-19,Coventry,Middlesbrough,1.0,3.0,A,1.0,1.0,D,Barry Knight,...,3.0,9.0,8.0,4.0,15.0,21.0,5.0,3.0,1.0,0.0
3,2000-08-19,Derby,Southampton,2.0,2.0,D,1.0,2.0,A,Andy D'Urso,...,4.0,6.0,5.0,8.0,11.0,13.0,1.0,1.0,0.0,0.0
4,2000-08-19,Leeds,Everton,2.0,0.0,H,2.0,0.0,H,Dermot Gallagher,...,8.0,6.0,6.0,4.0,21.0,20.0,1.0,3.0,0.0,0.0


In [8]:
# This function takes a date, a HomeTeam and an AwayTeam and returns a filtered dataframe containing only the matches played before the input date in which HomeTeam hosted AwayTeam

# For Example:
# date = "24/06/2020"
# HomeTeam = "Newcastle"
# AwayTeam = "Aston Villa"

def filter_dataframe(date, HomeTeam, AwayTeam, df=None):
    """Return the earlier fixtures in which ``HomeTeam`` hosted ``AwayTeam``.

    Parameters
    ----------
    date : str or datetime-like
        Cut-off date. Strings are parsed day-first (e.g. "24/06/2020").
        Only matches played strictly before this date are kept.
    HomeTeam : str
        Value to match in the ``HomeTeam`` column.
    AwayTeam : str
        Value to match in the ``AwayTeam`` column.
    df : pandas.DataFrame, optional
        Frame to filter. It must provide ``Date`` (datetime), ``HomeTeam``
        and ``AwayTeam`` columns. Defaults to the notebook-level
        ``df_epl_train`` so existing three-argument calls keep working.

    Returns
    -------
    pandas.DataFrame
        An independent copy of the matching rows with their original index.
        The frame is empty when no earlier fixture matches.
    """
    if df is None:
        # Fall back to the notebook-level training frame (looked up at call time)
        df = df_epl_train

    # Convert the input date into a datetime so it can be compared with the Date column
    cutoff = pd.to_datetime(date, dayfirst=True)

    # Date < cutoff  AND  same home side  AND  same away side
    is_past_meeting = (
        (df["Date"] < cutoff)
        & (df["HomeTeam"] == HomeTeam)
        & (df["AwayTeam"] == AwayTeam)
    )

    # Filter first and copy only the surviving rows: copying the whole frame
    # up front is wasted work when this is called once per match.
    return df.loc[is_past_meeting].copy()

# An example to see what the function does:
# every match with Newcastle at home to Aston Villa played before 24 June 2020
print(filter_dataframe("24/06/2020", "Newcastle", "Aston Villa"))

           Date   HomeTeam     AwayTeam  FTHG  FTAG FTR  HTHG  HTAG HTR  \
377  2001-05-19  Newcastle  Aston Villa   3.0   0.0   H   2.0   0.0   H   
486  2001-11-03  Newcastle  Aston Villa   3.0   0.0   H   1.0   0.0   H   
1105 2003-04-21  Newcastle  Aston Villa   1.0   1.0   D   1.0   0.0   H   
1244 2003-11-01  Newcastle  Aston Villa   1.0   1.0   D   1.0   1.0   D   
1825 2005-04-02  Newcastle  Aston Villa   0.0   3.0   A   0.0   1.0   A   
2041 2005-12-03  Newcastle  Aston Villa   1.0   1.0   D   1.0   0.0   H   
2526 2007-01-31  Newcastle  Aston Villa   3.0   1.0   H   2.0   1.0   H   
2679 2007-08-18  Newcastle  Aston Villa   0.0   0.0   D   0.0   0.0   D   
3148 2008-11-03  Newcastle  Aston Villa   2.0   0.0   H   0.0   0.0   D   
3818 2010-08-22  Newcastle  Aston Villa   6.0   0.0   H   3.0   0.0   H   
4418 2012-02-05  Newcastle  Aston Villa   2.0   1.0   H   1.0   1.0   D   
4587 2012-09-02  Newcastle  Aston Villa   1.0   1.0   D   0.0   1.0   A   
5206 2014-02-23  Newcastl

In [9]:
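# This function takes the list of features to average, a dictionary holding one list per feature,
# and a filtered dataframe (such as the one returned by the function in the previous cell).
# It appends the mean of each feature's column to that feature's list in the dictionary.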

def average_columns(features, avg_features, filtered_df):
    """Append the mean of each requested column of ``filtered_df`` to ``avg_features``.

    Parameters
    ----------
    features : iterable of str
        Column names to average. Each must be a column of ``filtered_df``
        and a key of ``avg_features``.
    avg_features : dict[str, list]
        Accumulator mapping each feature name to a list. It is mutated in
        place: one value is appended per feature on every call.
    filtered_df : pandas.DataFrame
        Rows to average over.

    Returns
    -------
    None

    Raises
    ------
    KeyError
        If a feature is missing from ``filtered_df`` or ``avg_features``.
    """
    for feature in features:
        # Read from the frame that was passed in, not from a notebook global of
        # a similar name, so the result depends only on the arguments.
        column_mean = filtered_df[feature].mean()
        avg_features[feature].append(column_mean)

In [None]:
# For every match in df_epl_train, collect the averages of the earlier meetings between the same
# home and away sides, one entry per match and per feature.
# NOTE: Some matches won't have past stats, either because the two teams have not met before
# or because the earlier meetings are not in our data.

# Match statistics to average over the earlier meetings
features = [
    "FTHG", "FTAG",
    "HTHG", "HTAG",
    "HS", "AS",
    "HST", "AST",
    "HF", "AF",
    "HC", "AC",
    "HY", "AY",
    "HR", "AR",
]

# Accumulator: one (initially empty) list per feature, keyed in the same order as `features`
avg_features = {feature: [] for feature in features}

# Walk the training frame one match at a time
for index, row in df_epl_train.iterrows():
    # Earlier matches between this home side and this away side.
    # The result is kept under this notebook-level name, which other cells may read.
    df_epl_train_filtered = filter_dataframe(row["Date"], row["HomeTeam"], row["AwayTeam"])
    # Append this match's per-feature averages to the accumulator
    average_columns(features, avg_features, df_epl_train_filtered)