In [82]:
import os
import pandas as pd
import numpy as np
from datetime import datetime

from sklearn.preprocessing import LabelEncoder
from sklearn import model_selection

from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import mean_squared_error

from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LinearRegression

from sklearn.metrics import confusion_matrix
from sklearn.metrics import plot_confusion_matrix

In [83]:
cwd = os.path.join(os.getcwd(), 'Group Coursework Brief-20221106', 'Data_Files', 'Data_Files')
dirName_trainData = os.path.join(cwd, 'epl-full-training.csv')

In [84]:
df_epl_train = pd.read_csv(dirName_trainData)
cols = ["Div","Date","HomeTeam","AwayTeam","FTHG", "FTAG","FTR","HTHG","HTAG","HTR","Referee","HS","AS", "HST","AST","HF","AF","HC","AC","HY","AY","HR","AR"]

df_epl_train = df_epl_train.loc[:, cols]
df_epl_train = df_epl_train.reset_index(drop=True)

with pd.option_context('display.max_rows', None, 'display.max_columns', None):  # more options can be specified also
    display(df_epl_train.head())

  df_epl_train = pd.read_csv(dirName_trainData)


Unnamed: 0,Div,Date,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,HTR,Referee,HS,AS,HST,AST,HF,AF,HC,AC,HY,AY,HR,AR
0,E0,17/08/2002,Blackburn,Sunderland,0.0,0.0,D,0.0,0.0,D,D Elleray,15.0,7.0,5.0,3.0,14.0,11.0,9.0,1.0,1.0,2.0,0.0,0.0
1,E0,17/08/2002,Charlton,Chelsea,2.0,3.0,A,2.0,1.0,H,G Barber,5.0,21.0,5.0,12.0,10.0,12.0,3.0,6.0,0.0,3.0,1.0,0.0
2,E0,17/08/2002,Everton,Tottenham,2.0,2.0,D,1.0,0.0,H,N Barry,13.0,10.0,9.0,5.0,18.0,4.0,10.0,5.0,1.0,1.0,0.0,0.0
3,E0,17/08/2002,Fulham,Bolton,4.0,1.0,H,3.0,1.0,H,A Wiley,13.0,3.0,6.0,1.0,16.0,12.0,7.0,4.0,1.0,2.0,0.0,0.0
4,E0,17/08/2002,Leeds,Man City,3.0,0.0,H,2.0,0.0,H,G Poll,13.0,18.0,8.0,10.0,13.0,13.0,2.0,7.0,1.0,1.0,0.0,0.0


In [85]:
# Transform the date column from strings into datetime objects
df_epl_train["Date"] = pd.to_datetime(df_epl_train["Date"], dayfirst=True)

In [86]:
df_epl_train.head()

Unnamed: 0,Div,Date,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,HTR,...,HST,AST,HF,AF,HC,AC,HY,AY,HR,AR
0,E0,2002-08-17,Blackburn,Sunderland,0.0,0.0,D,0.0,0.0,D,...,5.0,3.0,14.0,11.0,9.0,1.0,1.0,2.0,0.0,0.0
1,E0,2002-08-17,Charlton,Chelsea,2.0,3.0,A,2.0,1.0,H,...,5.0,12.0,10.0,12.0,3.0,6.0,0.0,3.0,1.0,0.0
2,E0,2002-08-17,Everton,Tottenham,2.0,2.0,D,1.0,0.0,H,...,9.0,5.0,18.0,4.0,10.0,5.0,1.0,1.0,0.0,0.0
3,E0,2002-08-17,Fulham,Bolton,4.0,1.0,H,3.0,1.0,H,...,6.0,1.0,16.0,12.0,7.0,4.0,1.0,2.0,0.0,0.0
4,E0,2002-08-17,Leeds,Man City,3.0,0.0,H,2.0,0.0,H,...,8.0,10.0,13.0,13.0,2.0,7.0,1.0,1.0,0.0,0.0


AIM:

1) Given match date, Filter the dataframe for the all games of current year

2) Given HomeTeam & AwayTeam, Filter database for all games (in this time frame) where HomeTeam is Home against all other teams, same for AwayTeam -> Return two filtered dataframes

3) Average these past stats for the home and away teams from current and last season

4) Give the home and away teams ratings based upon the weighted sum of the past stats

5) Create a expected goals predictor using linear/polynomial regression using the past average stats

6) Final Classifier takes as input: Home and Away Team ratings, expected goals, day, month, hometeam and awayteam

In [87]:
# This function will take as input a date, HomeTeam and AwayTeam and output two filtered dataframe

def get_season_start_date(date):
    if date.month <= 7:
        return datetime(date.year-3, 8, 1)
    return datetime(date.year-2, 8, 1)

def filter_dataframe_by_hometeam_recent_season(df, date, HomeTeam):
    # Convert the input string date into datetime
    date = pd.to_datetime(date, dayfirst=True)

    # Filter the dataframe to include only rows where Dateinput(first day of season) && HomeTeam=input(HomeTeam)
    df_filtered = df.copy()
    df_filtered = df_filtered[(df_filtered.Date<date) & (df_filtered.Date>get_season_start_date(date)) & (df_filtered.HomeTeam==HomeTeam)]

    # Return filtered dataframe
    return df_filtered

def filter_dataframe_by_awayteam_recent_season(df, date, AwayTeam):
    # Convert the input string date into datetime
    date = pd.to_datetime(date, dayfirst=True)

    # Filter the dataframe to include only rows where Dateinput(first day of season) && HomeTeam=input(HomeTeam)
    df_filtered = df.copy()
    df_filtered = df_filtered[(df_filtered.Date<date) & (df_filtered.Date>get_season_start_date(date)) & (df_filtered.AwayTeam==AwayTeam)]

    # Return filtered dataframe
    return df_filtered

# For Example:
# date = "24/06/2020"
# HomeTeam = "Newcastle"
# AwayTeam = "Aston Villa"

# An example to see what the function does:
df_epl_train_filtered_Home = filter_dataframe_by_hometeam_recent_season(df_epl_train, "24/06/2020", "Newcastle")
df_epl_train_filtered_Away = filter_dataframe_by_awayteam_recent_season(df_epl_train, "24/06/2020", "Aston Villa")
print(df_epl_train_filtered_Home)
print(df_epl_train_filtered_Away)

  return self._cmp_method(other, operator.lt)


AttributeError: 'str' object has no attribute 'month'

In [88]:
# This function takes as input a filtered dataframe from previous cell, features to average and a dictionary,
# it then appends an average of each feature to the dictionary

def average_columns(features, avg_features, filtered_df):
    for feature in features:
        df_col_means = filtered_df[feature].mean()
        avg_features[feature].append(df_col_means)

In [89]:
# Run the two functions for each row of our df_epl_train dataframe to fill dictionary with AVG for each match
# NOTE: Some matches won't have past stats since its the first game of the season or we might not have past data

features = ["FTHG","FTAG","HTHG","HTAG","HS","AS","HST","AST","HF","AF","HC","AC","HY","AY","HR","AR"]
avg_features_HOME = {
                        "FTHG": [],
                        "FTAG": [],
                        "HTHG": [],
                        "HTAG": [],
                        "HS"  : [],
                        "AS"  : [],
                        "HST" : [],
                        "AST" : [],
                        "HF"  : [],
                        "AF"  : [],
                        "HC"  : [],
                        "AC"  : [],
                        "HY"  : [],
                        "AY"  : [],
                        "HR"  : [],
                        "AR"  : []
                    }

avg_features_AWAY = {
                        "FTHG": [],
                        "FTAG": [],
                        "HTHG": [],
                        "HTAG": [],
                        "HS"  : [],
                        "AS"  : [],
                        "HST" : [],
                        "AST" : [],
                        "HF"  : [],
                        "AF"  : [],
                        "HC"  : [],
                        "AC"  : [],
                        "HY"  : [],
                        "AY"  : [],
                        "HR"  : [],
                        "AR"  : []
                    }

# Run the two functions on each row of the df_epl_train and fill the dictionary
# We need to do this for the HOME filtered dataframe, and AWAY filtered dataframe

# For each row in our original dataframe
for index, row in df_epl_train.iterrows():
    # Filter the dataframe to only show matches played between those teams and before the certain date
    df_epl_train_filtered_Home = filter_dataframe_by_hometeam_recent_season(df_epl_train, row["Date"],row["HomeTeam"])
    df_epl_train_filtered_Away = filter_dataframe_by_awayteam_recent_season(df_epl_train, row["Date"],row["AwayTeam"])
    # Get averages from the filtered dataframe and add the the dictionary
    average_columns(features, avg_features_HOME, df_epl_train_filtered_Home)
    average_columns(features, avg_features_AWAY, df_epl_train_filtered_Away)
    
# Check this is correct
# print(avg_features_HOME)
# print(avg_features_AWAY)

TypeError: integer argument expected, got float

In [None]:
# Add a column for each of these feature averages using the list of values from the dictionary
df_epl_train_updated = df_epl_train.copy()
features = ["FTHG","FTAG","HTHG","HTAG","HS","AS","HST","AST","HF","AF","HC","AC","HY","AY","HR","AR"]

for feature in features:
    # Get the list of averages for a certain feature from the dicitonary
    feature_vals_HOME = avg_features_HOME[feature]
    feature_vals_AWAY = avg_features_AWAY[feature]
    # Add the list of averages into the dataframe for that certain feature
    df_epl_train_updated[feature + "_AVG_Home"] = feature_vals_HOME
    df_epl_train_updated[feature + "_AVG_Away"] = feature_vals_AWAY

In [None]:
# Now this dataframe contains our original data + the average of the past stats (Home & Away) for each row
df_epl_train_updated

In [None]:
# Now we loop through the updated dataframe and for each row average the HomeAvgStats and AwayAvgStats using 
# weights to give higher importance to some specific stats. This results in creating and adding two final columns.

features = ["FTHG","FTAG","HTHG","HTAG","HS","AS","HST","AST","HF","AF","HC","AC","HY","AY","HR","AR"]
feature_weights = {
                        "FTHG": 0,
                        "FTAG": 0,
                        "HTHG": 0,
                        "HTAG": 0,
                        "HS"  : 2,
                        "AS"  : 2,
                        "HST" : 2.5,
                        "AST" : 2.5,
                        "HF"  : 1.5,
                        "AF"  : 1.5,
                        "HC"  : 0.5,
                        "AC"  : 0.5,
                        "HY"  : 0.5,
                        "AY"  : 0.5,
                        "HR"  : 0.5,
                        "AR"  : 0.5
                    }
    
Home_Weighted_Avg = []
Away_Weighted_Avg = []

for index, row in df_epl_train_updated.iterrows():
    home_vals = []
    away_vals = []
    for feature in features:
        current_feature_HOME = feature + "_AVG_Home"
        current_feature_AWAY = feature + "_AVG_Away"
        current_val_HOME = row[current_feature_HOME]*feature_weights[feature]
        current_val_AWAY = row[current_feature_AWAY]*feature_weights[feature]
        home_vals.append(current_val_HOME)
        away_vals.append(current_val_AWAY)
    Home_Weighted_Avg.append(sum(home_vals)/len(home_vals))
    Away_Weighted_Avg.append(sum(away_vals)/len(away_vals))

df_epl_train_final = df_epl_train.copy()
df_epl_train_final["HomeTeam_Rating"] = Home_Weighted_Avg
df_epl_train_final["AwayTeam_Rating"] = Away_Weighted_Avg

# Check if its working correctly
df_epl_train_final

In [None]:
# Turn the catergorical data into labels using same method from before
df_epl_train_final["AwayTeam_Enc"] = df_epl_train_final["AwayTeam"].astype("category").cat.codes
df_epl_train_final["HomeTeam_Enc"] = df_epl_train_final["HomeTeam"].astype("category").cat.codes

# Transform the date column into day and month columns and Add into dataframe (Extract days & months from date)
df_epl_train_final["Date"] = pd.to_datetime(df_epl_train_final["Date"])
df_epl_train_final["Day"] = df_epl_train_final["Date"].dt.day
df_epl_train_final["Month"] = df_epl_train_final["Date"].dt.month 
df_epl_train_final["Year"] = df_epl_train_final["Date"].dt.year

# Add average values from updated dataframe
df_epl_train_final["FTHG_AVG"] = df_epl_train_updated["FTHG_AVG_Home"]
df_epl_train_final["FTAG_AVG"] = df_epl_train_updated["FTAG_AVG_Away"]
df_epl_train_final["HTHG_AVG"] = df_epl_train_updated["HTHG_AVG_Home"]
df_epl_train_final["HTAG_AVG"] = df_epl_train_updated["HTAG_AVG_Away"]
df_epl_train_final["HS_AVG"] = df_epl_train_updated["HS_AVG_Home"]
df_epl_train_final["AS_AVG"] = df_epl_train_updated["AS_AVG_Away"]
df_epl_train_final["HST_AVG"] = df_epl_train_updated["HST_AVG_Home"]
df_epl_train_final["AST_AVG"] = df_epl_train_updated["AST_AVG_Away"]
df_epl_train_final["HF_AVG"] = df_epl_train_updated["HF_AVG_Home"]
df_epl_train_final["AF_AVG"] = df_epl_train_updated["AF_AVG_Away"]
df_epl_train_final["HC_AVG"] = df_epl_train_updated["HC_AVG_Home"]
df_epl_train_final["AC_AVG"] = df_epl_train_updated["AC_AVG_Away"]
df_epl_train_final["HY_AVG"] = df_epl_train_updated["HY_AVG_Home"]
df_epl_train_final["AY_AVG"] = df_epl_train_updated["AY_AVG_Away"]
df_epl_train_final["HR_AVG"] = df_epl_train_updated["HR_AVG_Home"]
df_epl_train_final["AR_AVG"] = df_epl_train_updated["AR_AVG_Away"]

# Check the final updated dataframe
df_epl_train_final

In [None]:
# NOTE: We must remove the rows in the dataframe where the average values of stats/features are 'nan';
# we get these values because either its the teams first game of the season OR because we have
# not got the past stats for these matches. We cannot use the 'nan' values for the classifier training and 
# hence have to remove these rows. We can then train a classifier using this final dataframe.

# In the final model/classifier, in the case where we DO NOT have these past stats of the teams playing, 
# we need to switch back to using the OLD classifier which only took the 4 basic fetaures: day, month, 
# HomeTeam and AwayTeam.

# In the case where we DO have these past stats for the teams, we can use this model/classifier 
# and input the features like HST_AVG. We would find these by using the filter_dataframe() and 
# average_columns() functions to find them for any two specific teams playing each other on some date.

# Remove any rows with nan
df_epl_train_final = df_epl_train_final.dropna()
df_epl_train_final

In [None]:
# Try either using polynomial regression, NN or more features, the MSE is too high with linear regression and current features

# Here we aim to create a 'expected or predicted goals for a HomeTeam' feature based upon the HomeTeamRating, and also the past wins, losses and draws, our outputs will be full time goals(FTHG).
X2 = df_epl_train_final.loc[:,['Day', 'Month', 'HomeTeam_Enc', 'FTHG_AVG', 'HTHG_AVG', 'HS_AVG', 'HomeTeam_Rating']].values
y2 = df_epl_train_final.loc[:,'FTHG'].values
# Split the data for testing
X2_train, X2_test, y2_train, y2_test = model_selection.train_test_split(X2, y2, test_size=0.2, random_state=20)
# Here we use a linear regression classifier to predict the goals BUT could use a polynomial regression classifier:
# Create an empty linear regression model
LR_Model_HOME_EG = LinearRegression()
# Fit the model using training data
LR_Model_HOME_EG.fit(X2_train, y2_train)
# Make predictions using the model we have created
LR_H_predictions_test = LR_Model_HOME_EG.predict(X2_test)


# Similar idea for AwayTeam
X3 = df_epl_train_final.loc[:,['Day', 'Month', 'AwayTeam_Enc', 'FTAG_AVG', 'HTAG_AVG', 'AS_AVG', 'AwayTeam_Rating']].values
y3 = df_epl_train_final.loc[:,'FTAG'].values
# Split the data for testing
X3_train, X3_test, y3_train, y3_test = model_selection.train_test_split(X3, y3, test_size=0.2, random_state=20)
LR_Model_AWAY_EG = LinearRegression()
# Fit the model using training data
LR_Model_AWAY_EG.fit(X3_train, y3_train)
# Make predictions using the model we have created
LR_A_predictions_test = LR_Model_AWAY_EG.predict(X3_test)


# Check the mean square error(MSE) for HomeTeam Expected Goals
print(mean_squared_error(LR_H_predictions_test, y2_test))
# Check the mean square error(MSE) for AwayTeam Expected Goals
print(mean_squared_error(LR_A_predictions_test, y3_test))

In [None]:
# Using the two regression classfiers above, predict the number of goals that the Home and Away teams will hit for each row in the dataframe:
HomeExGoals = []
AwayExGoals = []
# For each row, predict the home and away expected goals
for index, row in df_epl_train_final.iterrows():
    X_Home_features = np.array([[row["Day"],row["Month"],row["HomeTeam_Enc"],row["FTHG_AVG"],row["HTHG_AVG"],row["HS_AVG"],row["HomeTeam_Rating"]]])
    X_Away_features = np.array([[row["Day"],row["Month"],row["AwayTeam_Enc"],row["FTAG_AVG"],row["HTAG_AVG"],row["AS_AVG"],row["AwayTeam_Rating"]]])
    # Note the prediction is a 1 by 1 vector
    ex_home_goals = LR_Model_HOME_EG.predict(X_Home_features)[0]
    ex_away_goals = LR_Model_AWAY_EG.predict(X_Away_features)[0]
    HomeExGoals.append(ex_home_goals)
    AwayExGoals.append(ex_away_goals)

# Add this data into the final dataframe
df_epl_train_final["Ex_Goals_Home"] = HomeExGoals
df_epl_train_final["Ex_Goals_Away"] = AwayExGoals

# Check the final dataframe
df_epl_train_final

In [None]:
# Create the input features matrix X
# Create the output values y vector (made of FTR)
# Take these values from the transformed dataframe

# Different OPTIONS for features to use in the design matrix:

# OPTION 1 - features are Day, Month, Home/Away Team, Current/Past Season Goal Averages, HomeTeam Rating, AwayTeam Rating
# X = df_epl_train_final.loc[:,['Day', 'Month', 'HomeTeam_Enc', 'AwayTeam_Enc', 'FTHG_AVG', 'FTAG_AVG', 'HTHG_AVG', 'HTAG_AVG', 'HomeTeam_Rating' ,'AwayTeam_Rating']].values

# OPTION 2 - Same as OPTION 1 but also adds expected goals for each team
# X = df_epl_train_final.loc[:,['Day', 'Month', 'HomeTeam_Enc', 'AwayTeam_Enc', 'FTHG_AVG', 'FTAG_AVG', 'HTHG_AVG', 'HTAG_AVG', 'HomeTeam_Rating' ,'AwayTeam_Rating', 'Ex_Goals_Home', 'Ex_Goals_Away']].values

# OPTION 3 - All Features
# X = df_epl_train_final.loc[:,['Day', 'Month', 'HomeTeam_Enc', 'AwayTeam_Enc','FTHG_AVG','FTAG_AVG','HTHG_AVG','HTAG_AVG','HS_AVG','AS_AVG','HST_AVG','AST_AVG','HF_AVG','AF_AVG','HC_AVG','AC_AVG','HY_AVG','AY_AVG','HR_AVG','AR_AVG','HomeTeam_Rating','AwayTeam_Rating','Ex_Goals_Home','Ex_Goals_Away']].values

# OPTION 4 - OPTION 3 without Yellow & Red Cards
X = df_epl_train_final.loc[:,['Day', 'Month','HomeTeam_Enc', 'AwayTeam_Enc','FTHG_AVG','FTAG_AVG','HTHG_AVG','HTAG_AVG','HS_AVG','AS_AVG','HST_AVG','AST_AVG','HF_AVG','AF_AVG','HC_AVG','AC_AVG','HomeTeam_Rating','AwayTeam_Rating','Ex_Goals_Home','Ex_Goals_Away']].values

# The output value
y = df_epl_train_final.loc[:,'FTR'].values

# Split the training data in a 80-20 split
X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, test_size=0.2, random_state=22)

# Encode the y output values as well
FTR_encoder = LabelEncoder()
y_train = FTR_encoder.fit_transform(y_train);

In [None]:
# Create an empty Tree model
DT_Model = DecisionTreeClassifier(random_state=42)
# Fit the model using training data
DT_Model.fit(X_train, y_train)
# Make predictions using the model we have created
DT_predictions_test = DT_Model.predict(X_test)
# Reconverting prediction values (i.e. 0, 1 or 2) back into (H, D or A) using the FTR_encoder defined in earlier cell
DT_predictions_test = FTR_encoder.inverse_transform(DT_predictions_test)

print(accuracy_score(DT_predictions_test, y_test))
print(classification_report(DT_predictions_test, y_test))

In [None]:
# Create an empty KNN model
KNN_Model = KNeighborsClassifier(n_neighbors=50)
# Fit the model using training data
KNN_Model.fit(X_train, y_train)
# Make predictions using the model we have created
KNN_predictions_test = KNN_Model.predict(X_test)
KNN_predictions_test = FTR_encoder.inverse_transform(KNN_predictions_test)

print(accuracy_score(KNN_predictions_test, y_test))
print(classification_report(KNN_predictions_test, y_test))

In [None]:
# Create an empty Random Forest model
RF_Model = RandomForestClassifier(n_estimators=200, random_state=42)
# Fit the model using training data
RF_Model.fit(X_train, y_train)
# Make predictions using the model we have created
RF_predictions_test = RF_Model.predict(X_test)
RF_predictions_test = FTR_encoder.inverse_transform(RF_predictions_test)

print(accuracy_score(RF_predictions_test, y_test))
print(classification_report(RF_predictions_test, y_test))

In [None]:
FTR_encoder = LabelEncoder()
y_test = FTR_encoder.fit_transform(y_test);
plot_confusion_matrix(DT_Model, X_test, y_test)
plot_confusion_matrix(KNN_Model, X_test, y_test)
plot_confusion_matrix(RF_Model, X_test, y_test)