In [226]:
# ============================================================
# Notebook setup
# ============================================================

%load_ext autoreload
%autoreload 2

figsize=(14, 4)

from utils import DataAggregator
from sklearn.calibration import LabelEncoder
import pandas as pd
import numpy as np

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Feature engineering

In order to more accurately predict the games, we now want to perform some feature engineering in order to create new variables that can be used for the games. But first of all, lets again take a look at the dataset we have available.

In [227]:
data_aggregator = DataAggregator()
df = data_aggregator.get_data(["E0"])

In [228]:
df.columns

Index(['Div', 'Date', 'HomeTeam', 'AwayTeam', 'FTHG', 'FTAG', 'FTR', 'HTHG',
       'HTAG', 'HTR', 'Referee', 'HS', 'AS', 'HST', 'AST', 'HF', 'AF', 'HC',
       'AC', 'HY', 'AY', 'HR', 'AR', 'B365H', 'B365D', 'B365A'],
      dtype='object')

In [229]:
df.head()

Unnamed: 0,Div,Date,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,HTR,...,AF,HC,AC,HY,AY,HR,AR,B365H,B365D,B365A
0,E0,15/08/09,Aston Villa,Wigan,0,2,A,0,1,A,...,14,4,6,2,2,0,0,1.67,3.6,5.5
1,E0,15/08/09,Blackburn,Man City,0,2,A,0,1,A,...,9,5,4,2,1,0,0,3.6,3.25,2.1
2,E0,15/08/09,Bolton,Sunderland,0,1,A,0,1,A,...,10,4,7,2,1,0,0,2.25,3.25,3.25
3,E0,15/08/09,Chelsea,Hull,2,1,H,1,1,D,...,15,12,4,1,2,0,0,1.17,6.5,21.0
4,E0,15/08/09,Everton,Arsenal,1,6,A,0,3,A,...,13,4,9,0,0,0,0,3.2,3.25,2.3


We know that if we want to use this data to do predictions using e.g., classifiers, we need to convert the data to a numerical format. Lets start by making the date numerical.

In [230]:
df = data_aggregator.format_date(df, "Date")

The format date function extracts the year, month, day and day of week from the original dataframe and ensures it is sorted by the oldest dates first.

In [231]:
df.head()

Unnamed: 0,Div,Date,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,HTR,...,AY,HR,AR,B365H,B365D,B365A,Year,Month,Day,DayOfWeek
0,E0,2002-08-17,Blackburn,Sunderland,0,0,D,0,0,D,...,2,0,0,1.727,3.25,4.333,2002,8,17,5
1,E0,2002-08-17,Southampton,Middlesbrough,0,0,D,0,0,D,...,0,0,0,2.25,3.25,2.75,2002,8,17,5
2,E0,2002-08-17,Man United,West Brom,1,0,H,0,0,D,...,1,0,1,1.2,5.0,12.0,2002,8,17,5
3,E0,2002-08-17,Charlton,Chelsea,2,3,A,2,1,H,...,3,1,0,2.8,3.25,2.2,2002,8,17,5
4,E0,2002-08-17,Fulham,Bolton,4,1,H,3,1,H,...,2,0,0,1.727,3.25,4.333,2002,8,17,5


Furthermore, we want encode the result in a numerical format. We will use the following mapping:
- ``H``-> 1
- ``D``-> 0
- ``A``> -1

In [232]:
df = data_aggregator.encode_result(df, 
                                   mapping={"H": 1, "D": 0, "A": -1}, 
                                   result_column="FTR")

In [233]:
df.head()

Unnamed: 0,Div,Date,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,HTR,...,AY,HR,AR,B365H,B365D,B365A,Year,Month,Day,DayOfWeek
0,E0,2002-08-17,Blackburn,Sunderland,0,0,0,0,0,D,...,2,0,0,1.727,3.25,4.333,2002,8,17,5
1,E0,2002-08-17,Southampton,Middlesbrough,0,0,0,0,0,D,...,0,0,0,2.25,3.25,2.75,2002,8,17,5
2,E0,2002-08-17,Man United,West Brom,1,0,1,0,0,D,...,1,0,1,1.2,5.0,12.0,2002,8,17,5
3,E0,2002-08-17,Charlton,Chelsea,2,3,-1,2,1,H,...,3,1,0,2.8,3.25,2.2,2002,8,17,5
4,E0,2002-08-17,Fulham,Bolton,4,1,1,3,1,H,...,2,0,0,1.727,3.25,4.333,2002,8,17,5


Next up, we also want to encode some form data for the teams as well (as in; has the team played well recently?). We want to encode the form each team has had for the last 10 games, meaning how well they have performed recently. This can for example be done using the average result of the previous 10 games a team has played.

In [234]:
form_window = 10

df['ShiftedFTR_Home'] = df.groupby('HomeTeam')['FTR'].shift(1)
df['ShiftedFTR_Away'] = df.groupby('AwayTeam')['FTR'].shift(1)

home_team_win_form = df.groupby('HomeTeam')['ShiftedFTR_Home'].rolling(window=form_window).mean().reset_index(0, drop=True)
away_team_win_form = df.groupby('AwayTeam')['ShiftedFTR_Away'].rolling(window=form_window).mean().reset_index(0, drop=True)

df = pd.concat([df, home_team_win_form.rename('HomeTeamWinForm'), away_team_win_form.rename('AwayTeamWinForm')], axis=1)

df = df.drop(columns=['ShiftedFTR_Home', 'ShiftedFTR_Away'])

In [235]:
df[["HomeTeam", "AwayTeam", "FTR", "HomeTeamWinForm", "AwayTeamWinForm"]]

Unnamed: 0,HomeTeam,AwayTeam,FTR,HomeTeamWinForm,AwayTeamWinForm
0,Blackburn,Sunderland,0,,
1,Southampton,Middlesbrough,0,,
2,Man United,West Brom,1,,
3,Charlton,Chelsea,-1,,
4,Fulham,Bolton,1,,
...,...,...,...,...,...
8435,Ipswich,Everton,-1,,0.6
8436,Bournemouth,Arsenal,1,0.3,-0.8
8437,Liverpool,Chelsea,1,0.5,-0.4
8438,Wolves,Man City,-1,-0.4,-0.8


Here, we look at a teams 10 previous games, either at home or away, and calculate the average result. However, we believe it would make sense if for example the last game played by a team would indicate their form more accurately than the game they played 5 games prior. By doing some testing, we also found this to be true. Because of this, we add a weighting to the calculations of a teams form, so that the more recent a game is, the more it affects the calculation of a teams form.

We do this using this formula:
$$\frac{np.dot(x,weights)}{weights.sum()}$$

And calculate the weights like this:
$$weights = np.arange(1, form\_window + 1)$$


In [236]:
form_window = 10
weights = np.arange(1, form_window + 1)

df.drop(columns=['HomeTeamWinForm', 'AwayTeamWinForm'], inplace=True)

df['ShiftedFTR_Home'] = df.groupby('HomeTeam')['FTR'].shift(1)
df['ShiftedFTR_Away'] = df.groupby('AwayTeam')['FTR'].shift(1)

home_team_win_form = df.groupby('HomeTeam')['ShiftedFTR_Home'].rolling(window=form_window).apply(lambda x: np.dot(x, weights) / weights.sum(), raw=True).reset_index(0, drop=True)
away_team_win_form = df.groupby('AwayTeam')['ShiftedFTR_Away'].rolling(window=form_window).apply(lambda x: np.dot(x, weights) / weights.sum(), raw=True).reset_index(0, drop=True)

df = pd.concat([df, home_team_win_form.rename('HomeTeamWinForm'), away_team_win_form.rename('AwayTeamWinForm')], axis=1)


df = df.drop(columns=['ShiftedFTR_Home', 'ShiftedFTR_Away'])

In [237]:
df[["HomeTeam", "AwayTeam", "FTR", "HomeTeamWinForm", "AwayTeamWinForm"]]

Unnamed: 0,HomeTeam,AwayTeam,FTR,HomeTeamWinForm,AwayTeamWinForm
0,Blackburn,Sunderland,0,,
1,Southampton,Middlesbrough,0,,
2,Man United,West Brom,1,,
3,Charlton,Chelsea,-1,,
4,Fulham,Bolton,1,,
...,...,...,...,...,...
8435,Ipswich,Everton,-1,,0.618182
8436,Bournemouth,Arsenal,1,0.163636,-0.763636
8437,Liverpool,Chelsea,1,0.454545,-0.654545
8438,Wolves,Man City,-1,-0.672727,-0.781818


We can see from the DataFrame above that we now have some missing data. We need to consider how to fill them. Should we just remove them, or should we instead fill them with a selected value?

We have decided to subsitute the ``NaN``values with the average of each teams win form.

NOTE: We tried both options, but there was very little difference in final accuracy by the different models.

In [238]:
df["HomeTeamWinForm"] = df["HomeTeamWinForm"].fillna(df.groupby("HomeTeam")["HomeTeamWinForm"].transform("mean"))
df["AwayTeamWinForm"] = df["AwayTeamWinForm"].fillna(df.groupby("AwayTeam")["AwayTeamWinForm"].transform("mean"))
df.dropna(subset=["HomeTeamWinForm", "AwayTeamWinForm"], inplace=True)
df[["HomeTeam", "AwayTeam", "FTR", "HomeTeamWinForm", "AwayTeamWinForm"]]

Unnamed: 0,HomeTeam,AwayTeam,FTR,HomeTeamWinForm,AwayTeamWinForm
0,Blackburn,Sunderland,0,0.112222,0.430525
1,Southampton,Middlesbrough,0,0.011608,0.359539
2,Man United,West Brom,1,0.558561,0.362332
3,Charlton,Chelsea,-1,0.060535,-0.258561
4,Fulham,Bolton,1,0.060769,0.288788
...,...,...,...,...,...
8434,Tottenham,West Ham,1,0.200000,0.145455
8436,Bournemouth,Arsenal,1,0.163636,-0.763636
8437,Liverpool,Chelsea,1,0.454545,-0.654545
8438,Wolves,Man City,-1,-0.672727,-0.781818


We can also have a look at the average points per game (PPG) a team has.

In [239]:
# Map the 'FTR' values to points for the home and away teams
df['HomePoints'] = df['FTR'].map({1: 3, 0: 1, -1: 0})
df['AwayPoints'] = df['FTR'].map({1: 0, 0: 1, -1: 3})

# Shift the points to calculate rolling averages for past games
df['ShiftedHomePoints'] = df.groupby('HomeTeam')['HomePoints'].shift(1)
df['ShiftedAwayPoints'] = df.groupby('AwayTeam')['AwayPoints'].shift(1)

# Calculate rolling points per game
home_team_ppg = df.groupby('HomeTeam')['ShiftedHomePoints'].rolling(window=form_window).mean().reset_index(0, drop=True)
away_team_ppg = df.groupby('AwayTeam')['ShiftedAwayPoints'].rolling(window=form_window).mean().reset_index(0, drop=True)

# Add the new PPG columns to the DataFrame
df = pd.concat([df, home_team_ppg.rename('HomeTeamPPG'), away_team_ppg.rename('AwayTeamPPG')], axis=1)

# Drop intermediate columns
df = df.drop(columns=['HomePoints', 'AwayPoints', 'ShiftedHomePoints', 'ShiftedAwayPoints'])

Yet again, we decide to fill the null values with the average PPG for each team.

In [240]:
df["HomeTeamPPG"] = df["HomeTeamPPG"].fillna(df.groupby("HomeTeam")["HomeTeamPPG"].transform("mean"))
df["AwayTeamPPG"] = df["AwayTeamPPG"].fillna(df.groupby("AwayTeam")["AwayTeamPPG"].transform("mean"))

In [241]:
df[["HomeTeam", "AwayTeam", "FTR", "HomeTeamPPG", "AwayTeamPPG"]]

Unnamed: 0,HomeTeam,AwayTeam,FTR,HomeTeamPPG,AwayTeamPPG
0,Blackburn,Sunderland,0,1.527222,0.731651
1,Southampton,Middlesbrough,0,1.381467,0.820423
2,Man United,West Brom,1,2.249272,0.804219
3,Charlton,Chelsea,-1,1.467059,1.785437
4,Fulham,Bolton,1,1.489933,0.956667
...,...,...,...,...,...
8434,Tottenham,West Ham,1,1.800000,1.100000
8436,Bournemouth,Arsenal,1,1.800000,2.600000
8437,Liverpool,Chelsea,1,2.200000,1.900000
8438,Wolves,Man City,-1,0.900000,2.600000


Next up, we also want to consider the goal form of each team. Lets create a column HomeTeamGoalForm and AwayTeamGoalForm. Here we also apply weighting, just like with win form.

In [242]:
df['ShiftedFTHG_Home'] = df.groupby('HomeTeam')['FTHG'].shift(1)
df['ShiftedFTAG_Away'] = df.groupby('AwayTeam')['FTAG'].shift(1)

home_team_goal_form = df.groupby('HomeTeam')['ShiftedFTHG_Home'].rolling(window=form_window).apply(lambda x: np.dot(x, weights) / weights.sum(), raw=True).reset_index(0, drop=True)
away_team_goal_form = df.groupby('AwayTeam')['ShiftedFTAG_Away'].rolling(window=form_window).apply(lambda x: np.dot(x, weights) / weights.sum(), raw=True).reset_index(0, drop=True)

df = pd.concat([df, home_team_goal_form.rename('HomeTeamGoalForm'), away_team_goal_form.rename('AwayTeamGoalForm')], axis=1)

df = df.drop(columns=['ShiftedFTHG_Home', 'ShiftedFTAG_Away'])

In [243]:
df[["HomeTeam", "AwayTeam", "FTR", "HomeTeamGoalForm", "AwayTeamGoalForm"]]

Unnamed: 0,HomeTeam,AwayTeam,FTR,HomeTeamGoalForm,AwayTeamGoalForm
0,Blackburn,Sunderland,0,,
1,Southampton,Middlesbrough,0,,
2,Man United,West Brom,1,,
3,Charlton,Chelsea,-1,,
4,Fulham,Bolton,1,,
...,...,...,...,...,...
8434,Tottenham,West Ham,1,1.981818,1.363636
8436,Bournemouth,Arsenal,1,1.690909,1.981818
8437,Liverpool,Chelsea,1,1.963636,2.545455
8438,Wolves,Man City,-1,1.127273,2.400000


Again we need to fill in the missing values. For this we will again use the average of each teams goal scoring form.

In [244]:
df["HomeTeamGoalForm"] = df["HomeTeamGoalForm"].fillna(df.groupby("HomeTeam")["HomeTeamGoalForm"].transform("mean"))
df["AwayTeamGoalForm"] = df["AwayTeamGoalForm"].fillna(df.groupby("AwayTeam")["AwayTeamGoalForm"].transform("mean"))
df[["HomeTeam", "AwayTeam", "FTR", "HomeTeamGoalForm", "AwayTeamGoalForm"]]

Unnamed: 0,HomeTeam,AwayTeam,FTR,HomeTeamGoalForm,AwayTeamGoalForm
0,Blackburn,Sunderland,0,1.347879,0.887406
1,Southampton,Middlesbrough,0,1.382380,0.849424
2,Man United,West Brom,1,2.043601,0.867971
3,Charlton,Chelsea,-1,1.351872,1.607855
4,Fulham,Bolton,1,1.384808,1.003131
...,...,...,...,...,...
8434,Tottenham,West Ham,1,1.981818,1.363636
8436,Bournemouth,Arsenal,1,1.690909,1.981818
8437,Liverpool,Chelsea,1,1.963636,2.545455
8438,Wolves,Man City,-1,1.127273,2.400000


Next up, we would also like to model the goals conceded for each team. We apply weighting here too.

In [245]:
df['ShiftedFTAG_Home'] = df.groupby('HomeTeam')['FTAG'].shift(1)
df['ShiftedFTHG_Away'] = df.groupby('AwayTeam')['FTHG'].shift(1)

home_team_goal_against_form = df.groupby('HomeTeam')['FTAG'].rolling(window=form_window).apply(lambda x: np.dot(x, weights) / weights.sum(), raw=True).reset_index(0, drop=True)
away_team_goal_against_form = df.groupby('AwayTeam')['FTHG'].rolling(window=form_window).apply(lambda x: np.dot(x, weights) / weights.sum(), raw=True).reset_index(0, drop=True)

df = pd.concat([df, home_team_goal_against_form.rename('HomeTeamGoalAgainstForm'), away_team_goal_against_form.rename('AwayTeamGoalAgainstForm')], axis=1)

df = df.drop(columns=['ShiftedFTAG_Home', 'ShiftedFTHG_Away'])

In [246]:
df[["HomeTeam", "AwayTeam", "FTR", "HomeTeamGoalAgainstForm", "AwayTeamGoalAgainstForm"]]

Unnamed: 0,HomeTeam,AwayTeam,FTR,HomeTeamGoalAgainstForm,AwayTeamGoalAgainstForm
0,Blackburn,Sunderland,0,,
1,Southampton,Middlesbrough,0,,
2,Man United,West Brom,1,,
3,Charlton,Chelsea,-1,,
4,Fulham,Bolton,1,,
...,...,...,...,...,...
8434,Tottenham,West Ham,1,1.127273,2.400000
8436,Bournemouth,Arsenal,1,0.890909,0.872727
8437,Liverpool,Chelsea,1,0.690909,1.436364
8438,Wolves,Man City,-1,2.436364,0.581818


Yet again, we need to fill in the missing values.

In [247]:
df["HomeTeamGoalAgainstForm"] = df["HomeTeamGoalAgainstForm"].fillna(df.groupby("HomeTeam")["HomeTeamGoalAgainstForm"].transform("mean"))
df["AwayTeamGoalAgainstForm"] = df["AwayTeamGoalAgainstForm"].fillna(df.groupby("AwayTeam")["AwayTeamGoalAgainstForm"].transform("mean"))
df[["HomeTeam", "AwayTeam", "FTR", "HomeTeamGoalAgainstForm", "AwayTeamGoalAgainstForm"]]

Unnamed: 0,HomeTeam,AwayTeam,FTR,HomeTeamGoalAgainstForm,AwayTeamGoalAgainstForm
0,Blackburn,Sunderland,0,1.146861,1.753840
1,Southampton,Middlesbrough,0,1.303706,1.500699
2,Man United,West Brom,1,0.812767,1.647594
3,Charlton,Chelsea,-1,1.347992,1.056879
4,Fulham,Bolton,1,1.320888,1.674837
...,...,...,...,...,...
8434,Tottenham,West Ham,1,1.127273,2.400000
8436,Bournemouth,Arsenal,1,0.890909,0.872727
8437,Liverpool,Chelsea,1,0.690909,1.436364
8438,Wolves,Man City,-1,2.436364,0.581818


Instead of having to manually do this again, we have created a function in the ``DataAggregator``class that we can run to generate all form data. This is called ``create_form_data``.

Lets now have a look at what we have done so far:

In [248]:
df = data_aggregator.get_data(["E0"])
df = data_aggregator.format_date(df, "Date")
df = data_aggregator.encode_result(df,
                                   mapping={"H": 1, "D": 0, "A": -1}, 
                                   result_column="FTR")
df = data_aggregator.create_form_data(df, form_window=5)

In [249]:
df

Unnamed: 0,Div,Date,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,HTR,...,Day,DayOfWeek,HomeTeamPPG,AwayTeamPPG,HomeTeamWinForm,AwayTeamWinForm,HomeTeamGoalForm,AwayTeamGoalForm,HomeTeamGoalAgainstForm,AwayTeamGoalAgainstForm
0,E0,2002-08-17,Blackburn,Sunderland,0,0,0,0,0,D,...,17,5,1.524324,0.729148,0.108829,0.431988,1.354595,0.880717,1.151351,1.748281
1,E0,2002-08-17,Southampton,Middlesbrough,0,0,0,0,0,D,...,17,5,1.374340,0.805442,0.015094,0.371882,1.383648,0.836735,1.301384,1.493878
2,E0,2002-08-17,Man United,West Brom,1,0,1,0,0,D,...,17,5,2.245564,0.800000,0.557154,0.366667,2.039648,0.867769,0.813110,1.648209
3,E0,2002-08-17,Charlton,Chelsea,2,3,-1,2,1,H,...,17,5,1.475556,1.785612,0.071111,-0.259313,1.353333,1.608793,1.332593,1.054357
4,E0,2002-08-17,Fulham,Bolton,4,1,1,3,1,H,...,17,5,1.487129,0.951351,0.059846,0.292973,1.378658,1.007207,1.316612,1.687207
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8435,E0,2024-10-19,Ipswich,Everton,0,2,-1,0,2,A,...,19,5,,0.400000,,0.600000,,1.066667,,2.266667
8436,E0,2024-10-19,Bournemouth,Arsenal,2,0,1,0,0,D,...,19,5,1.400000,2.600000,0.000000,-0.666667,1.533333,1.666667,1.066667,0.800000
8437,E0,2024-10-20,Liverpool,Chelsea,2,1,1,1,0,H,...,20,6,2.400000,3.000000,0.466667,-1.000000,1.933333,2.933333,0.400000,0.666667
8438,E0,2024-10-20,Wolves,Man City,1,2,-1,1,1,D,...,20,6,0.600000,2.600000,-0.866667,-0.666667,1.266667,2.066667,2.866667,0.600000


Lastly, in order to create models that we can use to predict the matches, we need to encode the team names. We can either one hot encode the teams with boolean values [Team Name]Home/AwayTeam or ordinal encode the teams with an index. We chose the second option.

In [250]:
new_df = df.copy()
le = LabelEncoder()
new_df["HomeTeam"] = le.fit_transform(df["HomeTeam"])
new_df["AwayTeam"] = le.fit_transform(df["AwayTeam"])

team_mapping = dict(zip(le.transform(le.classes_), le.classes_))

df = new_df
print(team_mapping)

{np.int64(0): 'Arsenal', np.int64(1): 'Aston Villa', np.int64(2): 'Birmingham', np.int64(3): 'Blackburn', np.int64(4): 'Blackpool', np.int64(5): 'Bolton', np.int64(6): 'Bournemouth', np.int64(7): 'Brentford', np.int64(8): 'Brighton', np.int64(9): 'Burnley', np.int64(10): 'Cardiff', np.int64(11): 'Charlton', np.int64(12): 'Chelsea', np.int64(13): 'Crystal Palace', np.int64(14): 'Derby', np.int64(15): 'Everton', np.int64(16): 'Fulham', np.int64(17): 'Huddersfield', np.int64(18): 'Hull', np.int64(19): 'Ipswich', np.int64(20): 'Leeds', np.int64(21): 'Leicester', np.int64(22): 'Liverpool', np.int64(23): 'Luton', np.int64(24): 'Man City', np.int64(25): 'Man United', np.int64(26): 'Middlesbrough', np.int64(27): 'Newcastle', np.int64(28): 'Norwich', np.int64(29): "Nott'm Forest", np.int64(30): 'Portsmouth', np.int64(31): 'QPR', np.int64(32): 'Reading', np.int64(33): 'Sheffield United', np.int64(34): 'Southampton', np.int64(35): 'Stoke', np.int64(36): 'Sunderland', np.int64(37): 'Swansea', np.i

In [251]:
df

Unnamed: 0,Div,Date,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,HTR,...,Day,DayOfWeek,HomeTeamPPG,AwayTeamPPG,HomeTeamWinForm,AwayTeamWinForm,HomeTeamGoalForm,AwayTeamGoalForm,HomeTeamGoalAgainstForm,AwayTeamGoalAgainstForm
0,E0,2002-08-17,3,36,0,0,0,0,0,D,...,17,5,1.524324,0.729148,0.108829,0.431988,1.354595,0.880717,1.151351,1.748281
1,E0,2002-08-17,34,26,0,0,0,0,0,D,...,17,5,1.374340,0.805442,0.015094,0.371882,1.383648,0.836735,1.301384,1.493878
2,E0,2002-08-17,25,40,1,0,1,0,0,D,...,17,5,2.245564,0.800000,0.557154,0.366667,2.039648,0.867769,0.813110,1.648209
3,E0,2002-08-17,11,12,2,3,-1,2,1,H,...,17,5,1.475556,1.785612,0.071111,-0.259313,1.353333,1.608793,1.332593,1.054357
4,E0,2002-08-17,16,5,4,1,1,3,1,H,...,17,5,1.487129,0.951351,0.059846,0.292973,1.378658,1.007207,1.316612,1.687207
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8435,E0,2024-10-19,19,15,0,2,-1,0,2,A,...,19,5,,0.400000,,0.600000,,1.066667,,2.266667
8436,E0,2024-10-19,6,0,2,0,1,0,0,D,...,19,5,1.400000,2.600000,0.000000,-0.666667,1.533333,1.666667,1.066667,0.800000
8437,E0,2024-10-20,22,12,2,1,1,1,0,H,...,20,6,2.400000,3.000000,0.466667,-1.000000,1.933333,2.933333,0.400000,0.666667
8438,E0,2024-10-20,43,24,1,2,-1,1,1,D,...,20,6,0.600000,2.600000,-0.866667,-0.666667,1.266667,2.066667,2.866667,0.600000


Now that we have completed our feature engineering, let us create a function ``preprocess_data`` in the DataAggregator class that we can use to create all this data. We will also filter out all columns that we wont be using to predict the matches.

In [252]:
df = data_aggregator.preprocess_data(data_aggregator.get_data(["E0"]), 
                                     date_column="Date",
                                     home_team_column="HomeTeam",
                                     away_team_column="AwayTeam",
                                     result_column="FTR",
                                     form_window=5)
df

Unnamed: 0,Date,HomeTeam,AwayTeam,FTR,B365H,B365D,B365A,Year,Month,Day,DayOfWeek,HomeTeamPPG,AwayTeamPPG,HomeTeamWinForm,AwayTeamWinForm,HomeTeamGoalForm,AwayTeamGoalForm,HomeTeamGoalAgainstForm,AwayTeamGoalAgainstForm
0,2002-08-17,3,36,0,1.727,3.25,4.333,2002,8,17,5,1.524324,0.729148,0.108829,0.431988,1.354595,0.880717,1.151351,1.748281
1,2002-08-17,34,26,0,2.250,3.25,2.750,2002,8,17,5,1.374340,0.805442,0.015094,0.371882,1.383648,0.836735,1.301384,1.493878
2,2002-08-17,25,40,1,1.200,5.00,12.000,2002,8,17,5,2.245564,0.800000,0.557154,0.366667,2.039648,0.867769,0.813110,1.648209
3,2002-08-17,11,12,-1,2.800,3.25,2.200,2002,8,17,5,1.475556,1.785612,0.071111,-0.259313,1.353333,1.608793,1.332593,1.054357
4,2002-08-17,16,5,1,1.727,3.25,4.333,2002,8,17,5,1.487129,0.951351,0.059846,0.292973,1.378658,1.007207,1.316612,1.687207
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8435,2024-10-19,19,15,-1,2.380,3.30,3.000,2024,10,19,5,,0.400000,,0.600000,,1.066667,,2.266667
8436,2024-10-19,6,0,1,5.000,3.90,1.650,2024,10,19,5,1.400000,2.600000,0.000000,-0.666667,1.533333,1.666667,1.066667,0.800000
8437,2024-10-20,22,12,1,1.620,4.10,5.000,2024,10,20,6,2.400000,3.000000,0.466667,-1.000000,1.933333,2.933333,0.400000,0.666667
8438,2024-10-20,43,24,-1,8.000,6.00,1.330,2024,10,20,6,0.600000,2.600000,-0.866667,-0.666667,1.266667,2.066667,2.866667,0.600000


Lastly, we will also create a new column GD, which will stand for goal difference, and will be the result of FTHG-FTAG. This is used for regression, which we will see later.

In [None]:
df = data_aggregator.get_data(["E0"])

df["GD"] = df["FTHG"] - df["FTAG"]
df

Unnamed: 0,Div,Date,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,HTR,...,HC,AC,HY,AY,HR,AR,B365H,B365D,B365A,GD
0,E0,15/08/09,Aston Villa,Wigan,0,2,A,0,1,A,...,4,6,2,2,0,0,1.67,3.60,5.50,-2
1,E0,15/08/09,Blackburn,Man City,0,2,A,0,1,A,...,5,4,2,1,0,0,3.60,3.25,2.10,-2
2,E0,15/08/09,Bolton,Sunderland,0,1,A,0,1,A,...,4,7,2,1,0,0,2.25,3.25,3.25,-1
3,E0,15/08/09,Chelsea,Hull,2,1,H,1,1,D,...,12,4,1,2,0,0,1.17,6.50,21.00,1
4,E0,15/08/09,Everton,Arsenal,1,6,A,0,3,A,...,4,9,0,0,0,0,3.20,3.25,2.30,-5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8435,E0,21/05/17,Liverpool,Middlesbrough,3,0,H,1,0,H,...,3,3,0,1,0,0,1.14,9.00,23.00,3
8436,E0,21/05/17,Man United,Crystal Palace,2,0,H,2,0,H,...,3,6,2,0,0,0,2.50,3.30,3.10,2
8437,E0,21/05/17,Southampton,Stoke,0,1,A,0,0,D,...,4,10,2,4,0,0,1.80,3.80,4.75,-1
8438,E0,21/05/17,Swansea,West Brom,2,1,H,0,1,A,...,7,4,1,1,0,0,2.10,3.50,3.75,1


This we will put into a new function in the ``DataAggregator``class, named ``create_gd_feature``.