## Reading the data in and checking no records are missing

In [1]:
# read the csv into a pandas dataframe
import pandas as pd

match_data = pd.read_csv('premier_league_data.csv', index_col=0)

match_data.head()

Unnamed: 0,date,time,comp,round,day,venue,result,gf,ga,opponent,...,match report,notes,sh,sot,dist,fk,pk,pkatt,season,team
1,2021-08-15,16:30,Premier League,Matchweek 1,Sun,Away,L,0,1,Tottenham,...,Match Report,,18.0,4.0,16.9,1.0,0.0,0.0,2022,Manchester City
2,2021-08-21,15:00,Premier League,Matchweek 2,Sat,Home,W,5,0,Norwich City,...,Match Report,,16.0,4.0,17.3,1.0,0.0,0.0,2022,Manchester City
3,2021-08-28,12:30,Premier League,Matchweek 3,Sat,Home,W,5,0,Arsenal,...,Match Report,,25.0,10.0,14.3,0.0,0.0,0.0,2022,Manchester City
4,2021-09-11,15:00,Premier League,Matchweek 4,Sat,Away,W,1,0,Leicester City,...,Match Report,,25.0,8.0,14.0,0.0,0.0,0.0,2022,Manchester City
6,2021-09-18,15:00,Premier League,Matchweek 5,Sat,Home,D,0,0,Southampton,...,Match Report,,16.0,1.0,15.7,1.0,0.0,0.0,2022,Manchester City


In [2]:
# each team plays 38 games a season, there are 20 teams and we have 3 seasons of data, 38 * 20 * 3 = 2280 games
# shape of the dataframe should be 2280
match_data.shape

(2280, 27)

In [3]:
# 38 * 3 = 114, each team should play 38, 76 or 114 games depending if they got relegated or promoted during these 3 seasons

match_data["team"].value_counts()

Manchester City             114
Wolverhampton Wanderers     114
Burnley                     114
Everton                     114
Southampton                 114
Aston Villa                 114
Liverpool                   114
Crystal Palace              114
Newcastle United            114
Brighton and Hove Albion    114
Leicester City              114
West Ham United             114
Manchester United           114
Arsenal                     114
Tottenham Hotspur           114
Chelsea                     114
Leeds United                 76
Watford                      76
Norwich City                 76
Sheffield United             76
West Bromwich Albion         38
Brentford                    38
Fulham                       38
Bournemouth                  38
Name: team, dtype: int64

In [4]:
match_data["opponent"].value_counts()

Tottenham          114
Crystal Palace     114
Manchester City    114
Newcastle Utd      114
Wolves             114
Aston Villa        114
Everton            114
Manchester Utd     114
West Ham           114
Brighton           114
Burnley            114
Liverpool          114
Chelsea            114
Southampton        114
Leicester City     114
Arsenal            114
Norwich City        76
Watford             76
Leeds United        76
Sheffield Utd       76
Brentford           38
Fulham              38
West Brom           38
Bournemouth         38
Name: opponent, dtype: int64

## Cleaning the data so that it is ready to be processed for machine learning purposes

In [5]:
# our machine learning algorithms can only work with numerical data, so we need to convert the categorical data or objects to numerical data
match_data.dtypes

date             object
time             object
comp             object
round            object
day              object
venue            object
result           object
gf                int64
ga                int64
opponent         object
xg              float64
xga             float64
poss            float64
attendance      float64
captain          object
formation        object
referee          object
match report     object
notes           float64
sh              float64
sot             float64
dist            float64
fk              float64
pk              float64
pkatt           float64
season            int64
team             object
dtype: object

In [6]:
# convert the date column to datetime format
match_data["date"] = pd.to_datetime(match_data["date"])

In [7]:
# create predictors to be fed into the model, any columns that are needed as predictors need to be converted to numerical data
# columns used: venue, opponent, time (hour), date (day)

match_data["venue_code"] = match_data["venue"].astype('category').cat.codes
match_data["opp_code"] = match_data["opponent"].astype('category').cat.codes
match_data["hour"] = match_data["time"].str.replace(":.+", "", regex=True).astype(int)
match_data["day_code"] = match_data["date"].dt.dayofweek

match_data.head()

Unnamed: 0,date,time,comp,round,day,venue,result,gf,ga,opponent,...,dist,fk,pk,pkatt,season,team,venue_code,opp_code,hour,day_code
1,2021-08-15,16:30,Premier League,Matchweek 1,Sun,Away,L,0,1,Tottenham,...,16.9,1.0,0.0,0.0,2022,Manchester City,0,19,16,6
2,2021-08-21,15:00,Premier League,Matchweek 2,Sat,Home,W,5,0,Norwich City,...,17.3,1.0,0.0,0.0,2022,Manchester City,1,16,15,5
3,2021-08-28,12:30,Premier League,Matchweek 3,Sat,Home,W,5,0,Arsenal,...,14.3,0.0,0.0,0.0,2022,Manchester City,1,0,12,5
4,2021-09-11,15:00,Premier League,Matchweek 4,Sat,Away,W,1,0,Leicester City,...,14.0,0.0,0.0,0.0,2022,Manchester City,0,11,15,5
6,2021-09-18,15:00,Premier League,Matchweek 5,Sat,Home,D,0,0,Southampton,...,15.7,1.0,0.0,0.0,2022,Manchester City,1,18,15,5


In [8]:
# convert the result into numerical data
match_data["target"] = (match_data["result"] == "W").astype(int)

match_data.head()

Unnamed: 0,date,time,comp,round,day,venue,result,gf,ga,opponent,...,fk,pk,pkatt,season,team,venue_code,opp_code,hour,day_code,target
1,2021-08-15,16:30,Premier League,Matchweek 1,Sun,Away,L,0,1,Tottenham,...,1.0,0.0,0.0,2022,Manchester City,0,19,16,6,0
2,2021-08-21,15:00,Premier League,Matchweek 2,Sat,Home,W,5,0,Norwich City,...,1.0,0.0,0.0,2022,Manchester City,1,16,15,5,1
3,2021-08-28,12:30,Premier League,Matchweek 3,Sat,Home,W,5,0,Arsenal,...,0.0,0.0,0.0,2022,Manchester City,1,0,12,5,1
4,2021-09-11,15:00,Premier League,Matchweek 4,Sat,Away,W,1,0,Leicester City,...,0.0,0.0,0.0,2022,Manchester City,0,11,15,5,1
6,2021-09-18,15:00,Premier League,Matchweek 5,Sat,Home,D,0,0,Southampton,...,1.0,0.0,0.0,2022,Manchester City,1,18,15,5,0


## Predict the results using a random forrest classifier

In [9]:
# create machine learning model
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators=50, min_samples_split=10, random_state=1)

In [10]:
# split data into training and testing data
train = match_data[match_data["date"] < '2022/01/02']
test = match_data[match_data["date"] > '2022/01/02']

In [11]:
# specify the predictors
predictors = ["venue_code", "opp_code", "hour", "day_code"]

In [12]:
# train the model
rf.fit(train[predictors], train["target"])

In [13]:
# make predictions on the test data
predictions = rf.predict(test[predictors])

In [14]:
# check the accuracy of the model
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(test["target"], predictions)
accuracy


0.6026315789473684

In [15]:
combined = pd.DataFrame(dict(actual=test["target"], predicted=predictions))
pd.crosstab(combined["actual"], combined["predicted"])

predicted,0,1
actual,Unnamed: 1_level_1,Unnamed: 2_level_1
0,184,44
1,107,45


The above shows us that the model is more accurate at predicting a loss over a win, we want to be able to predict a win 

In [16]:
from sklearn.metrics import precision_score

precision = precision_score(test["target"], predictions)
precision

0.5056179775280899

In [17]:
# group the data by team and 
grouped_data = match_data.groupby("team")

In [18]:
# function to create rolling averages for a given team
def create_rolling_average(group, cols, new_cols):
    group = group.sort_values("date")
    rolling_avg = group[cols].rolling(10, closed='left').mean()
    group[new_cols] = rolling_avg
    group = group.dropna(subset=new_cols)
    return group

In [19]:
# create a rolling average of in game stats so the machine learning model has more information about how a team has been performing to base the prediction on
cols = ["gf", "ga", "sh", "sot", "dist", "fk", "pk", "pkatt" , "xg", "xga", "poss"]
new_cols = ["gf_rolling", "ga_rolling", "sh_rolling", "sot_rolling", "dist_rolling", "fk_rolling", "pk_rolling", "pkatt_rolling" , "xg_rolling", "xga_rolling","poss_rolling"]

rolling_data = match_data.groupby("team").apply(lambda x: create_rolling_average(x, cols, new_cols))

rolling_data

Unnamed: 0_level_0,Unnamed: 1_level_0,date,time,comp,round,day,venue,result,gf,ga,opponent,...,ga_rolling,sh_rolling,sot_rolling,dist_rolling,fk_rolling,pk_rolling,pkatt_rolling,xg_rolling,xga_rolling,poss_rolling
team,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
Arsenal,15,2019-11-02,15:00,Premier League,Matchweek 11,Sat,Home,D,1,1,Wolves,...,1.4,13.1,4.6,17.10,0.8,0.1,0.1,1.37,1.47,55.8
Arsenal,17,2019-11-09,17:30,Premier League,Matchweek 12,Sat,Away,L,0,2,Leicester City,...,1.5,13.4,4.8,17.09,0.8,0.1,0.1,1.38,1.57,55.5
Arsenal,18,2019-11-23,15:00,Premier League,Matchweek 13,Sat,Home,D,2,2,Southampton,...,1.6,12.6,4.0,16.82,0.8,0.1,0.1,1.37,1.60,53.8
Arsenal,20,2019-12-01,14:00,Premier League,Matchweek 14,Sun,Away,D,2,2,Norwich City,...,1.5,12.9,4.2,15.86,0.7,0.1,0.1,1.48,1.63,55.0
Arsenal,21,2019-12-05,20:15,Premier League,Matchweek 15,Thu,Home,L,1,2,Brighton,...,1.5,11.8,3.9,15.41,0.5,0.2,0.2,1.45,1.55,55.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Wolverhampton Wanderers,37,2022-04-30,15:00,Premier League,Matchweek 35,Sat,Home,L,0,3,Brighton,...,1.2,10.8,3.7,18.23,0.1,0.0,0.0,0.76,1.74,54.6
Wolverhampton Wanderers,38,2022-05-07,15:00,Premier League,Matchweek 36,Sat,Away,D,2,2,Chelsea,...,1.4,10.8,3.5,18.29,0.1,0.0,0.0,0.76,1.83,55.9
Wolverhampton Wanderers,39,2022-05-11,20:15,Premier League,Matchweek 33,Wed,Home,L,1,5,Manchester City,...,1.4,11.6,3.7,18.29,0.1,0.0,0.0,0.87,1.81,55.5
Wolverhampton Wanderers,40,2022-05-15,14:00,Premier League,Matchweek 37,Sun,Home,D,1,1,Norwich City,...,1.8,10.9,3.9,17.95,0.1,0.0,0.0,0.84,1.91,52.8


In [20]:
# drop the team level so we are left with one dataframe
rolling_data.droplevel("team")

Unnamed: 0,date,time,comp,round,day,venue,result,gf,ga,opponent,...,ga_rolling,sh_rolling,sot_rolling,dist_rolling,fk_rolling,pk_rolling,pkatt_rolling,xg_rolling,xga_rolling,poss_rolling
15,2019-11-02,15:00,Premier League,Matchweek 11,Sat,Home,D,1,1,Wolves,...,1.4,13.1,4.6,17.10,0.8,0.1,0.1,1.37,1.47,55.8
17,2019-11-09,17:30,Premier League,Matchweek 12,Sat,Away,L,0,2,Leicester City,...,1.5,13.4,4.8,17.09,0.8,0.1,0.1,1.38,1.57,55.5
18,2019-11-23,15:00,Premier League,Matchweek 13,Sat,Home,D,2,2,Southampton,...,1.6,12.6,4.0,16.82,0.8,0.1,0.1,1.37,1.60,53.8
20,2019-12-01,14:00,Premier League,Matchweek 14,Sun,Away,D,2,2,Norwich City,...,1.5,12.9,4.2,15.86,0.7,0.1,0.1,1.48,1.63,55.0
21,2019-12-05,20:15,Premier League,Matchweek 15,Thu,Home,L,1,2,Brighton,...,1.5,11.8,3.9,15.41,0.5,0.2,0.2,1.45,1.55,55.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37,2022-04-30,15:00,Premier League,Matchweek 35,Sat,Home,L,0,3,Brighton,...,1.2,10.8,3.7,18.23,0.1,0.0,0.0,0.76,1.74,54.6
38,2022-05-07,15:00,Premier League,Matchweek 36,Sat,Away,D,2,2,Chelsea,...,1.4,10.8,3.5,18.29,0.1,0.0,0.0,0.76,1.83,55.9
39,2022-05-11,20:15,Premier League,Matchweek 33,Wed,Home,L,1,5,Manchester City,...,1.4,11.6,3.7,18.29,0.1,0.0,0.0,0.87,1.81,55.5
40,2022-05-15,14:00,Premier League,Matchweek 37,Sun,Home,D,1,1,Norwich City,...,1.8,10.9,3.9,17.95,0.1,0.0,0.0,0.84,1.91,52.8


In [21]:
# reindex the dataframe
rolling_data.index = range(rolling_data.shape[0])
rolling_data_rf = rolling_data.copy()
rolling_data_lr = rolling_data.copy()
rolling_data_dt = rolling_data.copy()


In [22]:
# create a function to make predictions
def make_predictions(data, predictors, model):
    # split data into training and testing data
    train = data[data["date"] < '2022/01/02']
    test = data[data["date"] > '2022/01/02']

    # train the model
    model.fit(train[predictors], train["target"])

    # make predictions on the test data
    predictions = model.predict(test[predictors])

    # combine the predictions with the test data
    combined = pd.DataFrame(dict(actual=test["target"], predicted=predictions), index = test.index)

    
    # return the predictions and the precision
    return combined


In [23]:
# make predictions with the random forest model

rf = RandomForestClassifier(n_estimators=50, min_samples_split=10, random_state=1)

combined_rf = make_predictions(rolling_data_rf, predictors + new_cols, rf)

# check the accuracy of the model
combined_accuracy  = accuracy_score(combined_rf["actual"], combined_rf["predicted"])
combined_precision = precision_score(combined_rf["actual"], combined_rf["predicted"])

# print the accuracy and precision
print("Accuracy:", combined_accuracy)
print("Precision:", combined_precision)


Accuracy: 0.6710526315789473
Precision: 0.6483516483516484


In [26]:
# create a linear regression model
from sklearn.linear_model import LinearRegression
lr = LinearRegression()


combined_lr = make_predictions(rolling_data_lr, predictors + new_cols, lr)
 
# round the predictions to the nearest integer
combined_lr["predicted"] = combined_lr["predicted"].round()

# check the accuracy of the model
combined_accuracy  = accuracy_score(combined_lr["actual"], combined_lr["predicted"])
combined_precision = precision_score(combined_lr["actual"], combined_lr["predicted"])

# print the accuracy and precision
print("Accuracy:", combined_accuracy)
print("Precision:", combined_precision)




Accuracy: 0.6710526315789473
Precision: 0.6708860759493671


In [37]:
# create decision tree model
from sklearn.tree import DecisionTreeClassifier

# create instance of model 
dt = DecisionTreeClassifier(max_depth=3)

rolling_data_lr = rolling_data.copy()
combined_dt = make_predictions(rolling_data_dt, predictors + new_cols, dt)

# check the accuracy of the model
combined_accuracy  = accuracy_score(combined_dt["actual"], combined_dt["predicted"])
combined_precision = precision_score(combined_dt["actual"], combined_dt["predicted"])

# print the accuracy and precision
print("Accuracy:", combined_accuracy)
print("Precision:", combined_precision)
  

Accuracy: 0.6526315789473685
Precision: 0.6515151515151515
