In [1]:
import pandas as pd

In [2]:
# Load the match log from CSV, using the file's first column as the row index
# (index_col=0) instead of reading it in as a data column.
all_matches = pd.read_csv("all_matches.csv",index_col=0)

In [3]:
# Peek at the first five rows (head() defaults to n=5) to sanity-check the load.
all_matches.head()

Unnamed: 0,date,time,comp,round,day,venue,result,gf,ga,opponent,...,sot,sot%,g/sh,g/sot,dist,fk,pk,pkatt,season,team name
0,2022-08-05,20:00,Premier League,Matchweek 1,Fri,Away,W,2.0,0.0,Crystal Palace,...,2.0,20.0,0.1,0.5,14.6,1.0,0.0,0.0,2022,Arsenal
1,2022-08-13,15:00,Premier League,Matchweek 2,Sat,Home,W,4.0,2.0,Leicester City,...,7.0,36.8,0.21,0.57,13.0,0.0,0.0,0.0,2022,Arsenal
2,2022-08-20,17:30,Premier League,Matchweek 3,Sat,Away,W,3.0,0.0,Bournemouth,...,6.0,42.9,0.21,0.5,14.8,0.0,0.0,0.0,2022,Arsenal
3,2022-08-27,17:30,Premier League,Matchweek 4,Sat,Home,W,2.0,1.0,Fulham,...,8.0,36.4,0.09,0.25,15.5,1.0,0.0,0.0,2022,Arsenal
4,2022-08-31,19:30,Premier League,Matchweek 5,Wed,Home,W,2.0,1.0,Aston Villa,...,8.0,36.4,0.09,0.25,16.3,1.0,0.0,0.0,2022,Arsenal


In [6]:
# (n_rows, n_columns) of the loaded frame.
all_matches.shape

(2112, 30)

### Each team plays 38 matches per season, 20 teams play each season, and we have data for 3 seasons, so a complete dataset would have this many rows:

In [9]:
# Row count we would see if every club had a complete schedule in every season.
# Named constants replace the bare 38*20*3 so the arithmetic documents itself.
MATCHES_PER_TEAM_PER_SEASON = 38
TEAMS_PER_SEASON = 20
N_SEASONS = 3

expected_rows = MATCHES_PER_TEAM_PER_SEASON * TEAMS_PER_SEASON * N_SEASONS
expected_rows

2280

In [13]:
# Count how many rows each club contributes, most frequent first.
team_names = all_matches["team name"]
team_names.value_counts()

Arsenal                     106
Chelsea                     106
Southampton                 106
Leicester City              106
Everton                     106
Leeds United                106
Crystal Palace              106
Wolverhampton Wanderers     106
Tottenham Hotspur           106
Aston Villa                 106
Manchester United           105
Liverpool                   105
Manchester City             105
West Ham United             105
Newcastle United            105
Brighton and Hove Albion    104
Burnley                      76
Brentford                    68
Fulham                       67
Watford                      38
Norwich City                 38
West Bromwich Albion         38
Sheffield United             38
Nottingham Forest            30
Bournemouth                  30
Name: team name, dtype: int64

#### There are fewer matches than expected because
- the 2022-23 season was still ongoing when we scraped the data
- 3 teams are relegated and 3 teams are promoted each season
- So we expect around 9 teams to have roughly 38 fewer matches each

## Cleaning our data for Machine Learning

In [14]:
# Inspect each column's dtype to see which ones are not yet numeric.
all_matches.dtypes

date             object
time             object
comp             object
round            object
day              object
venue            object
result           object
gf              float64
ga              float64
opponent         object
xg              float64
xga             float64
poss            float64
attendance      float64
captain          object
formation        object
referee          object
match report     object
notes           float64
sh              float64
sot             float64
sot%            float64
g/sh            float64
g/sot           float64
dist            float64
fk              float64
pk              float64
pkatt           float64
season            int64
team name        object
dtype: object

- Most machine learning algorithms can't work directly with columns of `object` (string) dtype
- Only numeric columns can be used as predictors
- So we convert the relevant feature columns to numeric datatypes

In [16]:
# Parse the date strings into datetime64 values, then overwrite the column so
# the .dt accessor is available in later cells.
parsed_dates = pd.to_datetime(all_matches["date"])
all_matches["date"] = parsed_dates

# Re-check dtypes: "date" should now be datetime64[ns].
all_matches.dtypes

date            datetime64[ns]
time                    object
comp                    object
round                   object
day                     object
venue                   object
result                  object
gf                     float64
ga                     float64
opponent                object
xg                     float64
xga                    float64
poss                   float64
attendance             float64
captain                 object
formation               object
referee                 object
match report            object
notes                  float64
sh                     float64
sot                    float64
sot%                   float64
g/sh                   float64
g/sot                  float64
dist                   float64
fk                     float64
pk                     float64
pkatt                  float64
season                   int64
team name               object
dtype: object

## Creating relevant features

In [18]:
# Number of distinct values in "venue"; this decides how the column is encoded below.
n_venue_types = all_matches['venue'].nunique()
n_venue_types            # expected to be 2: "Home" and "Away"

2

In [21]:
# Encode the two venue labels as integers (Home -> 0, Away -> 1).
# Series.map is used rather than Series.replace: replace is meant for value
# substitution and only yields an integer column via implicit object->int
# downcasting, which pandas 2.2+ deprecates. With map, any label missing from
# the dict becomes NaN instead of silently staying a string.
VENUE_CODES = {"Home": 0, "Away": 1}
all_matches["venue_id"] = all_matches["venue"].map(VENUE_CODES)

# Give every opponent an integer code via the categorical dtype. Codes follow
# the sorted order of the category labels; missing opponents would get -1.
all_matches["opponent_id"] = all_matches["opponent"].astype("category").cat.codes

In [26]:
# Keep only the hour of kick-off: strip everything from the first ":" onward
# (e.g. "17:30" -> "17"), then cast the remaining text to an integer.
hour_text = all_matches["time"].str.replace(":.+", "", regex=True)
all_matches["hour"] = hour_text.astype("int")

In [27]:
# Day of the week as an integer (Monday=0 ... Sunday=6), taken from the parsed date.
match_dates = all_matches["date"]
all_matches["day_id"] = match_dates.dt.dayofweek

In [28]:
# Show the first five rows again to confirm the new feature columns were added.
all_matches.head(5)

Unnamed: 0,date,time,comp,round,day,venue,result,gf,ga,opponent,...,dist,fk,pk,pkatt,season,team name,venue_id,opponent_id,hour,day_id
0,2022-08-05,20:00,Premier League,Matchweek 1,Fri,Away,W,2.0,0.0,Crystal Palace,...,14.6,1.0,0.0,0.0,2022,Arsenal,1,7,20,4
1,2022-08-13,15:00,Premier League,Matchweek 2,Sat,Home,W,4.0,2.0,Leicester City,...,13.0,0.0,0.0,0.0,2022,Arsenal,0,11,15,5
2,2022-08-20,17:30,Premier League,Matchweek 3,Sat,Away,W,3.0,0.0,Bournemouth,...,14.8,0.0,0.0,0.0,2022,Arsenal,1,2,17,5
3,2022-08-27,17:30,Premier League,Matchweek 4,Sat,Home,W,2.0,1.0,Fulham,...,15.5,1.0,0.0,0.0,2022,Arsenal,0,9,17,5
4,2022-08-31,19:30,Premier League,Matchweek 5,Wed,Home,W,2.0,1.0,Aston Villa,...,16.3,1.0,0.0,0.0,2022,Arsenal,0,1,19,2


In [30]:
# Binary label to predict: 1 when the row's result is a win ("W"), 0 for any
# other result.
is_win = all_matches["result"] == "W"
all_matches["target"] = is_win.astype("int")

In [31]:
# Non-linearity can be captured using Random forest classifier
from sklearn.ensemble import RandomForestClassifier