In [41]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score

In [2]:
matches = pd.read_csv("matches.csv", index_col = 0)

In [3]:
matches.head()

Unnamed: 0,date,time,comp,round,day,venue,result,gf,ga,opponent,...,match report,notes,sh,sot,dist,fk,pk,pkatt,season,team
1,2023-08-11,20:00,Premier League,Matchweek 1,Fri,Away,W,3,0,Burnley,...,Match Report,,17.0,8.0,13.9,0.0,0,0,2024,Manchester City
3,2023-08-19,20:00,Premier League,Matchweek 2,Sat,Home,W,1,0,Newcastle Utd,...,Match Report,,14.0,4.0,17.9,0.0,0,0,2024,Manchester City
4,2023-08-27,14:00,Premier League,Matchweek 3,Sun,Away,W,2,1,Sheffield Utd,...,Match Report,,29.0,9.0,17.3,2.0,0,1,2024,Manchester City
5,2023-09-02,15:00,Premier League,Matchweek 4,Sat,Home,W,5,1,Fulham,...,Match Report,,6.0,4.0,14.8,0.0,1,1,2024,Manchester City
6,2023-09-16,15:00,Premier League,Matchweek 5,Sat,Away,W,3,1,West Ham,...,Match Report,,29.0,13.0,16.4,1.0,0,0,2024,Manchester City


In [4]:
matches.shape

(3040, 27)

In [6]:
matches[matches['team'] == "Liverpool"]

Unnamed: 0,date,time,comp,round,day,venue,result,gf,ga,opponent,...,match report,notes,sh,sot,dist,fk,pk,pkatt,season,team
0,2023-08-13,16:30,Premier League,Matchweek 1,Sun,Away,D,1,1,Chelsea,...,Match Report,,13.0,1.0,17.8,0.0,0,0,2024,Liverpool
1,2023-08-19,15:00,Premier League,Matchweek 2,Sat,Home,W,3,1,Bournemouth,...,Match Report,,25.0,9.0,16.8,1.0,0,1,2024,Liverpool
2,2023-08-27,16:30,Premier League,Matchweek 3,Sun,Away,W,2,1,Newcastle Utd,...,Match Report,,9.0,4.0,17.2,1.0,0,0,2024,Liverpool
3,2023-09-03,14:00,Premier League,Matchweek 4,Sun,Home,W,3,0,Aston Villa,...,Match Report,,17.0,4.0,14.7,0.0,0,0,2024,Liverpool
4,2023-09-16,12:30,Premier League,Matchweek 5,Sat,Away,W,3,1,Wolves,...,Match Report,,16.0,5.0,15.8,0.0,0,0,2024,Liverpool
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48,2021-05-08,20:15,Premier League,Matchweek 35,Sat,Home,W,2,0,Southampton,...,Match Report,,14.0,6.0,13.6,0.0,0,0,2021,Liverpool
49,2021-05-13,20:15,Premier League,Matchweek 34,Thu,Away,W,4,2,Manchester Utd,...,Match Report,,17.0,8.0,14.9,0.0,0,0,2021,Liverpool
50,2021-05-16,16:30,Premier League,Matchweek 36,Sun,Away,W,2,1,West Brom,...,Match Report,,26.0,6.0,18.3,1.0,0,0,2021,Liverpool
51,2021-05-19,20:15,Premier League,Matchweek 37,Wed,Away,W,3,0,Burnley,...,Match Report,,20.0,3.0,16.2,1.0,0,0,2021,Liverpool


In [21]:
matches.dtypes

date            datetime64[ns]
time                    object
comp                    object
round                   object
day                     object
venue                   object
result                  object
gf                       int64
ga                       int64
opponent                object
xg                     float64
xga                    float64
poss                   float64
attendance             float64
captain                 object
formation               object
referee                 object
match report            object
notes                  float64
sh                     float64
sot                    float64
dist                   float64
fk                     float64
pk                       int64
pkatt                    int64
season                   int64
team                    object
venue_code                int8
opp_code                  int8
hour                     int64
dtype: object

In [23]:
matches["date"] = pd.to_datetime(matches['date'])
matches["venue_code"] = matches["venue"].astype("category").cat.codes
matches["opp_code"] = matches["opponent"].astype("category").cat.codes
matches["hour"] = matches["time"].str.replace(":.+", "", regex=True).astype("int")
matches["day_code"] = matches["date"].dt.dayofweek
matches["target"] = (matches["result"] == "W").astype("int")

In [29]:
rf = RandomForestClassifier(n_estimators=50, min_samples_split=10, random_state=1)

In [30]:
train = matches[matches["date"] < '2023-01-01']
test = matches[matches["date"] > '2023-01-01']

In [31]:
predictors = ["venue_code", "opp_code", "hour", "day_code"]

In [32]:
rf.fit(train[predictors], train["target"])

In [33]:
preds = rf.predict(test[predictors])

In [35]:
acc = accuracy_score(test["target"], preds)

In [36]:
acc

0.6153198653198653

In [37]:
combined = pd.DataFrame(dict(actual = test["target"], prediction = preds))

In [40]:
pd.crosstab(index = combined["actual"], columns = combined["prediction"])

prediction,0,1
actual,Unnamed: 1_level_1,Unnamed: 2_level_1
0,578,147
1,310,153


In [42]:
precision_score(test["target"], preds)

0.51

In [47]:
grouped_matches = matches.groupby("team")
group = grouped_matches.get_group("Liverpool")

In [48]:
group

Unnamed: 0,date,time,comp,round,day,venue,result,gf,ga,opponent,...,fk,pk,pkatt,season,team,venue_code,opp_code,hour,day_code,target
0,2023-08-13,16:30,Premier League,Matchweek 1,Sun,Away,D,1,1,Chelsea,...,0.0,0,0,2024,Liverpool,0,6,16,6,0
1,2023-08-19,15:00,Premier League,Matchweek 2,Sat,Home,W,3,1,Bournemouth,...,1.0,0,1,2024,Liverpool,1,2,15,5,1
2,2023-08-27,16:30,Premier League,Matchweek 3,Sun,Away,W,2,1,Newcastle Utd,...,1.0,0,0,2024,Liverpool,0,16,16,6,1
3,2023-09-03,14:00,Premier League,Matchweek 4,Sun,Home,W,3,0,Aston Villa,...,0.0,0,0,2024,Liverpool,1,1,14,6,1
4,2023-09-16,12:30,Premier League,Matchweek 5,Sat,Away,W,3,1,Wolves,...,0.0,0,0,2024,Liverpool,0,25,12,5,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48,2021-05-08,20:15,Premier League,Matchweek 35,Sat,Home,W,2,0,Southampton,...,0.0,0,0,2021,Liverpool,1,20,20,5,1
49,2021-05-13,20:15,Premier League,Matchweek 34,Thu,Away,W,4,2,Manchester Utd,...,0.0,0,0,2021,Liverpool,0,15,20,3,1
50,2021-05-16,16:30,Premier League,Matchweek 36,Sun,Away,W,2,1,West Brom,...,1.0,0,0,2021,Liverpool,0,23,16,6,1
51,2021-05-19,20:15,Premier League,Matchweek 37,Wed,Away,W,3,0,Burnley,...,1.0,0,0,2021,Liverpool,0,5,20,2,1


In [54]:
def rolling_averages(group, cols, new_cols):
    group = group.sort_values("date")
    rolling_stats = group[cols].rolling(3, closed = 'left').mean()
    group[new_cols] = rolling_stats
    group = group.dropna(subset=new_cols)
    return group

In [55]:
cols = ["gf", "ga", "sh", "sot", "dist", "fk", "pk", "pkatt"]
new_cols = [f"{c}_rolling" for c in cols]

In [56]:
rolling_averages(group, cols, new_cols)

Unnamed: 0,date,time,comp,round,day,venue,result,gf,ga,opponent,...,day_code,target,gf_rolling,ga_rolling,sh_rolling,sot_rolling,dist_rolling,fk_rolling,pk_rolling,pkatt_rolling
6,2020-10-04,19:15,Premier League,Matchweek 4,Sun,Away,L,2,7,Aston Villa,...,6,0,3.000000,1.333333,19.666667,5.666667,18.166667,0.333333,0.666667,0.666667
7,2020-10-17,12:30,Premier League,Matchweek 5,Sat,Away,D,2,2,Everton,...,5,0,2.333333,2.666667,17.666667,7.000000,17.466667,0.666667,0.000000,0.000000
9,2020-10-24,20:00,Premier League,Matchweek 6,Sat,Home,W,2,1,Sheffield Utd,...,5,1,2.333333,3.333333,19.000000,8.000000,16.733333,1.000000,0.000000,0.000000
11,2020-10-31,17:30,Premier League,Matchweek 7,Sat,Home,W,2,1,West Ham,...,5,1,2.000000,3.333333,17.666667,7.000000,17.033333,1.333333,0.000000,0.000000
13,2020-11-08,16:30,Premier League,Matchweek 8,Sun,Away,D,1,1,Manchester City,...,6,0,2.000000,1.333333,15.666667,5.666667,18.266667,1.333333,0.333333,0.333333
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
53,2024-04-24,20:00,Premier League,Matchweek 29,Wed,Away,L,0,2,Everton,...,2,0,1.666667,1.333333,20.666667,6.000000,16.266667,1.000000,0.333333,0.333333
54,2024-04-27,12:30,Premier League,Matchweek 35,Sat,Away,D,2,2,West Ham,...,5,0,1.000000,1.333333,19.333333,6.333333,16.033333,1.000000,0.000000,0.000000
55,2024-05-05,16:30,Premier League,Matchweek 36,Sun,Home,W,4,2,Tottenham,...,6,1,1.666667,1.666667,21.666667,7.333333,17.400000,1.000000,0.000000,0.000000
56,2024-05-13,20:00,Premier League,Matchweek 37,Mon,Away,D,3,3,Aston Villa,...,0,0,2.000000,2.000000,25.333333,9.000000,16.233333,0.333333,0.000000,0.000000


In [57]:
matches_rolling = matches.groupby("team").apply(lambda x: rolling_averages(x, cols, new_cols))

In [58]:
matches_rolling

Unnamed: 0_level_0,Unnamed: 1_level_0,date,time,comp,round,day,venue,result,gf,ga,opponent,...,day_code,target,gf_rolling,ga_rolling,sh_rolling,sot_rolling,dist_rolling,fk_rolling,pk_rolling,pkatt_rolling
team,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
Arsenal,6,2020-10-04,14:00,Premier League,Matchweek 4,Sun,Home,W,2,1,Sheffield Utd,...,6,1,2.000000,1.333333,8.000000,3.666667,14.633333,0.666667,0.000000,0.000000
Arsenal,7,2020-10-17,17:30,Premier League,Matchweek 5,Sat,Away,L,0,1,Manchester City,...,5,0,1.666667,1.666667,5.666667,3.666667,15.366667,0.000000,0.000000,0.000000
Arsenal,9,2020-10-25,19:15,Premier League,Matchweek 6,Sun,Home,L,0,1,Leicester City,...,6,0,1.000000,1.666667,7.000000,3.666667,16.566667,0.666667,0.000000,0.000000
Arsenal,11,2020-11-01,16:30,Premier League,Matchweek 7,Sun,Away,W,1,0,Manchester Utd,...,6,1,0.666667,1.000000,9.666667,4.000000,16.566667,1.000000,0.000000,0.000000
Arsenal,13,2020-11-08,19:15,Premier League,Matchweek 8,Sun,Home,L,0,3,Aston Villa,...,6,0,0.333333,0.666667,9.666667,2.666667,19.333333,1.000000,0.333333,0.333333
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Wolverhampton Wanderers,40,2024-04-24,19:45,Premier League,Matchweek 29,Wed,Home,L,0,1,Bournemouth,...,2,0,1.000000,2.000000,9.666667,4.000000,20.533333,0.333333,0.333333,0.333333
Wolverhampton Wanderers,41,2024-04-27,15:00,Premier League,Matchweek 35,Sat,Home,W,2,1,Luton Town,...,5,1,0.666667,1.666667,10.333333,3.333333,18.766667,0.000000,0.000000,0.000000
Wolverhampton Wanderers,42,2024-05-04,17:30,Premier League,Matchweek 36,Sat,Away,L,1,5,Manchester City,...,5,0,0.666667,1.333333,11.000000,4.000000,19.666667,0.000000,0.000000,0.000000
Wolverhampton Wanderers,43,2024-05-11,15:00,Premier League,Matchweek 37,Sat,Home,L,1,3,Crystal Palace,...,5,0,1.000000,2.333333,10.000000,3.333333,15.966667,0.000000,0.000000,0.000000


In [59]:
matches_rolling = matches_rolling.droplevel("team")

In [61]:
matches_rolling.index = range(matches_rolling.shape[0])

In [62]:
matches_rolling

Unnamed: 0,date,time,comp,round,day,venue,result,gf,ga,opponent,...,day_code,target,gf_rolling,ga_rolling,sh_rolling,sot_rolling,dist_rolling,fk_rolling,pk_rolling,pkatt_rolling
0,2020-10-04,14:00,Premier League,Matchweek 4,Sun,Home,W,2,1,Sheffield Utd,...,6,1,2.000000,1.333333,8.000000,3.666667,14.633333,0.666667,0.000000,0.000000
1,2020-10-17,17:30,Premier League,Matchweek 5,Sat,Away,L,0,1,Manchester City,...,5,0,1.666667,1.666667,5.666667,3.666667,15.366667,0.000000,0.000000,0.000000
2,2020-10-25,19:15,Premier League,Matchweek 6,Sun,Home,L,0,1,Leicester City,...,6,0,1.000000,1.666667,7.000000,3.666667,16.566667,0.666667,0.000000,0.000000
3,2020-11-01,16:30,Premier League,Matchweek 7,Sun,Away,W,1,0,Manchester Utd,...,6,1,0.666667,1.000000,9.666667,4.000000,16.566667,1.000000,0.000000,0.000000
4,2020-11-08,19:15,Premier League,Matchweek 8,Sun,Home,L,0,3,Aston Villa,...,6,0,0.333333,0.666667,9.666667,2.666667,19.333333,1.000000,0.333333,0.333333
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2954,2024-04-24,19:45,Premier League,Matchweek 29,Wed,Home,L,0,1,Bournemouth,...,2,0,1.000000,2.000000,9.666667,4.000000,20.533333,0.333333,0.333333,0.333333
2955,2024-04-27,15:00,Premier League,Matchweek 35,Sat,Home,W,2,1,Luton Town,...,5,1,0.666667,1.666667,10.333333,3.333333,18.766667,0.000000,0.000000,0.000000
2956,2024-05-04,17:30,Premier League,Matchweek 36,Sat,Away,L,1,5,Manchester City,...,5,0,0.666667,1.333333,11.000000,4.000000,19.666667,0.000000,0.000000,0.000000
2957,2024-05-11,15:00,Premier League,Matchweek 37,Sat,Home,L,1,3,Crystal Palace,...,5,0,1.000000,2.333333,10.000000,3.333333,15.966667,0.000000,0.000000,0.000000


In [65]:
def make_predictions(data, predictors):
    train = data[data["date"] < '2023-01-01']
    test = data[data["date"] > '2023-01-01']
    rf.fit(train[predictors], train["target"])
    preds = rf.predict(test[predictors])
    combined = pd.DataFrame(dict(actual=test["target"], predicted=preds), index=test.index)
    error = precision_score(test["target"], preds)
    return combined, error

In [66]:
combined, precision = make_predictions(matches_rolling, predictors + new_cols)

In [67]:
precision

0.5738831615120275

In [68]:
combined

Unnamed: 0,actual,predicted
89,0,1
90,1,0
91,1,0
92,0,1
93,0,1
...,...,...
2954,0,0
2955,1,0
2956,0,0
2957,0,0


In [70]:
combined = combined.merge(matches_rolling[["date", "team", "opponent", "result"]], left_index=True, right_index=True)

In [71]:
combined.head(10)

Unnamed: 0,actual,predicted,date,team,opponent,result
89,0,1,2023-01-03,Arsenal,Newcastle Utd,D
90,1,0,2023-01-15,Arsenal,Tottenham,W
91,1,0,2023-01-22,Arsenal,Manchester Utd,W
92,0,1,2023-02-04,Arsenal,Everton,L
93,0,1,2023-02-11,Arsenal,Brentford,D
94,0,0,2023-02-15,Arsenal,Manchester City,L
95,1,1,2023-02-18,Arsenal,Aston Villa,W
96,1,0,2023-02-25,Arsenal,Leicester City,W
97,1,0,2023-03-01,Arsenal,Everton,W
98,1,1,2023-03-04,Arsenal,Bournemouth,W


In [72]:
class MissingDict(dict):
    __missing__ = lambda self, key: key

map_values = {
    "Brighton and Hove Albion": "Brighton", 
    "Manchester United": "Manchester Utd", 
    "Newcastle United": "Newcastle Utd", 
    "Tottenham Hotspur": "Tottenham", 
    "West Ham United": "West Ham", 
    "Wolverhampton Wanderers": "Wolves"
} 
mapping = MissingDict(**map_values)

In [73]:
mapping["Arsenal"]

'Arsenal'

In [74]:
mapping["Tottenham Hotspur"]

'Tottenham'

In [75]:
combined["new_team"] = combined["team"].map(mapping)

In [76]:
combined

Unnamed: 0,actual,predicted,date,team,opponent,result,new_team
89,0,1,2023-01-03,Arsenal,Newcastle Utd,D,Arsenal
90,1,0,2023-01-15,Arsenal,Tottenham,W,Arsenal
91,1,0,2023-01-22,Arsenal,Manchester Utd,W,Arsenal
92,0,1,2023-02-04,Arsenal,Everton,L,Arsenal
93,0,1,2023-02-11,Arsenal,Brentford,D,Arsenal
...,...,...,...,...,...,...,...
2954,0,0,2024-04-24,Wolverhampton Wanderers,Bournemouth,L,Wolves
2955,1,0,2024-04-27,Wolverhampton Wanderers,Luton Town,W,Wolves
2956,0,0,2024-05-04,Wolverhampton Wanderers,Manchester City,L,Wolves
2957,0,0,2024-05-11,Wolverhampton Wanderers,Crystal Palace,L,Wolves


In [77]:
merged = combined.merge(combined, left_on=['date', 'new_team'], right_on=['date', 'opponent'])

In [78]:
merged

Unnamed: 0,actual_x,predicted_x,date,team_x,opponent_x,result_x,new_team_x,actual_y,predicted_y,team_y,opponent_y,result_y,new_team_y
0,0,1,2023-01-03,Arsenal,Newcastle Utd,D,Arsenal,0,0,Newcastle United,Arsenal,D,Newcastle Utd
1,1,0,2023-01-15,Arsenal,Tottenham,W,Arsenal,0,0,Tottenham Hotspur,Arsenal,L,Tottenham
2,1,0,2023-01-22,Arsenal,Manchester Utd,W,Arsenal,0,1,Manchester United,Arsenal,L,Manchester Utd
3,0,1,2023-02-04,Arsenal,Everton,L,Arsenal,1,0,Everton,Arsenal,W,Everton
4,0,1,2023-02-11,Arsenal,Brentford,D,Arsenal,0,0,Brentford,Arsenal,D,Brentford
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1080,0,0,2024-04-24,Wolverhampton Wanderers,Bournemouth,L,Wolves,1,0,Bournemouth,Wolves,W,Bournemouth
1081,1,0,2024-04-27,Wolverhampton Wanderers,Luton Town,W,Wolves,0,0,Luton Town,Wolves,L,Luton Town
1082,0,0,2024-05-04,Wolverhampton Wanderers,Manchester City,L,Wolves,1,1,Manchester City,Wolves,W,Manchester City
1083,0,0,2024-05-11,Wolverhampton Wanderers,Crystal Palace,L,Wolves,1,0,Crystal Palace,Wolves,W,Crystal Palace


In [79]:
merged[(merged["predicted_x"] == 1) & (merged["predicted_y"] ==0)]["actual_x"].value_counts()

actual_x
1    154
0    102
Name: count, dtype: int64

In [80]:
154/256

0.6015625