In [1]:
import pandas as pd

In [None]:
# read matches csv
matches = pd.read_csv("matches.csv", index_col=0)

In [None]:
# check if all matches are successfully scraped
matches["team"].value_counts()

team
Liverpool                   76
Arsenal                     76
Manchester City             76
Chelsea                     76
Newcastle United            76
Aston Villa                 76
Nottingham Forest           76
Brighton and Hove Albion    76
Bournemouth                 76
Brentford                   76
Fulham                      76
Crystal Palace              76
Everton                     76
West Ham United             76
Manchester United           76
Wolverhampton Wanderers     76
Tottenham Hotspur           76
Leicester City              38
Ipswich Town                38
Southampton                 38
Luton Town                  38
Burnley                     38
Sheffield United            38
Name: count, dtype: int64

In [None]:
# checking what types need to be changed
matches.dtypes

date              object
time              object
comp              object
round             object
day               object
venue             object
result            object
gf                 int64
ga                 int64
opponent          object
xg               float64
xga              float64
poss             float64
attendance       float64
captain           object
formation         object
opp formation     object
referee           object
match report      object
notes            float64
sh               float64
sot              float64
dist             float64
fk               float64
pk                 int64
pkatt              int64
season             int64
team              object
dtype: object

In [7]:
matches["date"] = pd.to_datetime(matches["date"])

In [8]:
matches["venue_code"] = matches["venue"].astype("category").cat.codes

In [9]:
matches["opp_code"] = matches["opponent"].astype("category").cat.codes

In [10]:
matches["hour"] = matches["time"].str.replace(":.+", "", regex=True).astype("int")

In [11]:
matches["day_code"] = matches["date"].dt.dayofweek

In [12]:
matches["target"] = (matches["result"] == "W").astype("int")

In [15]:
from sklearn.ensemble import RandomForestClassifier

In [16]:
rf = RandomForestClassifier(n_estimators=50, min_samples_split=10, random_state=1)

In [26]:
train = matches[matches["date"] < '2024-01-01']

In [27]:
test = matches[matches["date"] > '2024-01-01']

In [28]:
predictors = ["venue_code", "opp_code", "hour", "day_code"]

In [32]:
rf.fit(train[predictors], train["target"])

In [33]:
RandomForestClassifier(min_samples_split=10, n_estimators=50, random_state=1)

In [34]:
preds = rf.predict(test[predictors])

In [35]:
from sklearn.metrics import accuracy_score

In [36]:
acc = accuracy_score(test["target"], preds)

In [37]:
acc

0.5861456483126111

In [38]:
combined = pd.DataFrame(dict(actual=test["target"], prediction=preds))

In [41]:
pd.crosstab(index=combined["actual"], columns=combined["prediction"])

prediction,0,1
actual,Unnamed: 1_level_1,Unnamed: 2_level_1
0,539,163
1,303,121


In [42]:
from sklearn.metrics import precision_score

In [43]:
precision_score(test["target"], preds)

0.426056338028169

In [44]:
grouped_matches = matches.groupby("team")

In [46]:
group = grouped_matches.get_group("Manchester City")

In [47]:
group

Unnamed: 0,date,time,comp,round,day,venue,result,gf,ga,opponent,...,fk,pk,pkatt,season,team,venue_code,opp_code,hour,day_code,target
1,2024-08-18,16:30,Premier League,Matchweek 1,Sun,Away,W,2,0,Chelsea,...,0.0,0,0,2023,Manchester City,0,6,16,6,1
2,2024-08-24,15:00,Premier League,Matchweek 2,Sat,Home,W,4,1,Ipswich Town,...,1.0,1,1,2023,Manchester City,1,10,15,5,1
3,2024-08-31,17:30,Premier League,Matchweek 3,Sat,Away,W,3,1,West Ham,...,1.0,0,0,2023,Manchester City,0,21,17,5,1
4,2024-09-14,15:00,Premier League,Matchweek 4,Sat,Home,W,2,1,Brentford,...,0.0,0,0,2023,Manchester City,1,3,15,5,1
6,2024-09-22,16:30,Premier League,Matchweek 5,Sun,Home,D,2,2,Arsenal,...,1.0,0,0,2023,Manchester City,1,0,16,6,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
51,2024-04-28,16:30,Premier League,Matchweek 35,Sun,Away,W,2,0,Nott'ham Forest,...,1.0,0,0,2022,Manchester City,0,17,16,6,1
52,2024-05-04,17:30,Premier League,Matchweek 36,Sat,Home,W,5,1,Wolves,...,0.0,2,2,2022,Manchester City,1,22,17,5,1
53,2024-05-11,12:30,Premier League,Matchweek 37,Sat,Away,W,4,0,Fulham,...,0.0,1,1,2022,Manchester City,0,9,12,5,1
54,2024-05-14,20:00,Premier League,Matchweek 34,Tue,Away,W,2,0,Tottenham,...,0.0,1,1,2022,Manchester City,0,20,20,1,1


In [48]:
def rolling_averages(group, cols, new_cols):
    group = group.sort_values("date")
    rolling_stats = group[cols].rolling(3, closed='left').mean()
    group[new_cols] = rolling_stats
    group = group.dropna(subset=new_cols)
    return group

In [49]:
cols = ["gf", "ga", "sh", "sot", "dist", "fk", "pk", "pkatt"]
new_cols = [f"{c}_rolling" for c in cols]

In [50]:
new_cols

['gf_rolling',
 'ga_rolling',
 'sh_rolling',
 'sot_rolling',
 'dist_rolling',
 'fk_rolling',
 'pk_rolling',
 'pkatt_rolling']

In [51]:
rolling_averages(group, cols, new_cols)

Unnamed: 0,date,time,comp,round,day,venue,result,gf,ga,opponent,...,day_code,target,gf_rolling,ga_rolling,sh_rolling,sot_rolling,dist_rolling,fk_rolling,pk_rolling,pkatt_rolling
5,2023-09-02,15:00,Premier League,Matchweek 4,Sat,Home,W,5,1,Fulham,...,5,1,2.000000,0.333333,20.000000,7.000000,16.366667,0.666667,0.000000,0.333333
6,2023-09-16,15:00,Premier League,Matchweek 5,Sat,Away,W,3,1,West Ham,...,5,1,2.666667,0.666667,16.333333,5.666667,16.666667,0.666667,0.333333,0.666667
8,2023-09-23,15:00,Premier League,Matchweek 6,Sat,Home,W,2,0,Nott'ham Forest,...,5,1,3.333333,1.000000,21.333333,8.666667,16.166667,1.000000,0.333333,0.666667
10,2023-09-30,15:00,Premier League,Matchweek 7,Sat,Away,L,1,2,Wolves,...,5,0,3.333333,0.666667,14.000000,7.000000,16.133333,1.000000,0.333333,0.333333
12,2023-10-08,16:30,Premier League,Matchweek 8,Sun,Away,L,0,1,Arsenal,...,6,0,2.000000,1.000000,19.666667,8.333333,17.633333,1.333333,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
50,2025-04-22,20:00,Premier League,Matchweek 34,Tue,Home,W,2,1,Aston Villa,...,1,1,2.333333,0.666667,14.000000,6.333333,17.500000,0.666667,0.000000,0.000000
52,2025-05-02,20:00,Premier League,Matchweek 35,Fri,Home,W,1,0,Wolves,...,4,1,3.000000,1.000000,15.666667,6.666667,15.133333,0.333333,0.000000,0.000000
53,2025-05-10,15:00,Premier League,Matchweek 36,Sat,Away,D,0,0,Southampton,...,5,0,1.666667,0.333333,11.666667,4.666667,16.766667,0.000000,0.000000,0.000000
55,2025-05-20,20:00,Premier League,Matchweek 37,Tue,Home,W,3,1,Bournemouth,...,1,1,1.000000,0.333333,16.333333,4.000000,17.400000,0.666667,0.000000,0.000000


In [52]:
matches_rolling = matches.groupby("team").apply(lambda x: rolling_averages(x, cols, new_cols))

  matches_rolling = matches.groupby("team").apply(lambda x: rolling_averages(x, cols, new_cols))


In [53]:
matches_rolling

Unnamed: 0_level_0,Unnamed: 1_level_0,date,time,comp,round,day,venue,result,gf,ga,opponent,...,day_code,target,gf_rolling,ga_rolling,sh_rolling,sot_rolling,dist_rolling,fk_rolling,pk_rolling,pkatt_rolling
team,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
Arsenal,4,2023-09-03,16:30,Premier League,Matchweek 4,Sun,Home,W,3,1,Manchester Utd,...,6,1,1.666667,1.000000,15.333333,6.000000,16.433333,0.000000,0.666667,0.666667
Arsenal,5,2023-09-17,16:30,Premier League,Matchweek 5,Sun,Away,W,1,0,Everton,...,6,1,2.000000,1.000000,16.000000,5.333333,15.066667,0.000000,0.666667,0.666667
Arsenal,7,2023-09-24,14:00,Premier League,Matchweek 6,Sun,Home,D,2,2,Tottenham,...,6,0,2.000000,1.000000,16.000000,6.000000,15.400000,0.000000,0.333333,0.333333
Arsenal,9,2023-09-30,15:00,Premier League,Matchweek 7,Sat,Away,W,4,0,Bournemouth,...,5,1,2.000000,1.000000,14.000000,4.333333,16.433333,0.000000,0.333333,0.333333
Arsenal,11,2023-10-08,16:30,Premier League,Matchweek 8,Sun,Home,W,1,0,Manchester City,...,6,1,2.333333,0.666667,12.666667,4.666667,16.600000,0.000000,1.000000,1.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Wolverhampton Wanderers,38,2025-04-26,15:00,Premier League,Matchweek 34,Sat,Home,W,3,0,Leicester City,...,5,1,2.333333,1.000000,13.000000,4.333333,16.433333,0.666667,0.000000,0.000000
Wolverhampton Wanderers,39,2025-05-02,20:00,Premier League,Matchweek 35,Fri,Away,L,0,1,Manchester City,...,4,0,2.666667,0.666667,12.333333,4.000000,18.633333,1.000000,0.000000,0.000000
Wolverhampton Wanderers,40,2025-05-10,15:00,Premier League,Matchweek 36,Sat,Home,L,0,2,Brighton,...,5,0,1.333333,0.333333,10.000000,2.333333,18.900000,0.666667,0.000000,0.000000
Wolverhampton Wanderers,41,2025-05-20,20:00,Premier League,Matchweek 37,Tue,Away,L,2,4,Crystal Palace,...,1,0,1.000000,1.000000,12.000000,2.666667,17.200000,0.333333,0.000000,0.000000


In [54]:
matches_rolling = matches_rolling.droplevel('team')

In [56]:
matches_rolling

Unnamed: 0,date,time,comp,round,day,venue,result,gf,ga,opponent,...,day_code,target,gf_rolling,ga_rolling,sh_rolling,sot_rolling,dist_rolling,fk_rolling,pk_rolling,pkatt_rolling
4,2023-09-03,16:30,Premier League,Matchweek 4,Sun,Home,W,3,1,Manchester Utd,...,6,1,1.666667,1.000000,15.333333,6.000000,16.433333,0.000000,0.666667,0.666667
5,2023-09-17,16:30,Premier League,Matchweek 5,Sun,Away,W,1,0,Everton,...,6,1,2.000000,1.000000,16.000000,5.333333,15.066667,0.000000,0.666667,0.666667
7,2023-09-24,14:00,Premier League,Matchweek 6,Sun,Home,D,2,2,Tottenham,...,6,0,2.000000,1.000000,16.000000,6.000000,15.400000,0.000000,0.333333,0.333333
9,2023-09-30,15:00,Premier League,Matchweek 7,Sat,Away,W,4,0,Bournemouth,...,5,1,2.000000,1.000000,14.000000,4.333333,16.433333,0.000000,0.333333,0.333333
11,2023-10-08,16:30,Premier League,Matchweek 8,Sun,Home,W,1,0,Manchester City,...,6,1,2.333333,0.666667,12.666667,4.666667,16.600000,0.000000,1.000000,1.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
38,2025-04-26,15:00,Premier League,Matchweek 34,Sat,Home,W,3,0,Leicester City,...,5,1,2.333333,1.000000,13.000000,4.333333,16.433333,0.666667,0.000000,0.000000
39,2025-05-02,20:00,Premier League,Matchweek 35,Fri,Away,L,0,1,Manchester City,...,4,0,2.666667,0.666667,12.333333,4.000000,18.633333,1.000000,0.000000,0.000000
40,2025-05-10,15:00,Premier League,Matchweek 36,Sat,Home,L,0,2,Brighton,...,5,0,1.333333,0.333333,10.000000,2.333333,18.900000,0.666667,0.000000,0.000000
41,2025-05-20,20:00,Premier League,Matchweek 37,Tue,Away,L,2,4,Crystal Palace,...,1,0,1.000000,1.000000,12.000000,2.666667,17.200000,0.333333,0.000000,0.000000


In [57]:
matches_rolling.index = range(matches_rolling.shape[0])

In [60]:
def make_predictions(data, predictors):
    train = data[data["date"] < '2024-01-01']
    test = data[data["date"] > '2024-01-01']
    rf.fit(train[predictors], train["target"])
    preds = rf.predict(test[predictors])
    combined = pd.DataFrame(dict(actual=test["target"], predicted=preds), index=test.index)
    precision = precision_score(test["target"], preds)
    return combined, precision


In [61]:
combined, precision = make_predictions(matches_rolling, predictors + new_cols)

In [62]:
precision

0.5117647058823529

In [63]:
combined = combined.merge(matches_rolling[["date", "team", "opponent", "result"]], left_index=True, right_index=True)

In [64]:
combined

Unnamed: 0,actual,predicted,date,team,opponent,result
17,1,0,2024-01-20,Arsenal,Crystal Palace,W
18,1,1,2024-01-30,Arsenal,Nott'ham Forest,W
19,1,1,2024-02-04,Arsenal,Liverpool,W
20,1,0,2024-02-11,Arsenal,West Ham,W
21,1,1,2024-02-17,Arsenal,Burnley,W
...,...,...,...,...,...,...
1446,1,1,2025-04-26,Wolverhampton Wanderers,Leicester City,W
1447,0,0,2025-05-02,Wolverhampton Wanderers,Manchester City,L
1448,0,0,2025-05-10,Wolverhampton Wanderers,Brighton,L
1449,0,0,2025-05-20,Wolverhampton Wanderers,Crystal Palace,L


In [69]:
# normalise names
class MissingDict(dict):
    __missing__ = lambda self, key: key

map_values = {
"Brighton and Hove Albion": "Brighton",
"Manchester United": "Manchester Utd",
"Newcastle United": "Newcastle Utd",
"Tottenham Hotspur": "Tottenham",
"West Ham United": "West Ham",
"Wolverhampton Wanderers": "Wolves"
}

mapping = MissingDict(**map_values)


In [70]:
combined["new_team"] = combined["team"].map(mapping)

In [71]:
combined

Unnamed: 0,actual,predicted,date,team,opponent,result,new_team
17,1,0,2024-01-20,Arsenal,Crystal Palace,W,Arsenal
18,1,1,2024-01-30,Arsenal,Nott'ham Forest,W,Arsenal
19,1,1,2024-02-04,Arsenal,Liverpool,W,Arsenal
20,1,0,2024-02-11,Arsenal,West Ham,W,Arsenal
21,1,1,2024-02-17,Arsenal,Burnley,W,Arsenal
...,...,...,...,...,...,...,...
1446,1,1,2025-04-26,Wolverhampton Wanderers,Leicester City,W,Wolves
1447,0,0,2025-05-02,Wolverhampton Wanderers,Manchester City,L,Wolves
1448,0,0,2025-05-10,Wolverhampton Wanderers,Brighton,L,Wolves
1449,0,0,2025-05-20,Wolverhampton Wanderers,Crystal Palace,L,Wolves


In [None]:
merged = combined.merge(combined, left_on=["date", "new_team"], right_on=["date", "oponent"])