In [92]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score


In [6]:
matches = pd.read_csv("Bundesliga_Matches.csv", index_col = 0)

In [9]:
matches.head()

Unnamed: 0,date,time,comp,round,day,venue,result,gf,ga,opponent,...,match report,notes,sh,sot,dist,fk,pk,pkatt,season,team
1,2023-08-19,15:30,Bundesliga,Matchweek 1,Sat,Home,W,3,2,RB Leipzig,...,Match Report,,11.0,7.0,19.0,0.0,0,0,2024,Bayer Leverkusen
2,2023-08-26,18:30,Bundesliga,Matchweek 2,Sat,Away,W,3,0,Gladbach,...,Match Report,,24.0,11.0,15.8,0.0,0,0,2024,Bayer Leverkusen
3,2023-09-02,15:30,Bundesliga,Matchweek 3,Sat,Home,W,5,1,Darmstadt 98,...,Match Report,,25.0,13.0,17.3,1.0,0,0,2024,Bayer Leverkusen
4,2023-09-15,20:30,Bundesliga,Matchweek 4,Fri,Away,D,2,2,Bayern Munich,...,Match Report,,12.0,4.0,20.7,1.0,1,1,2024,Bayer Leverkusen
6,2023-09-24,15:30,Bundesliga,Matchweek 5,Sun,Home,W,4,1,Heidenheim,...,Match Report,,20.0,9.0,16.7,0.0,1,1,2024,Bayer Leverkusen


In [17]:
matches.dtypes

date             object
time             object
comp             object
round            object
day              object
venue            object
result           object
gf                int64
ga                int64
opponent         object
xg              float64
xga             float64
poss            float64
attendance      float64
captain          object
formation        object
referee          object
match report     object
notes            object
sh              float64
sot             float64
dist            float64
fk              float64
pk                int64
pkatt             int64
season            int64
team             object
dtype: object

In [19]:
matches["date"] = pd.to_datetime(matches["date"])
matches["venue_num"] = matches["venue"].astype("category").cat.codes
matches["opp_num"] = matches["opponent"].astype("category").cat.codes
matches["hour"] = matches["time"].str.replace(":.+", "", regex = True).astype("int")
matches["day_num"] = matches["date"].dt.dayofweek
matches["target"] = (matches["result"] == "W").astype("int")
referees_df = matches[['referee']].drop_duplicates().reset_index(drop=True)
referees_df["referee_num"] = referees_df["referee"].astype('category').cat.codes
matches = matches.merge(referees_df, on='referee', how='left')

In [22]:
def h2h_performance(matches):
    matches = matches.sort_values("date")
    historical_matches = matches[["date", "team", "opponent", "gf", "ga"]].copy()
    
    def get_h2h(row, historical_matches):
        historical = historical_matches[(historical_matches["team"] == row["team"]) & (historical_matches["opponent"] == row["opponent"]) & (historical_matches["date"] < row["date"])]
        if not historical.empty:
            return historical[["gf", "ga"]].mean().tolist()
        else:
            return [None, None]
    h2h_stats = matches.apply(lambda row: get_h2h(row, historical_matches), axis=1, result_type='expand')
    matches[['h2h_gf_avg', 'h2h_ga_avg']] = h2h_stats

    return matches.dropna(subset=['h2h_gf_avg', 'h2h_ga_avg'])

In [25]:
matches = h2h_performance(matches)

In [28]:
predictors = ["venue_num", "opp_num", "hour", "day_num"]


In [31]:
def rolling_averages(group, cols, new_cols):
    group = group.sort_values("date")
    rolling_stats = group[cols].rolling(3, closed = 'left').mean()
    group[new_cols] = rolling_stats
    group = group.dropna(subset = new_cols)
    return group

In [34]:
cols = ["gf", "ga", "sh", "sot", "dist", "fk", "pk", "pkatt", "xg", "xga"]
new_cols = [f"{c}_rolling" for c in cols]

In [43]:
cutoff_date = pd.to_datetime('2024-01-01')
training_data = matches[matches["date"] < cutoff_date]
testing_data = matches[matches["date"] >= cutoff_date]

In [45]:
matches_rolling = matches.groupby("team").apply(lambda x: rolling_averages(x, cols, new_cols))
matches_rolling = matches_rolling.droplevel('team')
matches_rolling.index = range(matches_rolling.shape[0])
matches_rolling

Unnamed: 0,date,time,comp,round,day,venue,result,gf,ga,opponent,...,gf_rolling,ga_rolling,sh_rolling,sot_rolling,dist_rolling,fk_rolling,pk_rolling,pkatt_rolling,xg_rolling,xga_rolling
0,2021-02-19,20:30,Bundesliga,Matchweek 22,Fri,Home,L,0,3,Wolfsburg,...,1.666667,3.666667,8.000000,3.333333,16.933333,0.000000,0.0,0.0,0.966667,2.066667
1,2021-02-27,15:30,Bundesliga,Matchweek 23,Sat,Away,L,0,3,Dortmund,...,1.333333,3.000000,7.666667,2.666667,19.100000,0.000000,0.0,0.0,1.000000,1.766667
2,2021-03-07,18:00,Bundesliga,Matchweek 24,Sun,Home,D,0,0,Union Berlin,...,1.000000,3.000000,7.333333,2.333333,20.066667,0.000000,0.0,0.0,0.933333,2.266667
3,2021-03-10,18:30,Bundesliga,Matchweek 20,Wed,Home,L,0,2,Werder Bremen,...,0.000000,2.000000,6.333333,2.000000,19.566667,0.000000,0.0,0.0,0.600000,1.900000
4,2021-03-14,13:30,Bundesliga,Matchweek 25,Sun,Away,W,2,1,Leverkusen,...,0.000000,1.666667,12.666667,4.000000,17.200000,0.333333,0.0,0.0,1.433333,1.933333
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3740,2024-04-20,15:30,Bundesliga,Matchweek 30,Sat,Home,W,1,0,Bochum,...,1.000000,2.000000,13.000000,4.000000,17.333333,0.333333,0.0,0.0,1.300000,0.966667
3741,2024-04-27,15:30,Bundesliga,Matchweek 31,Sat,Away,W,2,1,Freiburg,...,0.666667,2.000000,10.666667,4.000000,18.633333,0.333333,0.0,0.0,0.800000,1.233333
3742,2024-05-04,15:30,Bundesliga,Matchweek 32,Sat,Home,W,3,0,Darmstadt 98,...,1.000000,1.333333,10.333333,4.000000,21.200000,0.333333,0.0,0.0,0.566667,1.600000
3743,2024-05-12,17:30,Bundesliga,Matchweek 33,Sun,Away,L,0,2,Bayern Munich,...,2.000000,0.333333,14.000000,5.333333,19.533333,1.000000,0.0,0.0,1.166667,1.300000


In [47]:
rf = RandomForestClassifier(n_estimators=100, min_samples_split=20, random_state=1)
rf.fit(training_data[predictors], training_data["target"])

In [50]:
predictions = rf.predict(testing_data[predictors])

In [53]:
accuracy = accuracy_score(testing_data["target"], predictions)
precision = precision_score(testing_data["target"], predictions)

In [56]:
predictors += ["hour", "day_num", "gf_rolling", "ga_rolling", "xg_rolling", "xga_rolling", "fk_rolling", 
    "pk_rolling", "pkatt_rolling", "sh_rolling", "sot_rolling", "h2h_gf_avg", "h2h_ga_avg", "referee_num"]

In [59]:
def make_predictions(data, predictors):
    training = data[data["date"] < '2024-01-01']
    test = data[data["date"] >= '2024-01-01']
    rf.fit(training[predictors], training["target"])
    preds = rf.predict(test[predictors])
    combined = pd.DataFrame(dict(actual = test["target"], prediction = preds), index = test.index)
    precision = precision_score(test["target"], preds)
    return combined, precision

In [62]:
combined, precision = make_predictions(matches_rolling, predictors + new_cols)

In [64]:
precision

0.5945945945945946

In [68]:
combined = combined.merge(matches_rolling[["date", "team", "opponent", "result"]], left_index = True, right_index = True)

In [71]:
merged = combined.merge(combined, left_on = ["date", "team"], right_on = ["date", "opponent"])
merged

Unnamed: 0,actual_x,prediction_x,date,team_x,opponent_x,result_x,actual_y,prediction_y,team_y,opponent_y,result_y
0,0,0,2024-01-13,Augsburg,Leverkusen,L,1,1,Bayer Leverkusen,Augsburg,W
1,0,0,2024-01-13,Augsburg,Leverkusen,L,1,1,Bayer Leverkusen,Augsburg,W
2,1,0,2024-01-21,Augsburg,Gladbach,W,0,0,Monchengladbach,Augsburg,L
3,0,0,2024-01-27,Augsburg,Bayern Munich,L,1,1,Bayern Munich,Augsburg,W
4,0,0,2024-01-27,Augsburg,Bayern Munich,L,1,0,Bayern Munich,Augsburg,W
...,...,...,...,...,...,...,...,...,...,...,...
390,1,0,2024-04-27,Wolfsburg,Freiburg,W,0,0,Freiburg,Wolfsburg,L
391,1,0,2024-05-04,Wolfsburg,Darmstadt 98,W,0,0,Darmstadt 98,Wolfsburg,L
392,0,0,2024-05-12,Wolfsburg,Bayern Munich,L,1,1,Bayern Munich,Wolfsburg,W
393,0,0,2024-05-12,Wolfsburg,Bayern Munich,L,1,0,Bayern Munich,Wolfsburg,W
