In [593]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score

In [594]:
# Load the three CSV files and stack them into a single frame.
matches1 = pd.read_csv("assets/2022-2023.csv")
matches2 = pd.read_csv("assets/2023-2024.csv")
matches3 = pd.read_csv("assets/2024-2025.csv")
# ignore_index=True gives the combined frame a fresh 0..n-1 index; without it
# each file keeps its own row numbers and the same label appears several times.
matches = pd.concat([matches1, matches2, matches3], ignore_index=True)

In [595]:
# Convert the date/time column to datetime values; later cells use its `.dt`
# accessors and compare it against a cutoff date.
matches["Date Time (US Eastern)"] = pd.to_datetime(matches["Date Time (US Eastern)"])

  matches["Date Time (US Eastern)"] = pd.to_datetime(matches["Date Time (US Eastern)"])


In [596]:
# Advantages and disadvantages to consider when teams play against each other,
# encoded as numeric columns.

# True where the row's team is the home side; reused by both encodings below.
is_home = matches["Team"] == matches["Home Team"]

# 0 = the row's team is the home side, 1 = otherwise.
matches["homeAway_code"] = np.where(is_home, 0, 1)

# Pick the opponent's name first and encode it once. Encoding "Away Team" and
# "Home Team" separately would only give a club the same integer in both
# columns if the two columns happened to contain exactly the same set of teams.
opponent = np.where(is_home, matches["Away Team"], matches["Home Team"])
matches["opponent_code"] = pd.Categorical(opponent).codes

# Day of week as an integer (Monday=0 ... Sunday=6) and the calendar date.
matches["day_code"] = matches["Date Time (US Eastern)"].dt.dayofweek
matches["date"] = matches["Date Time (US Eastern)"].dt.date

In [597]:
# Label every row from the point of view of "Team":
#   2 = draw, 1 = the row's team won, 0 = anything else.
home_goals, away_goals = matches["Home Goal"], matches["Away Goal"]
team_is_home = matches["Team"] == matches["Home Team"]
team_is_away = matches["Team"] == matches["Away Team"]

# np.select returns the choice for the first condition that holds, so the
# draw check takes precedence exactly as in a nested if/else chain.
matches["target"] = np.select(
    [
        home_goals == away_goals,
        (home_goals > away_goals) & team_is_home,
        (home_goals < away_goals) & team_is_away,
    ],
    [2, 1, 1],
    default=0,
)

In [598]:
# Classifier instance reused by the later fit/predict cells; random_state is
# pinned so repeated runs build the same forest.
rf = RandomForestClassifier(
    n_estimators=50,
    min_samples_split=10,
    random_state=1,
)

In [599]:
# Feature columns for the baseline model (all created in the encoding cell).
predictors = [
    "homeAway_code",
    "opponent_code",
    "day_code",
]

In [600]:
# Training rows: everything dated on or before the cutoff.
train = matches.loc[matches["Date Time (US Eastern)"] <= '2024-05-01']

In [601]:
# Evaluation rows: everything dated after the cutoff.
test = matches.loc[matches["Date Time (US Eastern)"] > '2024-05-01']

In [602]:
# Fit the baseline forest on the training rows.
rf.fit(X=train[predictors], y=train["target"])

In [603]:
# Predicted class for every evaluation row.
preds = rf.predict(X=test[predictors])

In [604]:
# Share of evaluation rows whose predicted class matches the actual one.
accuracy = accuracy_score(y_true=test["target"], y_pred=preds)
accuracy

0.4722222222222222

In [605]:
# Actual and predicted labels side by side, indexed like the evaluation rows.
combine = pd.DataFrame({"actual": test["target"], "prediction": preds})

In [606]:
# Confusion table: rows are actual labels, columns are predicted labels.
pd.crosstab(combine["actual"], combine["prediction"])

prediction,0,1,2
actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,67,32,11
1,36,60,14
2,27,32,9


In [607]:
# Precision of the baseline predictions, weighted across the three classes.
precision_score(
    y_true=test["target"],
    y_pred=preds,
    average='weighted',
    zero_division=0,
)

0.4441601185552798

In [608]:
# Per-team grouping of the matches. NOTE(review): not referenced again in the
# cells visible here (the rolling-feature cell groups `matches` itself).
grouped_matches = matches.groupby(by="Team")

In [609]:
# Per-match stats to turn into rolling-average features.
columns = [
    "shotsOnTarget",
    "accuratePasses",
    "blockedShots",
    "effectiveTackles",
    "effectiveClearance",
]
# Name of the derived feature for each stat above, in the same order.
new_columns = [stat + "_rolling" for stat in columns]

In [610]:
""" 
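This function builds "recent form" features: for every match it averages the
selected stats over the team's previous 3 games. It does not predict anything
itself; the averages are used later as inputs to the prediction model.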
"""
def rolling_averages(group, cols, new_cols, window=3):
    """Add lagged rolling-mean features to one group of matches.

    The rows are ordered by "Date Time (US Eastern)"; each column in ``cols``
    is then averaged over the ``window`` rows *preceding* the current one.
    The ``shift(1)`` keeps a match's own stats out of its own features.

    Parameters
    ----------
    group : pandas.DataFrame
        Must contain "Date Time (US Eastern)" and every column in ``cols``.
        The notebook passes the matches of a single team.
    cols : list of str
        Source stat columns to average.
    new_cols : list of str
        Names for the derived columns, paired with ``cols`` by position.
    window : int, default 3
        Number of preceding rows in each average.

    Returns
    -------
    pandas.DataFrame
        A date-sorted copy of ``group`` with ``new_cols`` added. Rows where
        any new column is NaN (for example the first ``window`` rows, which
        lack a full history) are dropped. The input frame is not modified.
    """
    # sort_values returns a new frame, so the assignments below never touch
    # the caller's data.
    ordered = group.sort_values("Date Time (US Eastern)")
    rolling_stats = ordered[cols].shift(1).rolling(window=window).mean()
    for col, new_col in zip(cols, new_cols):
        ordered[new_col] = rolling_stats[col]
    return ordered.dropna(subset=new_cols)

In [620]:
# Build the rolling features one team at a time and stack the results.
# Iterating over the groups hands each team's full frame (including the "Team"
# column, which later cells select) to rolling_averages without relying on
# groupby.apply passing the grouping column to the function.
# ignore_index=True yields a clean 0..n-1 index in a single step.
matches_rolling = pd.concat(
    [
        rolling_averages(team_matches, columns, new_columns)
        for _, team_matches in matches.groupby("Team")
    ],
    ignore_index=True,
)

  matches_rolling = matches.groupby("Team").apply(lambda x: rolling_averages(x, columns, new_columns))


In [612]:
def make_predictions(data, predictors, cutoff="2024-05-01", model=None, average="macro"):
    """Fit on rows up to ``cutoff``, predict the rows after it, and score them.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain "Date Time (US Eastern)", "target" and every column in
        ``predictors``.
    predictors : list of str
        Feature columns passed to the model.
    cutoff : str or datetime-like, default "2024-05-01"
        Rows dated on or before this go to training; later rows are evaluated.
    model : estimator with ``fit``/``predict``, optional
        Defaults to the notebook-level ``rf``. The model is fitted in place,
        so any earlier fit on that object is replaced.
    average : str, default "macro"
        Averaging mode forwarded to ``precision_score``.

    Returns
    -------
    (pandas.DataFrame, float)
        A frame with "actual" and "prediction" columns indexed like the
        evaluation rows, and the precision of those predictions.

    Raises
    ------
    ValueError
        If either side of the split is empty.
    """
    if model is None:
        model = rf

    # Two explicit comparisons (rather than a mask and its negation) so rows
    # with a missing date fall into neither split.
    when = data["Date Time (US Eastern)"]
    train = data[when <= cutoff]
    test = data[when > cutoff]
    if train.empty or test.empty:
        raise ValueError(
            f"cutoff {cutoff!r} leaves {len(train)} training and {len(test)} test rows; both must be non-empty"
        )

    model.fit(train[predictors], train["target"])
    preds = model.predict(test[predictors])
    combine = pd.DataFrame(dict(actual=test["target"], prediction=preds), index=test.index)
    precision = precision_score(test["target"], preds, average=average, zero_division=0)
    return combine, precision

In [618]:
# Score the model on the baseline predictors plus the rolling-average features.
combined, precision = make_predictions(matches_rolling, predictors + new_columns)

# Attach the match details by index so each prediction can be read next to
# the fixture it belongs to.
combined = combined.join(
    matches_rolling[["date", "Team", "Home Team", "Away Team", "target"]],
    how='inner',
)

# NOTE(review): the variable is named for Liverpool but the filter selects
# Arsenal rows; confirm which one was intended.
combined_liverpool = combined.loc[combined["Team"] == "Arsenal"]
combined_liverpool

Unnamed: 0,actual,prediction,date,Team,Home Team,Away Team,target
154,1,1,2024-05-04,Arsenal,Arsenal,AFC Bournemouth,1
155,1,1,2024-05-12,Arsenal,Manchester United,Arsenal,1
156,1,1,2024-05-19,Arsenal,Arsenal,Everton,1
157,1,1,2024-08-17,Arsenal,Arsenal,Wolverhampton Wanderers,1
158,1,1,2024-08-24,Arsenal,Aston Villa,Arsenal,1
159,2,1,2024-08-31,Arsenal,Arsenal,Brighton & Hove Albion,2
160,1,0,2024-09-15,Arsenal,Tottenham Hotspur,Arsenal,1
161,2,0,2024-09-22,Arsenal,Manchester City,Arsenal,2
162,1,2,2024-09-28,Arsenal,Arsenal,Leicester City,1
163,1,1,2024-10-05,Arsenal,Arsenal,Southampton,1


In [614]:
# NOTE(review): `accuracy` is only assigned in the earlier baseline cell, from
# the three-predictor fit. make_predictions does not update it, so this value
# does not describe the rolling-feature model scored above.
accuracy

0.4722222222222222

In [615]:
# Precision returned by make_predictions for the rolling-feature run. It is
# macro-averaged there, whereas the earlier standalone precision_score call
# used average='weighted', so the two numbers are not directly comparable.
precision

0.4500805152979066