In [1]:
import pandas as pd 
from sklearn.ensemble import RandomForestClassifier ##importing machine learning for non linear data
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score

# Load the dataset and set the index
matches = pd.read_csv("matches.csv", index_col = 0)

# Preprocessing: Convert necessary columns to numeric formats for modeling
matches["date"] = pd.to_datetime(matches["date"]) # Convert the date column to datetime
matches["h/a"] = matches["venue"].astype("category").cat.codes # Encode venue as 1 (home) or 0 (away)
matches["opp"] = matches["opponent"].astype("category").cat.codes # Convert opponents to numerical labels
matches["hour"] = matches["time"].str.replace(":.+", "", regex=True).astype("int") # Extract and convert hour to int
matches["day"] = matches["date"].dt.dayofweek  # Convert day of the week to a number (0=Monday, 6=Sunday)

matches["target"] = (matches["result"] == "W").astype("int") # Define the target column: Win ("W") as 1, others as 0

rf = RandomForestClassifier(n_estimators = 100, min_samples_split=10, random_state=1) # Initialize the RandomForestClassifier with specific parameters
train = matches[matches["date"] < '2022-01-01'] # Split the data into training (before 2022) and test (after 2022)
test = matches[matches["date"] > '2022-01-01']
predictors = ["h/a", "opp", "hour", "day"] # List of predictor columns
rf.fit(train[predictors], train["target"]) # Train the random forest model on the training set
RandomForestClassifier(min_samples_split = 10, n_estimators = 100, random_state = 1)
preds = rf.predict(test[predictors]) # making prediction

acc = accuracy_score(test["target"], preds) # calculate testing accuracy
acc

combined = pd.DataFrame(dict(actual=test["target"], prediction=preds))
pd.crosstab(index=combined["actual"], columns=combined["prediction"])

test_acc = accuracy_score(test["target"], preds) # test accuracy score

precision_score(test["target"], preds) # precision score for positive class predictions (wins)

# group matches by team for further analysis
grouped_matches = matches.groupby("team") 
group = grouped_matches.get_group("Manchester United").sort_values("date")

# function to compute rolling average of key statistics over the last 3 matches 
def rolling_averages(group, cols, new_cols): 
    group = group.sort_values("date") # sort matches by date 
    rolling_stats = group[cols].rolling(3, closed='left').mean() #compute rolling averages over last 3 games
    group[new_cols] = rolling_stats # add rolling averages to new columns
    group = group.dropna(subset=new_cols) # drop rows with missing values for rolling averages
    return group 

# define columns for calculating rolling averages and the new column names
cols = ["gf", "ga", "sh", "sot", "dist", "fk", "pk", "pkatt"] 
new_cols = [f"{c}_rolling" for c in cols] # creating new columns with rolling average values 

rolling_averages(group, cols, new_cols) # calling function and generating average of last 3 games

#Apply the rolling average function to each team
matches_rolling = matches.groupby("team").apply(lambda x: rolling_averages(x, cols, new_cols))
matches_rolling = matches_rolling.droplevel('team') # remove the multi-level index

#reset the index for the rolling averages DataFrame
matches_rolling.index = range(matches_rolling.shape[0]) ## adding new index
matches_rolling

# function to make predictions using the trained model and additional rolling average features
def make_predictions(data, predictors): 
    train = data[data["date"] < '2022-01-01'] # split training data 
    test = data[data["date"] > '2022-01-01'] # split test data
    rf.fit(train[predictors], train["target"]) # train the model on training data
    preds = rf.predict(test[predictors]) # make predictions on test data
    combined = pd.DataFrame(dict(actual=test["target"], prediction=preds), index=test.index)
    precision = precision_score(test["target"], preds) # calculate precision
    return combined, precision # return combined predictions and precision score

# make predictions with rolling averages included as additional predictors
combined, precision = make_predictions(matches_rolling, predictors + new_cols)
precision
combined 

# merge the prediction results with the match details
combined = combined.merge(matches_rolling[["date", "team", "opponent", "result"]], left_index = True, right_index = True)
combined

#custom dictionary class to handle missing team mappings 
class MissingDict(dict):
    __missing__ = lambda self, key: key # return the key itself if it's missing 

#mapping of team names to their shortened forms
map_values = {
    "Brighton and Hove Albion": "Brighton",
    "Manchester United": "Manchester Utd",
    "Tottenham Hotspur": "Tottenham", 
    "West Ham United": "West Ham", 
    "Wolverhampton Wanderers": "Wolves"
}
mapping = MissingDict(**map_values)
mapping["West Ham United"]

#map team names to their shortened versions
combined["new_team"] = combined["team"].map(mapping)
combined

#merge home and away team predictions for the same match
merged = combined.merge(combined, left_on=["date", "new_team"], right_on=["date", "opponent"]) ## finding both the home and away team predictions and merging them 
merged

  matches_rolling = matches.groupby("team").apply(lambda x: rolling_averages(x, cols, new_cols))


Unnamed: 0,actual_x,prediction_x,date,team_x,opponent_x,result_x,new_team_x,actual_y,prediction_y,team_y,opponent_y,result_y,new_team_y
0,0,1,2022-01-23,Arsenal,Burnley,D,Arsenal,0,0,Burnley,Arsenal,D,Burnley
1,1,0,2022-02-10,Arsenal,Wolves,W,Arsenal,0,0,Wolverhampton Wanderers,Arsenal,L,Wolves
2,1,0,2022-02-19,Arsenal,Brentford,W,Arsenal,0,0,Brentford,Arsenal,L,Brentford
3,1,1,2022-02-24,Arsenal,Wolves,W,Arsenal,0,0,Wolverhampton Wanderers,Arsenal,L,Wolves
4,1,1,2022-03-06,Arsenal,Watford,W,Arsenal,0,0,Watford,Arsenal,L,Watford
...,...,...,...,...,...,...,...,...,...,...,...,...,...
242,1,0,2022-03-13,Wolverhampton Wanderers,Everton,W,Wolves,0,0,Everton,Wolves,L,Everton
243,0,0,2022-03-18,Wolverhampton Wanderers,Leeds United,L,Wolves,1,0,Leeds United,Wolves,W,Leeds United
244,1,0,2022-04-02,Wolverhampton Wanderers,Aston Villa,W,Wolves,0,0,Aston Villa,Wolves,L,Aston Villa
245,0,1,2022-04-08,Wolverhampton Wanderers,Newcastle Utd,L,Wolves,1,0,Newcastle United,Wolves,W,Newcastle United


In [2]:
print(f"Testing Accuracy: {acc}")
print(f"Precision: {precision}")
combined = pd.DataFrame(dict(actual=test["target"], prediction=preds))
confusion_matrix = pd.crosstab(index=combined["actual"], columns=combined["prediction"])
print(confusion_matrix)
combined, precision = make_predictions(matches_rolling, predictors + new_cols)
print(f"Precision with rolling averages: {precision}")
print(merged)



Testing Accuracy: 0.6123188405797102
Precision: 0.6739130434782609
prediction    0   1
actual             
0           140  32
1            75  29
Precision with rolling averages: 0.6739130434782609
     actual_x  prediction_x       date                   team_x  \
0           0             1 2022-01-23                  Arsenal   
1           1             0 2022-02-10                  Arsenal   
2           1             0 2022-02-19                  Arsenal   
3           1             1 2022-02-24                  Arsenal   
4           1             1 2022-03-06                  Arsenal   
..        ...           ...        ...                      ...   
242         1             0 2022-03-13  Wolverhampton Wanderers   
243         0             0 2022-03-18  Wolverhampton Wanderers   
244         1             0 2022-04-02  Wolverhampton Wanderers   
245         0             1 2022-04-08  Wolverhampton Wanderers   
246         0             0 2022-04-24  Wolverhampton Wanderers 