In [7]:
import pandas as pd
import numpy as np
import os
import joblib
from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [8]:
WEEK = 12

In [9]:
# load in model
model = joblib.load('model_v2.pkl')

In [10]:
df = pd.read_csv('new_data.csv')

In [11]:
# create a variable called current day
current_day = pd.to_datetime('today').date()

# convert 'Date' column to datetime
df['Date'] = pd.to_datetime(df['Date']).dt.date

# create a new df filter out the rows where the date is greater than the current day
model_df = df[df['Date'] < current_day]

In [12]:
# remove rows with NaN values
model_df = model_df.dropna()

In [13]:
model_df['Attendance'] = model_df['Attendance'].fillna(0) # setting null values to 0 because the NaN are from Covid-19 games with no attendance

model_df['Attendance'] = model_df['Attendance'].astype(int)

In [14]:
model_df[['home_goals', 'away_goals']] = model_df['Score'].str.split('–', expand=True).astype(int)

# rename xG and xG.1 columns to home_xG and away_xG
model_df.rename(columns={'xG': 'home_xg', 'xG.1': 'away_xg'}, inplace=True)

In [15]:
def get_result(row):
    if row['home_goals'] > row['away_goals']:
        return 'Home Win'
    elif row['home_goals'] < row['away_goals']:
        return 'Away Win'
    else:
        return 'Draw'
    
model_df['Result'] = model_df.apply(get_result, axis=1)

In [16]:
model_df['Day'] = pd.to_datetime(model_df['Date']).dt.day_name()

model_df['Season'] = model_df['Season'].apply(lambda x: x.split('-')[1])

In [17]:
model_df.reset_index(drop=True, inplace=True)

In [18]:
for x in model_df.Home.unique():
    temp_df = model_df[(model_df['Home'] == x) | (model_df['Away'] == x)]
    temp_df = temp_df.sort_values(['Date'])

    temp_df['goal_value_to_calculate'] = temp_df.apply(lambda y: y['home_goals'] if y['Home'] == x else y['away_goals'], axis=1)
    temp_df['rolling_avg_goals'] = temp_df['goal_value_to_calculate'].rolling(window = 5, closed = 'left', min_periods = 1).mean()

    for index, row in temp_df.iterrows():
        if row['Home'] == x:
            model_df.at[index, 'home_rolling_avg_goals'] = row['rolling_avg_goals']
        else:
            model_df.at[index, 'away_rolling_avg_goals'] = row['rolling_avg_goals']

In [19]:
for x in model_df.Home.unique():
    temp_df = model_df[(model_df['Home'] == x) | (model_df['Away'] == x)]
    temp_df = temp_df.sort_values(['Date'])

    temp_df['xg_value_to_calculate'] = temp_df.apply(lambda y: y['home_xg'] if y['Home'] == x else y['away_xg'], axis=1)
    temp_df['rolling_avg_xG'] = temp_df['xg_value_to_calculate'].rolling(window = 5, closed = 'left', min_periods = 1).mean()

    for index, row in temp_df.iterrows():
        if row['Home'] == x:
            model_df.at[index, 'home_rolling_avg_xG'] = row['rolling_avg_xG']
        else:
            model_df.at[index, 'away_rolling_avg_xG'] = row['rolling_avg_xG']

In [20]:
model_df = model_df.dropna(subset=['home_rolling_avg_goals', 'away_rolling_avg_goals', 'home_rolling_avg_xG', 'away_rolling_avg_xG'])

In [21]:
# create a new column for the central moving average of the home goals
model_df['home_goals_cma'] = model_df['home_goals'].expanding().mean()
model_df['home_xg_cma'] = model_df['home_xg'].expanding().mean()

# away goals
model_df['away_goals_cma'] = model_df['away_goals'].expanding().mean()
model_df['away_xg_cma'] = model_df['away_xg'].expanding().mean()

# create a new column by dividing the home goals by central moving average
model_df['home_goals_div_cma'] = model_df['home_goals'] / model_df['home_goals_cma']
model_df['home_xg_div_cma'] = model_df['home_xg'] / model_df['home_xg_cma']

# away goals
model_df['away_goals_div_cma'] = model_df['away_goals'] / model_df['away_goals_cma']
model_df['away_xg_div_cma'] = model_df['away_xg'] / model_df['away_xg_cma']

In [22]:
# set Wk column to integer
model_df['Wk'] = model_df['Wk'].astype(int)

# create a new column called quarter. divide the season into 4 quarters using the Wk column and assign a value between 1 and 4
model_df['quarter'] = np.where(model_df['Wk'].astype(int) <= 9, 1,
                np.where(model_df['Wk'].astype(int) <= 18, 2,
                np.where(model_df['Wk'].astype(int) <= 27, 3, 4)))

# using home_goals_div_cma, take the mean of the column for each quarter. put into new column called home_goals_seasonality
model_df['home_goals_seasonality'] = model_df.groupby('quarter')['home_goals_div_cma'].transform('mean')
model_df['home_xg_seasonality'] = model_df.groupby('quarter')['home_xg_div_cma'].transform('mean')

# away goals
model_df['away_goals_seasonality'] = model_df.groupby('quarter')['away_goals_div_cma'].transform('mean')
model_df['away_xg_seasonality'] = model_df.groupby('quarter')['away_xg_div_cma'].transform('mean')

# create a new column called home_goals_deseasonalised by dividing home_goals by home_goals_seasonality
model_df['home_goals_deseasonalised'] = model_df['home_goals'] / model_df['home_goals_seasonality']
model_df['home_xg_deseasonalised'] = model_df['home_xg'] / model_df['home_xg_seasonality']

# away goals
model_df['away_goals_deseasonalised'] = model_df['away_goals'] / model_df['away_goals_seasonality']
model_df['away_xg_deseasonalised'] = model_df['away_xg'] / model_df['away_xg_seasonality']

In [23]:
model_df['Season'] = model_df['Season'].astype(int)
# set home goals and away goals to integer
model_df['home_goals'] = model_df['home_goals'].astype(int)
model_df['away_goals'] = model_df['away_goals'].astype(int)

In [24]:
# Ensure 'Result' is categorical or integer-based
model_df['Result'] = model_df['Result'].astype('category')  # or use int depending on your encoding

# Ensure 'home_goals' and 'away_goals' are integers
model_df['home_goals'] = pd.to_numeric(model_df['home_goals'], errors='coerce').fillna(0).astype(int)
model_df['away_goals'] = pd.to_numeric(model_df['away_goals'], errors='coerce').fillna(0).astype(int)

In [25]:
features = [column for column in model_df.drop(columns=[
    'Date', 'Day', 'Home', 'Away', 'Venue', 'Time', 'home_xg', 'away_xg', 'home_goals', 'Score', 'Referee', 'Attendance',
    'away_goals', 'Result', 'Season', 'quarter', 'home_goals_cma', 'home_xg_cma', 
    'away_goals_cma', 'away_xg_cma', 'home_goals_div_cma', 'home_xg_div_cma', 
    'away_goals_div_cma', 'away_xg_div_cma'])]

In [26]:
# create a new dataframe with the teams and the features

teams = df['Home'].unique()

teams_df = pd.DataFrame(teams, columns=['Team'])

# assign the latest 'home_rolling_avg_goals', 'away_rolling_avg_goals' etc. to the teams_df
for team in teams:
    home_rolling_avg_goals = model_df[model_df['Home'] == team]['home_rolling_avg_goals'].iloc[-1]
    away_rolling_avg_goals = model_df[model_df['Away'] == team]['away_rolling_avg_goals'].iloc[-1]
    home_rolling_avg_xG = model_df[model_df['Home'] == team]['home_rolling_avg_xG'].iloc[-1]
    away_rolling_avg_xG = model_df[model_df['Away'] == team]['away_rolling_avg_xG'].iloc[-1]
    home_goals_seasonality = model_df[model_df['Home'] == team]['home_goals_seasonality'].iloc[-1]
    home_xg_seasonality = model_df[model_df['Home'] == team]['home_xg_seasonality'].iloc[-1]
    away_goals_seasonality = model_df[model_df['Away'] == team]['away_goals_seasonality'].iloc[-1]
    away_xg_seasonality = model_df[model_df['Away'] == team]['away_xg_seasonality'].iloc[-1]
    home_goals_deseasonalised = model_df[model_df['Home'] == team]['home_goals_deseasonalised'].iloc[-1]
    home_xg_deseasonalised = model_df[model_df['Home'] == team]['home_xg_deseasonalised'].iloc[-1]
    away_goals_deseasonalised = model_df[model_df['Away'] == team]['away_goals_deseasonalised'].iloc[-1]
    away_xg_deseasonalised = model_df[model_df['Away'] == team]['away_xg_deseasonalised'].iloc[-1]

    teams_df.loc[teams_df['Team'] == team, 'home_rolling_avg_goals'] = home_rolling_avg_goals
    teams_df.loc[teams_df['Team'] == team, 'away_rolling_avg_goals'] = away_rolling_avg_goals
    teams_df.loc[teams_df['Team'] == team, 'home_rolling_avg_xG'] = home_rolling_avg_xG
    teams_df.loc[teams_df['Team'] == team, 'away_rolling_avg_xG'] = away_rolling_avg_xG
    teams_df.loc[teams_df['Team'] == team, 'home_goals_seasonality'] = home_goals_seasonality
    teams_df.loc[teams_df['Team'] == team, 'home_xg_seasonality'] = home_xg_seasonality
    teams_df.loc[teams_df['Team'] == team, 'away_goals_seasonality'] = away_goals_seasonality
    teams_df.loc[teams_df['Team'] == team, 'away_xg_seasonality'] = away_xg_seasonality
    teams_df.loc[teams_df['Team'] == team, 'home_goals_deseasonalised'] = home_goals_deseasonalised
    teams_df.loc[teams_df['Team'] == team, 'home_xg_deseasonalised'] = home_xg_deseasonalised
    teams_df.loc[teams_df['Team'] == team, 'away_goals_deseasonalised'] = away_goals_deseasonalised
    teams_df.loc[teams_df['Team'] == team, 'away_xg_deseasonalised'] = away_xg_deseasonalised


# create a copy of original df and filter for games this weekend
weekend_df = df.copy()

# filter for Wk 9, season 2024-2025
weekend_df = weekend_df[(weekend_df['Wk'] == WEEK) & (weekend_df['Season'] == '2024-2025')]

# drop columns with null values
weekend_df = weekend_df.drop(columns=['xG', 'xG.1', 'Attendance', 'Referee', 'Score'])


# using Home and Away columns, assign the rolling averages and seasonality values to the weekend_df
for index, row in weekend_df.iterrows():
    home_team = row['Home']
    away_team = row['Away']

    home_rolling_avg_goals = teams_df[teams_df['Team'] == home_team]['home_rolling_avg_goals'].iloc[0]
    away_rolling_avg_goals = teams_df[teams_df['Team'] == away_team]['away_rolling_avg_goals'].iloc[0]
    home_rolling_avg_xG = teams_df[teams_df['Team'] == home_team]['home_rolling_avg_xG'].iloc[0]
    away_rolling_avg_xG = teams_df[teams_df['Team'] == away_team]['away_rolling_avg_xG'].iloc[0]
    home_goals_seasonality = teams_df[teams_df['Team'] == home_team]['home_goals_seasonality'].iloc[0]
    home_xg_seasonality = teams_df[teams_df['Team'] == home_team]['home_xg_seasonality'].iloc[0]
    away_goals_seasonality = teams_df[teams_df['Team'] == away_team]['away_goals_seasonality'].iloc[0]
    away_xg_seasonality = teams_df[teams_df['Team'] == away_team]['away_xg_seasonality'].iloc[0]
    home_goals_deseasonalised = teams_df[teams_df['Team'] == home_team]['home_goals_deseasonalised'].iloc[0]
    home_xg_deseasonalised = teams_df[teams_df['Team'] == home_team]['home_xg_deseasonalised'].iloc[0]
    away_goals_deseasonalised = teams_df[teams_df['Team'] == away_team]['away_goals_deseasonalised'].iloc[0]
    away_xg_deseasonalised = teams_df[teams_df['Team'] == away_team]['away_xg_deseasonalised'].iloc[0]

    weekend_df.at[index, 'home_rolling_avg_goals'] = home_rolling_avg_goals
    weekend_df.at[index, 'away_rolling_avg_goals'] = away_rolling_avg_goals
    weekend_df.at[index, 'home_rolling_avg_xG'] = home_rolling_avg_xG
    weekend_df.at[index, 'away_rolling_avg_xG'] = away_rolling_avg_xG
    weekend_df.at[index, 'home_goals_seasonality'] = home_goals_seasonality
    weekend_df.at[index, 'home_xg_seasonality'] = home_xg_seasonality

    weekend_df.at[index, 'away_goals_seasonality'] = away_goals_seasonality
    weekend_df.at[index, 'away_xg_seasonality'] = away_xg_seasonality
    weekend_df.at[index, 'home_goals_deseasonalised'] = home_goals_deseasonalised
    weekend_df.at[index, 'home_xg_deseasonalised'] = home_xg_deseasonalised
    weekend_df.at[index, 'away_goals_deseasonalised'] = away_goals_deseasonalised
    weekend_df.at[index, 'away_xg_deseasonalised'] = away_xg_deseasonalised


display(weekend_df)

Unnamed: 0,Wk,Day,Date,Time,Home,Away,Venue,Season,home_rolling_avg_goals,away_rolling_avg_goals,home_rolling_avg_xG,away_rolling_avg_xG,home_goals_seasonality,home_xg_seasonality,away_goals_seasonality,away_xg_seasonality,home_goals_deseasonalised,home_xg_deseasonalised,away_goals_deseasonalised,away_xg_deseasonalised
2770,12.0,Sat,2024-11-23,12:30,Leicester City,Chelsea,King Power Stadium,2024-2025,1.8,2.2,1.22,2.26,1.03565,1.036803,1.064295,1.049451,0.965577,0.771603,0.939589,1.048167
2771,12.0,Sat,2024-11-23,15:00,Arsenal,Nott'ham Forest,Emirates Stadium,2024-2025,2.0,1.0,1.86,1.04,1.03565,1.036803,1.06067,1.066417,1.931154,0.868053,2.828402,1.594122
2772,12.0,Sat,2024-11-23,15:00,Bournemouth,Brighton,Vitality Stadium,2024-2025,1.2,2.0,1.32,1.26,1.00385,1.012279,1.064295,1.049451,1.99233,1.975741,0.939589,0.952879
2773,12.0,Sat,2024-11-23,15:00,Aston Villa,Crystal Palace,Villa Park,2024-2025,2.2,0.4,1.34,0.86,1.03565,1.036803,1.064295,1.049451,0.965577,1.736106,1.879178,2.286911
2774,12.0,Sat,2024-11-23,15:00,Everton,Brentford,Goodison Park,2024-2025,1.4,2.4,1.06,2.06,1.03565,1.036803,1.064295,1.049451,0.965577,0.771603,0.939589,0.571728
2775,12.0,Sat,2024-11-23,15:00,Fulham,Wolves,Craven Cottage,2024-2025,1.6,1.4,1.82,0.82,1.00385,1.012279,1.06067,1.066417,1.99233,1.284231,1.885601,1.219035
2776,12.0,Sat,2024-11-23,17:30,Manchester City,Tottenham,Etihad Stadium,2024-2025,2.0,2.4,1.66,2.36,1.03565,1.036803,1.06067,1.066417,0.965577,2.79706,0.0,0.656403
2777,12.0,Sun,2024-11-24,14:00,Southampton,Liverpool,St Mary's Stadium,2024-2025,1.0,1.6,1.18,1.74,1.00385,1.012279,1.06067,1.066417,0.996165,0.691509,1.885601,0.750175
2778,12.0,Sun,2024-11-24,16:30,Ipswich Town,Manchester Utd,Portman Road Stadium,2024-2025,1.4,1.0,1.2,1.42,1.00385,1.012279,1.06067,1.066417,0.996165,0.98787,0.942801,2.156754
2779,12.0,Mon,2024-11-25,20:00,Newcastle Utd,West Ham,St James' Park,2024-2025,0.6,1.6,1.8,1.82,1.00385,1.012279,1.064295,1.049451,0.996165,0.493935,0.0,0.095288


In [27]:
teams_df

Unnamed: 0,Team,home_rolling_avg_goals,away_rolling_avg_goals,home_rolling_avg_xG,away_rolling_avg_xG,home_goals_seasonality,home_xg_seasonality,away_goals_seasonality,away_xg_seasonality,home_goals_deseasonalised,home_xg_deseasonalised,away_goals_deseasonalised,away_xg_deseasonalised
0,Arsenal,2.0,1.8,1.86,1.98,1.03565,1.036803,1.064295,1.049451,1.931154,0.868053,0.939589,1.429319
1,Watford,0.6,0.6,0.78,0.72,1.07499,1.082672,1.066751,1.067687,0.930241,1.293096,0.937426,0.561962
2,Crystal Palace,0.8,0.4,1.14,0.86,1.00385,1.012279,1.064295,1.049451,0.0,1.481805,1.879178,2.286911
3,West Brom,1.0,0.8,0.88,0.84,1.07499,1.082672,1.066751,1.067687,0.930241,1.016004,0.937426,0.936604
4,Chelsea,1.8,2.2,2.04,2.26,1.00385,1.012279,1.064295,1.049451,0.996165,1.481805,0.939589,1.048167
5,Everton,1.4,1.0,1.06,1.14,1.03565,1.036803,1.064295,1.049451,0.965577,0.771603,0.0,1.048167
6,Southampton,1.0,1.0,1.18,0.84,1.00385,1.012279,1.064295,1.049451,0.996165,0.691509,0.0,0.571728
7,Brighton,1.8,2.0,1.26,1.26,1.00385,1.012279,1.064295,1.049451,1.99233,2.272102,0.939589,0.952879
8,Newcastle Utd,0.6,0.6,1.8,1.6,1.00385,1.012279,1.064295,1.049451,0.996165,0.493935,2.818766,1.524607
9,Manchester Utd,0.8,1.0,1.44,1.42,1.00385,1.012279,1.06067,1.066417,2.988495,0.790296,0.942801,2.156754


In [28]:
weekend_df['Day'] = pd.to_datetime(weekend_df['Date']).dt.day_name()

# store weekend_df in new variable for final results

results_df = weekend_df.loc[:, ['Date', 'Home', 'Away', 'Day', 'Venue']]

# Add missing columns with default values
for column in model_df.columns:
    if column not in weekend_df.columns:
        if model_df[column].dtype == 'bool':
            weekend_df[column] = False
        elif model_df[column].dtype == 'float64':
            weekend_df[column] = 0.0
        elif model_df[column].dtype == 'int64':
            weekend_df[column] = 0
        else:
            weekend_df[column] = None

# Ensure the order of columns matches
weekend_df = weekend_df[model_df.columns]

In [29]:
weekend_df

Unnamed: 0,Wk,Day,Date,Time,Home,home_xg,Score,away_xg,Away,Attendance,...,away_xg_div_cma,quarter,home_goals_seasonality,home_xg_seasonality,away_goals_seasonality,away_xg_seasonality,home_goals_deseasonalised,home_xg_deseasonalised,away_goals_deseasonalised,away_xg_deseasonalised
2770,12.0,Saturday,2024-11-23,12:30,Leicester City,0.0,,0.0,Chelsea,0,...,0.0,0,1.03565,1.036803,1.064295,1.049451,0.965577,0.771603,0.939589,1.048167
2771,12.0,Saturday,2024-11-23,15:00,Arsenal,0.0,,0.0,Nott'ham Forest,0,...,0.0,0,1.03565,1.036803,1.06067,1.066417,1.931154,0.868053,2.828402,1.594122
2772,12.0,Saturday,2024-11-23,15:00,Bournemouth,0.0,,0.0,Brighton,0,...,0.0,0,1.00385,1.012279,1.064295,1.049451,1.99233,1.975741,0.939589,0.952879
2773,12.0,Saturday,2024-11-23,15:00,Aston Villa,0.0,,0.0,Crystal Palace,0,...,0.0,0,1.03565,1.036803,1.064295,1.049451,0.965577,1.736106,1.879178,2.286911
2774,12.0,Saturday,2024-11-23,15:00,Everton,0.0,,0.0,Brentford,0,...,0.0,0,1.03565,1.036803,1.064295,1.049451,0.965577,0.771603,0.939589,0.571728
2775,12.0,Saturday,2024-11-23,15:00,Fulham,0.0,,0.0,Wolves,0,...,0.0,0,1.00385,1.012279,1.06067,1.066417,1.99233,1.284231,1.885601,1.219035
2776,12.0,Saturday,2024-11-23,17:30,Manchester City,0.0,,0.0,Tottenham,0,...,0.0,0,1.03565,1.036803,1.06067,1.066417,0.965577,2.79706,0.0,0.656403
2777,12.0,Sunday,2024-11-24,14:00,Southampton,0.0,,0.0,Liverpool,0,...,0.0,0,1.00385,1.012279,1.06067,1.066417,0.996165,0.691509,1.885601,0.750175
2778,12.0,Sunday,2024-11-24,16:30,Ipswich Town,0.0,,0.0,Manchester Utd,0,...,0.0,0,1.00385,1.012279,1.06067,1.066417,0.996165,0.98787,0.942801,2.156754
2779,12.0,Monday,2024-11-25,20:00,Newcastle Utd,0.0,,0.0,West Ham,0,...,0.0,0,1.00385,1.012279,1.064295,1.049451,0.996165,0.493935,0.0,0.095288


In [30]:
features = [column for column in model_df.drop(columns=[
    'Date', 'Day', 'Home', 'Away', 'Venue', 'Time', 'home_xg', 'away_xg', 'home_goals', 'Score', 'Referee', 'Attendance',
    'away_goals', 'Result', 'Season', 'quarter', 'home_goals_cma', 'home_xg_cma', 
    'away_goals_cma', 'away_xg_cma', 'home_goals_div_cma', 'home_xg_div_cma', 
    'away_goals_div_cma', 'away_xg_div_cma'])]

In [31]:
# add the weekend_df Result column to the results_df
results_df['results'] = model.predict(weekend_df[features])

In [32]:
display(results_df)

Unnamed: 0,Date,Home,Away,Day,Venue,results
2770,2024-11-23,Leicester City,Chelsea,Saturday,King Power Stadium,Draw
2771,2024-11-23,Arsenal,Nott'ham Forest,Saturday,Emirates Stadium,Away Win
2772,2024-11-23,Bournemouth,Brighton,Saturday,Vitality Stadium,Home Win
2773,2024-11-23,Aston Villa,Crystal Palace,Saturday,Villa Park,Away Win
2774,2024-11-23,Everton,Brentford,Saturday,Goodison Park,Draw
2775,2024-11-23,Fulham,Wolves,Saturday,Craven Cottage,Draw
2776,2024-11-23,Manchester City,Tottenham,Saturday,Etihad Stadium,Home Win
2777,2024-11-24,Southampton,Liverpool,Sunday,St Mary's Stadium,Away Win
2778,2024-11-24,Ipswich Town,Manchester Utd,Sunday,Portman Road Stadium,Draw
2779,2024-11-25,Newcastle Utd,West Ham,Monday,St James' Park,Home Win


In [33]:
# print the probability of each result for each game
model.predict_proba(weekend_df[features])

# print the probability of each result for each game and the assigned class for each probability
for i, row in enumerate(model.predict_proba(weekend_df[features])):
    print(row, model.classes_[np.argmax(row)])

[0.09431441 0.85030327 0.05538232] Draw
[0.76393617 0.18655871 0.04950512] Away Win
[0.00319609 0.02079199 0.97601192] Home Win
[0.93068894 0.06764439 0.00166667] Away Win
[0.07152957 0.85207107 0.07639936] Draw
[0.07136951 0.84401479 0.0846157 ] Draw
[0.02302706 0.1116192  0.86535374] Home Win
[0.90634486 0.07392306 0.01973208] Away Win
[0.06352976 0.8995081  0.03696214] Draw
[0.00607591 0.09604381 0.89788028] Home Win
