## Football MC season simulation and visualisation
Code to accompany the webpage: https://www.ben-bow.com/fitba

We begin by importing the required packages and loading the data, then define a class whose methods cover the different types of analysis, and finish with the uses and outputs of this class that appear on the webpage.

In [21]:
import pandas as pd
import numpy as np
import itertools
import random
from sklearn.metrics import mean_squared_error, mean_absolute_error
import matplotlib.pyplot as plt

# Match results file used as the default data set for the cells below.
url = 'https://raw.githubusercontent.com/BenBowring/Football/main/DataDump/E0_21.csv'

# Index the results by match date; the raw file stores dates as day/month/year text.
data = pd.read_csv(url).set_index('Date')
data.index = pd.to_datetime(data.index, format='%d/%m/%Y')

In [22]:
class Football:
    """Goal-expectation model and Monte Carlo simulator for a league season.

    The class wraps a results table with one row per match, indexed by match
    date, containing the columns ``HomeTeam``, ``AwayTeam``, ``FTHG`` (home
    goals) and ``FTAG`` (away goals).

    Typical use: call :meth:`get_history` with a cut-off date to build the
    per-team strength tables, then :meth:`get_poiss` for expected scores of
    the remaining fixtures, :meth:`mse_run` to grid-search the two weights, or
    :meth:`run_season` to simulate the remaining fixtures and build a table.

    Parameters
    ----------
    data : pandas.DataFrame
        Match results as described above. The index may already be a
        ``DatetimeIndex`` or hold ``dd/mm/YYYY`` strings. The frame is copied,
        so the caller's object is never modified.
    home_adv : float
        Weight given to a team's record at the venue it is about to play at
        (home record for the home side, away record for the away side); the
        record at the other venue gets ``1 - home_adv``.
    form_adv : float
        Weight given to recent (exponentially weighted) form; the season-long
        adjusted averages get ``1 - form_adv``.
    """

    def __init__(self, data, home_adv = 0.5, form_adv = 0.5):

        # Copy first: re-parsing the index must not alter the caller's frame.
        self.data = data.copy()
        self.data.index = pd.to_datetime(self.data.index, format='%d/%m/%Y')

        # Source columns, and the labels they receive, when a match is
        # re-expressed from the home side's point of view ...
        self.home_labels = ['GF', 'GA', 'Opp']
        self.home_cols = ['FTHG', 'FTAG', 'AwayTeam']

        # ... and from the away side's point of view.
        self.away_labels = ['GF', 'GA', 'Opp']
        self.away_cols = ['FTAG', 'FTHG', 'HomeTeam']

        self.teams = self.data['HomeTeam'].unique()

        self.home_adv = home_adv
        self.form_adv = form_adv

    def get_history(self, date):
        """Split the season at *date* and build the team strength tables.

        Matches up to and including *date* are treated as played; matches
        from the following day onwards are the remaining fixtures.

        Sets ``season``, ``season_remaining``, ``matchups_remaining``,
        ``season_roundup``, ``dict_goals`` (per-team match log), ``df_total``,
        ``df_home``, ``df_away`` (strength tables over all / home / away
        matches) and ``df_form`` (latest exponentially weighted goals).
        """

        self.doi = date
        self.doi_delta = self.doi + pd.Timedelta("1 days")

        self.season = self.data.loc[:self.doi]
        self.season_remaining = self.data.loc[self.doi_delta:]
        self.matchups_remaining = [[home, away] for home, away in
                                   zip(self.season_remaining['HomeTeam'],
                                       self.season_remaining['AwayTeam'])]

        self.season_roundup = pd.DataFrame({'Home Team': self.season['HomeTeam'],
                                            'Away Team': self.season['AwayTeam'],
                                            'Home Goals': self.season['FTHG'],
                                            'Away Goals': self.season['FTAG']})

        self.dict_goals = {team: self._team_games(team) for team in self.teams}

        self.df_total = self._strength_table()
        self.df_home = self._strength_table('H')
        self.df_away = self._strength_table('A')

        self.df_form = pd.DataFrame(
            {'Goals For': [self._latest_form(self.dict_goals[team]['GF']) for team in self.teams],
             'Goals Against': [self._latest_form(self.dict_goals[team]['GA']) for team in self.teams]},
            index = self.teams)

    def _team_games(self, team):
        """Return the played matches of *team*, relabelled from its viewpoint."""

        played = self.season

        at_home = played.loc[played['HomeTeam'] == team, self.home_cols].copy()
        at_home.columns = self.home_labels
        at_home['H/A'] = 'H'

        on_road = played.loc[played['AwayTeam'] == team, self.away_cols].copy()
        on_road.columns = self.away_labels
        on_road['H/A'] = 'A'

        return pd.concat([at_home, on_road], axis = 0).sort_index()

    @staticmethod
    def _mean_or_nan(values):
        """Return the mean of *values*, or NaN (without a warning) if empty."""

        return np.mean(values) if len(values) else np.nan

    @staticmethod
    def _latest_form(goals):
        """Return the most recent exponentially weighted mean of *goals*."""

        # A team with no played matches has no form yet.
        if goals.empty:
            return np.nan

        return goals.ewm(alpha=0.5).mean().iloc[-1]

    def _strength_table(self, venue = None):
        """Build one strength table over all matches or a single venue.

        *venue* is ``'H'``, ``'A'`` or ``None`` (all matches). "Predicted"
        goals are what the opponents faced concede / score on average over
        all of their matches; "Adjusted" goals scale the team's own average
        by the ratio of its average to that prediction.
        """

        # Each team's overall averages, computed once instead of once per
        # fixture in which the team appears as an opponent.
        mean_for = {team: games['GF'].mean() for team, games in self.dict_goals.items()}
        mean_against = {team: games['GA'].mean() for team, games in self.dict_goals.items()}

        records = []

        for team in self.teams:

            games = self.dict_goals[team]
            if venue is not None:
                games = games[games['H/A'] == venue]

            opponents = list(games['Opp'])

            records.append({
                'Average Goals For': games['GF'].mean(),
                'Average Goals Against': games['GA'].mean(),
                'Predicted Goals For': self._mean_or_nan([mean_against[opp] for opp in opponents]),
                'Predicted Goals Against': self._mean_or_nan([mean_for[opp] for opp in opponents]),
            })

        table = pd.DataFrame(records, index = self.teams,
                             columns = ['Average Goals For', 'Average Goals Against',
                                        'Predicted Goals For', 'Predicted Goals Against'])

        table['Adjusted Goals For'] = (table['Average Goals For'] / table['Predicted Goals For']) * table['Average Goals For']
        table['Adjusted Goals Against'] = (table['Average Goals Against'] / table['Predicted Goals Against']) * table['Average Goals Against']

        return table

    def get_poiss(self, matchups):
        """Return the expected goals for each ``[home, away]`` pair.

        Requires :meth:`get_history` to have been called. The intermediate
        per-fixture ratings are kept in ``self.df_adjust``. The result has the
        columns ``Home Team``, ``Away Team``, ``Home Goals`` and
        ``Away Goals``. It is indexed like ``season_remaining`` when
        *matchups* has that many entries, otherwise by position.
        """

        matchups = list(matchups)

        away_adv = 1 - self.home_adv
        anti_form = 1 - self.form_adv

        hosts = [pair[0] for pair in matchups]
        guests = [pair[1] for pair in matchups]

        def rating(table, column, sides):
            # One label-based lookup per column instead of one per fixture.
            return table[column].loc[sides].to_numpy()

        if len(matchups) == len(self.season_remaining):
            fixtures_index = self.season_remaining.index
        else:
            fixtures_index = None

        self.df_adjust = pd.DataFrame(
            {'Home Team': hosts,
             'Away Team': guests,
             'Pred Home Goals For': self.home_adv * rating(self.df_home, 'Adjusted Goals For', hosts)
                                    + away_adv * rating(self.df_away, 'Adjusted Goals For', hosts),
             'Pred Home Goals Against': self.home_adv * rating(self.df_home, 'Adjusted Goals Against', hosts)
                                        + away_adv * rating(self.df_away, 'Adjusted Goals Against', hosts),
             'Pred Away Goals For': self.home_adv * rating(self.df_away, 'Adjusted Goals For', guests)
                                    + away_adv * rating(self.df_home, 'Adjusted Goals For', guests),
             'Pred Away Goals Against': self.home_adv * rating(self.df_away, 'Adjusted Goals Against', guests)
                                        + away_adv * rating(self.df_home, 'Adjusted Goals Against', guests),
             'Home Form For': rating(self.df_form, 'Goals For', hosts),
             'Home Form Against': rating(self.df_form, 'Goals Against', hosts),
             'Away Form For': rating(self.df_form, 'Goals For', guests),
             'Away Form Against': rating(self.df_form, 'Goals Against', guests)},
            index = fixtures_index)

        adjust = self.df_adjust

        # Recent form blends the attacker's scoring form with the defender's
        # conceding form.
        home_form = adjust[['Home Form For', 'Away Form Against']].mean(axis = 1)
        away_form = adjust[['Away Form For', 'Home Form Against']].mean(axis = 1)

        df_out = adjust[['Home Team', 'Away Team']].copy()

        df_out['Home Goals'] = (anti_form * (self.home_adv * adjust['Pred Home Goals For']
                                             + away_adv * adjust['Pred Away Goals Against'])
                                + self.form_adv * home_form)

        df_out['Away Goals'] = (anti_form * (self.home_adv * adjust['Pred Away Goals For']
                                             + away_adv * adjust['Pred Home Goals Against'])
                                + self.form_adv * away_form)

        return df_out

    def mse_run(self, granular = 0.1):
        """Grid-search ``home_adv`` and ``form_adv`` on the remaining fixtures.

        Every pair of weights from ``np.arange(0, 1 + granular, granular)`` is
        scored by the mean squared error between the predicted and the actual
        goals of the remaining fixtures. ``mean_squared_error`` must be
        available at module level.

        Returns ``[best_home_adv, best_form_adv, mse_surface]`` where the
        surface has ``home_adv`` on the rows and ``form_adv`` on the columns.
        If no fixtures remain, prints ``"Season Finished"`` and returns three
        NaNs. The instance's own weights are left unchanged.
        """

        if len(self.matchups_remaining) == 0:

            print("Season Finished")

            return [np.nan, np.nan, np.nan]

        variables = np.arange(0, 1 + granular, granular)
        surface = np.empty((len(variables), len(variables)))

        actual = self.season_remaining[['FTHG', 'FTAG']].to_numpy()

        # The search works by temporarily overriding the instance weights;
        # put the caller's values back afterwards, even on error.
        saved_weights = (self.home_adv, self.form_adv)

        try:
            for row, home_weight in enumerate(variables):
                for col, form_weight in enumerate(variables):

                    self.home_adv, self.form_adv = home_weight, form_weight

                    score_pred = self.get_poiss(self.matchups_remaining)
                    predicted = score_pred[['Home Goals', 'Away Goals']].to_numpy()

                    surface[row, col] = mean_squared_error(actual, predicted)
        finally:
            self.home_adv, self.form_adv = saved_weights

        mse_full = pd.DataFrame(surface, index = variables, columns = variables)

        opt_home = mse_full.min(axis = 1).idxmin()
        opt_form = mse_full.min(axis = 0).idxmin()

        return [opt_home, opt_form, mse_full]

    def match_outcome(self):
        """Draw one Poisson score line for every remaining fixture.

        The expected goals from :meth:`get_poiss` are used as the Poisson
        rates. Returns a frame with ``Home Team``, ``Away Team``,
        ``Home Goals`` and ``Away Goals``, indexed like the prediction.
        """

        score_pred = self.get_poiss(self.matchups_remaining)

        rates = score_pred[['Home Goals', 'Away Goals']].to_numpy(dtype = float)

        # One call draws every fixture's home and away goals.
        goals = np.random.poisson(rates)

        df_outcome = score_pred[['Home Team', 'Away Team']].copy()
        df_outcome['Home Goals'] = goals[:, 0].astype(float)
        df_outcome['Away Goals'] = goals[:, 1].astype(float)

        return df_outcome

    def run_season(self):
        """Simulate the remaining fixtures and build the resulting table.

        Sets ``self.full_season`` (played plus simulated matches with
        ``Winner``, ``Loser``, ``Draw 1`` and ``Draw 2`` columns) and
        ``self.table`` (one row per team, sorted by points and then goal
        differential, with a 1-based ``Rank``).
        """

        pre_date = self.season_roundup.copy()
        post_date = self.match_outcome()

        self.full_season = pd.concat([pre_date, post_date])
        season = self.full_season

        home_side = season['Home Team'].to_numpy(dtype = object)
        away_side = season['Away Team'].to_numpy(dtype = object)

        home_win = (season['Home Goals'] > season['Away Goals']).to_numpy()
        away_win = (season['Away Goals'] > season['Home Goals']).to_numpy()
        drawn = ~(home_win | away_win)

        season['Winner'] = np.where(home_win, home_side, np.where(away_win, away_side, 'Draw'))
        season['Loser'] = np.where(home_win, away_side, np.where(away_win, home_side, 'Draw'))

        # Both participants of a drawn match; NaN when the match had a winner.
        season['Draw 1'] = np.where(drawn, home_side, np.nan)
        season['Draw 2'] = np.where(drawn, away_side, np.nan)

        as_home = season.groupby('Home Team')[['Home Goals', 'Away Goals']].sum()
        as_away = season.groupby('Away Team')[['Home Goals', 'Away Goals']].sum()

        def tally(column):
            # Number of rows naming each team in *column*; zero when absent.
            return season[column].value_counts().reindex(self.teams, fill_value = 0).astype(float)

        # Assigning the whole ``groupby(...).count()`` frame to one column
        # fails on recent pandas, hence the explicit per-column counts.
        self.table = pd.DataFrame(
            {'Goals For': as_home['Home Goals'].add(as_away['Away Goals'], fill_value = 0),
             'Goals Against': as_home['Away Goals'].add(as_away['Home Goals'], fill_value = 0),
             'Wins': tally('Winner'),
             'Losses': tally('Loser'),
             # Counted directly instead of assuming 38 matches per team.
             'Draws': tally('Draw 1') + tally('Draw 2')},
            index = self.teams)

        self.table['Goal Differential'] = self.table['Goals For'] - self.table['Goals Against']
        self.table['Points'] = 3 * self.table['Wins'] + 1 * self.table['Draws']

        self.table.sort_values(['Points', 'Goal Differential'], ascending = [False, False], inplace = True)
        self.table['Rank'] = np.arange(1, len(self.table) + 1, 1)

## Running an example Monte Carlo

The simulation is run over the last set of matchdays of the 20/21 season to predict the winner given the current strength of each team, following the methodology outlined on the website. The optimised Home and Form Advantage weights are used, as discussed.

In [37]:
# Match dates covering the final stretch of the season (rows 350 onwards).
evolution = data.index[350:].unique()

optimal_weights_url = pd.read_csv('https://raw.githubusercontent.com/BenBowring/Football/main/DataDump/OptimalWeights_FullHist.csv',
                                  header=[0, 1], index_col = 0)

# Collapse the stored weights to a single value each: the median of the
# per-column medians.
home_adv = optimal_weights_url.xs('Home Advantage', axis = 1, level = 1).median().median()
form_adv = optimal_weights_url.xs('Form Advantage', axis = 1, level = 1).median().median()

print(f'Home Advantage: {home_adv}')
print(f'Form Advantage: {form_adv}')

# Number of simulated season completions per match date.
n_sims = 10
sim_ids = np.arange(1, n_sims + 1, 1)

dict_position = {}

for date in evolution:

    # After the final match date there is nothing left to simulate.
    if not date < evolution[-1]:
        continue

    # The history tables depend only on the cut-off date, so build them once
    # and reuse them for every simulated completion.
    wrapper = Football(data, home_adv, form_adv)
    wrapper.get_history(date)

    dict_seasons = {}

    for x in sim_ids:

        # Each call draws fresh scores and rebinds wrapper.table to a new frame.
        wrapper.run_season()
        dict_seasons[x] = wrapper.table

    dict_summary = {team: pd.DataFrame({'Rank': [dict_seasons[x].loc[team, 'Rank'] for x in dict_seasons],
                                        'Points': [dict_seasons[x].loc[team, 'Points'] for x in dict_seasons]},
                                       index = sim_ids) for team in wrapper.teams}

    # One row per team, one column per simulation, ordered by average rank.
    df_position = pd.concat([dict_summary[x]['Rank'] for x in wrapper.teams], axis =1)
    df_position.columns = wrapper.teams
    df_position = df_position.T
    df_position['Average'] = df_position.mean(axis = 1)
    df_position.sort_values(by = 'Average', inplace = True)
    df_position['Exp Position'] = np.arange(1, len(df_position) + 1, 1)

    dict_position[date] = df_position.copy()

dict_position[evolution[0]]

Home Advantage: 0.65
Form Advantage: 0.3


Unnamed: 0,1,2,3,4,5,6,7,8,9,10,Average,Exp Position
Man City,1,1,1,1,1,1,1,1,1,1,1.0,1
Man United,2,3,2,2,2,2,2,2,2,2,2.1,2
Leicester,3,5,4,3,4,5,4,5,3,3,3.9,3
Chelsea,5,2,5,5,3,4,3,3,5,4,3.9,4
Liverpool,4,4,3,6,6,3,5,4,4,5,4.4,5
West Ham,8,6,6,9,5,7,6,6,6,6,6.5,6
Tottenham,6,9,7,7,9,6,7,7,8,7,7.3,7
Everton,7,8,8,4,7,8,8,10,7,8,7.5,8
Arsenal,9,10,10,8,8,9,9,8,9,10,9.0,9
Leeds,11,7,9,10,10,10,10,9,10,9,9.5,10


### Full loop for calculating the Home Advantage and Form Advantage MSE surfaces

Due to the long simulation period, this is run with a greatly reduced simulation size of 5 instead of 50.

Note last entry is NaN as there are no following matches to predict.

In [35]:
# One results file per season, E0_12.csv .. E0_21.csv, labelled 2012 .. 2021.
data_paths = [f'E0_{suffix}.csv' for suffix in range(12, 22)]

years = list(range(2012, 2022))

URL = 'https://raw.githubusercontent.com/BenBowring/Football/main/DataDump/'

# Number of final match dates evaluated per season.
len_season = 5

multi_ind = list(itertools.product(years, ['Home Advantage', 'Form Advantage']))

df_optimal_weights = pd.DataFrame(index = np.arange(1, len_season + 1, 1),
                                  columns = pd.MultiIndex.from_tuples(multi_ind))

for csv_name, year in zip(data_paths, years):

    print(f'Running year: {year}')

    data = pd.read_csv(URL + csv_name, index_col='Date')
    data.index = pd.to_datetime(data.index, format='%d/%m/%Y')

    # Only the last `len_season` distinct match dates are evaluated.
    evolution = data.index.unique()[-len_season:]

    optimal_weights = pd.DataFrame(index = evolution,
                                   columns = ['Home Advantage', 'Form Advantage'])

    its_alive = Football(data)

    # MSE surface per cut-off date; rebuilt from scratch for each season.
    surface_mse = {}

    for date in evolution:

        its_alive.get_history(date)

        # mse_run returns [best home weight, best form weight, MSE surface].
        mse_temp = its_alive.mse_run(0.05)

        surface_mse[date] = mse_temp[2]
        optimal_weights.loc[date] = mse_temp[0:2]

    df_optimal_weights[year] = optimal_weights.values

df_optimal_weights

2012
Season Finished
2013
Season Finished
2014
Season Finished
2015
Season Finished
2016
Season Finished
2017
Season Finished
2018
Season Finished
2019
Season Finished
2020
Season Finished
2021
Season Finished


Unnamed: 0_level_0,2012,2012,2013,2013,2014,2014,2015,2015,2016,2016,2017,2017,2018,2018,2019,2019,2020,2020,2021,2021
Unnamed: 0_level_1,Home Advantage,Form Advantage,Home Advantage,Form Advantage,Home Advantage,Form Advantage,Home Advantage,Form Advantage,Home Advantage,Form Advantage,Home Advantage,Form Advantage,Home Advantage,Form Advantage,Home Advantage,Form Advantage,Home Advantage,Form Advantage,Home Advantage,Form Advantage
1,0.65,0.35,0.75,0.0,0.5,0.35,1.0,0.65,1.0,0.3,0.6,0.0,1.0,0.55,0.35,0.15,0.8,0.1,0.75,0.05
2,1.0,0.75,0.75,0.0,0.5,0.35,1.0,0.55,1.0,0.3,0.55,0.0,1.0,0.55,0.3,0.0,0.75,0.0,0.75,0.05
3,0.45,0.2,0.85,0.1,0.45,0.25,0.9,0.3,1.0,0.1,0.6,0.0,1.0,0.5,0.25,0.0,0.7,0.0,0.9,0.0
4,0.45,0.0,0.9,0.2,0.35,0.35,0.95,0.0,0.0,0.0,0.85,0.15,1.0,0.45,0.3,0.0,0.55,0.0,0.9,0.0
5,,,,,,,,,,,,,,,,,,,,
