# Football MC season simulation and visualisation

## Code to accompany the webpage: https://www.ben-bow.com/fitba

We begin by importing the data and packages, then define our class, and follow by creating functions that deal with distinct parts of the simulation process.

In [13]:
import pandas as pd
import numpy as np
import itertools
import random
from pandas.core.frame import DataFrame
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
import jdc

# Season results CSV; its 'Date' column is used as the index.
url = 'https://raw.githubusercontent.com/BenBowring/Football/main/DataDump/E0_21.csv'
data = pd.read_csv(url, index_col='Date')
# Parse the index as day/month/year so it can be sliced by date later on.
data.index = pd.to_datetime(data.index, format='%d/%m/%Y')

# FootballDF

## Defining our class.

### Keyword arguments:

data -- (pandas dataframe) Full season of matchups and scores.

home_adv -- (float) Team home advantage weight to be used in calculations (default 0.5)

form_adv -- (float) Team form advantage weight to be used in calculations (default 0.5)


In [14]:
class FootballDF:
    """Season wrapper holding the match data and the weights used to simulate it.

    Keyword arguments:
    data -- (pandas DataFrame) Full season of matchups and scores, indexed by
            match date and holding 'HomeTeam', 'AwayTeam', 'FTHG' and 'FTAG' columns.
    home_adv -- (float) Home advantage weight to be used in calculations (default 0.5)
    form_adv -- (float) Form advantage weight to be used in calculations (default 0.5)
    """

    def __init__(self, data = data, home_adv = 0.5, form_adv = 0.5):

        # Work on a private copy: re-indexing the caller's frame in place would
        # silently modify an object that is shared with every other instance.
        self.data = data.copy()
        self.data.index = pd.to_datetime(self.data.index, format='%d/%m/%Y')

        self.home_adv = home_adv
        self.form_adv = form_adv

        # Source columns and the common labels they are renamed to when a
        # team's games are viewed from that team's side (goals for / against / opponent).
        self.home_labels = ['GF', 'GA', 'Opp']
        self.home_cols = ['FTHG', 'FTAG', 'AwayTeam']

        self.away_labels = ['GF', 'GA', 'Opp']
        self.away_cols = ['FTAG', 'FTHG', 'HomeTeam']

        # Every side that appears as a home team, in order of first appearance.
        self.teams = data['HomeTeam'].unique()

# get_history

## Define date after which season will be simulated and store historical performance of all teams up to this point.

### Keyword arguments:

date -- (dt datetime) Date after which season will be simulated.

Note: We utilise the `%%add_to` cell magic imported from jdc to add functions to the initial class in the following cells.

In [15]:
%%add_to FootballDF
def get_history(self, date):
    """Split the season at `date` and store every team's results up to that point.

    Keyword arguments:
    date -- (dt datetime) Date after which season will be simulated.

    Sets doi, doi_delta, season, season_remaining, matchups_remaining,
    season_roundup and dict_goals on the instance.
    """
    self.doi = date
    self.doi_delta = self.doi + pd.Timedelta("1 days")

    # Everything up to and including `date` is history; the rest is still to play.
    self.season = self.data.loc[:self.doi]
    self.season_remaining = self.data.loc[self.doi_delta:]
    self.matchups_remaining = [
        [home, away]
        for home, away in zip(self.season_remaining['HomeTeam'], self.season_remaining['AwayTeam'])
    ]

    roundup_sources = {'Home Team': 'HomeTeam', 'Away Team': 'AwayTeam',
                       'Home Goals': 'FTHG', 'Away Goals': 'FTAG'}
    self.season_roundup = pd.DataFrame(
        {label: self.season[source] for label, source in roundup_sources.items()})

    # (venue flag, column naming the team, source columns, labels they are renamed to)
    venues = (('H', 'HomeTeam', self.home_cols, self.home_labels),
              ('A', 'AwayTeam', self.away_cols, self.away_labels))

    self.dict_goals = {}

    for team in self.teams:

        per_venue = []

        for flag, team_col, source_cols, labels in venues:
            games = self.season.loc[self.season[team_col] == team, source_cols].copy()
            games.columns = labels
            games['H/A'] = flag
            per_venue.append(games)

        # One chronological record per team, seen from that team's side.
        self.dict_goals[team] = pd.concat(per_venue, axis = 0).sort_index()


# get_performance

## Calculate a performance metric up to date given within get_history.

#### Metric represents how many goals a team would both score and concede when matched with an average team in the league.

In [16]:
%%add_to FootballDF
def get_performance(self):
    """Build the performance tables from the history stored by get_history.

    Sets df_total, df_home and df_away (one row per team) with the team's
    average goals for/against, the averages expected from the opponents it
    faced, and the adjusted figures (average / predicted) * average.
    Also sets df_form, the latest exponentially weighted goals for/against.
    """
    self._labels = ['Average Goals For', 'Average Goals Against', 'Predicted Goals For', 'Predicted Goals Against']
    self._adj_labels = ['Adjusted Goals For', 'Adjusted Goals Against']

    avg_for, avg_against, pred_for, pred_against = self._labels
    adj_for, adj_against = self._adj_labels

    # All-games scoring and conceding rate of every side; opponents are always
    # rated on their whole record, whichever venue is being summarised.
    rate_for = {side: games['GF'].mean() for side, games in self.dict_goals.items()}
    rate_against = {side: games['GA'].mean() for side, games in self.dict_goals.items()}

    def summarise(venue = None):
        # One row per team, restricted to games at `venue` ('H' or 'A') when given.
        columns = {label: [] for label in self._labels}

        for team in self.teams:
            games = self.dict_goals[team]
            if venue is not None:
                games = games[games['H/A'] == venue]

            columns[avg_for].append(games['GF'].mean())
            columns[avg_against].append(games['GA'].mean())
            # What the opponents faced normally concede / score.
            columns[pred_for].append(np.mean([rate_against[opp] for opp in games['Opp']]))
            columns[pred_against].append(np.mean([rate_for[opp] for opp in games['Opp']]))

        frame = pd.DataFrame(columns, index = self.teams)
        frame[adj_for] = (frame[avg_for] / frame[pred_for]) * frame[avg_for]
        frame[adj_against] = (frame[avg_against] / frame[pred_against]) * frame[avg_against]

        return frame

    self.df_total = summarise()
    self.df_home = summarise('H')
    self.df_away = summarise('A')

    # Most recent value of the exponentially weighted mean, i.e. current form.
    self.df_form = pd.DataFrame({
        'Goals For': [self.dict_goals[team]['GF'].ewm(alpha=0.5).mean().iloc[-1] for team in self.teams],
        'Goals Against': [self.dict_goals[team]['GA'].ewm(alpha=0.5).mean().iloc[-1] for team in self.teams]},
        index = self.teams)


# get_poiss

## Calculate our adjusted Poisson values to use in our distribution after adjusting for home and form advantage.

### Keyword arguments:

matchups -- (list) [home team, away team] pairs remaining to be played within season.

In [17]:
%%add_to FootballDF
def get_poiss(self, matchups):
    """Return the expected home and away goals for every fixture in `matchups`.

    Keyword arguments:
    matchups -- (list) [home team, away team] pairs; one per row of
                self.season_remaining, whose index is reused for the result.

    The per-fixture inputs are stored in self.df_adjust. The returned frame
    holds 'Home Team', 'Away Team', 'Home Goals' and 'Away Goals'.
    """
    away_adv = 1 - self.home_adv
    anti_form = 1 - self.form_adv

    goals_for_label = self._adj_labels[0]
    goals_against_label = self._adj_labels[1]

    home_sides = [fixture[0] for fixture in matchups]
    away_sides = [fixture[1] for fixture in matchups]

    def venue_blend(main_venue, other_venue, label, sides):
        # Weight a side's record at the venue it is playing at by home_adv
        # and its record at the other venue by the remainder.
        return [self.home_adv * main_venue[label].loc[side]
                + away_adv * other_venue[label].loc[side] for side in sides]

    def current_form(label, sides):
        return [self.df_form[label].loc[side] for side in sides]

    self.df_adjust = pd.DataFrame({
        'Home Team': home_sides,
        'Away Team': away_sides,
        'Pred Home Goals For': venue_blend(self.df_home, self.df_away, goals_for_label, home_sides),
        'Pred Home Goals Against': venue_blend(self.df_home, self.df_away, goals_against_label, home_sides),
        'Pred Away Goals For': venue_blend(self.df_away, self.df_home, goals_for_label, away_sides),
        'Pred Away Goals Against': venue_blend(self.df_away, self.df_home, goals_against_label, away_sides),
        'Home Form For': current_form('Goals For', home_sides),
        'Home Form Against': current_form('Goals Against', home_sides),
        'Away Form For': current_form('Goals For', away_sides),
        'Away Form Against': current_form('Goals Against', away_sides)},
        index = self.season_remaining.index)

    inputs = self.df_adjust

    # Form component: the attacking side's recent scoring averaged with the
    # defending side's recent conceding.
    home_form = inputs[['Home Form For', 'Away Form Against']].mean(axis = 1)
    away_form = inputs[['Away Form For', 'Home Form Against']].mean(axis = 1)

    home_rate = (anti_form * (self.home_adv * inputs['Pred Home Goals For']
                              + away_adv * inputs['Pred Away Goals Against'])
                 + self.form_adv * home_form)

    away_rate = (anti_form * (self.home_adv * inputs['Pred Away Goals For']
                              + away_adv * inputs['Pred Home Goals Against'])
                 + self.form_adv * away_form)

    df_out = pd.DataFrame({
        'Home Team': inputs['Home Team'],
        'Away Team': inputs['Away Team'],
        'Home Goals': home_rate,
        'Away Goals': away_rate})

    return df_out

# mse_run

## Calculate optimal weights minimising MSE by iterating over home and form advantage weights.

### Keyword arguments:

granular -- (float) Size of step to be used in iteration.

In [18]:
%%add_to FootballDF
def mse_run(self, granular = 0.1):
    """Grid-search the home and form weights that minimise prediction MSE.

    Keyword arguments:
    granular -- (float) Size of step to be used in iteration.

    Every (home_adv, form_adv) pair on the grid from 0 to 1 is scored by the
    mean squared error between the predicted and the actual goals of the
    remaining fixtures.

    Returns [opt_home, opt_form, mse_full], where mse_full is the full grid of
    errors (rows: home weight, columns: form weight). When no fixtures remain,
    prints "Season Finished" and returns three NaNs.
    """
    if not self.matchups_remaining:

        print("Season Finished")

        return [np.nan, np.nan, np.nan]

    variables = np.arange(0, 1 + granular, granular)
    mse_full = pd.DataFrame(np.nan, index = variables, columns = variables)

    y_true = self.season_remaining[['FTHG', 'FTAG']].to_numpy()

    # get_poiss reads the weights from the instance, so the search has to set
    # them; put the caller's values back afterwards so that a later simulation
    # is not silently run with the last grid point.
    saved_weights = (self.home_adv, self.form_adv)

    try:
        for home_weight, form_weight in itertools.product(variables, variables):

            self.home_adv, self.form_adv = home_weight, form_weight

            score_pred = self.get_poiss(self.matchups_remaining)
            y_pred = score_pred[['Home Goals', 'Away Goals']].to_numpy()

            mse_full.loc[home_weight, form_weight] = mean_squared_error(y_true, y_pred)
    finally:
        self.home_adv, self.form_adv = saved_weights

    # Row and column labels of the smallest error on the grid.
    opt_home = mse_full.min(axis = 1).idxmin()
    opt_form = mse_full.min(axis = 0).idxmin()

    return [opt_home, opt_form, mse_full]

# match_outcome

## Predict remaining match outcomes using a Poisson distribution returning a randomised scoreline for each match.

In [19]:
%%add_to FootballDF
def match_outcome(self):
    """Draw one random scoreline for every remaining fixture.

    The expected goals from get_poiss are used as Poisson rates. Returns a
    frame with 'Home Team', 'Away Team', 'Home Goals' and 'Away Goals',
    indexed like the get_poiss result.
    """
    score_pred = self.get_poiss(self.matchups_remaining)

    # Fixtures are sampled one at a time, home side first and then away side,
    # which fixes the order in which random numbers are consumed.
    sampled = [
        (np.random.poisson(home_rate), np.random.poisson(away_rate))
        for home_rate, away_rate in zip(score_pred['Home Goals'], score_pred['Away Goals'])
    ]
    goals = np.array(sampled, dtype = float).reshape(-1, 2)

    df_outcome = score_pred[['Home Team', 'Away Team']].copy()
    df_outcome['Home Goals'] = goals[:, 0]
    df_outcome['Away Goals'] = goals[:, 1]

    return df_outcome

# run_season_is

## With predicted scorelines return a final table and ranking of teams.

In [20]:
%%add_to FootballDF
def run_season_is(self):
    """Simulate the rest of the season once and build the final league table.

    Joins the results already played (self.season_roundup) with one random
    draw of the remaining fixtures (self.match_outcome()).

    Sets self.full_season (every match plus 'Winner', 'Loser', 'Draw 1' and
    'Draw 2' columns) and self.table (one row per team with goals, wins,
    losses, draws, goal differential, points and rank, best side first).
    """
    pre_date = self.season_roundup.copy()
    post_date = self.match_outcome()

    full_season = pd.concat([pre_date, post_date])
    n_matches = len(full_season)

    home_team = full_season['Home Team'].to_numpy(dtype = object)
    away_team = full_season['Away Team'].to_numpy(dtype = object)
    home_win = (full_season['Home Goals'] > full_season['Away Goals']).to_numpy()
    away_win = (full_season['Away Goals'] > full_season['Home Goals']).to_numpy()
    drawn = ~(home_win | away_win)

    draw_label = np.full(n_matches, 'Draw', dtype = object)
    missing = np.full(n_matches, np.nan, dtype = object)

    full_season['Winner'] = np.where(home_win, home_team, np.where(away_win, away_team, draw_label))
    full_season['Loser'] = np.where(home_win, away_team, np.where(away_win, home_team, draw_label))
    full_season['Draw 1'] = np.where(drawn, home_team, missing)
    full_season['Draw 2'] = np.where(drawn, away_team, missing)

    self.full_season = full_season

    by_home = full_season.groupby('Home Team')
    by_away = full_season.groupby('Away Team')

    def season_total(home_column, away_column):
        # Add a side's home and away tallies; fill_value stops a side that is
        # missing from one of the two groupings from turning into NaN.
        return by_home[home_column].sum().add(by_away[away_column].sum(), fill_value = 0)

    games_played = by_home.size().add(by_away.size(), fill_value = 0)

    # Building from Series aligns on team name, which also drops the 'Draw'
    # label from the win/loss counts; sides without a win or loss get 0.
    table = pd.DataFrame({
        'Goals For': season_total('Home Goals', 'Away Goals'),
        'Goals Against': season_total('Away Goals', 'Home Goals'),
        'Wins': full_season['Winner'].value_counts(),
        'Losses': full_season['Loser'].value_counts()},
        index = self.teams).fillna(0)

    # Whatever a side played and neither won nor lost was drawn.
    table['Draws'] = games_played.reindex(table.index, fill_value = 0) - table['Wins'] - table['Losses']
    table['Goal Differential'] = table['Goals For'] - table['Goals Against']
    table['Points'] = 3 * table['Wins'] + table['Draws']

    table = table.sort_values(['Points', 'Goal Differential'], ascending = [False, False])
    table['Rank'] = np.arange(1, len(table) + 1)

    self.table = table

## Import optimal home and form advantage from years 2012 through 2021.

### Use these to predict the final outcome of the 2021 season by iterating over the final set of games and averaging to get an expected result.

In [21]:
# Candidate cut-off dates: the distinct index values from row 300 of `data` onwards.
evolution = data.index[300:].unique()
date = evolution[0]

# Weights table with a two-level column header; the second level names the weight.
optimal_weights_url = pd.read_csv('https://raw.githubusercontent.com/BenBowring/Football/main/DataDump/OptimalWeights_FullHist.csv',
                                  header=[0, 1], index_col = 0)

# Median of the per-column medians over every column of the given weight.
home_adv = optimal_weights_url.xs('Home Advantage', axis = 1, level = 1).median().median()
form_adv = optimal_weights_url.xs('Form Advantage', axis = 1, level = 1).median().median()

print(f'Home Advantage: {home_adv}')
print(f'Form Advantage: {form_adv}')

Home Advantage: 0.65
Form Advantage: 0.3


In [22]:
# Number of seasons simulated from each cut-off date.
n_simulations = 10

# Expected finishing positions, keyed by the cut-off date they were simulated from.
dict_position = {}

for date in evolution:

    # The last date is skipped: only earlier cut-offs are simulated.
    if not (date < evolution[-1]):
        continue

    # History and performance depend only on the cut-off date, so they are
    # built once and shared by every simulated season.
    wrapper = FootballDF(data, home_adv, form_adv)
    wrapper.get_history(date)
    wrapper.get_performance()

    dict_seasons = {}

    for x in np.arange(1, n_simulations + 1, 1):

        # run_season_is assigns a new table on every call, so the table kept
        # from an earlier simulation is not overwritten by a later one.
        wrapper.run_season_is()

        dict_seasons[x] = wrapper.table

    dict_summary = {team: pd.DataFrame({'Rank': [dict_seasons[x].loc[team, 'Rank'] for x in dict_seasons],
                                        'Points': [dict_seasons[x].loc[team, 'Points'] for x in dict_seasons]},
                                       index = np.arange(1, n_simulations + 1, 1)) for team in wrapper.teams}

    # One row per team, one column per simulation, ordered by average rank.
    df_position = pd.concat([dict_summary[x]['Rank'] for x in wrapper.teams], axis =1)
    df_position.columns = wrapper.teams
    df_position = df_position.T
    df_position['Average'] = df_position.mean(axis = 1)
    df_position.sort_values(by = 'Average', inplace = True)
    df_position['Exp Position'] = np.arange(1, len(df_position) + 1, 1)

    dict_position[date] = df_position.copy()

dict_position[evolution[0]]

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,Average,Exp Position
Man City,1,1,1,1,1,1,1,1,1,1,1.0,1
Man United,3,3,2,3,2,2,2,2,2,2,2.3,2
Leicester,2,4,5,2,3,3,5,3,4,4,3.5,3
West Ham,6,2,4,6,4,6,4,4,3,3,4.2,4
Tottenham,4,6,8,5,5,5,6,5,5,5,5.4,5
Liverpool,5,5,9,4,6,7,3,8,6,7,6.0,6
Chelsea,8,7,3,8,9,4,7,9,7,6,6.8,7
Everton,7,9,6,7,8,9,8,7,9,11,8.1,8
Arsenal,9,8,10,9,11,8,9,6,11,10,9.1,9
Aston Villa,10,10,7,10,7,10,10,10,10,9,9.3,10
