In [152]:
import pandas as pd
import numpy as np
import itertools
import random
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
from pandas.core.frame import DataFrame


# Read the "F1" sheet of the results workbook, using its Date column as the index.
file_path = 'FullData.xlsx'
data = pd.read_excel(file_path, sheet_name="F1", index_col='Date')

# Make sure the index holds timestamps; string dates are parsed as dd/mm/yyyy.
data.index = pd.to_datetime(data.index, format='%d/%m/%Y')

In [153]:
# Preview the first rows to check that the sheet loaded with the expected columns.
data.head()

Unnamed: 0_level_0,Div,Time,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,HTR,...,AvgC<2.5,AHCh,B365CAHH,B365CAHA,PCAHH,PCAHA,MaxCAHH,MaxCAHA,AvgCAHH,AvgCAHA
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2021-08-06,F1,20:00:00,Monaco,Nantes,1,1,D,1,1,D,...,2.02,-1.25,2.02,1.91,2.02,1.91,2.03,1.99,1.97,1.89
2021-08-07,F1,16:00:00,Lyon,Brest,1,1,D,0,1,A,...,2.76,-1.5,2.0,1.93,2.0,1.93,2.0,1.94,1.96,1.89
2021-08-07,F1,20:00:00,Troyes,Paris SG,1,2,A,1,2,A,...,2.35,1.5,1.95,1.98,1.93,2.0,2.04,2.0,1.91,1.95
2021-08-08,F1,12:00:00,Rennes,Lens,1,1,D,1,1,D,...,1.82,-0.5,1.93,2.0,1.93,2.0,1.94,2.0,1.91,1.95
2021-08-08,F1,14:00:00,Bordeaux,Clermont,0,2,A,0,0,D,...,1.69,0.0,1.87,2.06,1.88,2.06,1.89,2.1,1.84,2.03


In [159]:
class FootballDF:
    """Expected-goals model built from a table of football match results.

    Typical call order::

        model = FootballDF(data, home_adv, form_adv)
        model.get_history(date)      # split the table at `date`
        model.get_performance()      # per-team scoring/conceding tables
        model.get_poiss(matchups)    # expected goals for fixtures
        model.mse_run()              # or: grid-search the two weights

    Parameters
    ----------
    data : pandas.DataFrame, optional
        One row per match, indexed by match date (datetime-like, or strings
        in ``%d/%m/%Y`` format), with at least the columns ``HomeTeam``,
        ``AwayTeam``, ``FTHG`` (home goals) and ``FTAG`` (away goals).
        ``get_history`` slices the index by date label, so the rows are
        assumed to be in chronological order. When omitted, the module-level
        ``data`` frame is looked up at call time. The frame is copied, so the
        caller's object is never modified.
    home_adv : float, default 0.5
        Weight given to a side's record at the venue it is about to play at
        (home table for the home side, away table for the away side);
        ``1 - home_adv`` goes to its record at the other venue. The same
        weight is used in ``get_poiss`` to balance a side's attacking
        estimate against the opponent's defensive estimate.
    form_adv : float, default 0.5
        Weight given to recent form (exponentially weighted goals) versus
        the venue-adjusted season estimate in ``get_poiss``.
    """

    def __init__(self, data=None, home_adv=0.5, form_adv=0.5):
        if data is None:
            # Late-bound fallback: use whatever `data` currently is at module
            # level instead of freezing it when the class is defined.
            data = globals().get('data')
            if data is None:
                raise ValueError(
                    "FootballDF needs a match DataFrame: pass `data` "
                    "or define a module-level `data`."
                )

        # Work on a copy so that normalising the index does not alter the
        # caller's frame.
        self.data = data.copy()
        self.data.index = pd.to_datetime(self.data.index, format='%d/%m/%Y')

        self.home_adv = home_adv
        self.form_adv = form_adv

        # Source columns and the labels they receive in a team's game log.
        # From the home side's view FTHG is "goals for"; from the away side's
        # view FTAG is.
        self.home_labels = ['GF', 'GA', 'Opp']
        self.home_cols = ['FTHG', 'FTAG', 'AwayTeam']

        self.away_labels = ['GF', 'GA', 'Opp']
        self.away_cols = ['FTAG', 'FTHG', 'HomeTeam']

        self.teams = self.data['HomeTeam'].unique()

    def get_history(self, date):
        """Split the results at ``date`` and build one game log per team.

        Sets ``season`` (matches dated up to and including ``date``),
        ``season_remaining`` (matches dated from ``date`` + 1 day onwards),
        ``matchups_remaining`` (``[home, away]`` pairs of the remaining
        matches), ``season_roundup`` (played matches with readable column
        names) and ``dict_goals`` (team -> DataFrame with columns ``GF``,
        ``GA``, ``Opp``, ``H/A`` sorted by date).

        Parameters
        ----------
        date : pandas.Timestamp
            Last date that counts as already played.
        """
        self.doi = date
        self.doi_delta = self.doi + pd.Timedelta("1 days")

        self.season = self.data.loc[:self.doi]
        self.season_remaining = self.data.loc[self.doi_delta:]
        self.matchups_remaining = [
            list(pair)
            for pair in zip(self.season_remaining['HomeTeam'],
                            self.season_remaining['AwayTeam'])
        ]

        self.season_roundup = self.season[['HomeTeam', 'AwayTeam', 'FTHG', 'FTAG']].rename(
            columns={'HomeTeam': 'Home Team', 'AwayTeam': 'Away Team',
                     'FTHG': 'Home Goals', 'FTAG': 'Away Goals'})

        self.dict_goals = {team: self._team_log(team) for team in self.teams}

    def _team_log(self, team):
        """Return ``team``'s played matches as GF / GA / Opp / H/A rows sorted by date."""
        # Explicit copies: the slices are relabelled and extended below.
        home_games = self.season.loc[self.season['HomeTeam'] == team, self.home_cols].copy()
        home_games.columns = self.home_labels
        home_games['H/A'] = 'H'

        away_games = self.season.loc[self.season['AwayTeam'] == team, self.away_cols].copy()
        away_games.columns = self.away_labels
        away_games['H/A'] = 'A'

        # A stable sort keeps the result deterministic when dates repeat.
        return pd.concat([home_games, away_games], axis=0).sort_index(kind='mergesort')

    def get_performance(self):
        """Build the per-team tables used by ``get_poiss``.

        Requires ``get_history`` to have been called. Sets ``df_total``,
        ``df_home`` and ``df_away`` (all matches, home matches only, away
        matches only), each indexed by team with the columns

        * ``Average Goals For`` / ``Average Goals Against``: the team's mean
          goals scored / conceded in those matches;
        * ``Predicted Goals For``: mean, over the opponents faced in those
          matches, of each opponent's overall average goals against
          (``Predicted Goals Against`` is the mirror image);
        * ``Adjusted Goals For`` = (average / predicted) * average, and
          likewise for ``Adjusted Goals Against``.

        Also sets ``df_form``: the latest value of the exponentially weighted
        mean (``alpha=0.5``) of each team's goals for and against. Teams
        without any match in the relevant subset get NaN.
        """
        self._labels = ['Average Goals For', 'Average Goals Against',
                        'Predicted Goals For', 'Predicted Goals Against']
        self._adj_labels = ['Adjusted Goals For', 'Adjusted Goals Against']

        # Overall per-team averages, computed once and reused for every
        # opponent lookup instead of being recomputed per fixture.
        avg_for = {team: self.dict_goals[team]['GF'].mean() for team in self.teams}
        avg_against = {team: self.dict_goals[team]['GA'].mean() for team in self.teams}

        self.df_total = self._performance_table(None, avg_for, avg_against)
        self.df_home = self._performance_table('H', avg_for, avg_against)
        self.df_away = self._performance_table('A', avg_for, avg_against)

        self.df_form = pd.DataFrame({
            'Goals For': [self._latest_ewm(self.dict_goals[team]['GF']) for team in self.teams],
            'Goals Against': [self._latest_ewm(self.dict_goals[team]['GA']) for team in self.teams]},
            index=self.teams)

    def _performance_table(self, venue, avg_for, avg_against):
        """Build one performance table; ``venue`` is 'H', 'A' or None for all matches."""
        records = []
        for team in self.teams:
            games = self.dict_goals[team]
            if venue is not None:
                games = games[games['H/A'] == venue]
            opponents = list(games['Opp'])
            records.append([
                games['GF'].mean(),
                games['GA'].mean(),
                self._mean_or_nan([avg_against[opp] for opp in opponents]),
                self._mean_or_nan([avg_for[opp] for opp in opponents]),
            ])

        table = pd.DataFrame(records, columns=self._labels, index=self.teams, dtype=float)

        avg_gf, avg_ga, pred_gf, pred_ga = self._labels
        # Scale the raw average by how it compares with what the opponents
        # usually allow / score.
        table[self._adj_labels[0]] = (table[avg_gf] / table[pred_gf]) * table[avg_gf]
        table[self._adj_labels[1]] = (table[avg_ga] / table[pred_ga]) * table[avg_ga]
        return table

    @staticmethod
    def _mean_or_nan(values):
        """Mean of ``values``, or NaN (without a NumPy warning) when it is empty."""
        return np.mean(values) if len(values) else np.nan

    @staticmethod
    def _latest_ewm(series, alpha=0.5):
        """Last value of the exponentially weighted mean, or NaN for an empty series."""
        if series.empty:
            return np.nan
        return series.ewm(alpha=alpha).mean().iloc[-1]

    def get_poiss(self, matchups):
        """Return expected goals for each ``[home, away]`` pair in ``matchups``.

        Requires ``get_performance`` to have been called. Despite the name,
        no Poisson distribution is evaluated here; the result is a pair of
        expected-goal figures per fixture.

        Parameters
        ----------
        matchups : sequence of [home_team, away_team]
            Team names must appear in ``self.teams``; unknown names raise
            ``KeyError``.

        Returns
        -------
        pandas.DataFrame
            Columns ``Home Team``, ``Away Team``, ``Home Goals``,
            ``Away Goals``, indexed ``0..len(matchups)-1``. The intermediate
            inputs are kept on ``self.df_adjust``.
        """
        away_adv = 1 - self.home_adv
        anti_form = 1 - self.form_adv

        home_teams = [pair[0] for pair in matchups]
        away_teams = [pair[1] for pair in matchups]
        adj_for, adj_against = self._adj_labels

        def lookup(table, column, teams):
            # Values of `column` for `teams`, in fixture order.
            return table[column].loc[teams].to_numpy(dtype=float)

        self.df_adjust = pd.DataFrame({
            'Home Team': home_teams,
            'Away Team': away_teams,

            # Home side: own-venue (home) record weighted by home_adv.
            'Pred Home Goals For': self.home_adv * lookup(self.df_home, adj_for, home_teams)
            + away_adv * lookup(self.df_away, adj_for, home_teams),

            'Pred Home Goals Against': self.home_adv * lookup(self.df_home, adj_against, home_teams)
            + away_adv * lookup(self.df_away, adj_against, home_teams),

            # Away side: own-venue (away) record weighted by home_adv.
            'Pred Away Goals For': self.home_adv * lookup(self.df_away, adj_for, away_teams)
            + away_adv * lookup(self.df_home, adj_for, away_teams),

            'Pred Away Goals Against': self.home_adv * lookup(self.df_away, adj_against, away_teams)
            + away_adv * lookup(self.df_home, adj_against, away_teams),

            'Home Form For': lookup(self.df_form, 'Goals For', home_teams),
            'Home Form Against': lookup(self.df_form, 'Goals Against', home_teams),
            'Away Form For': lookup(self.df_form, 'Goals For', away_teams),
            'Away Form Against': lookup(self.df_form, 'Goals Against', away_teams)},

            index=np.arange(0, len(matchups)))

        adj = self.df_adjust

        # Season estimate (attack vs. opposing defence) blended with the mean
        # of the two relevant form figures.
        home_goals = (
            anti_form * (self.home_adv * adj['Pred Home Goals For']
                         + away_adv * adj['Pred Away Goals Against'])
            + self.form_adv * adj[['Home Form For', 'Away Form Against']].mean(axis=1))

        away_goals = (
            anti_form * (self.home_adv * adj['Pred Away Goals For']
                         + away_adv * adj['Pred Home Goals Against'])
            + self.form_adv * adj[['Away Form For', 'Home Form Against']].mean(axis=1))

        df_out = pd.DataFrame({
            'Home Team': adj['Home Team'],
            'Away Team': adj['Away Team'],
            'Home Goals': home_goals,
            'Away Goals': away_goals})

        return df_out

    def mse_run(self, granular=0.1):
        """Grid-search ``home_adv`` and ``form_adv`` against the remaining matches.

        Every combination of the two weights on the grid
        ``0, granular, 2*granular, ...`` (never exceeding 1) is scored by the
        mean squared error between the predicted and actual home/away goals
        of the matches in ``season_remaining``. Predictions and actual scores
        are paired by position, i.e. in fixture order.

        Requires ``get_history`` and ``get_performance`` to have been called.
        The instance's own ``home_adv`` / ``form_adv`` are restored before
        returning.

        Parameters
        ----------
        granular : float, default 0.1
            Grid step; must be positive.

        Returns
        -------
        list
            ``[best_home_adv, best_form_adv]``, or ``[nan, nan]`` (after
            printing "Season Finished") when no matches remain.

        Raises
        ------
        ValueError
            If ``granular`` is not positive.
        """
        if len(self.matchups_remaining) == 0:
            print("Season Finished")
            return [np.nan, np.nan]

        if granular <= 0:
            raise ValueError("granular must be positive")

        # Stopping half a step past 1 keeps 1.0 on the grid without ever
        # overshooting it; rounding strips float noise such as
        # 0.6000000000000001.
        grid = np.round(np.arange(0, 1 + granular / 2, granular), 10)

        # Actual scores as a plain array so they line up with the
        # predictions by position, not by index label.
        actual = self.season_remaining[['FTHG', 'FTAG']].to_numpy(dtype=float)

        errors = np.empty((grid.size, grid.size))
        saved_weights = (self.home_adv, self.form_adv)
        try:
            for (row, home_adv), (col, form_adv) in itertools.product(enumerate(grid),
                                                                      enumerate(grid)):
                self.home_adv, self.form_adv = float(home_adv), float(form_adv)

                score_pred = self.get_poiss(self.matchups_remaining)
                predicted = score_pred[['Home Goals', 'Away Goals']].to_numpy(dtype=float)

                errors[row, col] = mean_squared_error(actual, predicted)
        finally:
            # The search must not leave the model configured with the last
            # grid point.
            self.home_adv, self.form_adv = saved_weights

        # A single arg-min over the whole grid, so both returned weights
        # belong to the same cell even when several cells tie.
        best_row, best_col = np.unravel_index(np.argmin(errors), errors.shape)

        return [float(grid[best_row]), float(grid[best_col])]
        

In [156]:
# Show the team names exactly as they are spelled in the data, so the
# fixtures below can be typed to match.
print(data['HomeTeam'].unique())

# Fixtures to predict, each written as [home team, away team].
matchups = [
    ['Troyes', 'Lille'],
    ['Brest', 'Clermont'],
    ['Lorient', 'Reims'],
    ['Monaco', 'Angers'],
    ['Montpellier', 'Metz'],
    ['Bordeaux', 'Nice'],
    ['Marseille', 'Lyon'],
]

['Monaco' 'Lyon' 'Troyes' 'Rennes' 'Bordeaux' 'Nice' 'St Etienne'
 'Strasbourg' 'Metz' 'Montpellier' 'Lorient' 'Lille' 'Paris SG' 'Angers'
 'Brest' 'Clermont' 'Nantes' 'Reims' 'Lens' 'Marseille']


In [157]:
# Distinct match dates, computed once and reused for every checkpoint.
match_dates = data.index.unique()
len_time = len(match_dates)

# Checkpoints at 1/5, 1/4, 1/2 and 5/6 of the way through the distinct
# match dates (integer positions, rounded down).
positions = [len_time // 5, len_time // 4, len_time // 2, 5 * (len_time // 6)]
time_evolve = [match_dates[pos] for pos in positions]

time_evolve

[Timestamp('2021-09-22 00:00:00'),
 Timestamp('2021-10-03 00:00:00'),
 Timestamp('2021-12-11 00:00:00'),
 Timestamp('2022-03-05 00:00:00')]

In [158]:
# For each checkpoint date, rebuild the model from the matches played up to
# that date and print the [home_adv, form_adv] pair returned by mse_run().
for time in time_evolve:

    # `date` stays defined at notebook level after the loop; the prediction
    # cells further down call get_history(date) and therefore use the last
    # checkpoint.
    date = time
    print(date)

    # Fresh instance per checkpoint: default weights (0.5, 0.5).
    model = FootballDF(data)

    model.get_history(date)

    model.get_performance()


    print(model.mse_run())

2021-09-22 00:00:00
[1.0, 0.9]
2021-10-03 00:00:00
[0.6000000000000001, 0.6000000000000001]
2021-12-11 00:00:00
[0.6000000000000001, 0.0]
2022-03-05 00:00:00
[0.6000000000000001, 0.2]


In [160]:
# Predict the `matchups` fixtures with home_adv=0.1 and form_adv=0.9.
# `date` is the value left behind by the checkpoint loop (its last element).
# NOTE(review): the tuning loop printed [1.0, 0.9] for its first checkpoint;
# 0.1 here may be a typo for 1.0 — confirm.
predict_model = FootballDF(data, 0.1, 0.9)
predict_model.get_history(date)
predict_model.get_performance()
predict_model.get_poiss(matchups)

Unnamed: 0,Home Team,Away Team,Home Goals,Away Goals
0,Troyes,Lille,0.730229,1.604506
1,Brest,Clermont,0.952733,0.942988
2,Lorient,Reims,0.923465,1.988711
3,Monaco,Angers,1.377582,1.033775
4,Montpellier,Metz,1.045677,1.34476
5,Bordeaux,Nice,0.663007,1.610895
6,Marseille,Lyon,1.032761,1.686476


In [161]:
# Predict the `matchups` fixtures with home_adv=0.6 and form_adv=0.6, the
# pair the tuning loop printed for its second checkpoint.
# NOTE(review): `date` is still the last checkpoint from that loop, not the
# second one — confirm this is the intended history cut-off.
predict_model = FootballDF(data, 0.6, 0.6)
predict_model.get_history(date)
predict_model.get_performance()
predict_model.get_poiss(matchups)

Unnamed: 0,Home Team,Away Team,Home Goals,Away Goals
0,Troyes,Lille,0.754394,1.452492
1,Brest,Clermont,1.215818,0.957281
2,Lorient,Reims,0.843147,1.643348
3,Monaco,Angers,1.522842,1.007323
4,Montpellier,Metz,1.317969,1.185427
5,Bordeaux,Nice,0.876519,1.718368
6,Marseille,Lyon,1.216936,1.540922


In [162]:
# Predict the `matchups` fixtures with home_adv=0.6 and form_adv=0.2, the
# pair the tuning loop printed for its last checkpoint; `date` is that same
# last checkpoint, left over from the loop.
predict_model = FootballDF(data, 0.6, 0.2)
predict_model.get_history(date)
predict_model.get_performance()
predict_model.get_poiss(matchups)

Unnamed: 0,Home Team,Away Team,Home Goals,Away Goals
0,Troyes,Lille,0.839717,1.423372
1,Brest,Clermont,1.522151,1.021028
2,Lorient,Reims,0.747908,1.391222
3,Monaco,Angers,1.661283,1.006651
4,Montpellier,Metz,1.745964,1.084602
5,Bordeaux,Nice,1.098435,2.265913
6,Marseille,Lyon,1.384625,1.27188
