# 3. Data

In [5]:
import pandas as pd
import glob
import numpy as np
from scipy.optimize import minimize
import os
from scipy.stats import poisson

In [6]:
path =r'..\\data'
filenames = glob.glob(path + "\\football-data*.csv")

dfs = []
for filename in filenames:
    dfs.append(pd.read_csv(filename))

df = pd.concat(dfs)

df.dropna(subset=["Date", "HomeTeam", "AwayTeam", "FTHG", "FTAG"], inplace=True)

df.reset_index(inplace=True)
df

# TEST
#df.to_csv("test.csv")
#os.startfile("test.csv")

Unnamed: 0,index,Div,Date,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,...,AvgC<2.5,AHCh,B365CAHH,B365CAHA,PCAHH,PCAHA,MaxCAHH,MaxCAHA,AvgCAHH,AvgCAHA
0,0,E1,06/08/10,Norwich,Watford,2.0,3.0,A,0.0,2.0,...,,,,,,,,,,
1,1,E1,07/08/10,Bristol City,Millwall,0.0,3.0,A,0.0,1.0,...,,,,,,,,,,
2,2,E1,07/08/10,Burnley,Nott'm Forest,1.0,0.0,H,1.0,0.0,...,,,,,,,,,,
3,3,E1,07/08/10,Coventry,Portsmouth,2.0,0.0,H,1.0,0.0,...,,,,,,,,,,
4,4,E1,07/08/10,Crystal Palace,Leicester,3.0,2.0,H,3.0,0.0,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10247,375,E0,23/05/2021,Liverpool,Crystal Palace,2.0,0.0,H,1.0,0.0,...,3.49,-2.25,1.86,2.04,1.88,2.03,1.98,2.14,1.88,2.00
10248,376,E0,23/05/2021,Man City,Everton,5.0,0.0,H,2.0,0.0,...,2.77,-1.75,2.01,1.89,1.99,1.89,2.20,2.00,2.03,1.85
10249,377,E0,23/05/2021,Sheffield United,Burnley,1.0,0.0,H,1.0,0.0,...,2.05,0.00,2.04,1.86,2.05,1.86,2.17,1.90,2.03,1.84
10250,378,E0,23/05/2021,West Ham,Southampton,3.0,0.0,H,2.0,0.0,...,2.14,-0.75,2.00,1.90,2.02,1.91,2.06,2.01,1.99,1.89


In [None]:
df_1 = df[["FTHG", "FTAG"]]
df_1

## TABLE 1
*Empirical estimates for each score probability for joint and marginal probability functions*

In [None]:
goalsCrosstable = pd.crosstab(index=df["FTHG"], columns=df["FTAG"])/len(df_1)
goalsCrosstable.style.background_gradient(cmap="RdYlGn")

## TABLE 2
*Estimates the ratios of the observed joint probability function and the empirical probability function obtained under the assumption of independence between the home and away scores* 

In [None]:
independenceCrosstable = goalsCrosstable.copy()

In [None]:
# Marginal home goals
f_H_i = independenceCrosstable.sum(axis=1)

In [None]:
# Marginal away goals
f_A_j = independenceCrosstable.sum(axis=0)

In [None]:
# Probabilities if independent
independentCrosstable = f_H_i.apply(lambda r: r*f_A_j)
#independentCrosstable.style.background_gradient(cmap="RdYlGn")

In [None]:
independenceTestCrosstable = (independenceCrosstable/independentCrosstable)*100
independenceTestCrosstable

Where table 2 differs from 100 outlines areas where the independence assumption between scores breaks down

# 4. Model and Inference

In [13]:
def tau(x, y, _lambda, mu, rho):
    if (x==0 and y==0):
        return (1 - _lambda*mu*rho)
    elif (x==0 and y==1):
        return (1 + _lambda*rho)
    elif (x==1 and y==0):
        return (1 + mu*rho)
    elif (x==1 and y==1):
        return (1 - rho)
    else:
        return 1.

In [None]:
def match_log_likelihood(x_k, y_k, alpha_ik, beta_ik, alpha_jk, beta_jk, rho, gamma):
    
    lambda_k = np.exp(alpha_ik + beta_jk + gamma)
    mu_k = np.exp(alpha_jk + beta_ik)
    
    return (
        np.log(tau(x_k, y_k, lambda_k, mu_k, rho))
        + np.log(poisson.pmf(x_k, lambda_k))
        + np.log(poisson.pmf(y_k, mu_k))
    )

In [7]:
epl1718 = pd.read_csv(path + "\\football-data EPL 17-18.csv")

In [None]:
dataset = epl1718[["HomeTeam", "AwayTeam", "FTHG", "FTAG"]]
dataset.head()

In [None]:
teams = dataset['HomeTeam'].unique()
n_teams = len(teams)

In [None]:
init_vals = np.concatenate((np.random.uniform(0,1,(n_teams)), # attack strength
                              np.random.uniform(0,-1,(n_teams)), # defence strength
                              np.array([0, 1.0]) # rho (score correction), gamma (home advantage)
                             ))

In [None]:
def log_likelhood(params):
    
    score_coefs = dict(zip(teams, params[:n_teams]))
    defend_coefs = dict(zip(teams, params[n_teams:(2*n_teams)]))
    rho, gamma = params[-2:]
    
    log_like = [match_log_likelihood(row.FTHG, row.FTAG, score_coefs[row.HomeTeam], defend_coefs[row.HomeTeam],
             score_coefs[row.AwayTeam], defend_coefs[row.AwayTeam], rho, gamma) for row in dataset.itertuples()]
    
    return -sum(log_like)

In [None]:
def equality_constraint(params):
    return (sum(params[:n_teams]) - n_teams)

In [None]:
opt_params = minimize(
    log_likelhood,
    init_vals, 
    constraints={'type':'eq', 'fun': equality_constraint}, 
    options={'disp': True, 'maxiter':100}
)

In [None]:
parameters = dict(zip(["attack_"+team for team in teams] + 
                        ["defence_"+team for team in teams] +
                        ['rho', 'home_adv'],
                        opt_params.x))
parameters

In [None]:
# Use these parameters to make predictions

def calc_means(param_dict, homeTeam, awayTeam):
    return [np.exp(param_dict['attack_'+homeTeam] + param_dict['defence_'+awayTeam] + param_dict['home_adv']),
            np.exp(param_dict['defence_'+homeTeam] + param_dict['attack_'+awayTeam])]

def dixon_coles_simulate_match(params_dict, homeTeam, awayTeam, max_goals=10):
    team_avgs = calc_means(params_dict, homeTeam, awayTeam)
    team_pred = [[poisson.pmf(i, team_avg) for i in range(0, max_goals+1)] for team_avg in team_avgs]
    output_matrix = np.outer(np.array(team_pred[0]), np.array(team_pred[1]))
    correction_matrix = np.array([[tau(home_goals, away_goals, team_avgs[0],
                                                   team_avgs[1], params_dict['rho']) for away_goals in range(2)]
                                   for home_goals in range(2)])
    output_matrix[:2,:2] = output_matrix[:2,:2] * correction_matrix
    return output_matrix

In [None]:
def probabilities(output_matrix) -> list[str, str, str]:  # [home_win_prob, draw_prob, away_win_prob]
        home_win_prob = np.tril(out).sum() - np.trace(out)
        draw_prob = np.trace(out)
        away_win_prob = np.triu(out).sum() - np.trace(out)
        return [home_win_prob, draw_prob, away_win_prob]

## Model Enhancement

Limitation of the model is that the parameters are static. In reality a teams performance is dynamic and varies overtime with a team's performance likely being more closely related to their performance in recent matches than in earlier matches. 

In [8]:
df_me = epl1718.copy()

In [9]:
# Dates come in the two formats which is a pain
df_me["lower_case_dates"] = pd.to_datetime(df_me["Date"], format='%d/%m/%y', errors='coerce')
df_me["upper_case_dates"] = pd.to_datetime(df_me["Date"], format='%d/%m/%Y', errors='coerce')
df_me["lower_case_dates"].fillna(df_me["upper_case_dates"], inplace=True)
df_me["Date"] = df_me["lower_case_dates"]
df_me.drop(columns=["lower_case_dates", "upper_case_dates"], inplace=True)

In [10]:
df_me['time_diff'] = (max(df_me['Date']) - df_me['Date']).dt.days
df_me = df_me[['HomeTeam','AwayTeam','FTHG','FTAG', 'FTR', 'time_diff']]
df_me

Unnamed: 0,HomeTeam,AwayTeam,FTHG,FTAG,FTR,time_diff
0,Arsenal,Leicester,4,3,H,275
1,Brighton,Man City,0,2,A,274
2,Chelsea,Burnley,2,3,A,274
3,Crystal Palace,Huddersfield,0,3,A,274
4,Everton,Stoke,1,0,H,274
...,...,...,...,...,...,...
375,Newcastle,Chelsea,3,0,H,0
376,Southampton,Man City,0,1,A,0
377,Swansea,Stoke,1,2,A,0
378,Tottenham,Leicester,5,4,H,0


In [11]:
def solve_parameters_decay(dataset, xi, init_vals=None, options={'disp': True, 'maxiter':100}):
    teams = np.sort(dataset['HomeTeam'].unique())
    n_teams = len(teams)
    
    if init_vals is None:
        init_vals = np.concatenate((
            np.random.uniform(0,1,(n_teams)), # Attack strength
            np.random.uniform(0,-1,(n_teams)), # Defence strength
            np.array([0, 1.0]) # Rho (score correction), Gamma (home advantage)
        ))
    
    def me_match_log_likelihood(x_k, y_k, alpha_ik, beta_ik, alpha_jk, beta_jk, rho, gamma, t, xi):
        
        lambda_k = np.exp(alpha_ik + beta_jk + gamma)  # Home expectation
        mu_k = np.exp(alpha_jk + beta_ik)  # Away expectation
        
        return (
            np.exp(-xi*t) *
            (
                np.log(tau(x_k, y_k, lambda_k, mu_k, rho)) + 
                np.log(poisson.pmf(x_k, lambda_k)) + 
                np.log(poisson.pmf(y_k, mu_k))
            )
        )

    def me_log_likelhood(params):
        
        score_coefs = dict(zip(teams, params[:n_teams]))
        defend_coefs = dict(zip(teams, params[n_teams:(2*n_teams)]))
        rho, gamma = params[-2:]

        log_like = [me_match_log_likelihood(
            row.FTHG,
            row.FTAG,
            score_coefs[row.HomeTeam], 
            defend_coefs[row.HomeTeam],
            score_coefs[row.AwayTeam],
            defend_coefs[row.AwayTeam], 
            rho,
            gamma,
            row.time_diff,
            xi
        )
                    for row in dataset.itertuples()]

        return -sum(log_like)
    
    def equality_constraint(params):
        return (sum(params[:n_teams]) - n_teams)
    
    opt_output = minimize(
        me_log_likelhood, 
        init_vals, 
        constraints={'type':'eq', 'fun': equality_constraint}, 
        options={'disp': True, 'maxiter':100}
    )
    
    parameters = dict(zip(["attack_"+team for team in teams] + 
                          ["defence_"+team for team in teams] +
                          ['rho', 'home_adv'],
                          opt_output.x))

    return parameters

In [14]:
params_xi = solve_parameters_decay(df_me, xi=0.0018)

  np.log(poisson.pmf(x_k, lambda_k)) +
  np.log(poisson.pmf(y_k, mu_k))
  np.log(tau(x_k, y_k, lambda_k, mu_k, rho)) +


Optimization terminated successfully    (Exit mode 0)
            Current function value: 832.6598921048931
            Iterations: 55
            Function evaluations: 2446
            Gradient evaluations: 55


In [15]:
params_xi

{'attack_Arsenal': 1.4593655695091265,
 'attack_Bournemouth': 0.9854949336449211,
 'attack_Brighton': 0.6992877543930777,
 'attack_Burnley': 0.7043694610605504,
 'attack_Chelsea': 1.2374570961899718,
 'attack_Crystal Palace': 1.0097590260233091,
 'attack_Everton': 0.942875163093009,
 'attack_Huddersfield': 0.46240033726421015,
 'attack_Leicester': 1.1875062151730746,
 'attack_Liverpool': 1.55411824344368,
 'attack_Man City': 1.773196633400078,
 'attack_Man United': 1.2929705620193013,
 'attack_Newcastle': 0.7805557116855908,
 'attack_Southampton': 0.770070435017185,
 'attack_Stoke': 0.700480552681757,
 'attack_Swansea': 0.4682608325094173,
 'attack_Tottenham': 1.4286366733153364,
 'attack_Watford': 0.8873660996228072,
 'attack_West Brom': 0.597935872899844,
 'attack_West Ham': 1.0578928270537544,
 'defence_Arsenal': -0.9035595639772691,
 'defence_Bournemouth': -0.7435749724191211,
 'defence_Brighton': -0.8857435328341899,
 'defence_Burnley': -1.182411436494782,
 'defence_Chelsea': -1.1