# 3. Data

In [1]:
import pandas as pd
import glob
import numpy as np
from scipy.optimize import minimize
import os
from scipy.stats import poisson

In [2]:
path =r"..\\data"
filenames = glob.glob(path + "\\football-data*.csv")

dfs = []
for filename in filenames:
    dfs.append(pd.read_csv(filename))

df = pd.concat(dfs)

df.dropna(subset=["Date", "HomeTeam", "AwayTeam", "FTHG", "FTAG"], inplace=True)

df.reset_index(inplace=True)
df

Unnamed: 0,index,Div,Date,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,...,AvgC<2.5,AHCh,B365CAHH,B365CAHA,PCAHH,PCAHA,MaxCAHH,MaxCAHA,AvgCAHH,AvgCAHA
0,0,E1,06/08/10,Norwich,Watford,2.0,3.0,A,0.0,2.0,...,,,,,,,,,,
1,1,E1,07/08/10,Bristol City,Millwall,0.0,3.0,A,0.0,1.0,...,,,,,,,,,,
2,2,E1,07/08/10,Burnley,Nott'm Forest,1.0,0.0,H,1.0,0.0,...,,,,,,,,,,
3,3,E1,07/08/10,Coventry,Portsmouth,2.0,0.0,H,1.0,0.0,...,,,,,,,,,,
4,4,E1,07/08/10,Crystal Palace,Leicester,3.0,2.0,H,3.0,0.0,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10247,375,E0,23/05/2021,Liverpool,Crystal Palace,2.0,0.0,H,1.0,0.0,...,3.49,-2.25,1.86,2.04,1.88,2.03,1.98,2.14,1.88,2.00
10248,376,E0,23/05/2021,Man City,Everton,5.0,0.0,H,2.0,0.0,...,2.77,-1.75,2.01,1.89,1.99,1.89,2.20,2.00,2.03,1.85
10249,377,E0,23/05/2021,Sheffield United,Burnley,1.0,0.0,H,1.0,0.0,...,2.05,0.00,2.04,1.86,2.05,1.86,2.17,1.90,2.03,1.84
10250,378,E0,23/05/2021,West Ham,Southampton,3.0,0.0,H,2.0,0.0,...,2.14,-0.75,2.00,1.90,2.02,1.91,2.06,2.01,1.99,1.89


## TABLE 1
*Empirical estimates for each score probability for joint and marginal probability functions*

In [3]:
goals_crosstable = pd.crosstab(index=df["FTHG"], columns=df["FTAG"]) / len(df)
goals_crosstable.style.background_gradient(cmap="RdYlGn")

FTAG,0.0,1.0,2.0,3.0,4.0,5.0,6.0,7.0,8.0,9.0
FTHG,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0.0,0.07579,0.078131,0.047015,0.0238,0.007608,0.002048,0.00078,9.8e-05,9.8e-05,9.8e-05
1.0,0.101736,0.119782,0.072083,0.028872,0.008584,0.002341,0.00078,9.8e-05,0.0,0.0
2.0,0.077058,0.088275,0.054038,0.019021,0.004487,0.001463,9.8e-05,0.0,0.0,0.0
3.0,0.042236,0.040187,0.024581,0.010242,0.002536,0.000488,0.000195,9.8e-05,0.0,0.0
4.0,0.016387,0.014339,0.007998,0.003804,0.00078,0.000293,9.8e-05,0.0,0.0,0.0
5.0,0.006633,0.004292,0.002439,0.001171,0.00039,0.000293,0.0,0.0,0.0,0.0
6.0,0.001853,0.002048,0.000293,0.000195,0.0,0.0,0.0,0.0,0.0,0.0
7.0,0.000293,0.000585,0.00039,9.8e-05,0.0,0.0,0.0,0.0,0.0,0.0
8.0,0.00039,0.0,9.8e-05,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9.0,9.8e-05,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## TABLE 2
*Estimates the ratios of the observed joint probability function and the empirical probability function obtained under the assumption of independence between the home and away scores* 

In [4]:
# Marginal home goals
f_H_i = goals_crosstable.sum(axis=1)

# Marginal away goals
f_A_j = goals_crosstable.sum(axis=0)

# Probabilities if independent
independent_crosstable = f_H_i.apply(lambda r: r * f_A_j)

In [5]:
independence_test_crosstable = (goals_crosstable/independent_crosstable) * 100
independence_test_crosstable

FTAG,0.0,1.0,2.0,3.0,4.0,5.0,6.0,7.0,8.0,9.0
FTHG,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0.0,99.81355,95.447851,95.565009,115.91073,132.503065,125.612332,169.875725,141.563104,424.689312,424.689312
1.0,94.379126,103.075432,103.20945,99.048678,105.30213,101.122404,119.661512,99.717926,0.0,0.0
2.0,97.75788,103.881548,105.807835,89.232821,75.274062,86.42919,20.454908,0.0,0.0,0.0
3.0,108.636352,95.884774,97.582334,97.418605,86.262783,58.41196,82.944984,276.483279,0.0,0.0
4.0,116.288566,94.386574,87.604208,99.829219,73.228571,96.692656,114.419643,0.0,0.0,0.0
5.0,135.173034,81.13327,76.701621,88.212012,105.148718,277.681473,0.0,0.0,0.0,0.0
6.0,130.932312,134.238683,31.907874,50.96694,0.0,0.0,0.0,0.0,0.0,0.0
7.0,66.450609,123.280423,136.748033,81.911154,0.0,0.0,0.0,0.0,0.0,0.0
8.0,248.082275,0.0,95.723623,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9.0,310.102843,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Where table 2 differs from 100 outlines areas where the independence assumption between scores breaks down

# 4. Model and Inference

In [6]:
# Correlation function

def tau(x, y, _lambda, mu, rho):
    if (x==0 and y==0):
        return (1 - _lambda*mu*rho)
    elif (x==0 and y==1):
        return (1 + _lambda*rho)
    elif (x==1 and y==0):
        return (1 + mu*rho)
    elif (x==1 and y==1):
        return (1 - rho)
    else:
        return 1.

# Log Likelihood function for a single match

def match_log_likelihood(x_k, y_k, alpha_ik, beta_ik, alpha_jk, beta_jk, rho, gamma):
    
    lambda_k = np.exp(alpha_ik + beta_jk + gamma)
    mu_k = np.exp(alpha_jk + beta_ik)
    
    return (
        np.log(tau(x_k, y_k, lambda_k, mu_k, rho))
        + np.log(poisson.pmf(x_k, lambda_k))
        + np.log(poisson.pmf(y_k, mu_k))
    )

# Log Likelihood for a dataset of matches

def log_likelhood(params):
    
    score_coefs = dict(zip(teams, params[:n_teams]))
    defend_coefs = dict(zip(teams, params[n_teams:(2*n_teams)]))
    rho, gamma = params[-2 : ]
    
    log_like = [match_log_likelihood(
        row.FTHG,
        row.FTAG,
        score_coefs[row.HomeTeam], 
        defend_coefs[row.HomeTeam],
        score_coefs[row.AwayTeam],
        defend_coefs[row.AwayTeam],
        rho,
        gamma
    ) for row in dataset.itertuples()]
    
    return -sum(log_like)

In [7]:
epl1718 = pd.read_csv(path + "\\football-data EPL 17-18.csv")
dataset = epl1718[["HomeTeam", "AwayTeam", "FTHG", "FTAG"]]
dataset.head()

Unnamed: 0,HomeTeam,AwayTeam,FTHG,FTAG
0,Arsenal,Leicester,4,3
1,Brighton,Man City,0,2
2,Chelsea,Burnley,2,3
3,Crystal Palace,Huddersfield,0,3
4,Everton,Stoke,1,0


In [8]:
# Set up and run optimiser to find mle parameters

teams = dataset["HomeTeam"].unique()
n_teams = len(teams)

init_vals = np.concatenate((np.random.uniform(0,1,(n_teams)), # attack strength
                              np.random.uniform(0,-1,(n_teams)), # defence strength
                              np.array([0, 1.0]) # rho (score correction), gamma (home advantage)
                             ))

def equality_constraint(params):
    return (sum(params[:n_teams]) - n_teams)

In [9]:
opt_params = minimize(
    log_likelhood,
    init_vals, 
    constraints={'type':'eq', 'fun': equality_constraint}, 
    options={'disp': True, 'maxiter':100}
)

  + np.log(poisson.pmf(x_k, lambda_k))
  + np.log(poisson.pmf(y_k, mu_k))
  np.log(tau(x_k, y_k, lambda_k, mu_k, rho))


Optimization terminated successfully    (Exit mode 0)
            Current function value: 1050.800745660909
            Iterations: 57
            Function evaluations: 2535
            Gradient evaluations: 57


In [10]:
parameters = dict(zip([f"attack_{team}" for team in teams] + 
                      [f"defence_{team}" for team in teams] +
                      ["rho", "home_adv"],
                      opt_params.x))
parameters

{'attack_Arsenal': 1.447581739538375,
 'attack_Brighton': 0.684708022872723,
 'attack_Chelsea': 1.2572198610038117,
 'attack_Crystal Palace': 0.9493156603965089,
 'attack_Everton': 0.9377188245851249,
 'attack_Southampton': 0.7651783852659586,
 'attack_Watford': 0.9338757102521041,
 'attack_West Brom': 0.5837382645183146,
 'attack_Man United': 1.330941446146212,
 'attack_Newcastle': 0.7670461989755757,
 'attack_Bournemouth': 0.9564336747020977,
 'attack_Burnley': 0.6983167047570226,
 'attack_Leicester': 1.189897994104512,
 'attack_Liverpool': 1.5643527380377609,
 'attack_Stoke': 0.7195898977186809,
 'attack_Swansea': 0.46644872648808644,
 'attack_Huddersfield': 0.489319786327788,
 'attack_Tottenham': 1.427342123026886,
 'attack_Man City': 1.7860039332798494,
 'attack_West Ham': 1.0449703080026072,
 'defence_Arsenal': -0.9058105380956889,
 'defence_Brighton': -0.8945948601796857,
 'defence_Chelsea': -1.2203586571879101,
 'defence_Crystal Palace': -0.8536729764326991,
 'defence_Everton':

In [11]:
# Use these parameters to make predictions

def calc_means(param_dict, home_team, away_team):
    return [np.exp(param_dict[f"attack_{home_team}"] + param_dict[f"defence_{away_team}"] + param_dict["home_adv"]),
            np.exp(param_dict[f"defence_{home_team}"] + param_dict[f"attack_{away_team}"])]

def dixon_coles_simulate_match(params_dict, home_team, away_team, max_goals=10):
    team_avgs = calc_means(params_dict, home_team, away_team)
    team_pred = [[poisson.pmf(i, team_avg) for i in range(0, max_goals + 1)] for team_avg in team_avgs]
    output_matrix = np.outer(np.array(team_pred[0]), np.array(team_pred[1]))
    correction_matrix = np.array([[tau(
        home_goals, 
        away_goals,
        team_avgs[0],
        team_avgs[1],
        params_dict['rho']
    ) for away_goals in range(2)] for home_goals in range(2)])
    
    output_matrix[ : 2, : 2] = output_matrix[ : 2, : 2] * correction_matrix
    return output_matrix

In [12]:
def probabilities(output_matrix) -> list[str, str, str]:  # [home_win_prob, draw_prob, away_win_prob]
        home_win_prob = np.tril(out).sum() - np.trace(out)
        draw_prob = np.trace(out)
        away_win_prob = np.triu(out).sum() - np.trace(out)
        return [home_win_prob, draw_prob, away_win_prob]

## Model Enhancement

Limitation of the model is that the parameters are static. In reality a teams performance is dynamic and varies overtime with a team's performance likely being more closely related to their performance in recent matches than in earlier matches. 

In [13]:
df_me = epl1718.copy()

In [14]:
# Dates come in the two formats which is a pain
df_me["lower_case_dates"] = pd.to_datetime(df_me["Date"], format="%d/%m/%y", errors="coerce")
df_me["upper_case_dates"] = pd.to_datetime(df_me["Date"], format="%d/%m/%Y", errors="coerce")
df_me["lower_case_dates"].fillna(df_me["upper_case_dates"], inplace=True)
df_me["Date"] = df_me["lower_case_dates"]
df_me.drop(columns=["lower_case_dates", "upper_case_dates"], inplace=True)

# Create time difference variable
df_me["time_diff"] = (max(df_me["Date"]) - df_me["Date"]).dt.days
df_me = df_me[["HomeTeam","AwayTeam", "FTHG", "FTAG", "FTR", "time_diff"]]
df_me

Unnamed: 0,HomeTeam,AwayTeam,FTHG,FTAG,FTR,time_diff
0,Arsenal,Leicester,4,3,H,275
1,Brighton,Man City,0,2,A,274
2,Chelsea,Burnley,2,3,A,274
3,Crystal Palace,Huddersfield,0,3,A,274
4,Everton,Stoke,1,0,H,274
...,...,...,...,...,...,...
375,Newcastle,Chelsea,3,0,H,0
376,Southampton,Man City,0,1,A,0
377,Swansea,Stoke,1,2,A,0
378,Tottenham,Leicester,5,4,H,0


In [15]:
def solve_parameters_decay(dataset, xi, init_vals=None, options={'disp': True, 'maxiter':100}):
    teams = np.sort(dataset["HomeTeam"].unique())
    n_teams = len(teams)
    
    if init_vals is None:
        init_vals = np.concatenate((
            np.random.uniform(0,1,(n_teams)),  # Attack strength
            np.random.uniform(0,-1,(n_teams)),  # Defence strength
            np.array([0, 1.0])  # Rho (score correction), Gamma (home advantage)
        ))
    
    def me_match_log_likelihood(x_k, y_k, alpha_ik, beta_ik, alpha_jk, beta_jk, rho, gamma, t, xi):
        
        lambda_k = np.exp(alpha_ik + beta_jk + gamma)  # Home expectation
        mu_k = np.exp(alpha_jk + beta_ik)  # Away expectation
        
        return (
            np.exp(-xi*t) *
            (
                np.log(tau(x_k, y_k, lambda_k, mu_k, rho)) + 
                np.log(poisson.pmf(x_k, lambda_k)) + 
                np.log(poisson.pmf(y_k, mu_k))
            )
        )

    def me_log_likelhood(params):
        
        score_coefs = dict(zip(teams, params[ : n_teams]))
        defend_coefs = dict(zip(teams, params[n_teams : (2*n_teams)]))
        rho, gamma = params[-2 : ]

        log_like = [me_match_log_likelihood(
            row.FTHG,
            row.FTAG,
            score_coefs[row.HomeTeam], 
            defend_coefs[row.HomeTeam],
            score_coefs[row.AwayTeam],
            defend_coefs[row.AwayTeam], 
            rho,
            gamma,
            row.time_diff,
            xi
        )
                    for row in dataset.itertuples()]

        return -sum(log_like)
    
    def equality_constraint(params):
        return (sum(params[ : n_teams]) - n_teams)
    
    opt_output = minimize(
        me_log_likelhood, 
        init_vals, 
        constraints={'type':'eq', 'fun': equality_constraint}, 
        options={'disp': True, 'maxiter':100}
    )
    
    parameters = dict(zip([f"attack_{team}" for team in teams] + 
                          [f"defence_{team}" for team in teams] +
                          ['rho', 'home_adv'],
                          opt_output.x))

    return parameters

In [16]:
params_xi = solve_parameters_decay(df_me, xi=0.0018)
params_xi

  np.log(poisson.pmf(x_k, lambda_k)) +
  np.log(poisson.pmf(y_k, mu_k))
  np.log(tau(x_k, y_k, lambda_k, mu_k, rho)) +


Optimization terminated successfully    (Exit mode 0)
            Current function value: 832.6598927433536
            Iterations: 43
            Function evaluations: 1915
            Gradient evaluations: 43


{'attack_Arsenal': 1.459334000632671,
 'attack_Bournemouth': 0.9855179816551554,
 'attack_Brighton': 0.6993298112685152,
 'attack_Burnley': 0.7044087657437679,
 'attack_Chelsea': 1.2374170940408478,
 'attack_Crystal Palace': 1.0098278011604558,
 'attack_Everton': 0.9429096364765497,
 'attack_Huddersfield': 0.46242984941796134,
 'attack_Leicester': 1.187478341209066,
 'attack_Liverpool': 1.5541097017432945,
 'attack_Man City': 1.7731795578461689,
 'attack_Man United': 1.2929458360337873,
 'attack_Newcastle': 0.7805381273816665,
 'attack_Southampton': 0.7700898860888815,
 'attack_Stoke': 0.7003859625087099,
 'attack_Swansea': 0.46822322293798296,
 'attack_Tottenham': 1.4286151329641115,
 'attack_Watford': 0.8873808515535514,
 'attack_West Brom': 0.5979649109872698,
 'attack_West Ham': 1.0579135283495855,
 'defence_Arsenal': -0.9036316438973376,
 'defence_Bournemouth': -0.7435993814484063,
 'defence_Brighton': -0.8857244597215709,
 'defence_Burnley': -1.1824575610115593,
 'defence_Chelsea