In [1]:
import pandas as pd
import glob
import numpy as np
from scipy.optimize import minimize
import os
from scipy.stats import poisson
from collections import Counter
import scipy

In [2]:
path =r'..\\data'
epl1718 = pd.read_csv(path+"\\football-data EPL 17-18.csv")
dataset = epl1718
dataset = dataset[["HomeTeam", "AwayTeam", "FTHG", "FTAG"]]
dataset.head()

Unnamed: 0,HomeTeam,AwayTeam,FTHG,FTAG
0,Arsenal,Leicester,4,3
1,Brighton,Man City,0,2
2,Chelsea,Burnley,2,3
3,Crystal Palace,Huddersfield,0,3
4,Everton,Stoke,1,0


In [3]:
teams = dataset['HomeTeam'].unique()
n_teams = len(teams)
n_teams

20

Model Specification: (Home Team Attack, Away Team Defense, Home Team Defense, Away Team Attack)

# Model 4: (alpha_i, beta_i, gamma_i, delta_i)

Every team has 4 unique parameters:
   * Attack at home
   * Defense away
   * Defense at home
   * Attack away

In [4]:
init_vals = np.concatenate((
    np.random.uniform(0,1,(n_teams)),  # attack home strength
    np.random.uniform(0,-1,(n_teams)),  # defence away strength
    np.random.uniform(0,-1,(n_teams)),  # defence home strength
    np.random.uniform(0,1,(n_teams)),  # attack away strength
))

In [5]:
def match_log_likelihood(x_k, y_k, alpha_ik, beta_jk, beta_ik, alpha_jk):
    lambda_k = np.exp(alpha_ik + beta_jk)  # Home goals expected
    mu_k = np.exp(alpha_jk + beta_ik)  # Away goals expected
    
    return (
        np.log(poisson.pmf(x_k, lambda_k))
        + np.log(poisson.pmf(y_k, mu_k))
    )

In [6]:
def log_likelhood(params):
    score_h_coefs = dict(zip(teams, params[:n_teams]))
    defend_a_coefs = dict(zip(teams, params[n_teams:(2*n_teams)]))
    defend_h_coefs = dict(zip(teams, params[(2*n_teams):(3*n_teams)]))
    score_a_coefs = dict(zip(teams, params[(3*n_teams):])) 
    
    log_like = [match_log_likelihood
                (
                    row.FTHG, 
                    row.FTAG, 
                    score_h_coefs[row.HomeTeam], 
                    defend_a_coefs[row.AwayTeam], 
                    defend_h_coefs[row.HomeTeam], 
                    score_a_coefs[row.AwayTeam]
                ) 
                for row in dataset.itertuples()]
    
    return -sum(log_like)

In [7]:
def equality_constraint_1(params):
    return (sum(params[:n_teams]) - sum(params[n_teams:(2*n_teams)]))

In [8]:
def equality_constraint_2(params):
    return (sum(params[(2*n_teams):(3*n_teams)]) - sum(params[(3*n_teams):]))

In [9]:
opt_params_m4 = minimize(
    log_likelhood,
    init_vals,
    constraints=[{'type':'eq', 'fun': equality_constraint_1}, {'type':'eq', 'fun': equality_constraint_2}], 
    options={'disp': True, 'maxiter':100}
)

  np.log(poisson.pmf(x_k, lambda_k))
  + np.log(poisson.pmf(y_k, mu_k))


Iteration limit reached    (Exit mode 9)
            Current function value: 1032.9658439461423
            Iterations: 100
            Function evaluations: 8225
            Gradient evaluations: 100


In [10]:
parameters_m4 = dict(zip(["attack_h_" + team for team in teams] + 
                         ["defence_a_" + team for team in teams] +
                         ['defence_h_' + team for team in teams] +
                         ['attack_a_' + team for team in teams],
                         opt_params_m4.x))

In [11]:
Model_4_ll = opt_params_m4["fun"]  # Model 4 Log-Likelihood
print(f"Model 4 Log-Likelihood: {Model_4_ll}")

Model 4 Log-Likelihood: 1032.9658439461423


# Model 2: (alpha_i, beta_i, kappa\*beta_i, kappa\*alpha_i)

**Every team has 2 unique parameters:**
* Attack
* Defense  

**1 Global parameter:**
* kappa - home adv same for every team

In [12]:
init_vals = np.concatenate((
    np.random.uniform(0,1,(n_teams)),  # attack strength
    np.random.uniform(0,-1,(n_teams)),  # defence strength
    [1.]  # Home effect
))

In [13]:
def match_log_likelihood(x_k, y_k, alpha_ik, beta_jk, beta_ik, alpha_jk, kappa):
    lambda_k = np.exp(alpha_ik + beta_jk + kappa)  # Home goals expected 
    mu_k = np.exp(alpha_jk + beta_ik)  # Away goals expected
    
    return (
        np.log(poisson.pmf(x_k, lambda_k)) +
        np.log(poisson.pmf(y_k, mu_k))
    )

In [14]:
def log_likelhood(params):
    score_coefs = dict(zip(teams, params[:n_teams]))
    defend_coefs = dict(zip(teams, params[n_teams:(2*n_teams)]))
    kappa = params[-1]
    
    log_like = [match_log_likelihood(
        row.FTHG, 
        row.FTAG,
        score_coefs[row.HomeTeam], 
        defend_coefs[row.AwayTeam], 
        defend_coefs[row.HomeTeam],
        score_coefs[row.AwayTeam],
        kappa
    ) 
                for row in dataset.itertuples()]
    
    return -sum(log_like)

In [15]:
def equality_constraint(params):
    return (sum(params[:n_teams]) - n_teams)

In [16]:
opt_params_m2 = minimize(
    log_likelhood, 
    init_vals, 
    constraints={'type':'eq', 'fun': equality_constraint}, 
    options={'disp': True, 'maxiter':100}
)

  np.log(poisson.pmf(x_k, lambda_k)) +
  np.log(poisson.pmf(y_k, mu_k))


Optimization terminated successfully    (Exit mode 0)
            Current function value: 1052.3376825829237
            Iterations: 46
            Function evaluations: 2004
            Gradient evaluations: 46


In [17]:
parameters_m2 = dict(zip(["attack_" + team for team in teams] + 
                         ["defence_" + team for team in teams] +
                         ['home_adv'],
                         opt_params_m2.x))

In [18]:
Model_2_ll = opt_params_m2["fun"]
print(f"Model 2 Log-Likelihood: {Model_2_ll}")

Model 2 Log-Likelihood: 1052.3376825829237


# Model 0: (alpha, beta, gamma, delta)

**4 Global parameters:**
* Attack when at home
* Defense when away
* Defense when at home
* Attack when away


In [19]:
init_vals = [1.,  # Home Attack
             -1.,  # Away Defencee
             -1.,  # Home Defence
             1.]  # Away Attack

In [20]:
def match_log_likelihood(x_k, y_k, alpha_ik, beta_jk, beta_ik, alpha_jk):
    lambda_k = np.exp(alpha_ik + beta_jk)  # Home goals expected
    mu_k = np.exp(alpha_jk + beta_ik)  # Away goals expected
    
    return (
        np.log(poisson.pmf(x_k, lambda_k))
        + np.log(poisson.pmf(y_k, mu_k))
    )

In [21]:
def log_likelhood(params):
    home_attack = params[0]
    away_defence = params[1]
    home_defence = params[2]
    away_attack = params[3]    
    
    log_like = [match_log_likelihood(
        row.FTHG,
        row.FTAG,
        home_attack,
        away_defence,
        home_defence,
        away_attack
    ) 
                for row in dataset.itertuples()]
    
    return -sum(log_like)

In [22]:
def equality_constraint_1(params):
    return (params[0] - params[1])

def equality_constraint_2(params):
    return (params[2] - params[3])

In [23]:
opt_params_m0 = minimize(
    log_likelhood,
    init_vals,
    constraints=[{'type':'eq', 'fun': equality_constraint_1}, {'type':'eq', 'fun': equality_constraint_2}], 
    options={'disp': True, 'maxiter':100}
)

  np.log(poisson.pmf(x_k, lambda_k))
  + np.log(poisson.pmf(y_k, mu_k))


Optimization terminated successfully    (Exit mode 0)
            Current function value: 1155.5755005191922
            Iterations: 6
            Function evaluations: 36
            Gradient evaluations: 6


In [24]:
parameters_m0 = {}
for team in teams:
    parameters_m0[f"attack_home_{team}"] = opt_params_m0.x[0]
    
for team in teams:
    parameters_m0[f"defence_away_{team}"] = opt_params_m0.x[1]

for team in teams:
    parameters_m0[f"defence_home_{team}"] = opt_params_m0.x[2]
    
for team in teams:
    parameters_m0[f"attack_away_{team}"] = opt_params_m0.x[3]

In [25]:
Model_0_ll = opt_params_m0["fun"]
print(f"Model 0 Log-Likelihood: {Model_0_ll}")

Model 0 Log-Likelihood: 1155.5755005191922


# Model 1A: (alpha_i, beta, gamma, alpha_i)

**Every team has 1 unique parameters:**
* Attack  

**2 Global parameter:**
* Defence when away
* Defence at home

In [26]:
init_vals = np.concatenate((
    np.random.uniform(0,1,(n_teams)),  # attack strength
    [-1., -1.]  # defence strength at home, defence strength away
))

In [27]:
def match_log_likelihood(x_k, y_k, alpha_ik, beta_jk, beta_ik, alpha_jk):
    lambda_k = np.exp(alpha_ik + beta_jk)  # Home goals expected
    mu_k = np.exp(alpha_jk + beta_ik)  # Away goals expected
    
    return (
        np.log(poisson.pmf(x_k, lambda_k)) + 
        np.log(poisson.pmf(y_k, mu_k))
    )

In [28]:
def log_likelhood(params):
    score_coefs = dict(zip(teams, params[:n_teams]))
    defend_h_coefs, defend_a_coefs = params[-2], params[-1]
    
    log_like = [match_log_likelihood(
        row.FTHG, 
        row.FTAG, 
        score_coefs[row.HomeTeam], 
        defend_a_coefs,
        defend_h_coefs,
        score_coefs[row.AwayTeam]
    ) 
                for row in dataset.itertuples()]
    
    return -sum(log_like)

In [29]:
def equality_constraint(params):
    return (sum(params[:n_teams]) - n_teams*params[-1])

In [30]:
opt_params_m1a = minimize(
    log_likelhood, 
    init_vals,
    constraints={'type':'eq', 'fun': equality_constraint}, 
    options={'disp': True, 'maxiter':100}
)

  np.log(poisson.pmf(x_k, lambda_k)) +
  np.log(poisson.pmf(y_k, mu_k))


Optimization terminated successfully    (Exit mode 0)
            Current function value: 1080.1266122835514
            Iterations: 29
            Function evaluations: 710
            Gradient evaluations: 29


In [31]:
parameters_m1a = dict(zip(["attack_h_" + team for team in teams],
                        opt_params_m1a.x[:n_teams]))
for team in teams:
    parameters_m1a[f"defence_a_{team}"] = opt_params_m1a.x[-1]
    
for team in teams:
    parameters_m1a[f"defence_h_{team}"] = opt_params_m1a.x[-2]
    
parameters_m1a.update(
    dict(zip(["attack_a_" + team for team in teams], opt_params_m1a.x[:n_teams]))
)

In [32]:
Model_1a_ll = opt_params_m1a["fun"]
print(f"Model 1a Log-Likelihood: {Model_1a_ll}")

Model 1a Log-Likelihood: 1080.1266122835514


# Model 1B: (alpha, beta_i, beta_i, delta)

**Every team has 1 unique parameters:**
* Defence  

**2 Global parameters:**
* Attack when away
* Attack at home

In [33]:
init_vals = np.concatenate((
    np.random.uniform(0,-1,(n_teams)),  # defence strength
    [1., 1.]  # attack strength at home, attack strength away
))

In [34]:
def match_log_likelihood(x_k, y_k, alpha_ik, beta_jk, beta_ik, alpha_jk):
    lambda_k = np.exp(alpha_ik + beta_jk)  # Home goals expected
    mu_k = np.exp(alpha_jk + beta_ik)  # Away goals expected
    
    return (
        np.log(poisson.pmf(x_k, lambda_k)) + 
        np.log(poisson.pmf(y_k, mu_k))
    )

In [35]:
def log_likelhood(params):
    defend_coefs = dict(zip(teams, params[:n_teams]))
    attack_h_coef, attack_a_coef = params[-2], params[-1]
    
    log_like = [match_log_likelihood(
        row.FTHG,
        row.FTAG,
        attack_h_coef, 
        defend_coefs[row.HomeTeam],
        defend_coefs[row.AwayTeam], 
        attack_a_coef
    ) 
                for row in dataset.itertuples()]
    
    return -sum(log_like)

In [36]:
def equality_constraint(params):
    return (sum(params[:n_teams]) - n_teams*params[-2])

In [37]:
opt_params_m1b = minimize(
    log_likelhood,
    init_vals,
    constraints={'type':'eq', 'fun': equality_constraint}, 
    options={'disp': True, 'maxiter':100}
)

  np.log(poisson.pmf(x_k, lambda_k)) +
  np.log(poisson.pmf(y_k, mu_k))


Optimization terminated successfully    (Exit mode 0)
            Current function value: 1080.1266122721522
            Iterations: 31
            Function evaluations: 755
            Gradient evaluations: 31


In [38]:
parameters_m1b = {}
for team in teams:
    parameters_m1b[f"attack_home_{team}"] = opt_params_m1b.x[-2]

parameters_m1b.update(
    dict(zip(["defense_away_" + team for team in teams], opt_params_m1b.x[:n_teams]))
)

parameters_m1b.update(
    dict(zip(["defense_home_" + team for team in teams], opt_params_m1b.x[:n_teams]))
)
    
for team in teams:
    parameters_m1b[f"attack_away_{team}"] = opt_params_m1b.x[-1]

In [39]:
Model_1b_ll = opt_params_m1b["fun"]
print(f"Model 1b Log-Likelihood: {Model_1b_ll}")

Model 1b Log-Likelihood: 1080.1266122721522


# Model 3C: (alpha_i, beta_i, gamma_i, alpha_i)

**Every team has 3 unique parameters:**  
* Attack
* Defense at home  
* Defense away

In [40]:
init_vals = np.concatenate((
    np.random.uniform(0,1,(n_teams)),  # attack strength
    np.random.uniform(0,-1,(n_teams)),  # defence home strength
    np.random.uniform(0,-1,(n_teams)),  # defence away strength
))

In [41]:
def match_log_likelihood(x_k, y_k, alpha_ik, beta_jk, beta_ik, alpha_jk):
    lambda_k = np.exp(alpha_ik + beta_jk)
    mu_k = np.exp(alpha_jk + beta_ik)
    
    return (
        np.log(poisson.pmf(x_k, lambda_k)) + 
        np.log(poisson.pmf(y_k, mu_k))
    )

In [42]:
def log_likelhood(params):
    score_coefs = dict(zip(teams, params[:n_teams]))
    defend_h_coefs = dict(zip(teams, params[n_teams:(2*n_teams)]))
    defend_a_coefs = dict(zip(teams, params[(2*n_teams):]))
    
    log_like = [match_log_likelihood(
        row.FTHG,
        row.FTAG,
        score_coefs[row.HomeTeam], 
        defend_a_coefs[row.AwayTeam],
        defend_h_coefs[row.HomeTeam],
        score_coefs[row.AwayTeam]
    )
                for row in dataset.itertuples()]
    
    return -sum(log_like)

In [43]:
def equality_constraint(params):
    return (sum(params[:n_teams]) - params[(2*n_teams):])

In [44]:
opt_params_m3c = minimize(
    log_likelhood,
    init_vals,
    constraints={'type':'eq', 'fun': equality_constraint}, 
    options={'disp': True, 'maxiter':100}
)

  np.log(poisson.pmf(x_k, lambda_k)) +
  np.log(poisson.pmf(y_k, mu_k))


Optimization terminated successfully    (Exit mode 0)
            Current function value: 1061.0918193409052
            Iterations: 60
            Function evaluations: 3734
            Gradient evaluations: 60


In [45]:
parameters_m3c = dict(zip(["attack_home_" + team for team in teams], opt_params_m3c.x[ :n_teams]))
    
parameters_m3c.update(
    dict(zip(["defense_away_" + team for team in teams], opt_params_m3c.x[2*n_teams: ]))
)

parameters_m3c.update(
    dict(zip(["defense_home_" + team for team in teams], opt_params_m3c.x[n_teams: 2*n_teams]))
)
    
for team in teams:
    parameters_m3c[f"attack_away_{team}"] = parameters_m3c[f"attack_home_{team}"]

In [46]:
Model_3c_ll = opt_params_m3c["fun"]
print(f"Model 3c Log-Likelihood: {Model_3c_ll}")

Model 3c Log-Likelihood: 1061.0918193409052


# Model 3D: (alpha_i, beta_i, beta_i, delta_i)

**Every team has 3 unique parameters:**  
* Defense
* Attack at home  
* Attack away

In [47]:
init_vals = np.concatenate((
    np.random.uniform(0,-1,(n_teams)),  # defence strength
    np.random.uniform(0,1,(n_teams)),  # attack home strength
    np.random.uniform(0,1,(n_teams)),  # attack away strength
))

In [48]:
def match_log_likelihood(x_k, y_k, alpha_ik, beta_jk, beta_ik, alpha_jk):
    lambda_k = np.exp(alpha_ik + beta_jk)
    mu_k = np.exp(alpha_jk + beta_ik)
    
    return (
        np.log(poisson.pmf(x_k, lambda_k)) + 
        np.log(poisson.pmf(y_k, mu_k))
    )

In [49]:
def log_likelhood(params):
    defend_coefs = dict(zip(teams, params[:n_teams]))
    attack_h_coefs = dict(zip(teams, params[n_teams:(2*n_teams)]))
    attack_a_coefs = dict(zip(teams, params[(2*n_teams):]))
    
    log_like = [match_log_likelihood(
        row.FTHG,
        row.FTAG,
        attack_h_coefs[row.HomeTeam],
        defend_coefs[row.AwayTeam], 
        defend_coefs[row.HomeTeam],
        attack_a_coefs[row.AwayTeam]
    ) 
                for row in dataset.itertuples()]
    
    return -sum(log_like)

In [50]:
def equality_constraint(params):
    return (sum(params[:n_teams]) - params[n_teams:(2*n_teams)])

In [51]:
opt_params_m3d = minimize(
    log_likelhood,
    init_vals,
    constraints={'type':'eq', 'fun': equality_constraint}, 
    options={'disp': True, 'maxiter':100}
)

  np.log(poisson.pmf(x_k, lambda_k)) +
  np.log(poisson.pmf(y_k, mu_k))


Optimization terminated successfully    (Exit mode 0)
            Current function value: 1084.3992351556574
            Iterations: 58
            Function evaluations: 3597
            Gradient evaluations: 58


In [52]:
parameters_m3d = dict(zip(["attack_home_" + team for team in teams], opt_params_m3d.x[n_teams: 2*n_teams]))
    
parameters_m3d.update(
    dict(zip(["defense_away_" + team for team in teams], opt_params_m3d.x[ : n_teams]))
)

parameters_m3d.update(
    dict(zip(["defense_home_" + team for team in teams], opt_params_m3d.x[ : n_teams]))
)
    
parameters_m3d.update(
    dict(zip(["attack_away_" + team for team in teams], opt_params_m3d.x[2*n_teams: ]))
)

In [53]:
Model_3d_ll = opt_params_m3d["fun"]
print(f"Model 3d Log-Likelihood: {Model_3d_ll}")

Model 3d Log-Likelihood: 1084.3992351556574


# 4 Goodness-of-fit-tests

Likelihood ratio test: difference between models log likelihood's is chi2(k=n-1) distributed

In [73]:
print(f"Model 0 Log-Likelihood: {Model_0_ll}")
print(f"Model 1a Log-Likelihood: {Model_1a_ll}")
print(f"Model 1b Log-Likelihood: {Model_1b_ll}")
print(f"Model 2 Log-Likelihood: {Model_2_ll}")
print(f"Model 3c Log-Likelihood: {Model_3c_ll}")
print(f"Model 3d Log-Likelihood: {Model_3d_ll}")
print(f"Model 4 Log-Likelihood: {Model_4_ll}")

Model 0 Log-Likelihood: 1155.5755005191922
Model 1a Log-Likelihood: 1080.1266122835514
Model 1b Log-Likelihood: 1080.1266122721522
Model 2 Log-Likelihood: 1052.3376825829237
Model 3c Log-Likelihood: 1061.0918193409052
Model 3d Log-Likelihood: 1084.3992351556574
Model 4 Log-Likelihood: 1032.9658439461423


In [54]:
ll_increase_4_3d = Model_4_ll - Model_3d_ll
ll_increase_4_3c = Model_4_ll - Model_3c_ll
ll_increase_3d_2 = Model_3d_ll - Model_2_ll
ll_increase_3c_2 = Model_3c_ll - Model_2_ll
ll_increase_2_1b = Model_2_ll - Model_1b_ll
ll_increase_2_1a = Model_2_ll - Model_1a_ll
ll_increase_1b_0 = Model_1b_ll - Model_0_ll
ll_increase_1a_0 = Model_1a_ll - Model_0_ll

In [55]:
scipy.stats.chi2.ppf(0.95, n_teams-1)

30.14352720564616

In [56]:
scipy.stats.chi2.ppf(0.99, n_teams-1)

36.19086912927004

Model 2 is judged most appropriate and carried forward for chi-squared goodness-of-fit analysis

## Table 4.    Observed and expected frequencies of home and away scores

In [86]:
table_4_df = pd.DataFrame({"number_of_goals": [0, 1, 2, 3, ">=4"], 
                           "home_obs": [sum(dataset["FTHG"]==0), sum(dataset["FTHG"]==1), sum(dataset["FTHG"]==2), sum(dataset["FTHG"]==3), sum(dataset["FTHG"]>=4)], 
                           "home_exp": 0, 
                           "away_obs": [sum(dataset["FTAG"]==0), sum(dataset["FTAG"]==1), sum(dataset["FTAG"]==2), sum(dataset["FTAG"]==3), sum(dataset["FTAG"]>=4)], 
                           "away_exp": 0})

In [87]:
def calc_goals_pr(param_dict, homeTeam, awayTeam):
    means = [np.exp(param_dict['attack_'+homeTeam] + param_dict['defence_'+awayTeam] + param_dict['home_adv']),
     np.exp(param_dict['defence_'+homeTeam] + param_dict['attack_'+awayTeam])]
    
    team_pred = [[poisson.pmf(i, team_avg) for i in range(4)] for team_avg in means]
    
    for i in range(len(means)):
        team_pred[i].append(1 - poisson.cdf(3, means[i]))
    
    return np.array(team_pred)

In [88]:
sum_array = np.zeros((2, 5))
for row in dataset.itertuples():
    sum_array += calc_goals_pr(parameters_m2, row.HomeTeam, row.AwayTeam)

In [89]:
table_4_df["home_exp"] = sum_array[0]
table_4_df["away_exp"] = sum_array[1]

In [90]:
table_4_df

Unnamed: 0,number_of_goals,home_obs,home_exp,away_obs,away_exp
0,0,90,100.402215,136,135.934783
1,1,126,118.079646,127,126.427138
2,2,91,81.412585,65,69.666592
3,3,35,44.021443,33,30.344685
4,>=4,38,36.08411,19,17.626802


### Chi-Squared goodness of fit test

In [91]:
table_5_df = table_4_df.copy()
table_5_df["home_t_stat"] = (table_5_df["home_obs"] - table_5_df["home_exp"])**2/table_5_df["home_exp"]
table_5_df["away_t_stat"] = (table_5_df["away_obs"] - table_5_df["away_exp"])**2/table_5_df["away_exp"]

In [92]:
home_t_stat = table_5_df["home_t_stat"].sum()
away_t_stat = table_5_df["away_t_stat"].sum()

In [93]:
home_p_val = 1-scipy.stats.chi2.cdf(home_t_stat, len(table_4_df) - 2)
away_p_val = 1-scipy.stats.chi2.cdf(away_t_stat, len(table_4_df) - 2)

Both p-values are > 0.05, so cannot reject that the observed goals come from an independent Poisson distribution at the 5% level

# 5 A bivariate Poisson model

## Table 6. 
**Observed and estimated frequencies for Z, the difference in the teams' scores, for (i) the independent Poisson model and (ii) the bivariate Possion with Q = 0.2**

In [165]:
dataset_1 = dataset.copy()
dataset_1["Z"] = dataset_1["FTHG"] - dataset_1["FTAG"]

In [166]:
table_6_df = pd.DataFrame({
    "Z": ["<=-3", -2, -1, 0, 1, 2, 3, 4,">=5"], 
    "observed": [sum(dataset_1["Z"]<=-3), sum(dataset_1["Z"]==-2), 
                 sum(dataset_1["Z"]==-1), sum(dataset_1["Z"]==0), 
                 sum(dataset_1["Z"]==1), sum(dataset_1["Z"]==2), 
                 sum(dataset_1["Z"]==3), sum(dataset_1["Z"]==4), 
                 sum(dataset_1["Z"]>=5)]
})

In [167]:
def calc_goals_pr_dict(param_dict, homeTeam, awayTeam, q=0):
    means = [np.exp(param_dict['attack_'+homeTeam] + param_dict['defence_'+awayTeam] + param_dict['home_adv']),
             np.exp(param_dict['defence_'+homeTeam] + param_dict['attack_'+awayTeam])]
    
    team_pred = []
    for mean in means:
        dicti = {}
        for i in range(0, 15):
            correction = q*np.sqrt(means[0]*means[1])
            expectation = mean - correction
            dicti[i] = poisson.pmf(i, expectation)
        
        team_pred.append(dicti)

    return team_pred

In [168]:
def get_goal_diff_pr(team_pred):
    out = {}
    for k_h, v_h in team_pred[0].items():
        for k_a, v_a in team_pred[1].items():
            diff = k_h - k_a
            if (diff <= -3):
                diff = "<=-3"
            elif (diff >= 5):
                diff = ">=5"
            
            if (out.get(diff, 0) == 0):
                out[diff] = v_h*v_a
            else:
                out[diff] += v_h*v_a
    return out

In [169]:
def add_to_dict(dic, dic_to_add):
    for k_b, v_b in dic_to_add.items():
        if dic.get(k_b, 0) == 0:
            dic[k_b] = v_b
        else:
            dic[k_b] += v_b

In [176]:
q0_sum_dict = {}
q200_sum_dict = {}
for row in dataset_1.itertuples():
    goals_dict_q0 = calc_goals_pr_dict(parameters_m2, row.HomeTeam, row.AwayTeam)
    goal_diff_dict_q0 = get_goal_diff_pr(goals_dict_q0)
    add_to_dict(q0_sum_dict, goal_diff_dict_q0)
    
    goals_dict_q200 = calc_goals_pr_dict(parameters_m2, row.HomeTeam, row.AwayTeam, 0.2)
    goal_diff_dict_q200 = get_goal_diff_pr(goals_dict_q200)
    add_to_dict(q200_sum_dict, goal_diff_dict_q200)

In [177]:
estimated_q0 = []
estimated_q200 = []
for row in table_6_df.itertuples():
    estimated_q0.append(q0_sum_dict[row.Z])
    estimated_q200.append(q200_sum_dict[row.Z])
    
table_6_df["estimated_q0"] = estimated_q0
table_6_df["estimated_q200"] = estimated_q200

In [178]:
table_6_df

Unnamed: 0,Z,observed,estimated_q0,estimated_q200,t_stat_q0,t_stat_q200
0,<=-3,29,23.452338,19.068378,1.312302,8.589438
1,-2,21,32.830931,29.94563,4.263386,1.811768
2,-1,58,62.214914,62.709093,0.285551,0.297543
3,0,99,87.917985,97.847063,1.396882,0.279494
4,1,83,76.100519,79.740847,0.625526,0.032135
5,2,41,48.852503,47.553285,1.262204,0.655718
6,3,27,26.336168,24.1489,0.016733,0.726032
7,4,12,12.74111,11.15417,0.043108,0.262223
8,>=5,10,9.5534,7.832582,0.020878,1.249643


In [179]:
table_6_df["t_stat_q0"] = (table_6_df["observed"] - table_6_df["estimated_q0"])**2/table_6_df["estimated_q0"]
table_6_df["t_stat_q200"] = (table_6_df["observed"] - table_6_df["estimated_q200"])**2/table_6_df["estimated_q200"]
                                                            
t_stat_q0 = table_6_df["t_stat_q0"].sum()
t_stat_q200 = table_6_df["t_stat_q200"].sum()

q0_p_val = 1-scipy.stats.chi2.cdf(t_stat_q0,  len(table_6_df)-3)
q200_p_val = 1-scipy.stats.chi2.cdf(t_stat_q200,  len(table_6_df)-3)

In [181]:
q200_p_val

0.11454366408069061

P-value for the bivariate case (q = 0.2) actually has a lower p-value than the q=0 independent model in this sample.  
Q=0.1 provides a better fit