In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from scipy.stats import poisson, nbinom
import glob
import scipy

In [2]:
# Locate every "football-data*.csv" file in the data directory. Forward slashes
# are accepted by glob on Windows as well as on macOS/Linux, whereas a
# backslash-separated pattern only matches on Windows.
path = "../data"
filenames = sorted(glob.glob(path + "/football-data*.csv"))  # sorted => reproducible row order
if not filenames:
    # Fail with a clear message instead of pandas' "No objects to concatenate".
    raise FileNotFoundError(f"No files matching 'football-data*.csv' found in {path!r}")

# Read each CSV and stack them into a single frame.
dfs = [pd.read_csv(filename) for filename in filenames]
df = pd.concat(dfs, ignore_index=True)

# Discard rows with a missing date, team name or full-time goal count.
df = df.dropna(subset=["Date", "HomeTeam", "AwayTeam", "FTHG", "FTAG"])

# Keep only the two goal columns and renumber the rows 0..n-1. The explicit copy
# means the new column below is written to an independent frame, not a slice.
df = df[["FTHG", "FTAG"]].reset_index(drop=True).copy()
df["FTTG"] = df["FTHG"] + df["FTAG"]  # total goals per match
df

Unnamed: 0,FTHG,FTAG,FTTG
0,2.0,3.0,5.0
1,0.0,3.0,3.0
2,1.0,0.0,1.0
3,2.0,0.0,2.0
4,3.0,2.0,5.0
...,...,...,...
10247,2.0,0.0,2.0
10248,5.0,0.0,5.0
10249,1.0,0.0,1.0
10250,3.0,0.0,3.0


# Model 1
Total goals per match taken as trials from a single Poisson distribution with mean = df["FTTG"].mean()

In [3]:
# Sample mean of total goals per match; Model 1 uses it as the Poisson mean.
av_TG = df.loc[:, "FTTG"].mean()
av_TG

2.6510924697619975

In [4]:
# Goal counts 0..max_goals each get their own bin; every higher count is pooled
# into a single open-ended tail bin.
max_goals = 7
tail_start = max_goals + 1
n_matches = len(df["FTTG"])

# Bin labels. The tail label is derived from max_goals so it cannot drift out of
# sync with the binning if max_goals is changed.
goals = list(range(tail_start)) + [f">= {tail_start}"]

# Observed number of matches falling in each bin.
observed_frequency = [int((df["FTTG"] == i).sum()) for i in range(tail_start)]
observed_frequency.append(int((df["FTTG"] >= tail_start).sum()))

# Expected number of matches in each bin under Poisson(av_TG).
# poisson.sf(k, mu) is P(X > k); it equals 1 - cdf(k) but avoids the loss of
# precision from subtracting two nearly equal numbers.
expected_frequency = (poisson.pmf(np.arange(tail_start), av_TG) * n_matches).tolist()
expected_frequency.append(poisson.sf(max_goals, av_TG) * n_matches)

table_df = pd.DataFrame({"Goals_per_match": goals, "Observed_Frequency": observed_frequency,
                        "Poisson_Frequency": expected_frequency})
# Per-bin contribution to Pearson's chi-squared statistic: (observed - expected)^2 / expected.
table_df["T_Stat"] = (table_df["Observed_Frequency"] - table_df["Poisson_Frequency"])**2 / table_df["Poisson_Frequency"]
table_df

Unnamed: 0,Goals_per_match,Observed_Frequency,Poisson_Frequency,T_Stat
0,0,777,723.525375,3.952226
1,1,1844,1918.132673,2.865106
2,2,2500,2542.573542,0.712863
3,3,2321,2246.865857,2.446017
4,4,1508,1489.162289,0.238295
5,5,771,789.581386,0.43728
6,6,328,348.875544,1.249123
7,7,138,132.128761,0.260893
8,>= 8,65,61.154572,0.241802


In [5]:
# Pearson chi-squared goodness-of-fit test for the Poisson model.
t_stat = table_df["T_Stat"].sum()
k = len(table_df)  # Number of categories
# Number of parameters estimated from the data (the Poisson mean). Deliberately
# not called `p`, which the negative binomial cells use for a probability.
n_params = 1

# Degrees of freedom: k - n_params - 1. chi2.sf is the upper-tail probability,
# i.e. 1 - cdf, computed without the cancellation error of the subtraction.
p_value = scipy.stats.chi2.sf(t_stat, k - n_params - 1)
p_value

0.08804323540261771

The p-value is > 0.05, so at the 5% level we cannot reject the hypothesis that the observed goals come from a Poisson distribution.

# Model 2
Modifying the Poisson distribution to account for the expectation varying from trial to trial - Negative binomial

In [6]:
# Variance of total goals per match, with numpy's default divisor (ddof=0).
var_TG = np.var(df.loc[:, "FTTG"])
var_TG

2.668060648101703

In [7]:
# Method-of-moments fit of scipy's nbinom(n, p), whose mean is n(1-p)/p and
# whose variance is n(1-p)/p**2. Solving for the sample mean and variance gives
# p = mean / variance and n = mean**2 / (variance - mean).
# This only yields valid parameters (n > 0, 0 < p < 1) when the data are
# over-dispersed; otherwise every downstream frequency would silently be NaN.
if var_TG <= av_TG:
    raise ValueError(
        "Negative binomial fit requires variance > mean "
        f"(got variance={var_TG}, mean={av_TG})"
    )

p = av_TG / var_TG
n = av_TG**2 / (var_TG - av_TG)

In [8]:
# Expected number of matches in each bin under the fitted negative binomial,
# using the same bins as the Poisson table (0..max_goals plus one tail bin).
n_matches = len(df["FTTG"])
expected_frequency = (nbinom.pmf(np.arange(max_goals + 1), n, p) * n_matches).tolist()
# nbinom.sf(k, n, p) is P(X > k); it equals 1 - cdf(k) without the
# cancellation error of the explicit subtraction.
expected_frequency.append(nbinom.sf(max_goals, n, p) * n_matches)

table_df["NBinomial_Frequency"] = expected_frequency
# Per-bin contribution to Pearson's chi-squared statistic for the negative binomial model.
table_df["T_Stat2"] = (table_df["Observed_Frequency"] - table_df["NBinomial_Frequency"])**2 / table_df["NBinomial_Frequency"]
table_df

Unnamed: 0,Goals_per_match,Observed_Frequency,Poisson_Frequency,T_Stat,NBinomial_Frequency,T_Stat2
0,0,777,723.525375,3.952226,729.663653,3.070908
1,1,1844,1918.132673,2.865106,1922.103493,3.173687
2,2,2500,2542.573542,0.712863,2537.745502,0.561413
3,3,2321,2246.865857,2.446017,2239.096582,2.995927
4,4,1508,1489.162289,0.238295,1485.255139,0.34831
5,5,771,789.581386,0.43728,790.058056,0.459725
6,6,328,348.875544,1.249123,351.053201,1.513873
7,7,138,132.128761,0.260893,134.021613,0.118097
8,>= 8,65,61.154572,0.241802,63.002761,0.063314


In [9]:
# Pearson chi-squared goodness-of-fit test for the negative binomial model.
t_stat = table_df["T_Stat2"].sum()
# Number of parameters estimated from the data (n and p of the negative
# binomial). Stored under its own name so the fitted probability `p` is not
# overwritten; otherwise re-running the frequency cell would use p = 2.
n_params = 2

# Degrees of freedom: k - n_params - 1, where k is the number of categories.
# chi2.sf is the upper-tail probability (1 - cdf) without cancellation error.
p_value = scipy.stats.chi2.sf(t_stat, k - n_params - 1)
p_value

0.05549530119750545

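The p-value is > 0.05, so at the 5% level we cannot reject the hypothesis that the observed goals come from a Negative Binomial distribution.  
The p-value for this model is actually marginally lower than for the Poisson model. The chi-squared statistic is almost unchanged (about 12.3 versus 12.4), but the second fitted parameter leaves one fewer degree of freedom, so the Negative Binomial model is not an improvement over the Poisson model.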