In [15]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from scipy.stats import poisson, nbinom
import glob
import scipy

In [2]:
path =r'..\\data'
filenames = glob.glob(path + "\\football-data*.csv")

dfs = []
for filename in filenames:
    dfs.append(pd.read_csv(filename))

df = pd.concat(dfs)

df.dropna(subset=["Date", "HomeTeam", "AwayTeam", "FTHG", "FTAG"], inplace=True)

df.reset_index(inplace=True)
df = df[["FTHG", "FTAG"]]
df["FTTG"] = df["FTHG"] + df["FTAG"]
df

Unnamed: 0,FTHG,FTAG,FTTG
0,2.0,3.0,5.0
1,0.0,3.0,3.0
2,1.0,0.0,1.0
3,2.0,0.0,2.0
4,3.0,2.0,5.0
...,...,...,...
10247,2.0,0.0,2.0
10248,5.0,0.0,5.0
10249,1.0,0.0,1.0
10250,3.0,0.0,3.0


# Model 1
Total goals per match taken as trials from a single Poisson distribution with mean = df["FTTG"].mean()

In [8]:
avTG = df["FTTG"].mean()
avTG

2.6510924697619975

In [4]:
goals = []
observedFrequency = []
expectedFrequency = []
for i in range(11):
    goals.append(i)
    observedFrequency.append(sum(df["FTTG"]==i))
    expectedFrequency.append(poisson.pmf(i, avTG)*len(df["FTTG"]))    
    
tableDf = pd.DataFrame({"Goals_per_match": goals, "Observed_Frequency": observedFrequency, "Poisson_Frequency": expectedFrequency})
tableDf

Unnamed: 0,Goals_per_match,Observed_Frequency,Poisson_Frequency
0,0,777,723.525375
1,1,1844,1918.132673
2,2,2500,2542.573542
3,3,2321,2246.865857
4,4,1508,1489.162289
5,5,771,789.581386
6,6,328,348.875544
7,7,138,132.128761
8,8,41,43.785695
9,9,17,12.89777


In [7]:
tableDf["T_Stat"] = (tableDf["Observed_Frequency"] - tableDf["Poisson_Frequency"])**2 / tableDf["Poisson_Frequency"]
tableDf

Unnamed: 0,Goals_per_match,Observed_Frequency,Poisson_Frequency,T_Stat
0,0,777,723.525375,3.952226
1,1,1844,1918.132673,2.865106
2,2,2500,2542.573542,0.712863
3,3,2321,2246.865857,2.446017
4,4,1508,1489.162289,0.238295
5,5,771,789.581386,0.43728
6,6,328,348.875544,1.249123
7,7,138,132.128761,0.260893
8,8,41,43.785695,0.177229
9,9,17,12.89777,1.304744


In [6]:
tStat = tableDf["T_Stat"].sum()
pValue = 1 - scipy.stats.chi2.cdf(tStat, len(tableDf) - 1)
pValue

0.06609909163260375

P-value is > 0.05, so cannot reject that the observed goals come from a Poisson distribution at the 5% level

# Model 2
Modifying the Poisson distribution to account for the expectation varying from trial to trial - Negative binomial

In [11]:
varTG = np.var(df["FTTG"])

In [21]:
p = avTG/varTG
n = avTG**2 / (varTG - avTG)

In [22]:
expectedFrequency = []
for i in range(11):
    expectedFrequency.append(nbinom.pmf(i, n, p)*len(df["FTTG"]))
    
tableDf["NBinomial_Frequency"] = expectedFrequency
tableDf

Unnamed: 0,Goals_per_match,Observed_Frequency,Poisson_Frequency,T_Stat,NBinomial_Frequency
0,0,777,723.525375,3.952226,729.663653
1,1,1844,1918.132673,2.865106,1922.103493
2,2,2500,2542.573542,0.712863,2537.745502
3,3,2321,2246.865857,2.446017,2239.096582
4,4,1508,1489.162289,0.238295,1485.255139
5,5,771,789.581386,0.43728,790.058056
6,6,328,348.875544,1.249123,351.053201
7,7,138,132.128761,0.260893,134.021613
8,8,41,43.785695,0.177229,44.876306
9,9,17,12.89777,1.304744,13.388647


In [23]:
tableDf["T_Stat2"] = (tableDf["Observed_Frequency"] - tableDf["NBinomial_Frequency"])**2 / tableDf["NBinomial_Frequency"]
tableDf

Unnamed: 0,Goals_per_match,Observed_Frequency,Poisson_Frequency,T_Stat,NBinomial_Frequency,T_Stat2
0,0,777,723.525375,3.952226,729.663653,3.070908
1,1,1844,1918.132673,2.865106,1922.103493,3.173687
2,2,2500,2542.573542,0.712863,2537.745502,0.561413
3,3,2321,2246.865857,2.446017,2239.096582,2.995927
4,4,1508,1489.162289,0.238295,1485.255139,0.34831
5,5,771,789.581386,0.43728,790.058056,0.459725
6,6,328,348.875544,1.249123,351.053201,1.513873
7,7,138,132.128761,0.260893,134.021613,0.118097
8,8,41,43.785695,0.177229,44.876306,0.334826
9,9,17,12.89777,1.304744,13.388647,0.974099


In [24]:
tStat = tableDf["T_Stat2"].sum()
pValue = 1 - scipy.stats.chi2.cdf(tStat, len(tableDf) - 1)
pValue

0.08002977080844587

P-value is > 0.05, so cannot reject that the observed goals come from a Negative Binomial distribution at the 5% level  
P-value for this model is also marginally higher than for the Poisson model, suggesting a better fit