In [1]:
package_paths = [r"C:\Users\benja\Documents\projects\goalscorers"]
import sys
for path in package_paths:
    sys.path.append(path)
from goalscorer_package.constants import *
import goalscorer_package.data_cleaning as dc
import goalscorer_package.modelling as m
import datetime as dt
import pandas as pd
import numpy as np
import pymc as pm
print(f"Running on PyMC v{pm.__version__}")
import arviz as az
print(f"Running on Az v{az.__version__}")
import pytensor.tensor as pt
import matplotlib.pyplot as plt
import seaborn as sns
import scipy
import glob
from statistics import mode
import pickle
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", 1000)
pd.set_option("display.max_colwidth", 100)
pd.options.display.float_format = "{: ,.3f}".format
%matplotlib inline
plt.rcParams["figure.figsize"] = (10, 6)
%config InlineBackend.figure_formats = ["retina"]

Running on PyMC v5.6.1
Running on Az v0.16.0


# Data

In [2]:
seasons_leagues = [    
    # xG season leagues
    ## Top 5 and championship
    SeasonLeague(SEASON_22_23, ENGLISH_PREMIER_LEAGUE, xg_league_bool=True),
    SeasonLeague(SEASON_22_23, ENGLISH_CHAMPIONSHIP, xg_league_bool=True),
    SeasonLeague(SEASON_22_23, ITALIAN_SERIE_A, xg_league_bool=True),
    SeasonLeague(SEASON_22_23, SPANISH_LA_LIGA, xg_league_bool=True),
    SeasonLeague(SEASON_22_23, FRENCH_LIGUE_1, xg_league_bool=True),
    SeasonLeague(SEASON_22_23, GERMAN_BUNDESLIGA, xg_league_bool=True),
    SeasonLeague(SEASON_21_22, ENGLISH_PREMIER_LEAGUE, xg_league_bool=True),
    SeasonLeague(SEASON_21_22, ENGLISH_CHAMPIONSHIP, xg_league_bool=True),
    SeasonLeague(SEASON_21_22, ITALIAN_SERIE_A, xg_league_bool=True),
    SeasonLeague(SEASON_21_22, SPANISH_LA_LIGA, xg_league_bool=True),
    SeasonLeague(SEASON_21_22, FRENCH_LIGUE_1, xg_league_bool=True),
    SeasonLeague(SEASON_21_22, GERMAN_BUNDESLIGA, xg_league_bool=True),
    SeasonLeague(SEASON_20_21, ENGLISH_PREMIER_LEAGUE, xg_league_bool=True),
    SeasonLeague(SEASON_20_21, ENGLISH_CHAMPIONSHIP, xg_league_bool=True),
    SeasonLeague(SEASON_20_21, ITALIAN_SERIE_A, xg_league_bool=True),
    SeasonLeague(SEASON_20_21, SPANISH_LA_LIGA, xg_league_bool=True),
    SeasonLeague(SEASON_20_21, FRENCH_LIGUE_1, xg_league_bool=True),
    SeasonLeague(SEASON_20_21, GERMAN_BUNDESLIGA, xg_league_bool=True),
    SeasonLeague(SEASON_19_20, ENGLISH_PREMIER_LEAGUE, xg_league_bool=True),
    SeasonLeague(SEASON_19_20, ENGLISH_CHAMPIONSHIP, xg_league_bool=True),
    SeasonLeague(SEASON_19_20, ITALIAN_SERIE_A, xg_league_bool=True),
    SeasonLeague(SEASON_19_20, SPANISH_LA_LIGA, xg_league_bool=True),
    SeasonLeague(SEASON_19_20, FRENCH_LIGUE_1, xg_league_bool=True),
    SeasonLeague(SEASON_19_20, GERMAN_BUNDESLIGA, xg_league_bool=True),
    SeasonLeague(SEASON_18_19, ENGLISH_PREMIER_LEAGUE, xg_league_bool=True),
    SeasonLeague(SEASON_18_19, ENGLISH_CHAMPIONSHIP, xg_league_bool=True),
    SeasonLeague(SEASON_18_19, ITALIAN_SERIE_A, xg_league_bool=True),
    SeasonLeague(SEASON_18_19, SPANISH_LA_LIGA, xg_league_bool=True),
    SeasonLeague(SEASON_18_19, FRENCH_LIGUE_1, xg_league_bool=True),
    SeasonLeague(SEASON_18_19, GERMAN_BUNDESLIGA, xg_league_bool=True),
    SeasonLeague(SEASON_17_18, ENGLISH_PREMIER_LEAGUE, xg_league_bool=True),
    SeasonLeague(SEASON_17_18, ITALIAN_SERIE_A, xg_league_bool=True),
    SeasonLeague(SEASON_17_18, SPANISH_LA_LIGA, xg_league_bool=True),
    # SeasonLeague(SEASON_17_18, FRENCH_LIGUE_1, xg_league_bool=True),
    SeasonLeague(SEASON_17_18, GERMAN_BUNDESLIGA, xg_league_bool=True),
    ## Lesser leagues
    SeasonLeague(SEASON_22_23, PORTUGUESE_PREMIERA_LIGA, xg_league_bool=True),
    SeasonLeague(SEASON_22_23, DUTCH_ERIDIVISIE_LIGA, xg_league_bool=True),
    SeasonLeague(SEASON_22_23, MEXICAN_LIGA_MX, xg_league_bool=True),
    SeasonLeague(SEASON_21_22, PORTUGUESE_PREMIERA_LIGA, xg_league_bool=True),
    SeasonLeague(SEASON_21_22, DUTCH_ERIDIVISIE_LIGA, xg_league_bool=True),
    SeasonLeague(SEASON_21_22, MEXICAN_LIGA_MX, xg_league_bool=True),
    SeasonLeague(SEASON_20_21, PORTUGUESE_PREMIERA_LIGA, xg_league_bool=True),
    SeasonLeague(SEASON_20_21, DUTCH_ERIDIVISIE_LIGA, xg_league_bool=True),
    SeasonLeague(SEASON_20_21, MEXICAN_LIGA_MX, xg_league_bool=True),
    SeasonLeague(SEASON_19_20, PORTUGUESE_PREMIERA_LIGA, xg_league_bool=True),
    SeasonLeague(SEASON_19_20, DUTCH_ERIDIVISIE_LIGA, xg_league_bool=True),
    SeasonLeague(SEASON_19_20, MEXICAN_LIGA_MX, xg_league_bool=True),
    SeasonLeague(SEASON_18_19, PORTUGUESE_PREMIERA_LIGA, xg_league_bool=True),
    SeasonLeague(SEASON_18_19, DUTCH_ERIDIVISIE_LIGA, xg_league_bool=True),
    SeasonLeague(SEASON_18_19, MEXICAN_LIGA_MX, xg_league_bool=True),
]

In [3]:
def data(seasons_leagues: list[SeasonLeague]) -> pd.DataFrame:
    df = dc.load_seasons_leagues_files("summary", True, seasons_leagues)
    df = dc.add_npg(df)
    return df

In [5]:
df = data(seasons_leagues)

# Maths

In [29]:
goals = [0, 1, 2, 3, 4, 5]

In [33]:
# exp
df_pred_npg = pd.DataFrame(
    np.array([scipy.stats.poisson.pmf(goal, mu=df.npxg.values) for goal in goals]).T, 
    columns=goals
)

In [35]:
# obs
df_npg = pd.get_dummies(df.npg)

In [60]:
df_table = (
    pd.DataFrame({"goals": goals, "obs_mean": df_npg.mean(), "exp_mean": df_pred_npg.mean(),
                  "obs": df_npg.sum(), "exp": df_pred_npg.sum()})
    .assign(
        t_stat=lambda x: (x.obs - x.exp) ** 2 / x.exp,
    )
)
df_table.iloc[3] = df_table.iloc[3] + df_table.iloc[4] + df_table.iloc[5]
df_table.loc[3, "goals"] = 3
df_table = df_table.iloc[:4]

t_stat = df_table.t_stat.sum()
p_val = 1 - scipy.stats.chi2.cdf(t_stat, len(df_table) - 1)

In [61]:
print(f"{p_val: .5f}")
df_table

 0.00000


Unnamed: 0,goals,obs_mean,exp_mean,obs,exp,t_stat
0,0,0.925,0.932,481551,484398.975,16.744
1,1,0.069,0.055,35760,28432.639,1888.331
2,2,0.006,0.01,3081,5389.053,988.506
3,3,0.001,0.003,314,1418.043,866.921
