In [1]:
# Directories appended to sys.path so that `goalscorer_package` can be imported below.
# NOTE(review): this is an absolute, machine-specific path; the notebook will not run
# elsewhere unless it is edited or the package is installed into the environment.
package_paths = [r"C:\Users\benja\Documents\projects\goalscorers"]
import sys
for path in package_paths:
    sys.path.append(path)
from goalscorer_package.constants import *
import goalscorer_package.data_cleaning as dc
import goalscorer_package.modelling as m
import datetime as dt
import pandas as pd
import numpy as np
import pymc as pm
print(f"Running on PyMC v{pm.__version__}")
import arviz as az
print(f"Running on Az v{az.__version__}")
import pytensor.tensor as pt
import matplotlib.pyplot as plt
import seaborn as sns
import scipy
import glob
from statistics import mode
import pickle
# pandas display configuration for the whole notebook.
pd.set_option("display.max_columns", None)  # never elide columns
pd.set_option("display.max_rows", 1000)  # frames longer than this are shown truncated
pd.set_option("display.max_colwidth", 100)
# Floats render with a leading space for positives, thousands separators and 3 decimals.
pd.options.display.float_format = "{: ,.3f}".format
%matplotlib inline
plt.rcParams["figure.figsize"] = (10, 6)
%config InlineBackend.figure_formats = ["retina"]

Running on PyMC v5.6.1
Running on Az v0.16.0


# Data

In [2]:
# Every (season, league) combination passed to the data-loading step.
# All entries are flagged xg_league_bool=True.
seasons_leagues = [    
    # Leagues/seasons flagged as xG leagues
    ## Top 5 European leagues and the English Championship, 2022-23 back to 2017-18
    ## (the 2017-18 block lists no Championship entry)
    SeasonLeague(SEASON_22_23, ENGLISH_PREMIER_LEAGUE, xg_league_bool=True),
    SeasonLeague(SEASON_22_23, ENGLISH_CHAMPIONSHIP, xg_league_bool=True),
    SeasonLeague(SEASON_22_23, ITALIAN_SERIE_A, xg_league_bool=True),
    SeasonLeague(SEASON_22_23, SPANISH_LA_LIGA, xg_league_bool=True),
    SeasonLeague(SEASON_22_23, FRENCH_LIGUE_1, xg_league_bool=True),
    SeasonLeague(SEASON_22_23, GERMAN_BUNDESLIGA, xg_league_bool=True),
    SeasonLeague(SEASON_21_22, ENGLISH_PREMIER_LEAGUE, xg_league_bool=True),
    SeasonLeague(SEASON_21_22, ENGLISH_CHAMPIONSHIP, xg_league_bool=True),
    SeasonLeague(SEASON_21_22, ITALIAN_SERIE_A, xg_league_bool=True),
    SeasonLeague(SEASON_21_22, SPANISH_LA_LIGA, xg_league_bool=True),
    SeasonLeague(SEASON_21_22, FRENCH_LIGUE_1, xg_league_bool=True),
    SeasonLeague(SEASON_21_22, GERMAN_BUNDESLIGA, xg_league_bool=True),
    SeasonLeague(SEASON_20_21, ENGLISH_PREMIER_LEAGUE, xg_league_bool=True),
    SeasonLeague(SEASON_20_21, ENGLISH_CHAMPIONSHIP, xg_league_bool=True),
    SeasonLeague(SEASON_20_21, ITALIAN_SERIE_A, xg_league_bool=True),
    SeasonLeague(SEASON_20_21, SPANISH_LA_LIGA, xg_league_bool=True),
    SeasonLeague(SEASON_20_21, FRENCH_LIGUE_1, xg_league_bool=True),
    SeasonLeague(SEASON_20_21, GERMAN_BUNDESLIGA, xg_league_bool=True),
    SeasonLeague(SEASON_19_20, ENGLISH_PREMIER_LEAGUE, xg_league_bool=True),
    SeasonLeague(SEASON_19_20, ENGLISH_CHAMPIONSHIP, xg_league_bool=True),
    SeasonLeague(SEASON_19_20, ITALIAN_SERIE_A, xg_league_bool=True),
    SeasonLeague(SEASON_19_20, SPANISH_LA_LIGA, xg_league_bool=True),
    SeasonLeague(SEASON_19_20, FRENCH_LIGUE_1, xg_league_bool=True),
    SeasonLeague(SEASON_19_20, GERMAN_BUNDESLIGA, xg_league_bool=True),
    SeasonLeague(SEASON_18_19, ENGLISH_PREMIER_LEAGUE, xg_league_bool=True),
    SeasonLeague(SEASON_18_19, ENGLISH_CHAMPIONSHIP, xg_league_bool=True),
    SeasonLeague(SEASON_18_19, ITALIAN_SERIE_A, xg_league_bool=True),
    SeasonLeague(SEASON_18_19, SPANISH_LA_LIGA, xg_league_bool=True),
    SeasonLeague(SEASON_18_19, FRENCH_LIGUE_1, xg_league_bool=True),
    SeasonLeague(SEASON_18_19, GERMAN_BUNDESLIGA, xg_league_bool=True),
    SeasonLeague(SEASON_17_18, ENGLISH_PREMIER_LEAGUE, xg_league_bool=True),
    SeasonLeague(SEASON_17_18, ITALIAN_SERIE_A, xg_league_bool=True),
    SeasonLeague(SEASON_17_18, SPANISH_LA_LIGA, xg_league_bool=True),
    # NOTE(review): Ligue 1 2017-18 is deliberately disabled below; the reason is not
    # recorded here — document it or remove the line.
    # SeasonLeague(SEASON_17_18, FRENCH_LIGUE_1, xg_league_bool=True),
    SeasonLeague(SEASON_17_18, GERMAN_BUNDESLIGA, xg_league_bool=True),
    ## Lesser leagues (Portugal, Netherlands, Mexico), 2022-23 back to 2018-19
    SeasonLeague(SEASON_22_23, PORTUGUESE_PREMIERA_LIGA, xg_league_bool=True),
    SeasonLeague(SEASON_22_23, DUTCH_ERIDIVISIE_LIGA, xg_league_bool=True),
    SeasonLeague(SEASON_22_23, MEXICAN_LIGA_MX, xg_league_bool=True),
    SeasonLeague(SEASON_21_22, PORTUGUESE_PREMIERA_LIGA, xg_league_bool=True),
    SeasonLeague(SEASON_21_22, DUTCH_ERIDIVISIE_LIGA, xg_league_bool=True),
    SeasonLeague(SEASON_21_22, MEXICAN_LIGA_MX, xg_league_bool=True),
    SeasonLeague(SEASON_20_21, PORTUGUESE_PREMIERA_LIGA, xg_league_bool=True),
    SeasonLeague(SEASON_20_21, DUTCH_ERIDIVISIE_LIGA, xg_league_bool=True),
    SeasonLeague(SEASON_20_21, MEXICAN_LIGA_MX, xg_league_bool=True),
    SeasonLeague(SEASON_19_20, PORTUGUESE_PREMIERA_LIGA, xg_league_bool=True),
    SeasonLeague(SEASON_19_20, DUTCH_ERIDIVISIE_LIGA, xg_league_bool=True),
    SeasonLeague(SEASON_19_20, MEXICAN_LIGA_MX, xg_league_bool=True),
    SeasonLeague(SEASON_18_19, PORTUGUESE_PREMIERA_LIGA, xg_league_bool=True),
    SeasonLeague(SEASON_18_19, DUTCH_ERIDIVISIE_LIGA, xg_league_bool=True),
    SeasonLeague(SEASON_18_19, MEXICAN_LIGA_MX, xg_league_bool=True),
]

In [3]:
def data(seasons_leagues: list[SeasonLeague]) -> pd.DataFrame:
    """Load the "summary" files for the given season-leagues and clean them.

    The loaded frame is passed through each `goalscorer_package.data_cleaning`
    helper listed below, in order; each helper receives the previous helper's
    result.

    Args:
        seasons_leagues: Season/league combinations to load.

    Returns:
        The frame returned by the last cleaning helper.
    """
    # Order matters: each step consumes the output of the one before it.
    cleaning_steps = (
        dc.drop_na_npxg,
        dc.add_opp_team,
        dc.split_positions,
        dc.position_to_generic_position,
        dc.drop_gk,
        dc.add_season,
    )
    # NOTE(review): the meaning of the positional `True` flag is defined by
    # dc.load_seasons_leagues_files and is not visible here.
    frame = dc.load_seasons_leagues_files("summary", True, seasons_leagues)
    for step in cleaning_steps:
        frame = step(frame)
    return frame

In [4]:
# Run the load-and-clean pipeline over every season/league in `seasons_leagues`.
df = data(seasons_leagues)

# Maths

$\beta$ measures how far a team's propensity for goals is below average:
$$
\Rightarrow e^{-\beta} = \text{multiplicative impact of team}
$$
$$
\Rightarrow -\beta = \ln(\text{multiplicative impact of team})
$$
$$
\Rightarrow \beta = -\ln(\text{multiplicative impact of team})
$$

In [5]:
# Inspect the cleaned frame; it exceeds display.max_rows, so only the head and tail are shown.
df

Unnamed: 0,home_team,away_team,datetime,shirtnumber,nationality,position,age,minutes,goals,assists,pens_made,pens_att,shots,shots_on_target,cards_yellow,cards_red,touches,tackles,interceptions,blocks,xg,npxg,xg_assist,sca,gca,passes_completed,passes,passes_pct,progressive_passes,carries,progressive_carries,take_ons,take_ons_won,player_id,player,squad,start,season_league,opposition_team,position_0,position_1,position_2,position_3,position_4,complex_position,season
0,Crystal Palace,Arsenal,1659726000,22.000,fr FRA,FW,24-201,57.000,0,0,0,0,3.000,1.000,0,0,20.000,0.000,0.000,0.000,0.200,0.200,0.000,1.000,0.000,3.000,6.000,50.000,0.000,12.000,2.000,2.000,1.000,0562b7f1,Odsonne Édouard,Crystal Palace,True,Premier-League 2022-2023,Arsenal,FW,,,,,FW,2022-2023
1,Crystal Palace,Arsenal,1659726000,14.000,fr FRA,FW,25-038,33.000,0,0,0,0,1.000,0.000,0,0,13.000,0.000,0.000,0.000,0.100,0.100,0.200,0.000,0.000,8.000,10.000,80.000,0.000,8.000,1.000,0.000,0.000,50e6dc35,Jean-Philippe Mateta,Crystal Palace,False,Premier-League 2022-2023,Arsenal,FW,,,,,FW,2022-2023
2,Crystal Palace,Arsenal,1659726000,11.000,ci CIV,W,29-268,90.000,0,0,0,0,1.000,0.000,0,0,50.000,3.000,0.000,0.000,0.100,0.100,0.600,3.000,0.000,30.000,39.000,76.900,2.000,44.000,7.000,5.000,3.000,b2bc3b1f,Wilfried Zaha,Crystal Palace,True,Premier-League 2022-2023,Arsenal,LW,,,,,LW,2022-2023
3,Crystal Palace,Arsenal,1659726000,9.000,gh GHA,W,30-328,90.000,0,0,0,0,1.000,0.000,0,0,43.000,2.000,0.000,0.000,0.100,0.100,0.100,7.000,0.000,27.000,34.000,79.400,0.000,34.000,5.000,9.000,6.000,da052c14,Jordan Ayew,Crystal Palace,True,Premier-League 2022-2023,Arsenal,RW,AM,,,,"RW,AM",2022-2023
4,Crystal Palace,Arsenal,1659726000,10.000,eng ENG,AM,24-037,85.000,0,0,0,0,1.000,1.000,0,0,55.000,1.000,0.000,1.000,0.500,0.500,0.000,1.000,0.000,32.000,41.000,78.000,0.000,34.000,1.000,4.000,1.000,ae4fc6a4,Eberechi Eze,Crystal Palace,True,Premier-League 2022-2023,Arsenal,AM,,,,,AM,2022-2023
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
483622,León,UANL,1558919160,3.000,mx MEX,DM,25-239,14.000,0,0,0,0,0.000,0.000,0,0,4.000,0.000,0.000,1.000,0.000,0.000,0.000,0.000,0.000,0.000,1.000,0.000,0.000,1.000,0.000,0.000,0.000,64c07ae1,Carlos Salcedo,UANL,False,Liga-MX 2018-2019,León,DM,CB,,,,"DM,CB",2018-2019
483623,León,UANL,1558919160,6.000,mx MEX,FB,31-130,90.000,0,0,0,0,0.000,0.000,0,0,37.000,2.000,2.000,1.000,0.000,0.000,0.000,1.000,0.000,18.000,27.000,66.700,3.000,23.000,0.000,1.000,0.000,9920b309,Jorge Torres Nilo,UANL,True,Liga-MX 2018-2019,León,LB,,,,,LB,2018-2019
483624,León,UANL,1558919160,21.000,co COL,CB,27-270,90.000,0,0,0,0,0.000,0.000,0,0,47.000,3.000,3.000,1.000,0.000,0.000,0.000,1.000,0.000,23.000,36.000,63.900,0.000,21.000,0.000,0.000,0.000,05540190,Francisco Meza,UANL,True,Liga-MX 2018-2019,León,CB,CM,,,,"CB,CM",2018-2019
483625,León,UANL,1558919160,4.000,mx MEX,CB,32-056,90.000,0,0,0,0,0.000,0.000,0,0,46.000,2.000,1.000,1.000,0.000,0.000,0.000,1.000,0.000,34.000,39.000,87.200,2.000,25.000,0.000,0.000,0.000,cf9989c2,Hugo Ayala,UANL,True,Liga-MX 2018-2019,León,CB,,,,,CB,2018-2019


In [17]:
# Total npxG and minutes for every (opposition team, position) pair, plus a
# rate column derived from those totals.
df_teams_positions = (
    df
    .groupby(["opposition_team", "position"], as_index=False)
    .agg(
        # Use the string alias rather than the builtin `sum`: pandas >= 2.1 warns
        # that builtin callables will stop being mapped to the groupby sum, and
        # the alias keeps the current NaN-skipping behaviour explicitly.
        npxg=("npxg", "sum"),
        minutes=("minutes", "sum"),
    )
    .assign(
        # npxG per minute, scaled by 90 and by 11 — presumably minutes per match
        # and players per side; TODO confirm and lift into named constants.
        npxg90=lambda x: (x.npxg / x.minutes) * 90.0 * 11,
    )
)

df_teams_positions

Unnamed: 0,opposition_team,position,npxg,minutes,npxg90
0,ADO Den Haag,AM,16.000,3764.000,4.208
1,ADO Den Haag,CB,13.100,17192.000,0.754
2,ADO Den Haag,CM,10.500,6296.000,1.651
3,ADO Den Haag,DM,6.600,6848.000,0.954
4,ADO Den Haag,FB,10.100,16888.000,0.592
...,...,...,...,...,...
2117,Zwolle,FB,11.700,21303.000,0.544
2118,Zwolle,FW,59.400,13958.000,4.213
2119,Zwolle,W,45.000,17053.000,2.612
2120,Zwolle,WB,0.900,1310.000,0.680


In [18]:
# Collapse the per-team totals to one row per position: summed npxG and minutes
# across all opposition teams, with the rate recomputed from those totals (so it
# is minutes-weighted rather than an average of per-team rates).
df_positions = (
    df_teams_positions
    .groupby(["position"], as_index=False)
    .agg(
        # String alias instead of the builtin `sum`: pandas >= 2.1 warns that
        # builtin callables will stop being mapped to the groupby sum.
        npxg=("npxg", "sum"),
        minutes=("minutes", "sum"),
    )
    .assign(
        # Same 90 * 11 scaling as the per-team frame so the two rates are comparable.
        npxg90=lambda x: (x.npxg / x.minutes) * 90.0 * 11,
    )
)

df_positions

Unnamed: 0,position,npxg,minutes,npxg90
0,AM,3457.1,1563202.0,2.189
1,CB,4057.2,7331374.0,0.548
2,CM,3387.4,4243012.0,0.79
3,DM,1405.8,2054645.0,0.677
4,FB,1898.9,4927616.0,0.382
5,FW,16808.7,4478009.0,3.716
6,W,7199.4,3123691.0,2.282
7,WB,918.7,1163399.0,0.782
8,WM,4697.2,3194415.0,1.456


In [33]:
# Attach each position's overall rate to the per-team rows, keep only rows with
# more than 5000 minutes, and turn the team/position rate into a β value:
# β = -ln(team rate / position rate).
df_teams_positions_β = (
    df_teams_positions
    .merge(
        df_positions[["position", "npxg90"]],
        on="position",
        how="left",
        suffixes=["", "_pos"],  # the position-level rate arrives as `npxg90_pos`
        validate="m:1",  # exactly one baseline row per position
    )
    .query("(minutes > 5000)")
    # Ratio > 1 means the rate is above the position-wide rate.
    .assign(npxg90_ratio=lambda t: t["npxg90"] / t["npxg90_pos"])
    # The small offset keeps the log finite when the ratio is exactly zero.
    .assign(β_team_pos=lambda t: -np.log(t["npxg90_ratio"] + 0.00001))
)

df_teams_positions_β

Unnamed: 0,opposition_team,position,npxg,minutes,npxg90,npxg90_pos,npxg90_ratio,β_team_pos
1,ADO Den Haag,CB,13.100,17192.000,0.754,0.548,1.377,-0.320
2,ADO Den Haag,CM,10.500,6296.000,1.651,0.790,2.089,-0.737
3,ADO Den Haag,DM,6.600,6848.000,0.954,0.677,1.409,-0.343
4,ADO Den Haag,FB,10.100,16888.000,0.592,0.382,1.552,-0.440
5,ADO Den Haag,FW,61.300,9965.000,6.090,3.716,1.639,-0.494
...,...,...,...,...,...,...,...,...
2116,Zwolle,DM,9.700,9159.000,1.048,0.677,1.548,-0.437
2117,Zwolle,FB,11.700,21303.000,0.544,0.382,1.425,-0.354
2118,Zwolle,FW,59.400,13958.000,4.213,3.716,1.134,-0.126
2119,Zwolle,W,45.000,17053.000,2.612,2.282,1.145,-0.135


In [36]:
# Spread of β across positions for each opposition team: one row per team
# holding the sample standard deviation of its per-position β values.
df_teams_positions_β_std = (
    df_teams_positions_β
    .groupby("opposition_team")["β_team_pos"]
    .std()
    .rename("β_team_pos_std")
    .reset_index()
)

df_teams_positions_β_std

Unnamed: 0,opposition_team,β_team_pos_std
0,ADO Den Haag,0.168
1,AZ Alkmaar,0.202
2,Ajaccio,0.087
3,Ajax,0.13
4,Alavés,0.098
5,Almería,0.449
6,Amiens,0.186
7,América,0.163
8,Angers,0.198
9,Arminia,0.148


In [47]:
# Spot-check the per-position β values for a single opposition team.
df_teams_positions_β.query("opposition_team == 'Bayern Munich'")

Unnamed: 0,opposition_team,position,npxg,minutes,npxg90,npxg90_pos,npxg90_ratio,β_team_pos
216,Bayern Munich,AM,25.4,11247.0,2.236,2.189,1.021,-0.021
217,Bayern Munich,CB,13.1,43837.0,0.296,0.548,0.54,0.616
218,Bayern Munich,CM,17.8,25973.0,0.678,0.79,0.858,0.153
219,Bayern Munich,DM,7.1,12459.0,0.564,0.677,0.833,0.183
220,Bayern Munich,FB,6.6,23916.0,0.273,0.382,0.716,0.334
221,Bayern Munich,FW,70.4,24927.0,2.796,3.716,0.752,0.284
222,Bayern Munich,W,26.4,16923.0,1.544,2.282,0.677,0.39
223,Bayern Munich,WB,5.8,9214.0,0.623,0.782,0.797,0.227
224,Bayern Munich,WM,15.5,14531.0,1.056,1.456,0.725,0.321


In [41]:
# Report the mean, across opposition teams, of the per-team standard deviation of β_team_pos.
print(f"β_team_pos sigma = {df_teams_positions_β_std.β_team_pos_std.mean(): .2f}")

β_team_pos sigma =  0.17
