In [1]:
CACHED_TRAIN = True
CACHED_VAL = True

In [2]:
package_paths = [r"C:\Users\benja\Documents\projects\goalscorers"]
import sys
for path in package_paths:
    sys.path.append(path)
import goalscorer_package.constants as c
import goalscorer_package.data_cleaning as dc
import goalscorer_package.modelling as m
import datetime as dt
import pandas as pd
import numpy as np
import pymc as pm
print(f"Running on PyMC v{pm.__version__}")
import arviz as az
print(f"Running on Az v{az.__version__}")
import pytensor.tensor as pt
import matplotlib.pyplot as plt
import seaborn as sns
import scipy
import glob
from statistics import mode
import pickle
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", 1000)
pd.set_option("display.max_colwidth", 100)
pd.options.display.float_format = "{: ,.3f}".format
%matplotlib inline
plt.rcParams["figure.figsize"] = (10, 6)
%config InlineBackend.figure_formats = ["retina"]

Running on PyMC v5.6.1
Running on Az v0.16.0


# Hyper-Parameters

In [3]:
DICT_PLAYER_POSITION_BASED_PRIORS = {
    "FB": (-3.32, 0.86),
    "CB": (-2.75, 0.74),
    "WB": (-3.08, 0.73),
    "DM": (-3.13, 0.82),
    "CM": (-3.04, 0.86),
    "WM": (-2.55, 0.68),
    "AM": (-2.20, 0.56),
    "W": (-2.06, 0.50),
    "FW": (-1.70, 0.38),
}
DICT_PLAYER_POSITION_BASED_MINS = {
    "FB": -4.02,
    "CB": -3.98,
    "WB": -4.68,
    "DM": -4.51,
    "CM": -4.52,
    "WM": -4.81,
    "AM": -4.99,
    "W": -4.96,
    "FW": -5.05,
}
DICT_HOME_PRIOR = {
    "home": (0.2,  0.064),
}
DICT_POSITION_PRIORS = {
    "FB": (-0.59, 0.037),
    "CB": (-0.63, 0.043),
    "WB": (0.076, 0.044),
    "DM": (-0.098, 0.032),
    "CM": (-0.089, 0.027),
    "WM": (0.20, 0.025),
    "AM": (0.38, 0.036),
    "W": (0.30, 0.029),
    "FW": (0.45, 0.040),
}

In [4]:
DICT_PROM_PLAYER_POSITION_BASED_PRIORS = {
    "FB": (-3.53, 0.86),
    "CB": (-2.85, 0.74),
    "WB": (-3.28, 0.73),
    "DM": (-3.30, 0.82),
    "CM": (-3.22, 0.86),
    "WM": (-2.65, 0.68),
    "AM": (-2.32, 0.56),
    "W": (-2.24, 0.50),
    "FW": (-1.80, 0.38),
}

In [5]:
PLAYER_COEF, SQUAD_POS_COEF = 0.56, 0.4
CHANGE_TEAM_VARIANCE_INCREASE = 1.2

In [6]:
NUM_CHAINS = 4

# Functions

## Data

In [7]:
def load_season(seasons: list[str], comp_ids: list[int]) -> pd.DataFrame:
    seasons_leagues = dc.get_seasons_leagues_from_str(seasons, comp_ids)
    df = dc.load_seasons_leagues_files("summary", True, seasons_leagues)
    return df    

In [8]:
def clean_df(df: pd.DataFrame) -> pd.DataFrame:
    df = dc.add_home(df)
    df = dc.add_opp_team(df)
    df = dc.add_frac_90(df)
    df = dc.add_league(df)
    df = dc.add_season(df)
    df["league_10"] = np.where((df.league == 10), 1, 0)
    df = dc.drop_na_npxg(df)
    df = dc.drop_na_frac_90(df)
    df = dc.add_npg(df)
    
    # Positions
    df = dc.split_positions(df)
    df = dc.position_to_generic_position(df)
    df = dc.drop_gk(df)

    # Time
    df = dc.add_datetime(df)
    df_matches = (
        df[["datetime", "home_team", "away_team"]]
        .drop_duplicates(ignore_index=True)
        .sort_values(["datetime", "home_team"], ignore_index=True)
        .assign(match_number=lambda x: range(1, len(x)+1))
    )
    df = (
        df
        .merge(df_matches, how="left", on=["datetime", "home_team", "away_team"], validate="m:1")
        .sort_values(["match_number", "home", "position"], ignore_index=True)
        .assign(time_interval=lambda x: pd.qcut(x.match_number, q=38))
    )
    df_interval_to_t = pd.DataFrame({"t": range(1, 39), "time_interval": np.sort(df.time_interval.unique())})
    df = df.merge(df_interval_to_t, how="left", on=["time_interval"], validate="m:1")


    return df 

In [9]:
def df_to_model_df(df: pd.DataFrame) -> pd.DataFrame:
    df_model = df[["npxg", "player_id", "home", "opposition_team", "position", "league_10", "frac_90", "start", "league", "t"]].copy()
    return df_model

## Prior Specification

### Initial Prior

In [10]:
def first_player_prior_specification(df: pd.DataFrame, player_ids: list) -> pd.DataFrame:

    # # position, squad, league
    # df_player_priors = (
    #     dc
    #     .calc_main_position(df)
    #     .merge(dc.calc_main_team(df), how="left", on=["player_id"], validate="1:1")
    # )
    # df_team_league = (
    #     df_model[["opposition_team", "league"]]
    #     .drop_duplicates(ignore_index=True)
    #     .rename(columns={"opposition_team": "squad"})
    # )
    # df_player_priors = (
    #     df_player_priors
    #     .merge(df_team_league, how="left", on=["squad"], validate="m:1")
    # )

    # # mu, sigma, lower
    df_player_priors = (
        df
        .groupby("player_id", as_index=False)
        .agg(position=("position", mode), squad=("squad", mode), league=("league", mode))
        .assign(
            mu=lambda x: [DICT_PLAYER_POSITION_BASED_PRIORS[pos][0] for pos in x.position.values] - np.where((x.league == 10), 0.4, 0.0),
            sigma=lambda x: [DICT_PLAYER_POSITION_BASED_PRIORS[pos][1] for pos in x.position.values],
            lower=lambda x: [DICT_PLAYER_POSITION_BASED_MINS[pos] for pos in x.position.values] ,
        )
        .set_index("player_id", drop=False)
        .loc[player_ids]
    )
    return df_player_priors

In [11]:
def first_team_prior_specification(df: pd.DataFrame, teams: list) -> pd.DataFrame:
    df_teams_leagues = df[["home_team", "league"]].drop_duplicates()

    df_team_priors = pd.DataFrame({
        "team": df_teams_leagues.home_team.values,
        "league": df_teams_leagues.league.values,
        "mu": [0.0 for _ in range(len(teams))],
        "sigma": [0.22 for _ in range(len(teams))],
    }, index=df_teams_leagues.home_team.values).loc[teams]

    return df_team_priors

In [12]:
def first_home_prior_specification() -> pd.DataFrame:
    return pd.DataFrame([{"mu": DICT_HOME_PRIOR["home"][0], "sigma": DICT_HOME_PRIOR["home"][0]}])

In [13]:
def first_position_prior_specification(player_ids: list, positions: list) -> pd.DataFrame:
    position_mus = [DICT_POSITION_PRIORS[position][0] for position in positions]
    position_sigmas = [DICT_POSITION_PRIORS[position][1] for position in positions]
    
    df_position_priors = (
        pd.DataFrame({"player_id": player_ids})
        .assign(
            mu=lambda x: [position_mus for _ in range(len(x))],
            sigma=lambda x: [position_sigmas for _ in range(len(x))],
        )
    )

    return df_position_priors

### Next Season Prior

In [14]:
def get_team_position_Δ(df_posterior: pd.DataFrame) -> pd.DataFrame:
    return (
        df_posterior
        .assign(Δ_minutes=lambda x: x.minutes * x.Δ)
        .groupby(["squad", "position"], as_index=False)
        .agg(total_minutes=("minutes", sum), total_Δ=("Δ_minutes", sum))
        .assign(Δ=lambda x: x.total_Δ / x.total_minutes)
    )

In [15]:
def get_new_season_player_priors(df_post_player_priors: pd.DataFrame, df_posterior: pd.DataFrame, df: pd.DataFrame, player_ids: list) -> pd.DataFrame:
    # Seen players
    df_player_priors = (
        df
        .groupby("player_id", as_index=False)
        .agg(squad=("squad", mode), league=("league", mode), position=("position", mode))
        .merge(df_post_player_priors[["player_id", "squad", "mu", "sigma"]], how="left", on=["player_id", "squad"], validate="1:1")
        .query("player_id in @player_ids").reset_index(drop=True)
    )
    df_player_seen_priors = df_player_priors.dropna(subset=["mu", "sigma"], ignore_index=True)
    
    # players moving teams
    df_team_position_Δ = get_team_position_Δ(df_posterior)
    df_player_moving_teams_priors = (
        df_player_priors
        .query("mu.isna()")
        .reset_index(drop=True)
        .merge(df_team_position_Δ[["squad", "position", "Δ"]], how="left", on=["squad", "position"], validate="m:1")
        .rename(columns={"Δ": "squad_pos_Δ"})
        .merge(df_post_player_priors[["player_id", "mu", "sigma"]].rename(columns={"mu": "player_Δ", "sigma": "player_sigma"}), 
               how="left", on=["player_id"], validate="1:1")
        .assign(
            mu=lambda x: PLAYER_COEF * x.player_Δ + SQUAD_POS_COEF * x.squad_pos_Δ,
        )
    )
    df_player_moving_teams_priors.mu = (
        df_player_moving_teams_priors
        .mu
        # have player info and team info
        .fillna(PLAYER_COEF * df_player_moving_teams_priors.player_Δ + SQUAD_POS_COEF * df_player_moving_teams_priors.squad_pos_Δ)
        # only have team info
        .fillna((PLAYER_COEF + SQUAD_POS_COEF) * df_player_moving_teams_priors.squad_pos_Δ)
        # only have player info, unseen promoted team
        .fillna((PLAYER_COEF + SQUAD_POS_COEF) * df_player_moving_teams_priors.player_Δ)
    )
    df_player_moving_teams_priors.sigma = df_player_moving_teams_priors.player_sigma * CHANGE_TEAM_VARIANCE_INCREASE
    df_player_moving_teams_priors = df_player_moving_teams_priors[["player_id", "position", "squad", "mu", "sigma"]]
    df_player_prom_teams_priors = df_player_moving_teams_priors.query("mu.isna()").reset_index(drop=True)
    df_player_moving_teams_priors = df_player_moving_teams_priors.dropna(subset=["mu"], ignore_index=True)
    df_player_moving_teams_priors.sigma = (
        df_player_moving_teams_priors
        .sigma
        .fillna(
            pd.Series(np.vectorize(DICT_PLAYER_POSITION_BASED_PRIORS.get)(df_player_moving_teams_priors.position)[1])
        )
    )
    
    # promoted team, unseen player
    if (len(df_player_prom_teams_priors.mu) > 0):
        df_player_prom_teams_priors.mu = np.vectorize(DICT_PROM_PLAYER_POSITION_BASED_PRIORS.get)(df_player_prom_teams_priors.position)[0]
        df_player_prom_teams_priors.sigma = np.vectorize(DICT_PROM_PLAYER_POSITION_BASED_PRIORS.get)(df_player_prom_teams_priors.position)[1]
    
    # All players
    df_player_priors = (
        pd
        .concat([df_player_seen_priors, df_player_moving_teams_priors, df_player_prom_teams_priors], ignore_index=True)
        .set_index("player_id", drop=False)
        .loc[player_ids]
        .assign(lower=lambda x: np.vectorize(DICT_PLAYER_POSITION_BASED_MINS.get)(df_player_priors.position))
    )
    
    return df_player_priors

In [16]:
def get_new_season_team_priors(df_post_team_priors: pd.DataFrame, df: pd.DataFrame, teams: list) -> pd.DataFrame:
    # seen teams
    df_team_priors = (
        df[["home_team", "league", "season"]]
        .drop_duplicates(ignore_index=True)
        .rename(columns={"home_team": "team"})
        .merge(df_post_team_priors, how="left", on=["team", "league"], validate="1:1")
    )
    df_team_seen_priors = df_team_priors.dropna(subset=["mu", "sigma"], ignore_index=True)

    # teams promoted / relegated - seen
    df_team_prom_priors = (
        df_team_priors
        .query("mu.isna()")
        .reset_index(drop=True)
        [["team", "league", "season"]]
        .merge(df_post_team_priors[["team", "mu", "sigma"]], how="left", on=["team"], validate="1:1")
        .assign(
            mu=lambda x: np.where((x.league == 9), -0.16 + 0.32 * x.mu, x.mu),  # promoted
            sigma=0.22
        )
        .assign(mu=lambda x: np.where((x.league == 10), 0.2 + x.mu, x.mu))  # relegated
    )
    df_team_unseen_prom_priors = df_team_prom_priors.query("mu.isna() & ((season != '2018-2019') | (league != 10)) ").reset_index(drop=True)
    df_team_unseen_champ_priors = df_team_prom_priors.query("mu.isna() & (season == '2018-2019') & (league == 10) ").reset_index(drop=True)
    df_team_prom_priors = df_team_prom_priors.dropna(subset=["mu"], ignore_index=True)

    # teams promoted unseen
    df_team_unseen_prom_priors.mu = -0.14
    df_team_unseen_prom_priors.sigma = 0.17

    # unseen 2018-2019 champ teams (no champ 2017-2018 data, so all champ teams unseen here)
    df_team_unseen_champ_priors.mu = 0.0
    df_team_unseen_champ_priors.sigma = 0.22

    # All teams
    df_teams_prior = (
        pd
        .concat([df_team_seen_priors, df_team_prom_priors, df_team_unseen_prom_priors, df_team_unseen_champ_priors])
        .set_index("team", drop=False)
        .loc[teams]
    )
    return df_teams_prior

In [17]:
def get_new_season_home_priors(df_post_home_priors: pd.DataFrame) -> pd.DataFrame:
    return df_post_home_priors

In [18]:
def get_new_season_position_priors(df_post_position_priors: pd.DataFrame, player_ids: list, positions: list) -> pd.DataFrame:
    def fill_nan_with_array(x, mu_or_sigma: str):
        if (isinstance(x, np.ndarray) and (x.size > 0)):
            return x
        elif (mu_or_sigma == "mu"):
            return [DICT_POSITION_PRIORS[position][0] for position in positions]
        elif (mu_or_sigma == "sigma"):
            return [DICT_POSITION_PRIORS[position][1] for position in positions]

    df_position_priors = (
        pd
        .DataFrame({"player_id": player_ids})
        .merge(df_post_position_priors, how="left", on=["player_id"], validate="1:1")
        .assign(
            mu=lambda x: x.mu.apply(fill_nan_with_array, args=("mu",)),
            sigma=lambda x: x.sigma.apply(fill_nan_with_array, args=("sigma",)),
            )
    )
    return df_position_priors

## Fit

In [19]:
def create_model(df_all: pd.DataFrame, df_model: pd.DataFrame, priors: tuple, player_id_codes: list, team_codes: list, position_codes: list, 
                 coords: dict) -> pm.model.Model:
    df_player_priors, df_team_priors, df_home_priors, df_position_priors = priors
    league_masks = m.create_league_masks(df_all, coords["teams"], coords["leagues"])

    # Weighted Exponential
    def logp(value: pt.TensorVariable, lam: pt.TensorVariable, w: pt.TensorVariable) -> pt.TensorVariable:
        log_prob = w * (np.log(lam) - lam * value)
        return log_prob
    
    def random(lam, w, rng=None, size=None):
        return scipy.stats.expon.rvs(scale=1.0/lam, size=size)

    with pm.Model(coords=coords) as model:
        # Data
        home_data = pm.MutableData("home_data", df_model.home.values)
        player_id_codes_data = pm.MutableData("player_id_codes_data", player_id_codes)
        team_codes_data = pm.MutableData("team_codes_data", team_codes)
        position_codes_data = pm.MutableData("position_codes_data", position_codes)
        league_10_data = pm.MutableData("league_10_data", df_model.league_10.values)
        frac_90_data = pm.MutableData("frac_90_data", df_model.frac_90.values)
        npxg_data = pm.MutableData("npxg_data", df_model.npxg.values)

        # Priors
        γ = pm.Normal("γ", mu=df_home_priors.mu.values, sigma=df_home_priors.sigma.values)
        Δ = pm.TruncatedNormal("Δ", mu=df_player_priors.mu.values, sigma=df_player_priors.sigma.values, 
                               lower=df_player_priors.lower.values, dims="player_ids")
        β_ = pm.Normal("β_", mu=df_team_priors.mu.values, sigma=df_team_priors.sigma.values, dims="teams")
        P = pm.Normal("P", mu=np.array([list(x) for x in df_position_priors.mu.values]), 
                      sigma=np.array([list(x) for x in df_position_priors.sigma.values]), dims=("player_ids", "positions"))
        l_10 = 0.4

        # Deterministic transform
        for league_mask in league_masks:
            β_ = pt.add(β_, -(pt.sum(league_mask * β_) / pt.sum(league_mask)) * league_mask)
        β = pm.Deterministic("β", β_, dims="teams")
    
        η = pm.Deterministic("η", pt.exp(
            γ * home_data
            + Δ[player_id_codes_data]
            - β[team_codes_data]
            + P[player_id_codes_data, position_codes_data]
            + l_10 * league_10_data
        ))
        τ = frac_90_data

        exp = pm.Deterministic("exp", 1.0 / (η * τ))
        exp = pt.switch(pt.lt(exp, 0.000001), 0.000001, exp)
        
        # Likelihood
        x = pm.CustomDist(
            "x",
            exp, frac_90_data,
            logp=logp,
            random=random,
            observed=npxg_data
        )

    return model

## Posterior

### Posterior to priors

In [20]:
def players_posterior_to_priors(df: pd.DataFrame, idata: az.InferenceData) -> pd.DataFrame:
    df_post_player_priors = (
        pd
        .DataFrame({"player_id": idata.posterior.Δ.player_ids})
        .merge(dc.calc_main_position(df), how="left", on=["player_id"], validate="1:1")
        .merge(dc.calc_main_squad(df), how="left", on=["player_id"], validate="1:1")
        .assign(
            mu=lambda x: [idata.posterior.Δ.sel(player_ids=player_id).data.mean() for player_id in x.player_id.values], 
            sigma=lambda x: [idata.posterior.Δ.sel(player_ids=player_id).data.std() for player_id in x.player_id.values], 
        )
    )
    return df_post_player_priors

In [21]:
def teams_posterior_to_priors(df: pd.DataFrame, idata: az.InferenceData) -> pd.DataFrame:
    df_post_team_priors = (
        df[["home_team", "league"]]
        .drop_duplicates(ignore_index=True)
        .rename(columns={"home_team": "team"})
        .assign(
            mu=lambda x: [idata.posterior.β.sel(teams=team).data.mean() for team in x.team.values], 
            sigma=lambda x: [idata.posterior.β.sel(teams=team).data.std() for team in x.team.values], 
        )
    )
    return df_post_team_priors

In [22]:
def positions_posterior_to_priors(positions: list, idata: az.InferenceData) -> pd.DataFrame:
    mus, sigmas = [], []
    for player_id in idata.posterior.P.player_ids:
        mus_player, sigmas_player = [], []
        for position in positions:
            mu_player = idata.posterior.P.sel(player_ids=player_id).data.mean()
            sigma_player = idata.posterior.P.sel(player_ids=player_id).data.std()
            mus_player.append(mu_player)
            sigmas_player.append(sigma_player)
        mus.append(np.array(mus_player))
        sigmas.append(np.array(sigmas_player))
    
    df_position_priors = (
        pd
        .DataFrame({"player_id": idata.posterior.Δ.player_ids})
        .assign(
            mu=mus, 
            sigma=sigmas, 
        )
    )
    return df_position_priors

In [23]:
def home_posterior_to_priors(idata: az.InferenceData) -> pd.DataFrame:
    return pd.DataFrame([{"mu": idata.posterior.γ.data.mean(), "sigma": idata.posterior.γ.data.std()}])

### Utils

In [24]:
def increase_prior_variance(df_post_prior: pd.DataFrame, perc=10.0) -> pd.DataFrame:
    df_post_prior.sigma = df_post_prior.sigma * (1.0 + perc / 100.0)
    return df_post_prior

In [25]:
def get_df_posterior(df: pd.DataFrame, idata: az.InferenceData) -> pd.DataFrame:
    dict_bu = {"player_id": [], "Δ": []}

    for player_id in df.player_id.unique():
        Δ = idata.posterior.Δ.sel(player_ids=player_id).data.flatten().mean()

        dict_bu["player_id"].append(player_id)
        dict_bu["Δ"].append(Δ)

    df_posterior = (
        df
        .merge(pd.DataFrame(dict_bu), how="left", on=["player_id"], validate="m:1")
    )
    return df_posterior

## Prediction

In [26]:
def oos_predictions(
    model: pm.model.Model, idata: az.InferenceData, df_model_test: pd.DataFrame, player_id_codes_test: list, team_codes_test: list,
    position_codes_test: list
) -> az.InferenceData:
    with model:
        pm.set_data({
            "home_data": df_model_test.home.values,
            "player_id_codes_data": player_id_codes_test,
            "team_codes_data": team_codes_test,
            "position_codes_data": position_codes_test,
            "league_10_data": df_model_test.league_10.values,
            "frac_90_data": df_model_test.frac_90.values,
            "npxg_data": df_model_test.npxg.values,
        })
        pm.sample_posterior_predictive(idata, extend_inferencedata=True, predictions=True)

    return idata

In [27]:
def add_predictions_to_df(df_test: pd.DataFrame, idata: az.InferenceData) -> pd.DataFrame:
    df_posterior_predictive_chains = pd.DataFrame(
        [
            [idata.predictions.sel(chain=c, x_dim_2=r).x.data for c in range(NUM_CHAINS)]
            for r in range(len(df_test))
        ],
        columns=[f"pred_npxg_{c}" for c in range(NUM_CHAINS)]
    )
    
    df_posterior_predictive = (
        df_test
        .join(df_posterior_predictive_chains)
    )
    return df_posterior_predictive

In [28]:
def add_pred_to_df(df_test: pd.DataFrame) -> pd.DataFrame:
    df_test["pred_npxg"] = 0.0
    for c in range(NUM_CHAINS):
        df_test.pred_npxg += np.array([df_test[f"pred_npxg_{c}"][n].mean() for n in range(len(df_test))]) / NUM_CHAINS
    return df_test

In [29]:
def add_ll_to_df(df_test: pd.DataFrame) -> pd.DataFrame:
    df_test["log_likelihood"] = (
        np.maximum(-10_000.0, np.log(scipy.stats.expon.pdf(df_test.npxg, scale=df_test["pred_npxg"])))
    )
    return df_test

# Main

## TRAIN

In [30]:
post_priors_cache = dict()

In [31]:
train_seasons = ["2017-2018", "2018-2019", "2019-2020"]
train_comp_ids = [9, 10, 11, 12, 13, 20]

In [32]:
season = "2017-2018"

In [33]:
if not CACHED_TRAIN:
    for season in train_seasons:
        # Data
        df = load_season([season], train_comp_ids)
        df = clean_df(df)
        df_model = df_to_model_df(df)

        # Model data
        player_id_codes, player_ids = pd.factorize(df_model.loc[:, "player_id"], sort=True)
        team_codes, teams = pd.factorize(df_model.loc[:, "opposition_team"], sort=True)
        position_codes, positions = pd.factorize(df_model.loc[:, "position"], sort=True)
        league_codes, leagues = pd.factorize(df_model.loc[:, "league"], sort=True)

        # Prior specification
        if (season == train_seasons[0]):
            df_player_priors = first_player_prior_specification(df, player_ids)
            df_team_priors = first_team_prior_specification(df, teams)
            df_home_priors = first_home_prior_specification()
            df_player_position_priors = first_position_prior_specification(player_ids, positions)
        else:
            df_player_priors = get_new_season_player_priors(df_post_player_priors, df_posterior, df, player_ids)
            df_team_priors = get_new_season_team_priors(df_post_team_priors, df, teams)
            df_home_priors = get_new_season_home_priors(df_post_home_priors)
            df_player_position_priors = get_new_season_position_priors(df_post_position_priors, player_ids, positions)
        priors = (df_player_priors, df_team_priors, df_home_priors, df_player_position_priors)

        # Model
        coords = {"player_ids": player_ids, "teams": teams, "positions": positions, "leagues": leagues}
        model = create_model(df_model, df_model, priors, player_id_codes, team_codes, position_codes, coords)

        with model:
            idata = pm.sample()

        # Posterior priors
        df_posterior = get_df_posterior(df, idata)

        df_post_player_priors = players_posterior_to_priors(df, idata)
        df_post_team_priors = teams_posterior_to_priors(df, idata)
        df_post_position_priors = positions_posterior_to_priors(positions, idata)
        df_post_home_priors = home_posterior_to_priors(idata)

        df_post_player_priors = increase_prior_variance(df_post_player_priors)
        df_post_team_priors = increase_prior_variance(df_post_team_priors)
        df_post_position_priors = increase_prior_variance(df_post_position_priors)
        df_post_home_priors = increase_prior_variance(df_post_home_priors)

        post_priors_cache[season] = {
            "df_posterior": df_posterior,
            "df_post_player_priors": df_post_player_priors,
            "df_post_team_priors": df_post_team_priors,
            "df_post_position_priors": df_post_position_priors,
            "df_post_home_priors": df_post_home_priors,
        }
    
    with open(r"cache\post-priors-cache.pickle", "wb") as post_priors_cache_file:
        pickle.dump(post_priors_cache, post_priors_cache_file)

else:
    with open(r"cache\post-priors-cache.pickle", "rb") as post_priors_cache_file:
        post_priors_cache = pickle.load(post_priors_cache_file)

    post_priors_cache_final_season = post_priors_cache[train_seasons[-1]]

    df_posterior = post_priors_cache_final_season["df_posterior"]
    df_post_player_priors = post_priors_cache_final_season["df_post_player_priors"]
    df_post_team_priors = post_priors_cache_final_season["df_post_team_priors"]
    df_post_position_priors = post_priors_cache_final_season["df_post_position_priors"]
    df_post_home_priors = post_priors_cache_final_season["df_post_home_priors"]

## VAL

In [34]:
def add_to_df_test(df_test: pd.DataFrame) -> pd.DataFrame:
    df_test = add_pred_to_df(df_test)
    df_test = add_ll_to_df(df_test)
    df_test["mse"] = (df_test.pred_npxg - df_test.npxg) ** 2
    df_test["pred_npxg90"] = df_test.pred_npxg / df_test.frac_90
    return df_test

In [35]:
val_seasons = ["2020-2021"]
val_comp_ids = [9, 11, 12, 13, 20]

In [36]:
season = "2020-2021"

In [37]:
if not CACHED_VAL:
    df_test_preds = []
    # for season in val_seasons:
    df = load_season([season], val_comp_ids)
    df = clean_df(df)

    ts = np.sort(df.t.unique())
    ts = ts[1:]
    counter = 2
    for t in ts:
        # Train / Test
        df_train, df_test = df.query("t < @t").reset_index(drop=True), df.query("t == @t").reset_index(drop=True)
        df_model_train, df_model_test = df_to_model_df(df_train), df_to_model_df(df_test)
        df_model = pd.concat([df_model_train, df_model_test], ignore_index=True)

        # Model data
        player_id_codes, player_ids = pd.factorize(df_model.loc[:, "player_id"], sort=True)
        team_codes, teams = pd.factorize(df_model.loc[:, "opposition_team"], sort=True)
        position_codes, positions = pd.factorize(df_model.loc[:, "position"], sort=True)
        league_codes, leagues = pd.factorize(df_model.loc[:, "league"], sort=True)

        player_id_codes_train, player_id_codes_test = player_id_codes[: len(df_model_train)], player_id_codes[len(df_model_train): ]
        team_codes_train, team_codes_test = team_codes[: len(df_model_train)], team_codes[len(df_model_train): ]
        position_codes_train, position_codes_test = position_codes[: len(df_model_train)], position_codes[len(df_model_train): ]

        # Prior specification
        df_player_priors = get_new_season_player_priors(df_post_player_priors, df_posterior, df, player_ids)
        df_team_priors = get_new_season_team_priors(df_post_team_priors, df, teams)
        df_home_priors = get_new_season_home_priors(df_post_home_priors)
        df_player_position_priors = get_new_season_position_priors(df_post_position_priors, player_ids, positions)

        priors = (df_player_priors, df_team_priors, df_home_priors, df_player_position_priors)

        # Train model
        coords = {"player_ids": player_ids, "teams": teams, "positions": positions, "leagues": leagues}
        model = create_model(df_model, df_model, priors, player_id_codes, team_codes, position_codes, coords)

        with model:
            idata = pm.sample()

        # Test predictions
        idata =  oos_predictions(model, idata, df_model_test, player_id_codes_test, team_codes_test, position_codes_test)
        df_test_pred = add_predictions_to_df(df_test, idata)
        df_test_preds.append(df_test_pred)

        print("Finished", counter, "/", len(ts) + 1)
        counter += 1
        
    # Test DataFrame
    df_test = pd.concat(df_test_preds, ignore_index=True)
    df_test = add_to_df_test(df_test)
    df_test.to_pickle(r"cache\df_test.pickle")

else:
    df_test = pd.read_pickle(r"cache\df_test.pickle")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_player_moving_teams_priors.sigma = (
Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [γ, Δ, β_, P]


Sampling 4 chains for 1_000 tune and 1_000 draw iterations (4_000 + 4_000 draws total) took 142 seconds.
The rhat statistic is larger than 1.01 for some parameters. This indicates problems during sampling. See https://arxiv.org/abs/1903.08008 for details
Sampling: [x]


Finished 2 / 38


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_player_moving_teams_priors.sigma = (
Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [γ, Δ, β_, P]


Sampling 4 chains for 1_000 tune and 1_000 draw iterations (4_000 + 4_000 draws total) took 152 seconds.
The rhat statistic is larger than 1.01 for some parameters. This indicates problems during sampling. See https://arxiv.org/abs/1903.08008 for details
There were 5 divergences after tuning. Increase `target_accept` or reparameterize.
Sampling: [x]


Finished 3 / 38


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_player_moving_teams_priors.sigma = (
Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [γ, Δ, β_, P]


Sampling 4 chains for 1_000 tune and 1_000 draw iterations (4_000 + 4_000 draws total) took 183 seconds.
The rhat statistic is larger than 1.01 for some parameters. This indicates problems during sampling. See https://arxiv.org/abs/1903.08008 for details
There were 5 divergences after tuning. Increase `target_accept` or reparameterize.
Sampling: [x]


Finished 4 / 38


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_player_moving_teams_priors.sigma = (
Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [γ, Δ, β_, P]


Sampling 4 chains for 1_000 tune and 1_000 draw iterations (4_000 + 4_000 draws total) took 192 seconds.
The rhat statistic is larger than 1.01 for some parameters. This indicates problems during sampling. See https://arxiv.org/abs/1903.08008 for details
The effective sample size per chain is smaller than 100 for some parameters.  A higher number is needed for reliable rhat and ess computation. See https://arxiv.org/abs/1903.08008 for details
There were 3756 divergences after tuning. Increase `target_accept` or reparameterize.
Sampling: [x]


Finished 5 / 38


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_player_moving_teams_priors.sigma = (
Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [γ, Δ, β_, P]


Sampling 4 chains for 1_000 tune and 1_000 draw iterations (4_000 + 4_000 draws total) took 198 seconds.
The rhat statistic is larger than 1.01 for some parameters. This indicates problems during sampling. See https://arxiv.org/abs/1903.08008 for details
The effective sample size per chain is smaller than 100 for some parameters.  A higher number is needed for reliable rhat and ess computation. See https://arxiv.org/abs/1903.08008 for details
There were 3933 divergences after tuning. Increase `target_accept` or reparameterize.
Sampling: [x]


Finished 6 / 38


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_player_moving_teams_priors.sigma = (
Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [γ, Δ, β_, P]


Sampling 4 chains for 1_000 tune and 1_000 draw iterations (4_000 + 4_000 draws total) took 180 seconds.
The rhat statistic is larger than 1.01 for some parameters. This indicates problems during sampling. See https://arxiv.org/abs/1903.08008 for details
The effective sample size per chain is smaller than 100 for some parameters.  A higher number is needed for reliable rhat and ess computation. See https://arxiv.org/abs/1903.08008 for details
There were 3883 divergences after tuning. Increase `target_accept` or reparameterize.
Sampling: [x]


Finished 7 / 38


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_player_moving_teams_priors.sigma = (
Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [γ, Δ, β_, P]


Sampling 4 chains for 1_000 tune and 1_000 draw iterations (4_000 + 4_000 draws total) took 220 seconds.
The rhat statistic is larger than 1.01 for some parameters. This indicates problems during sampling. See https://arxiv.org/abs/1903.08008 for details
The effective sample size per chain is smaller than 100 for some parameters.  A higher number is needed for reliable rhat and ess computation. See https://arxiv.org/abs/1903.08008 for details
There were 3684 divergences after tuning. Increase `target_accept` or reparameterize.
Sampling: [x]


Finished 8 / 38


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_player_moving_teams_priors.sigma = (
Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [γ, Δ, β_, P]


Sampling 4 chains for 1_000 tune and 1_000 draw iterations (4_000 + 4_000 draws total) took 199 seconds.
The rhat statistic is larger than 1.01 for some parameters. This indicates problems during sampling. See https://arxiv.org/abs/1903.08008 for details
The effective sample size per chain is smaller than 100 for some parameters.  A higher number is needed for reliable rhat and ess computation. See https://arxiv.org/abs/1903.08008 for details
There were 3766 divergences after tuning. Increase `target_accept` or reparameterize.
Sampling: [x]


Finished 9 / 38


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_player_moving_teams_priors.sigma = (
Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [γ, Δ, β_, P]


Sampling 4 chains for 1_000 tune and 1_000 draw iterations (4_000 + 4_000 draws total) took 249 seconds.
The rhat statistic is larger than 1.01 for some parameters. This indicates problems during sampling. See https://arxiv.org/abs/1903.08008 for details
The effective sample size per chain is smaller than 100 for some parameters.  A higher number is needed for reliable rhat and ess computation. See https://arxiv.org/abs/1903.08008 for details
There were 3750 divergences after tuning. Increase `target_accept` or reparameterize.
Sampling: [x]


Finished 10 / 38


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_player_moving_teams_priors.sigma = (
Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [γ, Δ, β_, P]


Sampling 4 chains for 1_000 tune and 1_000 draw iterations (4_000 + 4_000 draws total) took 264 seconds.
The rhat statistic is larger than 1.01 for some parameters. This indicates problems during sampling. See https://arxiv.org/abs/1903.08008 for details
The effective sample size per chain is smaller than 100 for some parameters.  A higher number is needed for reliable rhat and ess computation. See https://arxiv.org/abs/1903.08008 for details
There were 3591 divergences after tuning. Increase `target_accept` or reparameterize.
Sampling: [x]


Finished 11 / 38


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_player_moving_teams_priors.sigma = (
Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [γ, Δ, β_, P]


Sampling 4 chains for 1_000 tune and 1_000 draw iterations (4_000 + 4_000 draws total) took 271 seconds.
The rhat statistic is larger than 1.01 for some parameters. This indicates problems during sampling. See https://arxiv.org/abs/1903.08008 for details
The effective sample size per chain is smaller than 100 for some parameters.  A higher number is needed for reliable rhat and ess computation. See https://arxiv.org/abs/1903.08008 for details
There were 3647 divergences after tuning. Increase `target_accept` or reparameterize.
Sampling: [x]


Finished 12 / 38


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_player_moving_teams_priors.sigma = (
Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [γ, Δ, β_, P]


Sampling 4 chains for 1_000 tune and 1_000 draw iterations (4_000 + 4_000 draws total) took 277 seconds.
The rhat statistic is larger than 1.01 for some parameters. This indicates problems during sampling. See https://arxiv.org/abs/1903.08008 for details
The effective sample size per chain is smaller than 100 for some parameters.  A higher number is needed for reliable rhat and ess computation. See https://arxiv.org/abs/1903.08008 for details
There were 3821 divergences after tuning. Increase `target_accept` or reparameterize.
Sampling: [x]


Finished 13 / 38


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_player_moving_teams_priors.sigma = (
Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [γ, Δ, β_, P]


Sampling 4 chains for 1_000 tune and 1_000 draw iterations (4_000 + 4_000 draws total) took 285 seconds.
The rhat statistic is larger than 1.01 for some parameters. This indicates problems during sampling. See https://arxiv.org/abs/1903.08008 for details
The effective sample size per chain is smaller than 100 for some parameters.  A higher number is needed for reliable rhat and ess computation. See https://arxiv.org/abs/1903.08008 for details
There were 3962 divergences after tuning. Increase `target_accept` or reparameterize.
Sampling: [x]


Finished 14 / 38


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_player_moving_teams_priors.sigma = (
Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [γ, Δ, β_, P]


Sampling 4 chains for 1_000 tune and 1_000 draw iterations (4_000 + 4_000 draws total) took 308 seconds.
The rhat statistic is larger than 1.01 for some parameters. This indicates problems during sampling. See https://arxiv.org/abs/1903.08008 for details
The effective sample size per chain is smaller than 100 for some parameters.  A higher number is needed for reliable rhat and ess computation. See https://arxiv.org/abs/1903.08008 for details
There were 3981 divergences after tuning. Increase `target_accept` or reparameterize.
Sampling: [x]


Finished 15 / 38


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_player_moving_teams_priors.sigma = (
Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [γ, Δ, β_, P]


Sampling 4 chains for 1_000 tune and 1_000 draw iterations (4_000 + 4_000 draws total) took 334 seconds.
The rhat statistic is larger than 1.01 for some parameters. This indicates problems during sampling. See https://arxiv.org/abs/1903.08008 for details
The effective sample size per chain is smaller than 100 for some parameters.  A higher number is needed for reliable rhat and ess computation. See https://arxiv.org/abs/1903.08008 for details
There were 3895 divergences after tuning. Increase `target_accept` or reparameterize.
Sampling: [x]


Finished 16 / 38


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_player_moving_teams_priors.sigma = (
Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [γ, Δ, β_, P]


Sampling 4 chains for 1_000 tune and 1_000 draw iterations (4_000 + 4_000 draws total) took 364 seconds.
The rhat statistic is larger than 1.01 for some parameters. This indicates problems during sampling. See https://arxiv.org/abs/1903.08008 for details
The effective sample size per chain is smaller than 100 for some parameters.  A higher number is needed for reliable rhat and ess computation. See https://arxiv.org/abs/1903.08008 for details
There were 3717 divergences after tuning. Increase `target_accept` or reparameterize.
Sampling: [x]


Finished 17 / 38


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_player_moving_teams_priors.sigma = (
Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [γ, Δ, β_, P]


Sampling 4 chains for 1_000 tune and 1_000 draw iterations (4_000 + 4_000 draws total) took 364 seconds.
The rhat statistic is larger than 1.01 for some parameters. This indicates problems during sampling. See https://arxiv.org/abs/1903.08008 for details
The effective sample size per chain is smaller than 100 for some parameters.  A higher number is needed for reliable rhat and ess computation. See https://arxiv.org/abs/1903.08008 for details
There were 3573 divergences after tuning. Increase `target_accept` or reparameterize.
Sampling: [x]


Finished 18 / 38


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_player_moving_teams_priors.sigma = (
Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [γ, Δ, β_, P]


Sampling 4 chains for 1_000 tune and 1_000 draw iterations (4_000 + 4_000 draws total) took 394 seconds.
The rhat statistic is larger than 1.01 for some parameters. This indicates problems during sampling. See https://arxiv.org/abs/1903.08008 for details
The effective sample size per chain is smaller than 100 for some parameters.  A higher number is needed for reliable rhat and ess computation. See https://arxiv.org/abs/1903.08008 for details
There were 3860 divergences after tuning. Increase `target_accept` or reparameterize.
Sampling: [x]


Finished 19 / 38


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_player_moving_teams_priors.sigma = (
Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [γ, Δ, β_, P]


Sampling 4 chains for 1_000 tune and 1_000 draw iterations (4_000 + 4_000 draws total) took 441 seconds.
The rhat statistic is larger than 1.01 for some parameters. This indicates problems during sampling. See https://arxiv.org/abs/1903.08008 for details
The effective sample size per chain is smaller than 100 for some parameters.  A higher number is needed for reliable rhat and ess computation. See https://arxiv.org/abs/1903.08008 for details
There were 3944 divergences after tuning. Increase `target_accept` or reparameterize.
Sampling: [x]


Finished 20 / 38


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_player_moving_teams_priors.sigma = (
Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [γ, Δ, β_, P]


Sampling 4 chains for 1_000 tune and 1_000 draw iterations (4_000 + 4_000 draws total) took 401 seconds.
The rhat statistic is larger than 1.01 for some parameters. This indicates problems during sampling. See https://arxiv.org/abs/1903.08008 for details
The effective sample size per chain is smaller than 100 for some parameters.  A higher number is needed for reliable rhat and ess computation. See https://arxiv.org/abs/1903.08008 for details
There were 3937 divergences after tuning. Increase `target_accept` or reparameterize.
Sampling: [x]


Finished 21 / 38


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_player_moving_teams_priors.sigma = (
Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [γ, Δ, β_, P]


Sampling 4 chains for 1_000 tune and 1_000 draw iterations (4_000 + 4_000 draws total) took 480 seconds.
The rhat statistic is larger than 1.01 for some parameters. This indicates problems during sampling. See https://arxiv.org/abs/1903.08008 for details
The effective sample size per chain is smaller than 100 for some parameters.  A higher number is needed for reliable rhat and ess computation. See https://arxiv.org/abs/1903.08008 for details
There were 3833 divergences after tuning. Increase `target_accept` or reparameterize.
Sampling: [x]


Finished 22 / 38


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_player_moving_teams_priors.sigma = (
Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [γ, Δ, β_, P]


Sampling 4 chains for 1_000 tune and 1_000 draw iterations (4_000 + 4_000 draws total) took 462 seconds.
The rhat statistic is larger than 1.01 for some parameters. This indicates problems during sampling. See https://arxiv.org/abs/1903.08008 for details
The effective sample size per chain is smaller than 100 for some parameters.  A higher number is needed for reliable rhat and ess computation. See https://arxiv.org/abs/1903.08008 for details
There were 4000 divergences after tuning. Increase `target_accept` or reparameterize.
Sampling: [x]


Finished 23 / 38


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_player_moving_teams_priors.sigma = (
Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [γ, Δ, β_, P]


Sampling 4 chains for 1_000 tune and 1_000 draw iterations (4_000 + 4_000 draws total) took 471 seconds.
The rhat statistic is larger than 1.01 for some parameters. This indicates problems during sampling. See https://arxiv.org/abs/1903.08008 for details
The effective sample size per chain is smaller than 100 for some parameters.  A higher number is needed for reliable rhat and ess computation. See https://arxiv.org/abs/1903.08008 for details
There were 4000 divergences after tuning. Increase `target_accept` or reparameterize.
Sampling: [x]


Finished 24 / 38


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_player_moving_teams_priors.sigma = (
Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [γ, Δ, β_, P]


Sampling 4 chains for 1_000 tune and 1_000 draw iterations (4_000 + 4_000 draws total) took 488 seconds.
The rhat statistic is larger than 1.01 for some parameters. This indicates problems during sampling. See https://arxiv.org/abs/1903.08008 for details
The effective sample size per chain is smaller than 100 for some parameters.  A higher number is needed for reliable rhat and ess computation. See https://arxiv.org/abs/1903.08008 for details
There were 4000 divergences after tuning. Increase `target_accept` or reparameterize.
Sampling: [x]


Finished 25 / 38


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_player_moving_teams_priors.sigma = (
Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [γ, Δ, β_, P]


Sampling 4 chains for 1_000 tune and 1_000 draw iterations (4_000 + 4_000 draws total) took 574 seconds.
The rhat statistic is larger than 1.01 for some parameters. This indicates problems during sampling. See https://arxiv.org/abs/1903.08008 for details
The effective sample size per chain is smaller than 100 for some parameters.  A higher number is needed for reliable rhat and ess computation. See https://arxiv.org/abs/1903.08008 for details
There were 4000 divergences after tuning. Increase `target_accept` or reparameterize.
Sampling: [x]


Finished 26 / 38


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_player_moving_teams_priors.sigma = (
Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [γ, Δ, β_, P]


Sampling 4 chains for 1_000 tune and 1_000 draw iterations (4_000 + 4_000 draws total) took 602 seconds.
The rhat statistic is larger than 1.01 for some parameters. This indicates problems during sampling. See https://arxiv.org/abs/1903.08008 for details
The effective sample size per chain is smaller than 100 for some parameters.  A higher number is needed for reliable rhat and ess computation. See https://arxiv.org/abs/1903.08008 for details
There were 4000 divergences after tuning. Increase `target_accept` or reparameterize.
Sampling: [x]


Finished 27 / 38


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_player_moving_teams_priors.sigma = (
Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [γ, Δ, β_, P]


Sampling 4 chains for 1_000 tune and 1_000 draw iterations (4_000 + 4_000 draws total) took 630 seconds.
The rhat statistic is larger than 1.01 for some parameters. This indicates problems during sampling. See https://arxiv.org/abs/1903.08008 for details
The effective sample size per chain is smaller than 100 for some parameters.  A higher number is needed for reliable rhat and ess computation. See https://arxiv.org/abs/1903.08008 for details
There were 4000 divergences after tuning. Increase `target_accept` or reparameterize.
Sampling: [x]


Finished 28 / 38


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_player_moving_teams_priors.sigma = (
Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [γ, Δ, β_, P]


Sampling 4 chains for 1_000 tune and 1_000 draw iterations (4_000 + 4_000 draws total) took 615 seconds.
The rhat statistic is larger than 1.01 for some parameters. This indicates problems during sampling. See https://arxiv.org/abs/1903.08008 for details
The effective sample size per chain is smaller than 100 for some parameters.  A higher number is needed for reliable rhat and ess computation. See https://arxiv.org/abs/1903.08008 for details
There were 4000 divergences after tuning. Increase `target_accept` or reparameterize.
Sampling: [x]


Finished 29 / 38


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_player_moving_teams_priors.sigma = (
Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [γ, Δ, β_, P]


Sampling 4 chains for 1_000 tune and 1_000 draw iterations (4_000 + 4_000 draws total) took 558 seconds.
The rhat statistic is larger than 1.01 for some parameters. This indicates problems during sampling. See https://arxiv.org/abs/1903.08008 for details
The effective sample size per chain is smaller than 100 for some parameters.  A higher number is needed for reliable rhat and ess computation. See https://arxiv.org/abs/1903.08008 for details
There were 4000 divergences after tuning. Increase `target_accept` or reparameterize.
Sampling: [x]


Finished 30 / 38


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_player_moving_teams_priors.sigma = (
Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [γ, Δ, β_, P]


Sampling 4 chains for 1_000 tune and 1_000 draw iterations (4_000 + 4_000 draws total) took 622 seconds.
The rhat statistic is larger than 1.01 for some parameters. This indicates problems during sampling. See https://arxiv.org/abs/1903.08008 for details
The effective sample size per chain is smaller than 100 for some parameters.  A higher number is needed for reliable rhat and ess computation. See https://arxiv.org/abs/1903.08008 for details
There were 4000 divergences after tuning. Increase `target_accept` or reparameterize.
Sampling: [x]


Finished 31 / 38


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_player_moving_teams_priors.sigma = (
Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [γ, Δ, β_, P]


Sampling 4 chains for 1_000 tune and 1_000 draw iterations (4_000 + 4_000 draws total) took 609 seconds.
The rhat statistic is larger than 1.01 for some parameters. This indicates problems during sampling. See https://arxiv.org/abs/1903.08008 for details
The effective sample size per chain is smaller than 100 for some parameters.  A higher number is needed for reliable rhat and ess computation. See https://arxiv.org/abs/1903.08008 for details
There were 4000 divergences after tuning. Increase `target_accept` or reparameterize.
Sampling: [x]


Finished 32 / 38


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_player_moving_teams_priors.sigma = (
Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [γ, Δ, β_, P]


Sampling 4 chains for 1_000 tune and 1_000 draw iterations (4_000 + 4_000 draws total) took 637 seconds.
The rhat statistic is larger than 1.01 for some parameters. This indicates problems during sampling. See https://arxiv.org/abs/1903.08008 for details
The effective sample size per chain is smaller than 100 for some parameters.  A higher number is needed for reliable rhat and ess computation. See https://arxiv.org/abs/1903.08008 for details
There were 4000 divergences after tuning. Increase `target_accept` or reparameterize.
Sampling: [x]


Finished 33 / 38


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_player_moving_teams_priors.sigma = (
Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [γ, Δ, β_, P]


Sampling 4 chains for 1_000 tune and 1_000 draw iterations (4_000 + 4_000 draws total) took 614 seconds.
The rhat statistic is larger than 1.01 for some parameters. This indicates problems during sampling. See https://arxiv.org/abs/1903.08008 for details
The effective sample size per chain is smaller than 100 for some parameters.  A higher number is needed for reliable rhat and ess computation. See https://arxiv.org/abs/1903.08008 for details
There were 4000 divergences after tuning. Increase `target_accept` or reparameterize.
Sampling: [x]


Finished 34 / 38


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_player_moving_teams_priors.sigma = (
Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [γ, Δ, β_, P]


Sampling 4 chains for 1_000 tune and 1_000 draw iterations (4_000 + 4_000 draws total) took 617 seconds.
The rhat statistic is larger than 1.01 for some parameters. This indicates problems during sampling. See https://arxiv.org/abs/1903.08008 for details
The effective sample size per chain is smaller than 100 for some parameters.  A higher number is needed for reliable rhat and ess computation. See https://arxiv.org/abs/1903.08008 for details
There were 4000 divergences after tuning. Increase `target_accept` or reparameterize.
Sampling: [x]


Finished 35 / 38


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_player_moving_teams_priors.sigma = (
Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [γ, Δ, β_, P]


Sampling 4 chains for 1_000 tune and 1_000 draw iterations (4_000 + 4_000 draws total) took 670 seconds.
The rhat statistic is larger than 1.01 for some parameters. This indicates problems during sampling. See https://arxiv.org/abs/1903.08008 for details
The effective sample size per chain is smaller than 100 for some parameters.  A higher number is needed for reliable rhat and ess computation. See https://arxiv.org/abs/1903.08008 for details
There were 4000 divergences after tuning. Increase `target_accept` or reparameterize.
Sampling: [x]


Finished 36 / 38


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_player_moving_teams_priors.sigma = (
Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [γ, Δ, β_, P]


Sampling 4 chains for 1_000 tune and 1_000 draw iterations (4_000 + 4_000 draws total) took 685 seconds.
The rhat statistic is larger than 1.01 for some parameters. This indicates problems during sampling. See https://arxiv.org/abs/1903.08008 for details
The effective sample size per chain is smaller than 100 for some parameters.  A higher number is needed for reliable rhat and ess computation. See https://arxiv.org/abs/1903.08008 for details
There were 4000 divergences after tuning. Increase `target_accept` or reparameterize.
Sampling: [x]


Finished 37 / 38


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_player_moving_teams_priors.sigma = (
Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [γ, Δ, β_, P]


Sampling 4 chains for 1_000 tune and 1_000 draw iterations (4_000 + 4_000 draws total) took 641 seconds.
The rhat statistic is larger than 1.01 for some parameters. This indicates problems during sampling. See https://arxiv.org/abs/1903.08008 for details
The effective sample size per chain is smaller than 100 for some parameters.  A higher number is needed for reliable rhat and ess computation. See https://arxiv.org/abs/1903.08008 for details
There were 4000 divergences after tuning. Increase `target_accept` or reparameterize.
Sampling: [x]


Finished 38 / 38


In [39]:
df_test = add_to_df_test(df_test)

# Model Evaluation

In [40]:
df_test_set = df_test.query("start").reset_index(drop=True)

In [41]:
print(f"Number of predictions = {len(df_test_set)}")
print(f"Log likelihood = {df_test_set['log_likelihood'].sum(): ,.2f}")
print(f"MSE = {df_test_set['mse'].mean(): .5f}")

Number of predictions = 28078
Log likelihood =  78,777.33
MSE =  0.02822


In [42]:
position = "FW"
df_position = df_test_set.query("position == @position").reset_index(drop=True)
print(f"Number of predictions = {len(df_position)}")
print(f"Log likelihood = {df_position['log_likelihood'].sum(): ,.2f}")
print(f"MSE = {df_position['mse'].mean(): .5f}")

Number of predictions = 4059
Log likelihood =  1,744.97
MSE =  0.09220


In [43]:
player_id = "87935cf3"
df_player = df_test_set.query("player_id == @player_id").reset_index(drop=True)
print(f"Number of predictions = {len(df_player)}")
print(f"Log likelihood = {df_player['log_likelihood'].sum(): ,.2f}")
print(f"MSE = {df_player['mse'].mean(): .5f}")

Number of predictions = 2
Log likelihood =  7.03
MSE =  0.00197


In [44]:
home_team = "Manchester City"
away_team = "Brighton & Hove Albion"
home = 1
df_test_match = df_test_set.query("(home_team == @home_team) & (away_team == @away_team) & (home == @home)").reset_index(drop=True)
print(f"Number of predictions = {len(df_test_match)}")
print(f"Log likelihood = {df_test_match['log_likelihood'].sum(): ,.2f}")
print(f"MSE = {df_test_match['mse'].mean(): .5f}")

Number of predictions = 10
Log likelihood =  14.15
MSE =  0.02234


In [45]:
df_test_set[["player_id", "t", "position", "home_team", "away_team", "frac_90", "pred_npxg", "npxg"]]

Unnamed: 0,player_id,t,position,home_team,away_team,frac_90,pred_npxg,npxg
0,6eaed4eb,2,CB,Granada,Alavés,1.000,0.026,0.000
1,fd47b4f2,2,CB,Granada,Alavés,0.500,0.026,0.000
2,b7c99b9b,2,CB,Granada,Alavés,1.000,0.093,0.000
3,29b61deb,2,CM,Granada,Alavés,1.000,0.011,0.000
4,d637fc22,2,CM,Granada,Alavés,1.000,0.056,0.000
...,...,...,...,...,...,...,...,...
28073,04e17fd5,38,FW,Sevilla,Alavés,1.000,0.614,0.200
28074,a08b974a,38,W,Sevilla,Alavés,0.611,0.214,0.000
28075,4e219ad2,38,W,Sevilla,Alavés,0.811,0.071,0.100
28076,467bae22,38,WM,Sevilla,Alavés,0.611,0.064,0.000
