In [1]:
import json
import os
import tempfile

import cmdstanpy
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Build the Stan input: one row per game holding the ids of the players used by
# each side, their accumulated "Time" and the goals scored by each side.
SEASONS = [*range(2025, 2026)]

# Total "Time" a side is topped up to; presumably 11 players x 90 minutes — confirm.
FULL_TEAM_MINUTES = 990
# Placeholder entry that absorbs the missing time of a side below the target.
PLACEHOLDER_PLAYER = "None"

data_path = os.path.join(os.getcwd(), "..", "..", "Data", "results", "processed")

# Player code -> 1-based Stan id; the placeholder always owns id 1.
players_mapping = {PLACEHOLDER_PLAYER: 1}
# Player code -> total "Time" accumulated over every loaded game.
players_time_on_match = {}
stan_data = {
    "home_players": [],
    "home_players_minutes": [],
    "away_players": [],
    "away_players_minutes": [],
    "home_goals": [],
    "away_goals": [],
}

for season in SEASONS:
    file_name = f"Serie_A_{season}_squads.json"
    with open(os.path.join(data_path, file_name)) as f:
        data = json.load(f)

    for game_data in data.values():
        # "Summary" carries the score ("<home> X <away>"); every remaining entry
        # is a segment with a "Time" and the "Home"/"Away" squads.
        summary = game_data.pop("Summary")
        home_goals, away_goals = map(int, summary["Result"].upper().split(" X "))

        home_players = {}
        away_players = {}
        sides = (("Home", home_players), ("Away", away_players))

        for sub_game_data in game_data.values():
            sub_game_time = sub_game_data["Time"]
            if sub_game_time == 0:
                continue

            # Home is processed before Away inside each segment so that ids are
            # handed out in the order players are first seen.
            for side_key, side_minutes in sides:
                for player in sub_game_data[side_key]["Squad"]:
                    players_mapping.setdefault(player, len(players_mapping) + 1)
                    players_time_on_match[player] = (
                        players_time_on_match.get(player, 0) + sub_game_time
                    )
                    side_minutes[player] = side_minutes.get(player, 0) + sub_game_time

        # Top each side up to the target total with the placeholder player.
        for _, side_minutes in sides:
            missing_minutes = FULL_TEAM_MINUTES - sum(side_minutes.values())
            if missing_minutes > 0:
                side_minutes[PLACEHOLDER_PLAYER] = missing_minutes

        stan_data["home_players"].append([players_mapping[x] for x in home_players])
        stan_data["home_players_minutes"].append(list(home_players.values()))
        stan_data["away_players"].append([players_mapping[x] for x in away_players])
        stan_data["away_players_minutes"].append(list(away_players.values()))
        stan_data["home_goals"].append(home_goals)
        stan_data["away_goals"].append(away_goals)

# Width of the rectangular per-game arrays: the largest number of distinct
# entries any side has in any game.
num_players_per_game = max(
    len(squad) for squad in stan_data["home_players"] + stan_data["away_players"]
)

stan_data["num_games"] = len(stan_data["home_goals"])
stan_data["num_players"] = len(players_mapping)
stan_data["num_players_per_game"] = num_players_per_game

# Pad shorter rows with the placeholder id and zero minutes so that every row
# has exactly num_players_per_game slots.
placeholder_id = players_mapping[PLACEHOLDER_PLAYER]
for side in ("home", "away"):
    rows = zip(stan_data[f"{side}_players"], stan_data[f"{side}_players_minutes"])
    for squad_ids, squad_minutes in rows:
        padding = num_players_per_game - len(squad_ids)
        squad_ids.extend([placeholder_id] * padding)
        squad_minutes.extend([0] * padding)

In [None]:
# Stan program: Poisson model of goals scored, driven by per-player strengths.
#
# - data: for every game, fixed-width (num_players_per_game) arrays of player
#   ids and minutes for the home and away sides, plus the goals of each side.
# - alpha: one strength per player id. alpha[1] is set to -sum(raw_alpha), so
#   the alpha vector sums to zero; only raw_alpha and nu get explicit
#   normal(0, 1) priors.
# - team log-skill: log_sum_exp over the slots of alpha + log(minutes / 90);
#   slots with zero minutes are set to -inf and therefore contribute nothing.
# - rates: log(lambda_home) = home skill - away skill + nu and
#   log(lambda_away) = away skill - home skill. nu enters only the home rate
#   (presumably a home-advantage term — confirm).
# - generated quantities: repeats the same rate computation and accumulates
#   the log-likelihood of all games into the single scalar log_lik.
STAN_CODE = """
data {
    int<lower=1> num_games;
    int<lower=1> num_players;
    int<lower=1> num_players_per_game;
    array[num_games, num_players_per_game] int<lower=1, upper=num_players> home_players;
    array[num_games, num_players_per_game] int<lower=1, upper=num_players> away_players;
    array[num_games, num_players_per_game] int<lower=0> home_players_minutes;
    array[num_games, num_players_per_game] int<lower=0> away_players_minutes;
    array[num_games] int<lower=0> home_goals;
    array[num_games] int<lower=0> away_goals;
}

transformed data {
    real log_90 = log(90);
}

parameters {
    vector[num_players - 1] raw_alpha;
    real nu;
}

transformed parameters {
    vector[num_players] alpha = append_row(-sum(raw_alpha), raw_alpha);
}

model {
    raw_alpha ~ normal(0, 1);
    nu ~ normal(0, 1);

    for (game in 1:num_games) {
        vector[num_players_per_game] log_terms_home;
        vector[num_players_per_game] log_terms_away;

        for (p in 1:num_players_per_game) {
            real mins_hp = home_players_minutes[game,p];
            real mins_ap = away_players_minutes[game,p];
            if (mins_hp > 0)
                log_terms_home[p] = alpha[home_players[game,p]] + log(mins_hp) - log_90;
            else
                log_terms_home[p] = negative_infinity();
            if (mins_ap > 0)
                log_terms_away[p] = alpha[away_players[game,p]] + log(mins_ap) - log_90;
            else
                log_terms_away[p] = negative_infinity();
        }

        real log_home_skill = log_sum_exp(log_terms_home);
        real log_away_skill = log_sum_exp(log_terms_away);

        real log_lambda_home = log_home_skill - log_away_skill + nu;
        real log_lambda_away = log_away_skill - log_home_skill;
        target += poisson_log_lpmf(home_goals[game] | log_lambda_home);
        target += poisson_log_lpmf(away_goals[game] | log_lambda_away);
    }
}

generated quantities {
    real log_lik = 0;
    for (game in 1:num_games) {
        vector[num_players_per_game] log_terms_home;
        vector[num_players_per_game] log_terms_away;

        for (p in 1:num_players_per_game) {
            real mins_hp = home_players_minutes[game,p];
            real mins_ap = away_players_minutes[game,p];
            if (mins_hp > 0)
                log_terms_home[p] = alpha[home_players[game,p]] + log(mins_hp) - log_90;
            else
                log_terms_home[p] = negative_infinity();
            if (mins_ap > 0)
                log_terms_away[p] = alpha[away_players[game,p]] + log(mins_ap) - log_90;
            else
                log_terms_away[p] = negative_infinity();
        }

        real log_home_skill = log_sum_exp(log_terms_home);
        real log_away_skill = log_sum_exp(log_terms_away);

        real log_lambda_home = log_home_skill - log_away_skill + nu;
        real log_lambda_away = log_away_skill - log_home_skill;
        log_lik += poisson_log_lpmf(home_goals[game] | log_lambda_home);
        log_lik += poisson_log_lpmf(away_goals[game] | log_lambda_away);
    }
}
"""

In [4]:
# Compile the Stan program and draw posterior samples with a single chain.
ranks = []
posterior_means = []

# Fixed RNG seed so that Restart & Run All reproduces the same draws.
SAMPLING_SEED = 42

# The .stan source and the executable compiled next to it both live in a
# throwaway directory, so nothing is left behind once sampling has finished
# (unlinking only the .stan file would leave the compiled binary on disk).
with tempfile.TemporaryDirectory() as stan_dir:
    stan_file_path = os.path.join(stan_dir, "player_strength.stan")
    with open(stan_file_path, "w") as f:
        f.write(STAN_CODE)

    model = cmdstanpy.CmdStanModel(stan_file=stan_file_path)
    fit = model.sample(
        data=stan_data,
        chains=1,
        iter_warmup=1_000,
        iter_sampling=1_000,
        seed=SAMPLING_SEED,
        show_progress=False,
    )

    # Dict of variable name -> array of posterior draws.
    samples = fit.stan_variables()

23:48:51 - cmdstanpy - INFO - compiling stan file /private/var/folders/67/qnnhzk_15ydg7t3gr6_lmx8r0000gq/T/tmpqp36y7_i.stan to exe file /private/var/folders/67/qnnhzk_15ydg7t3gr6_lmx8r0000gq/T/tmpqp36y7_i
23:49:06 - cmdstanpy - INFO - compiled model executable: /private/var/folders/67/qnnhzk_15ydg7t3gr6_lmx8r0000gq/T/tmpqp36y7_i
23:49:06 - cmdstanpy - INFO - CmdStan start processing
23:49:06 - cmdstanpy - INFO - Chain [1] start processing
23:50:20 - cmdstanpy - INFO - Chain [1] done processing
	Chain 1 had 65 divergent transitions (6.5%)
	Use the "diagnose()" method on the CmdStanMCMC object to see further information.


In [5]:
# Summarise the fit per parameter, then describe how those summary statistics
# are distributed across all parameters.
fit_summary = fit.summary()
fit_summary.describe()

Unnamed: 0,Mean,MCSE,StdDev,MAD,5%,50%,95%,ESS_bulk,ESS_tail,R_hat
count,1462.0,1462.0,1462.0,1462.0,1462.0,1462.0,1462.0,1462.0,1462.0,1462.0
mean,-1.62825,0.028411,0.979051,1.002246,-3.286023,-1.597658,-0.078808,1292.216653,661.089766,1.001513
std,44.882229,0.036273,0.633404,0.651479,45.671148,44.882688,44.091633,206.043976,104.73453,0.003034
min,-1376.37,0.001368,0.043987,0.04329,-1408.38,-1376.47,-1343.9,241.36,273.128,0.999001
25%,-0.099744,0.024974,0.903465,0.91346,-1.6818,-0.077211,1.343047,1162.145,593.794,0.999581
50%,0.011014,0.02663,0.951507,0.968245,-1.596205,0.028243,1.54256,1287.3,666.039,1.00048
75%,0.118884,0.028576,0.996732,1.02744,-1.50429,0.155335,1.71509,1418.045,737.211,1.00217
max,0.867601,1.2653,19.3976,19.6667,0.256621,1.02916,2.779,2250.83,944.195,1.01819


In [6]:
# All posterior draws as a DataFrame, then the (min, mean, max) of the nu draws.
draws = fit.draws_pd()
nu_draws = draws["nu"]
tuple(statistic(nu_draws) for statistic in (np.min, np.mean, np.max))

(np.float64(0.188861), np.float64(0.327728205), np.float64(0.463656))

In [7]:
# Per-player table of posterior statistics for alpha, strongest players first.
cols = [f"alpha[{i}]" for i in range(1, stan_data["num_players"] + 1)]

# describe().T already yields named columns (count, mean, std, min, 25%, 50%,
# 75%, max), so only the index column produced by reset_index needs renaming.
params = draws[cols].describe().T.reset_index().rename(columns={"index": "player"})

# Strip the "alpha[...]" wrapper to keep just the 1-based Stan id. regex=False
# makes the literal match explicit: "alpha[" is not a valid regular expression.
params["player"] = (
    params["player"]
    .str.replace("alpha[", "", regex=False)
    .str.replace("]", "", regex=False)
)
# Spread between the largest and smallest draw of each alpha.
params["amplitude"] = params["max"] - params["min"]

# Stan id -> player code, to label each row with the original identifier.
reversed_players_mapping = {v: k for k, v in players_mapping.items()}
params["player_code"] = params["player"].apply(
    lambda x: reversed_players_mapping[int(x)]
)

# Codes without recorded time (such as the placeholder entry) get 0.
params["time_played"] = params["player_code"].apply(
    lambda x: players_time_on_match.get(x, 0)
)

params.sort_values("mean", ascending=False)

Unnamed: 0,player,count,mean,std,min,25%,50%,75%,max,amplitude,player_code,time_played
382,383,1000.0,0.867601,1.073499,-2.72517,0.111553,0.954535,1.723895,3.17240,5.89757,546294,2387
259,260,1000.0,0.862910,1.032221,-2.72829,0.128505,0.991383,1.645142,2.85256,5.58085,346636,2408
80,81,1000.0,0.860854,1.130849,-2.80095,0.096355,0.951920,1.755335,3.00754,5.80849,297016,1689
3,4,1000.0,0.836094,1.150758,-2.68697,-0.007832,0.926793,1.785553,3.13053,5.81750,310373,2774
41,42,1000.0,0.832445,0.992481,-2.48950,0.203741,1.029165,1.549943,2.64750,5.13700,511021,2492
...,...,...,...,...,...,...,...,...,...,...,...,...
108,109,1000.0,-0.455564,0.823500,-3.86384,-0.986580,-0.385961,0.128522,1.86544,5.72928,331937,2549
403,404,1000.0,-0.533204,0.813877,-3.31971,-1.053602,-0.467025,0.031631,1.61246,4.93217,526090,1949
124,125,1000.0,-0.590521,0.828544,-3.05409,-1.126677,-0.546936,0.009075,1.41987,4.47396,517052,1782
554,555,1000.0,-0.597404,0.831486,-3.59760,-1.116382,-0.560024,-0.030766,1.20910,4.80670,337242,904


In [8]:
# Print the 20 players with the highest posterior-mean alpha, best first.

# 0-based column indices of samples["alpha"], sorted by descending posterior mean.
top_20 = np.argsort(np.mean(samples["alpha"], axis=0))[::-1][:20]

# Stan ids are 1-based, so column `idx` of samples["alpha"] belongs to id idx + 1.
id_to_player = {stan_id: player_id for player_id, stan_id in players_mapping.items()}
top_20_players = [
    (id_to_player[int(idx) + 1], np.mean(samples["alpha"][:, idx]))
    for idx in top_20
    if int(idx) + 1 in id_to_player
]

# Season -> parsed games file. Each file is read at most once, and only when a
# lookup actually reaches that season, instead of once per listed player.
games_by_season = {}

for rank, (player, force) in enumerate(top_20_players, start=1):
    info = None
    # Newest season first; stop at the first season that lists the player.
    for season in reversed(SEASONS):
        if season not in games_by_season:
            file_name_games = f"Serie_A_{season}_games.json"
            with open(os.path.join(data_path, file_name_games)) as f:
                games_by_season[season] = json.load(f)

        # A "Players" entry matches when its first field ends with the
        # six-character player code.
        info = next(
            (
                player_info
                for game in games_by_season[season].values()
                for player_info in game["Players"]
                if player_info[0][-6:] == player
            ),
            None,
        )
        if info:
            break

    time_played = players_time_on_match.get(player, 0)
    print(rank, player, force, info, time_played)

1 546294 0.867600962771 ['88Christian Christian Roberto Al ... TP546294', 'Cruzeiro / MG'] 2387
2 346636 0.86290990124 ['6Rene Rene Rodrigues Martins TP346636', 'Fluminense / RJ'] 2408
3 297016 0.8608544305200001 ['13Alex Telles Alex Nicolao Telles TP297016', 'Botafogo / RJ'] 1689
4 310373 0.83609416108 ['4Leo Pereira Leonardo Pereira TP310373', 'Flamengo / RJ'] 2774
5 511021 0.832444846 ['96Paulo Henr ... Paulo Henrique de Ol ... TP511021', 'Vasco da Gama / RJ'] 2492
6 815100 0.81054146862 ['1ROSSI Agustin Daniel Rossi T(g)P815100', 'Flamengo / RJ'] 3150
7 303716 0.7932759858790001 ['9Pedro Pedro Guilherme Abre ... RP303716', 'Flamengo / RJ'] 1011
8 458582 0.76692455561 ['2Vitinho Victor Alexander da Silva TP458582', 'Botafogo / RJ'] 2875
9 176390 0.760824045907 ['2Marcos Rocha Marcos Luis Rocha Aquino RP176390', 'Palmeiras / SP'] 1050
10 435315 0.71555114447 ['44Gabriel  B ... Gabriel Baralhas dos ... TP435315', 'Vitória / BA'] 2274
11 548055 0.67776156741 ['30Pierre Wagner Pierre Wa

In [9]:
# Print the 20 entries with the lowest posterior-mean alpha, worst first.
# NOTE: the names top_20 / top_20_players are kept unchanged for compatibility,
# but in this cell they hold the LOWEST-ranked entries. The placeholder entry
# of players_mapping is ranked like any other id and can appear in this list.

# 0-based column indices of samples["alpha"], sorted by ascending posterior mean.
top_20 = np.argsort(np.mean(samples["alpha"], axis=0))[:20]

# Stan ids are 1-based, so column `idx` of samples["alpha"] belongs to id idx + 1.
id_to_player = {stan_id: player_id for player_id, stan_id in players_mapping.items()}
top_20_players = [
    (id_to_player[int(idx) + 1], np.mean(samples["alpha"][:, idx]))
    for idx in top_20
    if int(idx) + 1 in id_to_player
]

# Season -> parsed games file. Each file is read at most once, and only when a
# lookup actually reaches that season, instead of once per listed player.
games_by_season = {}

for rank, (player, force) in enumerate(top_20_players, start=1):
    info = None
    # Newest season first; stop at the first season that lists the player.
    for season in reversed(SEASONS):
        if season not in games_by_season:
            file_name_games = f"Serie_A_{season}_games.json"
            with open(os.path.join(data_path, file_name_games)) as f:
                games_by_season[season] = json.load(f)

        # A "Players" entry matches when its first field ends with the
        # six-character player code.
        info = next(
            (
                player_info
                for game in games_by_season[season].values()
                for player_info in game["Players"]
                if player_info[0][-6:] == player
            ),
            None,
        )
        if info:
            break

    time_played = players_time_on_match.get(player, 0)
    print(rank, player, force, info, time_played)

1 None -21.221424048 None 0
2 337242 -0.5974037687599999 ['21Lucas Lucas Kal Schenfeld  ... TP337242', 'Sport / PE'] 904
3 517052 -0.59052136194 ['33Matheus Al ... Matheus Alexandre An ... RP517052', 'Sport / PE'] 1782
4 526090 -0.5332040949319999 ['2Ramon Ramon Ramos Lima RP526090', 'Internacional / RS'] 1949
5 331937 -0.45556408272 ['10Lucas Lima Lucas Rafael Araujo Lima TP331937', 'Sport / PE'] 2549
6 321898 -0.442130130578 ['6Pepe Joao Pedro Vilardi Pinto TP321898', 'Vitória / BA'] 608
7 187597 -0.42727062147000006 ['28Alan Ruschel Alan Luciano Ruschel TP187597', 'Juventude / RS'] 1275
8 337840 -0.420040592191 ['79Renato Kayzer Renato Kayzer de Souza RP337840', 'Vitória / BA'] 2042
9 398805 -0.414414309689 ['17Matheusinho Matheus Leonardo Sal ... TP398805', 'Sport / PE'] 1702
10 658643 -0.413560700129 ['85Mateus Car ... Mateus Carvalho dos  ... RP658643', 'Vasco da Gama / RJ'] 590
11 548978 -0.401109121812 ['25Luis Luís Fellipe Campos  ... RP548978', 'Santos / SP'] 441
12 307317 -0