In [1]:
import json
import os
import tempfile

import cmdstanpy
import numpy as np

In [2]:
# Seasons whose squad files are loaded (currently only 2025).
SEASONS = list(range(2025, 2026))

# Processed result files sit two directories above the working directory.
data_path = os.path.join(os.getcwd(), "..", "..", "Data", "results", "processed")

# Player code -> 1-based index handed to Stan; index 1 is reserved for "None".
players_mapping = {"None": 1}
# Player code -> minutes accumulated over every loaded match.
players_time_on_match = {}
# One entry per game is appended to each of these lists while loading.
stan_data = {
    field: []
    for field in (
        "home_players",
        "home_players_minutes",
        "away_players",
        "away_players_minutes",
        "home_goals",
        "away_goals",
    )
}

# Minutes each side is topped up to with the "None" placeholder player.
# presumably 11 players x 90 minutes — TODO confirm
TEAM_MINUTES_TARGET = 990

for season in SEASONS:
    file_name = f"Serie_A_{season}_squads.json"
    with open(os.path.join(data_path, file_name)) as f:
        data = json.load(f)

    for game_data in data.values():
        # The result is split on " X " after upper-casing, giving home and away goals.
        score_parts = game_data["Summary"]["Result"].upper().split(" X ")
        home_goals, away_goals = (int(part) for part in score_parts)

        # Remove the summary so only the timed segments are left to iterate over.
        del game_data["Summary"]

        home_players = {}
        away_players = {}
        sides = (("Home", home_players), ("Away", away_players))

        for sub_game_data in game_data.values():
            sub_game_time = sub_game_data["Time"]
            if sub_game_time == 0:
                continue

            for side_key, side_minutes in sides:
                for player in sub_game_data[side_key]["Squad"]:
                    # First sighting of a player: hand out the next free index.
                    if player not in players_mapping:
                        players_mapping[player] = len(players_mapping) + 1
                    players_time_on_match[player] = (
                        players_time_on_match.get(player, 0) + sub_game_time
                    )
                    side_minutes[player] = side_minutes.get(player, 0) + sub_game_time

        # A side that logged fewer minutes than the target gets the remainder
        # assigned to the "None" placeholder.
        for side_minutes in (home_players, away_players):
            logged_minutes = sum(side_minutes.values())
            if logged_minutes < TEAM_MINUTES_TARGET:
                side_minutes["None"] = TEAM_MINUTES_TARGET - logged_minutes

        stan_data["home_players"].append(
            [players_mapping[code] for code in home_players]
        )
        stan_data["home_players_minutes"].append(list(home_players.values()))
        stan_data["away_players"].append(
            [players_mapping[code] for code in away_players]
        )
        stan_data["away_players_minutes"].append(list(away_players.values()))
        stan_data["home_goals"].append(home_goals)
        stan_data["away_goals"].append(away_goals)

# Longest line-up over both sides of every game; every row is padded to it
# so the per-game lists form rectangular arrays.
num_players_per_game = max(
    len(line_up)
    for line_up in stan_data["home_players"] + stan_data["away_players"]
)

stan_data.update(
    {
        "num_games": len(stan_data["home_goals"]),
        "num_players": len(players_mapping),
        "num_players_per_game": num_players_per_game,
    }
)

# Padding slots use player index 1 with 0 minutes.
padded_fields = (
    ("home_players", "home_players_minutes"),
    ("away_players", "away_players_minutes"),
)
for i in range(stan_data["num_games"]):
    for players_key, minutes_key in padded_fields:
        missing = num_players_per_game - len(stan_data[players_key][i])
        if missing > 0:
            stan_data[players_key][i].extend([1] * missing)
            stan_data[minutes_key][i].extend([0] * missing)

In [3]:
# Stan program: Poisson model of home and away goals per game.
#
# * Every player index has a log-skill `alpha`. `alpha[1]` is not sampled; it
#   is set to minus the sum of the remaining entries, so `alpha` sums to zero.
#   (Index 1 is the "None" placeholder in `players_mapping`.)
# * A side's log-skill is log_sum_exp over `alpha + log(minutes / 90)` of its
#   players. Slots with 0 minutes contribute -inf, i.e. they are ignored, which
#   is how the padding slots drop out.
# * Home log-rate = home skill - away skill + nu + correlation_strength;
#   away log-rate = away skill - home skill + correlation_strength.
#   `nu` therefore only shifts the home side, while `correlation_strength` is
#   added to both log-rates.
# * `generated quantities` repeats the same computation and accumulates the
#   total log-likelihood over all games into the scalar `log_lik`.
#
# NOTE(review): the model block has sampling statements for `raw_alpha` and
# `nu` only; `correlation_strength` has none, so only the likelihood
# constrains it — confirm this is intended.
STAN_CODE = """
data {
    int<lower=1> num_games;
    int<lower=1> num_players;
    int<lower=1> num_players_per_game;
    array[num_games, num_players_per_game] int<lower=1, upper=num_players> home_players;
    array[num_games, num_players_per_game] int<lower=1, upper=num_players> away_players;
    array[num_games, num_players_per_game] int<lower=0> home_players_minutes;
    array[num_games, num_players_per_game] int<lower=0> away_players_minutes;
    array[num_games] int<lower=0> home_goals;
    array[num_games] int<lower=0> away_goals;
}

transformed data {
    real log_90 = log(90);
}

parameters {
    vector[num_players - 1] raw_alpha;
    real nu;
    real correlation_strength;
}

transformed parameters {
    vector[num_players] alpha = append_row(-sum(raw_alpha), raw_alpha);
}

model {
    raw_alpha ~ normal(0, 0.1);
    nu ~ normal(0, 1);

    for (game in 1:num_games) {
        vector[num_players_per_game] log_terms_home;
        vector[num_players_per_game] log_terms_away;

        for (p in 1:num_players_per_game) {
            real mins_hp = home_players_minutes[game,p];
            real mins_ap = away_players_minutes[game,p];
            if (mins_hp > 0)
                log_terms_home[p] = alpha[home_players[game,p]] + log(mins_hp) - log_90;
            else
                log_terms_home[p] = negative_infinity();
            if (mins_ap > 0)
                log_terms_away[p] = alpha[away_players[game,p]] + log(mins_ap) - log_90;
            else
                log_terms_away[p] = negative_infinity();
        }

        real log_home_skill = log_sum_exp(log_terms_home);
        real log_away_skill = log_sum_exp(log_terms_away);

        real log_lambda_h = log_home_skill - log_away_skill + nu + correlation_strength;
        real log_lambda_a = log_away_skill - log_home_skill + correlation_strength;
        target += poisson_log_lpmf(home_goals[game] | log_lambda_h);
        target += poisson_log_lpmf(away_goals[game] | log_lambda_a);
    }
}

generated quantities {
    real log_lik = 0;
    for (game in 1:num_games) {
        vector[num_players_per_game] log_terms_home;
        vector[num_players_per_game] log_terms_away;

        for (p in 1:num_players_per_game) {
            real mins_hp = home_players_minutes[game,p];
            real mins_ap = away_players_minutes[game,p];
            if (mins_hp > 0)
                log_terms_home[p] = alpha[home_players[game,p]] + log(mins_hp) - log_90;
            else
                log_terms_home[p] = negative_infinity();
            if (mins_ap > 0)
                log_terms_away[p] = alpha[away_players[game,p]] + log(mins_ap) - log_90;
            else
                log_terms_away[p] = negative_infinity();
        }

        real log_home_skill = log_sum_exp(log_terms_home);
        real log_away_skill = log_sum_exp(log_terms_away);

        real log_lambda_h = log_home_skill - log_away_skill + nu + correlation_strength;
        real log_lambda_a = log_away_skill - log_home_skill + correlation_strength;
        log_lik += poisson_log_lpmf(home_goals[game] | log_lambda_h);
        log_lik += poisson_log_lpmf(away_goals[game] | log_lambda_a);
    }
}
"""

In [4]:
# NOTE(review): `ranks` and `posterior_means` are not read by any cell visible
# here; kept in case a later cell relies on them.
ranks = []
posterior_means = []

# Fixed seed so that Restart & Run All reproduces the same posterior draws.
SAMPLING_SEED = 20250101

# The Stan program is written to a temporary .stan file, which is what
# CmdStanModel is given to compile. delete=False keeps the file on disk after
# the `with` block closes it.
with tempfile.NamedTemporaryFile(mode="w", suffix=".stan", delete=False) as f:
    f.write(STAN_CODE)
    stan_file_path = f.name
try:
    model = cmdstanpy.CmdStanModel(stan_file=stan_file_path)
    fit = model.sample(
        data=stan_data,
        chains=1,
        iter_warmup=1_000,
        iter_sampling=1_000,
        seed=SAMPLING_SEED,
        show_progress=False,
    )

    # Draws of every Stan variable, keyed by variable name.
    samples = fit.stan_variables()
finally:
    # Only the temporary .stan source is removed here; the compiled executable
    # that CmdStan writes next to it is left in the temp directory.
    os.unlink(stan_file_path)

20:47:52 - cmdstanpy - INFO - compiling stan file /private/var/folders/67/qnnhzk_15ydg7t3gr6_lmx8r0000gq/T/tmp1pukw3va.stan to exe file /private/var/folders/67/qnnhzk_15ydg7t3gr6_lmx8r0000gq/T/tmp1pukw3va
20:48:00 - cmdstanpy - INFO - compiled model executable: /private/var/folders/67/qnnhzk_15ydg7t3gr6_lmx8r0000gq/T/tmp1pukw3va
20:48:01 - cmdstanpy - INFO - CmdStan start processing
20:48:01 - cmdstanpy - INFO - Chain [1] start processing
20:48:19 - cmdstanpy - INFO - Chain [1] done processing


In [5]:
# Distribution of each summary column (Mean, MCSE, ESS, R_hat, ...) taken over
# all rows of the fit summary, as a quick convergence overview.
fit.summary().describe()

Unnamed: 0,Mean,MCSE,StdDev,MAD,5%,50%,95%,ESS_bulk,ESS_tail,R_hat
count,1467.0,1467.0,1467.0,1467.0,1467.0,1467.0,1467.0,1467.0,1467.0,1467.0
mean,-1.729264,0.00292,0.11462,0.115254,-1.917592,-1.729161,-1.542305,2142.112099,650.658806,1.001704
std,47.355092,0.025158,0.481815,0.503112,48.042417,47.356454,46.690049,404.172566,101.673971,0.003294
min,-1452.83,0.001428,0.049584,0.048639,-1482.7,-1452.97,-1423.48,359.335,216.622,0.999001
25%,-0.00047,0.002032,0.097241,0.095951,-0.166664,-0.000653,0.159605,1846.735,592.558,0.999546
50%,0.002582,0.002176,0.099621,0.099744,-0.160174,0.002589,0.165834,2113.25,653.768,1.00046
75%,0.005687,0.002328,0.102036,0.103534,-0.153597,0.006334,0.172646,2414.08,715.062,1.00246
max,0.420364,0.961997,18.3477,19.1923,0.313074,0.420171,0.526105,3000.0,936.666,1.02693


In [6]:
# Posterior draws as a DataFrame; also used by the parameter table further down.
draws = fit.draws_pd()
# (min, mean, max) of the sampled `nu`.
tuple(summarise(draws["nu"]) for summarise in (np.min, np.mean, np.max))

(0.148021, 0.42036359000000006, 0.609404)

In [7]:
# Draw columns of the per-player skills: alpha[1] .. alpha[num_players].
cols = [f"alpha[{i}]" for i in range(1, stan_data["num_players"] + 1)]

# describe() already labels its statistics (count, mean, std, min, 25%, 50%,
# 75%, max), so after transposing only the former index column needs a name.
# The integer keys that used to be in this rename mapping never matched any
# column label and were silently ignored.
params = draws[cols].describe().T.reset_index().rename(columns={"index": "player"})

# "alpha[17]" -> "17": keep only the 1-based Stan index (still a string).
params["player"] = params["player"].apply(
    lambda x: x.replace("alpha[", "").replace("]", "")
)
# Spread between the largest and smallest sampled skill of each player.
params["amplitude"] = params["max"] - params["min"]

# Stan index -> player code, to translate the rows back to players.
reversed_players_mapping = {v: k for k, v in players_mapping.items()}
params["player_code"] = params["player"].apply(
    lambda x: reversed_players_mapping[int(x)]
)

# Total minutes recorded for the player; 0 when no minutes were recorded
# (e.g. the "None" placeholder).
params["time_played"] = params["player_code"].apply(
    lambda x: players_time_on_match.get(x, 0)
)

params.sort_values("mean", ascending=False)

Unnamed: 0,player,count,mean,std,min,25%,50%,75%,max,amplitude,player_code,time_played
1,2,1000.0,0.049056,0.095428,-0.213067,-0.018527,0.048728,0.119548,0.383431,0.596498,815100,3330
3,4,1000.0,0.045134,0.097854,-0.234839,-0.019822,0.041938,0.114348,0.322389,0.557228,310373,2954
361,362,1000.0,0.034532,0.097432,-0.278530,-0.031514,0.033814,0.102326,0.337942,0.616472,521990,2402
2,3,1000.0,0.030996,0.097941,-0.304191,-0.036555,0.030195,0.098587,0.366691,0.670882,422469,2248
439,440,1000.0,0.028580,0.102570,-0.311524,-0.046028,0.031665,0.103593,0.290960,0.602484,778033,1326
...,...,...,...,...,...,...,...,...,...,...,...,...
281,282,1000.0,-0.023057,0.103750,-0.338408,-0.088137,-0.023520,0.043758,0.300130,0.638538,302192,2920
124,125,1000.0,-0.024434,0.092189,-0.308418,-0.088001,-0.024855,0.042869,0.241373,0.549791,517052,1782
108,109,1000.0,-0.026017,0.100752,-0.354330,-0.094922,-0.026287,0.044643,0.293845,0.648175,331937,2549
109,110,1000.0,-0.028364,0.097624,-0.449113,-0.089703,-0.028856,0.034280,0.238830,0.687943,896161,2818


In [8]:
# 0-based positions of the 20 players with the highest posterior-mean skill,
# best first.
top_20 = np.argsort(np.mean(samples["alpha"], axis=0))[::-1][:20]

# Stan index -> player code. setdefault keeps the first code seen for an index.
code_by_index = {}
for player_id, stan_index in players_mapping.items():
    code_by_index.setdefault(stan_index, player_id)

# Stan indices are 1-based, hence the `+ 1` on the numpy position.
top_20_players = [
    (code_by_index[position + 1], np.mean(samples["alpha"][:, position]))
    for position in top_20
    if position + 1 in code_by_index
]

# Resolve the descriptive record of every listed player in a single pass:
# each season file is parsed once (most recent season first) instead of once
# per player, and the first record found for a code wins.
wanted_codes = {player_id for player_id, _ in top_20_players}
info_by_code = {}
for season in SEASONS[::-1]:
    if len(info_by_code) == len(wanted_codes):
        break  # everyone was already found in a more recent season

    file_name_games = f"Serie_A_{season}_games.json"
    with open(os.path.join(data_path, file_name_games)) as f:
        games = json.load(f)

    for game in games.values():
        for player_info in game["Players"]:
            # The last six characters of the first field are compared with the player code.
            code = player_info[0][-6:]
            if code in wanted_codes and code not in info_by_code:
                info_by_code[code] = player_info

        if len(info_by_code) == len(wanted_codes):
            break

for i, (player, force) in enumerate(top_20_players):
    # `info` stays None for codes that appear in no games file.
    info = info_by_code.get(player)
    time_played = players_time_on_match.get(player, 0)
    print(i + 1, player, force, info, time_played)

1 815100 0.049055783095 ['1ROSSI Agustin Daniel Rossi T(g)P815100', 'Flamengo / RJ'] 3330
2 310373 0.04513419067695 ['4Leo Pereira Leonardo Pereira TP310373', 'Flamengo / RJ'] 2954
3 521990 0.03453176750439999 ['10De Arrascaeta Giorgian Daniel de A ... TP521990', 'Flamengo / RJ'] 2402
4 422469 0.0309958263539 ['3Léo Ortiz Leonardo Rech Ortiz TP422469', 'Flamengo / RJ'] 2248
5 778033 0.028580243021000002 ['2Varela Guillermo Varela Olivera RP778033', 'Flamengo / RJ'] 1326
6 424455 0.02712864974942 ['7Luiz Luiz de Araujo Guima ... TP424455', 'Flamengo / RJ'] 1712
7 837958 0.026661899025099998 ['25Villalba Lucas Hernan Villalba RP837958', 'Cruzeiro / MG'] 2864
8 546294 0.02649000874 ['88Christian Christian Roberto Al ... TP546294', 'Cruzeiro / MG'] 2387
9 169050 0.026414513348 ['21Weverton Weverton Pereira da Silva T(g)P169050', 'Palmeiras / SP'] 2340
10 320111 0.0253965179803 ['16Lucas Silva Lucas Silva Borges RP320111', 'Cruzeiro / MG'] 2752
11 582558 0.025254360990000002 ['3Jemmes Jemme

In [9]:
# 0-based positions of the 20 players with the LOWEST posterior-mean skill,
# worst first. The `top_20` / `top_20_players` names are kept unchanged for
# compatibility even though this cell lists the bottom of the ranking.
top_20 = np.argsort(np.mean(samples["alpha"], axis=0))[:20]

# Reverse lookup from Stan index to player code; the first code seen for an
# index is the one kept.
index_to_code = {}
for player_id, stan_index in players_mapping.items():
    index_to_code.setdefault(stan_index, player_id)

# numpy positions are 0-based while Stan indices start at 1.
top_20_players = [
    (index_to_code[position + 1], np.mean(samples["alpha"][:, position]))
    for position in top_20
    if position + 1 in index_to_code
]

# Look up all listed players together so each season's games file is read and
# parsed only once. Seasons are visited newest first and the first record
# found for a code is kept.
codes_to_find = {player_id for player_id, _ in top_20_players}
found_info = {}
for season in SEASONS[::-1]:
    if len(found_info) == len(codes_to_find):
        break  # nothing left to look for in older seasons

    file_name_games = f"Serie_A_{season}_games.json"
    with open(os.path.join(data_path, file_name_games)) as f:
        games = json.load(f)

    for game in games.values():
        for player_info in game["Players"]:
            # The last six characters of the first field are compared with the player code.
            code = player_info[0][-6:]
            if code in codes_to_find and code not in found_info:
                found_info[code] = player_info

        if len(found_info) == len(codes_to_find):
            break

for i, (player, force) in enumerate(top_20_players):
    # Codes that appear in no games file (e.g. the "None" placeholder) print None.
    info = found_info.get(player)
    time_played = players_time_on_match.get(player, 0)
    print(i + 1, player, force, info, time_played)

1 None -2.4015896866000004 None 0
2 896161 -0.028364474088 ['14RIVERA Christian Hernando R ... TP896161', 'Sport / PE'] 2818
3 331937 -0.0260171247113 ['10Lucas Lima Lucas Rafael Araujo Lima TP331937', 'Sport / PE'] 2549
4 517052 -0.0244339626491 ['33Matheus Al ... Matheus Alexandre An ... RP517052', 'Sport / PE'] 1782
5 302192 -0.023057140638 ['16Jadson Jadson Alves dos Santos TP302192', 'Juventude / RS'] 2920
6 398805 -0.022496192725 ['17Matheusinho Matheus Leonardo Sal ... TP398805', 'Sport / PE'] 1702
7 337242 -0.021298963746 ['21Lucas Lucas Kal Schenfeld  ... TP337242', 'Sport / PE'] 904
8 313474 -0.020730656645 ['15R. Thyere Rafael Thyere de Alb ... RP313474', 'Sport / PE'] 2151
9 539820 -0.0190872229611 ['44Luiz Luiz Gustavo da Silv ... TP539820', 'Juventude / RS'] 2407
10 315345 -0.018344143136000003 ['21Gabriel Gabriel Vasconcelos  ... T(g)P315345', 'Vitória / BA'] 2160
11 402095 -0.0183094626791 ['40Ramon Menezes Ramon Menezes Roma RP402095', 'Ceará / CE'] 1923
12 546794 -0.0