In [1]:
import json
import os
import tempfile

import cmdstanpy
import numpy as np
import plotly.express as px

In [2]:
SEASONS = [*range(2025, 2026)]

# Player-minutes every side must add up to per game; any shortfall is booked on the
# "None" placeholder. Presumably 11 players x 90 minutes -- TODO confirm.
FULL_TEAM_MINUTES = 990
# Index reserved for the "None" placeholder; also used to pad short line-up rows.
NONE_PLAYER_INDEX = 1

data_path = os.path.join(os.getcwd(), "..", "..", "Data", "results", "processed")

# player id -> 1-based index handed to Stan (indices are assigned in order of first appearance).
players_mapping = {"None": NONE_PLAYER_INDEX}
# player id -> total "Time" accumulated over every game of every loaded season.
players_time_on_match = {}
stan_data = {
    "home_players": [],
    "home_players_minutes": [],
    "away_players": [],
    "away_players_minutes": [],
    "home_goals": [],
    "away_goals": [],
}


def _credit_squad_minutes(squad, minutes, team_minutes):
    """Register each player of ``squad`` and credit them with ``minutes``.

    Updates the notebook-level ``players_mapping`` (new players get the next free
    index) and ``players_time_on_match`` (running total), plus the per-game
    ``team_minutes`` dict passed in by the caller.
    """
    for player in squad:
        players_mapping.setdefault(player, len(players_mapping) + 1)
        players_time_on_match[player] = players_time_on_match.get(player, 0) + minutes
        team_minutes[player] = team_minutes.get(player, 0) + minutes


for season in SEASONS:
    file_name = f"Serie_A_{season}_squads.json"
    with open(os.path.join(data_path, file_name), encoding="utf-8") as f:
        data = json.load(f)

    for game_data in data.values():
        # "Result" is formatted like "<home> X <away>" (case-insensitive separator).
        home_goals, away_goals = map(
            int, game_data["Summary"]["Result"].upper().split(" X ")
        )
        home_players = {}
        away_players = {}
        for section, sub_game_data in game_data.items():
            # "Summary" carries the result rather than a line-up, and zero-length
            # segments contribute no playing time.
            if section == "Summary" or sub_game_data["Time"] == 0:
                continue

            sub_game_time = sub_game_data["Time"]
            _credit_squad_minutes(
                sub_game_data["Home"]["Squad"], sub_game_time, home_players
            )
            _credit_squad_minutes(
                sub_game_data["Away"]["Squad"], sub_game_time, away_players
            )

        # Book any missing player-minutes on the placeholder player.
        for team_minutes in (home_players, away_players):
            shortfall = FULL_TEAM_MINUTES - sum(team_minutes.values())
            if shortfall > 0:
                team_minutes["None"] = shortfall

        stan_data["home_players"].append([players_mapping[x] for x in home_players])
        stan_data["home_players_minutes"].append(list(home_players.values()))
        stan_data["away_players"].append([players_mapping[x] for x in away_players])
        stan_data["away_players_minutes"].append(list(away_players.values()))
        stan_data["home_goals"].append(home_goals)
        stan_data["away_goals"].append(away_goals)

# Widest line-up seen on either side; every row is padded to this width below.
num_players_per_game = max(
    [len(x) for x in stan_data["home_players"]]
    + [len(x) for x in stan_data["away_players"]]
)

stan_data["num_games"] = len(stan_data["home_goals"])
stan_data["num_players"] = len(players_mapping)
stan_data["num_players_per_game"] = num_players_per_game

# Pad with the placeholder player at zero minutes so all rows have equal length.
for side in ("home", "away"):
    for player_ids, minutes in zip(
        stan_data[f"{side}_players"], stan_data[f"{side}_players_minutes"]
    ):
        missing = num_players_per_game - len(player_ids)
        player_ids.extend([NONE_PLAYER_INDEX] * missing)
        minutes.extend([0] * missing)

In [3]:
# Read the source of the selected player-level Stan model into memory.
model_name = "poisson_7"
models_dir = os.path.join(os.getcwd(), "..", "models", "player_level")
file_path = os.path.join(models_dir, f"{model_name}.stan")
with open(file_path) as stan_source:
    stan_code = stan_source.read()

In [4]:
# Not used in this cell; kept in case a later cell fills them.
ranks = []
posterior_means = []

# Fixed sampler seed so that Restart & Run All reproduces the same posterior draws.
SAMPLER_SEED = 42

# The model is compiled from a temporary copy of the Stan source; the copy is
# removed again whether or not compilation/sampling succeeds.
with tempfile.NamedTemporaryFile(mode="w", suffix=".stan", delete=False) as f:
    f.write(stan_code)
    stan_file_path = f.name
try:
    model = cmdstanpy.CmdStanModel(stan_file=stan_file_path)
    fit = model.sample(
        data=stan_data,
        chains=1,
        iter_warmup=10_000,
        iter_sampling=10_000,
        seed=SAMPLER_SEED,
        show_progress=False,
    )

    # Dict of variable name -> array of draws, used by the ranking cells.
    samples = fit.stan_variables()
finally:
    os.unlink(stan_file_path)

23:59:45 - cmdstanpy - INFO - compiling stan file /private/var/folders/67/qnnhzk_15ydg7t3gr6_lmx8r0000gq/T/tmp7qgdi_oo.stan to exe file /private/var/folders/67/qnnhzk_15ydg7t3gr6_lmx8r0000gq/T/tmp7qgdi_oo
23:59:54 - cmdstanpy - INFO - compiled model executable: /private/var/folders/67/qnnhzk_15ydg7t3gr6_lmx8r0000gq/T/tmp7qgdi_oo
23:59:57 - cmdstanpy - INFO - CmdStan start processing
23:59:57 - cmdstanpy - INFO - Chain [1] start processing
00:02:40 - cmdstanpy - INFO - Chain [1] done processing


In [5]:
# Posterior draws as a DataFrame; later cells select columns such as "nu" and "alpha[i]".
draws = fit.draws_pd()

# Spread of the per-parameter summary statistics across all parameters.
fit_summary = fit.summary()
fit_summary.describe()

Unnamed: 0,Mean,MCSE,StdDev,MAD,5%,50%,95%,ESS_bulk,ESS_tail,R_hat
count,1467.0,1467.0,1467.0,1467.0,1467.0,1467.0,1467.0,1467.0,1467.0,1467.0
mean,-1.729691,0.000988,0.1154,0.11544,-1.920172,-1.729166,-1.540817,18824.35259,6676.779005,1.000162
std,47.37025,0.00896,0.504136,0.50609,48.099768,47.360533,46.672535,1324.164642,368.891636,0.000355
min,-1453.48,0.000506,0.051729,0.052742,-1485.53,-1453.09,-1422.62,3101.45,4137.1,0.9999
25%,0.000118,0.000712,0.098913,0.09865,-0.164539,0.000134,0.163011,18058.3,6451.62,0.999959
50%,0.002917,0.000728,0.099774,0.099608,-0.161286,0.002892,0.166893,18799.6,6696.94,1.00004
75%,0.005574,0.000742,0.1006,0.100965,-0.157854,0.005403,0.1706,19558.4,6935.52,1.0002
max,0.421442,0.342789,19.2046,19.2886,0.314293,0.420484,0.531352,22600.9,7796.26,1.00295


In [6]:
# Extra tail percentiles to report for the posterior draws of `nu`.
nu_percentiles = [0.01, 0.025, 0.05, 0.1, 0.25, 0.75, 0.9, 0.95, 0.975, 0.99]
home_effect_dist = draws["nu"].describe(percentiles=nu_percentiles)

home_effect_dist

count    10000.000000
mean         0.421442
std          0.065771
min          0.197834
1%           0.273080
2.5%         0.294932
5%           0.314293
10%          0.336517
25%          0.377315
50%          0.420484
75%          0.465719
90%          0.506966
95%          0.531352
97.5%        0.551850
99%          0.575201
max          0.682303
Name: nu, dtype: float64

In [7]:
# One "alpha[i]" column per player index known to the model (1-based).
cols = [f"alpha[{i}]" for i in range(1, stan_data["num_players"] + 1)]

# describe() already labels its statistics "count", "mean", "std", ..., so after
# transposing only the former index (the alpha column names) needs renaming.
params = draws[cols].describe().T.reset_index().rename(columns={"index": "player"})

# "alpha[12]" -> "12" (kept as a string; later cells convert with int()).
params["player"] = (
    params["player"]
    .str.replace("alpha[", "", regex=False)
    .str.replace("]", "", regex=False)
)
params["amplitude"] = params["max"] - params["min"]

# Stan index -> player id, the inverse of players_mapping.
reversed_players_mapping = {v: k for k, v in players_mapping.items()}
params["player_code"] = params["player"].apply(
    lambda x: reversed_players_mapping[int(x)]
)

# Players without recorded time (e.g. the "None" placeholder) get 0.
params["time_played"] = params["player_code"].apply(
    lambda x: players_time_on_match.get(x, 0)
)

params.sort_values("mean", ascending=False)

Unnamed: 0,player,count,mean,std,min,25%,50%,75%,max,amplitude,player_code,time_played
1,2,10000.0,0.047825,0.100397,-0.362704,-0.018850,0.047624,0.114125,0.466196,0.828900,815100,3330
3,4,10000.0,0.046626,0.101837,-0.294664,-0.023280,0.045627,0.115698,0.424913,0.719577,310373,2954
361,362,10000.0,0.034947,0.099989,-0.344359,-0.031354,0.036140,0.101621,0.468479,0.812838,521990,2402
2,3,10000.0,0.031824,0.099374,-0.306513,-0.034992,0.032967,0.098152,0.395349,0.701862,422469,2248
5,6,10000.0,0.028493,0.100839,-0.339261,-0.040035,0.028981,0.095839,0.465646,0.804907,424455,1712
...,...,...,...,...,...,...,...,...,...,...,...,...
281,282,10000.0,-0.022535,0.098023,-0.340961,-0.090317,-0.022817,0.046096,0.313125,0.654086,302192,2920
124,125,10000.0,-0.023537,0.098825,-0.394929,-0.090428,-0.023523,0.043410,0.339427,0.734356,517052,1782
108,109,10000.0,-0.027129,0.098919,-0.383159,-0.094400,-0.027506,0.040502,0.341753,0.724912,331937,2549
109,110,10000.0,-0.028995,0.098709,-0.389122,-0.096255,-0.028717,0.036659,0.363821,0.752943,896161,2818


In [8]:
# Column indices (0-based) of the 20 largest posterior-mean alphas, best first.
top_20 = np.argsort(np.mean(samples["alpha"], axis=0))[::-1][:20]

# Stan index (1-based) -> player id, so each lookup is O(1) instead of a scan.
player_id_by_index = {index: player_id for player_id, index in players_mapping.items()}
top_20_players = [
    (player_id_by_index[player + 1], np.mean(samples["alpha"][:, player]))
    for player in top_20
    if player + 1 in player_id_by_index
]

# Parse each season's games file once and keep the first entry seen per player id
# (latest season first, then file order) instead of re-reading the files per player.
player_info_by_id = {}
for season in SEASONS[::-1]:
    file_name_games = f"Serie_A_{season}_games.json"
    with open(os.path.join(data_path, file_name_games), encoding="utf-8") as f:
        games = json.load(f)

    for game in games.values():
        for player_info in game["Players"]:
            # The player id is the last six characters of the first field.
            player_info_by_id.setdefault(player_info[0][-6:], player_info)

for i, (player, force) in enumerate(top_20_players):
    info = player_info_by_id.get(player)  # None when the id never appears
    time_played = players_time_on_match.get(player, 0)
    print(i + 1, player, force, info, time_played)

1 815100 0.047824924236322994 ['1ROSSI Agustin Daniel Rossi T(g)P815100', 'Flamengo / RJ'] 3330
2 310373 0.046626042908839 ['4Leo Pereira Leonardo Pereira TP310373', 'Flamengo / RJ'] 2954
3 521990 0.03494666572907999 ['10De Arrascaeta Giorgian Daniel de A ... TP521990', 'Flamengo / RJ'] 2402
4 422469 0.031824217160715 ['3Léo Ortiz Leonardo Rech Ortiz TP422469', 'Flamengo / RJ'] 2248
5 424455 0.0284925449487 ['7Luiz Luiz de Araujo Guima ... TP424455', 'Flamengo / RJ'] 1712
6 837958 0.027087161320938004 ['25Villalba Lucas Hernan Villalba RP837958', 'Cruzeiro / MG'] 2864
7 169050 0.02548863435703 ['21Weverton Weverton Pereira da Silva T(g)P169050', 'Palmeiras / SP'] 2340
8 582558 0.025401227654635 ['3Jemmes Jemmes Bruno Ribeiro ... TP582558', 'Mirassol / SP'] 3318
9 335362 0.02471872524328 ['8Daniel Daniel de Oliveira S ... TP335362', 'Mirassol / SP'] 3265
10 320111 0.024661012079877004 ['16Lucas Silva Lucas Silva Borges RP320111', 'Cruzeiro / MG'] 2752
11 725766 0.023828529014109 ['22Joa

In [9]:
# Column indices (0-based) of the 20 smallest posterior-mean alphas, worst first.
# NOTE(review): the "None" placeholder is part of alpha and can show up in this list.
worst_20 = np.argsort(np.mean(samples["alpha"], axis=0))[:20]

# Stan index (1-based) -> player id, so each lookup is O(1) instead of a scan.
player_id_by_index = {index: player_id for player_id, index in players_mapping.items()}
worst_20_players = [
    (player_id_by_index[player + 1], np.mean(samples["alpha"][:, player]))
    for player in worst_20
    if player + 1 in player_id_by_index
]

# Parse each season's games file once and keep the first entry seen per player id
# (latest season first, then file order) instead of re-reading the files per player.
player_info_by_id = {}
for season in SEASONS[::-1]:
    file_name_games = f"Serie_A_{season}_games.json"
    with open(os.path.join(data_path, file_name_games), encoding="utf-8") as f:
        games = json.load(f)

    for game in games.values():
        for player_info in game["Players"]:
            # The player id is the last six characters of the first field.
            player_info_by_id.setdefault(player_info[0][-6:], player_info)

for i, (player, force) in enumerate(worst_20_players):
    info = player_info_by_id.get(player)  # None when the id never appears
    time_played = players_time_on_match.get(player, 0)
    print(i + 1, player, force, info, time_played)

1 None -2.5242802652115004 None 0
2 896161 -0.02899530150855 ['14RIVERA Christian Hernando R ... TP896161', 'Sport / PE'] 2818
3 331937 -0.027128771718909598 ['10Lucas Lima Lucas Rafael Araujo Lima TP331937', 'Sport / PE'] 2549
4 517052 -0.02353712375373 ['33Matheus Al ... Matheus Alexandre An ... RP517052', 'Sport / PE'] 1782
5 302192 -0.0225352396463391 ['16Jadson Jadson Alves dos Santos TP302192', 'Juventude / RS'] 2920
6 315345 -0.020396251328583 ['21Gabriel Gabriel Vasconcelos  ... T(g)P315345', 'Vitória / BA'] 2160
7 313474 -0.020020673049315003 ['15R. Thyere Rafael Thyere de Alb ... RP313474', 'Sport / PE'] 2151
8 398805 -0.019656749501930002 ['17Matheusinho Matheus Leonardo Sal ... TP398805', 'Sport / PE'] 1702
9 539820 -0.018120835632408 ['44Luiz Luiz Gustavo da Silv ... TP539820', 'Juventude / RS'] 2407
10 337242 -0.018016401542069998 ['21Lucas Lucas Kal Schenfeld  ... TP337242', 'Sport / PE'] 904
11 402095 -0.017857531148604 ['40Ramon Menezes Ramon Menezes Roma RP402095', 'C

In [10]:
df = params[["player", "mean", "time_played"]].copy()
file_name_games = "Serie_A_2025_games.json"
with open(os.path.join(data_path, file_name_games), encoding="utf-8") as f:
    games = json.load(f)

# Single pass over the games: remember the first name/team entry seen for every
# player known to the model (the "None" placeholder is not a real player).
model_player_ids = set(players_mapping) - {"None"}
player_name_by_id = {}
team_name_by_id = {}
for game in games.values():
    for player_info in game["Players"]:
        # The player id is the last six characters of the first field.
        player_id = player_info[0][-6:]
        if player_id in model_player_ids and player_id not in player_name_by_id:
            player_name_by_id[player_id] = player_info[0]
            team_name_by_id[player_id] = player_info[1]

# Stan index -> player id; the placeholder is left out so its row ends up NaN.
player_id_by_index = {
    index: player_id
    for player_id, index in players_mapping.items()
    if player_id != "None"
}

df["player_id"] = df["player"].astype(int).map(player_id_by_index)
df["player_name"] = df["player_id"].map(player_name_by_id)
df["team_name"] = df["player_id"].map(team_name_by_id)

# Drop the placeholder row and any player that never appears in the games file.
df = df.dropna().copy()
df

Unnamed: 0,player,mean,time_played,player_id,player_name,team_name
1,2,0.047825,3330,815100,1ROSSI Agustin Daniel Rossi T(g)P815100,Flamengo / RJ
2,3,0.031824,2248,422469,3Léo Ortiz Leonardo Rech Ortiz TP422469,Flamengo / RJ
3,4,0.046626,2954,310373,4Leo Pereira Leonardo Pereira TP310373,Flamengo / RJ
4,5,0.020318,1241,777370,5Erick Pulgar Erick Antonio Pulgar ... TP777370,Flamengo / RJ
5,6,0.028493,1712,424455,7Luiz Luiz de Araujo Guima ... TP424455,Flamengo / RJ
...,...,...,...,...,...,...
727,728,0.003929,22,696436,76Wanderson ... Wanderson Junior de ... RP69...,Flamengo / RJ
728,729,0.003431,22,687260,72LUCAS Lucas Vieira dos Santos RP687260,Flamengo / RJ
729,730,0.002770,17,622161,80João João Paulo Camargo d ... RP622161,Flamengo / RJ
730,731,-0.000011,56,647416,66Leandrinho Leandro Viana da Sil ... RP647416,Vasco da Gama / RJ


In [11]:
# Full team name (as it appears in the games file) -> three-letter code for the legend.
team_abbr = {
    "Flamengo / RJ": "FLA",
    "Cruzeiro / MG": "CRU",
    "Mirassol / SP": "MIR",
    "Palmeiras / SP": "PAL",
    "Botafogo / RJ": "BOT",
    "Fluminense / RJ": "FLU",
    "Bahia / BA": "BAH",
    "Atlético Mineiro / MG": "CAM",
    "São Paulo / SP": "SAO",
    "Vasco da Gama / RJ": "VAS",
    "Grêmio / RS": "GRE",
    "Corinthians / SP": "COR",
    "Santos / SP": "SAN",
    "Ceará / CE": "CEA",
    "Fortaleza / CE": "FOR",
    "Internacional / RS": "INT",
    "Red Bull Bragantino / SP": "BGT",
    "Vitória / BA": "VIT",
    "Juventude / RS": "JUV",
    "Sport / PE": "SPT",
}

# Teams missing from the table keep their full name as the label.
df["team_abbr"] = df["team_name"].map(team_abbr).fillna(df["team_name"])
teams = df["team_abbr"].unique()

# Per team: mean of the players' posterior-mean skill, weighted by minutes played.
avg_team_skill = {}
for team in teams:
    roster = df[df["team_abbr"] == team]
    minutes = roster["time_played"]
    avg_team_skill[team] = np.dot(roster["mean"], minutes) / minutes.sum()

# Legend order: strongest team first.
ordered_teams = [
    team
    for team, _ in sorted(
        avg_team_skill.items(), key=lambda entry: entry[1], reverse=True
    )
]

hover_columns = ["player_name", "team_abbr", "mean", "time_played"]
axis_labels = {
    "time_played": "Total Minutes Played",
    "mean": "Skill Parameter (α)",
    "team_abbr": "Team",
}

fig = px.scatter(
    df,
    x="time_played",
    y="mean",
    color="team_abbr",
    category_orders={"team_abbr": ordered_teams},
    hover_data=hover_columns,
    title="Player Skill vs Playing Time by Team",
    labels=axis_labels,
    height=600,
    width=1000,
)

fig.update_traces(marker=dict(size=12, opacity=0.6), selector=dict(mode="markers"))
fig.update_layout(hovermode="closest", paper_bgcolor="white", plot_bgcolor="white")
fig.show()

In [12]:
# Rank teams by minutes-weighted skill, best first. dict(...) accepts both the
# team -> skill mapping and the list of pairs this cell leaves behind, so the cell
# can be re-run without failing on `.items()`.
avg_team_skill = sorted(
    dict(avg_team_skill).items(), key=lambda item: item[1], reverse=True
)
avg_team_skill

[('FLA', 0.025137917578567124),
 ('CRU', 0.017252753293946394),
 ('MIR', 0.016487159835845533),
 ('PAL', 0.016023248518678396),
 ('BOT', 0.0112237780605328),
 ('FLU', 0.009723585620755183),
 ('BAH', 0.005897113948449463),
 ('CAM', 0.0031887332613041923),
 ('VAS', 0.002744826853832946),
 ('SAO', 0.0025822010607640575),
 ('GRE', 0.0016834480007121785),
 ('COR', 0.0012637229027704234),
 ('SAN', 0.0009829260754347583),
 ('CEA', -0.0002750539187649592),
 ('FOR', -0.0011873311623129736),
 ('BGT', -0.001885097331651111),
 ('INT', -0.0019684081577702383),
 ('VIT', -0.004231766824487648),
 ('JUV', -0.009010755409832599),
 ('SPT', -0.013825715213382123)]