In [1]:
import cmdstanpy
import numpy as np
import pandas as pd

from random import shuffle
from itertools import product
from IPython.display import clear_output

# Make sure a CmdStan installation is available to cmdstanpy before any model is
# compiled (it reports "already installed" when one is found). The trailing `;`
# keeps the call's return value out of the cell output.
cmdstanpy.install_cmdstan();

CmdStan install directory: /Users/igor.michels/.cmdstan
CmdStan version 2.34.1 already installed
Test model compilation


In [2]:
# Size of the simulated league. One season is a double round-robin (every ordered
# home/away pair of distinct teams plays once), i.e. NUM_TEAMS * (NUM_TEAMS - 1) games.
NUM_SEASONS = 1
NUM_TEAMS = 20
# Squad size per team; generate_matchups fields at most 11 of these players per game.
NUM_PLAYERS_PER_TEAM = 12

In [3]:
def generate_matchups(n_teams, n_players_per_team, skills, home_force, n_starters=11):
    """Simulate one double round-robin season at player level.

    Every ordered (home, away) pair of distinct teams plays exactly once. For
    each game a random line-up is drawn for both teams, the skills of the
    fielded players are summed into a team strength, and the two scores are
    drawn from Poisson distributions whose rates are ratios of those strengths.

    Parameters
    ----------
    n_teams : int
        Number of teams; yields ``n_teams * (n_teams - 1)`` games.
    n_players_per_team : int
        Squad size. Player ``p`` of team ``t`` has skill
        ``skills[t * n_players_per_team + p]``.
    skills : numpy.ndarray
        Flat array of player skills laid out team by team (it is indexed with
        an integer array, so a plain list will not work).
    home_force : float or None
        Additive bonus applied to the home team's strength when computing the
        home scoring rate; ``None`` means no home advantage (0).
    n_starters : int, default 11
        Number of players fielded per team. If the squad is smaller than this,
        the whole squad plays.

    Returns
    -------
    pandas.DataFrame
        One row per game with columns ``home_team``, ``away_team``,
        ``home_score``, ``away_score``, ``home_players`` and ``away_players``.
        The two ``*_players`` columns hold arrays of 0-based *within-team*
        player indices.
    """
    if home_force is None:
        home_force = 0

    games = []
    for home_team, away_team in product(range(n_teams), repeat=2):
        if home_team == away_team:
            continue

        # Draw line-ups from numpy's global RNG (the same stream used for the
        # scores below) so that a single np.random.seed call reproduces a run.
        home_players = np.random.permutation(n_players_per_team)[:n_starters]
        away_players = np.random.permutation(n_players_per_team)[:n_starters]

        # Offset the within-team indices into the flat skills array.
        home_skill = np.sum(skills[home_team * n_players_per_team + home_players])
        away_skill = np.sum(skills[away_team * n_players_per_team + away_players])

        # Only the home scoring rate receives the home-advantage bonus.
        home_score = np.random.poisson((home_skill + home_force) / away_skill)
        away_score = np.random.poisson(away_skill / home_skill)

        games.append({
            'home_team': home_team,
            'away_team': away_team,
            'home_score': home_score,
            'away_score': away_score,
            'home_players': home_players,
            'away_players': away_players,
        })

    return pd.DataFrame(games)

def generate_seasons(n_teams, n_players_per_team, skills, n_seasons, home_force=None):
    """Simulate ``n_seasons`` independent seasons and stack their games.

    Each season is produced by ``generate_matchups`` with the same teams,
    squads and skills.

    Parameters
    ----------
    n_teams, n_players_per_team, skills, home_force
        Forwarded unchanged to ``generate_matchups``.
    n_seasons : int
        Number of seasons to simulate.

    Returns
    -------
    pandas.DataFrame
        All games of all seasons, in season order. Each season keeps its own
        row labels, so index values repeat across seasons. An empty DataFrame
        is returned when ``n_seasons`` is zero or negative.
    """
    # Collect the per-season frames and concatenate once: calling pd.concat
    # inside the loop re-copies all previous rows on every iteration.
    seasons = [
        generate_matchups(n_teams, n_players_per_team, skills, home_force)
        for _ in range(n_seasons)
    ]
    if not seasons:
        # pd.concat raises on an empty list; preserve the empty-frame result.
        return pd.DataFrame()
    return pd.concat(seasons)

In [4]:
# True player skills: half-normal draws, rescaled so that player 0 has skill 1.
# With no home_force the scoring rates depend only on ratios of skills, so the
# overall scale is arbitrary and has to be pinned before comparing to the fit.
skills = np.abs(np.random.normal(0, 1, NUM_TEAMS * NUM_PLAYERS_PER_TEAM))
skills /= skills[0]
df = generate_seasons(NUM_TEAMS, NUM_PLAYERS_PER_TEAM, skills, NUM_SEASONS)

# Turn the per-game line-up arrays into 2-D (game x fielded player) matrices.
# The away matrix must come from the 'away_players' column.
home_players = np.array(df['home_players'].tolist())
away_players = np.array(df['away_players'].tolist())

model_data = {
    'n_games': len(df),
    'n_teams': NUM_TEAMS,
    'n_players_per_team': NUM_PLAYERS_PER_TEAM,
    # NOTE(review): team ids are passed 0-based while player ids are shifted to
    # 1-based below — presumably the Stan model uses the team id as an offset;
    # verify against Poisson_model_v2.stan.
    'home_team': df['home_team'].values,
    'away_team': df['away_team'].values,
    'home_score': df['home_score'].values,
    'away_score': df['away_score'].values,
    'home_players': home_players + 1,
    'away_players': away_players + 1,
}

model = cmdstanpy.CmdStanModel(stan_file='../models/Poisson_model_v2.stan', force_compile=True)
fitting = model.sample(data=model_data, chains=4, iter_warmup=2000, iter_sampling=10000, show_console=False)

# Build the summary table once and put the estimates on the same scale as the
# true skills by dividing by the posterior mean of the first skill.
summary = fitting.summary()
results = summary / summary.loc['skills[1]', 'Mean']

# Select the skill rows by name rather than by position so that other rows in
# the summary (e.g. lp__) can never be compared against a true skill.
skill_rows = results.loc[results.index.str.startswith('skills[')]
assert len(skill_rows) == len(skills), 'summary does not contain one row per player skill'

# A true skill is "covered" when it lies strictly inside the 5%-95% interval.
inside_ci = (skills > skill_rows['5%'].to_numpy()) & (skills < skill_rows['95%'].to_numpy())

clear_output()
# Fraction of players whose true skill is covered by the interval.
inside_ci.mean()

0.9375

In [5]:
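# NOTE: commented-out team-level experiment, kept for reference only. It does not
# match the code above: `generate_seasons` now also takes `n_players_per_team`,
# and the generated frame uses the columns `home_team`/`away_team`/`home_score`/
# `away_score` rather than `equipe1`/`equipe2`/`gols_equipe1`/`gols_equipe2`, so
# it will not run if simply uncommented.
# home_force = abs(np.random.normal(0, 1))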
# habilidades = abs(np.random.normal(0, 1, NUM_TEAMS))

# habilidades /= home_force
# home_force = 1

# df = generate_seasons(NUM_TEAMS, habilidades, NUM_SEASONS, home_force)
# modelo_compilado = cmdstanpy.CmdStanModel(stan_file='../models/Poisson_model_2.stan', force_compile=False)
# dados_modelo = {
#     'num_jogos': len(df),
#     'num_equipes': NUM_TEAMS,
#     'equipe1': df['equipe1'].values,
#     'equipe2': df['equipe2'].values,
#     'gols_equipe1': df['gols_equipe1'].values,
#     'gols_equipe2': df['gols_equipe2'].values,
# }

# ajuste = modelo_compilado.sample(data=dados_modelo, chains=4, iter_warmup=2000, iter_sampling=10000, show_console=False)
# inside_ci = (habilidades > ajuste.summary()['5%'].values[1:])
# inside_ci *= ((habilidades < ajuste.summary()['95%'].values[1:]))

# clear_output()
# sum(inside_ci) / NUM_TEAMS