In [1]:
import json
import numpy as np

from scipy.stats import poisson
from scipy.optimize import minimize
from tqdm import tqdm

import warnings
warnings.filterwarnings('ignore')

In [2]:
def generate_games(clubs, filename):
    games = list()
    for club1 in clubs:
        for club2 in clubs:
            if club1 == club2: continue
            games.append(f'{club1} vs. {club2}\n')

    with open(filename, 'w') as f: f.writelines(games)
    return games

def preprocessing(competition, year):
    with open(f'../data/BrazilianSoccerData/results/processed/{competition}_{year}_games.json', 'r') as f:
        data = json.load(f)

    inx = dict()
    played_games = dict()
    inx_count = 0
    for game in data:
        game = str(game).zfill(3)
        home, away, result = data[game]['Home'], data[game]['Away'], data[game]['Result']
        result = result.split(' X ')
        result = [int(x) for x in result]
        if home not in played_games: played_games[home] = dict()
        played_games[home][away] = result
        if home not in inx:
            inx[home] = dict()
            inx[home]['Atk'] = inx_count
            inx_count += 1
            inx[home]['Def'] = inx_count
            inx_count += 1

    games = generate_games(list(played_games.keys()), f'{competition}_{year}.csv')
    return played_games, inx, games

def likelihood(parameters, played_games, inx):
    lik = 0
    for home in played_games:
        for away in played_games[home]:
            result = played_games[home][away]
            mu0 = parameters[inx[home]['Atk']] / parameters[inx[away]['Def']]
            mu1 = parameters[inx[away]['Atk']] / parameters[inx[home]['Def']]
            lik -= poisson.logpmf(result[0], mu0)
            lik -= poisson.logpmf(result[1], mu1)

    return lik

def optimize_parameters(competition, year):
    played_games, inx, games = preprocessing(competition, year)
    parameters = np.random.random(40)
    bounds = [(0, None) for _ in parameters]
    bounds[0] = (1, 1)
    res = minimize(likelihood, parameters, args = (played_games, inx), bounds = bounds)
    parameters = res.x
    for club in inx:
        for force in inx[club]:
            inx[club][force] = parameters[inx[club][force]]

    parameters = inx
    with open(f'parameters_{competition}_{year}.json', 'w') as f: json.dump(parameters, f)
    games = [game.strip().split(' vs. ') for game in games]
    return parameters, games, played_games

In [3]:
parameters, games, played_games = optimize_parameters('Serie_A', 2023)

In [4]:
n_sims = 10_000_000
points = dict()
for i, club in enumerate(played_games):
    points[club] = np.zeros((n_sims, 4), dtype = int)
    points[club] = np.hstack([points[club], np.arange(n_sims).reshape(-1, 1)])
    points[club] = np.hstack([points[club], i * np.ones(n_sims).reshape(-1, 1)])

for game in tqdm(games):
    home, away = game
    if away in played_games[home]:
        home_score, away_score = played_games[home][away]
        
        # points
        points[home][:, 0] += 3 * (home_score > away_score) + 1 * (home_score == away_score)
        points[away][:, 0] += 3 * (away_score > home_score) + 1 * (home_score == away_score)

        # wins
        points[home][:, 1] += 1 * (home_score > away_score)
        points[away][:, 1] += 1 * (away_score > home_score)

        # goals
        points[home][:, 2] += home_score
        points[away][:, 2] += away_score

        # goal difference
        points[home][:, 3] += home_score - away_score
        points[away][:, 3] += away_score - home_score
    else:
        mu0 = parameters[home]['Atk'] / parameters[away]['Def']
        mu1 = parameters[away]['Atk'] / parameters[home]['Def']
        simulation = poisson.rvs((mu0, mu1), size = (n_sims, 2))

        # points
        points[home][:, 0] += 3 * (simulation[:, 0] > simulation[:, 1]) + 1 * (simulation[:, 0] == simulation[:, 1])
        points[away][:, 0] += 3 * (simulation[:, 1] > simulation[:, 0]) + 1 * (simulation[:, 0] == simulation[:, 1])

        # wins
        points[home][:, 1] += 1 * (simulation[:, 0] > simulation[:, 1])
        points[away][:, 1] += 1 * (simulation[:, 1] > simulation[:, 0])

        # goals
        points[home][:, 2] += simulation[:, 0]
        points[away][:, 2] += simulation[:, 1]

        # goal difference
        points[home][:, 3] += simulation[:, 0] - simulation[:, 1]
        points[away][:, 3] += simulation[:, 1] - simulation[:, 0]

100%|██████████| 380/380 [05:22<00:00,  1.18it/s]


In [5]:
points

{'Flamengo / RJ': array([[6.900000e+01, 2.000000e+01, 7.200000e+01, 2.500000e+01,
         0.000000e+00, 0.000000e+00],
        [6.600000e+01, 2.000000e+01, 6.600000e+01, 1.900000e+01,
         1.000000e+00, 0.000000e+00],
        [6.600000e+01, 1.900000e+01, 6.300000e+01, 2.500000e+01,
         2.000000e+00, 0.000000e+00],
        ...,
        [6.100000e+01, 1.700000e+01, 6.700000e+01, 2.100000e+01,
         9.999997e+06, 0.000000e+00],
        [6.400000e+01, 1.900000e+01, 6.600000e+01, 2.300000e+01,
         9.999998e+06, 0.000000e+00],
        [5.200000e+01, 1.400000e+01, 5.800000e+01, 2.000000e+00,
         9.999999e+06, 0.000000e+00]]),
 'Botafogo / RJ': array([[9.000000e+01, 2.900000e+01, 6.600000e+01, 4.700000e+01,
         0.000000e+00, 1.000000e+00],
        [8.900000e+01, 2.800000e+01, 5.800000e+01, 4.200000e+01,
         1.000000e+00, 1.000000e+00],
        [9.200000e+01, 2.900000e+01, 6.600000e+01, 4.800000e+01,
         2.000000e+00, 1.000000e+00],
        ...,
        [8.