In [1]:
import re
import json
import pickle
import numpy as np
import pandas as pd

from glob import glob

In [2]:
total = 0
without_hess = 0
didnt_converge = 0
wrong_number_of_parameters = 0
for file in glob('results/optimizer/*'):
    total += 1
    pars = int(re.findall('_(\d+)_pars', file)[0])
    with open(file, 'rb') as f:
        res = pickle.load(f)

    without_hess += res.hess_inv is None
    didnt_converge += not res.success
    wrong_number_of_parameters += res.x.shape[0] != pars - 1

if without_hess: print(f'{without_hess} / {total} ({without_hess / total:.2%}) of results don\'t have hessian')
if didnt_converge: print(f'{didnt_converge} / {total} ({didnt_converge / total:.2%}) of results didn\'t converge')
if wrong_number_of_parameters: print(f'{wrong_number_of_parameters} / {total} ({wrong_number_of_parameters / total:.2%}) of results has a wrong number of parameters')

In [3]:
df_results = pd.DataFrame(columns = ['competition', 'year', 'model', 'fitted_games', 'pars',
                                     'home', 'away', 'result', 'deFinetti', 'prob_home', 'prob_draw', 'prob_away'])

for file in glob('results/probs/game_probs*'):
    competition, year, model, games, pars = re.findall('game_probs_(Serie_[AB])_(\d+)_(.+)_(\d+)_games_(\d+)_pars', file)[0]
    games = int(games)
    if games == 380: continue
    with open(file, 'r') as f: probs = json.load(f)
    with open(f'../data/BrazilianSoccerData/results/processed/{competition}_{year}_games.json', 'r') as f: data = json.load(f)
    year = int(year)
    pars = int(pars)
    for game in range(games + 1, games + 11):
        game = f'{game}'
        if game not in data or 'Home' not in data[game]: continue
        home = data[game]['Home']
        away = data[game]['Away']
        result = data[game]['Result'].upper().split(' X ')
        result[0], result[1] = int(result[0]), int(result[1])
        if result[0] > result[1]: result, result_vector = 'H', np.array([1, 0, 0])
        elif result[0] == result[1]: result, result_vector = 'D', np.array([0, 1, 0])
        else: result, result_vector = 'A', np.array([0, 0, 1])
        prob_home = np.sum(np.tril(probs[home][away], -1))
        prob_draw = np.sum(np.diag(probs[home][away]))
        prob_away = np.sum(np.triu(probs[home][away], 1))
        probs_vector = np.array([prob_home, prob_draw, prob_away])
        definetti = np.sum((probs_vector - result_vector) ** 2)
        prob_result = probs_vector[np.argmax(result_vector)]
        df_results.loc[len(df_results)] = competition, year, model, games, pars, home, away, result, definetti, prob_home, prob_draw, prob_away

for result in ['home', 'draw', 'away']:
    df_results[f'prob_{result}'] = np.ceil(df_results[f'prob_{result}'] * 10) / 10
    df_results[f'{result}_occured'] = (df_results['result'] == result[0].upper()) * 1

df_confiability = pd.DataFrame()
for result in ['home', 'draw', 'away']:
    aux = df_results[[f'prob_{result}', f'{result}_occured']]
    aux.columns = ['prob', 'occured']
    df_confiability = pd.concat([df_confiability, aux], ignore_index=True)

df_confiability = df_confiability.groupby('prob').mean()
definetti_metric = df_results[['model', 'deFinetti', 'pars']].groupby(['model', 'pars']).mean().reset_index()
definetti_metric.loc[len(definetti_metric)] = ['baseline', '0', np.sum((1/3 * np.ones(3) - result_vector) ** 2)]
definetti_metric = definetti_metric.sort_values('deFinetti', ignore_index=True)

In [4]:
# Mean `deFinetti` value per (model, pars) pair, sorted ascending, as built in
# the previous cell.
# NOTE(review): the previous cell also appends a 'baseline' row, but the saved
# output below lists none -- the output may be stale; re-run to refresh.
definetti_metric

Unnamed: 0,model,pars,deFinetti
0,shock_model,61,0.626612
1,independents_poisson,41,0.626729
2,independents_poisson,60,0.634672
3,shock_model,80,0.635364
4,holgates_poisson,61,0.643542
5,independents_poisson,80,0.649677
6,holgates_poisson,80,0.652468
7,shock_model,100,0.653057
8,independents_poisson,40,0.656425
9,shock_model,60,0.658912


In [5]:
# Observed frequency of an outcome (`occured`) for each binned predicted
# probability (`prob`), pooled over the home, draw and away outcomes.
# NOTE(review): the bins are built with np.ceil(prob * 10) / 10, yet the saved
# output below shows labels 0.0-0.9 -- it may come from an earlier version of
# the binning; re-run to confirm.
df_confiability

Unnamed: 0_level_0,occured
prob,Unnamed: 1_level_1
0.0,0.21709
0.1,0.179103
0.2,0.229921
0.3,0.280552
0.4,0.330525
0.5,0.403449
0.6,0.481707
0.7,0.545846
0.8,0.588959
0.9,0.5982


In [6]:
competitions = ['Serie_A', 'Serie_B']
years = range(2013, 2024)
n_games = range(100, 390, 10)
total_ = 0
expected_ = 0
for year in years:
    for competition in competitions:
        total = 0
        expected = 0
        for games in n_games:
            expected += 12
            for file in glob(f'results/optimizer/*{competition}_{year}*{games}_games*'):
                with open(file, 'rb') as f: res = pickle.load(f)
                pars = int(re.findall('_(\d+)_pars', file)[0])
                if (res.x.shape[0] == (pars - 1)) and (res.success) and (res.hess_inv is not None):
                    total += 1

        total_ += total
        expected_ += expected
        if total != 0 and total != expected: print(f'{year}\'s {competition.replace("_", " ")} is {total / expected:.2%} completed ({expected - total} failed)')

print(f'\nAll is {total_ / expected_:.2%} completed')

2013's Serie B is 99.71% completed (1 failed)
2017's Serie B is 99.43% completed (2 failed)
2019's Serie B is 98.56% completed (5 failed)
2020's Serie B is 98.28% completed (6 failed)
2022's Serie A is 96.55% completed (12 failed)
2022's Serie B is 66.09% completed (118 failed)
2023's Serie A is 68.68% completed (109 failed)
2023's Serie B is 53.74% completed (161 failed)

All is 94.59% completed
