In [1]:
import math

import numpy as np
import pandas as pd
from scipy.optimize import brentq, newton

In [2]:
def f(j: float, odds: list[float]) -> float:
    """Evaluate ``sum(odd ** -j for odd in odds) - 1``.

    The value is zero exactly when the terms ``odd ** -j`` add up to one,
    which is the condition ``solve`` searches for.

    Args:
        j: Exponent applied (negated) to every odd.
        odds: Iterable of odds.

    Returns:
        The sum of ``odd ** -j`` over ``odds``, minus one.
    """
    total = 0
    for price in odds:
        total += price ** (-j)
    return total - 1


def f_prime(j: float, odds: list[float]) -> float:
    """Evaluate the derivative with respect to ``j`` of ``sum(odd ** -j) - 1``.

    Each term ``odd ** -j`` differentiates to ``-odd ** -j * log(odd)``.

    Args:
        j: Point at which the derivative is evaluated.
        odds: Iterable of odds; ``math.log`` is applied to each one.

    Returns:
        ``-sum(odd ** -j * log(odd))`` over ``odds``.
    """
    weighted_logs = 0
    for price in odds:
        weighted_logs += price ** (-j) * math.log(price)
    return -weighted_logs


def solve(
    odds: list[float],
    j0: float = 1.0,
    tol: float = 1e-6,
    max_iter: int = 1000,
    fallback_interval_steps: int = 10,
) -> float:
    """Find the exponent ``j`` for which ``sum(odd ** -j for odd in odds) == 1``.

    Newton's method is tried first, starting from ``j0``. If it fails for any
    reason, a sign change of ``f`` is searched for on ``[-100, 100]``, doubling
    both bounds up to ``fallback_interval_steps`` times, and the root is then
    located with Brent's method on that interval.

    Args:
        odds: Odds passed through to ``f`` and ``f_prime``.
        j0: Starting point for Newton's method.
        tol: Tolerance passed to ``newton`` (``tol``) and ``brentq`` (``xtol``).
        max_iter: Iteration limit for both root finders.
        fallback_interval_steps: Maximum number of interval doublings in the
            fallback bracket search.

    Returns:
        The root ``j``.

    Raises:
        ValueError: If no interval with a sign change is found. The Newton
            failure that triggered the fallback is attached as ``__cause__``.
    """

    def bracket_value(x: float) -> float:
        # With plain Python floats, odd ** (-x) raises OverflowError once a
        # term leaves the float range, which used to abort the search as soon
        # as the interval was widened. Treat the overflow as an infinitely
        # large value so the sign test still works.
        # NOTE(review): assumes positive odds, so the overflowing term is
        # positive - confirm against callers.
        try:
            return f(x, odds)
        except OverflowError:
            return math.inf

    try:
        return newton(
            func=lambda x: f(x, odds),
            x0=j0,
            fprime=lambda x: f_prime(x, odds),
            tol=tol,
            maxiter=max_iter,
        )
    except Exception as newton_error:
        # Deliberately broad: any Newton failure falls back to bracketing.
        lo, hi = -100, 100
        f_lo, f_hi = bracket_value(lo), bracket_value(hi)
        step = 0
        while f_lo * f_hi > 0 and step < fallback_interval_steps:
            lo, hi = lo * 2, hi * 2
            f_lo, f_hi = bracket_value(lo), bracket_value(hi)
            step += 1
        if f_lo * f_hi > 0:
            raise ValueError(
                "Failed to find interval containing root"
            ) from newton_error

        return brentq(bracket_value, a=lo, b=hi, xtol=tol, maxiter=max_iter)


def add_cols(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` extended with implied probabilities and scores.

    The input frame is left untouched. It must contain the odds columns
    ``PSCH``, ``PSCD``, ``PSCA`` and the result column ``FTR`` (compared
    against ``"H"``, ``"D"`` and ``"A"``).

    Added columns, in order:
        j: Per-row exponent returned by ``solve`` for the three odds.
        P(H), P(D), P(A): ``odds ** -j`` for the matching odds column.
        H, D, A: 0/1 indicators of ``FTR`` being that outcome.
        brier_score: Sum over the outcomes of ``(indicator - P) ** 2``.
        log_score: Negative sum over the outcomes of ``indicator * log(P)``.

    Args:
        df: Match data with the columns listed above.

    Returns:
        A new DataFrame with the original columns plus the added ones.
    """
    outcome_odds = {"H": "PSCH", "D": "PSCD", "A": "PSCA"}
    out = df.copy()

    # Solve on the odds columns only. Row-wise DataFrame.apply over the whole
    # mixed-dtype frame is slower and returns a DataFrame (not a Series) for
    # an empty input, which cannot be assigned to the single column "j".
    odds_rows = out[list(outcome_odds.values())].to_numpy(dtype=float).tolist()
    out["j"] = pd.Series(
        [solve(row) for row in odds_rows], index=out.index, dtype=float
    )

    for outcome, odds_col in outcome_odds.items():
        out[f"P({outcome})"] = out[odds_col] ** (-out["j"])
    for outcome in outcome_odds:
        out[outcome] = (out["FTR"] == outcome).astype(int)

    out["brier_score"] = (
        (out["H"] - out["P(H)"]) ** 2
        + (out["D"] - out["P(D)"]) ** 2
        + (out["A"] - out["P(A)"]) ** 2
    )
    out["log_score"] = -(
        out["H"] * np.log(out["P(H)"])
        + out["D"] * np.log(out["P(D)"])
        + out["A"] * np.log(out["P(A)"])
    )

    return out

In [3]:
# Accumulator for the per-(Country, Season) score tables that the cells below
# append with pd.concat; re-run this cell before re-running them to start clean.
results = pd.DataFrame()

In [4]:
# Brazil comes from a single file covering several seasons, with column names
# that differ from the ones add_cols expects, so it is filtered and renamed here.
odds_cols = ["PSCH", "PSCD", "PSCA"]
cols = ["Country", "Season", "Home", "Away", "HG", "AG", "Res", *odds_cols]
df = pd.read_csv("https://www.football-data.co.uk/new/BRA.csv")
df = (
    df.loc[df["Season"].between(2020, 2024), cols]
    # Same guard as prepare_df: a row with a missing odd makes solve() fail.
    .dropna(subset=odds_cols)
    .rename(
        columns={
            "Home": "HomeTeam",
            "Away": "AwayTeam",
            "HG": "FTHG",
            "AG": "FTAG",
            "Res": "FTR",
        }
    )
    .reset_index(drop=True)
)
df = add_cols(df)
# Mean Brier and log score per season.
country_results = (
    df.groupby(["Country", "Season"])
    .agg(brier_score=("brier_score", "mean"), log_score=("log_score", "mean"))
    .reset_index()
)
results = pd.concat([results, country_results])
country_results

Unnamed: 0,Country,Season,brier_score,log_score
0,Brazil,2020,0.618753,1.02894
1,Brazil,2021,0.608228,1.011829
2,Brazil,2022,0.607527,1.014486
3,Brazil,2023,0.607355,1.016402
4,Brazil,2024,0.586338,0.984214


In [5]:
def prepare_df(country: str, seasons: list[int]) -> pd.DataFrame:
    """Download and stack the season files of one country's league.

    For every season the CSV at
    ``https://www.football-data.co.uk/mmz4281/{season}/{championship}.csv`` is
    read, tagged with ``Country`` and ``Season``, and stripped of rows where
    any of ``PSCH``, ``PSCD``, ``PSCA`` is missing.

    Args:
        country: Country name, matched case-insensitively against the
            supported keys (england, france, germany, italy, netherlands,
            portugal, spain).
        seasons: Starting years of the seasons to load; e.g. ``2020`` maps to
            the URL segment ``"2021"`` (last two digits of the year and of
            the following year).

    Returns:
        A DataFrame with a fresh 0..n-1 index and the columns Country, Season,
        HomeTeam, AwayTeam, FTHG, FTAG, FTR, PSCH, PSCD, PSCA. It is empty,
        with those columns, when ``seasons`` is empty.

    Raises:
        KeyError: If ``country`` is not one of the supported keys.
    """
    url_mask = "https://www.football-data.co.uk/mmz4281/{season}/{championship}.csv"
    championship_mask = {
        "england": "E0",
        "france": "F1",
        "germany": "D1",
        "italy": "I1",
        "netherlands": "N1",
        "portugal": "P1",
        "spain": "SP1",
    }[country.lower()]
    odds_cols = ["PSCH", "PSCD", "PSCA"]
    cols = [
        "Country",
        "Season",
        "HomeTeam",
        "AwayTeam",
        "FTHG",
        "FTAG",
        "FTR",
        *odds_cols,
    ]

    # Collect the per-season frames and concatenate once instead of growing a
    # DataFrame inside the loop.
    frames = []
    for season in seasons:
        season_mask = f"{str(season)[2:]}{str(season + 1)[2:]}"
        url = url_mask.format(championship=championship_mask, season=season_mask)
        data = (
            pd.read_csv(url)
            .assign(Country=country, Season=season)
            .dropna(subset=odds_cols)
        )
        frames.append(data[cols])

    if not frames:
        # No seasons requested: previously df[cols] raised KeyError here.
        return pd.DataFrame(columns=cols)

    return pd.concat(frames, ignore_index=True)

In [6]:
# Leagues available through prepare_df.
countries = ["England", "France", "Germany", "Italy"]
countries += ["Netherlands", "Portugal", "Spain"]

for country in countries:
    print(country)
    seasons = list(range(2020, 2025))
    df = prepare_df(country, seasons)
    df = add_cols(df)
    # Mean Brier and log score per season for this country.
    country_results = df.groupby(["Country", "Season"]).agg(
        brier_score=("brier_score", "mean"),
        log_score=("log_score", "mean"),
    )
    country_results = country_results.reset_index()
    results = pd.concat([results, country_results])

England
France
Germany
Italy
Netherlands
Portugal
Spain


In [7]:
# Average the per-season scores of each country and rank by Brier score.
(
    results.groupby(["Country"])[["brier_score", "log_score"]]
    .mean()
    .reset_index()
    .sort_values("brier_score", ignore_index=True)
)

Unnamed: 0,Country,brier_score,log_score
0,Portugal,0.535562,0.909289
1,Netherlands,0.545286,0.921011
2,England,0.563604,0.952186
3,Italy,0.570258,0.95887
4,Spain,0.574774,0.967915
5,Germany,0.581851,0.978868
6,France,0.588274,0.986813
7,Brazil,0.60564,1.011174
