# Score Predictions

In [1]:
import pandas as pd
import numpy as np

# Load the match table and narrow it in one pass:
#   * keep league_id 2411 only (an earlier variant of this cell also kept 2412),
#   * drop the 2021 season,
#   * keep just the columns used below and discard rows with any missing value
#     in them.
df = (
    pd.read_csv("../data/fivethirtyeight/spi_matches.csv")
    .loc[
        lambda matches: (matches["league_id"] == 2411) & (matches["season"] != 2021),
        ["season", "league_id", "team1", "team2", "score1", "score2", "date"],
    ]
    .dropna()
)
df.head()

Unnamed: 0,season,league_id,team1,team2,score1,score2,date
12,2016,2411,Hull City,Leicester City,2.0,1.0,2016-08-13
13,2016,2411,Southampton,Watford,1.0,1.0,2016-08-13
14,2016,2411,Crystal Palace,West Bromwich Albion,0.0,1.0,2016-08-13
15,2016,2411,Middlesbrough,Stoke City,1.0,1.0,2016-08-13
16,2016,2411,Burnley,Swansea City,0.0,1.0,2016-08-13


In [2]:
# Season 2020 is the hold-out set; every other remaining season is training data.
df_test = df.loc[df["season"].eq(2020)]
df_train = df.loc[df["season"].ne(2020)]

## Poisson Distribution

In [3]:
import warnings
from scipy.stats import poisson
from scipy.optimize import minimize

In [4]:
# Match table used for fitting: integer scores plus the two team names.
# NOTE(review): this is built from `df` (all remaining seasons, including the
# 2020 rows that make up `df_test`), not from `df_train` — confirm that fitting
# on the hold-out season is intended.
games = df[["score1", "score2", "team1", "team2"]].astype(
    {"score1": int, "score2": int}
)

# np.unique already returns its result sorted, so this is the alphabetical
# list of every club that appears as team1 or team2.
teams = np.unique(np.concatenate([df["team1"], df["team2"]]))
n_teams = len(teams)

# Initial guess, laid out as [attack per team, defence per team, home advantage].
_params = np.concatenate(
    [
        np.full(n_teams, 1.0),   # attack ratings
        np.full(n_teams, -1.0),  # defence ratings
        [0.5],                   # home advantage
    ]
)

In [5]:
def score_inference(params, games, teams):
    """Return the negative Poisson log-likelihood of ``games`` under ``params``.

    Each side's goal count is modelled as an independent Poisson variable with

        rate(team1) = exp(home_advantage + attack[team1] + defence[team2])
        rate(team2) = exp(attack[team2] + defence[team1])

    Parameters
    ----------
    params : array-like, length ``2 * len(teams) + 1``
        Attack ratings, then defence ratings (both in the order of ``teams``),
        followed by a single home-advantage term.
    games : pandas.DataFrame
        One row per match with columns ``team1``, ``team2``, ``score1`` and
        ``score2``. ``team1`` is the side that receives the home-advantage term.
    teams : sequence of str
        Team names; position ``i`` owns ``params[i]`` and
        ``params[len(teams) + i]``.

    Returns
    -------
    float
        ``-sum(log P(score1) + log P(score2))`` over all matches whose two
        teams both appear in ``teams`` (other rows are dropped by the inner
        joins). Suitable as an objective for ``scipy.optimize.minimize``.

    Raises
    ------
    ValueError
        If ``params`` does not have exactly ``2 * len(teams) + 1`` entries.
    """
    # Take the team count from the argument instead of a notebook-level global,
    # so the parameter slices always line up with the `teams` actually passed.
    n_teams = len(teams)
    params = np.asarray(params, dtype=float)
    if params.shape != (2 * n_teams + 1,):
        raise ValueError(
            f"expected {2 * n_teams + 1} parameters for {n_teams} teams, "
            f"got shape {params.shape}"
        )

    ratings = pd.DataFrame(
        {
            "team": teams,
            "attack": params[:n_teams],
            "defence": params[n_teams : 2 * n_teams],
        }
    )
    # Pre-rename the lookup table for each side so the joins need no suffix
    # handling or column dropping afterwards.
    team1_ratings = ratings.rename(
        columns={"team": "team1", "attack": "attack1", "defence": "defence1"}
    )
    team2_ratings = ratings.rename(
        columns={"team": "team2", "attack": "attack2", "defence": "defence2"}
    )
    fixtures = games.merge(team1_ratings, on="team1").merge(team2_ratings, on="team2")

    home_advantage = params[-1]
    team1_rate = np.exp(home_advantage + fixtures["attack1"] + fixtures["defence2"])
    team2_rate = np.exp(fixtures["attack2"] + fixtures["defence1"])

    loglikelihood = poisson.logpmf(fixtures["score1"], team1_rate) + poisson.logpmf(
        fixtures["score2"], team2_rate
    )
    return float(-np.sum(loglikelihood))

In [12]:
# Fit the model by minimising the negative log-likelihood, capped at 100
# optimiser iterations.
options = {
    "maxiter": 100,
}

# Identifiability constraint: adding a constant to every attack rating and
# subtracting it from every defence rating leaves all scoring rates unchanged,
# so the attack ratings are pinned to sum to n_teams (mean attack of 1).
constraints = [{"type": "eq", "fun": lambda x: np.sum(x[:n_teams]) - n_teams}]

# Box bounds in the same order as the parameter vector.
bounds = (
    [(0, 3)] * n_teams     # attack ratings
    + [(-3, 0)] * n_teams  # defence ratings
    + [(0, 3)]             # home advantage
)

# No `method` is given; with a constraint present scipy selects SLSQP.
_res = minimize(
    score_inference,
    _params,
    args=(games, teams),
    constraints=constraints,
    bounds=bounds,
    options=options,
)

# Surface a failed or truncated fit (e.g. the iteration cap was hit) rather
# than silently using unconverged parameters downstream.
if not _res.success:
    warnings.warn(
        f"Poisson model fit did not converge: {_res.message}",
        RuntimeWarning,
    )

# NOTE: this overwrites the initial guess, so re-running this cell restarts the
# optimiser from the previous solution rather than from the original _params.
_params = _res.x
n_params = len(_params)
loglikelihood = -_res.fun
# AIC = -2 * log-likelihood + 2 * k, with k taken as the full parameter count.
aic = -2 * loglikelihood + 2 * n_params

In [7]:
def score_mtx(home_team, away_team, max_goals=8):
    """Build the joint score-probability grid for one fixture.

    Reads the notebook-level ``teams``, ``n_teams`` and ``_params`` (attack
    ratings, then defence ratings, then the home-advantage term).

    Parameters
    ----------
    home_team, away_team : str
        Names as they appear in ``teams``. A name that is not present raises
        ``IndexError`` from the position lookup.
    max_goals : int, default 8
        Exclusive upper bound on goals per side: the grid covers
        ``0 .. max_goals - 1`` goals, so probability mass for higher
        scorelines is not included.

    Returns
    -------
    numpy.ndarray, shape ``(max_goals, max_goals)``
        Entry ``[i, j]`` is P(home scores ``i``) * P(away scores ``j``), with
        both goal counts treated as independent Poisson variables.
    """
    # Position of each side inside the sorted team list.
    h = np.nonzero(teams == home_team)[0][0]
    a = np.nonzero(teams == away_team)[0][0]

    # Expected goals: exp(home advantage + attack + opponent's defence) for the
    # home side, exp(attack + opponent's defence) for the away side.
    home_rate = np.exp(_params[-1] + _params[h] + _params[a + n_teams])
    away_rate = np.exp(_params[a] + _params[h + n_teams])

    goal_counts = np.arange(max_goals)
    home_pmf = poisson.pmf(goal_counts, home_rate)
    away_pmf = poisson.pmf(goal_counts, away_rate)

    # Rows follow home goals, columns follow away goals.
    return np.outer(home_pmf, away_pmf)

In [14]:
# 4x4 grid covering 0-3 goals per side (rows: Leicester City at home,
# columns: West Ham United away).
mtx = score_mtx("Leicester City", "West Ham United", max_goals=4)
mtx

array([[0.04233894, 0.05265625, 0.03274386, 0.01357434],
       [0.08122152, 0.10101388, 0.06281465, 0.02604052],
       [0.07790623, 0.09689071, 0.06025069, 0.0249776 ],
       [0.04981751, 0.06195722, 0.03852759, 0.01597205]])

In [15]:
def odds(m):
    """Format home-win / draw / away-win probabilities from a score matrix.

    ``m`` is a 2-D grid whose rows index home goals and whose columns index
    away goals. The strictly lower triangle (row > column) is summed as the
    home win, the diagonal as the draw and the strictly upper triangle as the
    away win. Values are reported as-is, without renormalising a truncated grid.
    """
    home, draw, away = (
        cells.sum() for cells in (np.tril(m, -1), np.diag(m), np.triu(m, 1))
    )
    return f"Home: {home:.2f}, Draw {draw:.2f}, Away {away:.2f}"

In [16]:
# Outcome probabilities for the fixture above. `mtx` only covers 0-3 goals per
# side, so the three figures add up to less than 1.
odds(mtx)

'Home: 0.41, Draw 0.22, Away 0.21'

In [17]:
def clean_sheet(m):
    """Format each side's clean-sheet probability from a score matrix.

    ``m`` must be a 2-D NumPy array with rows indexing home goals and columns
    indexing away goals. "Home" is the total of column 0 (away side scores
    nothing); "Away" is the total of row 0 (home side scores nothing).
    """
    home = m[:, 0].sum()
    away = m[0].sum()
    return f"Home: {home:.2f}, Away {away:.2f}"

In [18]:
# Clean-sheet probabilities from the same truncated 4x4 matrix: "Home" sums
# the column where the away side scores 0, "Away" sums the row where the home
# side scores 0.
clean_sheet(mtx)

'Home: 0.25, Away 0.14'