# Score Predictions

Evaluate the performance of the score prediction models from [source](https://journals.sagepub.com/doi/full/10.1177/1471082X18817650), measured by their mean ranked probability score (RPS).

## Poisson

In [1]:
%load_ext autoreload

In [2]:
import pandas as pd
import numpy as np

In [3]:
from poisson import Poisson

# Load the match table, keep league ids 2411 and 2412 and drop every row
# that has a missing value in any column.
df = (
    pd.read_csv("../data/fivethirtyeight/spi_matches.csv")
    .loc[lambda matches: matches['league_id'].isin([2411, 2412])]
    .dropna()
)

# Fit on every season except 2021, which is held out for evaluation below.
poisson_model = Poisson(df.loc[df['season'] != 2021])
poisson_model.optimize()

In [4]:
# Season-2021 rows, restricted to the score and team columns.
games = df.loc[df['season'] == 2021, ["score1", "score2", "team1", "team2"]]

predictions = poisson_model.predict(games)
predictions.head()

Unnamed: 0,score1,score2,team1,team2,attack1,defence1,attack2,defence2,home_adv,rho,score1_infered,score2_infered,home_win_p,draw_p,away_win_p,home_cs_p,away_cs_p
0,2.0,0.0,Brentford,Arsenal,1.136242,0.98794,1.542477,1.190938,0.586816,0.199533,1.702538,1.741134,0.379242,0.224784,0.39513,0.175253,0.182138
1,0.0,1.0,Burnley,Arsenal,1.028134,1.079066,1.542477,1.190938,0.586816,0.199533,1.528081,1.589486,0.367721,0.237632,0.394205,0.203991,0.216898
2,5.0,0.0,Manchester City,Arsenal,1.87139,1.559036,1.542477,1.190938,0.586816,0.199533,3.55114,0.983577,0.811566,0.096834,0.06283,0.363215,0.028692
3,0.0,0.0,Brighton and Hove Albion,Arsenal,0.971289,1.089846,1.542477,1.190938,0.586816,0.199533,1.443639,1.572443,0.350769,0.241721,0.407147,0.20751,0.236012
4,0.0,1.0,Brentford,Brighton and Hove Albion,1.136242,0.98794,0.971289,1.089846,0.586816,0.199533,1.88365,0.983487,0.583678,0.222282,0.19328,0.373724,0.152033


In [5]:
# Mean of the 'rps' values that evaluate() returns for the 2021 games.
poisson_rps = poisson_model.evaluate(games)['rps']
np.mean(poisson_rps)

0.22137372533527627

In [6]:
# First five rows of the fitted parameters, ordered by descending attack value.
(
    poisson_model
    .print_parameters()
    .sort_values(by='attack', ascending=False)
    .head()
)

Unnamed: 0,attack,defence,team,home_adv
27,1.87139,1.559036,Manchester City,0.199533
25,1.722476,1.46592,Liverpool,0.199533
44,1.594433,1.369049,Tottenham Hotspur,0.199533
14,1.54318,1.35231,Chelsea,0.199533
1,1.542477,1.190938,Arsenal,0.199533


## Time Decayed Poisson

In [7]:
from poisson_decay import Poisson_Time_Decay

# Load the match table, keep league ids 2411 and 2412 and drop every row
# that has a missing value in any column.
df = (
    pd.read_csv("../data/fivethirtyeight/spi_matches.csv")
    .loc[lambda matches: matches['league_id'].isin([2411, 2412])]
    .dropna()
)

# Fit on every season except 2021, which is held out for evaluation below.
poisson_decay_model = Poisson_Time_Decay(df.loc[df['season'] != 2021])
poisson_decay_model.optimize()

In [8]:
# Season-2021 rows, restricted to the score and team columns.
games = df.loc[df['season'] == 2021, ["score1", "score2", "team1", "team2"]]

predictions = poisson_decay_model.predict(games)
predictions.head()

Unnamed: 0,score1,score2,team1,team2,attack1,defence1,attack2,defence2,home_adv,rho,score1_infered,score2_infered,home_win_p,draw_p,away_win_p,home_cs_p,away_cs_p
0,2.0,0.0,Brentford,Arsenal,1.171558,1.003641,1.508672,1.238727,0.568912,0.173099,1.651598,1.657037,0.38358,0.22991,0.385862,0.190642,0.19168
1,0.0,1.0,Burnley,Arsenal,1.052349,1.083845,1.508672,1.238727,0.568912,0.173099,1.465995,1.529326,0.364377,0.243085,0.392199,0.21665,0.230804
2,5.0,0.0,Manchester City,Arsenal,1.891683,1.566476,1.508672,1.238727,0.568912,0.173099,3.393525,0.943835,0.808127,0.103113,0.065904,0.380241,0.03359
3,0.0,0.0,Brighton and Hove Albion,Arsenal,1.011897,1.127696,1.508672,1.238727,0.568912,0.173099,1.407877,1.463713,0.362844,0.249051,0.38785,0.23135,0.244627
4,0.0,1.0,Brentford,Brighton and Hove Albion,1.171558,1.003641,1.011897,1.127696,0.568912,0.173099,1.845545,1.00829,0.569227,0.226341,0.203761,0.364602,0.157938


In [9]:
# Mean of the 'rps' values that evaluate() returns for the 2021 games.
poisson_decay_rps = poisson_decay_model.evaluate(games)['rps']
np.mean(poisson_decay_rps)

0.21980951365630053

In [10]:
# First five rows of the fitted parameters, ordered by descending attack value.
(
    poisson_decay_model
    .print_parameters()
    .sort_values(by='attack', ascending=False)
    .head()
)

Unnamed: 0,attack,defence,team,home_adv
27,1.891683,1.566476,Manchester City,0.173099
25,1.724495,1.464016,Liverpool,0.173099
28,1.590105,1.332563,Manchester United,0.173099
44,1.589424,1.304167,Tottenham Hotspur,0.173099
14,1.524766,1.332519,Chelsea,0.173099


## Dixon and Coles

In [11]:
from dixon_coles import Dixon_Coles

# Load the match table, keep league ids 2411 and 2412 and drop every row
# that has a missing value in any column.
df = (
    pd.read_csv("../data/fivethirtyeight/spi_matches.csv")
    .loc[lambda matches: matches['league_id'].isin([2411, 2412])]
    .dropna()
)

# Fit on every season except 2021, which is held out for evaluation below.
dc_model = Dixon_Coles(df.loc[df['season'] != 2021])
dc_model.optimize()

In [12]:
# Season-2021 rows, restricted to the score and team columns.
games = df.loc[df['season'] == 2021, ["score1", "score2", "team1", "team2"]]

predictions = dc_model.predict(games)
predictions.head()

Unnamed: 0,score1,score2,team1,team2,attack1,defence1,attack2,defence2,home_adv,rho,score1_infered,score2_infered,home_win_p,draw_p,away_win_p,home_cs_p,away_cs_p
0,2.0,0.0,Brentford,Arsenal,1.171139,1.003608,1.50848,1.238807,0.173108,-0.010986,1.111199,1.656773,0.255222,0.245089,0.498323,0.19075,0.326771
1,0.0,1.0,Burnley,Arsenal,1.052378,1.08436,1.50848,1.238807,0.173108,-0.010986,0.986767,1.528244,0.243405,0.258591,0.498266,0.216038,0.372708
2,5.0,0.0,Manchester City,Arsenal,1.891768,1.567568,1.50848,1.238807,0.173108,-0.010986,2.28432,0.942623,0.672916,0.184668,0.138809,0.388636,0.099788
3,0.0,0.0,Brighton and Hove Albion,Arsenal,1.012627,1.127535,1.50848,1.238807,0.173108,-0.010986,0.948311,1.463666,0.244472,0.265866,0.490878,0.231385,0.387339
4,0.0,1.0,Brentford,Brighton and Hove Albion,1.171139,1.003608,1.012627,1.127535,0.173108,-0.010986,1.241986,1.009059,0.414027,0.286264,0.299664,0.363108,0.288807


In [13]:
# Mean of the 'rps' values that evaluate() returns for the 2021 games.
dc_rps = dc_model.evaluate(games)['rps']
np.mean(dc_rps)

0.20455618318561566

In [14]:
# First five rows of the fitted parameters, ordered by descending attack value.
(
    dc_model
    .print_parameters()
    .sort_values(by='attack', ascending=False)
    .head()
)

Unnamed: 0,attack,defence,team,home_adv,rho
27,1.891768,1.567568,Manchester City,0.173108,-0.010986
25,1.724556,1.463515,Liverpool,0.173108,-0.010986
28,1.590945,1.331583,Manchester United,0.173108,-0.010986
44,1.588999,1.304573,Tottenham Hotspur,0.173108,-0.010986
14,1.525255,1.332117,Chelsea,0.173108,-0.010986


## Bayesian

In [15]:
from bayesian import Bayesian

# Load the match table, keep league ids 2411 and 2412 and drop every row
# that has a missing value in any column.
df = (
    pd.read_csv("../data/fivethirtyeight/spi_matches.csv")
    .loc[lambda matches: matches['league_id'].isin([2411, 2412])]
    .dropna()
)

# Fit on every season except 2021, which is held out for evaluation below.
model = Bayesian(df.loc[df['season'] != 2021])
model.fit()

Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (6 chains in 6 jobs)
NUTS: [def_star, tau_def, atts_star, tau_att, intercept, home]


Sampling 6 chains for 1_000 tune and 2_000 draw iterations (6_000 + 12_000 draws total) took 24 seconds.
The number of effective samples is smaller than 25% for some parameters.


In [16]:
# Build the season-2021 evaluation set for the Bayesian model.
# The result is bound to a new name instead of overwriting `df`, so re-running
# this cell does not merge the team indices a second time (which would leave
# duplicate `hg`/`ag` columns in the frame).
# NOTE(review): only league 2411 is kept here, while the Poisson, time-decay
# and Dixon-Coles cells in this notebook evaluate on leagues 2411 and 2412
# together, so the mean RPS values are not computed on the same set of games —
# confirm this is intended.
bayesian_test_df = (
    df
    # Callable masks are evaluated on the frame at that point of the chain, so
    # they do not rely on index alignment with the outer `df`.
    .loc[lambda matches: matches['league_id'] == 2411]
    .dropna()
    .loc[lambda matches: matches['season'] == 2021]
    # Attach the model's index for team1 as `hg` ...
    .merge(model.teams, left_on="team1", right_on="team")
    .rename(columns={"team_index": "hg"})
    .drop(["team"], axis=1)
    # ... and for team2 as `ag`.
    .merge(model.teams, left_on="team2", right_on="team")
    .rename(columns={"team_index": "ag"})
    .drop(["team"], axis=1)
    .sort_values("date")
)
games = bayesian_test_df.loc[:, ["score1", "score2", "team1", "team2", "hg", "ag"]]
model.predict(games).head()

Unnamed: 0,score1,score2,team1,team2,hg,ag,attack1,defence1,attack2,defence2,home_adv,intercept,score1_infered,score2_infered,home_win_p,draw_p,away_win_p,home_cs_p,away_cs_p
0,2.0,0.0,Brentford,Arsenal,7,1,0.149231,-0.109722,0.467286,-0.229276,0.199443,0.111584,1.259837,1.598631,0.301255,0.24583,0.452604,0.202163,0.283627
1,0.0,1.0,Burnley,Arsenal,10,1,-0.029463,-0.12724,0.467286,-0.229276,0.199443,0.111584,1.05368,1.570871,0.254567,0.252204,0.492983,0.207861,0.348572
2,5.0,0.0,Manchester City,Arsenal,27,1,0.794973,-0.54893,0.467286,-0.229276,0.199443,0.111584,2.403016,1.030393,0.673612,0.177606,0.145406,0.355666,0.090444
3,0.0,0.0,Brighton and Hove Albion,Arsenal,8,1,-0.084753,-0.133432,0.467286,-0.229276,0.199443,0.111584,0.997003,1.561174,0.241756,0.253849,0.504163,0.209887,0.368902
4,0.0,1.0,Brentford,Brighton and Hove Albion,7,8,0.149231,-0.109722,-0.084753,-0.133432,0.199443,0.111584,1.386561,0.920451,0.477056,0.273394,0.249445,0.398299,0.249932


In [17]:
# Mean of the 'rps' values that evaluate() returns for the 2021 games.
bayesian_rps = model.evaluate(games)['rps']
np.mean(bayesian_rps)

0.19222340235353397

In [18]:
# Posterior-mean attack and defence value per team. Averaging over the draw
# axis in a single vectorised call replaces a Python-level loop that visited
# every draw once for every team.
# assumes model.trace["atts"] / ["defs"] are (draws, teams) arrays — TODO confirm
attack_means = np.asarray(model.trace["atts"]).mean(axis=0)[: model.league_size]
defence_means = np.asarray(model.trace["defs"]).mean(axis=0)[: model.league_size]

parameter_df = pd.DataFrame(
    {
        "attack": attack_means,
        "defence": defence_means,
        # presumably model.teams is ordered by team_index, so row k describes
        # team k — verify against the model class
        "team": np.array(model.teams.team_index.values),
    }
)

# Attach team names through the team1/hg pairs seen in the training games and
# add the posterior means of the two scalar parameters.
aggregate_df = (
    model.games.loc[:, ["team1", "hg"]]
    .drop_duplicates()
    .merge(parameter_df, left_on='hg', right_on='team')
    .assign(
        home_adv=np.mean(model.trace["home"]),
        intercept=np.mean(model.trace["intercept"]),
    )
    .drop(["hg", "team"], axis=1)
)
aggregate_df.sort_values('attack', ascending=False).head()

Unnamed: 0,team1,attack,defence,home_adv,intercept
2,Manchester City,0.794973,-0.54893,0.199443,0.111584
19,Liverpool,0.646241,-0.472064,0.199443,0.111584
16,Tottenham Hotspur,0.519044,-0.388415,0.199443,0.111584
9,Chelsea,0.46942,-0.373922,0.199443,0.111584
7,Arsenal,0.467286,-0.229276,0.199443,0.111584


## Bayesian Decay

In [19]:
from bayesian_decay import Bayesian_Time_Decay

# Load the match table, keep league ids 2411 and 2412 and drop every row
# that has a missing value in any column.
df = (
    pd.read_csv("../data/fivethirtyeight/spi_matches.csv")
    .loc[lambda matches: matches['league_id'].isin([2411, 2412])]
    .dropna()
)

# Fit on every season except 2021, which is held out for evaluation below.
model = Bayesian_Time_Decay(df.loc[df['season'] != 2021])
model.fit()

Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (6 chains in 6 jobs)
NUTS: [def_star, tau_def, atts_star, tau_att, intercept, home]


Sampling 6 chains for 1_000 tune and 2_000 draw iterations (6_000 + 12_000 draws total) took 25 seconds.


In [20]:
# Build the season-2021 evaluation set for the time-decayed Bayesian model.
# The result is bound to a new name instead of overwriting `df`, so re-running
# this cell does not merge the team indices a second time (which would leave
# duplicate `hg`/`ag` columns in the frame).
# NOTE(review): only league 2411 is kept here, while the Poisson, time-decay
# and Dixon-Coles cells in this notebook evaluate on leagues 2411 and 2412
# together, so the mean RPS values are not computed on the same set of games —
# confirm this is intended.
bayesian_decay_test_df = (
    df
    # Callable masks are evaluated on the frame at that point of the chain, so
    # they do not rely on index alignment with the outer `df`.
    .loc[lambda matches: matches['league_id'] == 2411]
    .dropna()
    .loc[lambda matches: matches['season'] == 2021]
    # Attach the model's index for team1 as `hg` ...
    .merge(model.teams, left_on="team1", right_on="team")
    .rename(columns={"team_index": "hg"})
    .drop(["team"], axis=1)
    # ... and for team2 as `ag`.
    .merge(model.teams, left_on="team2", right_on="team")
    .rename(columns={"team_index": "ag"})
    .drop(["team"], axis=1)
    .sort_values("date")
)
games = bayesian_decay_test_df.loc[:, ["score1", "score2", "team1", "team2", "hg", "ag"]]
model.predict(games).head()

Unnamed: 0,score1,score2,team1,team2,hg,ag,attack1,defence1,attack2,defence2,home_adv,intercept,score1_infered,score2_infered,home_win_p,draw_p,away_win_p,home_cs_p,away_cs_p
0,2.0,0.0,Brentford,Arsenal,7,1,0.199645,-0.137212,0.346485,-0.195994,0.173061,0.125142,1.352363,1.397123,0.361982,0.255404,0.382424,0.247287,0.258601
1,0.0,1.0,Burnley,Arsenal,10,1,-0.066898,-0.073833,0.346485,-0.195994,0.173061,0.125142,1.035941,1.488538,0.263414,0.26023,0.476182,0.225699,0.354835
2,5.0,0.0,Manchester City,Arsenal,27,1,0.719167,-0.426302,0.346485,-0.195994,0.173061,0.125142,2.273626,1.046368,0.647246,0.189056,0.161269,0.350363,0.102937
3,0.0,0.0,Brighton and Hove Albion,Arsenal,8,1,-0.101944,-0.105561,0.346485,-0.195994,0.173061,0.125142,1.000265,1.44205,0.261765,0.265567,0.472528,0.23644,0.367734
4,0.0,1.0,Brentford,Brighton and Hove Albion,7,8,0.199645,-0.137212,-0.101944,-0.105561,0.173061,0.125142,1.480362,0.892245,0.509223,0.263544,0.227073,0.409671,0.227554


In [21]:
# Mean of the 'rps' values that evaluate() returns for the 2021 games.
bayesian_decay_rps = model.evaluate(games)['rps']
np.mean(bayesian_decay_rps)

0.19597881520140314

In [22]:
# Posterior-mean attack and defence value per team. Averaging over the draw
# axis in a single vectorised call replaces a Python-level loop that visited
# every draw once for every team.
# assumes model.trace["atts"] / ["defs"] are (draws, teams) arrays — TODO confirm
attack_means = np.asarray(model.trace["atts"]).mean(axis=0)[: model.league_size]
defence_means = np.asarray(model.trace["defs"]).mean(axis=0)[: model.league_size]

parameter_df = pd.DataFrame(
    {
        "attack": attack_means,
        "defence": defence_means,
        # presumably model.teams is ordered by team_index, so row k describes
        # team k — verify against the model class
        "team": np.array(model.teams.team_index.values),
    }
)

# Attach team names through the team1/hg pairs seen in the training games and
# add the posterior means of the two scalar parameters.
aggregate_df = (
    model.games.loc[:, ["team1", "hg"]]
    .drop_duplicates()
    .merge(parameter_df, left_on='hg', right_on='team')
    .assign(
        home_adv=np.mean(model.trace["home"]),
        intercept=np.mean(model.trace["intercept"]),
    )
    .drop(["hg", "team"], axis=1)
)
aggregate_df.sort_values('attack', ascending=False).head()

Unnamed: 0,team1,attack,defence,home_adv,intercept
2,Manchester City,0.719167,-0.426302,0.173061,0.125142
19,Liverpool,0.555022,-0.35862,0.173061,0.125142
10,Manchester United,0.427095,-0.265265,0.173061,0.125142
16,Tottenham Hotspur,0.424058,-0.244861,0.173061,0.125142
9,Chelsea,0.363022,-0.266825,0.173061,0.125142


## Bayesian XG

In [23]:
from bayesian_xg import Bayesian_XG

# Load the match table, keep league ids 2411 and 2412 and drop every row
# that has a missing value in any column.
df = (
    pd.read_csv("../data/fivethirtyeight/spi_matches.csv")
    .loc[lambda matches: matches['league_id'].isin([2411, 2412])]
    .dropna()
)

# Fit on every season except 2021, which is held out for evaluation below.
model = Bayesian_XG(df.loc[df['season'] != 2021])
model.fit()

Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (6 chains in 6 jobs)
NUTS: [def_star, tau_def, atts_star, tau_att, intercept, home]


Sampling 6 chains for 1_000 tune and 2_000 draw iterations (6_000 + 12_000 draws total) took 24 seconds.


In [24]:
# Build the season-2021 evaluation set for the Bayesian XG model.
# The result is bound to a new name instead of overwriting `df`, so re-running
# this cell does not merge the team indices a second time (which would leave
# duplicate `hg`/`ag` columns in the frame).
# NOTE(review): only league 2411 is kept here, while the Poisson, time-decay
# and Dixon-Coles cells in this notebook evaluate on leagues 2411 and 2412
# together, so the mean RPS values are not computed on the same set of games —
# confirm this is intended.
bayesian_xg_test_df = (
    df
    # Callable masks are evaluated on the frame at that point of the chain, so
    # they do not rely on index alignment with the outer `df`.
    .loc[lambda matches: matches['league_id'] == 2411]
    .dropna()
    .loc[lambda matches: matches['season'] == 2021]
    # Attach the model's index for team1 as `hg` ...
    .merge(model.teams, left_on="team1", right_on="team")
    .rename(columns={"team_index": "hg"})
    .drop(["team"], axis=1)
    # ... and for team2 as `ag`.
    .merge(model.teams, left_on="team2", right_on="team")
    .rename(columns={"team_index": "ag"})
    .drop(["team"], axis=1)
    .sort_values("date")
)
games = bayesian_xg_test_df.loc[:, ["score1", "score2", "team1", "team2", "hg", "ag"]]
model.predict(games).head()

Unnamed: 0,score1,score2,team1,team2,hg,ag,attack1,defence1,attack2,defence2,home_adv,intercept,score1_infered,score2_infered,home_win_p,draw_p,away_win_p,home_cs_p,away_cs_p
0,2.0,0.0,Brentford,Arsenal,7,1,0.122778,-0.172533,0.219132,-0.04879,0.165258,0.147187,1.471722,1.213836,0.431013,0.25648,0.312318,0.297011,0.229521
1,0.0,1.0,Burnley,Arsenal,10,1,-0.017531,0.044962,0.219132,-0.04879,0.165258,0.147187,1.279059,1.508749,0.322094,0.251546,0.426126,0.221174,0.27825
2,5.0,0.0,Manchester City,Arsenal,27,1,0.614074,-0.332681,0.219132,-0.04879,0.165258,0.147187,2.40543,1.034209,0.673172,0.177579,0.145853,0.354304,0.090225
3,0.0,0.0,Brighton and Hove Albion,Arsenal,8,1,0.028298,-0.050882,0.219132,-0.04879,0.165258,0.147187,1.339041,1.370858,0.363804,0.257595,0.37843,0.253869,0.262073
4,0.0,1.0,Brentford,Brighton and Hove Albion,7,8,0.122778,-0.172533,0.028298,-0.050882,0.165258,0.147187,1.468646,1.002956,0.478891,0.262879,0.258072,0.36674,0.230234


In [25]:
# Mean of the 'rps' values that evaluate() returns for the 2021 games.
bayesian_xg_rps = model.evaluate(games)['rps']
np.mean(bayesian_xg_rps)

0.19510248394084523

In [26]:
# Posterior-mean attack and defence value per team. Averaging over the draw
# axis in a single vectorised call replaces a Python-level loop that visited
# every draw once for every team.
# assumes model.trace["atts"] / ["defs"] are (draws, teams) arrays — TODO confirm
attack_means = np.asarray(model.trace["atts"]).mean(axis=0)[: model.league_size]
defence_means = np.asarray(model.trace["defs"]).mean(axis=0)[: model.league_size]

parameter_df = pd.DataFrame(
    {
        "attack": attack_means,
        "defence": defence_means,
        # presumably model.teams is ordered by team_index, so row k describes
        # team k — verify against the model class
        "team": np.array(model.teams.team_index.values),
    }
)

# Attach team names through the team1/hg pairs seen in the training games and
# add the posterior means of the two scalar parameters.
aggregate_df = (
    model.games.loc[:, ["team1", "hg"]]
    .drop_duplicates()
    .merge(parameter_df, left_on='hg', right_on='team')
    .assign(
        home_adv=np.mean(model.trace["home"]),
        intercept=np.mean(model.trace["intercept"]),
    )
    .drop(["hg", "team"], axis=1)
)
aggregate_df.sort_values('attack', ascending=False).head()

Unnamed: 0,team1,attack,defence,home_adv,intercept
2,Manchester City,0.614074,-0.332681,0.165258,0.147187
19,Liverpool,0.508975,-0.183587,0.165258,0.147187
9,Chelsea,0.363822,-0.277196,0.165258,0.147187
16,Tottenham Hotspur,0.304373,-0.064063,0.165258,0.147187
10,Manchester United,0.296718,-0.147616,0.165258,0.147187


## Soccer Power Index

In [27]:
from spi import SPI

# Season-2021 matches from league ids 2411 and 2412 that already have a
# recorded score1 value; no other missing-value filtering is applied here.
df = (
    pd.read_csv("../data/fivethirtyeight/spi_matches.csv")
    .loc[lambda matches: matches['league_id'].isin([2411, 2412])]
    .loc[lambda matches: matches['season'] == 2021]
    .loc[lambda matches: matches['score1'].notna()]
)

spi = SPI(df)

In [28]:
# Preview the first rows of the SPI predictions.
spi_predictions = spi.predict()
spi_predictions.head()

Unnamed: 0,proj_score1,proj_score2,score1,score2,team1,team2,home_win_p,away_win_p,draw_p,home_cs_p,away_cs_p
44457,1.77,1.13,2.0,2.0,AFC Bournemouth,West Bromwich Albion,0.5183,0.2386,0.2431,0.32287,0.170329
44473,1.37,1.24,1.0,1.0,Cardiff City,Barnsley,0.3939,0.3338,0.2723,0.289358,0.254095
44474,1.47,1.05,1.0,1.0,Derby County,Huddersfield Town,0.4612,0.2674,0.2715,0.349886,0.229922
44475,1.48,1.05,3.0,0.0,Luton Town,Peterborough United,0.4643,0.265,0.2707,0.349884,0.227634
44476,1.4,1.32,1.0,1.0,Bristol City,Blackpool,0.3866,0.3468,0.2666,0.267107,0.246579


In [29]:
# Mean of the 'rps' values that evaluate() returns.
spi_rps = spi.evaluate()['rps']
np.mean(spi_rps)

0.20653926514778326

## Betting Odds

In [30]:
# Read the full-time goals and the B365 home/draw/away odds, then turn the
# odds into probabilities: each inverse odd is divided by the sum of the three
# inverse odds (`total`), so the three probabilities of a match add up to 1.
df = (
    pd.read_csv('../data/betting/2021-22.csv')
    .loc[:, ["FTHG", "FTAG", "B365H", "B365D", "B365A"]]
    .rename(columns={
        "FTHG": "score1",
        "FTAG": "score2",
        "B365H": "home_win_p",
        "B365D": "draw_p",
        "B365A": "away_win_p",
    })
    .assign(
        total=lambda odds: (
            100 / odds['home_win_p'] + 100 / odds['draw_p'] + 100 / odds['away_win_p']
        )
    )
    # Each lambda below reads only its own, still unconverted, odds column.
    .assign(
        home_win_p=lambda odds: 100 / odds['home_win_p'] / odds['total'],
        away_win_p=lambda odds: 100 / odds['away_win_p'] / odds['total'],
        draw_p=lambda odds: 100 / odds['draw_p'] / odds['total'],
    )
)

In [31]:
from ranked_probability_score import ranked_probability_score, match_outcome

# Store the result of match_outcome for every match, then score the three
# odds-derived probabilities against it row by row.
df["winner"] = match_outcome(df)

df["rps"] = df.apply(
    lambda match: ranked_probability_score(
        [match[column] for column in ("home_win_p", "draw_p", "away_win_p")],
        int(match["winner"]),
    ),
    axis=1,
)

np.mean(df['rps'])

0.18133581928678952