# Score Predictions

Evaluate the performance of the score prediction models from [source](https://journals.sagepub.com/doi/full/10.1177/1471082X18817650).

## Poisson

In [1]:
%load_ext autoreload

In [2]:
import pandas as pd
import numpy as np

In [3]:
from poisson import Poisson

# Read the FiveThirtyEight match file, keep only league ids 2411 and 2412, and
# discard every row that has a missing value in any column.
df = (
    pd.read_csv("../data/fivethirtyeight/spi_matches.csv")
    .loc[lambda matches: matches['league_id'].isin([2411, 2412])]
    .dropna()
)

# Fit on every season except 2021; the 2021 rows are used for evaluation.
poisson_model = Poisson(
    df.loc[df['season'] != 2021]
)
poisson_model.optimize()

In [4]:
# 2021 fixtures reduced to the observed scores and the two team names.
games = df.loc[
    df['season'] == 2021,
    ["score1", "score2", "team1", "team2"],
]
predictions = poisson_model.predict(games)
predictions.head()

Unnamed: 0,score1,score2,team1,team2,attack1,defence1,attack2,defence2,home_adv,rho,score1_infered,score2_infered,home_win_p,draw_p,away_win_p,home_cs_p,away_cs_p
0,2.0,0.0,Brentford,Arsenal,1.136242,0.987929,1.542468,1.190928,0.586809,0.199537,1.702545,1.741138,0.379243,0.224783,0.395129,0.175252,0.182137
1,0.0,1.0,Burnley,Arsenal,1.028134,1.079049,1.542468,1.190928,0.586809,0.199537,1.528086,1.589499,0.36772,0.237632,0.394208,0.203989,0.216896
2,5.0,0.0,Manchester City,Arsenal,1.871381,1.55902,1.542468,1.190928,0.586809,0.199537,3.551118,0.983584,0.811564,0.096836,0.062831,0.363213,0.028692
3,0.0,1.0,Brentford,Brighton and Hove Albion,1.136242,0.987929,0.971303,1.089823,0.586809,0.199537,1.883683,0.983512,0.58368,0.222279,0.193281,0.373714,0.152028
4,1.0,2.0,Burnley,Brighton and Hove Albion,1.028134,1.079049,0.971303,1.089823,0.586809,0.199537,1.690663,0.897856,0.560572,0.240495,0.198555,0.40729,0.184396


In [5]:
# Mean of the 'rps' entries that evaluate() returns for the 2021 fixtures.
poisson_rps = poisson_model.evaluate(games)['rps']
np.mean(poisson_rps)

0.22301742882319794

In [6]:
# Teams with the largest fitted attack parameter (first rows only).
# NOTE(review): the home_adv value displayed for this cell (0.199537) equals
# the `rho` column of the predict() output, where home_adv is 0.586809.
# Verify the column labelling inside the Poisson class.
(
    poisson_model.print_parameters()
    .sort_values(by='attack', ascending=False)
    .head()
)

Unnamed: 0,attack,defence,team,home_adv
27,1.871381,1.55902,Manchester City,0.199537
25,1.722475,1.465936,Liverpool,0.199537
44,1.594428,1.369068,Tottenham Hotspur,0.199537
14,1.543177,1.352297,Chelsea,0.199537
1,1.542468,1.190928,Arsenal,0.199537


## Time Decayed Poisson

In [7]:
from poisson_decay import Poisson_Time_Decay

# Read the FiveThirtyEight match file, keep only league ids 2411 and 2412, and
# discard every row that has a missing value in any column.
df = (
    pd.read_csv("../data/fivethirtyeight/spi_matches.csv")
    .loc[lambda matches: matches['league_id'].isin([2411, 2412])]
    .dropna()
)

# Fit on every season except 2021; the 2021 rows are used for evaluation.
poisson_decay_model = Poisson_Time_Decay(
    df.loc[df['season'] != 2021]
)
poisson_decay_model.optimize()

In [8]:
# 2021 fixtures reduced to the observed scores and the two team names.
games = df.loc[
    df['season'] == 2021,
    ["score1", "score2", "team1", "team2"],
]
predictions = poisson_decay_model.predict(games)
predictions.head()

Unnamed: 0,score1,score2,team1,team2,attack1,defence1,attack2,defence2,home_adv,rho,score1_infered,score2_infered,home_win_p,draw_p,away_win_p,home_cs_p,away_cs_p
0,2.0,0.0,Brentford,Arsenal,1.171558,1.003641,1.508672,1.238726,0.568913,0.173099,1.651601,1.657038,0.38358,0.22991,0.385861,0.190642,0.19168
1,0.0,1.0,Burnley,Arsenal,1.052349,1.083845,1.508672,1.238726,0.568913,0.173099,1.465998,1.529326,0.364378,0.243085,0.392199,0.21665,0.230803
2,5.0,0.0,Manchester City,Arsenal,1.891684,1.566476,1.508672,1.238726,0.568913,0.173099,3.393531,0.943835,0.808127,0.103112,0.065904,0.380241,0.03359
3,0.0,1.0,Brentford,Brighton and Hove Albion,1.171558,1.003641,1.011897,1.127695,0.568913,0.173099,1.845549,1.00829,0.569228,0.226341,0.203761,0.364602,0.157937
4,1.0,2.0,Burnley,Brighton and Hove Albion,1.052349,1.083845,1.011897,1.127695,0.568913,0.173099,1.63815,0.930579,0.539677,0.246281,0.213732,0.394205,0.194338


In [9]:
# Mean of the 'rps' entries that evaluate() returns for the 2021 fixtures.
poisson_decay_rps = poisson_decay_model.evaluate(games)['rps']
np.mean(poisson_decay_rps)

0.22144856017446618

In [10]:
# Teams with the largest fitted attack parameter (first rows only).
# NOTE(review): the home_adv value displayed for this cell (0.173099) equals
# the `rho` column of the predict() output, where home_adv is 0.568913.
# Verify the column labelling inside the Poisson_Time_Decay class.
(
    poisson_decay_model.print_parameters()
    .sort_values(by='attack', ascending=False)
    .head()
)

Unnamed: 0,attack,defence,team,home_adv
27,1.891684,1.566476,Manchester City,0.173099
25,1.724495,1.464017,Liverpool,0.173099
28,1.590104,1.332563,Manchester United,0.173099
44,1.589424,1.304167,Tottenham Hotspur,0.173099
14,1.524766,1.332519,Chelsea,0.173099


## Dixon and Coles

In [11]:
from dixon_coles import Dixon_Coles

# Read the FiveThirtyEight match file, keep only league ids 2411 and 2412, and
# discard every row that has a missing value in any column.
df = (
    pd.read_csv("../data/fivethirtyeight/spi_matches.csv")
    .loc[lambda matches: matches['league_id'].isin([2411, 2412])]
    .dropna()
)

# Fit on every season except 2021; the 2021 rows are used for evaluation.
dc_model = Dixon_Coles(
    df.loc[df['season'] != 2021]
)
dc_model.optimize()

In [12]:
# 2021 fixtures reduced to the observed scores and the two team names.
games = df.loc[
    df['season'] == 2021,
    ["score1", "score2", "team1", "team2"],
]
predictions = dc_model.predict(games)
predictions.head()

Unnamed: 0,score1,score2,team1,team2,attack1,defence1,attack2,defence2,home_adv,rho,score1_infered,score2_infered,home_win_p,draw_p,away_win_p,home_cs_p,away_cs_p
0,2.0,0.0,Brentford,Arsenal,1.171139,1.003606,1.50848,1.238809,0.173107,-0.010986,1.111196,1.656777,0.255221,0.245089,0.498325,0.190749,0.326771
1,0.0,1.0,Burnley,Arsenal,1.05238,1.084361,1.50848,1.238809,0.173107,-0.010986,0.986766,1.528244,0.243405,0.258591,0.498266,0.216038,0.372709
2,5.0,0.0,Manchester City,Arsenal,1.891768,1.567569,1.50848,1.238809,0.173107,-0.010986,2.284315,0.942623,0.672916,0.184668,0.138809,0.388637,0.099789
3,0.0,1.0,Brentford,Brighton and Hove Albion,1.171139,1.003606,1.012627,1.127534,0.173107,-0.010986,1.241987,1.009062,0.414027,0.286264,0.299665,0.363107,0.288807
4,1.0,2.0,Burnley,Brighton and Hove Albion,1.05238,1.084361,1.012627,1.127534,0.173107,-0.010986,1.102911,0.930779,0.38959,0.3082,0.302025,0.393943,0.333438


In [13]:
# Mean of the 'rps' entries that evaluate() returns for the 2021 fixtures.
dc_rps = dc_model.evaluate(games)['rps']
np.mean(dc_rps)

0.20454476845496408

In [14]:
# Teams with the largest fitted attack parameter (first rows only).
(
    dc_model.print_parameters()
    .sort_values(by='attack', ascending=False)
    .head()
)

Unnamed: 0,attack,defence,team,home_adv,rho
27,1.891768,1.567569,Manchester City,0.173107,-0.010986
25,1.724556,1.463513,Liverpool,0.173107,-0.010986
28,1.590945,1.331582,Manchester United,0.173107,-0.010986
44,1.588999,1.304573,Tottenham Hotspur,0.173107,-0.010986
14,1.525256,1.332118,Chelsea,0.173107,-0.010986


## Bayesian

In [15]:
from bayesian import Bayesian

# Read the FiveThirtyEight match file, keep only league ids 2411 and 2412, and
# discard every row that has a missing value in any column.
df = (
    pd.read_csv("../data/fivethirtyeight/spi_matches.csv")
    .loc[lambda matches: matches['league_id'].isin([2411, 2412])]
    .dropna()
)

# Fit on every season except 2021; the 2021 rows are used for evaluation.
# NOTE(review): no seed is passed to fit(); unless the class seeds the sampler
# internally, the posterior summaries will vary slightly between runs.
model = Bayesian(
    df.loc[df['season'] != 2021]
)
model.fit()

Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (6 chains in 6 jobs)
NUTS: [def_star, tau_def, atts_star, tau_att, intercept, home]


Sampling 6 chains for 1_000 tune and 2_000 draw iterations (6_000 + 12_000 draws total) took 27 seconds.
The number of effective samples is smaller than 25% for some parameters.


In [16]:
# Build the 2021 evaluation set for the fitted model: league 2411 fixtures
# joined twice to the model's team table, so that team1 and team2 each carry
# their team_index (renamed to `hg` and `ag` respectively).
#
# The merged frame is stored under its own name instead of overwriting `df`,
# so re-running this cell no longer merges the team table onto a frame that
# already contains hg/ag. Both filters are evaluated on the frame being
# filtered rather than through a mask taken from the outer `df`.
#
# NOTE(review): only league 2411 is scored here, whereas the Poisson and
# Dixon and Coles sections score 2021 games from both 2411 and 2412. Confirm
# that the RPS figures are meant to be compared directly.
games_2021 = (
    df
    .loc[lambda matches: (matches['league_id'] == 2411) & (matches['season'] == 2021)]
    .dropna()
    .merge(model.teams, left_on="team1", right_on="team")
    .rename(columns={"team_index": "hg"})
    .drop(["team"], axis=1)
    .merge(model.teams, left_on="team2", right_on="team")
    .rename(columns={"team_index": "ag"})
    .drop(["team"], axis=1)
    .sort_values("date")
)
games = games_2021.loc[:, ["score1", "score2", "team1", "team2", "hg", "ag"]]
model.predict(games).head()

Unnamed: 0,score1,score2,team1,team2,hg,ag,attack1,defence1,attack2,defence2,home_adv,intercept,score1_infered,score2_infered,home_win_p,draw_p,away_win_p,home_cs_p,away_cs_p
0,2.0,0.0,Brentford,Arsenal,7,1,0.149653,-0.110053,0.466687,-0.229498,0.199425,0.111823,1.260366,1.597526,0.301577,0.245897,0.452216,0.202386,0.283477
1,0.0,1.0,Burnley,Arsenal,10,1,-0.028467,-0.126916,0.466687,-0.229498,0.199425,0.111823,1.054728,1.570813,0.25484,0.252195,0.492719,0.207873,0.348207
2,5.0,0.0,Manchester City,Arsenal,27,1,0.794361,-0.549155,0.466687,-0.229498,0.199425,0.111823,2.40154,1.02979,0.673496,0.177701,0.14544,0.355885,0.090577
3,0.0,1.0,Brentford,Brighton and Hove Albion,7,8,0.149653,-0.110053,-0.084728,-0.133442,0.199425,0.111823,1.387438,0.92039,0.47731,0.273299,0.249284,0.398324,0.249713
4,1.0,2.0,Burnley,Brighton and Hove Albion,10,8,-0.028467,-0.126916,-0.084728,-0.133442,0.199425,0.111823,1.161066,0.904999,0.416365,0.299128,0.284473,0.40453,0.313151


In [17]:
# Mean of the 'rps' entries that evaluate() returns for the evaluation games.
rps_values = model.evaluate(games)['rps']
np.mean(rps_values)

0.19804332338819447

In [18]:
# Posterior-mean attack and defence value for each team index.
# The draws are averaged with one vectorised reduction per parameter instead
# of a Python-level loop over every draw for every team.
# assumes model.trace["atts"] / ["defs"] hold one row per draw and one entry
# per team index, which is how the previous per-team loop indexed them.
# TODO confirm against the model class.
n_teams = model.league_size
parameter_df = pd.DataFrame({
    "attack": np.asarray(model.trace["atts"]).mean(axis=0)[:n_teams],
    "defence": np.asarray(model.trace["defs"]).mean(axis=0)[:n_teams],
    # Despite its name, this column holds the integer team index.
    "team": np.array(model.teams.team_index.values),
})

# Attach the team name via the home-team index found in the model's games,
# then add the posterior means of the shared home and intercept terms.
aggregate_df = (
    model.games.loc[:, ["team1", "hg"]]
    .drop_duplicates()
    .merge(parameter_df, left_on='hg', right_on='team')
    .assign(
        home_adv=np.mean(model.trace["home"]),
        intercept=np.mean(model.trace["intercept"]),
    )
    .drop(["hg", "team"], axis=1)
)
aggregate_df.sort_values('attack', ascending=False).head()

Unnamed: 0,team1,attack,defence,home_adv,intercept
3,Manchester City,0.794361,-0.549155,0.199425,0.111823
19,Liverpool,0.645263,-0.472328,0.199425,0.111823
13,Tottenham Hotspur,0.519075,-0.388038,0.199425,0.111823
9,Chelsea,0.469242,-0.373585,0.199425,0.111823
8,Arsenal,0.466687,-0.229498,0.199425,0.111823


## Bayesian Decay

In [19]:
from bayesian_decay import Bayesian_Time_Decay

# Read the FiveThirtyEight match file, keep only league ids 2411 and 2412, and
# discard every row that has a missing value in any column.
df = (
    pd.read_csv("../data/fivethirtyeight/spi_matches.csv")
    .loc[lambda matches: matches['league_id'].isin([2411, 2412])]
    .dropna()
)

# Fit on every season except 2021; the 2021 rows are used for evaluation.
# NOTE(review): no seed is passed to fit(); unless the class seeds the sampler
# internally, the posterior summaries will vary slightly between runs.
model = Bayesian_Time_Decay(
    df.loc[df['season'] != 2021]
)
model.fit()

Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (6 chains in 6 jobs)
NUTS: [def_star, tau_def, atts_star, tau_att, intercept, home]


Sampling 6 chains for 1_000 tune and 2_000 draw iterations (6_000 + 12_000 draws total) took 26 seconds.


In [20]:
# Build the 2021 evaluation set for the fitted model: league 2411 fixtures
# joined twice to the model's team table, so that team1 and team2 each carry
# their team_index (renamed to `hg` and `ag` respectively).
#
# The merged frame is stored under its own name instead of overwriting `df`,
# so re-running this cell no longer merges the team table onto a frame that
# already contains hg/ag. Both filters are evaluated on the frame being
# filtered rather than through a mask taken from the outer `df`.
#
# NOTE(review): only league 2411 is scored here, whereas the Poisson and
# Dixon and Coles sections score 2021 games from both 2411 and 2412. Confirm
# that the RPS figures are meant to be compared directly.
games_2021 = (
    df
    .loc[lambda matches: (matches['league_id'] == 2411) & (matches['season'] == 2021)]
    .dropna()
    .merge(model.teams, left_on="team1", right_on="team")
    .rename(columns={"team_index": "hg"})
    .drop(["team"], axis=1)
    .merge(model.teams, left_on="team2", right_on="team")
    .rename(columns={"team_index": "ag"})
    .drop(["team"], axis=1)
    .sort_values("date")
)
games = games_2021.loc[:, ["score1", "score2", "team1", "team2", "hg", "ag"]]
model.predict(games).head()

Unnamed: 0,score1,score2,team1,team2,hg,ag,attack1,defence1,attack2,defence2,home_adv,intercept,score1_infered,score2_infered,home_win_p,draw_p,away_win_p,home_cs_p,away_cs_p
0,2.0,0.0,Brentford,Arsenal,7,1,0.200007,-0.138459,0.34644,-0.196323,0.173643,0.125229,1.353314,1.39544,0.362559,0.255452,0.3818,0.247703,0.258356
1,0.0,1.0,Burnley,Arsenal,10,1,-0.065826,-0.073125,0.34644,-0.196323,0.173643,0.125229,1.037405,1.489654,0.263609,0.260092,0.476124,0.225448,0.354316
2,5.0,0.0,Manchester City,Arsenal,27,1,0.719032,-0.42546,0.34644,-0.196323,0.173643,0.125229,2.274093,1.047293,0.647121,0.189052,0.161395,0.350038,0.102889
3,0.0,1.0,Brentford,Brighton and Hove Albion,7,8,0.200007,-0.138459,-0.103236,-0.105201,0.173643,0.125229,1.482423,0.890061,0.510315,0.263338,0.226186,0.410566,0.227086
4,1.0,2.0,Burnley,Brighton and Hove Albion,10,8,-0.065826,-0.073125,-0.103236,-0.105201,0.173643,0.125229,1.136375,0.950154,0.398342,0.298951,0.302674,0.386672,0.320978


In [21]:
# Mean of the 'rps' entries that evaluate() returns for the evaluation games.
rps_values = model.evaluate(games)['rps']
np.mean(rps_values)

0.20221302058012297

In [22]:
# Posterior-mean attack and defence value for each team index.
# The draws are averaged with one vectorised reduction per parameter instead
# of a Python-level loop over every draw for every team.
# assumes model.trace["atts"] / ["defs"] hold one row per draw and one entry
# per team index, which is how the previous per-team loop indexed them.
# TODO confirm against the model class.
n_teams = model.league_size
parameter_df = pd.DataFrame({
    "attack": np.asarray(model.trace["atts"]).mean(axis=0)[:n_teams],
    "defence": np.asarray(model.trace["defs"]).mean(axis=0)[:n_teams],
    # Despite its name, this column holds the integer team index.
    "team": np.array(model.teams.team_index.values),
})

# Attach the team name via the home-team index found in the model's games,
# then add the posterior means of the shared home and intercept terms.
aggregate_df = (
    model.games.loc[:, ["team1", "hg"]]
    .drop_duplicates()
    .merge(parameter_df, left_on='hg', right_on='team')
    .assign(
        home_adv=np.mean(model.trace["home"]),
        intercept=np.mean(model.trace["intercept"]),
    )
    .drop(["hg", "team"], axis=1)
)
aggregate_df.sort_values('attack', ascending=False).head()

Unnamed: 0,team1,attack,defence,home_adv,intercept
3,Manchester City,0.719032,-0.42546,0.173643,0.125229
19,Liverpool,0.555337,-0.35892,0.173643,0.125229
10,Manchester United,0.426942,-0.26707,0.173643,0.125229
13,Tottenham Hotspur,0.423309,-0.245391,0.173643,0.125229
9,Chelsea,0.363578,-0.265469,0.173643,0.125229


## Bayesian XG

In [23]:
from bayesian_xg import Bayesian_XG

# Read the FiveThirtyEight match file, keep only league ids 2411 and 2412, and
# discard every row that has a missing value in any column.
df = (
    pd.read_csv("../data/fivethirtyeight/spi_matches.csv")
    .loc[lambda matches: matches['league_id'].isin([2411, 2412])]
    .dropna()
)

# Fit on every season except 2021; the 2021 rows are used for evaluation.
# NOTE(review): no seed is passed to fit(); unless the class seeds the sampler
# internally, the posterior summaries will vary slightly between runs.
model = Bayesian_XG(
    df.loc[df['season'] != 2021]
)
model.fit()

Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (6 chains in 6 jobs)
NUTS: [def_star, tau_def, atts_star, tau_att, intercept, home]


Sampling 6 chains for 1_000 tune and 2_000 draw iterations (6_000 + 12_000 draws total) took 24 seconds.


In [24]:
# Build the 2021 evaluation set for the fitted model: league 2411 fixtures
# joined twice to the model's team table, so that team1 and team2 each carry
# their team_index (renamed to `hg` and `ag` respectively).
#
# The merged frame is stored under its own name instead of overwriting `df`,
# so re-running this cell no longer merges the team table onto a frame that
# already contains hg/ag. Both filters are evaluated on the frame being
# filtered rather than through a mask taken from the outer `df`.
#
# NOTE(review): only league 2411 is scored here, whereas the Poisson and
# Dixon and Coles sections score 2021 games from both 2411 and 2412. Confirm
# that the RPS figures are meant to be compared directly.
games_2021 = (
    df
    .loc[lambda matches: (matches['league_id'] == 2411) & (matches['season'] == 2021)]
    .dropna()
    .merge(model.teams, left_on="team1", right_on="team")
    .rename(columns={"team_index": "hg"})
    .drop(["team"], axis=1)
    .merge(model.teams, left_on="team2", right_on="team")
    .rename(columns={"team_index": "ag"})
    .drop(["team"], axis=1)
    .sort_values("date")
)
games = games_2021.loc[:, ["score1", "score2", "team1", "team2", "hg", "ag"]]
model.predict(games).head()

Unnamed: 0,score1,score2,team1,team2,hg,ag,attack1,defence1,attack2,defence2,home_adv,intercept,score1_infered,score2_infered,home_win_p,draw_p,away_win_p,home_cs_p,away_cs_p
0,2.0,0.0,Brentford,Arsenal,7,1,0.123369,-0.17425,0.218824,-0.047979,0.16494,0.147438,1.473688,1.211685,0.431992,0.25641,0.311408,0.29765,0.22907
1,0.0,1.0,Burnley,Arsenal,10,1,-0.017121,0.045422,0.218824,-0.047979,0.16494,0.147438,1.280535,1.509357,0.322353,0.251453,0.425959,0.221039,0.277839
2,5.0,0.0,Manchester City,Arsenal,27,1,0.613408,-0.332136,0.218824,-0.047979,0.16494,0.147438,2.405619,1.034715,0.673091,0.177586,0.145924,0.354125,0.090208
3,0.0,1.0,Brentford,Brighton and Hove Albion,7,8,0.123369,-0.17425,0.027652,-0.050296,0.16494,0.147438,1.470278,1.000841,0.479828,0.262765,0.257249,0.367516,0.229859
4,1.0,2.0,Burnley,Brighton and Hove Albion,10,8,-0.017121,0.045422,0.027652,-0.050296,0.16494,0.147438,1.277571,1.246715,0.373026,0.268479,0.35839,0.287431,0.2787


In [25]:
# Mean of the 'rps' entries that evaluate() returns for the evaluation games.
rps_values = model.evaluate(games)['rps']
np.mean(rps_values)

0.2021603464125936

In [26]:
# Posterior-mean attack and defence value for each team index.
# The draws are averaged with one vectorised reduction per parameter instead
# of a Python-level loop over every draw for every team.
# assumes model.trace["atts"] / ["defs"] hold one row per draw and one entry
# per team index, which is how the previous per-team loop indexed them.
# TODO confirm against the model class.
n_teams = model.league_size
parameter_df = pd.DataFrame({
    "attack": np.asarray(model.trace["atts"]).mean(axis=0)[:n_teams],
    "defence": np.asarray(model.trace["defs"]).mean(axis=0)[:n_teams],
    # Despite its name, this column holds the integer team index.
    "team": np.array(model.teams.team_index.values),
})

# Attach the team name via the home-team index found in the model's games,
# then add the posterior means of the shared home and intercept terms.
aggregate_df = (
    model.games.loc[:, ["team1", "hg"]]
    .drop_duplicates()
    .merge(parameter_df, left_on='hg', right_on='team')
    .assign(
        home_adv=np.mean(model.trace["home"]),
        intercept=np.mean(model.trace["intercept"]),
    )
    .drop(["hg", "team"], axis=1)
)
aggregate_df.sort_values('attack', ascending=False).head()

Unnamed: 0,team1,attack,defence,home_adv,intercept
3,Manchester City,0.613408,-0.332136,0.16494,0.147438
19,Liverpool,0.507287,-0.183169,0.16494,0.147438
9,Chelsea,0.36418,-0.277409,0.16494,0.147438
13,Tottenham Hotspur,0.304015,-0.062237,0.16494,0.147438
10,Manchester United,0.295058,-0.147418,0.16494,0.147438


## Soccer Performance Index

In [27]:
from spi import SPI

# Keep the 2021 fixtures from league ids 2411 and 2412 that have a recorded
# score1. Unlike the model sections, no blanket dropna() is applied, so rows
# with missing values in other columns are retained.
df = (
    pd.read_csv("../data/fivethirtyeight/spi_matches.csv")
    .loc[lambda matches: matches['league_id'].isin([2411, 2412])]
    .loc[lambda matches: matches['season'] == 2021]
    .loc[lambda matches: matches['score1'].notna()]
)

spi = SPI(df)

In [28]:
# Preview the first rows of the table returned by predict().
spi_predictions = spi.predict()
spi_predictions.head()

Unnamed: 0,proj_score1,proj_score2,score1,score2,team1,team2,home_win_p,away_win_p,draw_p,home_cs_p,away_cs_p
44458,1.77,1.13,2.0,2.0,AFC Bournemouth,West Bromwich Albion,0.5183,0.2386,0.2431,0.32287,0.170329
44476,1.47,1.05,1.0,1.0,Derby County,Huddersfield Town,0.4612,0.2674,0.2715,0.349886,0.229922
44479,1.48,1.05,3.0,0.0,Luton Town,Peterborough United,0.4643,0.265,0.2707,0.349884,0.227634
44489,1.49,1.13,2.0,1.0,Blackburn,Swansea City,0.4481,0.2845,0.2674,0.322981,0.225367
44491,1.51,0.99,1.0,4.0,Preston North End,Hull City,0.4854,0.2456,0.2691,0.371511,0.220908


In [29]:
# Mean of the 'rps' entries that evaluate() returns.
spi_rps = spi.evaluate()['rps']
np.mean(spi_rps)

0.20430506232044202