In [1]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
from matplotlib import pyplot as plt
import scipy
import scipy.optimize

In [2]:
from models.model_1 import Model1
from models.model_6 import Model6

In [3]:
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", 100)
pd.set_option("display.max_colwidth", 100)
pd.options.display.float_format = "{: ,.5f}".format

# data

In [4]:
df_events = pd.read_csv("data/events.csv").sort_values(
    [
        "game_id",
        "minute",
    ],
    ignore_index=True,
)
df_games = pd.read_csv("data/games.csv").assign(date=lambda x: pd.to_datetime(x.date))

In [5]:
df = (
    df_games.merge(df_events, how="outer", on=["game_id"])
    .sort_values(by=["game_id", "minute"])
    .query(
        "competition == 'Premier League, England' & '2018-08-01' < date < '2019-08-01'"
    )
    .reset_index(drop=True)
)

In [6]:
list_game_score_minutes = list()

game_id = 0


for i, row in df.iterrows():

    if row.game_id != game_id:
        if game_id != 0:  # Changing match
            # No goal until end-of-match of previous game
            dict_game_score_minutes = dict(
                game_id=game_id,
                home_score=home_score,
                away_score=away_score,
                start_minute=last_minute,
                end_minute=90,
                side="both",
                goal=0,
            )
            list_game_score_minutes.append(dict_game_score_minutes)

        game_id = row.game_id
        home_score, away_score = 0, 0
        last_minute = 0

    if np.isnan(row.minute):  # Match finished 0-0 with no red cards and no goals
        dict_game_score_minutes = dict(
            game_id=game_id,
            home_score=0,
            away_score=0,
            start_minute=0,
            end_minute=90,
            side="both",
            goal=0,
        )
        list_game_score_minutes.append(dict_game_score_minutes)
        continue

    if row.type == "goal":
        # Waiting for the goal
        dict_game_score_minutes = dict(
            game_id=game_id,
            home_score=home_score,
            away_score=away_score,
            start_minute=last_minute,
            end_minute=row.minute,
            side="both",
            goal=0,
        )
        list_game_score_minutes.append(dict_game_score_minutes)
        last_minute = row.minute

        # Scoring the goal
        dict_game_score_minutes = dict(
            game_id=game_id,
            home_score=home_score,
            away_score=away_score,
            start_minute=row.minute,
            end_minute=row.minute,
            side=row.side,
            goal=1,
        )
        list_game_score_minutes.append(dict_game_score_minutes)

        if row.side == "away":
            away_score += 1

        elif row.side == "home":
            home_score += 1

    if i == len(df) - 1:
        # No goal until end-of-match
        dict_game_score_minutes = dict(
            game_id=game_id,
            home_score=home_score,
            away_score=away_score,
            start_minute=last_minute,
            end_minute=90,
            side="both",
            goal=0,
        )
        list_game_score_minutes.append(dict_game_score_minutes)

In [7]:
df_goal_events = pd.DataFrame(list_game_score_minutes).merge(
    df[["game_id", "home_team", "away_team"]].drop_duplicates(ignore_index=True),
    how="left",
    on=["game_id"],
)

In [8]:
df_goal_events

Unnamed: 0,game_id,home_score,away_score,start_minute,end_minute,side,goal,home_team,away_team
0,68,0,0,0.00000,3.00000,both,0,Manchester United,Leicester City
1,68,0,0,3.00000,3.00000,home,1,Manchester United,Leicester City
2,68,1,0,3.00000,83.00000,both,0,Manchester United,Leicester City
3,68,1,0,83.00000,83.00000,home,1,Manchester United,Leicester City
4,68,2,0,83.00000,90.00000,both,0,Manchester United,Leicester City
...,...,...,...,...,...,...,...,...,...
2539,4071,1,2,46.00000,71.00000,both,0,Watford,West Ham United
2540,4071,1,2,71.00000,71.00000,away,1,Watford,West Ham United
2541,4071,1,3,71.00000,78.00000,both,0,Watford,West Ham United
2542,4071,1,3,78.00000,78.00000,away,1,Watford,West Ham United


# models

For match $k$  

Scoring intensities for home and away team respectively are:  

\begin{equation}
\lambda_k(t) = \lambda_k = \alpha_{i(k)}\beta_{j(k)}\gamma_h,
\end{equation}

\begin{equation}
\mu_k(t) = \mu_k = \alpha_{j(k)}\beta_{i(k)}
\end{equation}

The likelihood is:  

\begin{equation}
L(\mathbf{t}_k, \mathbf{J}_k) = \exp(-\Lambda[0, 1]) \exp(-\Upsilon[0, 1]) \prod_{l=1}^{m_k} \lambda_k(t_{k,l})^{1-J_{k,l}} \mu_k(t_{k,l})^{J_{k,l}},
\end{equation}

\begin{equation}
\Lambda[t_1, t_2] = \int_{t_1}^{t_2} \lambda_k(t) dt
\end{equation}

\begin{equation}
\Upsilon[t_1, t_2] = \int_{t_1}^{t_2} \mu_k(t) dt  
\end{equation}

where  
$J$ is indicator for home goal  
$m_k$ is the total number of goals in the match  
  
The likelihood can be broken down into the first 2 terms which is the time waiting for goals to occur. This is the time spent between the goals and the time spent after the last goal. Related to the survival function of an exponential distribution, representing the probability of scoring 0 goals.  
And the final 2 terms which are probability density of a goal being scored at time t.    

In [9]:
teams_home = np.asarray(df_goal_events.home_team.values, dtype=str, order="C")
teams_away = np.asarray(df_goal_events.away_team.values, dtype=str, order="C")

In [10]:
start_minutes = np.asarray(
    df_goal_events.start_minute.values, dtype=np.float64, order="C"
)
end_minutes = np.asarray(df_goal_events.end_minute.values, dtype=np.float64, order="C")
goals = np.asarray(df_goal_events.goal.values, dtype=bool, order="C")
home_scores = np.asarray(df_goal_events.home_score.values, dtype=np.int64, order="C")
away_scores = np.asarray(df_goal_events.away_score.values, dtype=np.int64, order="C")
homes = np.asarray((df_goal_events.side == "home"), dtype=bool, order="C")

In [11]:
def callback_function(xk, loss_function):
    # xk is the current parameter vector
    # Print the current iteration values
    global iteration
    iteration += 1
    current_loss = loss_function(xk)
    print(f"Iteration {iteration}: Loss = {current_loss}")

    # You could also print specific parameter values if needed
    # print(f"Current alpha values: {xk[:n_teams]}")
    # print(f"Current beta values: {xk[n_teams:2*n_teams]}")
    # print(f"Current home advantage: {xk[-1]}")

    return False  # Return False to continue optimization

## Model 1

In [12]:
model_1 = Model1(
    teams_home,
    teams_away,
    start_minutes,
    end_minutes,
    goals,
    home_scores,
    away_scores,
    homes,
)

In [13]:
# initial parameters
alphas = [0.0] * model_1.n_teams
betas = [0.0] * model_1.n_teams
constant = 0.4
home_advantage = 0.1

init_params = np.concatenate([alphas, betas, [constant], [home_advantage]])

In [14]:
iteration = 0
model_1.fit(
    init_params, callback_function=lambda x: callback_function(x, model_1.loss_function)
)

Iteration 1: Loss = 2461.3027426712533
Iteration 2: Loss = 32797.79024324296
Iteration 3: Loss = 6105.673668688497
Iteration 4: Loss = 15138.441876459388
Iteration 5: Loss = 2417.4092929922012
Iteration 6: Loss = 7480.507057287203
Iteration 7: Loss = 2988.123332866773
Iteration 8: Loss = 1609.663509767134
Iteration 9: Loss = 1206.6229565628937
Iteration 10: Loss = 810.4173091181292
Iteration 11: Loss = 934.1849602371992
Iteration 12: Loss = 701.8444331238188
Iteration 13: Loss = 660.1603595644245
Iteration 14: Loss = 731.1070453659066
Iteration 15: Loss = 684.0276994805444
Iteration 16: Loss = 718.7150909448114
Iteration 17: Loss = 663.550698283551
Iteration 18: Loss = 648.294244408735
Iteration 19: Loss = 647.8146680349366
Iteration 20: Loss = 660.2712654334731
Iteration 21: Loss = 656.9188793925665
Iteration 22: Loss = 648.4113341030826
Iteration 23: Loss = 646.6692589059423
Iteration 24: Loss = 646.5751718911912
Iteration 25: Loss = 646.7697733192723
Iteration 26: Loss = 646.6260755

In [15]:
model_1.params

{'constant': 0.07236639208485858, 'home_advantage': 0.2237884151231505}

In [16]:
model_1.team_params.sort_values("alpha", ascending=False, ignore_index=True)

Unnamed: 0,team,alpha,beta
0,Manchester City,0.63836,-0.75217
1,Liverpool,0.52691,-0.86396
2,Arsenal,0.41384,0.07238
3,Tottenham Hotspur,0.31517,-0.20311
4,Manchester United,0.25847,0.06098
5,Chelsea,0.15289,-0.30827
6,Bournemouth,0.13267,0.35169
7,West Ham United,0.05582,0.09946
8,Everton,0.05331,-0.14559
9,Watford,0.01919,0.15547


In [17]:
model_1.calculate_log_likelihood(
    # params
    alphas=model_1.team_params.alpha.values,
    betas=model_1.team_params.beta.values,
    constant=model_1.params["constant"],
    home_advantage=model_1.params["home_advantage"],
    # model functions,
    lambda_k=model_1.lambda_k,
    mu_k=model_1.mu_k,
    # codes
    home_indices=model_1.home_indices,
    away_indices=model_1.away_indices,
    # obs data
    start_minutes=start_minutes,
    end_minutes=end_minutes,
    goals=goals,
    home_scores=home_scores,
    away_scores=away_scores,
    homes=homes,
)

-646.5719945605159

## Model 6

In [18]:
model_6 = Model6(
    teams_home,
    teams_away,
    start_minutes,
    end_minutes,
    goals,
    home_scores,
    away_scores,
    homes,
)

In [19]:
# parameters
alphas = model_1.team_params.alpha.values
betas = model_1.team_params.beta.values
constant = model_1.params["constant"]
home_advantage = model_1.params["home_advantage"]
injury_time_1 = 0.0
injury_time_2 = 0.0
lambda_10 = 0.0
lambda_01 = 0.0
lambda_11 = 0.0
lambda_22 = 0.0
lambda_21 = 0.0
lambda_12 = 0.0
mu_10 = 0.0
mu_01 = 0.0
mu_11 = 0.0
mu_22 = 0.0
mu_21 = 0.0
mu_12 = 0.0
epsilon_1 = -10.0
epsilon_2 = -10.0


init_params = np.concatenate(
    [
        alphas,
        betas,
        [
            constant,
            home_advantage,
            injury_time_1,
            injury_time_2,
            lambda_10,
            lambda_01,
            lambda_11,
            lambda_22,
            lambda_21,
            lambda_12,
            mu_10,
            mu_01,
            mu_11,
            mu_22,
            mu_21,
            mu_12,
            epsilon_1,
            epsilon_2,
        ],
    ]
)

In [20]:
model_6.calculate_log_likelihood(
    # params
    alphas=alphas,
    betas=betas,
    constant=constant,
    home_advantage=home_advantage,
    epsilon_1=-10.0,
    epsilon_2=-10.0,
    injury_time_1=0.0,
    injury_time_2=0.0,
    lambda_10=0.0,
    lambda_01=0.0,
    lambda_11=0.0,
    lambda_22=0.0,
    lambda_21=0.0,
    lambda_12=0.0,
    mu_10=0.0,
    mu_01=0.0,
    mu_11=0.0,
    mu_22=0.0,
    mu_21=0.0,
    mu_12=0.0,
    # model functions,
    lambda_k=model_6.lambda_k,
    mu_k=model_6.mu_k,
    # codes
    home_indices=model_6.home_indices,
    away_indices=model_6.away_indices,
    # obs data
    start_minutes=start_minutes,
    end_minutes=end_minutes,
    goals=goals,
    home_scores=home_scores,
    away_scores=away_scores,
    homes=homes,
)

-646.5696005633034

In [21]:
model_6.loss_function(init_params)

646.5696005633034

In [22]:
iteration = 0
model_6.fit(
    init_params, callback_function=lambda x: callback_function(x, model_6.loss_function)
)

Iteration 1: Loss = 640.3471048175104
Iteration 2: Loss = 1480.4684284142763
Iteration 3: Loss = 3178.6253631345
Iteration 4: Loss = 1195.479928720954
Iteration 5: Loss = 1386.4945990306098
Iteration 6: Loss = 3186.5581642992597
Iteration 7: Loss = 3119.3911395072137
Iteration 8: Loss = 1380.7795503270802
Iteration 9: Loss = 1064.3669717709233
Iteration 10: Loss = 975.9282811756372
Iteration 11: Loss = 1011.7444718123397
Iteration 12: Loss = 1305.7786536706901
Iteration 13: Loss = 846.6318683068115
Iteration 14: Loss = 643.2442063697692
Iteration 15: Loss = 642.2405517011898
Iteration 16: Loss = 655.459697248886
Iteration 17: Loss = 839.8665923382937
Iteration 18: Loss = 675.4494049299141
Iteration 19: Loss = 547.8154044163533
Iteration 20: Loss = 544.2609168706325
Iteration 21: Loss = 600.5795073617453
Iteration 22: Loss = 612.2452468974694
Iteration 23: Loss = 552.1411429718916
Iteration 24: Loss = 532.6731041304951
Iteration 25: Loss = 544.0657752353401
Iteration 26: Loss = 624.2933

In [23]:
model_6.params

{'constant': -0.13099046484116525,
 'home_advantage': 0.21367196809259686,
 'injury_time_1': 1.0736383911084582,
 'injury_time_2': 1.995840093358755,
 'lambda_10': 0.1686952520834389,
 'lambda_01': 0.2529932297833304,
 'lambda_11': 0.3054926207978921,
 'lambda_22': 0.4989600233397253,
 'lambda_21': -0.010128907880822725,
 'lambda_12': 0.2879702073849168,
 'mu_10': 0.13744333104495995,
 'mu_01': 0.08909043996354939,
 'mu_11': 0.4384144643518268,
 'mu_22': -0.4989600233397228,
 'mu_21': 0.2527152225677677,
 'mu_12': 0.055004652438987674,
 'epsilon_1': -9.994382691327768,
 'epsilon_2': -9.997691842640823}

In [24]:
model_6.team_params.sort_values("alpha", ascending=False, ignore_index=True)

Unnamed: 0,team,alpha,beta
0,Manchester City,0.67908,-0.80581
1,Liverpool,0.56714,-0.89883
2,Arsenal,0.42869,0.03134
3,Tottenham Hotspur,0.33673,-0.2236
4,Manchester United,0.24731,0.03602
5,Chelsea,0.16945,-0.27795
6,Bournemouth,0.11918,0.37147
7,Everton,0.0753,-0.14525
8,West Ham United,0.03971,0.08509
9,Watford,0.02322,0.16345


In [None]:
# To do:
# * speed up python code


# * speed up fitting with Cython