In [1]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
from matplotlib import pyplot as plt

In [2]:
# Notebook-wide pandas display settings: show every column, cap the number of
# rendered rows and the cell width at 100, and print floats with five decimals,
# a thousands separator and a leading space for non-negative values.
pd.options.display.max_columns = None
pd.options.display.max_rows = 100
pd.options.display.max_colwidth = 100
pd.set_option("display.float_format", "{: ,.5f}".format)

# data

In [3]:
# Load all recorded games, parse the `date` column, and keep only Premier
# League fixtures dated strictly between 2018-08-01 and 2019-08-01
# (presumably the 2018/19 season). The index is rebuilt after filtering.
df_games = (
    pd.read_csv("data/games.csv")
    .assign(date=lambda games: pd.to_datetime(games["date"]))
    .query("competition == 'Premier League, England'")
    .query("'2018-08-01' < date < '2019-08-01'")
    .reset_index(drop=True)
)

In [4]:
# Reshape one-row-per-game into one-row-per-team-per-game.
# Each game is emitted twice: once from the home side's perspective (home=1)
# and once from the away side's (home=0). For the side in focus the 5-character
# prefix is dropped (assumes columns are named `home_*` / `away_*`, so
# `home_team` -> `team`); for the other side the first 4 characters are replaced
# by `opp`, keeping the underscore (`away_team` -> `opp_team`).
df_team_level = pd.concat(
    [
        df_games.rename(
            columns={
                name: name[5:]
                for name in df_games.columns
                if name.startswith(own_side)
            }
        )
        .rename(
            columns={
                name: f"opp{name[4:]}"
                for name in df_games.columns
                if name.startswith(other_side)
            }
        )
        .assign(home=is_home)
        for own_side, other_side, is_home in (("home", "away", 1), ("away", "home", 0))
    ]
).sort_values(by=["game_id", "home"], ignore_index=True)

# model

In [5]:
# Constraints
def create_constraints(
    features: list[str], df_team_competitions: pd.DataFrame
) -> tuple[np.ndarray, np.ndarray]:
    """
    Build linear equality constraints that force the alpha params and the
    beta params to each sum to 0 within every league.

    Args:
        features: design-matrix column names; column ``j`` of the returned
            matrix refers to ``features[j]``. Team parameters are expected to
            be named ``alpha_<team>`` and ``beta_<team>``.
        df_team_competitions: frame with a ``team`` and a ``competition``
            column mapping each team to its league.

    Returns:
        ``(constraint_matrix, constraint_array)`` where ``constraint_matrix``
        has shape ``(2 * n_leagues, len(features))`` and holds a 1 for every
        feature taking part in the row's constraint, and ``constraint_array``
        is all zeros with length ``2 * n_leagues``. Rows are ordered as all
        alpha constraints (one per league, in order of first appearance in
        ``df_team_competitions``) followed by all beta constraints.
    """
    leagues = df_team_competitions.competition.unique()
    n_constraints = 2 * len(leagues)

    # The teams of a league do not depend on the feature being inspected, so
    # look them up once per league instead of once per (league, feature) pair.
    teams_by_league = [
        set(
            df_team_competitions.loc[
                df_team_competitions.competition == league, "team"
            ]
        )
        for league in leagues
    ]

    constraint_matrix = np.zeros((n_constraints, len(features)), dtype=int)
    row = 0
    for alpha_or_beta in ["alpha", "beta"]:
        for league_teams in teams_by_league:
            # Match the full feature name exactly. A prefix/suffix test would
            # also pick up teams from other leagues whose name merely ends
            # with this team's name (e.g. "Inter Milan" vs "Milan").
            constrained_features = {
                f"{alpha_or_beta}_{team}" for team in league_teams
            }
            for j, feature in enumerate(features):
                if feature in constrained_features:
                    constraint_matrix[row, j] = 1
            row += 1

    # Right-hand side: every constrained sum must equal zero.
    constraint_array = np.zeros(n_constraints, dtype=int)
    return constraint_matrix, constraint_array


# Model
class Model:
    """
    Poisson GLM of ``goals`` with per-league sum-to-zero constraints on the
    ``alpha_*`` and ``beta_*`` coefficients.

    The (unfitted) statsmodels GLM is built at construction time; call
    ``train_model`` before ``model_predict`` or ``get_df_params``, since both
    read ``self.fitted_model``, which is ``None`` until training.
    """

    def __init__(
        self,
        df_train: pd.DataFrame,
        features: list[str],
        TEAM_NAMES: np.ndarray,
    ) -> None:
        """
        Args:
            df_train: training frame. Must contain the ``goals`` target, every
                column listed in ``features``, and the ``team`` /
                ``competition`` columns used to derive the constraints.
            features: design-matrix column names; the constraint matrix
                columns follow this order.
            TEAM_NAMES: team names used to look up the ``alpha_<team>`` and
                ``beta_<team>`` parameters in ``get_df_params``.
        """
        self.target_variable = "goals"
        self.df_train = df_train
        self.TEAM_NAMES = TEAM_NAMES
        self.features = features
        self.y_train = df_train[self.target_variable]
        self.X_train = df_train[features]
        self.df_team_competitions = self._get_df_team_competitions(df_train)
        constraint_matrix, constraint_array = create_constraints(
            features, self.df_team_competitions
        )
        self.constraint_matrix = constraint_matrix
        self.constraint_array = constraint_array
        self.model = sm.GLM(self.y_train, self.X_train, family=sm.families.Poisson())
        self.fitted_model = None

    def train_model(self) -> None:
        """Fit the GLM under the stored linear constraints and keep the result."""
        self.fitted_model = self.model.fit_constrained(
            (self.constraint_matrix, self.constraint_array)
        )

    def model_predict(self, df_test: pd.DataFrame) -> pd.DataFrame:
        """
        Predict with the fitted model on the ``self.features`` columns of
        ``df_test``.

        Requires ``train_model`` to have been called first.
        NOTE(review): the return annotation says DataFrame; statsmodels
        presumably returns a Series for DataFrame input — confirm.
        """
        predictions = self.fitted_model.predict(df_test[self.features])
        return predictions

    def create_constraints(self, features: list[str]) -> tuple[np.ndarray, np.ndarray]:
        """
        Build the constraint matrix and right-hand side for ``features`` using
        this model's team -> competition table.
        """
        # The module-level builder requires the team/competition frame as its
        # second argument; calling it with `features` alone raises TypeError.
        constraint_matrix, constraint_array = create_constraints(
            features, self.df_team_competitions
        )
        return constraint_matrix, constraint_array

    def _get_df_team_competitions(self, df_train: pd.DataFrame) -> pd.DataFrame:
        """
        Return one row per team with the first ``competition`` value seen for
        it in ``df_train``, sorted by competition with a fresh index.
        """
        df_team_competitions = (
            df_train.groupby("team", as_index=False)
            .competition.first()
            .sort_values("competition", ignore_index=True)
        )
        return df_team_competitions

    def get_df_params(self) -> pd.DataFrame:
        """
        Return a frame with columns ``team``, ``alpha``, ``beta`` and
        ``competition``: the fitted ``alpha_<team>`` / ``beta_<team>``
        coefficients for every name in ``self.TEAM_NAMES``, left-joined with
        each team's competition.

        Requires ``train_model`` to have been called first.
        """
        alpha_params = self.fitted_model.params[
            [f"alpha_{team_name}" for team_name in self.TEAM_NAMES]
        ]
        beta_params = self.fitted_model.params[
            [f"beta_{team_name}" for team_name in self.TEAM_NAMES]
        ]
        return pd.DataFrame(
            {
                "team": self.TEAM_NAMES,
                "alpha": alpha_params.values,
                "beta": beta_params.values,
            }
        ).merge(self.df_team_competitions, how="left", on="team")

In [6]:
# One indicator column per team for the row's own team (`alpha_<team>`) and one
# per team for its opponent (`beta_<team>`); multiplying by 1 turns boolean
# indicators into numbers.
df_train_alpha_dummies = 1 * pd.get_dummies(df_team_level["team"], prefix="alpha")
df_train_beta_dummies = 1 * pd.get_dummies(df_team_level["opp_team"], prefix="beta")

# Training frame: team-level rows + both indicator blocks + an intercept column.
df_train = (
    df_team_level
    .join(df_train_alpha_dummies)
    .join(df_train_beta_dummies)
    .assign(constant=1)
)

In [7]:
# Sorted array of every team that appears either as `team` or as `opp_team`.
TEAM_NAMES = np.sort(list({*df_train.team, *df_train.opp_team}))

# Design-matrix columns: intercept, home indicator, then one alpha and one
# beta column per team, in TEAM_NAMES order.
features = [
    "constant",
    "home",
    *(f"alpha_{team_name}" for team_name in TEAM_NAMES),
    *(f"beta_{team_name}" for team_name in TEAM_NAMES),
]

In [8]:
# Build the constrained Poisson model on the full training frame and fit it.
model = Model(df_train=df_train, features=features, TEAM_NAMES=TEAM_NAMES)
model.train_model()

In [9]:
# Plain-text summary of the constrained fit (fit statistics and coefficient table).
print(model.fitted_model.summary())

                 Generalized Linear Model Regression Results                  
Dep. Variable:                  goals   No. Observations:                  760
Model:                            GLM   Df Residuals:                      722
Model Family:                 Poisson   Df Model:                           37
Link Function:                    Log   Scale:                          1.0000
Method:                          IRLS   Log-Likelihood:                -1065.1
Date:                Fri, 21 Mar 2025   Deviance:                       762.55
Time:                        16:14:16   Pearson chi2:                     656.
No. Iterations:                     1   Pseudo R-squ. (CS):             0.2448
Covariance Type:            nonrobust                                         
                                    coef    std err          z      P>|z|      [0.025      0.975]
-------------------------------------------------------------------------------------------------
constant      

In [15]:
# Per-team alpha/beta estimates with their competition, largest alpha first.
model.get_df_params().sort_values(by="alpha", ascending=False, ignore_index=True)

Unnamed: 0,team,alpha,beta,competition
0,Manchester City,0.59833,-0.75681,"Premier League, England"
1,Liverpool,0.53196,-0.80732,"Premier League, England"
2,Arsenal,0.36217,0.01982,"Premier League, England"
3,Tottenham Hotspur,0.26418,-0.25526,"Premier League, England"
4,Manchester United,0.24869,0.06915,"Premier League, England"
5,Chelsea,0.20247,-0.2592,"Premier League, England"
6,Bournemouth,0.11519,0.32058,"Premier League, England"
7,Everton,0.05482,-0.10256,"Premier League, England"
8,Watford,0.02981,0.14503,"Premier League, England"
9,West Ham United,0.02585,0.07462,"Premier League, England"
