In [1]:
%load_ext autoreload
%autoreload 2
import sys

sys.path.append("../")

In [4]:
! pip install tabulate

You should consider upgrading via the '/opt/conda/bin/python3.9 -m pip install --upgrade pip' command.[0m


In [2]:
import numpy as np
import pandas as pd
import penaltyblog as pb
from scipy.optimize import minimize
from scipy.stats import poisson

In [4]:
?pb.footballdata.fetch_data

[0;31mSignature:[0m [0mpb[0m[0;34m.[0m[0mfootballdata[0m[0;34m.[0m[0mfetch_data[0m[0;34m([0m[0mcountry[0m[0;34m,[0m [0mseason_start_year[0m[0;34m,[0m [0mdivision[0m[0;34m)[0m [0;34m->[0m [0mpandas[0m[0;34m.[0m[0mcore[0m[0;34m.[0m[0mframe[0m[0;34m.[0m[0mDataFrame[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m
Fetches the requested data from football-data.co.uk

Parameters
----------
country : string
    The name of the country of interest
season_start_year : int
    The year the season started, e.g. `2018` for the 2018/2019 season
division : int
    The division's level, where `0` is the top tier, `1` is the second tier etc

Example
-----------
import penaltyblog as pb
pb.footballdata.fetch("England", 2018, 0)

Returns
------
Pandas dataframe
[0;31mFile:[0m      ~/repos/martin/penaltyblog/penaltyblog/footballdata/footballdata.py
[0;31mType:[0m      function


In [5]:
pb.list_countries()

['belgium',
 'england',
 'france',
 'germany',
 'greece',
 'italy',
 'portugal',
 'scotland',
 'spain',
 'turkey']

In [6]:
df = pb.footballdata.fetch_data("England", 2018, 0)
df[["Date", "HomeTeam", "AwayTeam", "FTHG", "FTAG"]].head()

Unnamed: 0,Date,HomeTeam,AwayTeam,FTHG,FTAG
0,2018-08-10,Man United,Leicester,2,1
1,2018-08-11,Bournemouth,Cardiff,2,0
2,2018-08-11,Fulham,Crystal Palace,0,2
3,2018-08-11,Huddersfield,Chelsea,0,3
4,2018-08-11,Newcastle,Tottenham,1,2


In [5]:
df = pb.get_example_data()
print(df[["Date", "HomeTeam", "AwayTeam", "FTHG", "FTAG"]].head().to_markdown())

|    | Date                | HomeTeam     | AwayTeam       |   FTHG |   FTAG |
|---:|:--------------------|:-------------|:---------------|-------:|-------:|
|  0 | 2018-08-10 00:00:00 | Man United   | Leicester      |      2 |      1 |
|  1 | 2018-08-11 00:00:00 | Bournemouth  | Cardiff        |      2 |      0 |
|  2 | 2018-08-11 00:00:00 | Fulham       | Crystal Palace |      0 |      2 |
|  3 | 2018-08-11 00:00:00 | Huddersfield | Chelsea        |      0 |      3 |
|  4 | 2018-08-11 00:00:00 | Newcastle    | Tottenham      |      1 |      2 |


In [7]:
print(pb.clubelo.fetch_rankings_by_date(2010, 1, 1).head().to_markdown())

|    |   Rank | Club        | Country   |   Level |     Elo | From                | To                  |
|---:|-------:|:------------|:----------|--------:|--------:|:--------------------|:--------------------|
|  0 |      1 | Barcelona   | ESP       |       1 | 1987.68 | 2009-12-18 00:00:00 | 2010-01-02 00:00:00 |
|  1 |      2 | Chelsea     | ENG       |       1 | 1945.54 | 2009-12-29 00:00:00 | 2010-01-16 00:00:00 |
|  2 |      3 | Man United  | ENG       |       1 | 1928.53 | 2009-12-31 00:00:00 | 2010-01-09 00:00:00 |
|  3 |      4 | Real Madrid | ESP       |       1 | 1902.72 | 2009-12-20 00:00:00 | 2010-01-03 00:00:00 |
|  4 |      5 | Inter       | ITA       |       1 | 1884.49 | 2009-12-21 00:00:00 | 2010-01-06 00:00:00 |


In [8]:
print(pb.clubelo.fetch_rankings_by_team("barcelona").head().to_markdown())

|    | Rank   | Club      | Country   |   Level |     Elo | From                | To                  |
|---:|:-------|:----------|:----------|--------:|--------:|:--------------------|:--------------------|
|  0 | None   | Barcelona | ESP       |       1 | 1636.7  | 1939-10-22 00:00:00 | 1939-12-03 00:00:00 |
|  1 | None   | Barcelona | ESP       |       1 | 1626.1  | 1939-12-04 00:00:00 | 1939-12-10 00:00:00 |
|  2 | None   | Barcelona | ESP       |       1 | 1636.73 | 1939-12-11 00:00:00 | 1939-12-17 00:00:00 |
|  3 | None   | Barcelona | ESP       |       1 | 1646.95 | 1939-12-18 00:00:00 | 1939-12-24 00:00:00 |
|  4 | None   | Barcelona | ESP       |       1 | 1637.42 | 1939-12-25 00:00:00 | 1939-12-31 00:00:00 |


### Poisson Model

In [7]:
pois = pb.PoissonGoalsModel(df["FTHG"], df["FTAG"], df["HomeTeam"], df["AwayTeam"])

In [8]:
pois.fit()

  result = getattr(ufunc, method)(*inputs, **kwargs)


In [9]:
pois

Module: Penaltyblog

Model: Poisson

Number of parameters: 42
Log Likelihood: -1065.077
AIC: 2214.154

Team                 Attack               Defence             
------------------------------------------------------------
Arsenal              1.362                -1.062              
Bournemouth          1.115                -0.761              
Brighton             0.634                -0.937              
Burnley              0.894                -0.801              
Cardiff              0.614                -0.798              
Chelsea              1.202                -1.341              
Crystal Palace       1.004                -1.045              
Everton              1.055                -1.184              
Fulham               0.626                -0.637              
Huddersfield         0.184                -0.712              
Leicester            0.999                -1.145              
Liverpool            1.532                -1.889              
Man City         

In [7]:
x = pois.predict("Liverpool", "Stoke")
x.home_draw_away

[0.522666209710224, 0.281450920905797, 0.195882869363381]

### Dixon and Coles

In [10]:
dc = pb.DixonColesGoalModel(df["FTHG"], df["FTAG"], df["HomeTeam"], df["AwayTeam"])

In [11]:
dc.fit()

  result = getattr(ufunc, method)(*inputs, **kwargs)


In [12]:
dc

Module: Penaltyblog

Model: Dixon and Coles

Number of parameters: 43
Log Likelihood: -1064.943
AIC: 2215.886

Team                 Attack               Defence             
------------------------------------------------------------
Arsenal              1.36                 -0.982              
Bournemouth          1.115                -0.679              
Brighton             0.632                -0.858              
Burnley              0.897                -0.717              
Cardiff              0.615                -0.715              
Chelsea              1.205                -1.254              
Crystal Palace       1.007                -0.961              
Everton              1.054                -1.102              
Fulham               0.625                -0.557              
Huddersfield         0.18                 -0.631              
Leicester            0.996                -1.064              
Liverpool            1.534                -1.803              
Man City 

In [13]:
x = dc.predict("Liverpool", "Stoke")
x.home_draw_away

ValueError: No parameters for away team - please ensure the team was included in the training data

### Dixon and Coles Time Decay

In [13]:
weights = pb.dixon_coles_weights(df["Date"], 0.001)

In [14]:
dc = pb.DixonColesGoalModel(
    df["FTHG"], df["FTAG"], df["HomeTeam"], df["AwayTeam"], weights
)

In [15]:
dc.fit()



In [16]:
dc

Module: Penaltyblog

Model: Dixon and Coles

Number of parameters: 43
Log Likelihood: -959.416
AIC: 2004.831

Team                 Attack               Defence             
------------------------------------------------------------
Arsenal              1.372                -1.611              
Aston Villa          0.673                -1.531              
Blackburn            0.932                -1.162              
Bolton               0.914                -1.167              
Chelsea              1.225                -1.653              
Everton              0.967                -1.845              
Fulham               0.938                -1.582              
Liverpool            0.889                -1.803              
Man City             1.543                -2.12               
Man United           1.513                -2.005              
Newcastle            1.099                -1.562              
Norwich              1.018                -1.308              
QPR       

In [17]:
preds = list()
outcome = list()
for idx, row in df.iterrows():
    pred = dc.predict(row["HomeTeam"], row["AwayTeam"])
    preds.append(pred.home_draw_away)
    
    if row["FTR"] == "H":
        outcome.append(0)
    elif row["FTR"] == "D":
        outcome.append(1)
    elif row["FTR"] == "A":
        outcome.append(2)
    else:
        raise ValueError("Result not recognised")

In [18]:
rps_scores = list()
for i in range(len(outcome)):
    rps_scores.append(pb.rps(preds[i], outcome[i]))

In [19]:
np.mean(rps_scores)

0.20175300119587533

### Rue Salvesen

In [18]:
rs = pb.RueSalvesenGoalModel(df["FTHG"], df["FTAG"], df["HomeTeam"], df["AwayTeam"])

In [19]:
rs.fit()

  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)


In [20]:
rs

Module: Penaltyblog

Model: Rue Salvesen

Number of parameters: 44
Log Likelihood: -1061.167
AIC: 2210.334

Team                 Attack               Defence             
------------------------------------------------------------
Arsenal              1.266                -0.924              
Bournemouth          1.005                -0.607              
Brighton             0.69                 -0.952              
Burnley              0.852                -0.709              
Cardiff              0.641                -0.779              
Chelsea              1.224                -1.337              
Crystal Palace       0.996                -0.988              
Everton              1.072                -1.18               
Fulham               0.609                -0.578              
Huddersfield         0.298                -0.8                
Leicester            1.014                -1.119              
Liverpool            1.603                -1.91               
Man City    

In [21]:
x = rs.predict("Liverpool", "Stoke")
x.home_draw_away

ValueError: No parameters for away team - please ensure the team was included in the training data

In [None]:
probs = rs.predict("Liverpool", "Stoke")
probs.total_goals("over", 2.5)

In [None]:
probs = rs.predict("Liverpool", "Stoke")
probs.total_goals("under", 2.5)

In [43]:
probs.asian_handicap("away", -1.5)

0.7397383751361912

In [44]:
from pprint import pprint
params = rs.get_params()
pprint(params)

{'attack_Arsenal': 1.3650671020694474,
 'attack_Aston Villa': 0.6807140182913024,
 'attack_Blackburn': 0.971135574781119,
 'attack_Bolton': 0.9502712140456423,
 'attack_Chelsea': 1.235466344414206,
 'attack_Everton': 0.9257685468926837,
 'attack_Fulham': 0.9122902202053228,
 'attack_Liverpool': 0.8684673939949753,
 'attack_Man City': 1.543379586931267,
 'attack_Man United': 1.4968564161865994,
 'attack_Newcastle': 1.1095636706231062,
 'attack_Norwich': 1.0424304866584615,
 'attack_QPR': 0.827439335780754,
 'attack_Stoke': 0.6248927873330669,
 'attack_Sunderland': 0.8510292333101492,
 'attack_Swansea': 0.8471368133406263,
 'attack_Tottenham': 1.2496040004504756,
 'attack_West Brom': 0.8625207332372105,
 'attack_Wigan': 0.8177807129177644,
 'attack_Wolves': 0.8181858085358248,
 'defence_Arsenal': -1.2192247076852236,
 'defence_Aston Villa': -1.0566859588325535,
 'defence_Blackburn': -0.7430288162188969,
 'defence_Bolton': -0.7268011436918458,
 'defence_Chelsea': -1.2065700516830344,
 'de

### RPS

In [30]:
predictions = np.array(
    [
        [1, 0, 0],
        [0.9, 0.1, 0],
        [0.8, 0.1, 0.1],
        [0.5, 0.25, 0.25],
        [0.35, 0.3, 0.35],
        [0.6, 0.3, 0.1],
        [0.6, 0.1, 0.3],
        [0.5, 0.45, 0.05],
        [0.55, 0.1, 0.35],
    ]
)

observed = [0, 0, 0, 0, 1, 1, 0, 0, 0, 0]

In [88]:
n_cat = 3
n_pred = len(predictions)

rps = np.zeros(n_pred)

for x in range(0, n_pred):
    obs_vec = np.zeros(n_cat)
    obs_vec[observed[x]] = 1
    cumulative = 0
    for i in range(n_cat):
        cumulative += (sum(predictions[x][:i+1]) - sum(obs_vec[0:i+1])) **2
    print(cumulative / (n_cat - 1))

obs_vec, cumulative

0.0
0.0049999999999999975
0.024999999999999988
0.15625
0.12250000000000003
0.185
0.12500000000000003
0.12625
0.16249999999999998


(array([1., 0., 0.]), 0.32499999999999996)

In [110]:
import numpy as np

# Outcome should be a binary list of the ordinal outcome. [0, 1, 0] for exmaple.
# Probs should be a list of probabilities. [0.79, 0.09, 0.12] for example.
# Outcome and Probs must be provided with the same order as probabilities.

def rps(probs, outcome):
    cum_probs = np.cumsum(probs)
    
    cum_outcomes = np.zeros(len(probs))
    cum_outcomes[outcome] = 1
    cum_outcomes = np.cumsum(cum_outcomes)

    sum_rps = 0
    for i in range(len(probs)):         
        sum_rps+= (cum_probs[i] - cum_outcomes[i])**2
    
    return sum_rps/(len(probs)-1)

rps(predictions[1], observed[1])

0.0049999999999999975

In [111]:
rps([0.8, 0.1, 0.1], 0)

0.024999999999999988

In [None]:
rankProbScore <- function(predictions, observed){
  ncat <- ncol(predictions)
  npred <- nrow(predictions)
   
  rps <- numeric(npred)
   
  for (rr in 1:npred){
    obsvec <- rep(0, ncat)
    obsvec[observed[rr]] <- 1
    cumulative <- 0
    for (i in 1:ncat){
      cumulative <- cumulative + (sum(predictions[rr,1:i]) - sum(obsvec[1:i]))^2
    }
    rps[rr] <- (1/(ncat-1))*cumulative
  }
  return(rps)
}

In [11]:
home_idx = np.where(mod.teams == "Norwich")[0][0]
away_idx = np.where(mod.teams == "Arsenal")[0][0]

home_attack = mod._params[home_idx]
away_attack = mod._params[away_idx]

home_defence = mod._params[home_idx + mod.n_teams]
away_defence = mod._params[away_idx + mod.n_teams]

intercept = mod._params[-2]
home_advantage = mod._params[-1]

home_goals = np.exp(intercept + home_advantage + home_attack + away_defence)
away_goals = np.exp(intercept + away_attack + home_defence)

home_vec = poisson(home_goals).pmf(np.arange(0, 10))
away_vec = poisson(away_goals).pmf(np.arange(0, 10))
m = np.outer(home_vec, away_vec)

In [12]:
sum(np.triu(m, 1)).sum()

0.5666354417609977

In [13]:
np.triu(m, -1).sum()

0.8949525556391001

In [73]:
repr_str = ""

repr_str += "Model: Poisson"
repr_str += "\n"
repr_str += "\n"

repr_str += "Log Likelihood: {0}".format(round(mod.loglikelihood, 3))
repr_str += "\n"
repr_str += "AIC: {0}".format(round(mod.aic, 3))
repr_str += "\n"
repr_str += "\n"

repr_str += "{0: <20} {1:<20} {2:<20}".format("Team", "Attack", "Defence")
repr_str += "\n"
repr_str += "-" * 60
repr_str += "\n"

for idx, team in enumerate(mod.teams):
    repr_str += "{0: <20} {1:<20} {2:<20}".format(
        mod.teams[idx],
        round(mod._params[idx], 3),
        round(mod._params[idx + mod.n_teams], 3),
    )
    repr_str += "\n"

repr_str += "-" * 60
repr_str += "\n"

repr_str += "Home Advantage: {0}".format(round(mod._params[-2], 3))
repr_str += "\n"
repr_str += "Intercept: {0}".format(round(mod._params[-1], 3))
repr_str += "\n"

print(repr_str)

Model: Poisson

Log Likelihood: -1088.991
AIC: 2261.982

Team                 Attack               Defence             
------------------------------------------------------------
Arsenal              1.362                -1.199              
Aston Villa          0.671                -1.157              
Blackburn            0.957                -0.758              
Bolton               0.913                -0.773              
Chelsea              1.229                -1.271              
Everton              0.96                 -1.426              
Fulham               0.93                 -1.185              
Liverpool            0.898                -1.429              
Man City             1.571                -1.706              
Man United           1.531                -1.58               
Newcastle            1.084                -1.177              
Norwich              1.025                -0.922              
QPR                  0.834                -0.931              


In [74]:
mod

<penaltyblog.poisson.poisson.Poisson at 0x7f9df3f4a8b0>

In [42]:
def _fit(params, fixtures, teams):
    n_teams = len(teams)

    params_df = pd.DataFrame(params[:n_teams], columns=["attack"])
    params_df["defence"] = params[n_teams : n_teams * 2]
    params_df["team"] = teams

    df2 = (
        fixtures.merge(params_df, left_on="team_home", right_on="team")
        .rename(columns={"attack": "home_attack", "defence": "home_defence"})
        .drop("team", axis=1)
        .merge(params_df, left_on="team_away", right_on="team")
        .rename(columns={"attack": "away_attack", "defence": "away_defence"})
        .drop("team", axis=1)
        .assign(hfa=params[-2])
        .assign(intercept=params[-1])
    )

    # import pdb

    # pdb.set_trace()

    df2["goals_home"] = df2["goals_home"].astype(int)
    df2["goals_away"] = df2["goals_away"].astype(int)

    # goal expectation
    df2["home_exp"] = np.exp(
        df2["intercept"] + df2["hfa"] + df2["home_attack"] + df2["away_defence"]
    )
    df2["away_exp"] = np.exp(
        df2["intercept"] + df2["away_attack"] + df2["home_defence"]
    )

    # likelihood
    df2["home_llk"] = poisson.pmf(df2["goals_home"], df2["home_exp"])
    df2["away_llk"] = poisson.pmf(df2["goals_away"], df2["away_exp"])
    df2["llk"] = np.log(df2["home_llk"]) + np.log(df2["away_llk"])

    #     # return the sum of the negative likelihood
    return -df2["llk"].sum()


#     return df2

In [43]:
df2 = _fit(mod.params, mod.fixtures, mod.teams)

In [44]:
df2

1482.7290022715952

In [24]:
poisson.pmf(df2["goals_home"], df2["home_exp"])

TypeError: ufunc 'isnan' not supported for the input types, and the inputs could not be safely coerced to any supported types according to the casting rule ''safe''

In [36]:
df2["goals_home"].astype(int)

0      4
1      0
2      0
3      5
4      2
      ..
375    3
376    0
377    1
378    3
379    0
Name: goals_home, Length: 380, dtype: int64

In [4]:
home_teams = np.unique(df["HomeTeam"].values)
away_teams = np.unique(df["AwayTeam"].values)

In [5]:
teams = np.sort(np.unique(np.concatenate([home_teams, away_teams])))
n_teams = len(teams)

teams

array(['Arsenal', 'Aston Villa', 'Blackburn', 'Bolton', 'Chelsea',
       'Everton', 'Fulham', 'Liverpool', 'Man City', 'Man United',
       'Newcastle', 'Norwich', 'QPR', 'Stoke', 'Sunderland', 'Swansea',
       'Tottenham', 'West Brom', 'Wigan', 'Wolves'], dtype=object)

In [6]:
params = np.concatenate(
    (
        [1] * n_teams,  # attack strength
        [-1] * n_teams,  # defence strength
        [0.25],  # home advantage
        [0.13],  # intercept
    )
)

params

array([ 1.  ,  1.  ,  1.  ,  1.  ,  1.  ,  1.  ,  1.  ,  1.  ,  1.  ,
        1.  ,  1.  ,  1.  ,  1.  ,  1.  ,  1.  ,  1.  ,  1.  ,  1.  ,
        1.  ,  1.  , -1.  , -1.  , -1.  , -1.  , -1.  , -1.  , -1.  ,
       -1.  , -1.  , -1.  , -1.  , -1.  , -1.  , -1.  , -1.  , -1.  ,
       -1.  , -1.  , -1.  , -1.  ,  0.25,  0.13])

In [7]:
attack_params = dict(zip(teams, params[:n_teams]))
defence_params = dict(zip(teams, params[n_teams : n_teams * 2]))
hfa = params[-2]
intercept = params[-1]

attack_params, defence_params, hfa, intercept

({'Arsenal': 1.0,
  'Aston Villa': 1.0,
  'Blackburn': 1.0,
  'Bolton': 1.0,
  'Chelsea': 1.0,
  'Everton': 1.0,
  'Fulham': 1.0,
  'Liverpool': 1.0,
  'Man City': 1.0,
  'Man United': 1.0,
  'Newcastle': 1.0,
  'Norwich': 1.0,
  'QPR': 1.0,
  'Stoke': 1.0,
  'Sunderland': 1.0,
  'Swansea': 1.0,
  'Tottenham': 1.0,
  'West Brom': 1.0,
  'Wigan': 1.0,
  'Wolves': 1.0},
 {'Arsenal': -1.0,
  'Aston Villa': -1.0,
  'Blackburn': -1.0,
  'Bolton': -1.0,
  'Chelsea': -1.0,
  'Everton': -1.0,
  'Fulham': -1.0,
  'Liverpool': -1.0,
  'Man City': -1.0,
  'Man United': -1.0,
  'Newcastle': -1.0,
  'Norwich': -1.0,
  'QPR': -1.0,
  'Stoke': -1.0,
  'Sunderland': -1.0,
  'Swansea': -1.0,
  'Tottenham': -1.0,
  'West Brom': -1.0,
  'Wigan': -1.0,
  'Wolves': -1.0},
 0.25,
 0.13)

In [8]:
def neg_log_likelihood(
    goals_home_observed,
    goals_away_observed,
    attack_param_home,
    defence_param_home,
    attack_param_away,
    defence_param_away,
    home_advantage,
    intercept,
):
    home_goals_expected = np.exp(
        intercept + attack_param_home + defence_param_away + home_advantage
    )
    away_goals_expected = np.exp(intercept + attack_param_away + defence_param_home)

    if home_goals_expected < 0 or away_goals_expected < 0:
        return 100000000

    log_lik_home = poisson.pmf(goals_home_observed, home_goals_expected)
    log_lik_away = poisson.pmf(goals_away_observed, away_goals_expected)

    log_lik = np.sum(np.log(log_lik_home) + np.log(log_lik_away))

    return log_lik * -1

In [9]:
def fit(params, fixtures, teams):
    attack_params = dict(zip(teams, params[:n_teams]))
    defence_params = dict(zip(teams, params[n_teams : n_teams * 2]))
    hfa = params[-2]
    intercept = params[-1]

    log_likelihood = list()
    for idx, row in fixtures.iterrows():
        llk = neg_log_likelihood(
            row["FTHG"],
            row["FTAG"],
            attack_params[row["HomeTeam"]],
            defence_params[row["HomeTeam"]],
            attack_params[row["AwayTeam"]],
            defence_params[row["AwayTeam"]],
            hfa,
            intercept,
        )
        log_likelihood.append(llk)
    #         import pdb; pdb.set_trace()

    log_likelihood = np.sum(log_likelihood)
    return log_likelihood

In [10]:
def fit2(params, fixtures, teams):
    params_df = pd.DataFrame(params[:n_teams], columns=["attack"])
    params_df["defence"] = params[n_teams : n_teams * 2]
    params_df["team"] = teams

    df2 = (
        fixtures.merge(params_df, left_on="HomeTeam", right_on="team")
        .rename(columns={"attack": "home_attack", "defence": "home_defence"})
        .drop("team", axis=1)
        .merge(params_df, left_on="AwayTeam", right_on="team")
        .rename(columns={"attack": "away_attack", "defence": "away_defence"})
        .drop("team", axis=1)
        .assign(hfa=params[-2])
        .assign(intercept=params[-1])
    )

    # goal expectation
    df2["home_exp"] = np.exp(
        df2["intercept"] + df2["hfa"] + df2["home_attack"] + df2["away_defence"]
    )
    df2["away_exp"] = np.exp(
        df2["intercept"] + df2["away_attack"] + df2["home_defence"]
    )

    # likelihood
    df2["home_llk"] = poisson.pmf(df2["FTHG"], df2["home_exp"])
    df2["away_llk"] = poisson.pmf(df2["FTAG"], df2["away_exp"])
    df2["llk"] = np.log(df2["home_llk"]) + np.log(df2["away_llk"])

    return -df2["llk"].sum()

In [11]:
%time fit2(params, df, teams)

CPU times: user 25.5 ms, sys: 8.61 ms, total: 34.1 ms
Wall time: 31.3 ms


1169.1661546630157

In [12]:
options = {
    "maxiter": 100,
    "disp": True,
}

constraints = [{"type": "eq", "fun": lambda x: sum(x[:n_teams]) - n_teams}]

res = minimize(
    fit2,
    params,
    args=(df, teams),
    constraints=constraints,
    options=options,
)

  result = getattr(ufunc, method)(*inputs, **kwargs)


Optimization terminated successfully    (Exit mode 0)
            Current function value: 1088.9910526412477
            Iterations: 27
            Function evaluations: 1207
            Gradient evaluations: 27


In [17]:
len(res["x"])

42

In [13]:
dict(
    zip(
        ["attack_" + team for team in teams]
        + ["defence_" + team for team in teams]
        + ["hfa", "intercept"],
        res.x,
    )
)

{'attack_Arsenal': 1.3618894048046104,
 'attack_Aston Villa': 0.6710556384701741,
 'attack_Blackburn': 0.9565636311597576,
 'attack_Bolton': 0.9129977612270153,
 'attack_Chelsea': 1.2286740487399637,
 'attack_Everton': 0.9599618908688915,
 'attack_Fulham': 0.929668701183735,
 'attack_Liverpool': 0.8978373454639799,
 'attack_Man City': 1.5709770286005091,
 'attack_Man United': 1.5309479144008318,
 'attack_Newcastle': 1.0841956761868037,
 'attack_Norwich': 1.0250161563424816,
 'attack_QPR': 0.8343182520298015,
 'attack_Stoke': 0.6432806103056741,
 'attack_Sunderland': 0.8602148668669803,
 'attack_Swansea': 0.8425827806293722,
 'attack_Tottenham': 1.2391263982727108,
 'attack_West Brom': 0.8661609505043957,
 'attack_Wigan': 0.8066418147380476,
 'attack_Wolves': 0.7778891292042638,
 'defence_Arsenal': -1.0283507377341752,
 'defence_Aston Villa': -0.9863219398955788,
 'defence_Blackburn': -0.5878757870885645,
 'defence_Bolton': -0.6027985047201236,
 'defence_Chelsea': -1.1005237555739777,
 

In [32]:
params

array([ 0.1 ,  0.1 ,  0.1 ,  0.1 ,  0.1 ,  0.1 ,  0.1 ,  0.1 ,  0.1 ,
        0.1 ,  0.1 ,  0.1 ,  0.1 ,  0.1 ,  0.1 ,  0.1 ,  0.1 ,  0.1 ,
        0.1 ,  0.1 , -0.1 , -0.1 , -0.1 , -0.1 , -0.1 , -0.1 , -0.1 ,
       -0.1 , -0.1 , -0.1 , -0.1 , -0.1 , -0.1 , -0.1 , -0.1 , -0.1 ,
       -0.1 , -0.1 , -0.1 , -0.1 ,  0.25,  0.13])

In [149]:
poisson.pmf([1, 2, 3], [4, 5, 6])

array([0.07326256, 0.08422434, 0.08923508])

In [90]:
import pandas as pd

In [123]:
df = pd.read_csv("http://www.football-data.co.uk/mmz4281/1112/E0.csv")
df = df[["HomeTeam", "AwayTeam", "FTHG", "FTAG"]]
df.head()

Unnamed: 0,HomeTeam,AwayTeam,FTHG,FTAG
0,Blackburn,Wolves,1,2
1,Fulham,Aston Villa,0,0
2,Liverpool,Sunderland,1,1
3,Newcastle,Arsenal,0,0
4,QPR,Bolton,0,4


In [92]:
def dc_log_like(x, y, alpha_x, beta_x, alpha_y, beta_y, rho, gamma):
    lambda_x, mu_y = np.exp(alpha_x + beta_y + gamma), np.exp(alpha_y + beta_x)
    return (
        np.log(rho_correction(x, y, lambda_x, mu_y, rho))
        + np.log(poisson.pmf(x, lambda_x))
        + np.log(poisson.pmf(y, mu_y))
    )


def rho_correction(x, y, lambda_x, mu_y, rho):
    if x == 0 and y == 0:
        return 1 - (lambda_x * mu_y * rho)
    elif x == 0 and y == 1:
        return 1 + (lambda_x * rho)
    elif x == 1 and y == 0:
        return 1 + (mu_y * rho)
    elif x == 1 and y == 1:
        return 1 - rho
    else:
        return 1.0

In [99]:
def solve_parameters(
    dataset,
    debug=False,
    init_vals=None,
    options={"disp": True, "maxiter": 100},
    constraints=[{"type": "eq", "fun": lambda x: sum(x[:20]) - 20}],
    **kwargs
):
    teams = np.sort(dataset["HomeTeam"].unique())
    # check for no weirdness in dataset
    away_teams = np.sort(dataset["AwayTeam"].unique())
    if not np.array_equal(teams, away_teams):
        raise ValueError("Something's not right")
    n_teams = len(teams)
    if init_vals is None:
        # random initialisation of model parameters
        init_vals = np.concatenate(
            (
                np.random.uniform(0, 1, (n_teams)),  # attack strength
                np.random.uniform(0, -1, (n_teams)),  # defence strength
                np.array([0, 1.0]),  # rho (score correction), gamma (home advantage)
            )
        )

    def dc_log_like(x, y, alpha_x, beta_x, alpha_y, beta_y, rho, gamma):
        lambda_x, mu_y = np.exp(alpha_x + beta_y + gamma), np.exp(alpha_y + beta_x)

        if lambda_x < 0 or mu_y < 0 or rho_correction(x, y, lambda_x, mu_y, rho) < 0:
            return 100000000

        return (
            np.log(rho_correction(x, y, lambda_x, mu_y, rho))
            + np.log(poisson.pmf(x, lambda_x))
            + np.log(poisson.pmf(y, mu_y))
        )

    def estimate_paramters(params):
        score_coefs = dict(zip(teams, params[:n_teams]))
        defend_coefs = dict(zip(teams, params[n_teams : (2 * n_teams)]))
        rho, gamma = params[-2:]
        log_like = [
            dc_log_like(
                row.HomeGoals,
                row.AwayGoals,
                score_coefs[row.HomeTeam],
                defend_coefs[row.HomeTeam],
                score_coefs[row.AwayTeam],
                defend_coefs[row.AwayTeam],
                rho,
                gamma,
            )
            for row in dataset.itertuples()
        ]
        return -sum(log_like)

    opt_output = minimize(
        estimate_paramters,
        init_vals,
        options=options,
        constraints=constraints,
        **kwargs
    )
    if debug:
        # sort of hacky way to investigate the output of the optimisation process
        return opt_output
    else:
        return dict(
            zip(
                ["attack_" + team for team in teams]
                + ["defence_" + team for team in teams]
                + ["rho", "home_adv"],
                opt_output.x,
            )
        )

In [100]:
params = solve_parameters(epl_1718)

  + np.log(poisson.pmf(x, lambda_x))
  + np.log(poisson.pmf(y, mu_y))
  lambda_x, mu_y = np.exp(alpha_x + beta_y + gamma), np.exp(alpha_y + beta_x)
  Pk = special.xlogy(k, mu) - gamln(k + 1) - mu
  np.log(rho_correction(x, y, lambda_x, mu_y, rho))
  return 1- (lambda_x * mu_y * rho)


Iteration limit reached    (Exit mode 9)
            Current function value: nan
            Iterations: 100
            Function evaluations: 5345
            Gradient evaluations: 100


In [101]:
params

{'attack_Arsenal': nan,
 'attack_Bournemouth': nan,
 'attack_Brighton': nan,
 'attack_Burnley': nan,
 'attack_Chelsea': nan,
 'attack_Crystal Palace': nan,
 'attack_Everton': nan,
 'attack_Huddersfield': nan,
 'attack_Leicester': nan,
 'attack_Liverpool': nan,
 'attack_Man City': nan,
 'attack_Man United': nan,
 'attack_Newcastle': nan,
 'attack_Southampton': nan,
 'attack_Stoke': nan,
 'attack_Swansea': nan,
 'attack_Tottenham': nan,
 'attack_Watford': nan,
 'attack_West Brom': nan,
 'attack_West Ham': nan,
 'defence_Arsenal': nan,
 'defence_Bournemouth': nan,
 'defence_Brighton': nan,
 'defence_Burnley': nan,
 'defence_Chelsea': nan,
 'defence_Crystal Palace': nan,
 'defence_Everton': nan,
 'defence_Huddersfield': nan,
 'defence_Leicester': nan,
 'defence_Liverpool': nan,
 'defence_Man City': nan,
 'defence_Man United': nan,
 'defence_Newcastle': nan,
 'defence_Southampton': nan,
 'defence_Stoke': nan,
 'defence_Swansea': nan,
 'defence_Tottenham': nan,
 'defence_Watford': nan,
 'def

In [201]:
pd.DataFrame(
    [
        df["FTHG"].tolist(),
        df["FTAG"].tolist(),
        df["HomeTeam"].tolist(),
        df["AwayTeam"].tolist(),
    ]
).T

Unnamed: 0,0,1,2,3
0,1,2,Blackburn,Wolves
1,0,0,Fulham,Aston Villa
2,1,1,Liverpool,Sunderland
3,0,0,Newcastle,Arsenal
4,0,4,QPR,Bolton
...,...,...,...,...
375,0,1,Sunderland,Man United
376,1,0,Swansea,Liverpool
377,2,0,Tottenham,Fulham
378,2,3,West Brom,Arsenal


In [17]:
from datetime import datetime

In [20]:
datetime.now().date()

datetime.date(2021, 6, 3)