# Logistic Regression Power Ranking

## David Sheehan Poisson regression

https://dashee87.github.io/football/python/predicting-football-results-with-statistical-modelling/

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn
from scipy.stats import poisson,skellam
import statsmodels.api as sm
import statsmodels.formula.api as smf
from patsy import dmatrix, dmatrices

In [2]:
epl_1617 = pd.read_csv("http://www.football-data.co.uk/mmz4281/1617/E0.csv")
epl_1617 = epl_1617[['HomeTeam','AwayTeam','FTHG','FTAG']]
epl_1617 = epl_1617.rename(columns={'HomeTeam': 'h', 'AwayTeam': 'a','FTHG': 'h_ftGoals', 'FTAG': 'a_ftGoals'})
epl_1617.head()

Unnamed: 0,h,a,h_ftGoals,a_ftGoals
0,Burnley,Swansea,0,1
1,Crystal Palace,West Brom,0,1
2,Everton,Tottenham,1,1
3,Hull,Leicester,2,1
4,Man City,Sunderland,2,1


In [3]:
epl_1617 = epl_1617[:-10]
epl_1617.mean()


h_ftGoals    1.591892
a_ftGoals    1.183784
dtype: float64

In [4]:
epl_1617.head()

Unnamed: 0,h,a,h_ftGoals,a_ftGoals
0,Burnley,Swansea,0,1
1,Crystal Palace,West Brom,0,1
2,Everton,Tottenham,1,1
3,Hull,Leicester,2,1
4,Man City,Sunderland,2,1


In [5]:
epl_1617.tail()

Unnamed: 0,h,a,h_ftGoals,a_ftGoals
365,Chelsea,Watford,4,3
366,Arsenal,Sunderland,2,0
367,Man City,West Brom,3,1
368,Southampton,Man United,0,0
369,Leicester,Tottenham,1,6


In [6]:
epl_1617.shape

(370, 4)

In [7]:
goal_model_data = pd.concat([epl_1617[['h','a','h_ftGoals']].assign(home=1).rename(
            columns={'h':'team', 'a':'opponent','h_ftGoals':'goals'}),
           epl_1617[['a','h','a_ftGoals']].assign(home=0).rename(
            columns={'a':'team', 'h':'opponent','a_ftGoals':'goals'})])
goal_model_data.head()

Unnamed: 0,team,opponent,goals,home
0,Burnley,Swansea,0,1
1,Crystal Palace,West Brom,0,1
2,Everton,Tottenham,1,1
3,Hull,Leicester,2,1
4,Man City,Sunderland,2,1


In [8]:
goal_model_data.iloc[365:374]

Unnamed: 0,team,opponent,goals,home
365,Chelsea,Watford,4,1
366,Arsenal,Sunderland,2,1
367,Man City,West Brom,3,1
368,Southampton,Man United,0,1
369,Leicester,Tottenham,1,1
0,Swansea,Burnley,1,0
1,West Brom,Crystal Palace,1,0
2,Tottenham,Everton,1,0
3,Leicester,Hull,1,0


In [9]:
goal_model_data.tail()

Unnamed: 0,team,opponent,goals,home
365,Watford,Chelsea,3,0
366,Sunderland,Arsenal,0,0
367,West Brom,Man City,1,0
368,Man United,Southampton,0,0
369,Tottenham,Leicester,6,0


In [10]:
goal_model_data.shape

(740, 4)

In [11]:
poisson_model = smf.glm(formula="goals ~ home + team + opponent", data=goal_model_data, 
                        family=sm.families.Poisson()).fit()
poisson_model.summary()

0,1,2,3
Dep. Variable:,goals,No. Observations:,740.0
Model:,GLM,Df Residuals:,700.0
Model Family:,Poisson,Df Model:,39.0
Link Function:,log,Scale:,1.0
Method:,IRLS,Log-Likelihood:,-1042.4
Date:,"Mon, 16 Dec 2019",Deviance:,776.11
Time:,20:27:09,Pearson chi2:,659.0
No. Iterations:,5,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,0.3725,0.198,1.880,0.060,-0.016,0.761
team[T.Bournemouth],-0.2891,0.179,-1.612,0.107,-0.641,0.062
team[T.Burnley],-0.6458,0.200,-3.230,0.001,-1.038,-0.254
team[T.Chelsea],0.0789,0.162,0.488,0.626,-0.238,0.396
team[T.Crystal Palace],-0.3865,0.183,-2.107,0.035,-0.746,-0.027
team[T.Everton],-0.2008,0.173,-1.161,0.246,-0.540,0.138
team[T.Hull],-0.7006,0.204,-3.441,0.001,-1.100,-0.302
team[T.Leicester],-0.4204,0.187,-2.249,0.025,-0.787,-0.054
team[T.Liverpool],0.0162,0.164,0.099,0.921,-0.306,0.338


In [12]:
poisson_model.predict(pd.DataFrame(data={'team': 'Chelsea', 'opponent': 'Sunderland',
                                       'home':1},index=[1]))

1    3.061662
dtype: float64

In [13]:
poisson_model.predict(pd.DataFrame(data={'team': 'Sunderland', 'opponent': 'Chelsea',
                                       'home':0},index=[1]))


1    0.409373
dtype: float64

## Jordan Tigani Logistic Regression

https://www.youtube.com/watch?v=m2rhRJkWOEI&list=PLatexvNeFffyzlyAtDiUffmokReDzTQk6

https://github.com/GoogleCloudPlatform/ipython-soccer-predictions/blob/master/predict/wc-final.ipynb

In [14]:
# 1 is home team, -1 is away team
cBr = [1,0,1,0,0,-1]
cMe = [0,1,-1,0,-1,0]
cCr = [-1,0,0,-1,1,0]
cCa = [0,-1,0,1,0,1]
h_ftGoals = [3,1,0,0,1,1]
a_ftGoals = [1,0,0,4,3,4]
df = pd.DataFrame({'brazil': cBr, 'mexico': cMe, 'croatia': cCr, 'camaroon': cCa,
                   'h_ftGoals': h_ftGoals, 'a_ftGoals': a_ftGoals, 'intercept': [1]*6})
df['goalDiff'] = df['h_ftGoals'].sub(df['a_ftGoals'])
df.head(10)

Unnamed: 0,brazil,mexico,croatia,camaroon,h_ftGoals,a_ftGoals,intercept,goalDiff
0,1,0,-1,0,3,1,1,2
1,0,1,0,-1,1,0,1,1
2,1,-1,0,0,0,0,1,0
3,0,0,-1,1,0,4,1,-4
4,0,-1,1,0,1,3,1,-2
5,-1,0,0,1,1,4,1,-3


In [15]:
def points_to_sgn(p):
  if p > 0.1: return 1.0
  elif p < -0.1: return -1.0
  else: return 0.0

df['points'] = df['goalDiff'].apply(points_to_sgn)
df.head(10)

Unnamed: 0,brazil,mexico,croatia,camaroon,h_ftGoals,a_ftGoals,intercept,goalDiff,points
0,1,0,-1,0,3,1,1,2,1.0
1,0,1,0,-1,1,0,1,1,1.0
2,1,-1,0,0,0,0,1,0,0.0
3,0,0,-1,1,0,4,1,-4,-1.0
4,0,-1,1,0,1,3,1,-2,-1.0
5,-1,0,0,1,1,4,1,-3,-1.0


In [16]:
X = df[['brazil', 'mexico', 'croatia', 'camaroon']].values
y = df['points'].values

In [17]:
X

array([[ 1,  0, -1,  0],
       [ 0,  1,  0, -1],
       [ 1, -1,  0,  0],
       [ 0,  0, -1,  1],
       [ 0, -1,  1,  0],
       [-1,  0,  0,  1]])

In [18]:
model = sm.Logit(y, X).fit_regularized(method='l1', alpha=1.5)

ValueError: endog must be in the unit interval.

## My Attempt

In [19]:
epl_1617 = pd.read_csv("http://www.football-data.co.uk/mmz4281/1617/E0.csv")
epl_1617 = epl_1617[['HomeTeam','AwayTeam','FTHG','FTAG']]
epl_1617 = epl_1617.rename(columns={'HomeTeam': 'h', 'AwayTeam': 'a','FTHG': 'h_ftGoals', 'FTAG': 'a_ftGoals'})
epl_1617 = epl_1617[:-10]
epl_1617['goalDiff'] = epl_1617['h_ftGoals'] - epl_1617['a_ftGoals']
epl_1617.head()

Unnamed: 0,h,a,h_ftGoals,a_ftGoals,goalDiff
0,Burnley,Swansea,0,1,-1
1,Crystal Palace,West Brom,0,1,-1
2,Everton,Tottenham,1,1,0
3,Hull,Leicester,2,1,1
4,Man City,Sunderland,2,1,1


In [20]:
# Construct and print model matrix for color as categorical variable
matrix= dmatrix('C(h)', data = epl_1617, return_type = 'dataframe')

In [21]:
matrix.head()

Unnamed: 0,Intercept,C(h)[T.Bournemouth],C(h)[T.Burnley],C(h)[T.Chelsea],C(h)[T.Crystal Palace],C(h)[T.Everton],C(h)[T.Hull],C(h)[T.Leicester],C(h)[T.Liverpool],C(h)[T.Man City],C(h)[T.Man United],C(h)[T.Middlesbrough],C(h)[T.Southampton],C(h)[T.Stoke],C(h)[T.Sunderland],C(h)[T.Swansea],C(h)[T.Tottenham],C(h)[T.Watford],C(h)[T.West Brom],C(h)[T.West Ham]
0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [22]:
cols = [col for col in matrix.columns if col != 'Intercept']
len(cols)

19

19 columns for 20 teams

In [23]:
# Construct and print model matrix for color as categorical variable
matrices= dmatrices('h_ftGoals ~ C(h) + C(a)', data = epl_1617, return_type = 'dataframe')

In [24]:
matrices[0].head()

Unnamed: 0,h_ftGoals
0,0.0
1,0.0
2,1.0
3,2.0
4,2.0


In [25]:
matrices[1].head()

Unnamed: 0,Intercept,C(h)[T.Bournemouth],C(h)[T.Burnley],C(h)[T.Chelsea],C(h)[T.Crystal Palace],C(h)[T.Everton],C(h)[T.Hull],C(h)[T.Leicester],C(h)[T.Liverpool],C(h)[T.Man City],...,C(a)[T.Man United],C(a)[T.Middlesbrough],C(a)[T.Southampton],C(a)[T.Stoke],C(a)[T.Sunderland],C(a)[T.Swansea],C(a)[T.Tottenham],C(a)[T.Watford],C(a)[T.West Brom],C(a)[T.West Ham]
0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


In [26]:
h_ftGoals_model = smf.glm(formula="h_ftGoals ~ h + a", data=epl_1617, 
                        family=sm.families.Poisson()).fit()
h_ftGoals_model.summary()

0,1,2,3
Dep. Variable:,h_ftGoals,No. Observations:,370.0
Model:,GLM,Df Residuals:,331.0
Model Family:,Poisson,Df Model:,38.0
Link Function:,log,Scale:,1.0
Method:,IRLS,Log-Likelihood:,-538.42
Date:,"Mon, 16 Dec 2019",Deviance:,350.6
Time:,20:27:32,Pearson chi2:,293.0
No. Iterations:,5,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,0.6213,0.256,2.428,0.015,0.120,1.123
h[T.Bournemouth],-0.0571,0.238,-0.240,0.811,-0.524,0.410
h[T.Burnley],-0.3426,0.261,-1.312,0.190,-0.854,0.169
h[T.Chelsea],0.3158,0.219,1.439,0.150,-0.114,0.746
h[T.Crystal Palace],-0.4386,0.264,-1.661,0.097,-0.956,0.079
h[T.Everton],0.1037,0.227,0.456,0.648,-0.342,0.549
h[T.Hull],-0.2758,0.255,-1.080,0.280,-0.776,0.225
h[T.Leicester],-0.1412,0.248,-0.569,0.569,-0.628,0.345
h[T.Liverpool],0.1494,0.228,0.655,0.512,-0.297,0.596


In [27]:
a_ftGoals_model = smf.glm(formula="a_ftGoals ~ h + a", data=epl_1617, 
                        family=sm.families.Poisson()).fit()
a_ftGoals_model.summary()

0,1,2,3
Dep. Variable:,a_ftGoals,No. Observations:,370.0
Model:,GLM,Df Residuals:,331.0
Model Family:,Poisson,Df Model:,38.0
Link Function:,log,Scale:,1.0
Method:,IRLS,Log-Likelihood:,-483.89
Date:,"Mon, 16 Dec 2019",Deviance:,385.27
Time:,20:27:32,Pearson chi2:,325.0
No. Iterations:,5,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,0.3616,0.309,1.169,0.243,-0.245,0.968
h[T.Bournemouth],0.5703,0.319,1.789,0.074,-0.054,1.195
h[T.Burnley],0.1418,0.350,0.404,0.686,-0.545,0.829
h[T.Chelsea],0.0282,0.360,0.078,0.938,-0.678,0.734
h[T.Crystal Palace],0.4372,0.327,1.335,0.182,-0.204,1.079
h[T.Everton],-0.0269,0.360,-0.075,0.940,-0.732,0.678
h[T.Hull],0.5878,0.321,1.831,0.067,-0.041,1.217
h[T.Leicester],0.4219,0.330,1.278,0.201,-0.225,1.069
h[T.Liverpool],0.1479,0.351,0.422,0.673,-0.539,0.835


In [28]:
h_ftGoals_model.predict(pd.DataFrame(data={'h': 'Chelsea', 'a': 'Sunderland'},index=[1]))


1    2.888013
dtype: float64

In [29]:
a_ftGoals_model.predict(pd.DataFrame(data={'h': 'Chelsea', 'a': 'Sunderland'},index=[1]))


1    0.505795
dtype: float64

In [30]:
h_ftGoals_model.predict(pd.DataFrame(data={'a': 'Sunderland', 'h': 'Chelsea'},index=[1]))

1    2.888013
dtype: float64

## Incorporating as a Feature

In [31]:
epl_1617 = pd.read_csv("http://www.football-data.co.uk/mmz4281/1617/E0.csv", dayfirst=True, parse_dates=['Date'])
epl_1617 = epl_1617[['Date', 'HomeTeam', 'AwayTeam', 'FTHG', 'FTAG']]
epl_1617 = epl_1617.rename(columns={'Date': 'date', 'HomeTeam': 'h', 'AwayTeam': 'a',
                                    'FTHG': 'h_ftGoals',
                                    'FTAG': 'a_ftGoals'}).sort_values(by='date').reset_index(drop=True)
epl_1617.head()

Unnamed: 0,date,h,a,h_ftGoals,a_ftGoals
0,2016-08-13,Burnley,Swansea,0,1
1,2016-08-13,Crystal Palace,West Brom,0,1
2,2016-08-13,Everton,Tottenham,1,1
3,2016-08-13,Hull,Leicester,2,1
4,2016-08-13,Man City,Sunderland,2,1


In [32]:
epl_1617.tail()

Unnamed: 0,date,h,a,h_ftGoals,a_ftGoals
375,2017-05-21,Chelsea,Sunderland,5,1
376,2017-05-21,Burnley,West Ham,1,2
377,2017-05-21,Arsenal,Everton,3,1
378,2017-05-21,Hull,Tottenham,1,7
379,2017-05-21,Watford,Man City,0,5


In [33]:
epl_1617['game_day'] = epl_1617.groupby(by='date').ngroup(ascending=True)
epl_1617.head(20)

Unnamed: 0,date,h,a,h_ftGoals,a_ftGoals,game_day
0,2016-08-13,Burnley,Swansea,0,1,0
1,2016-08-13,Crystal Palace,West Brom,0,1,0
2,2016-08-13,Everton,Tottenham,1,1,0
3,2016-08-13,Hull,Leicester,2,1,0
4,2016-08-13,Man City,Sunderland,2,1,0
5,2016-08-13,Middlesbrough,Stoke,1,1,0
6,2016-08-13,Southampton,Watford,1,1,0
7,2016-08-14,Arsenal,Liverpool,3,4,1
8,2016-08-14,Bournemouth,Man United,1,3,1
9,2016-08-15,Chelsea,West Ham,2,1,2


In [34]:
epl_1617.tail(20)

Unnamed: 0,date,h,a,h_ftGoals,a_ftGoals,game_day
360,2017-05-13,Bournemouth,Burnley,2,1,100
361,2017-05-13,Man City,Leicester,2,1,100
362,2017-05-14,Crystal Palace,Hull,4,0,101
363,2017-05-14,Tottenham,Man United,2,1,101
364,2017-05-14,West Ham,Liverpool,0,4,101
365,2017-05-15,Chelsea,Watford,4,3,102
366,2017-05-16,Arsenal,Sunderland,2,0,103
367,2017-05-16,Man City,West Brom,3,1,103
368,2017-05-17,Southampton,Man United,0,0,104
369,2017-05-18,Leicester,Tottenham,1,6,105


In [35]:
# We want to calculate the home and away coefficient for each team and set it as a feature
# We need some built up history to make this work
# Arbitrarily pick game day 50 - halway thorugh the season as an initial experiment
# Calculate the coeeficients on each progressive match day and place them into the dataframe as a feature

In [36]:
epl_1617[epl_1617['game_day'] >= 50].head()

Unnamed: 0,date,h,a,h_ftGoals,a_ftGoals,game_day
188,2017-01-01,Watford,Tottenham,1,4,50
189,2017-01-01,Arsenal,Crystal Palace,2,0,50
190,2017-01-02,Everton,Southampton,3,0,51
191,2017-01-02,Man City,Burnley,2,1,51
192,2017-01-02,Middlesbrough,Leicester,0,0,51


In [37]:
# Game Day 50 is 189 games into the season, so will have an adequate record of home and away wins by this time

In [40]:
# Indices where the game day changes
# we will step through these from 188 and up (inclusive)
epl_1617['game_day'].diff()[epl_1617['game_day'].diff() != 0].index.values

array([  0,   7,   9,  10,  11,  18,  20,  28,  30,  38,  39,  40,  41,
        46,  50,  58,  59,  60,  61,  66,  70,  77,  79,  80,  88,  90,
        97,  99, 100, 105, 110, 118, 119, 120, 126, 130, 137, 139, 140,
       146, 150, 152, 160, 166, 169, 170, 178, 179, 180, 181, 188, 190,
       196, 199, 200, 208, 210, 217, 220, 227, 230, 238, 240, 247, 249,
       250, 256, 257, 258, 265, 267, 268, 269, 272, 273, 280, 283, 291,
       293, 297, 303, 310, 312, 313, 320, 322, 323, 327, 329, 330, 333,
       334, 339, 343, 344, 345, 351, 353, 354, 355, 357, 362, 365, 366,
       368, 369, 370])

In [None]:
# First add columns for each team, then run the regression at each change in the game day,
# and fill the team column with the appropriate regression coefficients, 
# ? Fill h_h_coeff, h_a_coeff, a_a_coeff, a_h_coeff ? - Try this to start
# Then can coefficients be calculated based on odds or implied probailities ?
# Also in a seperate notebook, look at decile of season completed as a feature