# 0005.0000 Modeling Strategy

In [1]:
import pathlib
import sys

import pandas as pd

from scipy.stats import poisson,skellam
import statsmodels.api as sm
import statsmodels.formula.api as smf
import statsmodels.tools.sm_exceptions as sm_exceptions
from patsy import dmatrix, dmatrices

from sklearn.base import BaseEstimator, RegressorMixin, ClassifierMixin


import scipy.stats as stats
from statsmodels.graphics.mosaicplot import mosaic
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import seaborn as sns

%matplotlib inline

# Load the "autoreload" extension
%load_ext autoreload
# always reload modules marked with "%aimport"
%autoreload 1
#add the 'src' directory to path to import modules
PROJECT_DIR = pathlib.Path.cwd().resolve().parent
sys.path.append(str(PROJECT_DIR))

from src.visualization.visualize import extend_cols

pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

EDA_FP = PROJECT_DIR / 'data' / 'reference' / 'eda' / 'sample.csv'

### Modeling Strategy

Load Data

Prepare for Modeling

Model a non-trivial feature

Create the feature for the data

Transform the Data

Model the transformed Data

Score the Model with various parameters

Calculate the betting return

### Load Data

In [2]:
df_orig = pd.read_csv(EDA_FP, parse_dates=['date']).sort_values(by=['date', 'h', 'a']).reset_index(drop=True)

### Prepare for Modeling

+ Ensure feature columns are floats
+ One hot encode the results column
+ Add meta data - the sequentially numbered game day within each season - required for time series cross validation

In [3]:
def insert_game_day(df_orig):
    """
    Accepts a Dataframe of league matches - may be multiple seasons
    Must have a season column identifying the season, and a date column
    Returns a dataFrame where each game day within a season is consecutively integer numbered
    
    """
    season_dfs = []
    for season in df_orig['season'].unique():
        # Make a bunch of brand spanking new dfs to concat together
        season_df = df_orig[df_orig['season'] == season].copy()
        # Number each game day in the season starting at 0
        season_df['game_day'] = season_df.groupby(by='date').ngroup(ascending=True)
        season_dfs.append(season_df)
    df = pd.concat(season_dfs, axis=0).sort_values(by=['season', 'date'])
    return df

In [4]:
# Specify columns to keep
cols = extend_cols(['hwinOddsBet365', 'drawOddsBet365', 'awinOddsBet365',
                   'hwinOddsLb', 'drawOddsLb', 'awinOddsLb'])
df_orig = df_orig[cols]

# Ensure float datatypes for modeling
int_cols = ['h_shots', 'a_shots', 'h_shotsOnTarget', 'a_shotsOnTarget']
df_orig[int_cols] = df_orig[int_cols].astype(float)

# Dummy Variables for categorical column result
res = df_orig['result']
df_orig = pd.get_dummies(df_orig, columns=['result'])
# Keep the results column as a convenience column
df_orig['result'] = res

# Add the game day meta data
df_orig = insert_game_day(df_orig)
df_orig.head()

Unnamed: 0,nation,league,season,date,h,a,h_ftGoals,a_ftGoals,h_shots,a_shots,h_shotsOnTarget,a_shotsOnTarget,hwinOddsBet365,drawOddsBet365,awinOddsBet365,hwinOddsLb,drawOddsLb,awinOddsLb,result_awin,result_draw,result_hwin,result,game_day
0,germany,bundesliga,2007-2008,2007-08-10,stuttgart,schalke,2.0,2.0,16.0,10.0,7.0,4.0,2.37,3.25,2.87,2.1,3.2,3.0,0,1,0,draw,0
1,germany,bundesliga,2007-2008,2007-08-11,bayern-munich,hansa-rostock,3.0,0.0,27.0,6.0,13.0,0.0,1.16,6.0,19.0,1.2,5.0,11.0,0,0,1,hwin,1
2,germany,bundesliga,2007-2008,2007-08-11,bochum,sv-werder-bremen,2.0,2.0,17.0,20.0,4.0,9.0,3.5,3.3,2.05,3.2,3.2,2.0,0,1,0,draw,1
3,germany,bundesliga,2007-2008,2007-08-11,eintracht-frankfurt,hertha-berlin,1.0,0.0,19.0,10.0,5.0,3.0,2.37,3.2,2.9,2.37,3.2,2.6,0,0,1,hwin,1
4,germany,bundesliga,2007-2008,2007-08-11,hannover,hamburger-sv,0.0,1.0,10.0,14.0,2.0,8.0,2.5,3.2,2.75,2.5,3.25,2.4,1,0,0,awin,1


In [5]:
df_orig.tail()

Unnamed: 0,nation,league,season,date,h,a,h_ftGoals,a_ftGoals,h_shots,a_shots,h_shotsOnTarget,a_shotsOnTarget,hwinOddsBet365,drawOddsBet365,awinOddsBet365,hwinOddsLb,drawOddsLb,awinOddsLb,result_awin,result_draw,result_hwin,result,game_day
913,germany,bundesliga,2009-2010,2010-05-08,mainz,schalke,0.0,0.0,12.0,10.0,4.0,3.0,2.88,3.4,2.38,2.75,3.25,2.2,0,1,0,draw,91
914,germany,bundesliga,2009-2010,2010-05-08,nurnberg,fc-koln,1.0,0.0,13.0,5.0,3.0,2.0,1.53,4.33,5.5,1.5,3.6,5.5,0,0,1,hwin,91
915,germany,bundesliga,2009-2010,2010-05-08,sc-freiburg,dortmund,3.0,1.0,14.0,11.0,3.0,5.0,3.75,3.6,1.91,3.5,3.4,1.83,0,0,1,hwin,91
916,germany,bundesliga,2009-2010,2010-05-08,sv-werder-bremen,hamburger-sv,1.0,1.0,22.0,15.0,8.0,4.0,1.62,4.0,5.25,1.53,3.75,5.0,0,1,0,draw,91
917,germany,bundesliga,2009-2010,2010-05-08,wolfsburg,eintracht-frankfurt,3.0,1.0,16.0,19.0,7.0,3.0,1.73,4.0,4.2,1.67,3.6,4.0,0,0,1,hwin,91


In [6]:
df_orig.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 918 entries, 0 to 917
Data columns (total 23 columns):
nation             918 non-null object
league             918 non-null object
season             918 non-null object
date               918 non-null datetime64[ns]
h                  918 non-null object
a                  918 non-null object
h_ftGoals          918 non-null float64
a_ftGoals          918 non-null float64
h_shots            918 non-null float64
a_shots            918 non-null float64
h_shotsOnTarget    918 non-null float64
a_shotsOnTarget    918 non-null float64
hwinOddsBet365     918 non-null float64
drawOddsBet365     918 non-null float64
awinOddsBet365     918 non-null float64
hwinOddsLb         918 non-null float64
drawOddsLb         918 non-null float64
awinOddsLb         918 non-null float64
result_awin        918 non-null uint8
result_draw        918 non-null uint8
result_hwin        918 non-null uint8
result             918 non-null object
game_day           9

In [7]:
df = df_orig.copy(deep=True)

In [8]:
class LeagueSeasonTimeSeriesFold():
    """
    Accepts a league season date sorted DataFrame containing a game_day column
    
    """
    
    def get_game_day_change_indices(self, X_df):
        return X_df['game_day'].diff()[X_df['game_day'].diff() != 0].index.values
    
    def split(self, X_df, y=None, groups=None):
        """
        Accepts a dataframe for a season with a 'game_day' column
        Splits into train, test for each game day in the season
        
        """
        game_day_indices = self.get_game_day_change_indices(X_df)        
        for gd_index in game_day_indices[1:]:
            n_game_day = X_df.loc[gd_index]['game_day']
            train_indices = X_df[X_df['game_day'] < n_game_day].index
            test_indices = X_df[X_df['game_day'] == n_game_day].index
            yield train_indices, test_indices


X = df[df['season'] == '2009-2010']
lstsf = LeagueSeasonTimeSeriesFold()


n =0
for train_index, test_index in lstsf.split(X):

    print('train:')
    print(train_index)
    print('test')
    print(test_index)
    print('\n')
    n  += 1
    if n >= 4:
        break

train:
Int64Index([612], dtype='int64')
test
Int64Index([613, 614, 615, 616, 617, 618], dtype='int64')


train:
Int64Index([612, 613, 614, 615, 616, 617, 618], dtype='int64')
test
Int64Index([619, 620], dtype='int64')


train:
Int64Index([612, 613, 614, 615, 616, 617, 618, 619, 620], dtype='int64')
test
Int64Index([621, 622, 623, 624, 625, 626, 627], dtype='int64')


train:
Int64Index([612, 613, 614, 615, 616, 617, 618, 619, 620, 621, 622, 623, 624, 625, 626, 627], dtype='int64')
test
Int64Index([628, 629], dtype='int64')




In [9]:
from sklearn.utils.validation import check_X_y, check_array, check_is_fitted
from sklearn.utils.multiclass import unique_labels

class PoissonRegression(BaseEstimator, ClassifierMixin):
    """
    Poisson regression
    see formula
    """
    def __init__(self, family=sm.families.Poisson(),
                 formula='goals ~ home + team + opponent',
                 max_goals=10):
        self.family = family
        self.formula = formula
        self.max_goals = max_goals
        self.epsilon = 0.1
        self.model = None

    def fit(self, X, y):
        """
        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            The training input samples.
        y : Not used
        Returns
        -------
        self : object
            Returns self.
        """
        # Check that X and y have correct shape
        #X, y = check_X_y(X, y)
        # Store the classes seen during fit
        self.classes_ = unique_labels(y)
        self.X_ = X
        self.y_ = y
        
        # Reshape the data
        self.reshaped_X_ = pd.concat([self.X_[['h','a','h_ftGoals']].assign(home=1).rename(
                             columns={'h':'team', 'a':'opponent','h_ftGoals':'goals'}),
                             self.X_[['a','h','a_ftGoals']].assign(home=0).rename(
                             columns={'a':'team', 'h':'opponent','a_ftGoals':'goals'})])
        
        # fit the model - have to handle perfect separation or not enough data
        try:
            self.model= smf.glm(formula=self.formula,
                                data=self.reshaped_X_, 
                                family=self.family).fit()
        except sm_exceptions.PerfectSeparationError:
            # print('sm_exceptions.PerfectSeparationError')
            self.model=None
        except ValueError:
            # print('ValueError')
            self.model=None        
        
        # Return the classifier
        return self.model
    

    def calc_probas(self, row):
        team_pred = [[poisson.pmf(i, team_avg) for i in range(0, self.max_goals+1)] \
                     for team_avg in [row['h_lambda'], row['a_lambda']]]
        prob_table = np.outer(np.array(team_pred[0]), np.array(team_pred[1]))
        phwin = np.sum(np.tril(prob_table, -1))
        pdraw = np.sum(np.diag(prob_table))
        pawin = np.sum(np.triu(prob_table, 1))
        # Return in same sequence as classes_
        return pawin, pdraw, phwin
        

    def predict(self, X):
        """ A reference implementation of a prediction for a classifier.
        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            The input samples.
        Returns
        -------
        y : ndarray, shape (n_samples,)
            The label for each sample is the label of the closest sample
            seen during fit.
        """
        # Check is fit had been called
        check_is_fitted(self, ['X_', 'y_'])

        # Input validation
        # X = check_array(X)
        if self.model is not None:
        # Reshape the data and make the poisson predictions
            h_preds = self.model.predict(pd.DataFrame(data={'team': X['h'].values,
                                                   'opponent': X['a'].values,
                                                   'home':1}))
            a_preds = self.model.predict(pd.DataFrame(data={'team': X['a'].values,
                                                      'opponent': X['h'].values,
                                                      'home':0}))
            temp_df = pd.DataFrame({'h_lambda': h_preds, 'a_lambda': a_preds})
            self.probas = temp_df.apply(self.calc_probas, axis=1, result_type='expand')
            self.probas.columns = ['poiss_p(awin)', 'poiss_p(draw)', 'poiss_p(hwin)']
        else:
            self.probas = pd.DataFrame({'poiss_p(awin)': [0]*len(X),
                                   'poiss_p(draw)': [0]*len(X),
                                   'poiss_p(hwin)': [0]*len(X)})
        # Clean up non-sensical predictions before we have enough data to predict
        epsilon = 0.1
        crit1 = self.probas.sum(axis=1) < 1.0 - self.epsilon
        crit2 = self.probas.sum(axis=1) > 1.0 + self.epsilon
        self.probas[crit1 | crit2] = 0
        self.probas.index = X.index
        return self.probas.idxmax(axis=1)
        
        
    def predict_proba(self, X):
        self.predict(X)
        return self.probas    

In [10]:
df = df_orig.copy(deep=True)
X = df[df['season'] == '2007-2008']
y = X.pop('result')
lstsf = LeagueSeasonTimeSeriesFold()

clf = PoissonRegression()
for train_index, test_index in lstsf.split(X):
    clf.fit(X.loc[train_index], y.loc[train_index])
    preds = clf.predict_proba(X.loc[test_index])
    res = pd.concat([preds, y.loc[test_index]], axis=1)

# look at the final prediction batch
print(res)
print(clf.classes_)

  scale = np.dot(wresid, wresid) / df_resid
  endog_mu = self._clean(endog / mu)
  endog_mu = self._clean(endog / mu)
  return 1. / (self.link.deriv(mu)**2 * self.variance(mu))
  return 1. / (self.link.deriv(mu)**2 * self.variance(mu))
  - self._offset_exposure)


     poiss_p(awin)  poiss_p(draw)  poiss_p(hwin) result
297       0.065939       0.183498       0.750556   hwin
298       0.147360       0.215506       0.637130   awin
299       0.378860       0.225298       0.395838   awin
300       0.207616       0.254447       0.537937   hwin
301       0.135171       0.273396       0.591433   hwin
302       0.184406       0.205145       0.610437   hwin
303       0.287986       0.208289       0.503709   awin
304       0.474840       0.278738       0.246422   awin
305       0.123754       0.161541       0.714624   draw
['awin' 'draw' 'hwin']


In [11]:
df = df_orig.copy(deep=True)
# X = df[df['season'] == '2007-2008']
# y = X.pop('result')
#lstsf = LeagueSeasonTimeSeriesFold()

# i don't like this !
df['poiss_p(awin)'], df['poiss_p(draw)'], df['poiss_p(hwin)'] = [0, 0, 0] 
for season in sorted(df['season'].unique()):
    print(season)
    season_df = df[df['season'] == season]

    game_day_splitter = LeagueSeasonTimeSeriesFold()
    for train_indices, predict_indices in game_day_splitter.split(season_df):
        clf = PoissonRegression()
        clf.fit(season_df.loc[train_indices], season_df.loc[train_indices, 'result'])
        preds = clf.predict_proba(season_df.loc[predict_indices])
        season_df.loc[preds.index, preds.columns] = preds[preds.columns].values
        #season_df = pd.concat([season_df, preds], sort=False, axis=0)
        #print(preds)
        #season_df[preds.columns[0]] = preds[preds.columns[0]].values
        #season_df[preds.columns[0].loc[preds.index, [preds.columns]] = preds.values

    

# clf = PoissonRegression()
# for train_index, test_index in lstsf.split(X):
#     clf.fit(X.loc[train_index], y.loc[train_index])
#     preds = clf.predict_proba(X.loc[test_index])
#     res = pd.concat([preds, y.loc[test_index]], axis=1)

# # look at the final prediction batch
# print(res)
# print(clf.classes_)

2007-2008


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


2008-2009
2009-2010


Unnamed: 0,nation,league,season,date,h,a,h_ftGoals,a_ftGoals,h_shots,a_shots,h_shotsOnTarget,a_shotsOnTarget,hwinOddsBet365,drawOddsBet365,awinOddsBet365,hwinOddsLb,drawOddsLb,awinOddsLb,result_awin,result_draw,result_hwin,result,game_day,poiss_p(awin),poiss_p(draw),poiss_p(hwin)
612,germany,bundesliga,2009-2010,2009-08-07,wolfsburg,stuttgart,2.0,0.0,13.0,14.0,7.0,4.0,1.95,3.5,3.75,2.0,3.4,3.75,0,0,1,hwin,0,0.0,0.0,0.0
613,germany,bundesliga,2009-2010,2009-08-08,dortmund,fc-koln,1.0,0.0,24.0,7.0,11.0,0.0,1.62,3.75,5.5,1.67,3.3,4.5,0,0,1,hwin,1,0.0,0.0,0.0
614,germany,bundesliga,2009-2010,2009-08-08,hertha-berlin,hannover,1.0,0.0,10.0,15.0,4.0,3.0,1.8,3.5,4.5,1.73,3.4,4.0,0,0,1,hwin,1,0.0,0.0,0.0
615,germany,bundesliga,2009-2010,2009-08-08,hoffenheim,bayern-munich,1.0,1.0,9.0,9.0,1.0,3.0,4.2,3.4,1.91,3.75,3.3,1.8,0,1,0,draw,1,0.0,0.0,0.0
616,germany,bundesliga,2009-2010,2009-08-08,mainz,leverkusen,2.0,2.0,8.0,13.0,4.0,7.0,3.8,3.4,2.0,3.2,3.2,2.0,0,1,0,draw,1,0.0,0.0,0.0
617,germany,bundesliga,2009-2010,2009-08-08,nurnberg,schalke,1.0,2.0,14.0,9.0,3.0,5.0,3.5,3.3,2.1,2.88,3.3,2.1,1,0,0,awin,1,0.0,0.0,0.0
618,germany,bundesliga,2009-2010,2009-08-08,sv-werder-bremen,eintracht-frankfurt,2.0,3.0,22.0,15.0,10.0,8.0,1.4,4.5,8.0,1.36,4.0,7.0,1,0,0,awin,1,0.0,0.0,0.0
619,germany,bundesliga,2009-2010,2009-08-09,bochum,borussia-monchengladbach,3.0,3.0,28.0,10.0,8.0,7.0,2.2,3.3,3.25,2.1,3.3,2.88,0,1,0,draw,2,0.0,0.0,0.0
620,germany,bundesliga,2009-2010,2009-08-09,sc-freiburg,hamburger-sv,1.0,1.0,17.0,14.0,6.0,3.0,3.6,3.4,2.0,3.4,3.25,1.91,0,1,0,draw,2,0.0,0.0,0.0
621,germany,bundesliga,2009-2010,2009-08-15,bayern-munich,sv-werder-bremen,1.0,1.0,9.0,10.0,4.0,1.0,1.62,3.75,5.5,1.53,3.5,5.5,0,1,0,draw,3,0.0,0.0,0.0


In [12]:
season_df.head(45)

Unnamed: 0,nation,league,season,date,h,a,h_ftGoals,a_ftGoals,h_shots,a_shots,h_shotsOnTarget,a_shotsOnTarget,hwinOddsBet365,drawOddsBet365,awinOddsBet365,hwinOddsLb,drawOddsLb,awinOddsLb,result_awin,result_draw,result_hwin,result,game_day,poiss_p(awin),poiss_p(draw),poiss_p(hwin)
612,germany,bundesliga,2009-2010,2009-08-07,wolfsburg,stuttgart,2.0,0.0,13.0,14.0,7.0,4.0,1.95,3.5,3.75,2.0,3.4,3.75,0,0,1,hwin,0,0.0,0.0,0.0
613,germany,bundesliga,2009-2010,2009-08-08,dortmund,fc-koln,1.0,0.0,24.0,7.0,11.0,0.0,1.62,3.75,5.5,1.67,3.3,4.5,0,0,1,hwin,1,0.0,0.0,0.0
614,germany,bundesliga,2009-2010,2009-08-08,hertha-berlin,hannover,1.0,0.0,10.0,15.0,4.0,3.0,1.8,3.5,4.5,1.73,3.4,4.0,0,0,1,hwin,1,0.0,0.0,0.0
615,germany,bundesliga,2009-2010,2009-08-08,hoffenheim,bayern-munich,1.0,1.0,9.0,9.0,1.0,3.0,4.2,3.4,1.91,3.75,3.3,1.8,0,1,0,draw,1,0.0,0.0,0.0
616,germany,bundesliga,2009-2010,2009-08-08,mainz,leverkusen,2.0,2.0,8.0,13.0,4.0,7.0,3.8,3.4,2.0,3.2,3.2,2.0,0,1,0,draw,1,0.0,0.0,0.0
617,germany,bundesliga,2009-2010,2009-08-08,nurnberg,schalke,1.0,2.0,14.0,9.0,3.0,5.0,3.5,3.3,2.1,2.88,3.3,2.1,1,0,0,awin,1,0.0,0.0,0.0
618,germany,bundesliga,2009-2010,2009-08-08,sv-werder-bremen,eintracht-frankfurt,2.0,3.0,22.0,15.0,10.0,8.0,1.4,4.5,8.0,1.36,4.0,7.0,1,0,0,awin,1,0.0,0.0,0.0
619,germany,bundesliga,2009-2010,2009-08-09,bochum,borussia-monchengladbach,3.0,3.0,28.0,10.0,8.0,7.0,2.2,3.3,3.25,2.1,3.3,2.88,0,1,0,draw,2,0.0,0.0,0.0
620,germany,bundesliga,2009-2010,2009-08-09,sc-freiburg,hamburger-sv,1.0,1.0,17.0,14.0,6.0,3.0,3.6,3.4,2.0,3.4,3.25,1.91,0,1,0,draw,2,0.0,0.0,0.0
621,germany,bundesliga,2009-2010,2009-08-15,bayern-munich,sv-werder-bremen,1.0,1.0,9.0,10.0,4.0,1.0,1.62,3.75,5.5,1.53,3.5,5.5,0,1,0,draw,3,0.0,0.0,0.0


In [13]:
season_df.tail()

Unnamed: 0,nation,league,season,date,h,a,h_ftGoals,a_ftGoals,h_shots,a_shots,h_shotsOnTarget,a_shotsOnTarget,hwinOddsBet365,drawOddsBet365,awinOddsBet365,hwinOddsLb,drawOddsLb,awinOddsLb,result_awin,result_draw,result_hwin,result,game_day,poiss_p(awin),poiss_p(draw),poiss_p(hwin)
913,germany,bundesliga,2009-2010,2010-05-08,mainz,schalke,0.0,0.0,12.0,10.0,4.0,3.0,2.88,3.4,2.38,2.75,3.25,2.2,0,1,0,draw,91,0.492853,0.288792,0.218355
914,germany,bundesliga,2009-2010,2010-05-08,nurnberg,fc-koln,1.0,0.0,13.0,5.0,3.0,2.0,1.53,4.33,5.5,1.5,3.6,5.5,0,0,1,hwin,91,0.413702,0.306825,0.279473
915,germany,bundesliga,2009-2010,2010-05-08,sc-freiburg,dortmund,3.0,1.0,14.0,11.0,3.0,5.0,3.75,3.6,1.91,3.5,3.4,1.83,0,0,1,hwin,91,0.59787,0.226394,0.175732
916,germany,bundesliga,2009-2010,2010-05-08,sv-werder-bremen,hamburger-sv,1.0,1.0,22.0,15.0,8.0,4.0,1.62,4.0,5.25,1.53,3.75,5.0,0,1,0,draw,91,0.257035,0.218959,0.523998
917,germany,bundesliga,2009-2010,2010-05-08,wolfsburg,eintracht-frankfurt,3.0,1.0,16.0,19.0,7.0,3.0,1.73,4.0,4.2,1.67,3.6,4.0,0,0,1,hwin,91,0.285043,0.203489,0.511446


In [14]:
stop

NameError: name 'stop' is not defined

In [None]:
class SMWrapper(BaseEstimator, RegressorMixin):
    def __init__(self, family, formula, alpha, L1_wt):
        self.family = family
        self.formula = formula
        self.alpha = alpha
        self.L1_wt = L1_wt
        self.model = None
        self.result = None
    def fit(self, X, y):
        data = pd.concat([pd.DataFrame(X), pd.Series(y)], axis=1)
        data.columns = X.columns.tolist() + ['y']
        self.model = glm_sm(self.formula, data, family=self.family)
        self.result = self.model.fit_regularized(alpha=self.alpha, L1_wt=self.L1_wt, refit=True)
        return self.result
    def predict(self, X):
        return self.result.predict(X)

In [None]:
def get_game_day_change_indices(df):
    return df['game_day'].diff()[df['game_day'].diff() != 0].index.values

In [None]:
def poiss_reshape_train_data(df):
    return pd.concat([df[['h','a','h_ftGoals']].assign(home=1).rename(
                     columns={'h':'team', 'a':'opponent','h_ftGoals':'goals'}),
                     df[['a','h','a_ftGoals']].assign(home=0).rename(
                     columns={'a':'team', 'h':'opponent','a_ftGoals':'goals'})])

In [None]:
def fit_poiss_model(train_data):
    """
    For the first games of the season the model will not run
    There is perfect sparation or not enough data
    So catch the exceptions and explicitly return None
    The function catching this data must deal with a none return value
    """
    try:
        return smf.glm(formula="goals ~ home + team + opponent",
                       data=train_data, 
                       family=sm.families.Poisson()).fit()
    except sm_exceptions.PerfectSeparationError:
        return None
    except ValueError:
        return None

In [None]:
def get_poiss_preds(model, predict_data):
    
    h_preds = model.predict(pd.DataFrame(data={'team': predict_data['h'].values,
                                               'opponent': predict_data['a'].values,
                                               'home':1}))
    a_preds = model.predict(pd.DataFrame(data={'team': predict_data['a'].values,
                                                  'opponent': predict_data['h'].values,
                                                  'home':0}))
    return h_preds.values, a_preds.values

In [None]:
class SMWrapper(BaseEstimator, RegressorMixin):
    def __init__(self, family, formula, alpha, L1_wt):
        self.family = family
        self.formula = formula
        self.alpha = alpha
        self.L1_wt = L1_wt
        self.model = None
        self.result = None
    def fit(self, X, y):
        data = pd.concat([pd.DataFrame(X), pd.Series(y)], axis=1)
        data.columns = X.columns.tolist() + ['y']
        self.model = glm_sm(self.formula, data, family=self.family)
        self.result = self.model.fit_regularized(alpha=self.alpha, L1_wt=self.L1_wt, refit=True)
        return self.result
    def predict(self, X):
        return self.result.predict(X)

In [None]:
def run_model_predict_cycles(df_orig):
    df = df_orig.copy(deep=True)
    df = insert_game_day(df)
    game_day_indices = get_game_day_change_indices(df)
    
    # We can only attempt to predict after the first game day so we start looping at 1
    for gd_index in game_day_indices[1:]:
        n_game_day = df.iloc[gd_index]['game_day']
        train_data = df[df['game_day'] < n_game_day]
        predict_data = df[df['game_day'] == n_game_day]
    
        # 3 functions below and the new column assignments are all coupled
        # All must be changed for different model
        # Should extract this block into a interchangable function
        # for different models or write a class with\
        # reshape, fit, predict, insert methods
        poiss_reshaped_train_data = poiss_reshape_train_data(train_data)
        poiss_model = fit_poiss_model(poiss_reshaped_train_data)
        if poiss_model is not None:
            poiss_pred_h_coeffs, poiss_pred_a_coeffs = get_poiss_preds(poiss_model, predict_data)
            df.loc[predict_data.index, 'h_poissPred'] = poiss_pred_h_coeffs
            df.loc[predict_data.index, 'a_poissPred'] = poiss_pred_a_coeffs
        else:
            df.loc[predict_data.index, 'h_poissPred'] = np.NaN
            df.loc[predict_data.index, 'a_poissPred'] = np.NaN  
        # We can run this through a numpy matrix function to return
        # - h_poissProbWin
        # - h_poissProbDraw
        # - a_poissProbWin
        # - a_poissProbDraw
        # End of coupled block of functions
        
    return df
        
        
df = run_model_predict_cycles(season_df)
df

In [None]:
import itertools
import numpy as np
import pandas as pd

np.set_printoptions(precision=2)

import matplotlib.pyplot as plt
%matplotlib inline

Feature Nomenclature & Time Series to Supervised

https://machinelearningmastery.com/convert-time-series-supervised-learning-problem-python/

https://machinelearningmastery.com/time-series-forecasting-supervised-learning/

DataFrame in time sequence where index 0 is the earliest game, and index max is the latest game

There are 2 teams per game `h` and `a`, where h indicates the team played at home, and a - away.

Each team has generated a feature by the end of the game - `h_feat`, and `a_feat`

There could be multiple features, meaning a double  set for each feature

Each team has a home game record and an away game record

So, there are 4 records to get for each feature
+ home team home record `h_h`
+ home team away record `h_a`
+ away team home record `a_h`
+ away team away record `a_a`

Results presented as 0, -1, -2 ... -n where 0 referes to the current game. This column can be used as a predictor variable for classification, or used as a classification response, or dropped and just the previous game features used for prediction


### Interpretation Note

`h_h_feat_-1` means:
+ (`h`) home team 
+ (`h`) home game records 
+ (`feat`) feature value in 
+ (`-1`) last game

In [None]:
df = pd.DataFrame({'h':['A','C','B','E','A','B','L','M','B'],
                  'a':['B','D','A','F','J','K','B','A','S'],
                  'h_firstfeat':[0,1,2,3,4,5,6,7,8],
                  'a_firstfeat':[9,10,11,12,13,14,15,16,17],
                  'h_secondfeat':[18,19,20,21,22,23,24,25,26],
                  'a_secondfeat':[27,28,29,30,31,32,33,34,35]})
df