# 0006.0002b Features - Poisson Regression Model b

In [1]:
import pathlib
import sys

import pandas as pd
import numpy as np

import scipy.stats as stats

import statsmodels.api as sm
import statsmodels.formula.api as smf
from scipy.stats import poisson, nbinom


import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import seaborn as sns

%matplotlib inline

# Load the "autoreload" extension
%load_ext autoreload
# always reload modules marked with "%aimport"
%autoreload 2
#add the 'src' directory to path to import modules
PROJECT_DIR = pathlib.Path.cwd().resolve().parent
sys.path.append(str(PROJECT_DIR))

from src.visualization.visualize import (extend_cols,
                                         vstacked_bar_charts,
                                         diagnose_discrete_fit)
from src.visualization.fit_dists import fit_nbinom

pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

fn = '2007-2008__2017-2018.csv'
EDA_FP = PROJECT_DIR / 'data' / 'sample' / '01-stacked-seasons' / 'germany' / 'bundesliga' / fn
SAVED_IMAGES_DIR = PROJECT_DIR / 'notebooks' / 'saved-images'








import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from scipy.stats import poisson, nbinom
import pathlib

import statsmodels.api as sm
import statsmodels.formula.api as smf

%matplotlib inline

ENV_DIR = pathlib.Path.cwd().parent.parent.parent

season_fp = (ENV_DIR / 'data' / 'soccer' / 'raw' / 'england' / 'premier' / '2016-2017'/
             'football-data-co-uk'/ 'season-data' / '2016-2017.csv')

## Plan
- Run Model as per web page
- Identify blocks transform, fit, predict
- Devise a scheme for converting to scikit-learn compatible code
- Write code
- Test

---


## Run Model with David Sheehan's Code

This web page walks through a method for running a Poisson regression on league football data:

[David Sheehan - Predicting Football Results with Statistical Modeling](https://dashee87.github.io/football/python/predicting-football-results-with-statistical-modelling/)

I'll re-run this data using David's code

- Load & trim Data
- Format data for Model

### Load & trim data

In [2]:
df_all = pd.read_csv(season_fp)
df_all.head()

Unnamed: 0,Div,Date,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,HTR,...,BbAv<2.5,BbAH,BbAHh,BbMxAHH,BbAvAHH,BbMxAHA,BbAvAHA,PSCH,PSCD,PSCA
0,E0,13/08/16,Burnley,Swansea,0,1,A,0,0,D,...,1.61,32,-0.25,2.13,2.06,1.86,1.81,2.79,3.16,2.89
1,E0,13/08/16,Crystal Palace,West Brom,0,1,A,0,0,D,...,1.52,33,-0.5,2.07,2.0,1.9,1.85,2.25,3.15,3.86
2,E0,13/08/16,Everton,Tottenham,1,1,D,1,0,H,...,1.77,32,0.25,1.91,1.85,2.09,2.0,3.64,3.54,2.16
3,E0,13/08/16,Hull,Leicester,2,1,H,1,0,H,...,1.67,31,0.25,2.35,2.26,2.03,1.67,4.68,3.5,1.92
4,E0,13/08/16,Man City,Sunderland,2,1,H,1,0,H,...,2.48,34,-1.5,1.81,1.73,2.2,2.14,1.25,6.5,14.5


In [3]:
df = df_all[['HomeTeam','AwayTeam','FTHG','FTAG']]
df = df.rename(columns={'FTHG': 'HomeGoals', 'FTAG': 'AwayGoals'})
df.head()

Unnamed: 0,HomeTeam,AwayTeam,HomeGoals,AwayGoals
0,Burnley,Swansea,0,1
1,Crystal Palace,West Brom,0,1
2,Everton,Tottenham,1,1
3,Hull,Leicester,2,1
4,Man City,Sunderland,2,1


### Format Data for Model

In [4]:
# Use the first 370 matches for training
df_train = df[:-10]
# Split off the last 10 matches as a test set
df_test = df[-10:]
df_test.head(12)

Unnamed: 0,HomeTeam,AwayTeam,HomeGoals,AwayGoals
370,Arsenal,Everton,3,1
371,Burnley,West Ham,1,2
372,Chelsea,Sunderland,5,1
373,Hull,Tottenham,1,7
374,Leicester,Bournemouth,1,1
375,Liverpool,Middlesbrough,3,0
376,Man United,Crystal Palace,2,0
377,Southampton,Stoke,0,1
378,Swansea,West Brom,2,1
379,Watford,Man City,0,5


In [5]:
# Reshape the match table into long format for the statsmodels Poisson
# regression: one row per team per match, with a 0/1 `home` flag.
print('Original shape: ', df_train.shape, '\n')

# Rows seen from the home side's point of view
home_rows = (df_train[['HomeTeam', 'AwayTeam', 'HomeGoals']]
             .rename(columns={'HomeTeam':'team', 'AwayTeam':'opponent','HomeGoals':'goals'})
             .assign(home=1))
# Rows seen from the away side's point of view
away_rows = (df_train[['AwayTeam', 'HomeTeam', 'AwayGoals']]
             .rename(columns={'AwayTeam':'team', 'HomeTeam':'opponent','AwayGoals':'goals'})
             .assign(home=0))
# Home rows first, then away rows; the original match index is kept on both halves
goal_model_data = pd.concat([home_rows, away_rows])

print('Head \n', goal_model_data.head(), '\n')
print('Tail \n', goal_model_data.tail(), '\n')
print('Formatted Shape: ', goal_model_data.shape)

Original shape:  (370, 4) 

Head 
              team    opponent  goals  home
0         Burnley     Swansea      0     1
1  Crystal Palace   West Brom      0     1
2         Everton   Tottenham      1     1
3            Hull   Leicester      2     1
4        Man City  Sunderland      2     1 

Tail 
            team     opponent  goals  home
365     Watford      Chelsea      3     0
366  Sunderland      Arsenal      0     0
367   West Brom     Man City      1     0
368  Man United  Southampton      0     0
369   Tottenham    Leicester      6     0 

Formatted Shape:  (740, 4)


In [6]:
# Looking at a single team
print(goal_model_data[goal_model_data['team'] == 'Burnley'])
# There are 20 teams in the league.
# Each team plays every other team twice: once at home and once away,
# so each team plays 38 games.
# 38 games x 20 teams = 760, but there are 2 teams per game, so games = 760/2 = 380
# The formatting gives one row per team per game, with a 0/1 `home` flag marking the home side

        team        opponent  goals  home
0    Burnley         Swansea      0     1
11   Burnley       Liverpool      2     1
32   Burnley            Hull      1     1
59   Burnley         Watford      2     1
66   Burnley         Arsenal      0     1
82   Burnley         Everton      2     1
101  Burnley  Crystal Palace      3     1
120  Burnley        Man City      1     1
141  Burnley     Bournemouth      3     1
171  Burnley   Middlesbrough      1     1
181  Burnley      Sunderland      4     1
200  Burnley     Southampton      1     1
222  Burnley       Leicester      1     1
247  Burnley         Chelsea      1     1
283  Burnley       Tottenham      0     1
293  Burnley           Stoke      1     1
327  Burnley      Man United      0     1
346  Burnley       West Brom      2     1
20   Burnley         Chelsea      0     0
43   Burnley       Leicester      0     0
78   Burnley     Southampton      1     0
91   Burnley      Man United      0     0
119  Burnley       West Brom      

### Setup Model

In [7]:
# I like this because the model specification is crystal clear
# but the price we pay is the horrible code to reformat the data
# This should be abstracted away into either a fit or a transform
# Maybe we can use patsy to make life easier
poisson_model_spec = smf.glm(formula="goals ~ home + team + opponent",
                             data=goal_model_data, family=sm.families.Poisson())

### Fit

In [8]:
poisson_model = poisson_model_spec.fit()

### Review Fitted Model

In [9]:
poisson_model.summary()

0,1,2,3
Dep. Variable:,goals,No. Observations:,740
Model:,GLM,Df Residuals:,700
Model Family:,Poisson,Df Model:,39
Link Function:,log,Scale:,1.0000
Method:,IRLS,Log-Likelihood:,-1042.4
Date:,"Fri, 22 Feb 2019",Deviance:,776.11
Time:,19:03:59,Pearson chi2:,659.
No. Iterations:,5,Covariance Type:,nonrobust

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,0.3725,0.198,1.880,0.060,-0.016,0.761
team[T.Bournemouth],-0.2891,0.179,-1.612,0.107,-0.641,0.062
team[T.Burnley],-0.6458,0.200,-3.230,0.001,-1.038,-0.254
team[T.Chelsea],0.0789,0.162,0.488,0.626,-0.238,0.396
team[T.Crystal Palace],-0.3865,0.183,-2.107,0.035,-0.746,-0.027
team[T.Everton],-0.2008,0.173,-1.161,0.246,-0.540,0.138
team[T.Hull],-0.7006,0.204,-3.441,0.001,-1.100,-0.302
team[T.Leicester],-0.4204,0.187,-2.249,0.025,-0.787,-0.054
team[T.Liverpool],0.0162,0.164,0.099,0.921,-0.306,0.338


In [10]:
# To get access to the parameters
poisson_model.params[0:5]

Intercept                 0.372459
team[T.Bournemouth]      -0.289149
team[T.Burnley]          -0.645833
team[T.Chelsea]           0.078899
team[T.Crystal Palace]   -0.386515
dtype: float64

In [11]:
fitted_vals = poisson_model.fittedvalues
print(fitted_vals[0:40])
print(type(fitted_vals))
print(fitted_vals.shape)
print(fitted_vals.index.value_counts())

0     1.579602
1     1.462040
2     0.928233
3     1.357518
4     2.862819
5     0.893366
6     1.556214
7     1.881268
8     0.959127
9     2.994550
10    1.430381
11    0.986209
12    1.282752
13    0.962615
14    1.922467
15    2.861258
16    0.793632
17    1.070915
18    0.841148
19    1.822406
20    2.494122
21    2.001348
22    2.004100
23    0.635613
24    1.979069
25    1.583482
26    1.984051
27    1.075203
28    2.800066
29    1.261497
30    2.044550
31    1.611553
32    1.684412
33    2.780058
34    1.237189
35    0.989379
36    0.617632
37    1.720320
38    0.862519
39    0.714071
dtype: float64
<class 'pandas.core.series.Series'>
(740,)
369    2
126    2
117    2
118    2
119    2
120    2
121    2
122    2
123    2
124    2
125    2
127    2
92     2
128    2
129    2
130    2
131    2
132    2
133    2
134    2
135    2
136    2
116    2
115    2
114    2
113    2
94     2
95     2
96     2
97     2
      ..
269    2
270    2
271    2
272    2
273    2
274    2
255    2


In [12]:
# So fitted values returns the expected goals for each game: the first 370 rows relate to the home team,
# and the second 370 rows relate to the away team
# In effect this is transforming the actual scores into expected goals for each team in each game
# Maybe this is better suited to being a transform class that adds 2 columns to the dataframe
fitted_vals.shape

(740,)

In [13]:
# Arsenal not shown due to dummy variable scheme, so that we don't have an overparameterized model
poisson_model.params.index.values

array(['Intercept', 'team[T.Bournemouth]', 'team[T.Burnley]',
       'team[T.Chelsea]', 'team[T.Crystal Palace]', 'team[T.Everton]',
       'team[T.Hull]', 'team[T.Leicester]', 'team[T.Liverpool]',
       'team[T.Man City]', 'team[T.Man United]', 'team[T.Middlesbrough]',
       'team[T.Southampton]', 'team[T.Stoke]', 'team[T.Sunderland]',
       'team[T.Swansea]', 'team[T.Tottenham]', 'team[T.Watford]',
       'team[T.West Brom]', 'team[T.West Ham]', 'opponent[T.Bournemouth]',
       'opponent[T.Burnley]', 'opponent[T.Chelsea]',
       'opponent[T.Crystal Palace]', 'opponent[T.Everton]',
       'opponent[T.Hull]', 'opponent[T.Leicester]',
       'opponent[T.Liverpool]', 'opponent[T.Man City]',
       'opponent[T.Man United]', 'opponent[T.Middlesbrough]',
       'opponent[T.Southampton]', 'opponent[T.Stoke]',
       'opponent[T.Sunderland]', 'opponent[T.Swansea]',
       'opponent[T.Tottenham]', 'opponent[T.Watford]',
       'opponent[T.West Brom]', 'opponent[T.West Ham]', 'home'],
    

### Make Expected Goals Predictions

In [14]:
# Create some empty columns to hold the model output
df_test = df_test.copy(deep=True)
df_test.loc[:, 'exp_HomeGoals'] = np.nan
df_test.loc[:, 'exp_AwayGoals'] = np.nan
df_test

Unnamed: 0,HomeTeam,AwayTeam,HomeGoals,AwayGoals,exp_HomeGoals,exp_AwayGoals
370,Arsenal,Everton,3,1,,
371,Burnley,West Ham,1,2,,
372,Chelsea,Sunderland,5,1,,
373,Hull,Tottenham,1,7,,
374,Leicester,Bournemouth,1,1,,
375,Liverpool,Middlesbrough,3,0,,
376,Man United,Crystal Palace,2,0,,
377,Southampton,Stoke,0,1,,
378,Swansea,West Brom,2,1,,
379,Watford,Man City,0,5,,


#### Format for prediction model

In [15]:
# Same long-format reshape as for training, but without a goals column:
# one row per team per match, with a 0/1 `home` flag.
print('Original shape: ', df_test.shape, '\n')

home_side = (df_test[['HomeTeam', 'AwayTeam']]
             .rename(columns={'HomeTeam':'team', 'AwayTeam':'opponent'})
             .assign(home=1))
away_side = (df_test[['AwayTeam', 'HomeTeam']]
             .rename(columns={'AwayTeam':'team', 'HomeTeam':'opponent'})
             .assign(home=0))
# Home rows first, then away rows; both halves keep the original match index
test_goal_model_data = pd.concat([home_side, away_side])

print('Head \n', test_goal_model_data.head(25), '\n')
print('Formatted Shape: ', test_goal_model_data.shape)

Original shape:  (10, 6) 

Head 
                team        opponent  home
370         Arsenal         Everton     1
371         Burnley        West Ham     1
372         Chelsea      Sunderland     1
373            Hull       Tottenham     1
374       Leicester     Bournemouth     1
375       Liverpool   Middlesbrough     1
376      Man United  Crystal Palace     1
377     Southampton           Stoke     1
378         Swansea       West Brom     1
379         Watford        Man City     1
370         Everton         Arsenal     0
371        West Ham         Burnley     0
372      Sunderland         Chelsea     0
373       Tottenham            Hull     0
374     Bournemouth       Leicester     0
375   Middlesbrough       Liverpool     0
376  Crystal Palace      Man United     0
377           Stoke     Southampton     0
378       West Brom         Swansea     0
379        Man City         Watford     0 

Formatted Shape:  (20, 3)


In [16]:
preds = poisson_model.predict(test_goal_model_data)
preds

370    1.868565
371    1.450723
372    3.061662
373    0.563119
374    1.934709
375    2.237112
376    1.898056
377    1.371017
378    1.287506
379    0.973557
370    1.187284
371    1.059643
372    0.409373
373    2.518130
374    1.522212
375    0.509805
376    0.646601
377    0.827016
378    1.283282
379    2.090720
dtype: float64

In [17]:
# Put preds back into test DataFrame
preds = poisson_model.predict(test_goal_model_data)
test_goal_model_data['expected_Goals'] = preds
test_goal_model_data

Unnamed: 0,team,opponent,home,expected_Goals
370,Arsenal,Everton,1,1.868565
371,Burnley,West Ham,1,1.450723
372,Chelsea,Sunderland,1,3.061662
373,Hull,Tottenham,1,0.563119
374,Leicester,Bournemouth,1,1.934709
375,Liverpool,Middlesbrough,1,2.237112
376,Man United,Crystal Palace,1,1.898056
377,Southampton,Stoke,1,1.371017
378,Swansea,West Brom,1,1.287506
379,Watford,Man City,1,0.973557


In [18]:
# Get back closer to our original format
df_test['exp_HomeGoals'] = test_goal_model_data[test_goal_model_data['home'] == 1]['expected_Goals']
df_test['exp_AwayGoals'] = test_goal_model_data[test_goal_model_data['home'] == 0]['expected_Goals']
df_test

Unnamed: 0,HomeTeam,AwayTeam,HomeGoals,AwayGoals,exp_HomeGoals,exp_AwayGoals
370,Arsenal,Everton,3,1,1.868565,1.187284
371,Burnley,West Ham,1,2,1.450723,1.059643
372,Chelsea,Sunderland,5,1,3.061662,0.409373
373,Hull,Tottenham,1,7,0.563119,2.51813
374,Leicester,Bournemouth,1,1,1.934709,1.522212
375,Liverpool,Middlesbrough,3,0,2.237112,0.509805
376,Man United,Crystal Palace,2,0,1.898056,0.646601
377,Southampton,Stoke,0,1,1.371017,0.827016
378,Swansea,West Brom,2,1,1.287506,1.283282
379,Watford,Man City,0,5,0.973557,2.09072


### Simulate Game

In [19]:
def simulate_game(home_goals_ev, away_goals_ev, max_goals=10):
    """
    Build the joint score-line probability table for a single match.

    Each side's goal count is treated as an independent Poisson variable,
    so the probability of a score line is the product of the two marginal
    probabilities.

    Parameters
    ----------
    home_goals_ev : expected number of goals for the home side
    away_goals_ev : expected number of goals for the away side
    max_goals : largest goal count included per side (counts above it are dropped)

    Returns
    -------
    pandas.DataFrame
        Entry [i, j] is the probability that the home side scores i goals
        and the away side scores j goals, for i, j in 0..max_goals.
    """
    goal_counts = range(0, max_goals + 1)
    # Marginal probability of each goal count, one vector per side
    home_pmf = np.array([poisson.pmf(n_goals, home_goals_ev) for n_goals in goal_counts])
    away_pmf = np.array([poisson.pmf(n_goals, away_goals_ev) for n_goals in goal_counts])
    # Outer product: rows index home goals, columns index away goals
    return pd.DataFrame(np.outer(home_pmf, away_pmf))

In [20]:
home_goals_ev = df_test[df_test['HomeTeam'] =='Chelsea']['exp_HomeGoals']
away_goals_ev = df_test[df_test['AwayTeam'] =='Sunderland']['exp_AwayGoals']
print(home_goals_ev)
print(away_goals_ev)

372    3.061662
Name: exp_HomeGoals, dtype: float64
372    0.409373
Name: exp_AwayGoals, dtype: float64


In [21]:
goals_matrix= simulate_game(home_goals_ev, away_goals_ev)
goals_matrix

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,0.031085,0.012725,0.002605,0.000355,3.637587e-05,2.978258e-06,2.03203e-07,1.188368e-08,6.08107e-10,2.766027e-11,1.132336e-12
1,0.095171,0.038961,0.007975,0.001088,0.0001113706,9.118421e-06,6.221389e-07,3.638382e-08,1.861818e-09,8.468641e-11,3.466831e-12
2,0.145691,0.059642,0.012208,0.001666,0.0001704896,1.395876e-05,9.523894e-07,5.569747e-08,2.850129e-09,1.296406e-10,5.307132e-12
3,0.148686,0.060868,0.012459,0.0017,0.0001739938,1.424567e-05,9.719648e-07,5.684228e-08,2.90871e-09,1.323052e-10,5.416215e-12
4,0.113806,0.046589,0.009536,0.001301,0.0001331776,1.090386e-05,7.439569e-07,4.350796e-08,2.226372e-09,1.012685e-10,4.145655e-12
5,0.069687,0.028528,0.005839,0.000797,8.154894e-05,6.676784e-06,4.555489e-07,2.664133e-08,1.36328e-09,6.200995e-11,2.538519e-12
6,0.03556,0.014557,0.00298,0.000407,4.161255e-05,3.407009e-06,2.324561e-07,1.359446e-08,6.956502e-10,3.164225e-11,1.295348e-12
7,0.015553,0.006367,0.001303,0.000178,1.820051e-05,1.490159e-06,1.016717e-07,5.945948e-09,3.042637e-10,1.38397e-11,5.665595e-13
8,0.005952,0.002437,0.000499,6.8e-05,6.965475e-06,5.702952e-07,3.891056e-08,2.27556e-09,1.164441e-10,5.296559e-12,2.168267e-13
9,0.002025,0.000829,0.00017,2.3e-05,2.369548e-06,1.940057e-07,1.323677e-08,7.741107e-10,3.961248e-11,1.801808e-12,7.376112e-14


In [22]:
# probability home win
p_hwin = np.sum(np.tril(goals_matrix, -1))
p_hwin

0.8885986612364137

In [23]:
# probability draw
p_draw = np.sum(np.diag(goals_matrix))
p_draw

0.08409349268649573

In [24]:
# probability away win
p_awin = np.sum(np.triu(goals_matrix, 1))
p_awin

0.026961819942852995

In [25]:
def get_result_probs(some_row, max_goals=10):
    """
    Turn a pair of expected goal counts into match-result probabilities.

    Both goal counts are modelled as independent Poisson variables and the
    joint score-line table is truncated at ``max_goals`` goals per side, so
    the three probabilities sum to slightly less than 1.

    Parameters
    ----------
    some_row : sequence, numpy array or pandas Series
        First element is the home side's expected goals, second element is
        the away side's expected goals. Only position matters; Series
        labels are ignored.
    max_goals : int, default=10
        Largest goal count considered for each side.

    Returns
    -------
    tuple of float
        (p_hwin, p_draw, p_awin)
    """
    # Read the two values by position. Indexing a label-indexed Series with
    # [0] / [1] relies on a positional fallback that newer pandas removes.
    home_ev, away_ev = np.asarray(some_row, dtype=float)[:2]
    goal_counts = np.arange(max_goals + 1)
    # Rows index home goals, columns index away goals
    goals_matrix = np.outer(poisson.pmf(goal_counts, home_ev),
                            poisson.pmf(goal_counts, away_ev))
    p_hwin = np.sum(np.tril(goals_matrix, -1))  # below the diagonal: home > away
    p_draw = np.sum(np.diag(goals_matrix))      # on the diagonal: equal scores
    p_awin = np.sum(np.triu(goals_matrix, 1))   # above the diagonal: away > home
    return p_hwin, p_draw, p_awin

df_test['p_hwin'], df_test['p_draw'], df_test['p_awin'] = \
            zip(*df_test[['exp_HomeGoals', 'exp_AwayGoals']].apply(get_result_probs, axis=1))

df_test

Unnamed: 0,HomeTeam,AwayTeam,HomeGoals,AwayGoals,exp_HomeGoals,exp_AwayGoals,p_hwin,p_draw,p_awin
370,Arsenal,Everton,3,1,1.868565,1.187284,0.533001,0.225918,0.241077
371,Burnley,West Ham,1,2,1.450723,1.059643,0.460841,0.263152,0.276006
372,Chelsea,Sunderland,5,1,3.061662,0.409373,0.888599,0.084093,0.026962
373,Hull,Tottenham,1,7,0.563119,2.51813,0.062653,0.138042,0.799239
374,Leicester,Bournemouth,1,1,1.934709,1.522212,0.474692,0.219747,0.305554
375,Liverpool,Middlesbrough,3,0,2.237112,0.509805,0.769684,0.160951,0.069341
376,Man United,Crystal Palace,2,0,1.898056,0.646601,0.672432,0.208757,0.118805
377,Southampton,Stoke,0,1,1.371017,0.827016,0.496391,0.277415,0.226194
378,Swansea,West Brom,2,1,1.287506,1.283282,0.368164,0.265662,0.366175
379,Watford,Man City,0,5,0.973557,2.09072,0.166696,0.202521,0.63077


## Define Data Flow in Terms of scikit-learn Classes

In [26]:
df_all = pd.read_csv(season_fp)
df = df_all[['HomeTeam','AwayTeam','FTHG','FTAG']]
df = df.rename(columns={'FTHG': 'HomeGoals', 'FTAG': 'AwayGoals'})
#df['Home'] = 1
#df_away = df['Away'] =0
# Use the first 370 matches for training
df_train = df[:-10]
# Split off the last 10 matches as a test set
df_test = df[-10:]
df_train.head()

# "goals ~ home + team + opponent"

Unnamed: 0,HomeTeam,AwayTeam,HomeGoals,AwayGoals
0,Burnley,Swansea,0,1
1,Crystal Palace,West Brom,0,1
2,Everton,Tottenham,1,1
3,Hull,Leicester,2,1
4,Man City,Sunderland,2,1


# Convert to Estimator

In [27]:
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.utils.validation import check_is_fitted

class GLMExpVals(BaseEstimator, ClassifierMixin):
    """Fit a statsmodels formula GLM to match data and score matches.

    Every match row is reshaped into two model rows (one per side), a GLM is
    fitted on that long-format frame, and ``predict`` appends the expected
    value for each side together with home-win / draw / away-win
    probabilities.

    Parameters
    ----------
    model_spec : str, default="target ~ home + team + opponent"
        Formula handed to ``smf.glm``. It can refer to the columns produced
        by :meth:`shape_for_regression`: ``team``, ``opponent``, ``target``
        and ``home``.
    distr_family : statsmodels family instance, default=sm.families.Poisson()
        Passed to ``smf.glm`` as ``family``. The result probabilities in
        :meth:`get_result_probs` always use a Poisson pmf, whatever family
        is given here.

    Attributes
    ----------
    n_features_ : int
        The number of columns of the data passed to :meth:`fit`.
    model : object
        The GLM built by ``smf.glm`` in :meth:`fit`.
    fitted_model : object
        The value returned by ``model.fit()``; used by :meth:`predict`.
    """
    def __init__(self, model_spec="target ~ home + team + opponent",
                 distr_family=sm.families.Poisson()):
        self.model_spec = model_spec
        self.distr_family = distr_family

    def shape_for_regression(self, X):
        """Stack every match into two rows of (team, opponent, target, home).

        Parameters
        ----------
        X : pandas.DataFrame with exactly 4 columns
            Columns are read by position:
            home team | away team | home value | away value

        Returns
        -------
        pandas.DataFrame
            ``2 * len(X)`` rows with a fresh 0..2n-1 index and float
            ``target`` / ``home`` columns.
            The first ``len(X)`` rows have ``home == 1.0`` with
            team = column 0, opponent = column 1, target = column 3.
            The last ``len(X)`` rows have ``home == 0.0`` with
            team = column 1, opponent = column 0, target = column 2.

        Raises
        ------
        ValueError
            If ``X`` does not have exactly 4 columns.

        Notes
        -----
        With the column layout above, ``target`` on every row is the value
        recorded for the side named in ``opponent`` (the ``home == 1.0`` rows
        carry the away side's value). :meth:`predict` relies on exactly this
        orientation when it maps model rows back to home/away columns.
        """
        if X.shape[1] != 4:
            # Any other width cannot be mapped onto the four model columns
            raise ValueError('Expected exactly 4 columns '
                             '(home team, away team, home value, away value), '
                             'got %d' % X.shape[1])
        # Converts any integers to floats - required for statsmodels
        X = X.astype(float, errors='ignore')
        n_rows = len(X)
        # upper - home team, away team, away value, plus a column of ones
        upper = np.c_[X.iloc[:, [0, 1, 3]].values, np.ones(n_rows)]
        # lower - away team, home team, home value, plus a column of zeros
        lower = np.c_[X.iloc[:, [1, 0, 2]].values, np.zeros(n_rows)]
        # Form back into a DataFrame
        model_df = pd.DataFrame(np.vstack([upper, lower]),
                                columns=['team', 'opponent', 'target', 'home'])
        # The stacked array can be of object dtype, so restore numeric columns
        model_df['target'] = model_df['target'].astype(float)
        model_df['home'] = model_df['home'].astype(float)
        return model_df

    def fit(self, X, y=None):
        """
        Reshapes the data into a form suitable for a GLM
        Instantiates the model
        Fits the model
        Parameters
        ----------
        X : pandas.DataFrame, shape (n_samples, 4)
            The training input samples.
            4 Columns - must be shaped like this:
            HomeTeam | AwayTeam | HomeGoals | AwayGoals |
            or, with a different per-side quantity, e.g.
            home | away | home shots on target | away shots on target |
        y : None
            Ignored; the target is taken from the columns of ``X``. Present
            because the pipeline API requires this parameter.
        Returns
        -------
        self : object
            Returns self.
        Raises
        ------
        ValueError
            If ``X`` does not have exactly 4 columns.
        """
        reshaped_X = self.shape_for_regression(X)
        self.model = smf.glm(formula=self.model_spec,
                             data=reshaped_X,
                             family=self.distr_family)
        self.fitted_model = self.model.fit()
        # Recorded last so the estimator only counts as fitted once the
        # model has actually been fitted
        self.n_features_ = X.shape[1]
        return self

    def get_result_probs(self, some_row, max_goals=10):
        """Turn a pair of expected values into result probabilities.

        Both counts are modelled as independent Poisson variables and the
        joint table is truncated at ``max_goals`` per side.

        Parameters
        ----------
        some_row : sequence, numpy array or pandas Series
            First element is the home side's expected value, second element
            is the away side's. Only position matters.
        max_goals : int, default=10
            Largest count considered for each side.

        Returns
        -------
        tuple of float
            (p_hwin, p_draw, p_awin)
        """
        # Read by position; [0] / [1] on a label-indexed Series relies on a
        # positional fallback that newer pandas removes
        x, y = np.asarray(some_row, dtype=float)[:2]
        counts = np.arange(max_goals + 1)
        # Rows index the home count, columns index the away count
        goals_matrix = np.outer(poisson.pmf(counts, x), poisson.pmf(counts, y))
        p_hwin = np.sum(np.tril(goals_matrix, -1))
        p_draw = np.sum(np.diag(goals_matrix))
        p_awin = np.sum(np.triu(goals_matrix, 1))
        return p_hwin, p_draw, p_awin

    def predict(self, X):
        """
        Adds 5 columns to a copy of the original DataFrame:
        the expected value of the target for each side, followed by the
        home-win, draw and away-win probabilities.
        Reshape the data into a form suitable for a GLM
        Predict on the reshaped data
        Reshape the data back to the original shape
        Parameters
        ----------
        X : pandas.DataFrame, shape (n_samples, 4)
            The input samples, laid out as for :meth:`fit`. The two value
            columns must be present but are not passed to the model.
        Returns
        -------
        X_transformed : pandas.DataFrame, shape (n_samples, 9)
            A copy of ``X`` with the columns ``exp_home_target``,
            ``exp_away_target``, ``p_hwin``, ``p_draw`` and ``p_awin``
            appended.
        Raises
        ------
        ValueError
            If ``X`` has a different number of columns than was seen in
            :meth:`fit`.
        """
        # Check that fit has been called
        check_is_fitted(self, 'n_features_')

        # Check that the input is of the same shape as the one passed
        # during fit.
        if X.shape[1] != self.n_features_:
            raise ValueError('Shape of input is different from what was seen '
                             'in `fit`')

        # Drop the observed target so the model only sees predictors
        reshaped_X = self.shape_for_regression(X).drop(columns='target')
        reshaped_X['expected_target'] = self.fitted_model.predict(reshaped_X)

        X_transformed = X.copy(deep=True)
        # The home == 0.0 rows hold the home side's target (see
        # shape_for_regression), so they provide the home expectation
        is_home_flagged = reshaped_X['home'] == 1.0
        X_transformed['exp_home_target'] = \
            reshaped_X.loc[~is_home_flagged, 'expected_target'].values
        X_transformed['exp_away_target'] = \
            reshaped_X.loc[is_home_flagged, 'expected_target'].values

        # One (p_hwin, p_draw, p_awin) triple per match; the reshape keeps
        # the array two-dimensional even when there are no rows
        expected_pairs = X_transformed[['exp_home_target', 'exp_away_target']].to_numpy()
        probs = np.array([self.get_result_probs(pair) for pair in expected_pairs],
                         dtype=float).reshape(-1, 3)
        X_transformed['p_hwin'] = probs[:, 0]
        X_transformed['p_draw'] = probs[:, 1]
        X_transformed['p_awin'] = probs[:, 2]

        return X_transformed
      
# X_train = df_train.copy(deep=True)
# glm_transform = GLMExpValsTransform()
# glm_transform.fit(X)
# t = glm_transform.predict(X)
# print(t.head())

# X = df_test.copy(deep=True)
# glm_transform = GLMExpValsTransform()
# glm_transform.fit(X)
# t = glm_transform.predict(X)
# t.head()
#print(t.shape)

# Fit on the training matches, then score those same matches
X_train = df_train.copy(deep=True)
glm_transform = GLMExpVals().fit(X_train)
X_train_ = glm_transform.predict(X_train)
print(X_train_.head())

# Fit a fresh estimator on the training matches, then score the held-out matches
X_test = df_test.copy(deep=True)
glm_transform = GLMExpVals().fit(X_train)
X_test_ = glm_transform.predict(X_test)
X_test_.head(12)
#print(t.shape)

         HomeTeam    AwayTeam  HomeGoals  AwayGoals  exp_home_target  \
0         Burnley     Swansea          0          1         1.579602   
1  Crystal Palace   West Brom          0          1         1.462040   
2         Everton   Tottenham          1          1         0.928233   
3            Hull   Leicester          2          1         1.357518   
4        Man City  Sunderland          2          1         2.862819   

   exp_away_target    p_hwin    p_draw    p_awin  
0         1.024772  0.502330  0.251750  0.245919  
1         1.155441  0.441720  0.259232  0.299047  
2         1.464388  0.239051  0.264778  0.496170  
3         1.568222  0.330722  0.245004  0.424273  
4         0.502182  0.853335  0.105233  0.041234  


Unnamed: 0,HomeTeam,AwayTeam,HomeGoals,AwayGoals,exp_home_target,exp_away_target,p_hwin,p_draw,p_awin
370,Arsenal,Everton,3,1,1.868565,1.187284,0.533001,0.225918,0.241077
371,Burnley,West Ham,1,2,1.450723,1.059643,0.460841,0.263152,0.276006
372,Chelsea,Sunderland,5,1,3.061662,0.409373,0.888599,0.084093,0.026962
373,Hull,Tottenham,1,7,0.563119,2.51813,0.062653,0.138042,0.799239
374,Leicester,Bournemouth,1,1,1.934709,1.522212,0.474692,0.219747,0.305554
375,Liverpool,Middlesbrough,3,0,2.237112,0.509805,0.769684,0.160951,0.069341
376,Man United,Crystal Palace,2,0,1.898056,0.646601,0.672432,0.208757,0.118805
377,Southampton,Stoke,0,1,1.371017,0.827016,0.496391,0.277415,0.226194
378,Swansea,West Brom,2,1,1.287506,1.283282,0.368164,0.265662,0.366175
379,Watford,Man City,0,5,0.973557,2.09072,0.166696,0.202521,0.63077


In [28]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.utils.validation import check_is_fitted

class IdentityTransformer(BaseEstimator, TransformerMixin):
    """Pass-through pipeline step that hands its input back unchanged."""

    def __init__(self):
        """Nothing to configure: the transformer has no parameters."""

    def fit(self, input_array, y=None):
        """Learn nothing from ``input_array`` and return the transformer itself."""
        return self

    def transform(self, input_array, y=None):
        """Return ``input_array`` exactly as it was received."""
        return input_array

In [29]:
from sklearn.pipeline import Pipeline

pipeline = Pipeline([
        ('id', IdentityTransformer()),
        ('glm',GLMExpVals())
    ])

predicted = pipeline.fit(X_train).predict(X_train)
print(predicted.head())

predicted = pipeline.predict(X_test)
print(predicted.head())

         HomeTeam    AwayTeam  HomeGoals  AwayGoals  exp_home_target  \
0         Burnley     Swansea          0          1         1.579602   
1  Crystal Palace   West Brom          0          1         1.462040   
2         Everton   Tottenham          1          1         0.928233   
3            Hull   Leicester          2          1         1.357518   
4        Man City  Sunderland          2          1         2.862819   

   exp_away_target    p_hwin    p_draw    p_awin  
0         1.024772  0.502330  0.251750  0.245919  
1         1.155441  0.441720  0.259232  0.299047  
2         1.464388  0.239051  0.264778  0.496170  
3         1.568222  0.330722  0.245004  0.424273  
4         0.502182  0.853335  0.105233  0.041234  
      HomeTeam     AwayTeam  HomeGoals  AwayGoals  exp_home_target  \
370    Arsenal      Everton          3          1         1.868565   
371    Burnley     West Ham          1          2         1.450723   
372    Chelsea   Sunderland          5          1  

In [30]:
stop

NameError: name 'stop' is not defined

## Try chaining 2 estimators into Pipeline

In [None]:
# def get_result_probs(some_row, max_goals=10):
#     x = some_row[0]
#     y = some_row[1]
#     team_pred = [[poisson.pmf(i, team_avg) for i in range(0, max_goals+1)] for team_avg in [x, y]]
#     goals_matrix = np.outer(np.array(team_pred[0]), np.array(team_pred[1]))
#     p_hwin = np.sum(np.tril(goals_matrix, -1))
#     p_draw = np.sum(np.diag(goals_matrix))
#     p_awin = np.sum(np.triu(goals_matrix, 1))
#     return p_hwin, p_draw, p_awin

# df_test['p_hwin'], df_test['p_draw'], df_test['p_awin'] = \
#             zip(*df_test[['exp_home_target', 'exp_away_target']].apply(get_result_probs, axis=1))

# df_test



from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.utils.validation import check_is_fitted

class MatchSimulator(BaseEstimator, ClassifierMixin):
    """Convert expected goal counts into match-result probabilities.

    ``predict`` reads the ``exp_home_target`` and ``exp_away_target``
    columns of its input, treats the two counts as independent Poisson
    variables and appends home-win / draw / away-win probabilities.

    Parameters
    ----------
    max_goals : int, default=10
        Largest goal count considered for each side when building the joint
        score-line table.

    Attributes
    ----------
    n_features_ : int
        The number of columns of the data passed to :meth:`fit`.
    """
    def __init__(self, max_goals=10):
        self.max_goals = max_goals

    def fit(self, X, y=None):
        """
        Records the number of input columns; nothing is estimated.
        Parameters
        ----------
        X : pandas.DataFrame, shape (n_samples, n_features)
            The training input samples.
        y : None
            Ignored. Present because the pipeline API requires this
            parameter.
        Returns
        -------
        self : object
            Returns self.
        """
        self.n_features_ = X.shape[1]
        return self

    def _result_probs(self, some_row):
        """Return (p_hwin, p_draw, p_awin) for one (home, away) pair of expected counts."""
        # Read by position so pandas Series and plain arrays behave alike
        home_ev, away_ev = np.asarray(some_row, dtype=float)[:2]
        counts = np.arange(self.max_goals + 1)
        # Rows index home goals, columns index away goals
        goals_matrix = np.outer(poisson.pmf(counts, home_ev),
                                poisson.pmf(counts, away_ev))
        p_hwin = np.sum(np.tril(goals_matrix, -1))
        p_draw = np.sum(np.diag(goals_matrix))
        p_awin = np.sum(np.triu(goals_matrix, 1))
        return p_hwin, p_draw, p_awin

    def predict(self, X):
        """
        Adds 3 columns to a copy of the original DataFrame:
        the home-win, draw and away-win probabilities for each row.
        Parameters
        ----------
        X : pandas.DataFrame, shape (n_samples, n_features)
            The input samples. Must contain the columns ``exp_home_target``
            and ``exp_away_target``.
        Returns
        -------
        X_transformed : pandas.DataFrame
            A copy of ``X`` with ``p_hwin``, ``p_draw`` and ``p_awin`` set
            (added, or overwritten if already present).
        Raises
        ------
        ValueError
            If ``X`` has a different number of columns than was seen in
            :meth:`fit`.
        """
        # Check that fit has been called
        check_is_fitted(self, 'n_features_')

        # Check that the input is of the same shape as the one passed
        # during fit.
        if X.shape[1] != self.n_features_:
            raise ValueError('Shape of input is different from what was seen '
                             'in `fit`')

        expected_pairs = X[['exp_home_target', 'exp_away_target']].to_numpy()
        # The reshape keeps the array two-dimensional even when there are no rows
        probs = np.array([self._result_probs(pair) for pair in expected_pairs],
                         dtype=float).reshape(-1, 3)

        X_transformed = X.copy(deep=True)
        X_transformed['p_hwin'] = probs[:, 0]
        X_transformed['p_draw'] = probs[:, 1]
        X_transformed['p_awin'] = probs[:, 2]

        return X_transformed
      
# X_train = df_train.copy(deep=True)
# glm_transform = GLMExpValsTransform()
# glm_transform.fit(X)
# t = glm_transform.predict(X)
# print(t.head())

# X = df_test.copy(deep=True)
# glm_transform = GLMExpValsTransform()
# glm_transform.fit(X)
# t = glm_transform.predict(X)
# t.head()
#print(t.shape)
# Show the first rows of the training frame, then continue with a deep copy of it.
print(X_train_.head())
X_train_ = X_train_.copy(deep=True)
# Fit a simulator on the training frame and predict on that same frame.
match_sim = MatchSimulator()
match_sim.fit(X_train_)
s = match_sim.predict(X_train_)
print(s.head())

# New simulator instance: fitted on the training frame, applied to the test frame.
X_test_ = X_test_.copy(deep=True)
match_sim = MatchSimulator()
match_sim.fit(X_train_)
u = match_sim.predict(X_test_)
u.head(12)
#print(t.shape)

In [None]:
from sklearn.pipeline import Pipeline


# Deep copies of the train/test frames are used as the pipeline inputs.
X_train = df_train.copy(deep=True)
X_test = df_test.copy(deep=True)

# Three chained steps, listed in execution order: 'id' -> 'glm' -> 'sim'.
pipeline = Pipeline([
        ('id', IdentityTransformer()),
        ('glm',GLMExpVals()),
        ('sim', MatchSimulator())
    ])

# Fit on the training frame and predict on that same frame ...
predicted = pipeline.fit(X_train).predict(X_train)

# ... then overwrite `predicted` with the predictions for the test frame.
predicted = pipeline.predict(X_test)
predicted

In [None]:
# NOTE(review): `stop` is not defined in the visible code, so this cell presumably
# raises NameError — likely a deliberate guard to halt "Run All" here; confirm.
stop

In [None]:
# from sklearn.base import BaseEstimator, TransformerMixin
# from sklearn.utils.validation import check_is_fitted

#  def __init__(self, model_spec = "target ~ home + team + opponent",
#                     distr_family = sm.families.Poisson()):
#         self.model_spec = model_spec
#         self.distr_family = distr_family
#     def fit(self, X, y):
#         self.fitted_model = smf.glm(formula=self.model_spec,
#                                     data = X,
#                                     family=self.distr_family)
#         #if self.fit_intercept:
#         #    X = sm.add_constant(X)
#         #self.model_ = self.model_class(y, X)
#         #self.results_ = self.model_.fit()
#     def predict(self, X):
#         fitted_vals = self.fitted_model.predict(X)
#         print(fitted_vals)
#         self.y = np.hstack([fitted_vals[0:len(fitted_vals)/2], fitted_vals[len(fitted_vals)/2 +1:]])
#         return self.y
    
    
class GLMModelTransformer(TransformerMixin):
    """Transformer that fits a formula-based statsmodels GLM and appends its
    predictions to the data as a new column.

    Parameters
    ----------
    model_spec : str, default="target~ home + team + opponent"
        Formula handed to ``smf.glm``.
    distr_family : statsmodels family instance, default=sm.families.Poisson()
        Distribution family handed to ``smf.glm``.

    Attributes
    ----------
    fitted_model : statsmodels results object
        The fitted GLM, set by :meth:`fit` and used by :meth:`transform`.
    """
    def __init__(self, model_spec="target~ home + team + opponent",
                 distr_family=sm.families.Poisson()):
        self.model_spec = model_spec
        self.distr_family = distr_family

    def fit(self, X, y=None):
        """Build the GLM described by ``model_spec`` and fit it on ``X``.

        Parameters
        ----------
        X : pandas.DataFrame
            Training data. It must contain every column named in
            ``model_spec`` (the response is read from ``X`` through the
            formula).
        y : None
            Unused; accepted because the pipeline API passes it.

        Returns
        -------
        self : object
            Returns self.
        """
        # ``smf.glm`` only specifies the model; ``.fit()`` estimates the
        # coefficients. Without it ``transform`` would be predicting from an
        # unfitted model.
        self.fitted_model = smf.glm(formula=self.model_spec,
                                    data=X,
                                    family=self.distr_family).fit()
        return self

    def transform(self, X):
        """Append the fitted model's predictions to a copy of ``X``.

        Parameters
        ----------
        X : pandas.DataFrame
            Data containing the predictor columns named in ``model_spec``.

        Returns
        -------
        X_transformed : pandas.DataFrame
            Copy of ``X`` with an added ``expected_Goals`` column holding the
            model predictions. ``X`` itself is left unmodified.
        """
        # Work on a copy so the caller's frame is not changed as a side effect.
        X_transformed = X.copy()
        X_transformed['expected_Goals'] = self.fitted_model.predict(X)
        return X_transformed

# Inspect the first rows and the shape of model_df before fitting.
print(model_df.head())
print(model_df.shape)

glm_transformer = GLMModelTransformer() 
# fit() is called with y=None; whatever it returns is kept in `t` and printed.
t = glm_transformer.fit(model_df, y=None)
print(t)
# Apply the transformer to the same frame it was fitted on.
X_bar = glm_transformer.transform(model_df)

In [None]:
# NOTE(review): `stop` is not defined in the visible code, so this cell presumably
# raises NameError — likely a deliberate guard to halt "Run All" here; confirm.
stop

In [None]:
# https://stackoverflow.com/questions/41045752/using-statsmodel-estimations-with-scikit-learn-cross-validation-is-it-possible
    
    
import statsmodels.api as sm
from sklearn.base import BaseEstimator, ClassifierMixin

class SMGLMWrapper(BaseEstimator, ClassifierMixin):
    """sklearn-style wrapper around a formula-based statsmodels GLM.

    Parameters
    ----------
    model_spec : str, default="target ~ home + team + opponent"
        Formula handed to ``smf.glm``.
    distr_family : statsmodels family instance, default=sm.families.Poisson()
        Distribution family handed to ``smf.glm``.

    Attributes
    ----------
    fitted_model : statsmodels results object
        The fitted GLM, set by :meth:`fit`.
    y : ndarray
        Predictions from the most recent :meth:`predict` call.

    Notes
    -----
    NOTE(review): the class mixes in ``ClassifierMixin``, so the inherited
    ``score`` is a classification metric; for count predictions a regressor
    mixin is presumably what was intended — confirm before relying on
    ``score``.
    """
    def __init__(self, model_spec = "target ~ home + team + opponent",
                    distr_family = sm.families.Poisson()):
        self.model_spec = model_spec
        self.distr_family = distr_family

    def fit(self, X, y=None):
        """Build the GLM described by ``model_spec`` and fit it on ``X``.

        Parameters
        ----------
        X : pandas.DataFrame
            Training data containing every column named in ``model_spec``.
        y : None
            Unused; the response is read from ``X`` through the formula.

        Returns
        -------
        self : object
            Returns self.
        """
        # ``smf.glm`` only specifies the model; ``.fit()`` estimates it.
        self.fitted_model = smf.glm(formula=self.model_spec,
                                    data=X,
                                    family=self.distr_family).fit()
        return self

    def predict(self, X):
        """Return the fitted model's predictions for ``X`` as a 1-D array.

        Parameters
        ----------
        X : pandas.DataFrame
            Data containing the predictor columns named in ``model_spec``.

        Returns
        -------
        y : ndarray
            First half of the predictions followed by the second half, with
            every prediction kept.
        """
        fitted_vals = np.asarray(self.fitted_model.predict(X))
        # Integer division: a float index is not a valid slice bound on
        # Python 3. The second half starts exactly at ``half`` so that no
        # prediction is skipped.
        half = len(fitted_vals) // 2
        self.y = np.hstack([fitted_vals[:half], fitted_vals[half:]])
        return self.y

    
# Fit the wrapper on model_df (y is passed as None) and predict on the same frame.
glm = SMGLMWrapper()
glm.fit(model_df, y=None)
y = glm.predict(model_df)
y
        #if self.fit_intercept:
        #    X = sm.add_constant(X)
        #return self.results_.predict(X)
    
# fitted_vals = poisson_model.fittedvalues

# poisson_model = poisson_model_spec.fit()
# from sklearn.datasets import make_regression
# from sklearn.model_selection import cross_val_score
# from sklearn.linear_model import LinearRegression

# X, y = make_regression(random_state=1, n_samples=300, noise=100)

# print(cross_val_score(SMWrapper(sm.OLS), X, y, scoring='r2'))
# print(cross_val_score(LinearRegression(), X, y, scoring='r2'))
# poisson_model_spec = smf.glm(formula="goals ~ home + team + opponent",
#                              data=goal_model_data, family=sm.families.Poisson())

In [None]:
# NOTE(review): `stop` is not defined in the visible code, so this cell presumably
# raises NameError — likely a deliberate guard to halt "Run All" here; confirm.
stop

In [None]:
from patsy import dmatrices
import statsmodels.api as sm

#formula = """bev_quant ~ bpm  + n_events + precip + relhum + temp + C(is_happy_hr)
#                + C(is_mon_pubhol) + C(is_pre_xmas) + C(is_pubhol) + C(is_pubhol_eve)"""
# Formula: HomeGoals as the response, HomeTeam and AwayTeam as predictors
# (the HomeTeam:AwayTeam interaction term is commented out).
model_spec="HomeGoals ~ HomeTeam + AwayTeam"#  + HomeTeam:AwayTeam"
# Build the response and predictor design matrices from df_train as DataFrames.
response, predictors = dmatrices(model_spec, df_train, return_type='dataframe')
predictors.head()

In [None]:
# https://dius.com.au/2017/08/03/using-statsmodels-glms-to-model-beverage-consumption/
# Above also includes negative binomial regression for future

from patsy import dmatrices
import statsmodels.api as sm

# NOTE(review): `data` is not defined in the visible code and the column names in
# this formula match the beverage-consumption article linked above — presumably a
# pasted reference example that is not runnable in this notebook; confirm.
formula = """bev_quant ~ bpm  + n_events + precip + relhum + temp + C(is_happy_hr)
                + C(is_mon_pubhol) + C(is_pre_xmas) + C(is_pubhol) + C(is_pubhol_eve)"""
response, predictors = dmatrices(formula, data, return_type='dataframe')
# Fit a Poisson GLM on the design matrices and print its summary.
po_results = sm.GLM(response, predictors, family=sm.families.Poisson()).fit()
print(po_results.summary())

In [None]:
# https://stackoverflow.com/questions/41045752/using-statsmodel-estimations-with-scikit-learn-cross-validation-is-it-possible
    
    
import statsmodels.api as sm
from sklearn.base import BaseEstimator, RegressorMixin

class SMWrapper(BaseEstimator, RegressorMixin):
    """A universal sklearn-style wrapper for statsmodels regressors.

    Parameters
    ----------
    model_class : callable
        statsmodels model class, called as ``model_class(y, X)``
        (e.g. ``sm.OLS``).
    fit_intercept : bool, default=True
        When true, a constant column is added to ``X`` in both :meth:`fit`
        and :meth:`predict`.

    Attributes
    ----------
    model_ : object
        The instantiated statsmodels model, set by :meth:`fit`.
    results_ : object
        The result of ``model_.fit()``, set by :meth:`fit`.
    """
    def __init__(self, model_class, fit_intercept=True):
        self.model_class = model_class
        self.fit_intercept = fit_intercept

    def fit(self, X, y):
        """Instantiate ``model_class`` on ``(y, X)`` and fit it.

        Returns
        -------
        self : object
            Returns self, as the sklearn estimator API expects.
        """
        if self.fit_intercept:
            # has_constant='add' always inserts the intercept column. The
            # default ('skip') omits it whenever X already holds a
            # constant-valued column, which would make the column count
            # depend on the data.
            X = sm.add_constant(X, has_constant='add')
        self.model_ = self.model_class(y, X)
        self.results_ = self.model_.fit()
        return self

    def predict(self, X):
        """Return the fitted statsmodels results' predictions for ``X``."""
        if self.fit_intercept:
            # Same setting as in fit so the design matrix keeps the same
            # number of columns, even when X has a single row or an
            # all-constant column.
            X = sm.add_constant(X, has_constant='add')
        return self.results_.predict(X)


from sklearn.datasets import make_regression
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression

# Synthetic regression problem: 300 samples, noise=100, fixed random_state.
X, y = make_regression(random_state=1, n_samples=300, noise=100)

# Print cross-validated r2 scores for the statsmodels OLS wrapper, then for
# sklearn's LinearRegression on the same data.
print(cross_val_score(SMWrapper(sm.OLS), X, y, scoring='r2'))
print(cross_val_score(LinearRegression(), X, y, scoring='r2'))

In [None]:
# Transformer Template
from sklearn.base import TransformerMixin

class DumbFeaturizer(TransformerMixin):
    """Template transformer that maps every sample to the single feature ``[1]``."""

    def __init__(self):
        """Nothing to configure."""

    def fit(self, X, y=None):
        """Learn nothing; return the transformer itself."""
        return self

    def transform(self, X):
        """Return one ``[1]`` row per item obtained by iterating over ``X``."""
        constant_rows = []
        for _sample in X:
            constant_rows.append([1])
        return constant_rows

In [None]:
# Estimator Template
from sklearn.base import BaseEstimator, ClassifierMixin

class MeanClassifier(BaseEstimator, ClassifierMixin):
    """Example classifier: a value is labelled True when it is at least
    ``mean(X) + intValue``, where the mean is taken over the data seen in
    :meth:`fit`.

    Parameters
    ----------
    intValue : int, default=0
        Offset added to the training mean to form the threshold.
    stringParam : str, default="defaultValue"
        Only type-checked in :meth:`fit`; not otherwise used.
    otherParam : object, default=None
        Stored on the instance; not otherwise used.

    Attributes
    ----------
    treshold_ : number
        Decision threshold computed by :meth:`fit`.
    """

    def __init__(self, intValue=0, stringParam="defaultValue", otherParam=None):
        """
        Called when initializing the classifier
        """
        self.intValue = intValue
        self.stringParam = stringParam

        # Constructor parameters must be stored under their own name:
        # get_params() / clone() look attributes up by parameter name.
        self.otherParam = otherParam
        # Alias kept so code reading the previously used attribute name
        # continues to work.
        self.differentParam = otherParam

    def fit(self, X, y=None):
        """
        Check the parameters and compute the threshold ``mean(X) + intValue``.

        Note: assert is not a good choice here (it is stripped under ``-O``);
        the assertions are kept so the exception type seen by existing
        callers does not change.

        Returns
        -------
        self : object
            Returns self.
        """

        assert (type(self.intValue) == int), "intValue parameter must be integer"
        assert (type(self.stringParam) == str), "stringValue parameter must be string"
        # NOTE(review): this requires exactly 20 values although the message
        # talks about numerical values — presumably tutorial-specific; confirm.
        assert (len(X) == 20), "X must be list with numerical values."

        self.treshold_ = (sum(X)/len(X)) + self.intValue  # mean + intValue

        return self

    def _meaning(self, x):
        """Return True when ``x`` reaches the fitted threshold, else False."""
        return bool(x >= self.treshold_)

    def predict(self, X, y=None):
        """Return a list with one True/False label per value in ``X``.

        Raises
        ------
        RuntimeError
            If :meth:`fit` has not set ``treshold_`` yet.
        """
        try:
            self.treshold_
        except AttributeError:
            raise RuntimeError("You must train classifer before predicting data!")

        return [self._meaning(x) for x in X]

    def score(self, X, y=None):
        """Return how many values in ``X`` are labelled True by :meth:`predict`."""
        return sum(self.predict(X))

In [None]:
# From sklearn docs
"""
This is a module to be used as a reference for building other modules
"""
import numpy as np
from sklearn.base import BaseEstimator, ClassifierMixin, TransformerMixin
from sklearn.utils.validation import check_X_y, check_array, check_is_fitted
from sklearn.utils.multiclass import unique_labels
from sklearn.metrics import euclidean_distances


class TemplateEstimator(BaseEstimator):
    """Minimal estimator skeleton whose prediction for every sample is one.

    Parameters
    ----------
    demo_param : str, default='demo_param'
        Placeholder showing how a constructor parameter is passed and stored.
    """
    def __init__(self, demo_param='demo_param'):
        self.demo_param = demo_param

    def fit(self, X, y):
        """Validate the training data and mark the estimator as fitted.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape (n_samples, n_features)
            Training samples.
        y : array-like, shape (n_samples,) or (n_samples, n_outputs)
            Target values.

        Returns
        -------
        self : object
            The estimator itself.
        """
        # Called for validation only; the checked arrays are not stored.
        check_X_y(X, y, accept_sparse=True)
        self.is_fitted_ = True
        return self

    def predict(self, X):
        """Return a vector of ones, one entry per row of ``X``.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape (n_samples, n_features)
            Samples to predict for.

        Returns
        -------
        y : ndarray, shape (n_samples,)
            Array of ones with dtype int64.
        """
        X = check_array(X, accept_sparse=True)
        check_is_fitted(self, 'is_fitted_')
        n_samples = X.shape[0]
        return np.ones(n_samples, dtype=np.int64)


class TemplateClassifier(BaseEstimator, ClassifierMixin):
    """Example classifier implementing a 1-nearest-neighbour rule.

    Parameters
    ----------
    demo_param : str, default='demo'
        Placeholder showing how a constructor parameter is passed and stored.

    Attributes
    ----------
    X_ : ndarray, shape (n_samples, n_features)
        Validated training samples stored by :meth:`fit`.
    y_ : ndarray, shape (n_samples,)
        Validated training labels stored by :meth:`fit`.
    classes_ : ndarray, shape (n_classes,)
        Distinct labels found in the training targets.
    """
    def __init__(self, demo_param='demo'):
        self.demo_param = demo_param

    def fit(self, X, y):
        """Validate and memorise the training set.

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            Training samples.
        y : array-like, shape (n_samples,)
            Target labels.

        Returns
        -------
        self : object
            The classifier itself.
        """
        checked_X, checked_y = check_X_y(X, y)
        # Record the label set first, then keep the validated data for lookup.
        self.classes_ = unique_labels(checked_y)
        self.X_, self.y_ = checked_X, checked_y
        return self

    def predict(self, X):
        """Label each sample with the label of its nearest training sample.

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            Samples to classify.

        Returns
        -------
        y : ndarray, shape (n_samples,)
            For every row of ``X``, the stored label of the training row at
            the smallest Euclidean distance.
        """
        # fit() must have stored the training data before this is called.
        check_is_fitted(self, ['X_', 'y_'])

        samples = check_array(X)

        distances = euclidean_distances(samples, self.X_)
        nearest = distances.argmin(axis=1)
        return self.y_[nearest]


class TemplateTransformer(BaseEstimator, TransformerMixin):
    """ An example transformer that returns the element-wise square root.
    For more information regarding how to build your own transformer, read more
    in the :ref:`User Guide <user_guide>`.
    Parameters
    ----------
    demo_param : str, default='demo'
        A parameter used for demonstration of how to pass and store parameters.
    Attributes
    ----------
    n_features_ : int
        The number of features of the data passed to :meth:`fit`.
    """
    def __init__(self, demo_param='demo'):
        self.demo_param = demo_param

    def fit(self, X, y=None):
        """A reference implementation of a fitting function for a transformer.
        Parameters
        ----------
        X : {array-like, sparse matrix}, shape (n_samples, n_features)
            The training input samples.
        y : None
            There is no need of a target in a transformer, yet the pipeline API
            requires this parameter.
        Returns
        -------
        self : object
            Returns self.
        """
        X = check_array(X, accept_sparse=True)

        # Remember the column count so transform() can reject mismatched input.
        self.n_features_ = X.shape[1]

        # Return the transformer
        return self

    def transform(self, X):
        """ A reference implementation of a transform function.
        Parameters
        ----------
        X : {array-like, sparse-matrix}, shape (n_samples, n_features)
            The input samples.
        Returns
        -------
        X_transformed : array, shape (n_samples, n_features)
            The array containing the element-wise square roots of the values
            in ``X``.
        Raises
        ------
        ValueError
            If ``X`` does not have the number of columns seen in :meth:`fit`.
        """
        # Check that fit has been called
        check_is_fitted(self, 'n_features_')

        # Input validation
        X = check_array(X, accept_sparse=True)

        # Check that the input is of the same shape as the one passed
        # during fit.
        if X.shape[1] != self.n_features_:
            # The two literals are concatenated, so the first one needs a
            # trailing space to keep the words apart.
            raise ValueError('Shape of input is different from what was seen '
                             'in `fit`')
        return np.sqrt(X)

    
    
    

# From Gaussian Naive Bayes ABC

    def predict_proba(self, X):
        """
        Return probability estimates for the test vector X.

        The values are the element-wise exponential of whatever
        ``self.predict_log_proba(X)`` returns.

        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]

        Returns
        -------
        C : array-like
            Exponentiated log-probabilities, in the layout produced by
            ``predict_log_proba``.
        """
        log_probabilities = self.predict_log_proba(X)
        return np.exp(log_probabilities)

    

# Example pandas Type Selector
#  This returns a different shape - a subset of the dataFrame based on the column data type
from sklearn.base import BaseEstimator, TransformerMixin
class TypeSelector(BaseEstimator, TransformerMixin):
    """Transformer that keeps only the DataFrame columns of a given dtype.

    Parameters
    ----------
    dtype : object
        Passed (wrapped in a list) as ``include`` to
        ``DataFrame.select_dtypes``.
    """

    def __init__(self, dtype):
        self.dtype = dtype

    def fit(self, X, y=None):
        """Learn nothing; return the transformer itself."""
        return self

    def transform(self, X):
        """Return the subset of ``X``'s columns whose dtype matches ``dtype``.

        ``X`` must be a pandas DataFrame (checked with an assertion).
        """
        assert isinstance(X, pd.DataFrame)
        wanted_dtypes = [self.dtype]
        return X.select_dtypes(include=wanted_dtypes)