# Part I: Extend

Need data to talk about data, and a model to talk about models...

### The Data

In [1]:
import pandas as pd

df = pd.read_csv('data/hockey.csv')

In [2]:
df.shape

(434, 12)

In [3]:
df[df['name'] == 'Auston Matthews'].head(3)

Unnamed: 0,player_id,name,position,date,team,venue,opponent,outcome,goals,assists,shots,ice_time
30,matthau01,Auston Matthews,C,2019-10-02,TOR,Home,OTT,W,2,0,8,18
31,matthau01,Auston Matthews,C,2019-10-04,TOR,Away,CBJ,W,1,0,4,17
32,matthau01,Auston Matthews,C,2019-10-05,TOR,Home,MTL,L-SO,2,0,4,19


In [4]:
df.sample(3)

Unnamed: 0,player_id,name,position,date,team,venue,opponent,outcome,goals,assists,shots,ice_time
1,ovechal01,Alex Ovechkin,LW,2019-10-04,WSH,Away,NYI,W,0,0,2,20
109,mcdavco01,Connor McDavid,C,2019-11-16,EDM,Home,DAL,L-OT,0,3,5,23
45,matthau01,Auston Matthews,C,2019-11-05,TOR,Home,LAK,W,1,1,2,18


In [5]:
df.name.unique().tolist()

['Alex Ovechkin',
 'Auston Matthews',
 'Dougie Hamilton',
 'Connor McDavid',
 'Leon Draisaitl',
 'Brad Marchand',
 'David Pastrnak',
 'Nathan MacKinnon',
 'John Carlson',
 'Jack Eichel',
 'William Nylander',
 'J.T. Miller',
 'David Perron',
 'Mark Scheifele',
 'Shea Weber']

In [6]:
df['date'] = df['date'].apply(pd.to_datetime)

`train_test_split` on time series data is a little different...

In [7]:
df = df[df.date < '2019-12-01']

In [8]:
df.shape

(408, 12)

### The Objective

<font color="red">0 to 💯 real quick</font>

Predict \*goals\* next game based on the {goals, assists, ice time} rolling average for the last *five* games...

In [9]:
X = (
    df
    .groupby(['player_id', 'position'])
    [['goals', 'assists', 'shots', 'ice_time']]
    .rolling(5)
    .mean()
    .reset_index()
    .rename(columns={'level_2': 'index'})
    .set_index('index')
    .dropna(subset=['goals'])
    [['position', 'goals', 'assists', 'shots', 'ice_time']]
)

In [10]:
# need to shift games by one to predict next game
y = df[['player_id', 'goals']].groupby('player_id').shift(-1)
y = y.dropna(subset=['goals'])

In [11]:
# re-align index
train = pd.merge(X, y, left_index=True, right_index=True, suffixes=('', '_next'))

In [12]:
# (re-)identify our X and y matrices
target = 'goals_next'
X_train = train.drop(target, axis=1)
y_train = train[target]

In [13]:
print(X_train.shape)
X_train[:3]

(333, 5)


Unnamed: 0,position,goals,assists,shots,ice_time
235,D,0.2,1.4,2.0,25.8
236,D,0.4,1.2,2.4,25.2
237,D,0.4,1.2,2.6,25.4


In [14]:
print(y_train.shape)
y_train[:3]

(333,)


235    1.0
236    0.0
237    1.0
Name: goals_next, dtype: float64

### The Model

In [15]:
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import LabelBinarizer, StandardScaler
from sklearn.impute import SimpleImputer
# new
from sklearn_pandas import DataFrameMapper, CategoricalImputer

In [16]:
mapper = DataFrameMapper([
    ('position', [CategoricalImputer(), LabelBinarizer()]),
    (['goals'], [SimpleImputer(), StandardScaler()]), 
    (['assists'], [SimpleImputer(), StandardScaler()]),
    (['shots'], [SimpleImputer(), StandardScaler()]), 
    (['ice_time'], [SimpleImputer(), StandardScaler()]),
], df_out=True)

In [17]:
Z_train = mapper.fit_transform(X_train)

In [18]:
model = LinearRegression()
model.fit(Z_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [19]:
model.score(Z_train, y_train)

0.07041626746690122

### DataFrameMapper (How + Why)

https://github.com/scikit-learn-contrib/sklearn-pandas

```pip install sklearn-pandas```

In [20]:
demo = pd.DataFrame({
    'position': ['LW', 'RW', 'RW', 'C']
})

pd.get_dummies(demo)

Unnamed: 0,position_C,position_LW,position_RW
0,0,1,0
1,0,0,1
2,0,0,1
3,1,0,0


In [21]:
demo_2 = pd.DataFrame({
    'position': ['LW', 'RW', 'RW', 'D']
})

pd.get_dummies(demo_2)

Unnamed: 0,position_D,position_LW,position_RW
0,0,1,0
1,0,0,1
2,0,0,1
3,1,0,0


In [22]:
new = pd.DataFrame({
    'position': ['🍔']
})

pd.get_dummies(new)

Unnamed: 0,position_🍔
0,1


### What to do instead...

In [23]:
X_train.sample(5)

Unnamed: 0,position,goals,assists,shots,ice_time
294,C,0.4,0.4,2.2,15.8
22,LW,0.4,0.2,4.2,20.8
123,C,0.6,1.2,3.8,24.2
275,C,0.2,0.4,3.4,20.4
341,C,0.2,0.4,1.6,21.0


In [24]:
lb = LabelBinarizer()
lb.fit(X_train['position'])
lb.transform(X_train['position'])

array([[0, 1, 0, 0],
       [0, 1, 0, 0],
       [0, 1, 0, 0],
       ...,
       [0, 1, 0, 0],
       [0, 1, 0, 0],
       [0, 1, 0, 0]])

In [25]:
lb.classes_

array(['C', 'D', 'LW', 'RW'], dtype='<U2')

In [26]:
lb.transform(new['position'])

array([[0, 0, 0, 0]])

In [27]:
new = pd.DataFrame({
    'position': [None]
})

# lb.transform(new['position'])

In [28]:
mapper = DataFrameMapper([
    ('position', [CategoricalImputer(), LabelBinarizer()]),
    (['goals'], [SimpleImputer(), StandardScaler()]), 
    (['assists'], [SimpleImputer(), StandardScaler()]),
    (['shots'], [SimpleImputer(), StandardScaler()]), 
    (['ice_time'], [SimpleImputer(), StandardScaler()]),
], df_out=True)

In [29]:
mapper.fit(X_train)
mapper.transform(X_train)[:10]

Unnamed: 0,position_C,position_D,position_LW,position_RW,goals,assists,shots,ice_time
235,0,1,0,0,-1.043683,1.488882,-1.117197,2.315267
236,0,1,0,0,-0.419722,1.056626,-0.764063,2.047036
237,0,1,0,0,-0.419722,1.056626,-0.587496,2.136446
238,0,1,0,0,0.204239,1.921138,-0.587496,2.404677
239,0,1,0,0,-0.419722,2.785651,-1.117197,2.315267
240,0,1,0,0,-0.419722,1.921138,-1.117197,2.136446
241,0,1,0,0,0.204239,1.488882,-0.94063,2.047036
242,0,1,0,0,0.204239,1.488882,-0.764063,2.047036
243,0,1,0,0,-0.419722,0.62437,-1.117197,1.778806
244,0,1,0,0,0.828201,-0.672398,-0.057795,1.957626


But, maybe the best part about `mapper` is that you can put it in a pipeline...

In [30]:
from sklearn.pipeline import make_pipeline

pipe = make_pipeline(mapper, model)
pipe.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('dataframemapper',
                 DataFrameMapper(default=False, df_out=True,
                                 features=[('position',
                                            [CategoricalImputer(copy=True,
                                                                fill_value='?',
                                                                missing_values='NaN',
                                                                strategy='most_frequent'),
                                             LabelBinarizer(neg_label=0,
                                                            pos_label=1,
                                                            sparse_output=False)]),
                                           (['goals'],
                                            [SimpleImputer(add_indicator=False,
                                                           copy=True,
                                                          

### The Pickle 🥒

In [31]:
import pickle

with open('pickles/pipe.pkl', 'wb') as f:
    pickle.dump(pipe, f)

In [32]:
del pipe

In [33]:
with open('pickles/pipe.pkl', 'rb') as f:
    pipe = pickle.load(f)

In [34]:
pipe.score(X_train, y_train)

0.07041626746690122

In [35]:
pipe.predict(X_train)[:10]

array([0.42075807, 0.34646911, 0.36185504, 0.30941616, 0.35062485,
       0.33627002, 0.26524895, 0.27441694, 0.30851984, 0.22706194])

In [36]:
X_train.sample(1).to_dict(orient='list')

{'position': ['C'],
 'goals': [0.2],
 'assists': [1.0],
 'shots': [3.8],
 'ice_time': [19.0]}

In [37]:
new = pd.DataFrame({
    'position': ['RW'],
    'goals': [0.7],
    'assists': [0.0],
    'shots': [3],
    'ice_time': [20.0]
})

In [38]:
pipe.predict(new)

array([1.12886688])

### Time to Test

In [39]:
df = pd.read_csv('data/hockey.csv')
df['date'] = df['date'].apply(pd.to_datetime)
df = df[df.date > '2018-12-31']

In [40]:
X = (
    df
    .groupby(['player_id', 'position'])
    [['goals', 'assists', 'shots', 'ice_time']]
    .rolling(5)
    .mean()
    .reset_index()
    .rename(columns={'level_2': 'index'})
    .set_index('index')
    .dropna(subset=['goals'])
    [['position', 'goals', 'assists', 'shots', 'ice_time']]
)

In [41]:
y = df[['player_id', 'goals']].groupby('player_id').shift(-1)
y = y.dropna(subset=['goals'])

test = pd.merge(X, y, left_index=True, right_index=True, suffixes=('', '_next'))

target = 'goals_next'
X_test = train.drop(target, axis=1)
y_test = train[target]

In [42]:
score = pipe.score(X_test, y_test)
print(score)

0.07041626746690122


Not all that terrible TBH...

In [43]:
with open('pickles/pipe.pkl', 'wb') as f:
    pickle.dump(pipe, f)