# Part I: Extend

Need data to talk about data, and a model to talk about models...

### The Data

In [1]:
import pandas as pd

df = pd.read_csv('data/hockey.csv')

In [2]:
df.shape

(28917, 12)

In [3]:
df[df['name'] == 'Auston Matthews'].head(3)

Unnamed: 0,player_id,name,position,date,team,venue,opponent,outcome,goals,assists,shots,ice_time
10,matthau01,Auston Matthews,C,2019-10-02,TOR,Home,OTT,W,2,0,8,18
446,matthau01,Auston Matthews,C,2019-10-04,TOR,Away,CBJ,W,1,0,4,17
649,matthau01,Auston Matthews,C,2019-10-05,TOR,Home,MTL,L-SO,2,0,4,19


In [4]:
df.sample(3)

Unnamed: 0,player_id,name,position,date,team,venue,opponent,outcome,goals,assists,shots,ice_time
19462,fialake01,Kevin Fiala,LW,2019-12-19,MIN,Away,ARI,W,0,0,1,10
12992,mcdavco01,Connor McDavid,C,2019-11-24,EDM,Away,ARI,W,0,1,3,25
28519,milleco02,Colin Miller,D,2020-02-01,BUF,Home,CBJ,W,0,0,4,17


In [5]:
df.name.unique().tolist()[:10]

['Adam Larsson',
 'Alex Chiasson',
 'Alex Ovechkin',
 'Alex Pietrangelo',
 'Alex Steen',
 'Alexander Edler',
 'Alexander Kerfoot',
 'Andreas Johnsson',
 'Anthony Duclair',
 'Artem Anisimov']

In [6]:
df['date'] = df['date'].apply(pd.to_datetime)

`train_test_split` on time series data is a little different...

In [7]:
df = df[df.date < '2020-01-01']

In [8]:
df.shape

(22447, 12)

### The Objective

<font color="red">0 to 💯 real quick</font>

Predict \*goals\* next game based on the {goals, assists, ice time} rolling average for the last *five* games...

In [9]:
X = (
    df
    .groupby(['player_id', 'position'])
    [['goals', 'assists', 'shots', 'ice_time']]
    .rolling(5)
    .mean()
    .reset_index()
    .rename(columns={'level_2': 'index'})
    .set_index('index')
    .dropna(subset=['goals'])
    [['position', 'goals', 'assists', 'shots', 'ice_time']]
)

In [10]:
# need to shift games by one to predict next game
y = df[['player_id', 'goals']].groupby('player_id').shift(-1)
y = y.dropna(subset=['goals'])

In [11]:
# re-align index
train = pd.merge(X, y, left_index=True, right_index=True, suffixes=('', '_next'))

In [12]:
# (re-)identify our X and y matrices
target = 'goals_next'
X_train = train.drop(target, axis=1)
y_train = train[target]

In [13]:
print(X_train.shape)
X_train.tail(3)

(18572, 5)


Unnamed: 0,position,goals,assists,shots,ice_time
19158,LW,0.2,0.0,2.0,13.2
19626,LW,0.2,0.0,1.6,12.2
20393,LW,0.2,0.2,1.4,11.0


In [14]:
print(y_train.shape)
y_train.tail(3)

(18572,)


19158    0.0
19626    0.0
20393    0.0
Name: goals_next, dtype: float64

### The Model

In [15]:
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import LabelBinarizer, StandardScaler
from sklearn.impute import SimpleImputer
# new
from sklearn_pandas import DataFrameMapper, CategoricalImputer

In [16]:
mapper = DataFrameMapper([
    ('position', [CategoricalImputer(), LabelBinarizer()]),
    (['goals'], [SimpleImputer(), StandardScaler()]), 
    (['assists'], [SimpleImputer(), StandardScaler()]),
    (['shots'], [SimpleImputer(), StandardScaler()]), 
    (['ice_time'], [SimpleImputer(), StandardScaler()]),
], df_out=True)

In [17]:
Z_train = mapper.fit_transform(X_train)

In [18]:
model = LinearRegression()
model.fit(Z_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [19]:
model.score(Z_train, y_train)

0.05412876882669371

### DataFrameMapper (How + Why)

https://github.com/scikit-learn-contrib/sklearn-pandas

```pip install sklearn-pandas```

In [20]:
demo = pd.DataFrame({
    'position': ['LW', 'RW', 'RW', 'C']
})

pd.get_dummies(demo)

Unnamed: 0,position_C,position_LW,position_RW
0,0,1,0
1,0,0,1
2,0,0,1
3,1,0,0


In [21]:
demo_2 = pd.DataFrame({
    'position': ['LW', 'RW', 'RW', 'D']
})

pd.get_dummies(demo_2)

Unnamed: 0,position_D,position_LW,position_RW
0,0,1,0
1,0,0,1
2,0,0,1
3,1,0,0


In [22]:
new = pd.DataFrame({
    'position': ['🍔']
})

pd.get_dummies(new)

Unnamed: 0,position_🍔
0,1


### What to do instead...

In [23]:
X_train.sample(5)

Unnamed: 0,position,goals,assists,shots,ice_time
18735,C,0.6,0.0,1.4,14.2
14040,C,0.6,0.4,1.8,21.6
2321,D,0.2,0.0,1.2,19.6
6977,C,0.0,0.0,2.2,18.4
21820,C/LW,0.2,0.6,2.2,14.0


In [24]:
lb = LabelBinarizer()
lb.fit(X_train['position'])
lb.transform(X_train['position'])

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [25]:
lb.classes_

array(['C', 'C/LW', 'C/RW', 'C/W', 'D', 'D/RW', 'F', 'LW', 'LW/C', 'RW',
       'RW/C', 'W'], dtype='<U4')

In [26]:
new = pd.DataFrame({
    'position': ['🍔']
})

In [27]:
lb.transform(new['position'])

array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])

In [28]:
new = pd.DataFrame({
    'position': [None]
})

# lb.transform(new['position'])

In [29]:
mapper = DataFrameMapper([
    ('position', [CategoricalImputer(), LabelBinarizer()]),
    (['goals'], [SimpleImputer(), StandardScaler()]), 
    (['assists'], [SimpleImputer(), StandardScaler()]),
    (['shots'], [SimpleImputer(), StandardScaler()]), 
    (['ice_time'], [SimpleImputer(), StandardScaler()]),
], df_out=True)

In [30]:
mapper.fit(X_train)
mapper.transform(X_train)[:10]

Unnamed: 0,position_C,position_C/LW,position_C/RW,position_C/W,position_D,position_D/RW,position_F,position_LW,position_LW/C,position_RW,position_RW/C,position_W,goals,assists,shots,ice_time
2416,0,0,0,0,0,0,0,1,0,0,0,0,-0.796693,0.395344,-0.200961,-0.863917
3145,0,0,0,0,0,0,0,1,0,0,0,0,-0.796693,0.395344,0.225192,-0.913678
3648,0,0,0,0,0,0,0,1,0,0,0,0,-0.796693,0.395344,-0.200961,-0.714635
3931,0,0,0,0,0,0,0,1,0,0,0,0,-0.796693,0.395344,-0.414038,-0.764396
4938,0,0,0,0,0,0,0,1,0,0,0,0,-0.796693,0.395344,-0.840191,-0.764396
5139,0,0,0,0,0,0,0,1,0,0,0,0,-0.796693,-0.303127,-1.266344,-0.963438
7155,0,0,0,0,0,0,0,1,0,0,0,0,-0.796693,-0.303127,-1.479421,-1.013199
7556,0,0,0,0,0,0,0,1,0,0,0,0,-0.796693,-0.303127,-1.692498,-1.11272
7958,0,0,0,0,0,0,0,1,0,0,0,0,-0.796693,-0.303127,-1.692498,-1.062959
8454,0,0,0,0,0,0,0,1,0,0,0,0,-0.796693,-0.303127,-1.266344,-1.062959


But, maybe the best part about `mapper` is that you can put it in a pipeline...

In [31]:
from sklearn.pipeline import make_pipeline

pipe = make_pipeline(mapper, model)
pipe.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('dataframemapper',
                 DataFrameMapper(default=False, df_out=True,
                                 features=[('position',
                                            [CategoricalImputer(copy=True,
                                                                fill_value='?',
                                                                missing_values='NaN',
                                                                strategy='most_frequent'),
                                             LabelBinarizer(neg_label=0,
                                                            pos_label=1,
                                                            sparse_output=False)]),
                                           (['goals'],
                                            [SimpleImputer(add_indicator=False,
                                                           copy=True,
                                                          

### The Pickle 🥒

In [32]:
import pickle

with open('pickles/pipe.pkl', 'wb') as f:
    pickle.dump(pipe, f)

In [33]:
del pipe

In [34]:
with open('pickles/pipe.pkl', 'rb') as f:
    pipe = pickle.load(f)

In [35]:
pipe.score(X_train, y_train)

0.05412876882669371

In [36]:
pipe.predict(X_train)[:10]

array([0.18901062, 0.20022583, 0.19596863, 0.18687439, 0.17333984,
       0.1434021 , 0.13432312, 0.12290955, 0.12522888, 0.13876343])

In [37]:
X_train.sample(1).to_dict(orient='list')

{'position': ['C'],
 'goals': [0.0],
 'assists': [0.0],
 'shots': [1.0],
 'ice_time': [15.0]}

In [38]:
new = pd.DataFrame({
    'position': ['RW'],
    'goals': [0.7],
    'assists': [0.0],
    'shots': [3],
    'ice_time': [20.0]
})

In [39]:
pipe.predict(new)

array([0.33442688])

### Time to Test

In [41]:
df = pd.read_csv('data/hockey.csv')
df['date'] = df['date'].apply(pd.to_datetime)
df = df[df.date > '2020-01-01']

In [42]:
X = (
    df
    .groupby(['player_id', 'position'])
    [['goals', 'assists', 'shots', 'ice_time']]
    .rolling(5)
    .mean()
    .reset_index()
    .rename(columns={'level_2': 'index'})
    .set_index('index')
    .dropna(subset=['goals'])
    [['position', 'goals', 'assists', 'shots', 'ice_time']]
)

In [43]:
y = df[['player_id', 'goals']].groupby('player_id').shift(-1)
y = y.dropna(subset=['goals'])

test = pd.merge(X, y, left_index=True, right_index=True, suffixes=('', '_next'))

target = 'goals_next'
X_test = train.drop(target, axis=1)
y_test = train[target]

In [44]:
score = pipe.score(X_test, y_test)
print(score)

0.05412876882669371


Kind of terrible TBH... but it's okay 🙈

In [45]:
with open('pickles/pipe.pkl', 'wb') as f:
    pickle.dump(pipe, f)