# Part I

Need data to talk about data, and a model to talk about models...

### The Data

In [1]:
import pandas as pd

# Load the per-game player stats. The `date` column is selected by name so the
# parse does not depend on the column's position in the CSV.
# Rows are then ordered by player and, within each player, by game date, and
# the index is rebuilt as 0..n-1.
df = (
    pd.read_csv('data/football.csv', parse_dates=['date'])
    .sort_values(['name', 'date'])
    .reset_index(drop=True)
)

In [2]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1390 entries, 0 to 1389
Data columns (total 7 columns):
 #   Column     Non-Null Count  Dtype         
---  ------     --------------  -----         
 0   team       1390 non-null   object        
 1   name       1390 non-null   object        
 2   position   1390 non-null   object        
 3   passing    1390 non-null   int64         
 4   rushing    1390 non-null   int64         
 5   receiving  1390 non-null   int64         
 6   date       1390 non-null   datetime64[ns]
dtypes: datetime64[ns](1), int64(3), object(3)
memory usage: 76.1+ KB


In [3]:
df.shape

(1390, 7)

In [4]:
df.sample(10)

Unnamed: 0,team,name,position,passing,rushing,receiving,date
453,KAN,Demarcus Robinson,WR,0,0,9,2021-09-12
1195,NYG,Saquon Barkley,RB,0,52,74,2021-10-03
260,KAN,Clyde Edwards-Helaire,RB,0,102,12,2021-10-03
234,TAM,Chris Godwin,WR,0,0,70,2021-10-10
486,BUF,Devin Singletary,RB,0,28,1,2021-10-31
431,ARI,DeAndre Hopkins,WR,0,0,53,2021-10-24
675,LAC,Jared Cook,TE,0,0,29,2021-10-10
312,SEA,D.K. Metcalf,WR,0,0,60,2021-09-12
354,NWE,Damien Harris,RB,0,80,0,2021-10-31
702,SFO,Jimmy Garoppolo,QB,165,0,0,2021-10-03


In [5]:
# Total yards per game row: passing + rushing + receiving.
df["yards"] = df["passing"].add(df["rushing"]).add(df["receiving"])

In [7]:
df[df['name'] == 'Tom Brady']

Unnamed: 0,team,name,position,passing,rushing,receiving,date,yards
1266,TAM,Tom Brady,QB,379,0,0,2021-09-09,379
1267,TAM,Tom Brady,QB,276,6,0,2021-09-19,282
1268,TAM,Tom Brady,QB,432,14,0,2021-09-26,446
1269,TAM,Tom Brady,QB,269,3,0,2021-10-03,272
1270,TAM,Tom Brady,QB,411,13,0,2021-10-10,424
1271,TAM,Tom Brady,QB,297,1,0,2021-10-14,298
1272,TAM,Tom Brady,QB,211,0,0,2021-10-24,211
1273,TAM,Tom Brady,QB,375,2,0,2021-10-31,377


In [13]:
# Independent copy of one player's rows, so the lag columns added next
# do not touch `df`.
is_tom = df['name'].eq('Tom Brady')
tom = df.loc[is_tom].copy()

In [14]:
# Lag features for a single player: yards from one and two rows earlier.
# The first rows have no earlier game, so they come out as NaN.
tom_yards = tom['yards']
tom['yards_1'] = tom_yards.shift(1)
tom['yards_2'] = tom_yards.shift(2)

In [15]:
tom.tail(10)

Unnamed: 0,team,name,position,passing,rushing,receiving,date,yards,yards_1,yards_2
1266,TAM,Tom Brady,QB,379,0,0,2021-09-09,379,,
1267,TAM,Tom Brady,QB,276,6,0,2021-09-19,282,379.0,
1268,TAM,Tom Brady,QB,432,14,0,2021-09-26,446,282.0,379.0
1269,TAM,Tom Brady,QB,269,3,0,2021-10-03,272,446.0,282.0
1270,TAM,Tom Brady,QB,411,13,0,2021-10-10,424,272.0,446.0
1271,TAM,Tom Brady,QB,297,1,0,2021-10-14,298,424.0,272.0
1272,TAM,Tom Brady,QB,211,0,0,2021-10-24,211,298.0,424.0
1273,TAM,Tom Brady,QB,375,2,0,2021-10-31,377,211.0,298.0


In [16]:
# Same lag features for every player at once: shifting inside each `name`
# group keeps one player's yards from leaking into the next player's rows.
yards_by_player = df.groupby('name')['yards']
df['yards_1'] = yards_by_player.shift(1)
df['yards_2'] = yards_by_player.shift(2)

In [17]:
# Keep only rows where both lag features are present (each player's first
# two games have no full history and are dropped).
has_both_lags = df[["yards_1", "yards_2"]].notna().all(axis=1)
df = df[has_both_lags]

In [18]:
df.sample(10)

Unnamed: 0,team,name,position,passing,rushing,receiving,date,yards,yards_1,yards_2
687,CLE,Jarvis Landry,WR,0,0,37,2021-10-21,37,9.0,84.0
251,CAR,Christian McCaffrey,RB,0,31,9,2021-09-23,40,137.0,187.0
556,TEN,Geoff Swaim,TE,0,0,0,2021-10-03,0,27.0,10.0
215,ARI,Chase Edmonds,RB,0,26,49,2021-09-26,75,75.0,106.0
1212,DET,T.J. Hockenson,TE,0,0,10,2021-09-26,10,66.0,97.0
673,LAC,Jared Cook,TE,0,0,27,2021-09-26,27,28.0,56.0
1285,KAN,Travis Kelce,TE,0,0,57,2021-10-10,57,23.0,104.0
255,CAR,Chuba Hubbard,RB,0,28,28,2021-10-24,56,65.0,134.0
568,SEA,Gerald Everett,TE,0,0,40,2021-10-17,40,54.0,3.0
15,GNB,Aaron Jones,RB,0,82,14,2021-09-26,96,115.0,22.0


### The Objective

<font color="red">0 to 💯 real quick</font>

Predict \*yards\* in the next game based on the yards gained in each of the last two games (plus the player's position)

In [19]:
# Target: total yards in the current game.
# Features: the player's position plus yards in the two preceding games.
target = 'yards'
feature_columns = ['position', 'yards_1', 'yards_2']

X = df[feature_columns]
y = df[target]

In [20]:
from sklearn.model_selection import train_test_split

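`train_test_split` on time series data is a little different: pass `shuffle=False` so the rows keep their existing order and the last 10% become the test set instead of a random sample.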

In [21]:
# Hold out the final 10% of rows, in their current order, as the test set.
# With shuffle=False nothing is shuffled, so random_state has no effect here;
# it is kept only so the call matches the original.
# NOTE(review): df was sorted by ['name', 'date'] when it was loaded, so the
# held-out tail appears to be the alphabetically-last players rather than the
# most recent games -- confirm that this is the intended split.
split_options = {"test_size": 0.10, "random_state": 42, "shuffle": False}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

### The Model

In [22]:
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import LabelBinarizer, StandardScaler
from sklearn.impute import SimpleImputer
# new
from sklearn_pandas import DataFrameMapper

In [23]:
# Column-wise preprocessing, returned as a DataFrame (df_out=True):
#   - position: fill gaps with the most frequent value, then binarize labels
#   - yards_1 / yards_2: fill gaps with the default imputer, then standardize
categorical_features = [
    (['position'], [SimpleImputer(strategy="most_frequent"), LabelBinarizer()]),
]
numeric_features = [
    ([column], [SimpleImputer(), StandardScaler()])
    for column in ('yards_1', 'yards_2')
]
mapper = DataFrameMapper(categorical_features + numeric_features, df_out=True)

In [24]:
# Fit the mapper's transformers on the training rows only, then apply those
# already-fitted transformers to the test rows.
Z_train = mapper.fit_transform(X_train)
Z_test = mapper.transform(X_test)

In [25]:
model = LinearRegression()
model.fit(Z_train, y_train)

LinearRegression()

In [26]:
model.score(Z_train, y_train)

0.7690450473915198

### DataFrameMapper (How + Why)

https://github.com/scikit-learn-contrib/sklearn-pandas

```pip install sklearn-pandas```

In [27]:
demo = pd.DataFrame({
    'position': ['TE', 'WR', 'RB', 'QB']
})

pd.get_dummies(demo)

Unnamed: 0,position_QB,position_RB,position_TE,position_WR
0,0,0,1,0
1,0,0,0,1
2,0,1,0,0
3,1,0,0,0


In [28]:
demo_2 = pd.DataFrame({
    'position': ['RB/QB', 'TE/WR', 'FB', 'RB']
})

pd.get_dummies(demo_2)

Unnamed: 0,position_FB,position_RB,position_RB/QB,position_TE/WR
0,0,0,1,0
1,0,0,0,1
2,1,0,0,0
3,0,1,0,0


In [29]:
new = pd.DataFrame({
    'position': ['🍔']
})

pd.get_dummies(new)

Unnamed: 0,position_🍔
0,1


### What to do instead...

In [30]:
X_train.sample(5)

Unnamed: 0,position,yards_1,yards_2
33,WR,19.0,7.0
981,RB,34.0,29.0
73,WR,53.0,32.0
807,WR,25.0,6.0
581,WR,60.0,85.0


In [31]:
lb = LabelBinarizer()
lb.fit(X_train['position'])
lb.transform(X_train['position'])

array([[0, 0, 0, 0, 0, 1],
       [0, 0, 0, 0, 0, 1],
       [0, 0, 0, 0, 0, 1],
       ...,
       [0, 0, 0, 0, 0, 1],
       [0, 0, 0, 0, 0, 1],
       [0, 0, 0, 0, 0, 1]])

In [32]:
lb.classes_

array(['CB', 'FB', 'QB', 'RB', 'TE', 'WR'], dtype='<U2')

In [33]:
new = pd.DataFrame({
    'position': ['🍔']
})

In [34]:
lb.transform(new['position'])

array([[0, 0, 0, 0, 0, 0]])

In [35]:
new = pd.DataFrame({
    'position': [None]
})

lb.transform(new['position'])

ValueError: The type of target data is not known

In [39]:
X_train

Unnamed: 0,position,yards_1,yards_2
2,WR,43.0,49.0
3,WR,3.0,43.0
4,WR,38.0,3.0
5,WR,91.0,38.0
6,WR,133.0,91.0
...,...,...,...
1251,WR,107.0,62.0
1252,WR,62.0,107.0
1253,WR,123.0,62.0
1254,WR,46.0,123.0


In [40]:
# Same mapper definition as the one built before the first model fit,
# recreated here as a fresh, unfitted instance for the walkthrough.
mapper = DataFrameMapper([
    # Categorical column: impute with the most frequent value, then binarize.
    (['position'], [SimpleImputer(strategy="most_frequent"), LabelBinarizer()]),
    # Numeric lag columns: default imputation followed by standard scaling.
    (['yards_1'], [SimpleImputer(), StandardScaler()]), 
    (['yards_2'], [SimpleImputer(), StandardScaler()]),
], df_out=True)

In [41]:
mapper.fit(X_train)
mapper.transform(X_train)[:10]

Unnamed: 0,position_CB,position_FB,position_QB,position_RB,position_TE,position_WR,yards_1,yards_2
2,0,0,0,0,0,1,-0.620504,-0.550181
3,0,0,0,0,0,1,-1.023629,-0.609818
4,0,0,0,0,0,1,-0.670894,-1.007396
5,0,0,0,0,0,1,-0.136753,-0.659515
6,0,0,0,0,0,1,0.286528,-0.132724
9,0,0,0,0,0,1,-0.378628,0.076004
10,0,0,0,0,0,1,-0.922848,-0.371271
11,0,0,0,0,0,1,-0.257691,-0.908001
12,0,0,0,0,0,1,-0.388707,-0.251997
15,0,0,0,1,0,0,0.105122,-0.818546


But maybe the best part about `mapper` is that you can put it in a pipeline together with the model...

In [42]:
from sklearn.pipeline import make_pipeline

# Chain the preprocessing mapper and the regression model into one estimator
# that accepts the raw feature frame directly; fitting the pipeline fits the
# mapper and then the model on the mapper's output.
pipe = make_pipeline(mapper, model)
pipe.fit(X_train, y_train)

Pipeline(steps=[('dataframemapper',
                 DataFrameMapper(df_out=True, drop_cols=[],
                                 features=[(['position'],
                                            [SimpleImputer(strategy='most_frequent'),
                                             LabelBinarizer()]),
                                           (['yards_1'],
                                            [SimpleImputer(),
                                             StandardScaler()]),
                                           (['yards_2'],
                                            [SimpleImputer(),
                                             StandardScaler()])])),
                ('linearregression', LinearRegression())])

### The Pickle 🥒

In [43]:
import pickle
from pathlib import Path

# Serialize the fitted pipeline (mapper + model) to disk.
# Create the output directory first so this cell also works when `pickles/`
# does not exist yet; otherwise open() would raise FileNotFoundError.
pickle_path = Path('pickles/pipe.pkl')
pickle_path.parent.mkdir(parents=True, exist_ok=True)

with open(pickle_path, 'wb') as f:
    pickle.dump(pipe, f)

In [44]:
del pipe

In [45]:
# Restore the pipeline from the file written above.
# Unpickling executes code embedded in the file, so only load pickles that
# come from a trusted source.
with open('pickles/pipe.pkl', 'rb') as pickle_file:
    pipe = pickle.loads(pickle_file.read())

In [46]:
pipe.score(X_train, y_train)

0.7690450473915198

In [47]:
pipe.predict(X_train)[:10]

array([53.5  , 46.125, 45.875, 59.625, 74.25 , 66.625, 51.25 , 54.125,
       61.625, 78.75 ])

In [48]:
X_train.sample(1).to_dict(orient='list')

{'position': ['WR'], 'yards_1': [64.0], 'yards_2': [38.0]}

In [51]:
# A single hand-written observation with the same columns as X_train,
# used to check that the restored pipeline predicts on new raw input.
new_player = {'position': 'QB', 'yards_1': 200.0, 'yards_2': 180.0}
new = pd.DataFrame([new_player])

In [52]:
pipe.predict(new)

array([244.5])