# Part I

Need data to talk about data, and a model to talk about models...

### The Data

In [1]:
import pandas as pd

df = pd.read_csv('data/football.csv', parse_dates=[6])
df = df.sort_values(['name', 'date']).reset_index(drop=True)

In [2]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2198 entries, 0 to 2197
Data columns (total 7 columns):
 #   Column     Non-Null Count  Dtype         
---  ------     --------------  -----         
 0   team       2198 non-null   object        
 1   name       2198 non-null   object        
 2   position   2198 non-null   object        
 3   passing    2198 non-null   int64         
 4   rushing    2198 non-null   int64         
 5   receiving  2198 non-null   int64         
 6   date       2198 non-null   datetime64[ns]
dtypes: datetime64[ns](1), int64(3), object(3)
memory usage: 120.3+ KB


In [3]:
df.shape

(2198, 7)

In [4]:
df.sample(10)

Unnamed: 0,team,name,position,passing,rushing,receiving,date
1436,BAL,Mark Andrews,TE,0,0,48,2021-10-24
155,WAS,Antonio Gibson,RB,0,60,12,2021-10-10
1885,LAR,Sony Michel,RB,0,67,12,2021-09-26
1989,DEN,Tim Patrick,WR,0,0,16,2021-10-21
942,CIN,Ja'Marr Chase,WR,0,-2,101,2021-09-12
688,PHI,DeVonta Smith,WR,0,0,31,2021-10-14
2017,NOR,Tony Jones,RB,0,7,0,2021-09-19
1972,WAS,Terry McLaurin,WR,0,0,122,2021-10-24
97,MIN,Alexander Mattison,RB,0,90,34,2021-12-05
1258,DET,Kalif Raymond,WR,0,0,29,2021-11-14


In [5]:
df["yards"] = df["passing"] + df["rushing"] + df["receiving"]

In [6]:
df[df['name'] == 'Tom Brady']

Unnamed: 0,team,name,position,passing,rushing,receiving,date,yards
1995,TAM,Tom Brady,QB,379,0,0,2021-09-09,379
1996,TAM,Tom Brady,QB,276,6,0,2021-09-19,282
1997,TAM,Tom Brady,QB,432,14,0,2021-09-26,446
1998,TAM,Tom Brady,QB,269,3,0,2021-10-03,272
1999,TAM,Tom Brady,QB,411,13,0,2021-10-10,424
2000,TAM,Tom Brady,QB,297,1,0,2021-10-14,298
2001,TAM,Tom Brady,QB,211,0,0,2021-10-24,211
2002,TAM,Tom Brady,QB,375,2,0,2021-10-31,377
2003,TAM,Tom Brady,QB,220,2,0,2021-11-14,222
2004,TAM,Tom Brady,QB,307,10,0,2021-11-22,317


In [7]:
tom = df[df['name'] == 'Tom Brady'].copy()

In [8]:
tom['yards_1'] = tom['yards'].shift(1)
tom['yards_2'] = tom['yards'].shift(2)

In [9]:
tom.tail(10)

Unnamed: 0,team,name,position,passing,rushing,receiving,date,yards,yards_1,yards_2
1997,TAM,Tom Brady,QB,432,14,0,2021-09-26,446,282.0,379.0
1998,TAM,Tom Brady,QB,269,3,0,2021-10-03,272,446.0,282.0
1999,TAM,Tom Brady,QB,411,13,0,2021-10-10,424,272.0,446.0
2000,TAM,Tom Brady,QB,297,1,0,2021-10-14,298,424.0,272.0
2001,TAM,Tom Brady,QB,211,0,0,2021-10-24,211,298.0,424.0
2002,TAM,Tom Brady,QB,375,2,0,2021-10-31,377,211.0,298.0
2003,TAM,Tom Brady,QB,220,2,0,2021-11-14,222,377.0,211.0
2004,TAM,Tom Brady,QB,307,10,0,2021-11-22,317,222.0,377.0
2005,TAM,Tom Brady,QB,226,2,0,2021-11-28,228,317.0,222.0
2006,TAM,Tom Brady,QB,368,-1,0,2021-12-05,367,228.0,317.0


In [10]:
df['yards_1'] = df.groupby('name')['yards'].shift(1)
df['yards_2'] = df.groupby('name')['yards'].shift(2)

In [11]:
df = df.dropna(subset=["yards_1", "yards_2"])

In [12]:
df.sample(10)

Unnamed: 0,team,name,position,passing,rushing,receiving,date,yards,yards_1,yards_2
201,PIT,Ben Roethlisberger,QB,318,5,0,2021-09-26,323,295.0,193.0
1680,DEN,Noah Fant,TE,0,0,97,2021-10-17,97,20.0,46.0
1324,SFO,Kyle Juszczyk,FB,0,0,35,2021-10-10,35,41.0,51.0
2126,KAN,Tyreek Hill,WR,0,0,76,2021-10-17,76,78.0,186.0
1863,CAR,Sam Darnold,QB,177,10,0,2021-10-10,187,336.0,315.0
8,TEN,A.J. Brown,WR,0,7,16,2021-11-14,23,42.0,155.0
1752,PHI,Quez Watkins,WR,0,0,44,2021-10-14,44,48.0,33.0
232,HOU,Brandin Cooks,WR,0,0,45,2021-11-28,45,18.0,56.0
2097,LAR,Tyler Higbee,TE,0,0,46,2021-10-24,46,36.0,14.0
2113,SEA,Tyler Lockett,WR,0,0,57,2021-10-07,57,24.0,31.0


### The Objective

<font color="red">0 to 💯 real quick</font>

Predict \*points\* next game based on the points score for the last two games

In [13]:
target = 'yards'
y = df[target]
X = df[['position', 'yards_1', 'yards_2']]

In [14]:
from sklearn.model_selection import train_test_split

`train_test_split` on time series data is a little different...

In [15]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, 
    test_size=0.10, 
    random_state=42, 
    shuffle=False
)

### The Model

In [16]:
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import LabelBinarizer, StandardScaler
from sklearn.impute import SimpleImputer
# new
from sklearn_pandas import DataFrameMapper

In [17]:
mapper = DataFrameMapper([
    (['position'], [SimpleImputer(strategy="most_frequent"), LabelBinarizer()]),
    (['yards_1'], [SimpleImputer(), StandardScaler()]), 
    (['yards_2'], [SimpleImputer(), StandardScaler()]),
], df_out=True)

In [18]:
Z_train = mapper.fit_transform(X_train)
Z_test = mapper.transform(X_test)

In [19]:
model = LinearRegression()
model.fit(Z_train, y_train)

LinearRegression()

In [20]:
model.score(Z_train, y_train)

0.7561605933982174

### DataFrameMapper (How + Why)

https://github.com/scikit-learn-contrib/sklearn-pandas

```pip install sklearn-pandas```

In [21]:
demo = pd.DataFrame({
    'position': ['TE', 'WR', 'RB', 'QB']
})

pd.get_dummies(demo)

Unnamed: 0,position_QB,position_RB,position_TE,position_WR
0,0,0,1,0
1,0,0,0,1
2,0,1,0,0
3,1,0,0,0


In [22]:
demo_2 = pd.DataFrame({
    'position': ['RB/QB', 'TE/WR', 'FB', 'RB']
})

pd.get_dummies(demo_2)

Unnamed: 0,position_FB,position_RB,position_RB/QB,position_TE/WR
0,0,0,1,0
1,0,0,0,1
2,1,0,0,0
3,0,1,0,0


In [23]:
new = pd.DataFrame({
    'position': ['🍔']
})

pd.get_dummies(new)

Unnamed: 0,position_🍔
0,1


### What to do instead...

In [24]:
X_train.sample(5)

Unnamed: 0,position,yards_1,yards_2
1567,RB,66.0,61.0
1265,WR,38.0,92.0
1345,TE,8.0,8.0
490,WR,96.0,58.0
1699,WR,23.0,79.0


In [25]:
lb = LabelBinarizer()
lb.fit(X_train['position'])
lb.transform(X_train['position'])

array([[0, 0, 0, ..., 0, 0, 1],
       [0, 0, 0, ..., 0, 0, 1],
       [0, 0, 0, ..., 0, 0, 1],
       ...,
       [0, 0, 0, ..., 0, 0, 1],
       [0, 0, 0, ..., 0, 0, 1],
       [0, 0, 0, ..., 1, 0, 0]])

In [26]:
lb.classes_

array(['CB', 'DE', 'FB', 'QB', 'RB', 'TE', 'WR'], dtype='<U2')

In [27]:
new = pd.DataFrame({
    'position': ['🍔']
})

In [28]:
lb.transform(new['position'])

array([[0, 0, 0, 0, 0, 0, 0]])

In [29]:
new = pd.DataFrame({
    'position': [None]
})

lb.transform(new['position'])

ValueError: The type of target data is not known

In [30]:
X_train

Unnamed: 0,position,yards_1,yards_2
2,WR,43.0,49.0
3,WR,3.0,43.0
4,WR,38.0,3.0
5,WR,91.0,38.0
6,WR,133.0,91.0
...,...,...,...
1974,WR,35.0,122.0
1975,WR,59.0,35.0
1976,WR,103.0,59.0
1977,WR,51.0,103.0


In [31]:
mapper = DataFrameMapper([
    (['position'], [SimpleImputer(strategy="most_frequent"), LabelBinarizer()]),
    (['yards_1'], [SimpleImputer(), StandardScaler()]), 
    (['yards_2'], [SimpleImputer(), StandardScaler()]),
], df_out=True)

In [32]:
mapper.fit(X_train)
mapper.transform(X_train)[:10]

Unnamed: 0,position_CB,position_DE,position_FB,position_QB,position_RB,position_TE,position_WR,yards_1,yards_2
2,0,0,0,0,0,0,1,-0.564445,-0.508951
3,0,0,0,0,0,0,1,-0.983844,-0.571332
4,0,0,0,0,0,0,1,-0.616869,-0.987209
5,0,0,0,0,0,0,1,-0.061165,-0.623317
6,0,0,0,0,0,0,1,0.379205,-0.072281
7,0,0,0,0,0,0,1,0.609874,0.364389
8,0,0,0,0,0,0,1,-0.57493,0.593121
9,0,0,0,0,0,0,1,-0.774144,-0.581729
12,0,0,0,0,0,0,1,-0.312805,0.146054
13,0,0,0,0,0,0,1,-0.878994,-0.321806


But, maybe the best part about `mapper` is that you can put it in a pipeline...

In [33]:
from sklearn.pipeline import make_pipeline

pipe = make_pipeline(mapper, model)
pipe.fit(X_train, y_train)

Pipeline(steps=[('dataframemapper',
                 DataFrameMapper(df_out=True, drop_cols=[],
                                 features=[(['position'],
                                            [SimpleImputer(strategy='most_frequent'),
                                             LabelBinarizer()]),
                                           (['yards_1'],
                                            [SimpleImputer(),
                                             StandardScaler()]),
                                           (['yards_2'],
                                            [SimpleImputer(),
                                             StandardScaler()])])),
                ('linearregression', LinearRegression())])

### The Pickle 🥒

In [34]:
import pickle

with open('pickles/pipe.pkl', 'wb') as f:
    pickle.dump(pipe, f)

In [35]:
del pipe

In [36]:
with open('pickles/pipe.pkl', 'rb') as f:
    pipe = pickle.load(f)

In [37]:
pipe.score(X_train, y_train)

0.7561605933982174

In [38]:
pipe.predict(X_train)[:10]

array([54.125, 48.375, 46.5  , 58.25 , 71.375, 80.375, 70.   , 50.625,
       66.5  , 53.125])

In [39]:
X_train.sample(1).to_dict(orient='list')

{'position': ['RB'], 'yards_1': [131.0], 'yards_2': [74.0]}

In [40]:
new = pd.DataFrame({
    'position': ['QB'], 
    'yards_1': [200.0], 
    'yards_2': [180.0]
})

In [41]:
pipe.predict(new)

array([240.75])