# Part I

Need data to talk about data, and a model to talk about models...

### The Data

In [1]:
import pandas as pd

# Load the per-game player stats. The `date` column is parsed by name rather
# than by position so the read keeps working if the CSV's column order changes.
df = pd.read_csv('data/football.csv', parse_dates=['date'])

# Order each player's games chronologically; the lag features built further
# down shift values within each `name` group and rely on this ordering.
df = df.sort_values(['name', 'date']).reset_index(drop=True)

In [2]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 706 entries, 0 to 705
Data columns (total 7 columns):
 #   Column     Non-Null Count  Dtype         
---  ------     --------------  -----         
 0   team       706 non-null    object        
 1   name       706 non-null    object        
 2   position   706 non-null    object        
 3   passing    706 non-null    int64         
 4   rushing    706 non-null    int64         
 5   receiving  706 non-null    int64         
 6   date       706 non-null    datetime64[ns]
dtypes: datetime64[ns](1), int64(3), object(3)
memory usage: 38.7+ KB


In [3]:
df.shape

(706, 7)

In [4]:
df.sample(10)

Unnamed: 0,team,name,position,passing,rushing,receiving,date
208,CHI,David Montgomery,RB,0,106,0,2021-10-03
173,PHI,Dallas Goedert,TE,0,0,56,2021-10-03
625,NOR,Taysom Hill,QB,9,28,26,2021-10-03
216,ARI,DeAndre Hopkins,WR,0,0,83,2021-09-12
122,JAX,Chris Manhertz,TE,0,0,0,2021-09-19
238,LVR,Derek Carr,QB,386,-2,0,2021-09-26
368,IND,Jonathan Taylor,RB,0,64,8,2021-09-26
78,SFO,Brandon Aiyuk,WR,0,0,15,2021-10-03
703,NYJ,Zach Wilson,QB,210,19,0,2021-09-19
143,LAR,Cooper Kupp,WR,0,0,64,2021-10-03


In [5]:
# Total yards per game row: passing + rushing + receiving.
df["yards"] = df["passing"].add(df["rushing"]).add(df["receiving"])

In [6]:
df[df['name'] == 'Tom Brady'].head(3)

Unnamed: 0,team,name,position,passing,rushing,receiving,date,yards
643,TAM,Tom Brady,QB,379,0,0,2021-09-09,379
644,TAM,Tom Brady,QB,276,6,0,2021-09-19,282
645,TAM,Tom Brady,QB,432,14,0,2021-09-26,446


In [7]:
# Independent copy of one player's rows so the lag demo below cannot touch `df`.
tom = df.loc[df['name'].eq('Tom Brady')].copy()

In [8]:
# Lag demo on a single player: each new column holds `yards` from one and two
# rows earlier (despite the `points_*` names, the values are lagged yards).
tom = tom.assign(
    points_1=tom['yards'].shift(1),
    points_2=tom['yards'].shift(2),
)

In [9]:
tom.tail(10)

Unnamed: 0,team,name,position,passing,rushing,receiving,date,yards,points_1,points_2
643,TAM,Tom Brady,QB,379,0,0,2021-09-09,379,,
644,TAM,Tom Brady,QB,276,6,0,2021-09-19,282,379.0,
645,TAM,Tom Brady,QB,432,14,0,2021-09-26,446,282.0,379.0
646,TAM,Tom Brady,QB,269,3,0,2021-10-03,272,446.0,282.0


In [10]:
# Same lagging as the single-player demo, but computed per player: the shift is
# applied inside each `name` group so one player's yards never leak into the
# next player's rows.
df = df.assign(**{
    f'yards_{k}': df.groupby('name')['yards'].shift(k)
    for k in (1, 2)
})

In [11]:
# Keep only rows where both lag features exist; rows missing either lag
# (the start of each player's history) are discarded.
df = df.dropna(axis="index", how="any", subset=["yards_1", "yards_2"])

### The Objective

<font color="red">0 to 💯 real quick</font>

Predict \*yards\* in the next game based on the yards gained in the last two games

In [12]:
# Response: yards in the current game.
# Predictors: the player's position plus yards from the two previous games.
target = 'yards'
X = df.loc[:, ['position', 'yards_1', 'yards_2']]
y = df.loc[:, target]

In [13]:
from sklearn.model_selection import train_test_split

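`train_test_split` on time series data is a little different: pass `shuffle=False` so the rows are split in their existing order instead of being shuffled first...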

In [14]:
# Hold out the tail 10% of rows without shuffling. `random_state` is omitted
# because it only seeds the shuffle, which is disabled here.
# NOTE(review): `df` was sorted by ['name', 'date'], so the un-shuffled tail is
# the alphabetically-last players rather than the most recent games — confirm
# this is the hold-out you want for a time-ordered evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.10,
    shuffle=False,
)

### The Model

In [15]:
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import LabelBinarizer, StandardScaler
from sklearn.impute import SimpleImputer
# new
from sklearn_pandas import DataFrameMapper

In [16]:
# Declarative preprocessing, one entry per input column:
#   position -> fill gaps with the most frequent label, then binarize
#   yards_N  -> fill gaps with SimpleImputer's default strategy, then standardize
# Every column gets its own transformer instances.
mapper = DataFrameMapper(
    [(['position'], [SimpleImputer(strategy="most_frequent"), LabelBinarizer()])]
    + [
        ([lag_col], [SimpleImputer(), StandardScaler()])
        for lag_col in ('yards_1', 'yards_2')
    ],
    df_out=True,
)

In [17]:
# Fit the mapper on the training rows only, then apply those already-fitted
# transformers to the test rows.
Z_train = mapper.fit_transform(X_train)
Z_test = mapper.transform(X_test)

In [18]:
# Fit a linear regression on the transformed training features.
model = LinearRegression()
model.fit(Z_train, y_train)

LinearRegression()

In [19]:
# In-sample score: evaluated on the same rows the model was fit on.
# NOTE(review): Z_test / y_test are never scored in this notebook section, so
# there is no held-out estimate to compare this number against.
model.score(Z_train, y_train)

0.8270824684377578

### DataFrameMapper (How + Why)

https://github.com/scikit-learn-contrib/sklearn-pandas

```pip install sklearn-pandas```

In [20]:
# One row per position label. get_dummies derives its output columns from
# whatever labels are present in the frame it is given.
demo = pd.DataFrame({'position': ['TE', 'WR', 'RB', 'QB']})
pd.get_dummies(demo)

Unnamed: 0,position_QB,position_RB,position_TE,position_WR
0,0,0,1,0
1,0,0,0,1
2,0,1,0,0
3,1,0,0,0


In [21]:
# A different set of labels yields a different set of dummy columns.
demo_2 = pd.DataFrame.from_dict({'position': ['RB/QB', 'TE/WR', 'FB', 'RB']})
pd.get_dummies(demo_2)

Unnamed: 0,position_FB,position_RB,position_RB/QB,position_TE/WR
0,0,0,1,0
1,0,0,0,1
2,1,0,0,0
3,0,1,0,0


In [22]:
# A single row with a label that appears in none of the frames above: the
# result has only a column for that label.
new = pd.DataFrame({'position': ['🍔']})
pd.get_dummies(new)

Unnamed: 0,position_🍔
0,1


### What to do instead...

In [23]:
X_train.sample(5)

Unnamed: 0,position,yards_1,yards_2
438,QB,345.0,346.0
230,WR,101.0,189.0
97,QB,194.0,284.0
445,RB,34.0,59.0
330,QB,132.0,130.0


In [24]:
# Learn the position vocabulary from the training split, then encode that same
# column as one indicator column per learned class.
lb = LabelBinarizer().fit(X_train['position'])
lb.transform(X_train['position'])

array([[0, 0, 0, 0, 1],
       [0, 0, 1, 0, 0],
       [0, 0, 1, 0, 0],
       ...,
       [0, 1, 0, 0, 0],
       [0, 1, 0, 0, 0],
       [0, 1, 0, 0, 0]])

In [25]:
lb.classes_

array(['FB', 'QB', 'RB', 'TE', 'WR'], dtype='<U2')

In [26]:
new = pd.DataFrame({
    'position': ['🍔']
})

In [27]:
lb.transform(new['position'])

array([[0, 0, 0, 0, 0]])

In [28]:
# Same experiment as the unseen-label case, but with a missing value.
new = pd.DataFrame({
    'position': [None]
})

# Left disabled by the author.
# NOTE(review): presumably LabelBinarizer cannot transform a missing value on
# its own, which would motivate the SimpleImputer step placed in front of it in
# the mapper — confirm by running the line below.
# lb.transform(new['position'])

In [29]:
# Build a fresh, unfitted mapper:
#   position -> fill gaps with the most frequent label, then binarize
#   yards_N  -> fill gaps with SimpleImputer's default strategy, then standardize
mapper = DataFrameMapper(
    [(['position'], [SimpleImputer(strategy="most_frequent"), LabelBinarizer()])]
    + [
        ([lag_col], [SimpleImputer(), StandardScaler()])
        for lag_col in ('yards_1', 'yards_2')
    ],
    df_out=True,
)

In [30]:
mapper.fit(X_train)
mapper.transform(X_train)[:10]

Unnamed: 0,position_FB,position_QB,position_RB,position_TE,position_WR,yards_1,yards_2
2,0,0,0,0,1,-0.63426,-0.575646
7,0,0,1,0,0,0.061314,-0.829461
8,0,0,1,0,0,-0.122241,0.044788
11,0,1,0,0,0,1.471783,0.213998
12,0,1,0,0,0,1.43314,1.417265
15,0,0,0,0,1,-0.6246,-0.942267
16,0,0,0,0,1,-0.982047,-0.622649
20,0,0,0,0,1,-0.672903,-0.171424
21,0,0,0,0,1,-0.566635,-0.669652
31,0,0,0,0,1,-0.817815,-0.707254


But maybe the best part about `mapper` is that you can put it in a pipeline...

In [31]:
from sklearn.pipeline import make_pipeline

# Chain the mapper and the regressor so raw feature frames can be passed
# straight to fit / score / predict without a separate transform step.
pipe = make_pipeline(mapper, model)
pipe.fit(X_train, y_train)

Pipeline(steps=[('dataframemapper',
                 DataFrameMapper(df_out=True, drop_cols=[],
                                 features=[(['position'],
                                            [SimpleImputer(strategy='most_frequent'),
                                             LabelBinarizer()]),
                                           (['yards_1'],
                                            [SimpleImputer(),
                                             StandardScaler()]),
                                           (['yards_2'],
                                            [SimpleImputer(),
                                             StandardScaler()])])),
                ('linearregression', LinearRegression())])

### The Pickle 🥒

In [32]:
import os
import pickle

# Create the output directory first so a fresh checkout (where `pickles/` may
# not exist yet) does not fail with FileNotFoundError.
os.makedirs('pickles', exist_ok=True)

# Persist the fitted pipeline (preprocessing + model) as a single artifact.
with open('pickles/pipe.pkl', 'wb') as f:
    pickle.dump(pipe, f)

In [33]:
del pipe

In [34]:
# Restore the pipeline from disk. pickle.load can execute arbitrary code, so
# only unpickle files from a trusted source (here, the file written earlier in
# this notebook).
with open('pickles/pipe.pkl', 'rb') as f:
    pipe = pickle.load(f)

In [35]:
pipe.score(X_train, y_train)

0.8270824684377578

In [36]:
pipe.predict(X_train)[:10]

array([ 51.1978269 ,  86.8225038 ,  98.79690783, 239.09359958,
       262.97375818,  43.84876891,  38.18722293,  58.3327628 ,
        51.56611004,  42.09295059])

In [37]:
X_train.sample(1).to_dict(orient='list')

{'position': ['QB'], 'yards_1': [253.0], 'yards_2': [374.0]}

In [38]:
# A single unseen observation, using the same three raw columns the pipeline
# was fit on.
new = pd.DataFrame([
    {'position': 'TE', 'yards_1': 15.0, 'yards_2': 33.0},
])

In [39]:
pipe.predict(new)

array([27.41865817])