# Part I

Need data to talk about data, and a model to talk about models...

### The Data

In [1]:
import pandas as pd

# Load the per-game box scores. The date column is selected by name rather
# than by position, so a CSV with reordered columns fails loudly instead of
# silently parsing the wrong column as dates.
df = pd.read_csv('data/basketball.csv', parse_dates=['date'])
# Order each player's games chronologically; the lag features built later in
# the notebook rely on this ordering within each name.
df = df.sort_values(['name', 'date']).reset_index(drop=True)

In [2]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4790 entries, 0 to 4789
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype         
---  ------    --------------  -----         
 0   name      4790 non-null   object        
 1   position  4790 non-null   object        
 2   minutes   4790 non-null   int64         
 3   points    4790 non-null   int64         
 4   date      4790 non-null   datetime64[ns]
dtypes: datetime64[ns](1), int64(2), object(2)
memory usage: 187.2+ KB


In [3]:
df.shape

(4790, 5)

In [4]:
# Seeded so the preview shows the same ten rows on every Restart & Run All.
df.sample(10, random_state=42)

Unnamed: 0,name,position,minutes,points,date
3958,Rodions Kurucs,F,25,2,2020-08-11
3369,N. Alexander-Walker,G,0,0,2020-07-30
2617,Kevin Hervey,F,0,0,2020-08-18
4275,T.J. Leaf,F,0,0,2020-08-12
3949,Robin Lopez,C,0,0,2020-09-04
808,Danny Green,G-F,23,9,2020-08-05
724,D. Sabonis,F-C,0,0,2020-08-04
1600,Grayson Allen,G,24,11,2020-08-07
2290,Josh Hart,G-F,25,15,2020-08-03
2013,Javonte Green,G-F,0,0,2020-08-21


In [5]:
df[df['name'] == 'Kawhi Leonard'].head(3)

Unnamed: 0,name,position,minutes,points,date
2515,Kawhi Leonard,F,33,28,2020-07-30
2516,Kawhi Leonard,F,27,24,2020-08-01
2517,Kawhi Leonard,F,37,27,2020-08-04


In [6]:
# Independent copy of one player's games, so the lag columns added to it
# afterwards do not touch df.
kawhi = df.loc[df['name'].eq('Kawhi Leonard')].copy()

In [7]:
# Lag features: points from the previous game and from two games back.
# The earliest rows have no history, so their lag values come out as NaN.
kawhi = kawhi.assign(
    points_1=kawhi['points'].shift(1),
    points_2=kawhi['points'].shift(2),
)

In [8]:
kawhi.head(10)

Unnamed: 0,name,position,minutes,points,date,points_1,points_2
2515,Kawhi Leonard,F,33,28,2020-07-30,,
2516,Kawhi Leonard,F,27,24,2020-08-01,28.0,
2517,Kawhi Leonard,F,37,27,2020-08-04,24.0,28.0
2518,Kawhi Leonard,F,38,29,2020-08-06,27.0,24.0
2519,Kawhi Leonard,F,0,0,2020-08-08,29.0,27.0
2520,Kawhi Leonard,F,37,39,2020-08-09,0.0,29.0
2521,Kawhi Leonard,F,33,26,2020-08-12,39.0,0.0
2522,Kawhi Leonard,F,0,0,2020-08-14,26.0,39.0
2523,Kawhi Leonard,F,38,29,2020-08-17,0.0,26.0
2524,Kawhi Leonard,F,41,35,2020-08-19,29.0,0.0


In [9]:
# Same lag features for every player at once. Shifting inside each name group
# keeps one player's points from spilling into the next player's first rows.
points_by_player = df.groupby('name')['points']
df['points_1'] = points_by_player.shift(1)
df['points_2'] = points_by_player.shift(2)

In [10]:
# Drop the rows where either lag is missing, i.e. each player's first two
# games, which have no two-game history. The index is not reset afterwards,
# so the surviving rows keep their original labels.
df = df.dropna(subset=["points_1", "points_2"])

### The Objective

<font color="red">0 to 💯 real quick</font>

Predict \*points\* in the next game based on the points scored in the previous two games

In [11]:
# Target is the current game's points; inputs are the player's position plus
# the two lagged point totals.
target = 'points'
feature_cols = ['position', 'points_1', 'points_2']
X, y = df[feature_cols], df[target]

In [12]:
from sklearn.model_selection import train_test_split

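`train_test_split` on time series data is a little different: pass `shuffle=False` so the rows keep their order and the test set is the tail of the data rather than a random sample. (Note that `df` is sorted by name and then by date, so here the tail is the alphabetically last players rather than the most recent games.)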

In [13]:
# Ordered split: with shuffle=False the rows keep their existing order, so the
# test set is the last 10% of X as currently ordered, not a random sample.
# NOTE(review): df was sorted by ['name', 'date'] when it was loaded, so that
# tail is grouped by player name rather than by calendar date — confirm this
# is the split the time-series remark above intends.
# NOTE(review): random_state presumably has no effect while shuffle=False — confirm.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, 
    test_size=0.10, 
    random_state=42, 
    shuffle=False
)

### The Model

In [14]:
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import LabelBinarizer, StandardScaler
from sklearn.impute import SimpleImputer
# new
from sklearn_pandas import DataFrameMapper

In [15]:
# Preprocessing recipe, one entry per input column:
#   position -> fill gaps with the most frequent value, then binarize the labels
#   points_N -> fill gaps with SimpleImputer's default strategy, then standardize
# Each numeric column gets its own fresh imputer/scaler pair.
# df_out=True makes the mapper hand back a DataFrame with named columns.
mapper = DataFrameMapper(
    [(['position'], [SimpleImputer(strategy="most_frequent"), LabelBinarizer()])]
    + [
        ([lag_col], [SimpleImputer(), StandardScaler()])
        for lag_col in ('points_1', 'points_2')
    ],
    df_out=True,
)

In [16]:
# Fit the mapper on the training split only, then apply that same fitted
# mapper to the test split (transform, not fit_transform), so nothing is
# learned from the held-out rows.
Z_train = mapper.fit_transform(X_train)
Z_test = mapper.transform(X_test)

In [17]:
# Linear regression on the mapped training features. fit() hands back the
# estimator itself, so `model` is bound to the fitted object; the trailing
# bare name keeps the cell's displayed repr.
model = LinearRegression().fit(Z_train, y_train)
model

LinearRegression()

In [18]:
# Score on the same rows the model was fitted on (the training split), so this
# is an in-sample figure; the held-out error is computed separately below.
model.score(Z_train, y_train)

0.4280142918027505

In [19]:
from sklearn.metrics import mean_squared_error

In [20]:
# Root-mean-squared error on the held-out split: square root of the MSE.
test_predictions = model.predict(Z_test)
mean_squared_error(y_test, test_predictions) ** 0.5

5.043097271985561

### DataFrameMapper (How + Why)

https://github.com/scikit-learn-contrib/sklearn-pandas

```pip install sklearn-pandas```

In [21]:
# Toy frame with four distinct positions; get_dummies builds one indicator
# column per distinct value it sees.
demo = pd.DataFrame(['C', 'SF', 'SG', 'PG'], columns=['position'])

pd.get_dummies(demo)

Unnamed: 0,position_C,position_PG,position_SF,position_SG
0,1,0,0,0
1,0,0,1,0
2,0,0,0,1
3,0,1,0,0


In [22]:
# A second toy frame with a different set of values: the dummy columns follow
# whatever values happen to be present, so the layout differs from `demo`.
demo_2 = pd.DataFrame(['C', 'SF-SG', 'SG', 'C'], columns=['position'])

pd.get_dummies(demo_2)

Unnamed: 0,position_C,position_SF-SG,position_SG
0,1,0,0
1,0,1,0
2,0,0,1
3,1,0,0


In [23]:
# A single never-seen value: get_dummies simply invents a brand-new column
# for it, which would not line up with columns produced from other data.
new = pd.DataFrame(['🍔'], columns=['position'])

pd.get_dummies(new)

Unnamed: 0,position_🍔
0,1


### What to do instead...

In [24]:
# Seeded so the preview shows the same five rows on every Restart & Run All.
X_train.sample(5, random_state=42)

Unnamed: 0,position,points_1,points_2
3435,C-F,2.0,5.0
1770,G,31.0,15.0
4158,F-G,0.0,0.0
1275,G-F,20.0,23.0
1542,G,0.0,16.0


In [25]:
# Learn the set of position labels from the training split once; fit() hands
# back the binarizer itself. The same fitted object then encodes the column.
lb = LabelBinarizer().fit(X_train['position'])
lb.transform(X_train['position'])

array([[0, 0, 0, ..., 0, 1, 0],
       [0, 0, 0, ..., 0, 1, 0],
       [0, 0, 0, ..., 0, 1, 0],
       ...,
       [1, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0]])

In [26]:
lb.classes_

array(['C', 'C-F', 'F', 'F-C', 'F-G', 'G', 'G-F'], dtype='<U3')

In [27]:
# One row holding a position label that is absent from the training data.
new = pd.DataFrame(['🍔'], columns=['position'])

In [28]:
# The unseen label does not raise here: the recorded output is a single
# all-zero row with one column per class learned during fit (seven of them),
# so the column layout stays fixed.
lb.transform(new['position'])

array([[0, 0, 0, 0, 0, 0, 0]])

In [29]:
# A frame whose only position value is missing (None).
new = pd.DataFrame({
    'position': [None]
})

# NOTE(review): the call below is left commented out, presumably because the
# bare LabelBinarizer raises on a missing value, which would motivate placing
# an imputer in front of it in the mapper — confirm.
# lb.transform(new['position'])

In [30]:
# Rebuilds the same three-column mapper defined earlier in the notebook,
# rebinding `mapper` to a fresh instance.
# Each column's steps run in list order: the imputer first, then the
# LabelBinarizer (position) or StandardScaler (the two point lags).
# df_out=True makes transform return a DataFrame with named columns.
mapper = DataFrameMapper([
    (['position'], [SimpleImputer(strategy="most_frequent"), LabelBinarizer()]),
    (['points_1'], [SimpleImputer(), StandardScaler()]), 
    (['points_2'], [SimpleImputer(), StandardScaler()]),
], df_out=True)

In [31]:
# Fit on the training split, then preview the first ten transformed rows.
mapper.fit(X_train).transform(X_train).head(10)

Unnamed: 0,position_C,position_C-F,position_F,position_F-C,position_F-G,position_G,position_G-F,points_1,points_2
2,0,0,0,0,0,1,0,-0.594326,-0.8168
3,0,0,0,0,0,1,0,-0.817258,-0.595617
4,0,0,0,0,0,1,0,-0.817258,-0.8168
5,0,0,0,0,0,1,0,-0.817258,-0.8168
6,0,0,0,0,0,1,0,-0.817258,-0.8168
7,0,0,0,0,0,1,0,-0.817258,-0.8168
8,0,0,0,0,0,1,0,-0.371393,-0.8168
9,0,0,0,0,0,1,0,-0.817258,-0.374434
10,0,0,0,0,0,1,0,-0.817258,-0.8168
11,0,0,0,0,0,1,0,-0.817258,-0.8168


But, maybe the best part about `mapper` is that you can put it in a pipeline...

In [32]:
from sklearn.pipeline import make_pipeline

# Chain preprocessing and regression into a single estimator and fit it on the
# raw (unmapped) training frame. fit() hands back the pipeline itself; the
# trailing bare name keeps the cell's displayed repr.
pipe = make_pipeline(mapper, model).fit(X_train, y_train)
pipe

Pipeline(steps=[('dataframemapper',
                 DataFrameMapper(df_out=True,
                                 features=[(['position'],
                                            [SimpleImputer(strategy='most_frequent'),
                                             LabelBinarizer()]),
                                           (['points_1'],
                                            [SimpleImputer(),
                                             StandardScaler()]),
                                           (['points_2'],
                                            [SimpleImputer(),
                                             StandardScaler()])])),
                ('linearregression', LinearRegression())])

### The Pickle 🥒

In [33]:
import pickle

from pathlib import Path

# Make sure the target folder exists first: open(..., 'wb') does not create
# missing parent directories and would raise FileNotFoundError on a fresh
# checkout that has no pickles/ folder yet.
Path('pickles').mkdir(parents=True, exist_ok=True)

# Serialize the whole fitted pipeline (mapper + model) to a single file.
with open('pickles/pipe.pkl', 'wb') as f:
    pickle.dump(pipe, f)

In [34]:
# Remove the in-memory pipeline so that the `pipe` used from here on can only
# be the object restored from the pickle file in the next cell.
del pipe

In [35]:
# Restore the pipeline from the file written above and rebind `pipe` to it.
# NOTE: pickle.load can execute arbitrary code while loading, so only unpickle
# files you trust; this one was produced by this notebook.
with open('pickles/pipe.pkl', 'rb') as f:
    pipe = pickle.load(f)

In [36]:
# Sanity check on the round trip: the restored pipeline takes the raw training
# frame directly, and the recorded score is identical to the one recorded for
# the standalone model on the mapped training features earlier.
pipe.score(X_train, y_train)

0.4280142918027505

In [37]:
pipe.predict(X_train)[:10]

array([3.0546875 , 3.00976562, 2.30761719, 2.30761719, 2.30761719,
       2.30761719, 3.80273438, 3.71289062, 2.30761719, 2.30761719])

In [38]:
# One example row as a {column: [value]} dict, i.e. the shape a new input
# needs in order to be turned back into a one-row DataFrame.
# Seeded so the same row is shown on every Restart & Run All.
X_train.sample(1, random_state=42).to_dict(orient='list')

{'position': ['F-G'], 'points_1': [3.0], 'points_2': [4.0]}

In [39]:
# Hand-built one-row input with the same three columns the pipeline was
# fitted on, written as a single record.
new = pd.DataFrame([{'position': 'F', 'points_1': 9.0, 'points_2': 8.0}])

In [40]:
pipe.predict(new)

array([7.94042969])