# Part I

Need data to talk about data, and a model to talk about models...

### The Data

In [1]:
import pandas as pd

# Parse the `date` column by name rather than by position so the load does not
# depend on the CSV's column order. Then order each player's games
# chronologically and reset the index so row labels follow the sorted order.
df = (
    pd.read_csv('data/basketball.csv', parse_dates=['date'])
    .sort_values(['name', 'date'])
    .reset_index(drop=True)
)

In [2]:
# Column names, dtypes and non-null counts of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15345 entries, 0 to 15344
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype         
---  ------    --------------  -----         
 0   name      15345 non-null  object        
 1   position  15345 non-null  object        
 2   minutes   15345 non-null  int64         
 3   points    15345 non-null  int64         
 4   date      15345 non-null  datetime64[ns]
dtypes: datetime64[ns](1), int64(2), object(2)
memory usage: 599.5+ KB


In [3]:
# (rows, columns)
df.shape

(15345, 5)

In [4]:
# Peek at 10 random rows. No random_state is passed, so the rows shown change
# on every run.
df.sample(10)

Unnamed: 0,name,position,minutes,points,date
9363,Lonzo Ball,G,38,18,2021-01-04
4063,Duncan Robinson,F,39,13,2021-03-21
8644,Kevin Porter,G-F,0,0,2020-12-31
6275,Jakob Poeltl,C,25,8,2021-03-19
9696,M. Harrell,F-C,32,17,2021-03-12
13010,Sean McDermott,F,0,0,2021-02-06
14846,Vlatko Cancar,F,0,0,2021-01-05
2137,Clint Capela,C,31,22,2021-02-21
5756,J. McLaughlin,G,0,0,2021-01-05
13398,Svi Mykhailiuk,G-F,26,11,2021-03-11


In [5]:
# First three rows for a single player, to see what one player's history looks like.
df.loc[df['name'].eq('Kawhi Leonard')].head(3)

Unnamed: 0,name,position,minutes,points,date
8205,Kawhi Leonard,F,34,26,2020-12-22
8206,Kawhi Leonard,F,30,21,2020-12-25
8207,Kawhi Leonard,F,0,0,2020-12-27


In [6]:
# Work on an independent copy so adding columns below does not touch `df`.
is_kawhi = df['name'] == 'Kawhi Leonard'
kawhi = df.loc[is_kawhi].copy()

In [7]:
# Lag features: points_1 is the previous row's points, points_2 the row before that.
for lag in (1, 2):
    kawhi[f'points_{lag}'] = kawhi['points'].shift(lag)

In [8]:
# The first row has no lag values and the second has no points_2, because
# shift() leaves NaN where there is no earlier row.
kawhi.head(10)

Unnamed: 0,name,position,minutes,points,date,points_1,points_2
8205,Kawhi Leonard,F,34,26,2020-12-22,,
8206,Kawhi Leonard,F,30,21,2020-12-25,26.0,
8207,Kawhi Leonard,F,0,0,2020-12-27,21.0,26.0
8208,Kawhi Leonard,F,0,0,2020-12-29,0.0,21.0
8209,Kawhi Leonard,F,30,28,2020-12-30,0.0,0.0
8210,Kawhi Leonard,F,38,20,2021-01-01,28.0,0.0
8211,Kawhi Leonard,F,34,15,2021-01-03,20.0,28.0
8212,Kawhi Leonard,F,37,30,2021-01-05,15.0,20.0
8213,Kawhi Leonard,F,34,21,2021-01-06,30.0,15.0
8214,Kawhi Leonard,F,34,24,2021-01-08,21.0,30.0


In [9]:
# Same lag features for every player at once: shifting within each `name`
# group keeps one player's points from spilling into the next player's rows.
points_by_player = df.groupby('name')['points']
df['points_1'] = points_by_player.shift(1)
df['points_2'] = points_by_player.shift(2)

In [10]:
# Rows without both lag values (the start of each player's history) cannot be
# used as examples, so drop them.
lag_columns = ["points_1", "points_2"]
df = df.dropna(subset=lag_columns)

### The Objective

<font color="red">0 to 💯 real quick</font>

Predict \*points\* in the next game based on the points scored in the last two games

In [11]:
# Predict `points` from the player's position and the two lagged point totals.
target = 'points'
feature_columns = ['position', 'points_1', 'points_2']

X = df[feature_columns]
y = df[target]

In [12]:
from sklearn.model_selection import train_test_split

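`train_test_split` on time series data is a little different: pass `shuffle=False` so the rows keep their existing order instead of being randomly shuffled before the split.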

In [13]:
# Keep the rows in their existing order: with shuffle=False the leading ~90% of
# rows become the training set and the trailing ~10% the test set.
# `random_state` is omitted because it only seeds the shuffle, which is
# disabled here, so passing it had no effect.
# NOTE(review): df was sorted by ['name', 'date'] when it was loaded, so this
# tail is the alphabetically-last players rather than the most recent games.
# Confirm that is the intended hold-out.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.10,
    shuffle=False,
)

### The Model

In [14]:
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import LabelBinarizer, StandardScaler
from sklearn.impute import SimpleImputer
# new
from sklearn_pandas import DataFrameMapper

In [15]:
# position: fill gaps with the most frequent label, then binarize into
#           indicator columns.
# lag columns: impute with SimpleImputer's default strategy, then standardize.
# Each column gets its own transformer instances.
lag_features = ['points_1', 'points_2']

mapper = DataFrameMapper(
    [
        (['position'], [SimpleImputer(strategy="most_frequent"), LabelBinarizer()]),
        *[([col], [SimpleImputer(), StandardScaler()]) for col in lag_features],
    ],
    df_out=True,
)

In [16]:
# Fit the transformers on the training rows only, then reuse the fitted mapper
# on the test rows so no test-set statistics leak into preprocessing.
Z_train = mapper.fit_transform(X_train)
Z_test = mapper.transform(X_test)

In [17]:
# Ordinary least-squares regression on the transformed training features.
model = LinearRegression()
model.fit(Z_train, y_train)

LinearRegression()

In [18]:
# R^2 on the training data (not a held-out score).
model.score(Z_train, y_train)

0.4782386705540024

In [19]:
from sklearn.metrics import mean_squared_error

In [20]:
# Root-mean-squared error on the held-out rows, in the same units as `points`.
test_predictions = model.predict(Z_test)
mean_squared_error(y_test, test_predictions) ** 0.5

5.936184414953741

### DataFrameMapper (How + Why)

https://github.com/scikit-learn-contrib/sklearn-pandas

```pip install sklearn-pandas```

In [21]:
# Toy frame with four distinct positions, one-hot encoded with pandas.
demo = pd.DataFrame(['C', 'SF', 'SG', 'PG'], columns=['position'])

pd.get_dummies(demo)

Unnamed: 0,position_C,position_PG,position_SF,position_SG
0,1,0,0,0
1,0,0,1,0
2,0,0,0,1
3,0,1,0,0


In [22]:
# A different set of values yields a different set of dummy columns, because
# get_dummies derives the columns from whatever data it is handed.
demo_2 = pd.DataFrame(['C', 'SF-SG', 'SG', 'C'], columns=['position'])

pd.get_dummies(demo_2)

Unnamed: 0,position_C,position_SF-SG,position_SG
0,1,0,0
1,0,1,0
2,0,0,1
3,1,0,0


In [23]:
# A value that appears in neither demo frame: get_dummies just creates a
# column for it, so the output layout depends entirely on the input.
new = pd.DataFrame({
    'position': ['🍔']
})

pd.get_dummies(new)

Unnamed: 0,position_🍔
0,1


### What to do instead...

In [24]:
# Five random training rows (unseeded, so they differ between runs).
X_train.sample(5)

Unnamed: 0,position,points_1,points_2
1094,G,3.0,2.0
12959,F,17.0,13.0
8557,G-F,13.0,11.0
9973,C,8.0,5.0
12031,G,0.0,0.0


In [25]:
# Learn the set of positions from the training data once, then encode with it.
train_positions = X_train['position']

lb = LabelBinarizer().fit(train_positions)
lb.transform(train_positions)

array([[0, 0, 1, ..., 0, 0, 0],
       [0, 0, 1, ..., 0, 0, 0],
       [0, 0, 1, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 1, 0],
       [0, 0, 0, ..., 0, 1, 0],
       [0, 0, 0, ..., 0, 1, 0]])

In [26]:
# The labels learned during fit; one output column per entry.
lb.classes_

array(['C', 'C-F', 'F', 'F-C', 'F-G', 'G', 'G-F', 'PG'], dtype='<U3')

In [27]:
# A position the binarizer was never fitted on.
new = pd.DataFrame({
    'position': ['🍔']
})

In [28]:
# The unseen label becomes an all-zero row with the same width as the
# training encoding, instead of creating a new column.
lb.transform(new['position'])

array([[0, 0, 0, 0, 0, 0, 0, 0]])

In [29]:
# A missing position value.
new = pd.DataFrame({
    'position': [None]
})

# Deliberately left commented out. NOTE(review): presumably this shows that a
# bare LabelBinarizer cannot handle a missing value; confirm the error raised.
# lb.transform(new['position'])

In [30]:
# Rebuild the mapper with the same column spec as before. Each column's steps
# run in order, so `position` is imputed before it reaches LabelBinarizer.
mapper = DataFrameMapper([
    (['position'], [SimpleImputer(strategy="most_frequent"), LabelBinarizer()]),
    (['points_1'], [SimpleImputer(), StandardScaler()]), 
    (['points_2'], [SimpleImputer(), StandardScaler()]),
], df_out=True)

In [31]:
# Fit on the training frame, then show the first ten transformed rows.
# df_out=True makes the result a DataFrame with named columns.
mapper.fit(X_train)
mapper.transform(X_train).head(10)

Unnamed: 0,position_C,position_C-F,position_F,position_F-C,position_F-G,position_G,position_G-F,position_PG,points_1,points_2
2,0,0,1,0,0,0,0,0,-0.454679,-0.806253
3,0,0,1,0,0,0,0,0,0.128367,-0.455934
4,0,0,1,0,0,0,0,0,-0.804506,0.127931
5,0,0,1,0,0,0,0,0,-0.804506,-0.806253
6,0,0,1,0,0,0,0,0,-0.804506,-0.806253
7,0,0,1,0,0,0,0,0,-0.454679,-0.806253
8,0,0,1,0,0,0,0,0,-0.804506,-0.455934
9,0,0,1,0,0,0,0,0,-0.804506,-0.806253
10,0,0,1,0,0,0,0,0,-0.804506,-0.806253
11,0,0,1,0,0,0,0,0,-0.804506,-0.806253


But, maybe the best part about `mapper` is that you can put it in a pipeline...

In [32]:
from sklearn.pipeline import make_pipeline

# Chain preprocessing and regression into one estimator that accepts the raw
# feature frame; fitting the pipeline fits the mapper and then the model.
pipe = make_pipeline(mapper, model)
pipe.fit(X_train, y_train)

Pipeline(steps=[('dataframemapper',
                 DataFrameMapper(df_out=True,
                                 features=[(['position'],
                                            [SimpleImputer(strategy='most_frequent'),
                                             LabelBinarizer()]),
                                           (['points_1'],
                                            [SimpleImputer(),
                                             StandardScaler()]),
                                           (['points_2'],
                                            [SimpleImputer(),
                                             StandardScaler()])])),
                ('linearregression', LinearRegression())])

### The Pickle 🥒

In [33]:
import pickle
from pathlib import Path

# Create the output directory if it is missing, so a fresh checkout can run
# this cell; this is a no-op when the directory already exists.
Path('pickles').mkdir(parents=True, exist_ok=True)

# Serialize the whole fitted pipeline (preprocessing + model) to disk.
with open('pickles/pipe.pkl', 'wb') as f:
    pickle.dump(pipe, f)

In [34]:
# Drop the in-memory pipeline so the next cell has to restore it from disk.
del pipe

In [35]:
# Restore the pipeline written above. pickle.load can execute arbitrary code,
# so only unpickle files you created yourself or otherwise trust.
with open('pickles/pipe.pkl', 'rb') as f:
    pipe = pickle.load(f)

In [36]:
# Sanity check: the reloaded pipeline scores the raw training frame directly.
pipe.score(X_train, y_train)

0.4782386705540024

In [37]:
# Raw frames go straight in: the pipeline runs the mapper before the regression.
pipe.predict(X_train)[:10]

array([2.88867188, 6.0546875 , 4.09960938, 1.56054688, 1.56054688,
       2.88867188, 2.51367188, 1.56054688, 1.56054688, 1.56054688])

In [38]:
# Show one random row as a dict of lists, the shape used to build `new` below.
X_train.sample(1).to_dict(orient='list')

{'position': ['G'], 'points_1': [0.0], 'points_2': [3.0]}

In [39]:
# A hand-built single-row frame with the same columns the pipeline was fitted on.
new = pd.DataFrame({
    'position': ['F'], 
    'points_1': [9.0], 
    'points_2': [8.0]
})

In [40]:
# Predicted points for the hand-built row.
pipe.predict(new)

array([8.08398438])