# Part I

Need data to talk about data, and a model to talk about models...

### The Data

In [1]:
import pandas as pd

# Load the per-game rows (column index 4, `date`, is parsed as datetimes), then
# order them by player and game date and renumber the index from 0.
df = (
    pd.read_csv('data/basketball.csv', parse_dates=[4])
    .sort_values(by=['name', 'date'])
    .reset_index(drop=True)
)

In [2]:
# Summarize the frame: column names, dtypes, non-null counts and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30431 entries, 0 to 30430
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype         
---  ------    --------------  -----         
 0   name      30431 non-null  object        
 1   position  30431 non-null  object        
 2   minutes   30431 non-null  int64         
 3   points    30431 non-null  int64         
 4   date      30431 non-null  datetime64[ns]
dtypes: datetime64[ns](1), int64(2), object(2)
memory usage: 1.2+ MB


In [3]:
# (number of rows, number of columns) of the loaded frame.
df.shape

(30431, 5)

In [4]:
# Peek at 10 random rows; the fixed seed keeps the displayed sample the same
# every time the notebook is re-run from a fresh kernel.
df.sample(10, random_state=42)

Unnamed: 0,name,position,minutes,points,date
29952,Will Barton,G,38,24,2021-02-08
8443,Dylan Windler,G-F,14,5,2021-02-08
18848,Luguentz Dort,G,0,0,2021-04-14
7366,Derrick Jones,F,0,0,2021-04-21
30031,Will Magnay,C,0,0,2021-03-16
5724,Damion Lee,G-F,8,3,2021-03-26
13044,James Harden,G,40,29,2021-03-03
12770,Jalen Lecque,G,0,0,2021-03-12
12140,JaVale McGee,C-F,0,0,2021-04-29
17021,Kenyon Martin,F,34,13,2021-05-14


In [9]:
# First three rows for a single player, to see what one player's game log looks like.
df.loc[df['name'].eq('LeBron James')].head(3)

Unnamed: 0,name,position,minutes,points,date
18542,LeBron James,F,28,22,2020-12-22
18543,LeBron James,F,31,22,2020-12-25
18544,LeBron James,F,26,18,2020-12-27


In [10]:
# Work on an independent copy so adding columns below does not touch `df`.
is_lebron = df['name'] == 'LeBron James'
lebron = df.loc[is_lebron].copy()

In [11]:
# points_<k> holds the points from k rows (games) earlier; the first k rows get NaN.
for lag in (1, 2):
    lebron[f'points_{lag}'] = lebron['points'].shift(lag)

In [12]:
# Check the last 10 rows: points_1 / points_2 should equal `points` from one and
# two rows earlier.
lebron.tail(10)

Unnamed: 0,name,position,minutes,points,date,points_1,points_2
18594,LeBron James,F,0,0,2021-05-07,0.0,0.0
18595,LeBron James,F,0,0,2021-05-09,0.0,0.0
18596,LeBron James,F,0,0,2021-05-11,0.0,0.0
18597,LeBron James,F,0,0,2021-05-12,0.0,0.0
18598,LeBron James,F,28,24,2021-05-15,0.0,0.0
18599,LeBron James,F,27,25,2021-05-16,24.0,0.0
18600,LeBron James,F,34,22,2021-05-19,25.0,24.0
18601,LeBron James,F,36,18,2021-05-23,22.0,25.0
18602,LeBron James,F,38,23,2021-05-25,18.0,22.0
18603,LeBron James,F,37,21,2021-05-27,23.0,18.0


In [13]:
# Same lag features for every player at once. Shifting within each `name` group
# keeps one player's history from leaking into the next player's first rows.
points_by_player = df.groupby('name')['points']
df = df.assign(
    points_1=points_by_player.shift(1),
    points_2=points_by_player.shift(2),
)

In [14]:
# Each player's first two rows have no complete two-game history (NaN lags); drop them.
lag_columns = ["points_1", "points_2"]
df = df.dropna(subset=lag_columns)

### The Objective

<font color="red">0 to 💯 real quick</font>

Predict \*points\* in the next game based on the points scored in the previous two games

In [15]:
# Response: points in the current game. Predictors: position plus the two lagged scores.
target = 'points'
features = ['position', 'points_1', 'points_2']
X, y = df[features], df[target]

In [16]:
from sklearn.model_selection import train_test_split

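`train_test_split` on time series data is a little different: pass `shuffle=False` so the rows keep their sorted order and the test set is simply the final 10% of rows. (Note that the frame is sorted by `name` and then `date`, so those final rows are the alphabetically last players rather than the most recent games.)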

In [17]:
# Hold out the final 10% of rows in their existing order (no shuffling).
# `random_state` is intentionally not passed: it only seeds the shuffle, so it
# has no effect when shuffle=False and would wrongly suggest randomness.
# NOTE(review): `df` was sorted by ['name', 'date'], so the "final 10%" is the
# alphabetically last players, not the most recent games — confirm this is the
# hold-out you want for a time-series evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.10,
    shuffle=False,
)

### The Model

In [18]:
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import LabelBinarizer, StandardScaler
from sklearn.impute import SimpleImputer
# new
from sklearn_pandas import DataFrameMapper

In [19]:
# Per-column preprocessing recipe. df_out=True makes the mapper return a
# DataFrame rather than a bare array.
mapper = DataFrameMapper(
    features=[
        # categorical: fill gaps with the most frequent position, then binarize
        (['position'], [SimpleImputer(strategy="most_frequent"), LabelBinarizer()]),
        # numeric lags: SimpleImputer's default strategy, then standardize
        (['points_1'], [SimpleImputer(), StandardScaler()]),
        (['points_2'], [SimpleImputer(), StandardScaler()]),
    ],
    df_out=True,
)

In [20]:
# Learn the preprocessing parameters from the training rows only...
Z_train = mapper.fit_transform(X_train)
# ...and reuse the already-fitted transformers on the held-out rows (no refit).
Z_test = mapper.transform(X_test)

In [21]:
# Ordinary least-squares regression on the preprocessed training features.
model = LinearRegression()
model.fit(Z_train, y_train)

LinearRegression()

In [22]:
# R^2 on the training data (an in-sample score, not a hold-out score).
model.score(Z_train, y_train)

0.4496993463125166

In [23]:
from sklearn.metrics import mean_squared_error

In [24]:
# Root-mean-squared error on the held-out rows, in the same units as `points`.
test_mse = mean_squared_error(y_test, model.predict(Z_test))
test_mse ** 0.5

6.09868516568878

### DataFrameMapper (How + Why)

https://github.com/scikit-learn-contrib/sklearn-pandas

```pip install sklearn-pandas```

In [25]:
# Toy frame with four distinct positions; get_dummies creates one indicator
# column per value it happens to see.
demo = pd.DataFrame(['C', 'SF', 'SG', 'PG'], columns=['position'])

pd.get_dummies(demo)

Unnamed: 0,position_C,position_PG,position_SF,position_SG
0,1,0,0,0
1,0,0,1,0
2,0,0,0,1
3,0,1,0,0


In [26]:
# A different set of values yields a different set of dummy columns, so the
# encoded layout is not stable from one frame to the next.
demo_2 = pd.DataFrame(['C', 'SF-SG', 'SG', 'C'], columns=['position'])

pd.get_dummies(demo_2)

Unnamed: 0,position_C,position_SF-SG,position_SG
0,1,0,0
1,0,1,0
2,0,0,1
3,1,0,0


In [27]:
# A never-before-seen value simply becomes its own brand-new column.
new = pd.DataFrame([{'position': '🍔'}])

pd.get_dummies(new)

Unnamed: 0,position_🍔
0,1


### What to do instead...

In [28]:
# Five random training rows; the fixed seed keeps the displayed sample stable
# across re-runs.
X_train.sample(5, random_state=42)

Unnamed: 0,position,points_1,points_2
1555,G,0.0,6.0
9325,F-C,0.0,0.0
23538,G,23.0,6.0
6949,C,0.0,0.0
26833,G,12.0,16.0


In [29]:
# Fit the binarizer on the training positions once, then encode those same rows.
train_positions = X_train['position']
lb = LabelBinarizer().fit(train_positions)
lb.transform(train_positions)

array([[0, 0, 1, ..., 0, 0, 0],
       [0, 0, 1, ..., 0, 0, 0],
       [0, 0, 1, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 1, 0],
       [0, 0, 0, ..., 0, 1, 0],
       [0, 0, 0, ..., 0, 1, 0]])

In [30]:
# The labels learned during fit.
lb.classes_

array(['C', 'C-F', 'F', 'F-C', 'F-G', 'G', 'G-F', 'PG'], dtype='<U3')

In [31]:
# One row holding a position label that is not among the fitted classes.
new = pd.DataFrame(data=['🍔'], columns=['position'])

In [32]:
# An unseen label is encoded as an all-zero row with the same width as the
# fitted classes; no new column is created.
lb.transform(new['position'])

array([[0, 0, 0, 0, 0, 0, 0, 0]])

In [33]:
# One row whose position is missing.
new = pd.DataFrame({
    'position': [None]
})

# NOTE(review): the call below is left commented out, presumably because the
# binarizer on its own cannot handle a missing value — confirm by running it.
# lb.transform(new['position'])

In [34]:
# Impute first so missing values are filled before the encoder / scaler runs.
# Every column gets its own transformer instances.
position_steps = [SimpleImputer(strategy="most_frequent"), LabelBinarizer()]
feature_steps = [
    (['position'], position_steps),
    (['points_1'], [SimpleImputer(), StandardScaler()]),
    (['points_2'], [SimpleImputer(), StandardScaler()]),
]
mapper = DataFrameMapper(feature_steps, df_out=True)

In [35]:
# Fit on the training rows, then show the first ten encoded rows.
mapper.fit(X_train)
train_encoded = mapper.transform(X_train)
train_encoded[:10]

Unnamed: 0,position_C,position_C-F,position_F,position_F-C,position_F-G,position_G,position_G-F,position_PG,points_1,points_2
2,0,0,1,0,0,0,0,0,-0.457586,-0.808254
3,0,0,1,0,0,0,0,0,0.125546,-0.458368
4,0,0,1,0,0,0,0,0,-0.807464,0.124775
5,0,0,1,0,0,0,0,0,-0.807464,-0.808254
6,0,0,1,0,0,0,0,0,-0.807464,-0.808254
7,0,0,1,0,0,0,0,0,-0.457586,-0.808254
8,0,0,1,0,0,0,0,0,-0.807464,-0.458368
9,0,0,1,0,0,0,0,0,-0.807464,-0.808254
10,0,0,1,0,0,0,0,0,-0.807464,-0.808254
11,0,0,1,0,0,0,0,0,-0.807464,-0.808254


But, maybe the best part about `mapper` is that you can put it in a pipeline...

In [36]:
from sklearn.pipeline import make_pipeline

# Chain preprocessing and regression so raw feature frames can be passed straight in.
pipe = make_pipeline(mapper, model)
# NOTE(review): presumably the pipeline holds the same `mapper` and `model`
# objects rather than copies, so this call refits both of them — confirm.
pipe.fit(X_train, y_train)

Pipeline(steps=[('dataframemapper',
                 DataFrameMapper(df_out=True,
                                 features=[(['position'],
                                            [SimpleImputer(strategy='most_frequent'),
                                             LabelBinarizer()]),
                                           (['points_1'],
                                            [SimpleImputer(),
                                             StandardScaler()]),
                                           (['points_2'],
                                            [SimpleImputer(),
                                             StandardScaler()])])),
                ('linearregression', LinearRegression())])

### The Pickle 🥒

In [37]:
import pickle
from pathlib import Path

# Make sure the target directory exists first: opening the file for writing
# raises FileNotFoundError when 'pickles/' is missing (e.g. on a fresh checkout).
pickle_path = Path('pickles/pipe.pkl')
pickle_path.parent.mkdir(parents=True, exist_ok=True)

# Persist the whole fitted pipeline (preprocessing + model) as one artifact.
with open(pickle_path, 'wb') as f:
    pickle.dump(pipe, f)

In [38]:
# Remove the in-memory pipeline so the next cell has to restore it from disk.
del pipe

In [39]:
# Restore the pipeline from disk. Only unpickle files you created yourself:
# pickle.load can execute arbitrary code contained in an untrusted file.
with open('pickles/pipe.pkl', mode='rb') as pickle_file:
    pipe = pickle.load(pickle_file)

In [40]:
# Training-set score of the reloaded pipeline, called on the raw (unmapped) features.
pipe.score(X_train, y_train)

0.4496993463125166

In [41]:
# First ten predictions, made directly from raw feature rows.
pipe.predict(X_train)[:10]

array([2.9921875 , 6.078125  , 4.16796875, 1.6953125 , 1.6953125 ,
       2.9921875 , 2.62109375, 1.6953125 , 1.6953125 , 1.6953125 ])

In [42]:
# Show one training row as a dict of lists, the shape needed to hand-build a new
# input frame. The fixed seed keeps the displayed example stable across re-runs.
X_train.sample(1, random_state=42).to_dict(orient='list')

{'position': ['F-G'], 'points_1': [25.0], 'points_2': [16.0]}

In [43]:
# A single hand-written observation using the same columns as X_train.
new = pd.DataFrame([
    {'position': 'F', 'points_1': 9.0, 'points_2': 8.0},
])

In [44]:
# Predict points for the hand-built row; the pipeline applies the preprocessing itself.
pipe.predict(new)

array([8.05859375])