# Part I

Need data to talk about data, and a model to talk about models...

### The Data

In [1]:
import pandas as pd

# Load the game log and order it by player, then chronologically within each
# player, so that row-wise shifts later on line up with "previous game".
# The date column is selected by name rather than by position so the parse
# does not silently break if the CSV column order changes.
df = (
    pd.read_csv('data/basketball.csv', parse_dates=['date'])
    .sort_values(['name', 'date'])
    .reset_index(drop=True)
)

In [2]:
# Check column dtypes and null counts; `date` should show up as datetime64.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27649 entries, 0 to 27648
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype         
---  ------    --------------  -----         
 0   name      27649 non-null  object        
 1   position  27649 non-null  object        
 2   points    27649 non-null  int64         
 3   date      27649 non-null  datetime64[ns]
dtypes: datetime64[ns](1), int64(1), object(2)
memory usage: 864.2+ KB


In [3]:
# (rows, columns) of the raw game log.
df.shape

(27649, 4)

In [4]:
# Peek at 10 random rows. No random_state is passed, so the rows shown
# differ from run to run.
df.sample(10)

Unnamed: 0,name,position,points,date
19176,Maurice Harkless,PF,2,2021-11-22
14471,Josh Christopher,SG,6,2022-01-14
27493,Zach LaVine,SG,10,2022-01-11
17758,Lauri Markkanen,SF,15,2022-01-07
13066,Jayson Tatum,SF,22,2022-02-16
27440,Zach Collins,C,4,2022-03-09
4865,Damion Lee,SG,14,2021-12-18
5819,De'Anthony Melton,PG,13,2021-12-08
8866,Fred VanVleet,PG,16,2021-11-15
6341,Delon Wright,PG,2,2021-12-13


In [5]:
# All rows for a single player; they are already in date order because the
# frame was sorted by ['name', 'date'] when it was loaded.
df[df['name'] == 'LeBron James']

Unnamed: 0,name,position,points,date
17792,LeBron James,SF,34,2021-10-19
17793,LeBron James,SF,25,2021-10-22
17794,LeBron James,SF,19,2021-10-24
17795,LeBron James,SF,26,2021-10-29
17796,LeBron James,PF,15,2021-10-31
17797,LeBron James,PF,30,2021-11-02
17798,LeBron James,PF,23,2021-11-19
17799,LeBron James,SF,10,2021-11-21
17800,LeBron James,PF,39,2021-11-24
17801,LeBron James,SF,30,2021-11-26


In [6]:
# Independent copy of one player's rows so that new columns can be added
# without touching `df`.
is_lebron = df['name'] == 'LeBron James'
james = df.loc[is_lebron].copy()

In [7]:
# Lag features for one player: points scored one game ago and two games ago.
# The first rows have no earlier game to look back to, so they come out NaN.
james = james.assign(
    points_1=james['points'].shift(1),
    points_2=james['points'].shift(2),
)

In [8]:
# Sanity-check the lags: points_1 should equal the previous row's points and
# points_2 the points from two rows back.
james.tail(10)

Unnamed: 0,name,position,points,date,points_1,points_2
17838,LeBron James,C,23,2022-03-09,56.0,26.0
17839,LeBron James,C,50,2022-03-11,23.0,56.0
17840,LeBron James,C,31,2022-03-13,50.0,23.0
17841,LeBron James,C,30,2022-03-14,31.0,50.0
17842,LeBron James,PF,19,2022-03-16,30.0,31.0
17843,LeBron James,PF,36,2022-03-18,19.0,30.0
17844,LeBron James,C,38,2022-03-19,36.0,19.0
17845,LeBron James,SF,38,2022-03-21,38.0,36.0
17846,LeBron James,SF,39,2022-03-27,38.0,38.0
17847,LeBron James,SF,38,2022-04-01,39.0,38.0


In [9]:
# Same lag features for every player at once. Shifting inside a per-name
# group keeps one player's games from leaking into the next player's rows.
points_by_player = df.groupby('name')['points']
df['points_1'] = points_by_player.shift(1)
df['points_2'] = points_by_player.shift(2)

In [10]:
# Keep only rows where both lag features exist (each player's first two
# games have nothing to look back to).
lag_columns = ["points_1", "points_2"]
has_both_lags = df[lag_columns].notna().all(axis=1)
df = df[has_both_lags]

In [11]:
# Random rows after the drop: every row should now carry both lag values.
# Unseeded, so the sample changes between runs.
df.sample(10)

Unnamed: 0,name,position,points,date,points_1,points_2
10912,Isaiah Stewart II,C,6,2022-01-06,9.0,7.0
4672,Daishen Nix,SF,2,2022-03-18,1.0,8.0
17091,Kyle Anderson,PF,13,2021-10-30,8.0,6.0
22816,Ricky Rubio,PG,15,2021-12-05,7.0,6.0
3444,Carmelo Anthony,PF,4,2022-03-29,3.0,20.0
25455,Tobias Harris,PF,17,2022-03-27,12.0,20.0
21881,Patty Mills,SG,21,2021-11-17,8.0,29.0
9937,Grant Williams,PF,0,2022-01-28,8.0,10.0
4398,Corey Kispert,SF,5,2021-11-29,4.0,3.0
4812,Damian Jones,C,19,2022-04-10,12.0,22.0


### The Objective

<font color="red">0 to 💯 real quick</font>

Predict *points* for the next game based on the points scored in the previous two games.

In [12]:
# Target: points scored in a game. Features: the player's position plus the
# points scored in their two preceding games.
target = 'points'
feature_columns = ['position', 'points_1', 'points_2']
X = df.loc[:, feature_columns]
y = df.loc[:, target]

In [13]:
from sklearn.model_selection import train_test_split

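`train_test_split` on time series data is a little different: pass `shuffle=False` so the rows keep their existing order instead of being shuffled at random. Keep in mind that `df` was sorted by `name` and then `date`, so the last 10% of rows are the last players alphabetically, not the most recent games.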

In [14]:
# Hold out the final 10% of rows without shuffling. `random_state` is omitted
# on purpose: it only seeds the shuffle, so it has no effect when
# shuffle=False.
# NOTE(review): `df` is ordered by ['name', 'date'], so "final 10%" means the
# players that sort last by name, not the latest games. Re-sort X and y by
# date before this call if a truly chronological hold-out is wanted.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.10,
    shuffle=False
)

### The Model

In [15]:
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import LabelBinarizer, StandardScaler
from sklearn.impute import SimpleImputer
# new
from sklearn_pandas import DataFrameMapper

In [16]:
# Column -> transformer-chain mapping:
#   position            : fill gaps with the most frequent label, then one-hot
#   points_1 / points_2 : fill gaps with the imputer default, then standardize
# Each numeric column gets its own imputer/scaler instances.
numeric_columns = ['points_1', 'points_2']
mapper = DataFrameMapper(
    [(['position'], [SimpleImputer(strategy="most_frequent"), LabelBinarizer()])]
    + [([column], [SimpleImputer(), StandardScaler()]) for column in numeric_columns],
    df_out=True,
)

In [17]:
# Fit the mapper on the training rows only, then apply those already-fitted
# transformers to the test rows so no test information is used for fitting.
Z_train = mapper.fit_transform(X_train)
Z_test = mapper.transform(X_test)

In [18]:
# Plain linear regression on the transformed training features.
model = LinearRegression()
model.fit(Z_train, y_train)

In [19]:
# R^2 on the *training* data.
# NOTE(review): Z_test / y_test are not scored anywhere in this part of the
# notebook, so this is not a held-out estimate.
model.score(Z_train, y_train)

0.4159737343809118

### DataFrameMapper (How + Why)

https://github.com/scikit-learn-contrib/sklearn-pandas

```pip install sklearn-pandas```

In [20]:
# Toy frame with four distinct positions.
demo = pd.DataFrame(['SF', 'PG', 'SG', 'C'], columns=['position'])

# get_dummies derives its output columns from whatever labels are present.
pd.get_dummies(demo)

Unnamed: 0,position_C,position_PG,position_SF,position_SG
0,0,0,1,0
1,0,1,0,0
2,0,0,0,1
3,1,0,0,0


In [21]:
# Same idea with compound labels: each distinct string becomes its own column.
demo_2 = pd.DataFrame(['SF/SF', 'PG', 'SG', 'C/PG'], columns=['position'])

pd.get_dummies(demo_2)

Unnamed: 0,position_C/PG,position_PG,position_SF/SF,position_SG
0,0,0,1,0
1,0,1,0,0
2,0,0,0,1
3,1,0,0,0


In [22]:
# A single row with a label never seen before: get_dummies has no memory of
# earlier frames, so the only output column is the one for this label.
new = pd.DataFrame(['🍔'], columns=['position'])

pd.get_dummies(new)

Unnamed: 0,position_🍔
0,1


### What to do instead...

In [23]:
# Reminder of what the raw training features look like (unseeded sample).
X_train.sample(5)

Unnamed: 0,position,points_1,points_2
1507,PG,5.0,11.0
8218,C,5.0,0.0
9819,SF,22.0,21.0
2924,SG,19.0,26.0
23213,PF,12.0,8.0


In [24]:
# Unlike get_dummies, LabelBinarizer learns its label set once at fit time
# and reuses that fixed set for every later transform.
positions_train = X_train['position']
lb = LabelBinarizer().fit(positions_train)
lb.transform(positions_train)

array([[0, 1, 0, 0, 0],
       [0, 1, 0, 0, 0],
       [0, 1, 0, 0, 0],
       ...,
       [0, 0, 1, 0, 0],
       [0, 0, 1, 0, 0],
       [0, 0, 0, 0, 1]])

In [25]:
# Labels learned during fit; one output column per entry.
lb.classes_

array(['C', 'PF', 'PG', 'SF', 'SG'], dtype='<U2')

In [26]:
# One row whose label was not among the positions the binarizer was fit on.
new = pd.DataFrame(['🍔'], columns=['position'])

In [27]:
# An unseen label encodes as an all-zero row: same width as the training
# output, and no error is raised.
lb.transform(new['position'])

array([[0, 0, 0, 0, 0]])

In [28]:
# A missing label is a different story: the binarizer cannot handle None and
# raises. The error is the point of this cell, so it is caught and printed;
# that way Restart & Run All can continue past the demonstration.
new = pd.DataFrame({
    'position': [None]
})

try:
    lb.transform(new['position'])
except ValueError as exc:
    print(f"{type(exc).__name__}: {exc}")

ValueError: The type of target data is not known

In [29]:
# Show the raw training features again before pushing them through the mapper.
X_train

Unnamed: 0,position,points_1,points_2
2,PF,8.0,12.0
3,PF,12.0,8.0
4,PF,20.0,12.0
5,PF,13.0,20.0
6,PF,12.0,13.0
...,...,...,...
24863,SG,15.0,18.0
24864,PG,10.0,15.0
24865,PG,5.0,10.0
24866,PG,10.0,5.0


In [30]:
# Fresh, unfitted mapper. Imputing before encoding/scaling is what lets the
# pipeline cope with missing values:
#   position            : most-frequent imputation, then one-hot
#   points_1 / points_2 : default imputation, then standardization
# Each numeric column gets its own imputer/scaler instances.
numeric_columns = ['points_1', 'points_2']
mapper = DataFrameMapper(
    [(['position'], [SimpleImputer(strategy="most_frequent"), LabelBinarizer()])]
    + [([column], [SimpleImputer(), StandardScaler()]) for column in numeric_columns],
    df_out=True,
)

In [31]:
# Fit on the training features, then preview the first ten transformed rows.
# df_out=True makes the mapper return a DataFrame with named columns.
mapper.fit(X_train)
transformed_train = mapper.transform(X_train)
transformed_train.iloc[:10]

Unnamed: 0,position_C,position_PF,position_PG,position_SF,position_SG,points_1,points_2
2,0,1,0,0,0,-0.309233,0.158553
3,0,1,0,0,0,0.156752,-0.307691
4,0,1,0,0,0,1.088724,0.158553
5,0,1,0,0,0,0.273249,1.091039
6,0,1,0,0,0,0.156752,0.275114
7,0,1,0,0,0,0.506242,0.158553
8,0,1,0,0,0,-0.192737,0.508235
9,0,1,0,0,0,-0.192737,-0.19113
10,0,0,0,1,0,-0.07624,-0.19113
11,0,0,0,1,0,-0.309233,-0.074569


But, maybe the best part about `mapper` is that you can put it in a pipeline...

In [32]:
from sklearn.pipeline import make_pipeline

# Chain preprocessing and regression into a single estimator so that raw
# feature frames can be passed straight to fit / score / predict.
pipe = make_pipeline(mapper, model)
pipe.fit(X_train, y_train)

### The Pickle 🥒

In [33]:
import pickle

import os

# Make sure the target folder exists so this cell also works on a fresh
# checkout, then serialize the whole fitted pipeline (mapper + model).
os.makedirs('pickles', exist_ok=True)
with open('pickles/pipe.pkl', 'wb') as f:
    pickle.dump(pipe, f)

In [34]:
# Drop the in-memory pipeline so the next cell has to restore it from disk.
del pipe

In [35]:
# Restore the fitted pipeline from the file written above.
# Only unpickle files you created yourself: loading a pickle can execute
# arbitrary code.
with open('pickles/pipe.pkl', mode='rb') as pickle_file:
    pipe = pickle.load(pickle_file)

In [36]:
# Training R^2 from the reloaded pipeline. It takes the raw features
# directly because the mapper is part of the pipeline.
pipe.score(X_train, y_train)

0.4159737343809118

In [37]:
# First ten predictions for the training rows.
pipe.predict(X_train)[:10]

array([10.04354858, 10.19219971, 14.61071777, 14.69403076, 11.90939331,
       12.70773315, 11.45446777,  9.39385986,  9.87091064,  9.45315552])

In [38]:
# One random training row as a dict of lists: a handy template for building
# a new input frame by hand (unseeded, so the row varies between runs).
X_train.sample(1).to_dict(orient='list')

{'position': ['PG'], 'points_1': [17.0], 'points_2': [10.0]}

In [39]:
# Hand-built single-row input with the same columns as X_train.
new = pd.DataFrame([
    {'position': 'SF', 'points_1': 12.0, 'points_2': 16.0},
])

In [40]:
# Predicted points for the single hand-built row.
pipe.predict(new)

array([13.03616333])