# Part I

Need data to talk about data, and a model to talk about models...

### The Data

In [1]:
import pandas as pd

# Load the game log; parse_dates=[3] parses the fourth column (`date`) as datetimes.
# Sorting by player and then date puts each player's games in chronological
# order, and the index is rebuilt so it follows the sorted row order.
df = (
    pd.read_csv('data/basketball.csv', parse_dates=[3])
    .sort_values(['name', 'date'])
    .reset_index(drop=True)
)

In [2]:
# Summarise column names, dtypes, non-null counts and memory usage of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28050 entries, 0 to 28049
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype         
---  ------    --------------  -----         
 0   name      28050 non-null  object        
 1   position  28050 non-null  object        
 2   points    28050 non-null  int64         
 3   date      28050 non-null  datetime64[ns]
dtypes: datetime64[ns](1), int64(1), object(2)
memory usage: 876.7+ KB


In [3]:
# (rows, columns) of the full dataset.
df.shape

(28050, 4)

In [4]:
# Peek at 10 randomly chosen rows. No random_state is passed, so the rows shown
# will differ each time the cell is run.
df.sample(10)

Unnamed: 0,name,position,points,date
8658,Evan Mobley,PF,4,2022-01-31
10834,Isaiah Hartenstein,C,6,2022-01-31
18898,Malik Monk,SG,28,2022-03-29
7716,Draymond Green,PF,14,2021-10-30
11634,Jaden McDaniels,PF,2,2021-11-20
24139,Scottie Barnes,PF,27,2022-01-21
15476,Justin Holiday,SF,2,2021-11-26
4279,Cody Martin,PG,12,2021-12-08
10763,Isaac Okoro,SF,11,2022-02-02
9935,Gordon Hayward,SF,21,2021-11-08


In [5]:
# Show every row for a single player; because df is sorted by name then date,
# these rows appear in chronological order.
df[df['name'] == 'LeBron James']

Unnamed: 0,name,position,points,date
18039,LeBron James,SF,34,2021-10-19
18040,LeBron James,SF,25,2021-10-22
18041,LeBron James,SF,19,2021-10-24
18042,LeBron James,SF,26,2021-10-29
18043,LeBron James,PF,15,2021-10-31
18044,LeBron James,PF,30,2021-11-02
18045,LeBron James,PF,23,2021-11-19
18046,LeBron James,SF,10,2021-11-21
18047,LeBron James,PF,39,2021-11-24
18048,LeBron James,SF,30,2021-11-26


In [6]:
# Take an independent copy of one player's rows so columns can be added to it
# without touching `df`.
is_lebron = df['name'] == 'LeBron James'
james = df.loc[is_lebron].copy()

In [7]:
# Lag features: the points from one and two rows (games) earlier.
# The first rows have no earlier game, so shift() leaves NaN there.
for lag in (1, 2):
    james[f'points_{lag}'] = james['points'].shift(lag)

In [8]:
# Sanity check on the last 10 games: points_1 / points_2 should equal the
# `points` value one / two rows above.
james.tail(10)

Unnamed: 0,name,position,points,date,points_1,points_2
18085,LeBron James,C,23,2022-03-09,56.0,26.0
18086,LeBron James,C,50,2022-03-11,23.0,56.0
18087,LeBron James,C,31,2022-03-13,50.0,23.0
18088,LeBron James,C,30,2022-03-14,31.0,50.0
18089,LeBron James,PF,19,2022-03-16,30.0,31.0
18090,LeBron James,PF,36,2022-03-18,19.0,30.0
18091,LeBron James,C,38,2022-03-19,36.0,19.0
18092,LeBron James,SF,38,2022-03-21,38.0,36.0
18093,LeBron James,SF,39,2022-03-27,38.0,38.0
18094,LeBron James,SF,38,2022-04-01,39.0,38.0


In [9]:
# Build the same lag features for every player. Shifting within each `name`
# group keeps one player's games from spilling into the next player's rows.
df = df.assign(
    points_1=df.groupby('name')['points'].shift(1),
    points_2=df.groupby('name')['points'].shift(2),
)

In [10]:
# Keep only rows where both lag features are present; rows missing either one
# (no earlier game to shift from) are discarded.
lag_columns = ["points_1", "points_2"]
has_history = df[lag_columns].notna().all(axis=1)
df = df[has_history]

In [11]:
# Random spot check of the frame now that points_1 / points_2 exist and rows
# without both lags have been dropped (unseeded, so rows vary per run).
df.sample(10)

Unnamed: 0,name,position,points,date,points_1,points_2
13576,Jevon Carter,PG,0,2021-11-27,6.0,6.0
9274,Garrett Temple,SG,11,2022-02-14,0.0,3.0
2125,Bojan Bogdanovic,SF,15,2022-02-02,23.0,13.0
21630,Onyeka Okongwu,C,6,2022-04-08,8.0,0.0
17613,LaMarcus Aldridge,C,15,2021-11-14,7.0,21.0
12302,Jalen Suggs,PG,16,2022-01-15,12.0,17.0
10855,Isaiah Hartenstein,C,14,2022-03-22,4.0,12.0
8158,Dylan Windler,SG,8,2021-11-12,3.0,9.0
11554,JaVale McGee,C,16,2021-12-29,6.0,8.0
5124,Danilo Gallinari,PF,16,2021-11-20,8.0,10.0


### The Objective

<font color="red">0 to 💯 real quick</font>

Predict \*points\* for the next game based on the points scored in the last two games

In [12]:
# Target: points in the current game. Features: the player's position plus the
# points from the previous two games.
target = 'points'
feature_columns = ['position', 'points_1', 'points_2']

X = df[feature_columns]
y = df[target]

In [13]:
from sklearn.model_selection import train_test_split

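`train_test_split` on time series data is a little different: pass `shuffle=False` so the rows are split in their existing order instead of at random. Note that `df` is sorted by `name` and then `date`, so the last 10% of rows is the tail of the alphabetical player list, not the most recent games.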

In [14]:
# shuffle=False keeps the existing row order: the first 90% of rows become the
# training set and the last 10% the test set. random_state is omitted because
# it only seeds the shuffling step and therefore has no effect here.
# NOTE(review): df is sorted by name and then date, so this holdout is the tail
# of the alphabetical player list rather than the most recent games — confirm
# that this is the intended split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.10,
    shuffle=False,
)

### The Model

In [15]:
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import LabelBinarizer, StandardScaler
from sklearn.impute import SimpleImputer
# new
from sklearn_pandas import DataFrameMapper

In [16]:
# Per-column preprocessing:
#   position            -> fill missing with the most frequent value, then one-hot encode
#   points_1 / points_2 -> fill missing with SimpleImputer's default strategy, then standardise
# Each column gets its own transformer instances. df_out=True returns a DataFrame.
mapper = DataFrameMapper(
    [
        (['position'], [SimpleImputer(strategy="most_frequent"), LabelBinarizer()]),
        *[
            ([lag_column], [SimpleImputer(), StandardScaler()])
            for lag_column in ('points_1', 'points_2')
        ],
    ],
    df_out=True,
)

In [17]:
# Fit the preprocessing on the training rows only, then reuse those fitted
# transformers on the test rows so no test-set statistics leak into training.
Z_train = mapper.fit_transform(X_train)
Z_test = mapper.transform(X_test)

In [18]:
# Ordinary least-squares regression on the transformed training features.
model = LinearRegression()
model.fit(Z_train, y_train)

In [19]:
# R^2 of the fitted model on the training data.
# NOTE(review): this is a training score; Z_test / y_test are not evaluated in
# the visible cells.
model.score(Z_train, y_train)

0.4183361100675499

### DataFrameMapper (How + Why)

https://github.com/scikit-learn-contrib/sklearn-pandas

```pip install sklearn-pandas```

In [20]:
# Toy frame with four distinct positions, to show what pd.get_dummies produces:
# one indicator column per value it sees in *this* frame.
demo = pd.Series(['SF', 'PG', 'SG', 'C'], name='position').to_frame()

pd.get_dummies(demo)

Unnamed: 0,position_C,position_PG,position_SF,position_SG
0,0,0,1,0
1,0,1,0,0
2,0,0,0,1
3,1,0,0,0


In [21]:
# Same idea with different raw values: get_dummies derives its columns from
# whatever strings are present, so the output columns change with the input.
demo_2 = pd.DataFrame(['SF/SF', 'PG', 'SG', 'C/PG'], columns=['position'])

pd.get_dummies(demo_2)

Unnamed: 0,position_C/PG,position_PG,position_SF/SF,position_SG
0,0,0,1,0
1,0,1,0,0
2,0,0,0,1
3,1,0,0,0


In [22]:
# A single row with a never-before-seen value: get_dummies simply emits a
# column for it, so the output no longer lines up with the columns a model
# was trained on.
new = pd.DataFrame([{'position': '🍔'}])

pd.get_dummies(new)

Unnamed: 0,position_🍔
0,1


### What to do instead...

In [23]:
# Reminder of what the raw training features look like (5 random rows, unseeded).
X_train.sample(5)

Unnamed: 0,position,points_1,points_2
3147,SG,0.0,0.0
14726,PG,15.0,14.0
10041,PF,7.0,8.0
8780,SG,7.0,10.0
6310,C,16.0,9.0


In [24]:
# Learn the set of positions from the training data once, then encode with
# that fixed vocabulary: one indicator column per learned class.
train_positions = X_train['position']
lb = LabelBinarizer().fit(train_positions)
lb.transform(train_positions)

array([[0, 1, 0, 0, 0],
       [0, 1, 0, 0, 0],
       [0, 1, 0, 0, 0],
       ...,
       [0, 0, 1, 0, 0],
       [0, 0, 1, 0, 0],
       [0, 0, 1, 0, 0]])

In [25]:
# The classes learned during fit; their order is the column order of the
# encoded output.
lb.classes_

array(['C', 'PF', 'PG', 'SF', 'SG'], dtype='<U2')

In [26]:
# One-row frame holding a position value that was not in the training data.
new = pd.DataFrame.from_dict({'position': ['🍔']})

In [27]:
# Unlike get_dummies, the fitted binarizer keeps its learned columns: an unseen
# label comes back as an all-zero row instead of creating a new column.
lb.transform(new['position'])

array([[0, 0, 0, 0, 0]])

In [28]:
new = pd.DataFrame({
    'position': [None]
})

# A missing value is different from an unseen label: LabelBinarizer refuses it
# with a ValueError (which is why the mapper imputes `position` first).
# The error is the point of this cell, so catch it and print it; that way the
# notebook still runs top to bottom under Restart & Run All.
try:
    lb.transform(new['position'])
except ValueError as err:
    print(f"{type(err).__name__}: {err}")

ValueError: The type of target data is not known

In [30]:
# Display the raw training features (pandas truncates the middle rows).
X_train

Unnamed: 0,position,points_1,points_2
2,PF,8.0,12.0
3,PF,12.0,8.0
4,PF,20.0,12.0
5,PF,13.0,20.0
6,PF,12.0,13.0
...,...,...,...
25224,SG,4.0,4.0
25225,SG,6.0,4.0
25226,PG,5.0,6.0
25227,PG,6.0,5.0


In [31]:
# Build a new, unfitted mapper with the same column spec as the one used for
# the first model: impute + one-hot encode `position`, impute + standardise
# each lag column. df_out=True makes transform() return a DataFrame.
position_step = (['position'], [SimpleImputer(strategy="most_frequent"), LabelBinarizer()])
points_1_step = (['points_1'], [SimpleImputer(), StandardScaler()])
points_2_step = (['points_2'], [SimpleImputer(), StandardScaler()])

mapper = DataFrameMapper(
    [position_step, points_1_step, points_2_step],
    df_out=True,
)

In [32]:
# Fit on the training features, then preview the first 10 transformed rows:
# one indicator column per position plus the two standardised lag columns.
mapper.fit(X_train)
mapper.transform(X_train).head(10)

Unnamed: 0,position_C,position_PF,position_PG,position_SF,position_SG,points_1,points_2
2,0,1,0,0,0,-0.306299,0.160379
3,0,1,0,0,0,0.158367,-0.304575
4,0,1,0,0,0,1.087699,0.160379
5,0,1,0,0,0,0.274533,1.090288
6,0,1,0,0,0,0.158367,0.276618
7,0,1,0,0,0,0.506867,0.160379
8,0,1,0,0,0,-0.190133,0.509095
9,0,1,0,0,0,-0.190133,-0.188336
10,0,0,0,1,0,-0.073966,-0.188336
11,0,0,0,1,0,-0.306299,-0.072098


But, maybe the best part about `mapper` is that you can put it in a pipeline...

In [33]:
from sklearn.pipeline import make_pipeline

# Chain preprocessing and regression into one estimator, so raw feature rows
# can be passed straight to fit / predict / score.
pipe = make_pipeline(mapper, model)
# Fitting the pipeline refits both the mapper and the model on the raw training data.
pipe.fit(X_train, y_train)

### The Pickle 🥒

In [34]:
import os
import pickle

# Make sure the output directory exists; otherwise open(..., 'wb') raises
# FileNotFoundError on a fresh checkout.
os.makedirs('pickles', exist_ok=True)

# Serialise the whole fitted pipeline (preprocessing + model) to disk.
with open('pickles/pipe.pkl', 'wb') as f:
    pickle.dump(pipe, f)

In [35]:
# Remove the in-memory pipeline so the next cells demonstrably use the copy
# loaded back from disk.
del pipe

In [36]:
# Restore the fitted pipeline from the file written above.
# Only unpickle files you created or trust: pickle.load can execute arbitrary code.
with open('pickles/pipe.pkl', 'rb') as pickle_file:
    pipe = pickle.load(pickle_file)

In [37]:
# Training R^2 from the reloaded pipeline on the raw features; it matches the
# score of the separately fitted mapper + model, so the round trip preserved
# the fit.
pipe.score(X_train, y_train)

0.4183361100675499

In [38]:
# First 10 predictions from the reloaded pipeline, fed raw (untransformed) rows.
pipe.predict(X_train)[:10]

array([10.03955078, 10.17919922, 14.60354614, 14.70462036, 11.90631104,
       12.70187378, 11.45617676,  9.38363647,  9.84109497,  9.42581177])

In [39]:
# One random training row as a {column: [value]} dict: a template for building
# a single-row input frame by hand (unseeded, so the row varies per run).
X_train.sample(1).to_dict(orient='list')

{'position': ['PG'], 'points_1': [5.0], 'points_2': [3.0]}

In [40]:
# Hand-built single-row input with the same three columns the pipeline was
# trained on: position plus the points from the previous two games.
new = pd.DataFrame([
    {'position': 'SF', 'points_1': 12.0, 'points_2': 16.0},
])

In [41]:
# Predict points for the hand-built row; the pipeline runs the mapper's
# preprocessing before the regression, so the raw frame can be passed directly.
pipe.predict(new)

array([13.01968384])