# Scaling
* Put all preprocessing inside cross-validation:
    * in development, fit the scaler on the training data only
    * in production, fit the scaler on the whole data set, since the test data is the incoming data
* check if the mean of training data set and test data set are close enough
* make sure to fit the scaler only on the training set, then use it to transform both the training and the test data
## Sparse data
* data with many zeros - only store non-zero entries
* Subtracting anything will make the data "dense" (no more zeros) and blow up the RAM
* Only scale, don't center (use MaxAbsScaler)
## Pipelines

In [11]:
from sklearn.linear_model import Ridge
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split
# Load the Boston housing data as a Bunch; its .data and .target attributes
# are read in the next cell.
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in
# 1.2, so this line only runs on older scikit-learn releases.
boston = load_boston(return_X_y=False)


In [12]:
# Manual version of "scale, then fit": every preprocessing step has to be
# applied by hand to the training data and again to the test data.
X = boston.data
y = boston.target
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# The scaler learns its statistics from the training split only.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

ridge = Ridge()
ridge.fit(X_train_scaled, y_train)

# Score on the held-out split, scaled with the training statistics.
ridge.score(X_test_scaled, y_test)

0.634588456488905

In [14]:
from sklearn.pipeline import make_pipeline

# Same model as the manual cell, but the pipeline applies the scaler itself:
# it is fitted inside fit() and re-applied to X_test inside score().
pipe = make_pipeline(StandardScaler(), Ridge()).fit(X_train, y_train)
pipe.score(X_test, y_test)

0.634588456488905

# Naming Steps and cross validation

In [19]:
from sklearn.neighbors import KNeighborsRegressor

# make_pipeline derives each step name from the lower-cased class name;
# printing .steps shows the (name, estimator) pairs it generated.
scaling_step = StandardScaler()
knn_step = KNeighborsRegressor()
knn_pipe = make_pipeline(scaling_step, knn_step)
print(knn_pipe.steps)

[('standardscaler', StandardScaler()), ('kneighborsregressor', KNeighborsRegressor())]


In [23]:
from sklearn.pipeline import Pipeline

# Pipeline takes a list of (name, estimator) pairs, so the step names can be
# chosen explicitly instead of being generated as make_pipeline does.
# The error recorded for the earlier, invalid version of this cell was:
#   TypeError: 'StandardScaler' object is not iterable
# NOTE(review): presumably caused by passing bare estimators rather than
# (name, estimator) pairs -- confirm against the notebook history.
pipe2 = Pipeline([
    ('scaler', StandardScaler()),
    ('regressor', KNeighborsRegressor()),
])

In [21]:
from sklearn.model_selection import cross_val_score

# 10-fold cross-validation of the whole pipeline on the training split, so
# the scaler is re-fitted inside every fold.
scores = cross_val_score(knn_pipe, X_train, y_train, cv=10)

mean_score = np.mean(scores)
std_score = np.std(scores)
mean_score, std_score

(0.7455608822349746, 0.10625173604969354)

In [27]:
from sklearn.model_selection import GridSearchCV

# Pipeline parameters are addressed as "<step name>__<parameter name>".
param_grid = {
    'kneighborsregressor__n_neighbors': range(1, 10),
}

# Search over n_neighbors with 10-fold CV on the training split.
grid = GridSearchCV(estimator=knn_pipe, param_grid=param_grid, cv=10)
grid.fit(X_train, y_train)

print(grid.best_params_)
print(grid.score(X_test, y_test))

{'kneighborsregressor__n_neighbors': 7}
0.5999825126971097


# Categorical Variables

In [35]:
import pandas as pd

# Toy data set with two string-valued categorical columns.
df = pd.DataFrame({
    'boro': ['Manhattan', 'Queens', 'Manhattan', 'Brooklyn', 'Brooklyn', 'Bronx'],
    'vegan': ['No', 'No', 'No', 'Yes', 'Yes', 'No'],
})

# df_coded is a second name for the same frame, not a copy: the column added
# below is therefore also visible through df.
df_coded = df

# Integer-code the boroughs via the pandas category dtype.
boro_as_category = df['boro'].astype('category')
df_coded['boro_ordinal'] = boro_as_category.cat.codes

In [30]:
df
# Not a great idea: integer codes impose an ordering on the categories,
# and there is no natural way to order them.
# NOTE(review): the original note read "no natural way to order unrated" --
# confirm what "unrated" referred to.

Unnamed: 0,boro,vegan,boro_ordinal
0,Manhattan,No,2
1,Queens,No,3
2,Manhattan,No,2
3,Brooklyn,Yes,1
4,Brooklyn,Yes,1
5,Bronx,No,0


# One-hot (Dummy) Encoding

In [34]:
# One-hot encode only the 'boro' column; the other columns pass through.
pd.get_dummies(df, columns = ['boro'])
# for each row, exactly one of the new boro_* columns is 1
# don't encode the target

Unnamed: 0,vegan,boro_ordinal,boro_Bronx,boro_Brooklyn,boro_Manhattan,boro_Queens
0,No,2,0,0,1,0
1,No,3,0,0,0,1
2,No,2,0,0,1,0
3,Yes,1,0,1,0,0
4,Yes,1,0,1,0,0
5,No,0,1,0,0,0


In [37]:
# df_salary is another name for df (no copy is made), so the salary column
# added here also appears on df.
salaries = [103, 89, 142, 54, 63, 219]
df_salary = df
df_salary['salary'] = salaries

In [38]:
# Display the frame, now including the salary column.
df_salary

Unnamed: 0,boro,vegan,boro_ordinal,salary
0,Manhattan,No,2,103
1,Queens,No,3,89
2,Manhattan,No,2,142
3,Brooklyn,Yes,1,54
4,Brooklyn,Yes,1,63
5,Bronx,No,0,219


# Pandas Categorical Columns

In [47]:
# DataFrame.drop returns a new frame and leaves the original untouched, so
# the result has to be assigned back for the column to stay removed in the
# following cells (their recorded output has no boro_ordinal column).
df_salary = df_salary.drop('boro_ordinal', axis=1)
df_salary


Unnamed: 0,boro,vegan,salary
0,Manhattan,No,103
1,Queens,No,89
2,Manhattan,No,142
3,Brooklyn,Yes,54
4,Brooklyn,Yes,63
5,Bronx,No,219


In [56]:
df_salary = df_salary.drop('vegan', axis=1)

# Declare the full set of possible boroughs up front. A category that never
# occurs in the data ('Staten Island') still gets its own dummy column,
# which is all zeros in the output below.
boroughs = ['Manhattan', 'Queens', 'Brooklyn', 'Bronx', 'Staten Island']
df_salary['boro'] = pd.Categorical(df_salary['boro'], categories=boroughs)

pd.get_dummies(df_salary)

Unnamed: 0,salary,boro_Manhattan,boro_Queens,boro_Brooklyn,boro_Bronx,boro_Staten Island
0,103,1,0,0,0,0
1,89,0,1,0,0,0
2,142,1,0,0,0,0
3,54,0,0,1,0,0
4,63,0,0,1,0,0
5,219,0,0,0,1,0


# OneHotEncoder

In [58]:
from sklearn.preprocessing import OneHotEncoder

# OneHotEncoder encodes every column it is given, so each distinct value in
# each column of df becomes its own indicator column.
ce = OneHotEncoder()
ce.fit(df)

# transform() returns a sparse matrix; densify it for display.
encoded = ce.transform(df)
encoded.toarray()

array([[0., 0., 1., 0., 1., 0., 0., 0., 1., 0., 0., 0., 0., 1., 0., 0.],
       [0., 0., 0., 1., 1., 0., 0., 0., 0., 1., 0., 0., 1., 0., 0., 0.],
       [0., 0., 1., 0., 1., 0., 0., 0., 1., 0., 0., 0., 0., 0., 1., 0.],
       [0., 1., 0., 0., 0., 1., 0., 1., 0., 0., 1., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 1., 0., 1., 0., 0., 0., 1., 0., 0., 0., 0.],
       [1., 0., 0., 0., 1., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 1.]])

In [63]:
from sklearn.compose import make_column_transformer
from sklearn.linear_model import LogisticRegression

In [65]:
# Boolean mask over the columns: True where the column has object dtype.
categorical = df.dtypes == object

# Standardise the non-object columns and one-hot encode the object ones.
scale_numeric = (StandardScaler(), ~categorical)
encode_categorical = (OneHotEncoder(), categorical)
preprocess = make_column_transformer(scale_numeric, encode_categorical)

# Chain the column-wise preprocessing with the classifier.
model = make_pipeline(preprocess, LogisticRegression())

# Count-Based Encoding

* For high-cardinality categorical features
    * Example: US states, given few samples
    * instead of 50 one-hot variables, one "response-encoded" variable
* For regression:
    * "people in this state have an average response of y"
* Binary classification: "people in this state have likelihood p for class 1"
* Multiclass: one feature per class: probability distribution given category

# Feature Distribution

In [68]:
from sklearn.preprocessing import PowerTransformer
# Power transform to make a skewed feature distribution more Gaussian-like.
# Box-Cox requires strictly positive input values.
trns = PowerTransformer(method='box-cox')
# Alternative: method='yeo-johnson' (the scikit-learn default), which also
# accepts zero and negative values.

# Feature Engineering

In [None]:
# Append an interaction feature to X. X is a NumPy array, so it has to be
# indexed with slices: the original X["0:1"] used a string index, which
# raises IndexError. The 0:1 / 1:2 slices keep the columns two-dimensional,
# as np.hstack needs.
# NOTE(review): the product of the first two features is assumed to be the
# intended interaction term -- confirm which features should be combined.
X_interaction = np.hstack([X, X[:, 0:1] * X[:, 1:2]])