# Standard Scaler Example

In [5]:
from sklearn.linear_model import Ridge
from sklearn import datasets
from sklearn.model_selection import train_test_split
x,y = datasets.load_boston(return_X_y=True)

x_train,x_test,y_train,y_test = train_test_split(x,y,random_state=0)

from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
x_train_scaled = scaler.fit_transform(x_train)

ridge = Ridge().fit(x_train_scaled,y_train)
x_test_scaled = scaler.transform(x_test)
ridge.score(x_test_scaled,y_test)

0.6345884564889053

# CV_scores

### RidgeCV

In [2]:
import numpy as np
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import RidgeCV
scores = cross_val_score(RidgeCV(),x_train,y_train,cv=10)
round(np.mean(scores),3),round(np.std(scores),3)

(0.717, 0.125)

In [3]:
scores = cross_val_score(RidgeCV(),x_train_scaled,y_train,cv=10)
round(np.mean(scores),3),round(np.std(scores),3)

(0.718, 0.127)

### KNN

In [4]:
from sklearn.neighbors import KNeighborsRegressor
scores = cross_val_score(KNeighborsRegressor(),x_train,y_train,cv=10)
round(np.mean(scores),3),round(np.std(scores),3)

(0.499, 0.146)

In [5]:
scores = cross_val_score(KNeighborsRegressor(), x_train_scaled, y_train, cv=10)
round(np.mean(scores),3),round(np.std(scores),3)

(0.75, 0.106)

# Leaking Information

### What is Data Leakage in Machine Learning?

In [8]:
from sklearn.linear_model import Ridge
from sklearn import datasets
from sklearn.model_selection import train_test_split
x,y = datasets.load_boston(return_X_y=True)

x_train,x_test,y_train,y_test = train_test_split(x,y,random_state=0)

from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
x_train_scaled = scaler.fit_transform(x_train)

ridge = Ridge().fit(x_train_scaled,y_train)
x_test_scaled = scaler.transform(x_test)
ridge.score(x_test_scaled,y_test)

0.6345884564889053

In [9]:
from sklearn.pipeline import make_pipeline
pipe = make_pipeline(StandardScaler(),Ridge())
pipe.fit(x_train,y_train)
pipe.score(x_test,y_test)

0.6345884564889053

#### 1.必须先用fit_transform(trainData)，之后再transform(testData)
#### 2.如果直接transform(testData)，程序会报错
#### 3.如果fit_transfrom(trainData)后，使用fit_transform(testData)而不transform(testData)，虽然也能归一化，但是两个结果不是在同一个“标准”下的，具有明显差异。(一定要避免这种情况)

In [13]:
import numpy as np
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import cross_val_score
knn_pipe = make_pipeline(StandardScaler(),KNeighborsRegressor())
scores = cross_val_score(knn_pipe,x_train,y_train,cv=10)
round(np.mean(scores),3),round(np.std(scores),3)

(0.746, 0.106)

## Naming Steps

In [14]:
from sklearn.pipeline import make_pipeline
knn_pipe = make_pipeline(StandardScaler(), KNeighborsRegressor())
print(knn_pipe.steps)

[('standardscaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('kneighborsregressor', KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
                    metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                    weights='uniform'))]


In [19]:
from sklearn.pipeline import Pipeline
pipe = Pipeline((("scaler", StandardScaler()),
                 ("regressor", KNeighborsRegressor)))
print(pipe.steps)

(('scaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('regressor', <class 'sklearn.neighbors.regression.KNeighborsRegressor'>))


#### make_pipe function is a shorthand of Pipeline function

# Pipeline and GridSearchCV

In [23]:
import warnings
warnings.filterwarnings("ignore")
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

knn_pipe = make_pipeline(StandardScaler(),KNeighborsRegressor())
param_grid = {'kneighborsregressor__n_neighbors':range(1,10)}
grid = GridSearchCV(knn_pipe,param_grid,cv=10)
grid.fit(x_train,y_train)
print(grid.best_params_)
print(round(grid.score(x_test,y_test),2))

{'kneighborsregressor__n_neighbors': 7}
0.6


# Discrete features

## categorical variables

In [28]:
import pandas as pd
df = pd.DataFrame(
    {'boro': ['Manhattan', 'Queens', 'Manhattan', 'Brooklyn', 'Brooklyn', 'Bronx'],
     'vegan': ['No', 'No','No','Yes', 'Yes', 'No']})

In [29]:
df

Unnamed: 0,boro,vegan
0,Manhattan,No
1,Queens,No
2,Manhattan,No
3,Brooklyn,Yes
4,Brooklyn,Yes
5,Bronx,No


## ordinary encoding

In [26]:
df['boro_ordinal'] = df.boro.astype('category').cat.codes
df

Unnamed: 0,boro,vegan,boro_ordinal
0,Manhattan,No,2
1,Queens,No,3
2,Manhattan,No,2
3,Brooklyn,Yes,1
4,Brooklyn,Yes,1
5,Bronx,No,0


## one-hot(dummy) encoding

In [31]:
import pandas as pd
df = pd.DataFrame(
    {'boro': ['Manhattan', 'Queens', 'Manhattan', 'Brooklyn', 'Brooklyn', 'Bronx'],
     'vegan': ['No', 'No','No','Yes', 'Yes', 'No']})
pd.get_dummies(df)

Unnamed: 0,boro_Bronx,boro_Brooklyn,boro_Manhattan,boro_Queens,vegan_No,vegan_Yes
0,0,0,1,0,1,0
1,0,0,0,1,1,0
2,0,0,1,0,1,0
3,0,1,0,0,0,1
4,0,1,0,0,0,1
5,1,0,0,0,1,0


In [32]:
import pandas as pd
df = pd.DataFrame(
    {'boro': ['Manhattan', 'Queens', 'Manhattan', 'Brooklyn', 'Brooklyn', 'Bronx'],
     'vegan': ['No', 'No','No','Yes', 'Yes', 'No']})
pd.get_dummies(df,columns=['boro'])

Unnamed: 0,vegan,boro_Bronx,boro_Brooklyn,boro_Manhattan,boro_Queens
0,No,0,0,1,0
1,No,0,0,0,1
2,No,0,0,1,0
3,Yes,0,1,0,0
4,Yes,0,1,0,0
5,No,1,0,0,0


In [35]:
df = pd.DataFrame({
    'salary':[103,89,142,54,63,219],
    'boro':[0,1,0,2,2,3]
})
df

Unnamed: 0,salary,boro
0,103,0
1,89,1
2,142,0
3,54,2
4,63,2
5,219,3


In [36]:
pd.get_dummies(df)

Unnamed: 0,salary,boro
0,103,0
1,89,1
2,142,0
3,54,2
4,63,2
5,219,3


In [37]:
pd.get_dummies(df,columns=['boro'])

Unnamed: 0,salary,boro_0,boro_1,boro_2,boro_3
0,103,1,0,0,0
1,89,0,1,0,0
2,142,1,0,0,0
3,54,0,0,1,0
4,63,0,0,1,0
5,219,0,0,0,1


## Power Transformations

In [None]:
from sklearn.preprocessing import PowerTransformer
pt = PowerTransformer(method='box-cox')
pt.fit(X)