In [3]:
from sklearn import preprocessing
import numpy as np
import pandas as pd
%matplotlib inline

In [4]:
X_train = np.array([[1,9,2], [10,1,4], [5,3,8]])

## 标准化

### scale函数，均值为0，方差为1
- 适合于测试集与训练集具有同样的分布，否则分别对两个集合进行独立scale，会破坏二者之间的关系。

In [5]:
X_scaled = preprocessing.scale(X_train)
X_scaled

array([[-1.1769647 ,  1.37281295, -1.06904497],
       [ 1.26750044, -0.98058068, -0.26726124],
       [-0.09053575, -0.39223227,  1.33630621]])

In [6]:
print(X_scaled.mean(axis = 0))
print(X_scaled.std(axis = 0))

[ 8.32667268e-17  7.40148683e-17 -1.48029737e-16]
[1. 1. 1.]


In [7]:
X_scaled.mean(axis = 0)

array([ 8.32667268e-17,  7.40148683e-17, -1.48029737e-16])

### StandardScaler，对其他数据运用与训练集同样的变化方法（常选用）
- mean = X_train.mean
- scale = X_train.std
- 测试集
  - (X_test - mean)/scale

In [8]:
scaler = preprocessing.StandardScaler().fit(X_train)
scaler

StandardScaler(copy=True, with_mean=True, with_std=True)

In [9]:
scaler.scale_

array([3.68178701, 3.39934634, 2.49443826])

In [10]:
scaler.mean_

array([5.33333333, 4.33333333, 4.66666667])

In [11]:
X_train

array([[ 1,  9,  2],
       [10,  1,  4],
       [ 5,  3,  8]])

In [12]:
X_train_trans = scaler.transform(X_train)

In [13]:
X_train_trans.mean(axis = 0)

array([ 8.32667268e-17,  7.40148683e-17, -1.48029737e-16])

In [14]:
X_train_trans.std(axis = 0)

array([1., 1., 1.])

In [15]:
X_test = np.array([[1,2,3], [2,9,5]])

In [16]:
X_test_trans = scaler.transform(X_test)
X_test_trans

array([[-1.1769647 , -0.68640647, -0.6681531 ],
       [-0.90535746,  1.37281295,  0.13363062]])

In [17]:
X_test_trans.mean(axis = 0)

array([-1.04116108,  0.34320324, -0.26726124])

### 线性范围放缩
X_std = (X - X.min(axis=0)) / (X.max(axis=0) - X.min(axis=0))

X_scaled = X_std * (max - min) + min

In [18]:
X_train

array([[ 1,  9,  2],
       [10,  1,  4],
       [ 5,  3,  8]])

In [19]:
min_max_scaler = preprocessing.MinMaxScaler().fit(X_train)

In [20]:
X_train_minmax = min_max_scaler.transform(X_train)
X_train_minmax

array([[0.        , 1.        , 0.        ],
       [1.        , 0.        , 0.33333333],
       [0.44444444, 0.25      , 1.        ]])

In [21]:
min_max_scaler.min_

array([-0.11111111, -0.125     , -0.33333333])

In [22]:
min_max_scaler.scale_

array([0.11111111, 0.125     , 0.16666667])

In [23]:
X_test

array([[1, 2, 3],
       [2, 9, 5]])

In [24]:
X_test_minmax = min_max_scaler.transform(X_test)
X_test_minmax

array([[0.        , 0.125     , 0.16666667],
       [0.11111111, 1.        , 0.5       ]])

## 非线性变换

In [25]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

In [26]:
iris = load_iris()

In [27]:
X, y = iris.data, iris.target

In [28]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 0)

### mapping to a uniform distribution

In [29]:
quantile_transformer = preprocessing.QuantileTransformer(random_state=0).fit(X_train)

  % (self.n_quantiles, n_samples))


In [30]:
X_train_trans = quantile_transformer.transform(X_train)

In [61]:
X_test_trans = quantile_transformer.transform(X_test)

In [62]:
np.percentile(X_train[:,1],[0, 25, 50, 75, 100])

array([1., 2., 3., 6., 9.])

In [63]:
np.percentile(X_train_trans[:,1],[0, 25, 50, 75, 100])

array([-5.19933758, -2.59966879,  0.        ,  2.59966879,  5.19933758])

In [34]:
np.percentile(X_test[:,1],[0, 25, 50, 75, 100])

array([2.2 , 2.8 , 2.95, 3.4 , 4.2 ])

In [35]:
np.percentile(X_test_trans[:,1],[0, 25, 50, 75, 100])

array([0.01351351, 0.26576577, 0.37612613, 0.81081081, 0.99399399])

### mapping to a Gaussian distribution

In [36]:
from sklearn.preprocessing import PowerTransformer

In [42]:
pt = preprocessing.PowerTransformer(method='box-cox', standardize=True)

In [68]:
X_lognormal = np.random.RandomState(615).lognormal(size=(3,3))

In [69]:
X_lognormal 

array([[0.43471252, 1.36404477, 2.55872317],
       [1.97280767, 2.96949204, 1.2921221 ],
       [1.00463976, 0.46680121, 2.17565247]])

In [46]:
X_lognormal_trans = pt.fit_transform(X_lognormal)

In [47]:
X_lognormal

array([[1.28331718, 1.18092228, 0.84160269],
       [0.94293279, 1.60960836, 0.3879099 ],
       [1.35235668, 0.21715673, 1.09977091]])

In [48]:
X_lognormal_trans.mean(axis = 0)

array([ 0.00000000e+00, -7.40148683e-17,  7.40148683e-17])

In [49]:
X_lognormal_trans.std(axis = 0)

array([1., 1., 1.])

In [56]:
qt = preprocessing.QuantileTransformer(output_distribution = 'normal', random_state = 0)

In [57]:
X_train = np.array([[1,9,2], [10,1,4], [5,3,8]], dtype = np.float)

In [58]:
X_train_trans = qt.fit_transform(X_train)
X_train_trans

  % (self.n_quantiles, n_samples))


array([[-5.19933758,  5.19933758, -5.19933758],
       [ 5.19933758, -5.19933758,  0.        ],
       [ 0.        ,  0.        ,  5.19933758]])

In [59]:
X_train_trans.mean(axis = 0)

array([3.26147997e-11, 3.26147997e-11, 3.26147997e-11])

In [60]:
X_train_trans.std(axis = 0)

array([4.24524136, 4.24524136, 4.24524136])

In [70]:
pt2 = preprocessing.PowerTransformer(method='yeo-johnson', standardize = False)

In [71]:
X_lognormal

array([[0.43471252, 1.36404477, 2.55872317],
       [1.97280767, 2.96949204, 1.2921221 ],
       [1.00463976, 0.46680121, 2.17565247]])

In [73]:
X_lognormal_trans2 = pt2.fit_transform(X_lognormal)
X_lognormal_trans2

array([[ 0.34396395,  0.82466619, 11.11224267],
       [ 0.94413364,  1.28848502,  3.11999422],
       [ 0.6341813 ,  0.3758934 ,  8.065803  ]])

## Normalization

In [74]:
X = [[ 1., -1.,  2.],
     [ 2.,  0.,  0.],
     [ 0.,  1., -1.]]

In [75]:
X_normalized = preprocessing.normalize(X, norm='l2')
X_normalized

array([[ 0.40824829, -0.40824829,  0.81649658],
       [ 1.        ,  0.        ,  0.        ],
       [ 0.        ,  0.70710678, -0.70710678]])

In [78]:
normalizer = preprocessing.Normalizer(norm='l2').fit(X)

In [79]:
X_normalized = normalizer.transform(X)
X_normalized 

array([[ 0.40824829, -0.40824829,  0.81649658],
       [ 1.        ,  0.        ,  0.        ],
       [ 0.        ,  0.70710678, -0.70710678]])

In [84]:
normalizer.transform([[-1., 1., 0.]])

array([[-0.70710678,  0.70710678,  0.        ]])

## 编码类别特征

In [85]:
X = [['male', 'from US', 'uses Safari'], ['female', 'from Europe', 'uses Firefox']]

In [86]:
enc = preprocessing.OrdinalEncoder().fit(X)

In [87]:
X_encoded = enc.transform(X)
X_encoded

array([[1., 1., 1.],
       [0., 0., 0.]])

In [89]:
enc.transform([['female', 'from US', 'uses Safari']])

array([[0., 1., 1.]])

In [90]:
enc = preprocessing.OneHotEncoder().fit(X)

In [92]:
X_encoded2 = enc.transform(X)
X_encoded2.toarray()

array([[0., 1., 0., 1., 0., 1.],
       [1., 0., 1., 0., 1., 0.]])

## 离散化

### K容器离散化

In [93]:
X = np.array([[ -3., 5., 15 ],
              [  0., 6., 14 ],
              [  6., 3., 11 ]])

In [94]:
est = preprocessing.KBinsDiscretizer(n_bins=[3,3,2], encode='ordinal').fit(X)

In [96]:
X_trans = est.transform(X)
X_trans

array([[0., 1., 1.],
       [1., 2., 1.],
       [2., 0., 0.]])

In [97]:
est2 = preprocessing.KBinsDiscretizer(n_bins=[3,3,2], encode='onehot').fit(X)

In [99]:
X_trans2 = est2.transform(X).toarray()
X_trans2

array([[1., 0., 0., 0., 1., 0., 0., 1.],
       [0., 1., 0., 0., 0., 1., 0., 1.],
       [0., 0., 1., 1., 0., 0., 1., 0.]])

### 二值化

In [100]:
X = [[ 1., -1.,  2.],
     [ 2.,  0.,  0.],
     [ 0.,  1., -1.]]

In [102]:
binarizer = preprocessing.Binarizer().fit(X)
binarizer

Binarizer(copy=True, threshold=0.0)

In [103]:
X_trans = binarizer.transform(X)
X_trans

array([[1., 0., 1.],
       [1., 0., 0.],
       [0., 1., 0.]])

## 构建多项特征

In [104]:
from sklearn.preprocessing import PolynomialFeatures

In [105]:
X = np.arange(6).reshape(3,2)
X

array([[0, 1],
       [2, 3],
       [4, 5]])

In [106]:
poly = PolynomialFeatures(2)

In [107]:
poly.fit_transform(X)

array([[ 1.,  0.,  1.,  0.,  0.,  1.],
       [ 1.,  2.,  3.,  4.,  6.,  9.],
       [ 1.,  4.,  5., 16., 20., 25.]])

The features of X have been transformed from ($X_1,X_2,X_3$) to ($1,X_1,X_2,X_3,X_1X_2,X_1X_3,X_2X_3,X_1X_2X_3$).