# Standardization, or mean removal and variance scaling

In [1]:
from sklearn import preprocessing
import numpy as np

# Toy training matrix with 3 rows and 3 columns; each column is treated as
# one feature by the scalers below.
X_train = np.array([[1., -1., 2.],
                   [2., 0., 0.],
                   [0., 1., -1.]])

# Standardize X_train column by column. The mean/std checks in the following
# cells show every column of the result has mean 0 and standard deviation 1.
X_scaled = preprocessing.scale(X_train)
X_scaled

array([[ 0.        , -1.22474487,  1.33630621],
       [ 1.22474487,  0.        , -0.26726124],
       [-1.22474487,  1.22474487, -1.06904497]])

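#### The way to understand the “axis” argument of a NumPy reduction (sum, mean, std, …) is that it <b>collapses</b> the specified axis: `axis=0` reduces over the rows and leaves one value per column.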


In [2]:
# Column-wise mean of the scaled data (axis=0 collapses the rows).
X_scaled.mean(axis=0)

array([0., 0., 0.])

In [3]:
# Column-wise standard deviation of the scaled data.
X_scaled.std(axis=0)

array([1., 1., 1.])

#### It is possible to disable centering or scaling by passing `with_mean=False` or `with_std=False`, respectively, to the constructor of StandardScaler

In [4]:
# Fit a StandardScaler on X_train so the learned statistics (mean_, scale_)
# can be re-applied to other data later through transform().
scaler = preprocessing.StandardScaler().fit(X_train)
scaler

StandardScaler(copy=True, with_mean=True, with_std=True)

In [5]:
# Per-column mean learned from X_train during fit.
scaler.mean_

array([1.        , 0.        , 0.33333333])

$$X = (X - scaler.mean\_) / scaler.scale\_$$

In [6]:
# Per-column divisor used for scaling; for this data it equals the column
# standard deviations of X_train.
scaler.scale_

array([0.81649658, 0.81649658, 1.24721913])

In [7]:
# Applying the fitted scaler to X_train reproduces the X_scaled values
# obtained earlier with preprocessing.scale.
scaler.transform(X_train)

array([[ 0.        , -1.22474487,  1.33630621],
       [ 1.22474487,  0.        , -0.26726124],
       [-1.22474487,  1.22474487, -1.06904497]])

In [8]:
# Re-use the mean/scale learned from X_train on a new, unseen sample.
X_test = [[-1., 1., 0.]]
scaler.transform(X_test)

array([[-2.44948974,  1.22474487, -0.26726124]])

In [9]:
# Rescale each column of X_train so its minimum maps to 0 and its maximum
# maps to 1 (see the output below).
min_max_scaler = preprocessing.MinMaxScaler()
X_train_minmax = min_max_scaler.fit_transform(X_train)
X_train_minmax

array([[0.5       , 0.        , 1.        ],
       [1.        , 0.5       , 0.33333333],
       [0.        , 1.        , 0.        ]])

In [10]:
# NOTE: rebinds X_test (previously the StandardScaler test sample).
X_test = np.array([[-3., -1., 4.]])

# Transform unseen data with the min/max learned from X_train; values outside
# the training range land outside [0, 1] (-1.5 and 1.67 in the output).
X_test_minmax = min_max_scaler.transform(X_test)
X_test_minmax

array([[-1.5       ,  0.        ,  1.66666667]])

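With the default `feature_range=(0, 1)`, where min and max are the per-feature minimum and maximum of the training data, the transform is `X_scaled = X * scaler.scale_ + scaler.min_` with:

$$ scaler.min\_ = min / (min - max) $$
$$ scaler.scale\_ = 1 / (max - min) $$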

In [11]:
# Per-column multiplier learned by the MinMaxScaler: 1 / (max - min).
min_max_scaler.scale_

array([0.5       , 0.5       , 0.33333333])

In [12]:
# Per-column additive offset learned by the MinMaxScaler (added after
# multiplying by scale_).
min_max_scaler.min_

array([0.        , 0.5       , 0.33333333])

In [19]:
# Divide each column by its maximum absolute value in X_train, so training
# values fall in [-1, 1]; zeros stay zero because nothing is shifted.
max_abs_scaler = preprocessing.MaxAbsScaler()
X_train_maxabs = max_abs_scaler.fit_transform(X_train)
X_train_maxabs

array([[ 0.5, -1. ,  1. ],
       [ 1. ,  0. ,  0. ],
       [ 0. ,  1. , -0.5]])

In [22]:
# X_test is the [[-3., -1., 4.]] array from the MinMaxScaler example; each
# value is divided by the corresponding training max-abs (2, 1, 2).
X_test_maxabs = max_abs_scaler.transform(X_test)
X_test_maxabs

array([[-1.5, -1. ,  2. ]])

In [23]:
# Per-column maximum absolute value found in X_train.
max_abs_scaler.scale_

array([2., 1., 2.])

## Scaling sparse data

*Centering sparse data would destroy the sparseness structure in the data, and thus **rarely** is a sensible thing to do. However, it can make sense to scale sparse inputs, especially if features are on different scales.*

MaxAbsScaler and maxabs_scale were specifically designed for scaling sparse data, and are the recommended way to go about this. However, scale and StandardScaler can accept scipy.sparse matrices as input, as long as with_mean=False is explicitly passed to the constructor.

Otherwise a *ValueError* will be raised as silently centering would break the sparsity and would often crash the execution by allocating excessive amounts of memory unintentionally. RobustScaler cannot be fitted to sparse inputs, but you can use the transform method on sparse inputs.

## Scaling data with outliers

If your data contains many outliers, scaling using the mean and variance of the data is likely to not work very well. In these cases, you can use **robust_scale** and **RobustScaler** as drop-in replacements instead. They use more robust estimates for the center and range of your data.


# Non-linear transformation

Like scalers, **QuantileTransformer** puts each feature into the same range or distribution. However, by performing a rank transformation, it smooths out unusual distributions and is less influenced by outliers than scaling methods. It does, however, distort correlations and distances within and across features.

In [29]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

# Load the iris data and split it into train/test parts with a fixed seed.
# NOTE: this rebinds X_train / X_test used by the scaler cells above.
iris = load_iris()
X, y = iris.data, iris.target
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
# Fit the quantile mapping on the training split only, then apply that same
# mapping to the test split.
quantile_transformer = preprocessing.QuantileTransformer(random_state=0)
X_train_trans = quantile_transformer.fit_transform(X_train)
X_test_trans = quantile_transformer.transform(X_test)
# Percentile landmarks of the first feature before the transformation.
np.percentile(X_train[:, 0], [0, 25, 50, 75, 100])

array([4.3, 5.1, 5.8, 6.5, 7.9])

Once the quantile transformation is applied, those landmarks closely approach the percentiles previously defined (0, 0.25, 0.5, 0.75 and 1):

In [30]:
# Same landmarks of the first feature after the transformation (train split).
np.percentile(X_train_trans[:, 0], [0, 25, 50, 75, 100])

array([9.99999998e-08, 2.38738739e-01, 5.09009009e-01, 7.43243243e-01,
       9.99999900e-01])

In [31]:
# Landmarks of the first feature on the untransformed test split.
np.percentile(X_test[:, 0], [0, 25, 50, 75, 100])

array([4.4  , 5.125, 5.75 , 6.175, 7.3  ])

In [32]:
# Landmarks of the first feature on the transformed test split; the mapping
# was fitted on the training split, so these only roughly match 0/.25/.5/.75/1.
np.percentile(X_test_trans[:, 0], [0, 25, 50, 75, 100])

array([0.01351351, 0.25012513, 0.47972973, 0.6021021 , 0.94144144])

It is also possible to map the transformed data to a normal distribution by setting output_distribution='normal':

In [33]:
# Refit on the full data set X, mapping to a normal output distribution
# instead of the default one. NOTE: rebinds quantile_transformer.
quantile_transformer = preprocessing.QuantileTransformer(output_distribution='normal', random_state=0)
X_trans = quantile_transformer.fit_transform(X)
# quantiles_ has one column per feature; rows run from each feature's minimum
# to its maximum (see the output below).
quantile_transformer.quantiles_

array([[4.3       , 2.        , 1.        , 0.1       ],
       [4.31491491, 2.02982983, 1.01491491, 0.1       ],
       [4.32982983, 2.05965966, 1.02982983, 0.1       ],
       ...,
       [7.84034034, 4.34034034, 6.84034034, 2.5       ],
       [7.87017017, 4.37017017, 6.87017017, 2.5       ],
       [7.9       , 4.4       , 6.9       , 2.5       ]])

# Normalization

In [37]:
# Scale each row (sample) to unit L2 norm, e.g. [1, -1, 2] / sqrt(6).
X = [[1., -1., 2.], [2., 0., 0.], [0., 1., -1.]]
X_normalized = preprocessing.normalize(X, norm='l2')
X_normalized

array([[ 0.40824829, -0.40824829,  0.81649658],
       [ 1.        ,  0.        ,  0.        ],
       [ 0.        ,  0.70710678, -0.70710678]])

Normalizer acts **row-wise** and StandardScaler **column-wise**.[\[stackoverflow\]](https://stackoverflow.com/questions/39120942/difference-between-standardscaler-and-normalizer-in-sklearn-preprocessing?utm_medium=organic&utm_source=google_rich_qa&utm_campaign=google_rich_qa)

In [40]:
# Estimator-object counterpart of preprocessing.normalize; the repr below
# shows it defaults to the same 'l2' norm.
normalizer = preprocessing.Normalizer().fit(X)
normalizer

Normalizer(copy=True, norm='l2')

In [41]:
# Gives the same result as preprocessing.normalize(X, norm='l2') above.
normalizer.transform(X)

array([[ 0.40824829, -0.40824829,  0.81649658],
       [ 1.        ,  0.        ,  0.        ],
       [ 0.        ,  0.70710678, -0.70710678]])

normalize and Normalizer accept both dense array-like and sparse matrices from scipy.sparse as input.

# Binarization

Feature binarization is the process of thresholding numerical features to get boolean values.

In [43]:
# Binarizer with its default threshold (0.0 according to the repr below).
X = [[1., -1., 2.], [2., 0., 0.], [0., 1., -1.]]
binarizer = preprocessing.Binarizer().fit(X)
binarizer

Binarizer(copy=True, threshold=0.0)

In [44]:
# Values strictly greater than the threshold become 1; everything else,
# including values equal to the threshold (0 here), becomes 0.
binarizer.transform(X)

array([[1., 0., 1.],
       [1., 0., 0.],
       [0., 1., 0.]])

binarize and Binarizer accept both dense array-like and sparse matrices from scipy.sparse as input.

# Encoding categorical features

In [45]:
# Learn the category values of each integer-coded column from four samples:
# column 0 takes {0, 1}, column 1 takes {0, 1, 2}, column 2 takes {0, 1, 2, 3}.
enc = preprocessing.OneHotEncoder()
enc.fit([[0, 0, 3], [1, 1, 0], [0, 2, 1], [1, 0, 2]])

OneHotEncoder(categorical_features='all', dtype=<class 'numpy.float64'>,
       handle_unknown='error', n_values='auto', sparse=True)

In [46]:
# Output layout is 2 + 3 + 4 indicator columns:
# 0 -> [1, 0], 1 -> [0, 1, 0], 3 -> [0, 0, 0, 1].
# The encoder returns a sparse matrix (sparse=True), hence .toarray().
enc.transform([[0, 1, 3]]).toarray()

array([[1., 0., 0., 1., 0., 0., 0., 0., 1.]])

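Note that, if there is a possibility that the training data might be missing some category values (values that can occur for a feature but never appear in the training set), one has to explicitly set ```n_values```, the number of possible values per feature.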

In [48]:
# Declare the number of categories per column explicitly (2, 3 and 4) because
# the training data below does not contain every possible value.
# NOTE(review): `n_values` was deprecated in scikit-learn 0.20 and removed in
# 0.22 in favour of `categories` -- confirm the target version before reuse.
enc = preprocessing.OneHotEncoder(n_values=[2, 3, 4])
enc.fit([[1, 2, 3], [0, 2, 0]])

OneHotEncoder(categorical_features='all', dtype=<class 'numpy.float64'>,
       handle_unknown='error', n_values=[2, 3, 4], sparse=True)

In [49]:
# 1 -> [0, 1], 0 -> [1, 0, 0], 0 -> [1, 0, 0, 0]; the unseen values still get
# their own columns thanks to n_values.
enc.transform([[1, 0, 0]]).toarray()

array([[0., 1., 1., 0., 0., 1., 0., 0., 0.]])

# Imputation of missing values

In [50]:
# NOTE(review): `sklearn.preprocessing.Imputer` was deprecated in scikit-learn
# 0.20 and removed in 0.22; newer versions provide
# `sklearn.impute.SimpleImputer` instead -- confirm the target version.
from sklearn.preprocessing import Imputer
# Learn the per-column mean (axis=0), ignoring NaN entries:
# column 0 -> (1 + 7) / 2 = 4, column 1 -> (2 + 3 + 6) / 3 ~= 3.67.
imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
imp.fit([[1, 2], [np.nan, 3], [7, 6]])

Imputer(axis=0, copy=True, missing_values='NaN', strategy='mean', verbose=0)

In [52]:
# Fill the NaNs in new data with the column means learned during fit.
X = [[np.nan, 2], [6, np.nan], [7, 6]]
print(imp.transform(X))

[[4.         2.        ]
 [6.         3.66666667]
 [7.         6.        ]]


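The Imputer class also supports sparse matrices. Note that here missing values are encoded by 0, so they are the entries stored implicitly in the sparse matrix: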

In [53]:
import scipy.sparse as sp
# Sparse input where 0 marks a missing value (missing_values=0); the column
# means are computed over the non-zero entries only: 4 and ~3.67.
# NOTE(review): `Imputer` is not available in scikit-learn >= 0.22 -- confirm
# the target version before reusing this cell.
X = sp.csc_matrix([[1, 2], [0, 3], [7, 6]])
imp = Imputer(missing_values=0, strategy='mean', axis=0)
imp.fit(X)

Imputer(axis=0, copy=True, missing_values=0, strategy='mean', verbose=0)

In [54]:
# Replace the zeros of a new sparse matrix with the learned column means.
# NOTE: rebinds X_test.
X_test = sp.csc_matrix([[0, 2], [6, 0], [7, 6]])
print(imp.transform(X_test))

[[4.         2.        ]
 [6.         3.66666667]
 [7.         6.        ]]


# Generating polynomial features

In [55]:
from sklearn.preprocessing import PolynomialFeatures
# 3 x 2 integer matrix holding 0..5; each row is one sample (a, b).
X = np.arange(6).reshape(3, 2)
X

array([[0, 1],
       [2, 3],
       [4, 5]])

In [56]:
# Degree-2 expansion of each row (a, b); the output columns are
# 1, a, b, a^2, a*b, b^2 (e.g. [2, 3] -> [1, 2, 3, 4, 6, 9]).
poly = PolynomialFeatures(2)
poly.fit_transform(X)

array([[ 1.,  0.,  1.,  0.,  0.,  1.],
       [ 1.,  2.,  3.,  4.,  6.,  9.],
       [ 1.,  4.,  5., 16., 20., 25.]])

In some cases, only interaction terms among features are required; they can be obtained by setting ```interaction_only=True```

In [57]:
# 3 x 3 integer matrix holding 0..8; each row is one sample (a, b, c).
X = np.arange(9).reshape(3, 3)
X

array([[0, 1, 2],
       [3, 4, 5],
       [6, 7, 8]])

In [58]:
# With interaction_only=True each row (a, b, c) expands to
# 1, a, b, c, a*b, a*c, b*c, a*b*c -- products of distinct features only,
# no pure powers such as a^2 (e.g. [3, 4, 5] -> [1, 3, 4, 5, 12, 15, 20, 60]).
poly = PolynomialFeatures(degree=3, interaction_only=True)
poly.fit_transform(X)

array([[  1.,   0.,   1.,   2.,   0.,   0.,   2.,   0.],
       [  1.,   3.,   4.,   5.,  12.,  15.,  20.,  60.],
       [  1.,   6.,   7.,   8.,  42.,  48.,  56., 336.]])

# Custom transformers

In [60]:
from sklearn.preprocessing import FunctionTransformer
# Wrap np.log1p, i.e. log(1 + x) applied element-wise, as a transformer object.
transformer = FunctionTransformer(np.log1p)
X = np.array([[0, 1], [2, 3]])
# transform() is called directly here, without a prior fit() call.
transformer.transform(X)

array([[0.        , 0.69314718],
       [1.09861229, 1.38629436]])