In [2]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import pandas_datareader
import sklearn

In [3]:
data = np.random.randint(1,100,size=(10,3))
print(data)

[[87 52 48]
 [80  1 52]
 [47 27 83]
 [63 13 59]
 [ 2 77 68]
 [62 54 13]
 [59 97 17]
 [80 72  3]
 [98 73 94]
 [35 23 88]]


In [4]:
from sklearn.preprocessing import MinMaxScaler

In [5]:
sclar_ini = MinMaxScaler()  # create an instance of the Preprosseing step
print(sclar_ini)


MinMaxScaler(copy=True, feature_range=(0, 1))


In [6]:
# Fit the data to it and transform it 

scaler = sclar_ini.fit(data)
standardized_X = scaler.transform(data)
print (scaler)
print(standardized_X)


MinMaxScaler(copy=True, feature_range=(0, 1))
[[ 0.88541667  0.53125     0.49450549]
 [ 0.8125      0.          0.53846154]
 [ 0.46875     0.27083333  0.87912088]
 [ 0.63541667  0.125       0.61538462]
 [ 0.          0.79166667  0.71428571]
 [ 0.625       0.55208333  0.10989011]
 [ 0.59375     1.          0.15384615]
 [ 0.8125      0.73958333  0.        ]
 [ 1.          0.75        1.        ]
 [ 0.34375     0.22916667  0.93406593]]




### MinMaxScaler -- the max value in the column is made 1 and min value is made 0 and the values are fit in between them

### We fit the instance to the training data and then apply transformation (transform) to train and test data

In [7]:
### Standardization 
from sklearn.preprocessing import StandardScaler

In [8]:
standard_inst = StandardScaler()
standard_fit = standard_inst.fit(data)
print(standard_fit)

StandardScaler(copy=True, with_mean=True, with_std=True)




In [9]:
standard_inst.transform(data)



array([[ 0.96260083,  0.10340862, -0.14610313],
       [ 0.70041383, -1.59782996, -0.01623368],
       [-0.53561058, -0.73053186,  0.99025452],
       [ 0.06367398, -1.19753853,  0.21103785],
       [-2.22109841,  0.9373491 ,  0.5032441 ],
       [ 0.0262187 ,  0.17012386, -1.28246077],
       [-0.08614716,  1.60450148, -1.15259133],
       [ 0.70041383,  0.770561  , -1.60713439],
       [ 1.37460897,  0.80391862,  1.3473955 ],
       [-0.985074  , -0.86396233,  1.15259133]])

StandardScaler removes the mean and scales the data to unit variance. However, the outliers have an influence when computing the empirical mean and standard deviation which shrink the range of the feature values

StandardScaler therefore cannot guarantee balanced feature scales in the presence of outliers.

In [10]:
from sklearn.preprocessing import quantile_transform

In [20]:
quantile_transform(data,axis=1,n_quantiles=1000, random_state=0)

array([[  9.99999900e-01,   4.99602166e-01,   9.99999998e-08],
       [  9.99999900e-01,   9.99999998e-08,   5.00145715e-01],
       [  4.99857000e-01,   9.99999998e-08,   9.99999900e-01],
       [  9.99999900e-01,   9.99999998e-08,   5.00420420e-01],
       [  9.99999998e-08,   9.99999900e-01,   5.00380380e-01],
       [  9.99999900e-01,   5.00337072e-01,   9.99999998e-08],
       [  5.00025025e-01,   9.99999900e-01,   9.99999998e-08],
       [  9.99999900e-01,   5.00396500e-01,   9.99999998e-08],
       [  9.99999900e-01,   9.99999998e-08,   5.00340340e-01],
       [  4.99684300e-01,   9.99999998e-08,   9.99999900e-01]])

In [24]:
%%HTML
<img src="image.png" style="height:150px">

In [29]:
from IPython.display import Image
from IPython.core.display import HTML
Image(url= "image.png")

## Normaliser 


The Normalizer rescales the vector for each sample to have unit norm, independently of the distribution of the samples.


## Unit-norm normalization

When unit-norm normalization is applied, all the data in each profile are multiplied so that the length of the associated vector be equal to 1. 
The length of the vector is the square root of the sum of squares of all values.


See the above figure that defines the L2 norm

In [30]:
Image(url= "image1.gif")

Normalize samples individually to unit norm.

Each sample (i.e. each row of the data matrix) with at least one non zero component is rescaled independently of other samples so that its norm (l1 or l2) equals one.

This transformer is able to work both with dense numpy arrays and scipy.sparse matrix (use CSR format if you want to avoid the burden of a copy / conversion).

Scaling inputs to unit norms is a common operation for text classification or clustering for instance. For instance the dot product of two l2-normalized TF-IDF vectors is the cosine similarity of the vectors and is the base similarity metric for the Vector Space Model commonly used by the Information Retrieval community.



norm : ‘l1’, ‘l2’, or ‘max’, optional (‘l2’ by default)

The norm to use to normalize each non zero sample.

copy : boolean, optional, default True

set to False to perform inplace row normalization and avoid a copy

In [31]:
from sklearn.preprocessing import Normalizer

In [38]:
scaler = Normalizer('l2').fit(data) # try 'l1' and 'l2' in place of 'max' default value is 'l2'
normalized_X = scaler.transform(data)
# normalized_X_test = scaler.transform(X_test)

In [39]:
normalized_X   # here all are positive we will have eve negative values

array([[ 0.77576597,  0.46367621,  0.42800881],
       [ 0.83839757,  0.01047997,  0.54495842],
       [ 0.47411902,  0.27236625,  0.83727402],
       [ 0.72175793,  0.14893418,  0.67593203],
       [ 0.01946524,  0.74941169,  0.66181812],
       [ 0.74482871,  0.64872178,  0.15617376],
       [ 0.51393806,  0.84494902,  0.14808385],
       [ 0.74300557,  0.66870501,  0.02786271],
       [ 0.63565376,  0.47349719,  0.60970871],
       [ 0.35913023,  0.23599986,  0.902956  ]])

In [42]:
my_data = np.random.randint(1,150,size=(50,4))

In [43]:
my_data

array([[136,  29,  45,  57],
       [ 49,  78,  67, 117],
       [ 79,  45, 123,   7],
       [ 27,  75,  10,  54],
       [ 31, 122,  74,  39],
       [ 91,  24,  87, 129],
       [139, 148,  34,  70],
       [113,  68,  53,  83],
       [146,  12,  65, 123],
       [ 82, 122, 103, 129],
       [  7, 118,  97,  32],
       [ 70, 102,  38,  21],
       [ 98,  73, 140, 139],
       [ 68, 108, 130, 128],
       [ 61, 140,  32, 106],
       [ 67, 122,  12,  51],
       [ 50, 106,  12,  41],
       [140,  42, 148,  70],
       [ 56,  35, 124,  77],
       [ 11,  82, 109, 134],
       [ 82,  62,  33,  65],
       [  5,   5, 131,  50],
       [ 69, 117,  27,  54],
       [ 93,  82,  83, 142],
       [ 48,  44,  68,  29],
       [ 67,  91, 119,  62],
       [120, 149,  33, 144],
       [ 22,  57,   3,  41],
       [143, 113,  32,  74],
       [148,  98,  34, 105],
       [100,  73,   8,  27],
       [ 83,  32,  62,  64],
       [ 37,  23, 143,  18],
       [ 13,  48,  67,  82],
       [ 31, 1

In [46]:
df = pd.DataFrame(my_data,columns=['f1','f2','f3','Label'])

In [47]:
df

Unnamed: 0,f1,f2,f3,Label
0,136,29,45,57
1,49,78,67,117
2,79,45,123,7
3,27,75,10,54
4,31,122,74,39
5,91,24,87,129
6,139,148,34,70
7,113,68,53,83
8,146,12,65,123
9,82,122,103,129


In [48]:
X = df[['f1','f2','f3']]


In [50]:
Y = df['Label']

In [52]:
from sklearn.model_selection import train_test_split

In [53]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=42)

In [54]:
X_train.shape

(35, 3)

In [55]:
X_test.shape

(15, 3)