In [2]:
from sklearn import preprocessing as pc
import numpy as np

In [3]:
# Toy 3x3 training matrix; dtype=float stores the integer literals as floats.
x_train = np.array(
    [
        [1, -1, 2],
        [2, 0, 0],
        [0, 1, -1],
    ],
    dtype=float,
)
x_train

array([[ 1., -1.,  2.],
       [ 2.,  0.,  0.],
       [ 0.,  1., -1.]])

In [4]:
# Standardize x_train column by column so that each feature ends up with
# zero mean and unit standard deviation (axis=0 is spelled out here; it is
# also the default of scale()).
x_scaled = pc.scale(x_train, axis=0)
x_scaled

array([[ 0.        , -1.22474487,  1.33630621],
       [ 1.22474487,  0.        , -0.26726124],
       [-1.22474487,  1.22474487, -1.06904497]])

In [5]:
# Sanity check: mean of every column of the scaled data.
x_scaled.mean(axis=0)

array([0., 0., 0.])

In [6]:
# Sanity check: standard deviation of every column of the scaled data.
x_scaled.std(axis=0)

array([1., 1., 1.])

## Feature scaling on the Pima Indians Diabetes dataset

In [7]:
import pandas as pd
import scipy as sp
import numpy as np
from sklearn.preprocessing import MinMaxScaler

In [8]:
# Load the Pima Indians diabetes CSV. Column names are supplied explicitly
# and the file is read without a header row. This frame is the input for the
# MinMaxScaler example, whose goal is to map values into the range 0 to 1.
url = 'https://raw.githubusercontent.com/jbrownlee/Datasets/master/pima-indians-diabetes.data.csv'
cols = [
    'preg', 'plas', 'pres', 'skin', 'test',
    'mass', 'pedi', 'age', 'class',
]
df = pd.read_csv(url, header=None, names=cols)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
preg     768 non-null int64
plas     768 non-null int64
pres     768 non-null int64
skin     768 non-null int64
test     768 non-null int64
mass     768 non-null float64
pedi     768 non-null float64
age      768 non-null int64
class    768 non-null int64
dtypes: float64(2), int64(7)
memory usage: 54.1 KB


In [9]:
# Peek at the first five rows of the loaded frame.
df.head(n=5)

Unnamed: 0,preg,plas,pres,skin,test,mass,pedi,age,class
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [10]:
# Pull the frame's contents out as a plain NumPy array and confirm its type.
arr = np.asarray(df)
type(arr)

numpy.ndarray

In [11]:
# Separate predictors from the response: the first eight columns are the
# features, the ninth column (index 8, 'class') is the target.
X, y = arr[:, :8], arr[:, 8]

In [13]:
# Print arrays with three decimals from here on (global NumPy print setting).
np.set_printoptions(precision=3)

# Fit a MinMaxScaler with a (0, 1) target range on X, then rescale X with it.
scaler = MinMaxScaler(feature_range=(0, 1))
rescaledX = scaler.fit(X).transform(X)

# Preview: first 10 rows, first 5 columns.
rescaledX[:10, :5]

array([[0.353, 0.744, 0.59 , 0.354, 0.   ],
       [0.059, 0.427, 0.541, 0.293, 0.   ],
       [0.471, 0.92 , 0.525, 0.   , 0.   ],
       [0.059, 0.447, 0.541, 0.232, 0.111],
       [0.   , 0.688, 0.328, 0.354, 0.199],
       [0.294, 0.583, 0.607, 0.   , 0.   ],
       [0.176, 0.392, 0.41 , 0.323, 0.104],
       [0.588, 0.578, 0.   , 0.   , 0.   ],
       [0.118, 0.99 , 0.574, 0.455, 0.642],
       [0.471, 0.628, 0.787, 0.   , 0.   ]])

In [15]:
# StandardScaler example.
# Goal: transform each feature to have mean = 0 and stddev = 1.
# (rescaledX.mean(axis=0) can be evaluated afterwards to inspect the means.)
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
rescaledX = scaler.fit(X).transform(X)
rescaledX[:10, :5]  # preview: first 10 rows, first 5 columns


array([[ 0.64 ,  0.848,  0.15 ,  0.907, -0.693],
       [-0.845, -1.123, -0.161,  0.531, -0.693],
       [ 1.234,  1.944, -0.264, -1.288, -0.693],
       [-0.845, -0.998, -0.161,  0.155,  0.123],
       [-1.142,  0.504, -1.505,  0.907,  0.766],
       [ 0.343, -0.153,  0.253, -1.288, -0.693],
       [-0.251, -1.342, -0.988,  0.719,  0.071],
       [ 1.828, -0.184, -3.573, -1.288, -0.693],
       [-0.548,  2.382,  0.046,  1.535,  4.022],
       [ 1.234,  0.128,  1.39 , -1.288, -0.693]])

In [17]:
# Normalizer example: rescale each sample (row) of X so that the resulting
# row vector has a magnitude of 1.
from sklearn.preprocessing import Normalizer

scaler = Normalizer()
rescaledX = scaler.fit(X).transform(X)
rescaledX[:10, :5]  # preview: first 10 rows, first 5 columns

array([[0.034, 0.828, 0.403, 0.196, 0.   ],
       [0.008, 0.716, 0.556, 0.244, 0.   ],
       [0.04 , 0.924, 0.323, 0.   , 0.   ],
       [0.007, 0.588, 0.436, 0.152, 0.622],
       [0.   , 0.596, 0.174, 0.152, 0.731],
       [0.035, 0.81 , 0.517, 0.   , 0.   ],
       [0.022, 0.566, 0.363, 0.232, 0.638],
       [0.081, 0.926, 0.   , 0.   , 0.   ],
       [0.003, 0.336, 0.119, 0.077, 0.925],
       [0.048, 0.749, 0.576, 0.   , 0.   ]])

In [18]:
# Inspect the container type of the transformed data.
type(rescaledX)

numpy.ndarray

In [22]:
# Verify the normalization: the Euclidean (L2) norm of every row should be 1.
np.linalg.norm(rescaledX, axis=1)

array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1.