In [2]:
from sklearn import preprocessing as pc
import numpy as np

In [3]:
# Build a small 3x3 float matrix to use as toy training data
x_train = np.asarray(
    [
        [1, -1, 2],
        [2, 0, 0],
        [0, 1, -1],
    ],
    dtype=float,
)
x_train

array([[ 1., -1.,  2.],
       [ 2.,  0.,  0.],
       [ 0.,  1., -1.]])

In [4]:
# Standardize each column (feature): subtract the column mean and divide by
# the column standard deviation, giving zero mean and unit std dev per column
x_scaled = pc.scale(x_train, axis=0, with_mean=True, with_std=True)
x_scaled

array([[ 0.        , -1.22474487,  1.33630621],
       [ 1.22474487,  0.        , -0.26726124],
       [-1.22474487,  1.22474487, -1.06904497]])

In [5]:
# Column-wise mean of the scaled data
x_scaled.mean(axis=0)

array([0., 0., 0.])

In [6]:
# Column-wise standard deviation of the scaled data
x_scaled.std(axis=0)

array([1., 1., 1.])

## Testing on the Pima Indians Diabetes Dataset

In [3]:
import pandas as pd
import scipy as sp
import numpy as np
from sklearn.preprocessing import MinMaxScaler

In [4]:
# MinMaxScaler example.
# Goal: rescale the data into values between 0 and 1.

# Column names are supplied explicitly, so the first CSV row is read as data.
url = 'https://raw.githubusercontent.com/jbrownlee/Datasets/master/pima-indians-diabetes.data.csv'
cols = ['preg','plas','pres','skin','test','mass','pedi','age','class']
df = pd.read_csv(url, header=None, names=cols)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
preg     768 non-null int64
plas     768 non-null int64
pres     768 non-null int64
skin     768 non-null int64
test     768 non-null int64
mass     768 non-null float64
pedi     768 non-null float64
age      768 non-null int64
class    768 non-null int64
dtypes: float64(2), int64(7)
memory usage: 54.1 KB


In [5]:
# Preview the first five rows of the dataset
df.head(n=5)

Unnamed: 0,preg,plas,pres,skin,test,mass,pedi,age,class
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [6]:
# Extract the underlying NumPy array from the DataFrame and confirm its type
arr = df.values
type(arr)

numpy.ndarray

In [8]:
# Separate the predictors (first eight columns) from the response (ninth column)
X, y = arr[:, :8], arr[:, 8]
y

array([1., 0., 1., 0., 1., 0., 1., 0., 1., 1., 0., 1., 0., 1., 1., 1., 1.,
       1., 0., 1., 0., 0., 1., 1., 1., 1., 1., 0., 0., 0., 0., 1., 0., 0.,
       0., 0., 0., 1., 1., 1., 0., 0., 0., 1., 0., 1., 0., 0., 1., 0., 0.,
       0., 0., 1., 0., 0., 1., 0., 0., 0., 0., 1., 0., 0., 1., 0., 1., 0.,
       0., 0., 1., 0., 1., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 1.,
       0., 0., 0., 1., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 1., 1., 0.,
       0., 0., 0., 0., 0., 0., 0., 1., 1., 1., 0., 0., 1., 1., 1., 0., 0.,
       0., 1., 0., 0., 0., 1., 1., 0., 0., 1., 1., 1., 1., 1., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 1.,
       0., 1., 1., 0., 0., 0., 1., 0., 0., 0., 0., 1., 1., 0., 0., 0., 0.,
       1., 1., 0., 0., 0., 1., 0., 1., 0., 1., 0., 0., 0., 0., 0., 1., 1.,
       1., 1., 1., 0., 0., 1., 1., 0., 1., 0., 1., 1., 1., 0., 0., 0., 0.,
       0., 0., 1., 1., 0., 1., 0., 0., 0., 1., 1., 1., 1., 0., 1., 1., 1.,
       1., 0., 0., 0., 0.

In [9]:
# Fit a MinMaxScaler on X and map every feature into the [0, 1] range
scaler = MinMaxScaler(feature_range=(0, 1))
scaler.fit(X)
rescaledX = scaler.transform(X)
# Show floats with three decimals from here on
np.set_printoptions(precision=3)
rescaledX[:10, :5]

array([[0.353, 0.744, 0.59 , 0.354, 0.   ],
       [0.059, 0.427, 0.541, 0.293, 0.   ],
       [0.471, 0.92 , 0.525, 0.   , 0.   ],
       [0.059, 0.447, 0.541, 0.232, 0.111],
       [0.   , 0.688, 0.328, 0.354, 0.199],
       [0.294, 0.583, 0.607, 0.   , 0.   ],
       [0.176, 0.392, 0.41 , 0.323, 0.104],
       [0.588, 0.578, 0.   , 0.   , 0.   ],
       [0.118, 0.99 , 0.574, 0.455, 0.642],
       [0.471, 0.628, 0.787, 0.   , 0.   ]])

In [11]:
# Example of StandardScaler:
# Goal: rescale every column of X to mean = 0 and stddev = 1
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
rescaledX = scaler.fit_transform(X)
# Only the last expression of a cell is rendered, so the sample has to be
# printed explicitly; as a bare expression it was silently discarded.
print(rescaledX[:10, :5])
# Per-column standard deviation of the standardized data
rescaledX.std(axis = 0)


array([1., 1., 1., 1., 1., 1., 1., 1.])

In [17]:
# Normalize the data row by row: each sample (row vector) is rescaled
# so that its magnitude becomes 1
from sklearn.preprocessing import Normalizer
scaler = Normalizer(norm='l2')
scaler.fit(X)
rescaledX = scaler.transform(X)
rescaledX[:10, :5]

array([[0.034, 0.828, 0.403, 0.196, 0.   ],
       [0.008, 0.716, 0.556, 0.244, 0.   ],
       [0.04 , 0.924, 0.323, 0.   , 0.   ],
       [0.007, 0.588, 0.436, 0.152, 0.622],
       [0.   , 0.596, 0.174, 0.152, 0.731],
       [0.035, 0.81 , 0.517, 0.   , 0.   ],
       [0.022, 0.566, 0.363, 0.232, 0.638],
       [0.081, 0.926, 0.   , 0.   , 0.   ],
       [0.003, 0.336, 0.119, 0.077, 0.925],
       [0.048, 0.749, 0.576, 0.   , 0.   ]])

In [18]:
# Check the container type of the transformed data
type(rescaledX)

numpy.ndarray

In [22]:
# Verify that every row vector has Euclidean length 1
np.linalg.norm(rescaledX, axis=1)

array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1.

In [23]:
# Binarize the data: values above the threshold become 1, all others 0
from sklearn.preprocessing import Binarizer
cutoff = 0.0
scaler = Binarizer(threshold=cutoff)
rescaledX = scaler.fit(X).transform(X)
rescaledX[:10, :5]

array([[1., 1., 1., 1., 0.],
       [1., 1., 1., 1., 0.],
       [1., 1., 1., 0., 0.],
       [1., 1., 1., 1., 1.],
       [0., 1., 1., 1., 1.],
       [1., 1., 1., 0., 0.],
       [1., 1., 1., 1., 1.],
       [1., 1., 0., 0., 0.],
       [1., 1., 1., 1., 1.],
       [1., 1., 1., 0., 0.]])

## Demonstrating Label Encoding

In [13]:
# Load the HR dataset from the local data directory and summarize its columns
hr_csv_path = 'data/HR_comma_sep.csv'
new_df = pd.read_csv(hr_csv_path)
new_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14999 entries, 0 to 14998
Data columns (total 10 columns):
satisfaction_level       14999 non-null float64
last_evaluation          14999 non-null float64
number_project           14999 non-null int64
average_montly_hours     14999 non-null int64
time_spend_company       14999 non-null int64
Work_accident            14999 non-null int64
left                     14999 non-null int64
promotion_last_5years    14999 non-null int64
sales                    14999 non-null object
salary                   14999 non-null object
dtypes: float64(2), int64(6), object(2)
memory usage: 1.1+ MB


In [14]:
# Preview the first five rows of the HR data
new_df.head(n=5)

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,sales,salary
0,0.38,0.53,2,157,3,0,1,0,sales,low
1,0.8,0.86,5,262,6,0,1,0,sales,medium
2,0.11,0.88,7,272,4,0,1,0,sales,medium
3,0.72,0.87,5,223,5,0,1,0,sales,low
4,0.37,0.52,2,159,3,0,1,0,sales,low


In [15]:
from sklearn import preprocessing as pc

In [16]:
# Learn the set of salary categories from the HR data
le = pc.LabelEncoder()
le.fit(new_df['salary'])

LabelEncoder()

In [17]:
# Categories learned by the encoder; each label's position is its integer code
# (as displayed below: high -> 0, low -> 1, medium -> 2)
le.classes_

array(['high', 'low', 'medium'], dtype=object)

In [18]:
# Store the integer code of each salary label in a new column
new_df['sal_encoded'] = le.transform(new_df['salary'])

In [19]:
# Preview the frame with the new sal_encoded column
new_df.head(n=5)

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,sales,salary,sal_encoded
0,0.38,0.53,2,157,3,0,1,0,sales,low,1
1,0.8,0.86,5,262,6,0,1,0,sales,medium,2
2,0.11,0.88,7,272,4,0,1,0,sales,medium,2
3,0.72,0.87,5,223,5,0,1,0,sales,low,1
4,0.37,0.52,2,159,3,0,1,0,sales,low,1


In [20]:
# Remove the original text column now that its encoded copy exists.
# Rebinding the name (instead of inplace=True) together with errors='ignore'
# keeps the cell re-runnable: a second run no longer raises KeyError because
# 'salary' is already gone.
new_df = new_df.drop(['salary'], axis=1, errors='ignore')
new_df.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,sales,sal_encoded
0,0.38,0.53,2,157,3,0,1,0,sales,1
1,0.8,0.86,5,262,6,0,1,0,sales,2
2,0.11,0.88,7,272,4,0,1,0,sales,2
3,0.72,0.87,5,223,5,0,1,0,sales,1
4,0.37,0.52,2,159,3,0,1,0,sales,1


In [21]:
# Map the integer codes back to the salary labels to confirm the round trip
new_df['inv_sal'] = le.inverse_transform(new_df['sal_encoded'])
new_df.head(n=5)

  if diff:


Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,sales,sal_encoded,inv_sal
0,0.38,0.53,2,157,3,0,1,0,sales,1,low
1,0.8,0.86,5,262,6,0,1,0,sales,2,medium
2,0.11,0.88,7,272,4,0,1,0,sales,2,medium
3,0.72,0.87,5,223,5,0,1,0,sales,1,low
4,0.37,0.52,2,159,3,0,1,0,sales,1,low


In [22]:
from sklearn.preprocessing import OneHotEncoder,LabelEncoder
# Fresh encoder instances for the toy car example that follows.
# Note: this rebinds `le`, replacing the encoder fitted on salary earlier.
le = LabelEncoder()
ohe = OneHotEncoder()
# Toy frame with two categorical columns (color, make) and one numeric column (year)
car_rows = [
    ['green', 'honda', 2017],
    ['red', 'maruti', 2016],
    ['blue', 'hyundai', 2015],
]
df = pd.DataFrame(car_rows, columns=['color', 'make', 'year'])
df.head()

Unnamed: 0,color,make,year
0,green,honda,2017
1,red,maruti,2016
2,blue,hyundai,2015


In [23]:
# Integer-encode the two categorical columns.
# Each column gets its own encoder: refitting a single LabelEncoder on `make`
# would overwrite the classes learned for `color`, so the color codes could no
# longer be mapped back to labels.
color_le, make_le = LabelEncoder(), LabelEncoder()
df['color_encoded'] = color_le.fit_transform(df['color'])
df['make_encoded'] = make_le.fit_transform(df['make'])
# Keep `le` pointing at the encoder fitted on `make`, as it was before
le = make_le
df.head()

Unnamed: 0,color,make,year,color_encoded,make_encoded
0,green,honda,2017,1,0
1,red,maruti,2016,2,2
2,blue,hyundai,2015,0,1


In [28]:
# One-hot encode the integer color codes (reshaped to a single column)
# and convert the result to a dense array for display
color_codes = df['color_encoded'].values.reshape(-1, 1)
ohe.fit_transform(color_codes).toarray()

array([[0., 1., 0.],
       [0., 0., 1.],
       [1., 0., 0.]])

In [31]:
# One-hot encode the color codes (x1) and then the make codes (x2), in that order
x1, x2 = (
    ohe.fit_transform(df[code_col].values.reshape(-1, 1)).toarray()
    for code_col in ('color_encoded', 'make_encoded')
)

In [32]:
# Wrap the one-hot arrays in labelled frames and append them to df.
color_cols = ['Color_%d' % i for i in range(x1.shape[1])]
make_cols = ['Make_%d' % i for i in range(x2.shape[1])]

# Reuse df's index so the concat aligns row-for-row whatever index df carries.
df_color_encode = pd.DataFrame(x1, columns=color_cols, index=df.index)
df_make_encode = pd.DataFrame(x2, columns=make_cols, index=df.index)

# Drop any one-hot columns left from a previous run first, so re-running this
# cell does not append duplicate Color_*/Make_* columns.
df = pd.concat(
    [
        df.drop(color_cols + make_cols, axis=1, errors='ignore'),
        df_color_encode,
        df_make_encode,
    ],
    axis=1,
)

In [33]:
# Show the frame without the intermediate integer-code columns.
# The result is only displayed; df itself is left unchanged.
encoded_helper_cols = ['color_encoded', 'make_encoded']
df.drop(encoded_helper_cols, axis='columns')

Unnamed: 0,color,make,year,Color_0,Color_1,Color_2,Make_0,Make_1,Make_2
0,green,honda,2017,0.0,1.0,0.0,1.0,0.0,0.0
1,red,maruti,2016,0.0,0.0,1.0,0.0,0.0,1.0
2,blue,hyundai,2015,1.0,0.0,0.0,0.0,1.0,0.0


In [30]:
# Create polynomial features for fitting higher-order polynomial curves
from sklearn.preprocessing import PolynomialFeatures
# 3x2 integer matrix holding 0..5, used as input for the polynomial expansion
x = np.arange(6).reshape((3, 2))
x

array([[0, 1],
       [2, 3],
       [4, 5]])

In [31]:
# Degree-2 expansion: each row [a, b] becomes [1, a, b, a^2, a*b, b^2]
poly = PolynomialFeatures(degree=2)
poly.fit_transform(x)

array([[ 1.,  0.,  1.,  0.,  0.,  1.],
       [ 1.,  2.,  3.,  4.,  6.,  9.],
       [ 1.,  4.,  5., 16., 20., 25.]])

In [32]:
# NOTE(review): degree=2 and interaction_only=False are the documented defaults,
# so this reproduces the previous cell's output exactly. Presumably
# interaction_only=True was meant to be contrasted here — confirm.
poly1 = PolynomialFeatures(degree=2, interaction_only=False)
poly1.fit_transform(x)

array([[ 1.,  0.,  1.,  0.,  0.,  1.],
       [ 1.,  2.,  3.,  4.,  6.,  9.],
       [ 1.,  4.,  5., 16., 20., 25.]])

## How to do Outlier Detection?
* IQR Method
* Standard Deviation Method

* The following additional methods are provided by scikit-learn and will be covered in later sessions:
  * Elliptic Envelope
  * Isolation Forest
  * Local Outlier Factor