In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline


In [2]:
df = pd.DataFrame({
    'Income': [15000, 1800, 120000, 10000],
    'Age': [25, 18, 42, 51],
    'Department': ['HR','Legal','Marketing','Management']
})

In [3]:
df.head()

Unnamed: 0,Income,Age,Department
0,15000,25,HR
1,1800,18,Legal
2,120000,42,Marketing
3,10000,51,Management


In [4]:
df_scaled = df.copy()
col_names = ['Income', 'Age']
features = df_scaled[col_names]

In [5]:
features

Unnamed: 0,Income,Age
0,15000,25
1,1800,18
2,120000,42
3,10000,51


In [41]:
#MINMAX SCALING (x-xmin/xmax-xmin)
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
features = df_scaled[col_names]

In [43]:
features = scaler.fit_transform(features.values)

In [8]:
features

array([[0.11167513, 0.21212121],
       [0.        , 0.        ],
       [1.        , 0.72727273],
       [0.06937394, 1.        ]])

In [9]:
df_scaled

Unnamed: 0,Income,Age,Department
0,15000,25,HR
1,1800,18,Legal
2,120000,42,Marketing
3,10000,51,Management


In [None]:
#If we want to change the range from 0 to 1
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler(feature_range=(5,10))
df_scaled[col_names]= scaler.fit_transform(df_scaled[col_names].values)
df_scaled

In [53]:
#Standard Scaler(x-mean/std)
from sklearn.preprocessing import StandardScaler
scaler=StandardScaler()
df_scaled[col_names]= scaler.fit_transform(df_scaled[col_names].values)
df_scaled#Mean turns out to be 0 for the coloumns and std close to 1)

Unnamed: 0,Income,Age,Department
0,-0.449056,-0.685248,HR
1,-0.722214,-1.218219,Legal
2,1.723796,0.60911,Marketing
3,-0.552525,1.294358,Management


In [61]:
#MaxAbsScaler
from sklearn.preprocessing import MaxAbsScaler
scaler=MaxAbsScaler()
df_scaled = df.copy()
df_scaled[col_names]=scaler.fit_transform(df_scaled[col_names].values)
df_scaled

Unnamed: 0,Income,Age,Department
0,0.125,0.490196,HR
1,0.015,0.352941,Legal
2,1.0,0.823529,Marketing
3,0.083333,1.0,Management


In [63]:
#Robust Scaler#x_scaled=x-xmedian/75%ile-25%ile
from sklearn.preprocessing import RobustScaler
scaler=RobustScaler()
df_scaled = df.copy()
df_scaled[col_names]=scaler.fit_transform(df_scaled[col_names].values)
df_scaled

Unnamed: 0,Income,Age,Department
0,0.075075,-0.404762,HR
1,-0.321321,-0.738095,Legal
2,3.228228,0.404762,Marketing
3,-0.075075,0.833333,Management


In [65]:
#Quantile Tranformer Scaler:
#1. It computes the cumulative distribution function of the variable
#2. It uses this cdf to map the values to a normal distribution
#3. Maps the obtained values to the desired output distribution using the associated quantile function
#Best for non lineardata
from sklearn.preprocessing import QuantileTransformer
scaler=QuantileTransformer()
df_scaled = df.copy()
df_scaled[col_names]=scaler.fit_transform(df_scaled[col_names].values)
df_scaled



Unnamed: 0,Income,Age,Department
0,0.666667,0.333333,HR
1,0.0,0.0,Legal
2,1.0,0.666667,Marketing
3,0.333333,1.0,Management


In [67]:
#Log Transform
df['log_income'] = np.log(df['Income'])

In [69]:
df

Unnamed: 0,Income,Age,Department,log_income
0,15000,25,HR,9.615805
1,1800,18,Legal,7.495542
2,120000,42,Marketing,11.695247
3,10000,51,Management,9.21034


In [73]:
#While income ranged from 1800 to 120000 the log ranged only from 7 to 11.
#Using log reduced the impact oftoolarge/too small data values
#We need to ensure that no negative values are present

In [77]:
#PowerTranform Scaler
#Changes distribution ofvariable to makeit more gaussian
#Using the box_cox tranformation
from sklearn.preprocessing import PowerTransformer
scaler = PowerTransformer(method = 'box-cox')
df_scaled=df.copy()
df_scaled[col_names] = scaler.fit_transform(features.values)
df_scaled

Unnamed: 0,Income,Age,Department,log_income
0,-0.090548,-0.624107,HR,9.615805
1,-1.107316,-1.279451,Legal,7.495542
2,1.610914,0.667412,Marketing,11.695247
3,-0.41305,1.236146,Management,9.21034


In [None]:
#Unit Vector Scaler/Normalizer
#It works on rows
#If we are using L1 norm, the values in each column are converted so that the sum of their absolute values along the row = 1
#If we are using L2 norm, the values in each column are first squared and added so that the sum of their absolute values along the row = 1
#The Normalizer also converts the values between 0 and 1, and between -1 to 1 when there are negative values in our data.

In [79]:
from sklearn.preprocessing import Normalizer
scaler = Normalizer(norm = 'l2')#default
df_scaled=df.copy()
df_scaled[col_names] = scaler.fit_transform(features.values)
df_scaled

Unnamed: 0,Income,Age,Department,log_income
0,0.675911,0.736983,HR,9.615805
1,0.707107,0.707107,Legal,7.495542
2,0.756823,0.65362,Marketing,11.695247
3,0.471517,0.881857,Management,9.21034


In [81]:
from sklearn.preprocessing import Normalizer
scaler = Normalizer(norm = 'l1')
df_scaled[col_names] = scaler.fit_transform(features.values)
df_scaled

Unnamed: 0,Income,Age,Department,log_income
0,0.478388,0.521612,HR,9.615805
1,0.5,0.5,Legal,7.495542
2,0.536585,0.463415,Marketing,11.695247
3,0.348401,0.651599,Management,9.21034


In [83]:
#Custom Transformer-our own function to transform values
from sklearn.preprocessing import FunctionTransformer
transformer = FunctionTransformer(np.log2, validate = True)
df_scaled=df.copy()
df_scaled[col_names] = transformer.transform(features.values)
df_scaled

Unnamed: 0,Income,Age,Department,log_income
0,2.474663,2.599462,HR,9.615805
1,2.321928,2.321928,Legal,7.495542
2,3.321928,3.110424,Marketing,11.695247
3,2.418695,3.321928,Management,9.21034


In [None]:
#End of Transformations & Scaling#