In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [5]:
# Load the first 30,000 rows of the three columns of interest and discard
# any row that contains a missing value.
data = pd.read_csv(
    'loan.csv',
    usecols=['loan_amnt', 'int_rate', 'installment'],
    nrows=30000,
).dropna()

In [6]:
# Preview the first ten rows to sanity-check the loaded columns.
data.head(10)

Unnamed: 0,loan_amnt,int_rate,installment
0,2500,13.56,84.92
1,30000,18.94,777.23
2,5000,17.97,180.69
3,4000,18.94,146.51
4,30000,16.14,731.78
5,5550,15.02,192.45
6,2000,17.97,72.28
7,6000,13.56,203.79
8,5000,17.97,180.69
9,6000,14.47,206.44


In [7]:
# Summary statistics per column; note how different the raw scales are
# (compare the min/max of loan_amnt with those of int_rate).
data.describe()

Unnamed: 0,loan_amnt,int_rate,installment
count,30000.0,30000.0,30000.0
mean,15941.94,12.948691,461.282355
std,10257.787699,4.880157,287.407671
min,1000.0,6.0,30.64
25%,8000.0,8.81,248.4
50%,13800.0,11.8,380.66
75%,22000.0,16.14,622.7
max,40000.0,30.84,1618.24


## Standardization (Standard Scaler):
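Each feature is rescaled so that the result has $\mu = 0$ and $\sigma = 1$:

$$Z = \frac{x - \mu}{\sigma}$$

where, in the formula, $\mu$ and $\sigma$ are the mean and standard deviation of the original feature $x$.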

In [8]:
from sklearn.preprocessing import StandardScaler
# Assumes each feature is normally distributed (or close to it)

In [9]:
# Learn the per-column statistics first, then apply the fitted scaler
# to the same frame.
scaler = StandardScaler().fit(data)
data_scaled = scaler.transform(data)

In [10]:
# Column-wise mean (first line) and standard deviation (second line)
# of the scaled data.
print(data_scaled.mean(axis=0), data_scaled.std(axis=0), sep='\n')

[-8.71599089e-17  1.77635684e-18  3.03164901e-17]
[1. 1. 1.]


In [11]:
# Smallest scaled value in each column.
print('Min values (Loan Amount, Int Rate, and Installment): ', np.min(data_scaled, axis=0))

Min values (Loan Amount, Int Rate, and Installment):  [-1.4566678  -1.42389012 -1.49839262]


In [12]:
# Largest scaled value in each column.
print('Max values (Loan Amount, Int Rate, and Installment): ', np.max(data_scaled, axis=0))

Max values (Loan Amount, Int Rate, and Installment):  [2.34538496 3.66619529 4.02556036]


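However, the min and max values still differ from column to column, depending on how spread out each variable was to begin with. The large positive maxima (up to about 4 standard deviations above the mean) suggest that there are outliers.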

## Normalization (Min-Max Scaler)

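In this approach, the data is scaled to a fixed range — usually 0 to 1.
In contrast to standardization, the cost of having this bounded range is that we end up with smaller standard deviations, which can suppress the effect of outliers. The Min-Max Scaler is therefore sensitive to outliers: the extreme values define $X_{min}$ and $X_{max}$, so a single outlier squeezes all the other observations into a narrow part of the range.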

$$X_{norm} = \frac{X - X_{min}}{X_{max} - X_{min}}$$


In [13]:
from sklearn.preprocessing import MinMaxScaler

In [14]:
# Learn each column's min and max, then map the frame onto the
# scaler's default output range.
scaler = MinMaxScaler().fit(data)
data_scaled = scaler.transform(data)

In [15]:
# Column-wise mean and standard deviation of the min-max scaled data.
print('means (Loan Amount, Int rate and Installment): ', np.mean(data_scaled, axis=0))
print('std (Loan Amount, Int rate and Installment): ', np.std(data_scaled, axis=0))

means (Loan Amount, Int rate and Installment):  [0.38312667 0.27973796 0.27125369]
std (Loan Amount, Int rate and Installment):  [0.26301581 0.19646036 0.18102978]


In [16]:
# Column-wise extremes of the min-max scaled data.
print('Min (Loan Amount, Int rate and Installment): ', np.min(data_scaled, axis=0))
print('Max (Loan Amount, Int rate and Installment): ', np.max(data_scaled, axis=0))

Min (Loan Amount, Int rate and Installment):  [0. 0. 0.]
Max (Loan Amount, Int rate and Installment):  [1. 1. 1.]


## Robust Scaler (Scaling to median and quantiles):
$IQR = Q_{75} - Q_{25}$ (75th quantile minus 25th quantile)

$$X_{scaled} = \frac{X - \mathrm{median}(X)}{IQR}$$

In [17]:
from sklearn.preprocessing import RobustScaler


In [18]:
# Learn each column's median and inter-quantile range, then apply the
# fitted scaler to the same frame.
scaler = RobustScaler().fit(data)
data_scaled = scaler.transform(data)

In [19]:
# Column-wise mean and standard deviation of the robust-scaled data.
print('means (Loan Amount, Int rate and Installment): ', np.mean(data_scaled, axis=0))
print('std (Loan Amount, Int rate and Installment): ', np.std(data_scaled, axis=0))

means (Loan Amount, Int rate and Installment):  [0.15299571 0.15671091 0.21539502]
std (Loan Amount, Int rate and Installment):  [0.73268691 0.66576743 0.76784099]


In [20]:
# Column-wise extremes of the robust-scaled data.
print('Min (Loan Amount, Int rate and Installment): ', np.min(data_scaled, axis=0))
print('Max (Loan Amount, Int rate and Installment): ', np.max(data_scaled, axis=0))

Min (Loan Amount, Int rate and Installment):  [-0.91428571 -0.79126876 -0.93513225]
Max (Loan Amount, Int rate and Installment):  [1.87142857 2.59754434 3.30638525]
