In [27]:
#importing libraries
import pandas as pd
import numpy as np

In [2]:
# Read train_bm.csv into a DataFrame; the path is relative, so the file must be in the
# notebook's working directory.
bigmart = pd.read_csv('train_bm.csv')

In [3]:
# Keep every row but only the two numeric features that the scalers will be applied to.
data = bigmart.loc[:, ['Item_Visibility','Item_MRP']]
data.head()

Unnamed: 0,Item_Visibility,Item_MRP
0,0.016047,249.8092
1,0.019278,48.2692
2,0.01676,141.618
3,0.0,182.095
4,0.0,53.8614


**We select only the two columns shown above (`Item_Visibility` and `Item_MRP`) for scaling.**

In this notebook we will see how MinMaxScaler and StandardScaler work.

## MinMaxScaler
$x_{new} = \dfrac{X_i - \min(X)}{\max(X) - \min(X)}$  
Here, $\max(X)$ and $\min(X)$ are the maximum and minimum values, respectively, of the feature being scaled.

In [4]:
from sklearn.preprocessing import MinMaxScaler

In [7]:
scaler = MinMaxScaler()
# `data` itself is left unchanged because the StandardScaler section reuses the same frame.
# NOTE: fit() only learns each column's minimum and maximum and returns the fitted scaler,
# so despite its name `scaled_data` holds the scaler object here, not scaled values;
# the actual scaling happens in the later transform() cell.
scaled_data = scaler.fit(data) #Computes the minimum and maximum to be used for later scaling.

In [15]:
scaled_data

MinMaxScaler(copy=True, feature_range=(0, 1))

In [16]:
# transform() applies the min/max learned by fit(); this rebinds `scaled_data` from the
# fitted scaler object to the array of scaled values.
scaled_data = scaler.transform(data)

In [17]:
print(scaled_data)

[[0.04886645 0.92750715]
 [0.05870508 0.0720684 ]
 [0.05103696 0.46828841]
 ...
 [0.10714751 0.22849221]
 [0.44221878 0.30493925]
 [0.13666114 0.18750976]]


Both columns have now been rescaled so that their values lie between 0 and 1.

In [18]:
# Wrap the scaled NumPy array back into a DataFrame. The column labels and row index are
# taken from `data` (the frame that was scaled) instead of being re-typed, so the scaled
# frame stays aligned with its source even if the selected features or index change.
scaled_data = pd.DataFrame(scaled_data, columns=data.columns, index=data.index)

In [19]:
scaled_data.head()

Unnamed: 0,Item_Visibility,Item_MRP
0,0.048866,0.927507
1,0.058705,0.072068
2,0.051037,0.468288
3,0.0,0.640093
4,0.0,0.095805


In [20]:
scaled_data.describe()

Unnamed: 0,Item_Visibility,Item_MRP
count,8523.0,8523.0
mean,0.201382,0.465635
std,0.157123,0.264327
min,0.0,0.0
25%,0.082187,0.265437
50%,0.164228,0.474209
75%,0.288026,0.655156
max,1.0,1.0


The min is 0 and the max is 1 because MinMaxScaler, with its default `feature_range=(0, 1)`, rescales each feature to lie between 0 and 1.

## Standard Scaler
$x_{new} = \dfrac{X_i - \text{mean}(X)}{\text{std}(X)}$  
Here, the mean and std (standard deviation) are computed over the entire feature (column).

In [21]:
from sklearn.preprocessing import StandardScaler

In [22]:
# Note: `scaler` and `scaled_data` are reused from the MinMaxScaler section, so after this
# cell runs those names no longer refer to the MinMaxScaler or its results.
scaler = StandardScaler()
# fit_transform() learns each column's mean and standard deviation and applies the scaling in one call.
scaled_data = scaler.fit_transform(data)

**With MinMaxScaler we called fit() and transform() separately to see how each works on its own**  
 - fit_transform() does the same job in one step  
 - Here it computes the mean and standard deviation of each feature in the fit stage, then applies the transformation to every observation of that feature  

In [23]:
scaled_data

array([[-0.97073217,  1.74745381],
       [-0.90811123, -1.48902325],
       [-0.95691733,  0.01004021],
       ...,
       [-0.59978449, -0.89720755],
       [ 1.53287976, -0.60797692],
       [-0.41193591, -1.05226104]])

In [24]:
# Wrap the standardized NumPy array back into a DataFrame. The column labels and row index
# are taken from `data` (the frame that was scaled) instead of being re-typed, so the scaled
# frame stays aligned with its source even if the selected features or index change.
scaled_data = pd.DataFrame(scaled_data, columns=data.columns, index=data.index)

In [25]:
scaled_data.head()

Unnamed: 0,Item_Visibility,Item_MRP
0,-0.970732,1.747454
1,-0.908111,-1.489023
2,-0.956917,0.01004
3,-1.281758,0.66005
4,-1.281758,-1.39922


In [26]:
scaled_data.describe()

Unnamed: 0,Item_Visibility,Item_MRP
count,8523.0,8523.0
mean,1.822365e-16,-1.637654e-16
std,1.000059,1.000059
min,-1.281758,-1.761688
25%,-0.7586531,-0.7574307
50%,-0.2364792,0.03243893
75%,0.5514755,0.7170372
max,5.08305,2.021724


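 **Observations:**  
 - **The means are 1.822 * 10^-16 and -1.638 * 10^-16, which are floating-point rounding noise and effectively 0**   
 - **Standard deviation is ~1** (it shows as 1.000059 rather than exactly 1 because `describe()` reports the sample standard deviation, dividing by n - 1, while StandardScaler divides by n; the ratio is sqrt(8523/8522) ≈ 1.000059)  
 This is exactly what we call standardization of data: each feature ends up with mean 0 and standard deviation 1.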