**************************************************************************************
NORMALIZATION USING MIN MAX SCALER FROM SK LEARN
(scales down to a scale of 0-1)

Normalization is used when features are on very different scales. For example, `DiabetesPedigreeFunction` in this dataset has values like 0.627 and 0.351, while `Glucose` has values like 148 and 183. Because their scales differ so much, it's better to bring them onto the same scale using normalization.

Formula: $X_{\text{scaled}} = \dfrac{X - X_{\min}}{X_{\max} - X_{\min}}$

*****************************************************************************************


In [8]:
from sklearn.preprocessing import MinMaxScaler
import pandas as pd

In [9]:
# Read the diabetes CSV into `df` and preview its first rows.
# NOTE(review): this is an absolute, machine-specific path; the cell only
# runs where this exact file exists.
csv_path = r"C:\Users\anany\Downloads\archive (2)\diabetes.csv"
df = pd.read_csv(csv_path)
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [10]:
# Columns to rescale; the 'Outcome' column is not included.
features = [
    'Pregnancies',
    'Glucose',
    'BloodPressure',
    'SkinThickness',
    'Insulin',
    'BMI',
    'DiabetesPedigreeFunction',
    'Age',
]

# Min-max scaling: fit on the selected columns and transform them in one call.
# The bare last expression is what the cell displays.
scale = MinMaxScaler()
scale.fit_transform(df.loc[:, features])


array([[0.35294118, 0.74371859, 0.59016393, ..., 0.50074516, 0.23441503,
        0.48333333],
       [0.05882353, 0.42713568, 0.54098361, ..., 0.39642325, 0.11656704,
        0.16666667],
       [0.47058824, 0.91959799, 0.52459016, ..., 0.34724292, 0.25362938,
        0.18333333],
       ...,
       [0.29411765, 0.6080402 , 0.59016393, ..., 0.390462  , 0.07130658,
        0.15      ],
       [0.05882353, 0.63316583, 0.49180328, ..., 0.4485842 , 0.11571307,
        0.43333333],
       [0.05882353, 0.46733668, 0.57377049, ..., 0.45305514, 0.10119556,
        0.03333333]])

*****************************************************************************************************************
STANDARDIZATION USING SK LEARN
(rescales each feature so that it has mean = 0 and standard deviation = 1, like a standard normal distribution; it does not change the shape of the feature's distribution)

Very useful in situations where the scaled values do not have to stay within 0-1 (standardized values are unbounded and can be negative), for example, images represented as pixels whose values range from 0 to 255.

formula: z = (x - u) / s

z = scaled data.
x = to be scaled data.
u = the mean of the training samples
s = the standard deviation of the training samples.

*****************************************************************************************************************


In [11]:
from sklearn.preprocessing import StandardScaler
import pandas as pd

In [12]:
# Read the diabetes CSV into `df` and preview its first five rows.
# NOTE(review): this is an absolute, machine-specific path; the cell only
# runs where this exact file exists.
dp = r"C:\Users\anany\Downloads\archive (2)\diabetes.csv"
df = pd.read_csv(filepath_or_buffer=dp)
df.head(n=5)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [13]:
# Columns to rescale; the 'Outcome' column is not included.
features = [
    'Pregnancies',
    'Glucose',
    'BloodPressure',
    'SkinThickness',
    'Insulin',
    'BMI',
    'DiabetesPedigreeFunction',
    'Age',
]

# Standardization: fit on the selected columns and transform them in one call.
# The bare last expression is what the cell displays.
scale = StandardScaler()
scale.fit_transform(df.loc[:, features])


array([[ 0.63994726,  0.84832379,  0.14964075, ...,  0.20401277,
         0.46849198,  1.4259954 ],
       [-0.84488505, -1.12339636, -0.16054575, ..., -0.68442195,
        -0.36506078, -0.19067191],
       [ 1.23388019,  1.94372388, -0.26394125, ..., -1.10325546,
         0.60439732, -0.10558415],
       ...,
       [ 0.3429808 ,  0.00330087,  0.14964075, ..., -0.73518964,
        -0.68519336, -0.27575966],
       [-0.84488505,  0.1597866 , -0.47073225, ..., -0.24020459,
        -0.37110101,  1.17073215],
       [-0.84488505, -0.8730192 ,  0.04624525, ..., -0.20212881,
        -0.47378505, -0.87137393]])