# Scaling Data
- Standardisation
- Normalisation
- Binarisation

In [1]:
## Libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pydataset import data
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
np.set_printoptions(precision=3, suppress=True)
pd.options.display.float_format = '{:.2f}'.format
#https://pandas.pydata.org/docs/user_guide/options.html

In [2]:
# data
df1 = data('mtcars')
df1.head()

Unnamed: 0,mpg,cyl,disp,hp,drat,wt,qsec,vs,am,gear,carb
Mazda RX4,21.0,6,160.0,110,3.9,2.62,16.46,0,1,4,4
Mazda RX4 Wag,21.0,6,160.0,110,3.9,2.88,17.02,0,1,4,4
Datsun 710,22.8,4,108.0,93,3.85,2.32,18.61,1,1,4,1
Hornet 4 Drive,21.4,6,258.0,110,3.08,3.21,19.44,1,0,3,1
Hornet Sportabout,18.7,8,360.0,175,3.15,3.44,17.02,0,0,3,2


## Standardisation

In [3]:
from sklearn.preprocessing import StandardScaler
import numpy as np

In [4]:
type(df1.describe())
df1D = df1.describe()
#df1D.reset_index(inplace=True)
#df1D

In [5]:
df1D.loc[['min','max']]

Unnamed: 0,mpg,cyl,disp,hp,drat,wt,qsec,vs,am,gear,carb
min,10.4,4.0,71.1,52.0,2.76,1.51,14.5,0.0,0.0,3.0,1.0
max,33.9,8.0,472.0,335.0,4.93,5.42,22.9,1.0,1.0,5.0,8.0


In [6]:
scaler = StandardScaler()

In [7]:
scalerFit = scaler.fit(df1)
scalerFit

In [8]:
normalisedDF = scalerFit.transform(df1)
np.set_printoptions(suppress = True,  formatter = {'float_kind':'{:0.2f}'.format})
np.set_printoptions(edgeitems=2)
np.core.arrayprint._line_width = 220

edgeitems : Number of array items in summary at beginning and end of each dimension (default 3).
linewidth : The number of characters per line for the purpose of inserting line breaks (default 75).

In [9]:
normalisedDF

array([[ 0.153, -0.107, -0.58 , -0.544,  0.577, -0.62 , -0.79 , -0.882,
         1.209,  0.43 ,  0.747],
       [ 0.153, -0.107, -0.58 , -0.544,  0.577, -0.355, -0.471, -0.882,
         1.209,  0.43 ,  0.747],
       [ 0.457, -1.244, -1.006, -0.796,  0.482, -0.932,  0.433,  1.134,
         1.209,  0.43 , -1.14 ],
       [ 0.221, -0.107,  0.224, -0.544, -0.982, -0.002,  0.905,  1.134,
        -0.827, -0.947, -1.14 ],
       [-0.234,  1.031,  1.06 ,  0.42 , -0.849,  0.231, -0.471, -0.882,
        -0.827, -0.947, -0.511],
       [-0.336, -0.107, -0.047, -0.618, -1.59 ,  0.252,  1.348,  1.134,
        -0.827, -0.947, -1.14 ],
       [-0.976,  1.031,  1.06 ,  1.457, -0.735,  0.366, -1.142, -0.882,
        -0.827, -0.947,  0.747],
       [ 0.726, -1.244, -0.689, -1.255,  0.178, -0.028,  1.223,  1.134,
        -0.827,  0.43 , -0.511],
       [ 0.457, -1.244, -0.737, -0.766,  0.615, -0.07 ,  2.872,  1.134,
        -0.827,  0.43 , -0.511],
       [-0.15 , -0.107, -0.517, -0.351,  0.615,  0.231,

In [10]:
np.around(normalisedDF,2)[0:3]

array([[ 0.15, -0.11, -0.58, -0.54,  0.58, -0.62, -0.79, -0.88,  1.21,
         0.43,  0.75],
       [ 0.15, -0.11, -0.58, -0.54,  0.58, -0.36, -0.47, -0.88,  1.21,
         0.43,  0.75],
       [ 0.46, -1.24, -1.01, -0.8 ,  0.48, -0.93,  0.43,  1.13,  1.21,
         0.43, -1.14]])

In [11]:
df2D = pd.DataFrame(np.around(normalisedDF,2), columns = df1.columns)
df2D.head()

Unnamed: 0,mpg,cyl,disp,hp,drat,wt,qsec,vs,am,gear,carb
0,0.15,-0.11,-0.58,-0.54,0.58,-0.62,-0.79,-0.88,1.21,0.43,0.75
1,0.15,-0.11,-0.58,-0.54,0.58,-0.36,-0.47,-0.88,1.21,0.43,0.75
2,0.46,-1.24,-1.01,-0.8,0.48,-0.93,0.43,1.13,1.21,0.43,-1.14
3,0.22,-0.11,0.22,-0.54,-0.98,-0.0,0.9,1.13,-0.83,-0.95,-1.14
4,-0.23,1.03,1.06,0.42,-0.85,0.23,-0.47,-0.88,-0.83,-0.95,-0.51


In [12]:
df2D.describe().loc[['min','max']]

Unnamed: 0,mpg,cyl,disp,hp,drat,wt,qsec,vs,am,gear,carb
min,-1.63,-1.24,-1.31,-1.4,-1.59,-1.77,-1.9,-0.88,-0.83,-0.95,-1.14
max,2.33,1.03,1.98,2.79,2.53,2.29,2.87,1.13,1.21,1.81,3.26


## Min Max Scaling

In [13]:
from sklearn.preprocessing import MinMaxScaler
df1.describe().loc[['min','max']]

Unnamed: 0,mpg,cyl,disp,hp,drat,wt,qsec,vs,am,gear,carb
min,10.4,4.0,71.1,52.0,2.76,1.51,14.5,0.0,0.0,3.0,1.0
max,33.9,8.0,472.0,335.0,4.93,5.42,22.9,1.0,1.0,5.0,8.0


In [14]:
scalerMMS = MinMaxScaler(feature_range=(0,1))

In [15]:
scalerMMSfit = scalerMMS.fit(df1)
scalerMMSfit

In [16]:
scalerMMSfit.data_max_

array([ 33.9  ,   8.   , 472.   , 335.   ,   4.93 ,   5.424,  22.9  ,
         1.   ,   1.   ,   5.   ,   8.   ])

In [17]:
minMaxDF = scalerMMSfit.transform(df1)
minMaxDF[1:3]

array([[0.451, 0.5  , 0.222, 0.205, 0.525, 0.348, 0.3  , 0.   , 1.   ,
        0.5  , 0.429],
       [0.528, 0.   , 0.092, 0.145, 0.502, 0.206, 0.489, 1.   , 1.   ,
        0.5  , 0.   ]])

In [18]:
df2E = pd.DataFrame(np.around(minMaxDF,2), columns = df1.columns)
df2E

Unnamed: 0,mpg,cyl,disp,hp,drat,wt,qsec,vs,am,gear,carb
0,0.45,0.5,0.22,0.2,0.53,0.28,0.23,0.0,1.0,0.5,0.43
1,0.45,0.5,0.22,0.2,0.53,0.35,0.3,0.0,1.0,0.5,0.43
2,0.53,0.0,0.09,0.14,0.5,0.21,0.49,1.0,1.0,0.5,0.0
3,0.47,0.5,0.47,0.2,0.15,0.44,0.59,1.0,0.0,0.0,0.0
4,0.35,1.0,0.72,0.43,0.18,0.49,0.3,0.0,0.0,0.0,0.14
5,0.33,0.5,0.38,0.19,0.0,0.5,0.68,1.0,0.0,0.0,0.0
6,0.17,1.0,0.72,0.68,0.21,0.53,0.16,0.0,0.0,0.0,0.43
7,0.6,0.0,0.19,0.04,0.43,0.43,0.65,1.0,0.0,0.5,0.14
8,0.53,0.0,0.17,0.15,0.53,0.42,1.0,1.0,0.0,0.5,0.14
9,0.37,0.5,0.24,0.25,0.53,0.49,0.45,1.0,0.0,0.5,0.43


## output to Pandas

In [20]:
df1.head()

Unnamed: 0,mpg,cyl,disp,hp,drat,wt,qsec,vs,am,gear,carb
Mazda RX4,21.0,6,160.0,110,3.9,2.62,16.46,0,1,4,4
Mazda RX4 Wag,21.0,6,160.0,110,3.9,2.88,17.02,0,1,4,4
Datsun 710,22.8,4,108.0,93,3.85,2.32,18.61,1,1,4,1
Hornet 4 Drive,21.4,6,258.0,110,3.08,3.21,19.44,1,0,3,1
Hornet Sportabout,18.7,8,360.0,175,3.15,3.44,17.02,0,0,3,2


In [21]:
scaler = StandardScaler()
scaler.fit_transform(df1)

array([[ 0.153, -0.107, -0.58 , -0.544,  0.577, -0.62 , -0.79 , -0.882,
         1.209,  0.43 ,  0.747],
       [ 0.153, -0.107, -0.58 , -0.544,  0.577, -0.355, -0.471, -0.882,
         1.209,  0.43 ,  0.747],
       [ 0.457, -1.244, -1.006, -0.796,  0.482, -0.932,  0.433,  1.134,
         1.209,  0.43 , -1.14 ],
       [ 0.221, -0.107,  0.224, -0.544, -0.982, -0.002,  0.905,  1.134,
        -0.827, -0.947, -1.14 ],
       [-0.234,  1.031,  1.06 ,  0.42 , -0.849,  0.231, -0.471, -0.882,
        -0.827, -0.947, -0.511],
       [-0.336, -0.107, -0.047, -0.618, -1.59 ,  0.252,  1.348,  1.134,
        -0.827, -0.947, -1.14 ],
       [-0.976,  1.031,  1.06 ,  1.457, -0.735,  0.366, -1.142, -0.882,
        -0.827, -0.947,  0.747],
       [ 0.726, -1.244, -0.689, -1.255,  0.178, -0.028,  1.223,  1.134,
        -0.827,  0.43 , -0.511],
       [ 0.457, -1.244, -0.737, -0.766,  0.615, -0.07 ,  2.872,  1.134,
        -0.827,  0.43 , -0.511],
       [-0.15 , -0.107, -0.517, -0.351,  0.615,  0.231,

In [22]:
scaler = StandardScaler().set_output(transform='pandas')
scaler.fit_transform(df1)

Unnamed: 0,mpg,cyl,disp,hp,drat,wt,qsec,vs,am,gear,carb
Mazda RX4,0.15,-0.11,-0.58,-0.54,0.58,-0.62,-0.79,-0.88,1.21,0.43,0.75
Mazda RX4 Wag,0.15,-0.11,-0.58,-0.54,0.58,-0.36,-0.47,-0.88,1.21,0.43,0.75
Datsun 710,0.46,-1.24,-1.01,-0.8,0.48,-0.93,0.43,1.13,1.21,0.43,-1.14
Hornet 4 Drive,0.22,-0.11,0.22,-0.54,-0.98,-0.0,0.9,1.13,-0.83,-0.95,-1.14
Hornet Sportabout,-0.23,1.03,1.06,0.42,-0.85,0.23,-0.47,-0.88,-0.83,-0.95,-0.51
Valiant,-0.34,-0.11,-0.05,-0.62,-1.59,0.25,1.35,1.13,-0.83,-0.95,-1.14
Duster 360,-0.98,1.03,1.06,1.46,-0.73,0.37,-1.14,-0.88,-0.83,-0.95,0.75
Merc 240D,0.73,-1.24,-0.69,-1.25,0.18,-0.03,1.22,1.13,-0.83,0.43,-0.51
Merc 230,0.46,-1.24,-0.74,-0.77,0.61,-0.07,2.87,1.13,-0.83,0.43,-0.51
Merc 280,-0.15,-0.11,-0.52,-0.35,0.61,0.23,0.26,1.13,-0.83,0.43,0.75


In [31]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

In [33]:
pipeline = Pipeline([('imputer', SimpleImputer(strategy='mean')), 
                    ('scaler', StandardScaler())]).set_output(transform='pandas')
pipeline.fit_transform(df1)

Unnamed: 0,mpg,cyl,disp,hp,drat,wt,qsec,vs,am,gear,carb
Mazda RX4,0.15,-0.11,-0.58,-0.54,0.58,-0.62,-0.79,-0.88,1.21,0.43,0.75
Mazda RX4 Wag,0.15,-0.11,-0.58,-0.54,0.58,-0.36,-0.47,-0.88,1.21,0.43,0.75
Datsun 710,0.46,-1.24,-1.01,-0.8,0.48,-0.93,0.43,1.13,1.21,0.43,-1.14
Hornet 4 Drive,0.22,-0.11,0.22,-0.54,-0.98,-0.0,0.9,1.13,-0.83,-0.95,-1.14
Hornet Sportabout,-0.23,1.03,1.06,0.42,-0.85,0.23,-0.47,-0.88,-0.83,-0.95,-0.51
Valiant,-0.34,-0.11,-0.05,-0.62,-1.59,0.25,1.35,1.13,-0.83,-0.95,-1.14
Duster 360,-0.98,1.03,1.06,1.46,-0.73,0.37,-1.14,-0.88,-0.83,-0.95,0.75
Merc 240D,0.73,-1.24,-0.69,-1.25,0.18,-0.03,1.22,1.13,-0.83,0.43,-0.51
Merc 230,0.46,-1.24,-0.74,-0.77,0.61,-0.07,2.87,1.13,-0.83,0.43,-0.51
Merc 280,-0.15,-0.11,-0.52,-0.35,0.61,0.23,0.26,1.13,-0.83,0.43,0.75


## Normalizer
https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.Normalizer.html

In [19]:
from sklearn.preprocessing import Normalizer

In [110]:
scaler = Normalizer().fit(df1)
scaler

In [112]:
normalizedDF = scaler.transform(df1)
normalizedDF[1:3]

array([[0.107, 0.031, 0.815, 0.56 , 0.02 , 0.015, 0.087, 0.   , 0.005,
        0.02 , 0.02 ],
       [0.156, 0.027, 0.741, 0.638, 0.026, 0.016, 0.128, 0.007, 0.007,
        0.027, 0.007]])

In [113]:
df2F = pd.DataFrame(np.around(normalizedDF,2), columns = df1.columns)
df2F

Unnamed: 0,mpg,cyl,disp,hp,drat,wt,qsec,vs,am,gear,carb
0,0.11,0.03,0.82,0.56,0.02,0.01,0.08,0.0,0.01,0.02,0.02
1,0.11,0.03,0.82,0.56,0.02,0.01,0.09,0.0,0.01,0.02,0.02
2,0.16,0.03,0.74,0.64,0.03,0.02,0.13,0.01,0.01,0.03,0.01
3,0.08,0.02,0.91,0.39,0.01,0.01,0.07,0.0,0.0,0.01,0.0
4,0.05,0.02,0.9,0.44,0.01,0.01,0.04,0.0,0.0,0.01,0.0
5,0.07,0.02,0.9,0.42,0.01,0.01,0.08,0.0,0.0,0.01,0.0
6,0.03,0.02,0.83,0.56,0.01,0.01,0.04,0.0,0.0,0.01,0.01
7,0.15,0.02,0.9,0.38,0.02,0.02,0.12,0.01,0.0,0.02,0.01
8,0.13,0.02,0.81,0.55,0.02,0.02,0.13,0.01,0.0,0.02,0.01
9,0.09,0.03,0.8,0.59,0.02,0.02,0.09,0.0,0.0,0.02,0.02


## Binarization
https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.LabelBinarizer.html
extend these algorithms to the multi-class classification case is to use the so-called one-vs-all scheme.

In [114]:
from sklearn.preprocessing import Binarizer

In [115]:
binarizer = Binarizer(threshold=0.0).fit(df1)

In [116]:
binaryDF = binarizer.transform(df1)

In [119]:
binaryDF[0:3]

array([[1., 1., 1., 1., 1., 1., 1., 0., 1., 1., 1.],
       [1., 1., 1., 1., 1., 1., 1., 0., 1., 1., 1.],
       [1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]])

In [136]:
binarizer = Binarizer(threshold=25).fit(df1['mpg'].to_numpy().reshape(-1, 1))
binaryDF = binarizer.transform(df1['mpg'].to_numpy().reshape(-1, 1))
print(np.hstack(binaryDF),df1['mpg'].to_numpy())

[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 1. 1. 0. 0. 0. 0.
 0. 1. 1. 1. 0. 0. 0. 0.] [21.  21.  22.8 21.4 18.7 18.1 14.3 24.4 22.8 19.2 17.8 16.4 17.3 15.2
 10.4 10.4 14.7 32.4 30.4 33.9 21.5 15.5 15.2 13.3 19.2 27.3 26.  30.4
 15.8 19.7 15.  21.4]


In [140]:
np.concatenate((binaryDF, df1['mpg'].to_numpy().reshape(-1, 1)), axis=1)
#all mpg > 25 are 1, rest 0

array([[ 0. , 21. ],
       [ 0. , 21. ],
       [ 0. , 22.8],
       [ 0. , 21.4],
       [ 0. , 18.7],
       [ 0. , 18.1],
       [ 0. , 14.3],
       [ 0. , 24.4],
       [ 0. , 22.8],
       [ 0. , 19.2],
       [ 0. , 17.8],
       [ 0. , 16.4],
       [ 0. , 17.3],
       [ 0. , 15.2],
       [ 0. , 10.4],
       [ 0. , 10.4],
       [ 0. , 14.7],
       [ 1. , 32.4],
       [ 1. , 30.4],
       [ 1. , 33.9],
       [ 0. , 21.5],
       [ 0. , 15.5],
       [ 0. , 15.2],
       [ 0. , 13.3],
       [ 0. , 19.2],
       [ 1. , 27.3],
       [ 1. , 26. ],
       [ 1. , 30.4],
       [ 0. , 15.8],
       [ 0. , 19.7],
       [ 0. , 15. ],
       [ 0. , 21.4]])