# Feature Scaling

Our dataset deals with several different economic and non-economic indicators, which vary widely in magnitude, range and units. In order for our machine learning models to interpret these features on the same scale, we need to perform some form of feature scaling. This is especially important for Linear Regression, though less so for CART.

Here, we perform two methods of feature scaling: Normalization and Standardization.

Normalization means rescaling the values into a range of [0,1]. Standardization means rescaling data to have a mean of 0 and a standard deviation of 1 (unit variance).


In [1]:
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
import numpy as np
import pandas as pd
import csv

In [2]:
# Load the cleaned Asia dataset, then lift pandas' display limits so that
# every row and every column is shown when a frame is rendered.
asia_dataset = pd.read_csv('final_asia_dataset_cleaned.csv')
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', len(asia_dataset.index) + 1)

In [3]:
# Report the dataset dimensions, then set aside the first three columns
# (country, year, GDP growth) so they can be re-attached after scaling.
print(f"Shape: {asia_dataset.shape}")
Countries, Years, GDP_growth = (asia_dataset.iloc[:, pos] for pos in range(3))

Shape: (150, 23)


In [4]:
# Keep only the feature columns: the first three columns were already saved
# as Countries, Years and GDP_growth and must not be scaled.
# NOTE: this cell rebinds asia_dataset, so re-running it drops three more
# columns each time.
asia_dataset = asia_dataset.iloc[:,3:]
asia_dataset

Unnamed: 0,Market_Size,Life_Exp,Mean_Years_of_Schooling,Internet_Penetration,Gov_Indicator,Trade_Openness,Labour_Force,Domestic_credit_to_private_sector,inflation,GNI,BoT,Child_Mortality,HDI,Pol_Stability,CPI,Urban_pop,Renew_Energy,CO2_emissions,ind_val_add,unemployment_rate
0,52203.13,82.9,12.7,,1.825413,24.11097,65.93,135.840755,1.610768,50540,2.51,3.6,0.944,0.911746,77.0,0.86124,54.39985,411015667,25.38245,0.0516
1,50252.84,82.74878,12.7,,1.805214,21.8219,65.65,139.525909,1.911401,48660,0.38,3.7,0.943,0.989604,77.0,0.86012,44.443,415953947,24.15897,0.053
2,48482.65,82.5,12.7,86.54505,1.798351,21.20575,65.23,140.292665,1.948647,47160,0.59,3.7,0.941,0.896061,77.0,0.85904,40.206,415097428,23.46106,0.0559
3,47339.97,82.44878,12.6,86.54,1.816108,19.25106,64.99,142.423168,1.276991,46220,-2.28,3.8,0.939,1.047958,79.0,0.858,37.152,411031531,22.28487,0.0571
4,46288.33,82.4,12.5,84.56051,1.882711,19.98485,65.15,136.31738,1.508367,45340,-1.52,3.9,0.938,0.88498,79.0,0.85701,33.423,401554757,23.64421,0.0605
5,46880.22,82.3,12.3,84.0,1.853722,21.03946,64.86,128.506816,2.487923,45650,-0.43,4.0,0.933,1.032192,80.0,0.85602,36.112,394116892,25.51482,0.0608
6,45902.05,82.14878,12.2,83.4535,1.785226,19.95617,65.1,124.782324,2.449889,44760,-1.3,4.1,0.931,1.031073,81.0,0.85502,32.705,397943179,25.12799,0.0566
7,42826.79,82.04634,12.8,79.0,1.985249,21.48083,65.33,121.283724,1.76278,41520,-0.25,4.3,0.937,0.997997,85.0,0.85402,26.318,406506242,26.28416,0.0522
8,41965.36,81.89512,12.7,79.4877,2.04471,21.43642,65.63,122.334841,3.30385,40280,0.97,4.5,0.932,0.93571,,0.853,26.273,404172757,26.43613,0.0508
9,39301.34,81.69512,12.6,76.0,2.032045,19.81001,65.6,125.495658,2.91834,37770,-1.03,4.8,0.93,0.88886,,0.85182,21.72,405502799,25.15142,0.0521


In [5]:
# Remember the feature column labels; the scalers return bare arrays, so the
# labels are needed to rebuild a DataFrame afterwards.
names = asia_dataset.columns
names

Index(['Market_Size', 'Life_Exp', 'Mean_Years_of_Schooling',
       'Internet_Penetration', 'Gov_Indicator', 'Trade_Openness',
       'Labour_Force', 'Domestic_credit_to_private_sector', 'inflation', 'GNI',
       'BoT', 'Child_Mortality', 'HDI', 'Pol_Stability', 'CPI', 'Urban_pop',
       'Renew_Energy', 'CO2_emissions', 'ind_val_add', 'unemployment_rate'],
      dtype='object')

In [6]:
# Normalisation: min-max scale every feature column, then rebuild a labelled
# DataFrame from the resulting array.
scaler = preprocessing.MinMaxScaler()
d = scaler.fit_transform(asia_dataset)
scaled_df = pd.DataFrame(data=d, columns=names)

# Re-attach the unscaled identifier/target columns at the front, in the
# order Countries, Years, GDP_Growth.
scaled_df.insert(0, 'Countries', Countries)
scaled_df.insert(1, 'Years', Years)
scaled_df.insert(2, 'GDP_Growth', GDP_growth)
scaled_df.head()

Unnamed: 0,Countries,Years,GDP_Growth,Market_Size,Life_Exp,Mean_Years_of_Schooling,Internet_Penetration,Gov_Indicator,Trade_Openness,Labour_Force,Domestic_credit_to_private_sector,inflation,GNI,BoT,Child_Mortality,HDI,Pol_Stability,CPI,Urban_pop,Renew_Energy,CO2_emissions,ind_val_add,unemployment_rate
0,Australia,2019,2.160956,0.496973,0.930089,0.988506,,0.871433,0.11815,0.579867,0.761788,0.128265,0.546305,0.357073,0.013002,1.0,0.840946,0.815789,0.830375,0.026245,0.039175,0.263817,0.736607
1,Australia,2018,2.949286,0.477496,0.92283,0.988506,,0.866402,0.106887,0.57008,0.783206,0.143621,0.524863,0.304944,0.014184,0.997685,0.858539,0.815789,0.829006,0.02137,0.03966,0.221224,0.75744
2,Australia,2017,2.300611,0.459818,0.910887,0.988506,0.928576,0.864693,0.103855,0.5554,0.787662,0.145523,0.507755,0.310083,0.014184,0.993056,0.837402,0.815789,0.827686,0.019295,0.039576,0.196928,0.800595
3,Australia,2016,2.770652,0.448407,0.908428,0.977011,0.928521,0.869116,0.094237,0.547012,0.800045,0.111217,0.497035,0.239843,0.015366,0.988426,0.871724,0.842105,0.826414,0.0178,0.039177,0.155981,0.818452
4,Australia,2015,2.192647,0.437904,0.906087,0.965517,0.907221,0.885704,0.097848,0.552604,0.764558,0.123035,0.486998,0.258443,0.016548,0.986111,0.834899,0.842105,0.825204,0.015974,0.038247,0.203304,0.869048


In [7]:
# Save the min-max normalised frame. The original cell exported the raw
# asia_dataset, so the "normalised" workbook contained unscaled values.
scaled_df.to_excel('normalised_final_asia_dataset.xlsx')

In [8]:
# Display asia_dataset again: it still holds the raw, unscaled feature values.
asia_dataset

Unnamed: 0,Market_Size,Life_Exp,Mean_Years_of_Schooling,Internet_Penetration,Gov_Indicator,Trade_Openness,Labour_Force,Domestic_credit_to_private_sector,inflation,GNI,BoT,Child_Mortality,HDI,Pol_Stability,CPI,Urban_pop,Renew_Energy,CO2_emissions,ind_val_add,unemployment_rate
0,52203.13,82.9,12.7,,1.825413,24.11097,65.93,135.840755,1.610768,50540,2.51,3.6,0.944,0.911746,77.0,0.86124,54.39985,411015667,25.38245,0.0516
1,50252.84,82.74878,12.7,,1.805214,21.8219,65.65,139.525909,1.911401,48660,0.38,3.7,0.943,0.989604,77.0,0.86012,44.443,415953947,24.15897,0.053
2,48482.65,82.5,12.7,86.54505,1.798351,21.20575,65.23,140.292665,1.948647,47160,0.59,3.7,0.941,0.896061,77.0,0.85904,40.206,415097428,23.46106,0.0559
3,47339.97,82.44878,12.6,86.54,1.816108,19.25106,64.99,142.423168,1.276991,46220,-2.28,3.8,0.939,1.047958,79.0,0.858,37.152,411031531,22.28487,0.0571
4,46288.33,82.4,12.5,84.56051,1.882711,19.98485,65.15,136.31738,1.508367,45340,-1.52,3.9,0.938,0.88498,79.0,0.85701,33.423,401554757,23.64421,0.0605
5,46880.22,82.3,12.3,84.0,1.853722,21.03946,64.86,128.506816,2.487923,45650,-0.43,4.0,0.933,1.032192,80.0,0.85602,36.112,394116892,25.51482,0.0608
6,45902.05,82.14878,12.2,83.4535,1.785226,19.95617,65.1,124.782324,2.449889,44760,-1.3,4.1,0.931,1.031073,81.0,0.85502,32.705,397943179,25.12799,0.0566
7,42826.79,82.04634,12.8,79.0,1.985249,21.48083,65.33,121.283724,1.76278,41520,-0.25,4.3,0.937,0.997997,85.0,0.85402,26.318,406506242,26.28416,0.0522
8,41965.36,81.89512,12.7,79.4877,2.04471,21.43642,65.63,122.334841,3.30385,40280,0.97,4.5,0.932,0.93571,,0.853,26.273,404172757,26.43613,0.0508
9,39301.34,81.69512,12.6,76.0,2.032045,19.81001,65.6,125.495658,2.91834,37770,-1.03,4.8,0.93,0.88886,,0.85182,21.72,405502799,25.15142,0.0521


In [9]:
# Standardisation: scale every feature column with StandardScaler, then
# rebuild a labelled DataFrame from the resulting array.
stanscaler = StandardScaler()
names = asia_dataset.columns
d = stanscaler.fit_transform(asia_dataset)
standscaled_df = pd.DataFrame(data=d, columns=names)

# Re-attach the unscaled identifier/target columns at the front, in the
# order Countries, Years, GDP_Growth.
standscaled_df.insert(0, 'Countries', Countries)
standscaled_df.insert(1, 'Years', Years)
standscaled_df.insert(2, 'GDP_Growth', GDP_growth)
standscaled_df.head()

Unnamed: 0,Countries,Years,GDP_Growth,Market_Size,Life_Exp,Mean_Years_of_Schooling,Internet_Penetration,Gov_Indicator,Trade_Openness,Labour_Force,Domestic_credit_to_private_sector,inflation,GNI,BoT,Child_Mortality,HDI,Pol_Stability,CPI,Urban_pop,Renew_Energy,CO2_emissions,ind_val_add,unemployment_rate
0,Australia,2019,2.160956,1.370287,1.42351,1.489049,,1.468124,-0.425862,0.32186,0.935875,-0.724272,1.401842,0.207444,-0.909408,1.543124,1.089711,1.351209,1.24002,-0.244575,-0.251599,-0.72998,0.991597
1,Australia,2018,2.949286,1.28419,1.397222,1.489049,,1.450416,-0.477069,0.283421,1.005924,-0.629395,1.313894,-0.041948,-0.904796,1.535557,1.160104,1.351209,1.235581,-0.272436,-0.24952,-0.903664,1.077602
2,Australia,2017,2.300611,1.206044,1.353974,1.489049,1.367457,1.4444,-0.490852,0.225763,1.020498,-0.61764,1.243722,-0.01736,-0.904796,1.520422,1.075529,1.351209,1.2313,-0.284292,-0.249881,-1.002738,1.255756
3,Australia,2016,2.770652,1.155599,1.34507,1.451332,1.36729,1.459967,-0.534578,0.192816,1.060996,-0.829608,1.199748,-0.353394,-0.900184,1.505287,1.212863,1.443128,1.227179,-0.292838,-0.251592,-1.169709,1.329474
4,Australia,2015,2.192647,1.109174,1.33659,1.413616,1.301727,1.518354,-0.518163,0.214781,0.944935,-0.756588,1.158581,-0.26441,-0.895572,1.497719,1.065511,1.443128,1.223255,-0.303272,-0.255581,-0.976738,1.538344


In [10]:
# Save the standardised frame. The original cell exported the raw
# asia_dataset, so the "standardised" workbook contained unscaled values.
standscaled_df.to_excel('standardised_final_asia_dataset.xlsx')