## Feature Scaling - Standardization & Normalization

In [1]:
# Import libraries
import pandas as pd
import seaborn as sns
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split

In [2]:
# load the dataset
dataset = sns.load_dataset("titanic")
dataset.head(3)

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True


In [3]:
# take only numeric value data for feature scaling
df1 = dataset[['survived',"pclass",'age','parch']]
df1.head()

Unnamed: 0,survived,pclass,age,parch
0,0,3,22.0,0
1,1,1,38.0,0
2,1,3,26.0,0
3,1,1,35.0,0
4,0,3,35.0,0


In [4]:
# check the missing value
df1.isnull().sum()

survived      0
pclass        0
age         177
parch         0
dtype: int64

In [5]:
# fill the missing value with Mean
df2 = df1.fillna(df1.mean())

In [9]:
# define independent and dependent variables
X = df2.drop("survived", axis = 1)
y = df2["survived"]
print('Shape of X = ', X.shape)
print('Shape of y = ', y.shape)

Shape of X =  (891, 3)
Shape of y =  (891,)


In [10]:
# split the data into train test
X_train, X_test, y_train, y_test =train_test_split(X, y, test_size=0.2, random_state=51)
print('Shape of X_train = ', X_train.shape)
print('Shape of y_train = ', y_train.shape)
print('Shape of X_test = ', X_test.shape)
print('Shape of y_test = ', y_test.shape)

Shape of X_train =  (712, 3)
Shape of y_train =  (712,)
Shape of X_test =  (179, 3)
Shape of y_test =  (179,)


### Apply StandardScaler

In [14]:
sc = StandardScaler()
sc.fit(X_train)

  return self.partial_fit(X, y)


StandardScaler(copy=True, with_mean=True, with_std=True)

In [12]:
sc.mean_

array([ 2.30617978, 29.55409121,  0.39185393])

In [13]:
sc.scale_

array([ 0.84405789, 12.99162985,  0.79647463])

In [15]:
X_train.describe()

Unnamed: 0,pclass,age,parch
count,712.0,712.0,712.0
mean,2.30618,29.554091,0.391854
std,0.844651,13.000763,0.797035
min,1.0,0.42,0.0
25%,1.75,22.0,0.0
50%,3.0,29.699118,0.0
75%,3.0,35.0,0.0
max,3.0,71.0,5.0


In [16]:
X_train_sc = sc.transform(X_train)
X_test_sc = sc.transform(X_test)

  """Entry point for launching an IPython kernel.
  


In [17]:
X_train_sc

array([[ 0.8220055 , -0.42751304, -0.49198545],
       [ 0.8220055 ,  1.997125  , -0.49198545],
       [ 0.8220055 , -1.42815732, -0.49198545],
       ...,
       [ 0.8220055 ,  1.41983023,  3.27461284],
       [ 0.8220055 ,  0.01116307,  0.76354731],
       [ 0.8220055 , -0.08113618, -0.49198545]])

In [18]:
X_train_sc =  pd.DataFrame(X_train_sc, columns = ["pclass",'age','parch'])
X_test_sc =  pd.DataFrame(X_test_sc, columns = ["pclass",'age','parch'])

In [19]:
X_train_sc.head()

Unnamed: 0,pclass,age,parch
0,0.822005,-0.427513,-0.491985
1,0.822005,1.997125,-0.491985
2,0.822005,-1.428157,-0.491985
3,0.822005,-0.889349,-0.491985
4,0.822005,0.011163,2.01908


In [20]:
X_train_sc.describe().round(2)

Unnamed: 0,pclass,age,parch
count,712.0,712.0,712.0
mean,0.0,0.0,0.0
std,1.0,1.0,1.0
min,-1.55,-2.24,-0.49
25%,-0.66,-0.58,-0.49
50%,0.82,0.01,-0.49
75%,0.82,0.42,-0.49
max,0.82,3.19,5.79


### Apply MinMax Scaler

In [21]:
mmc = MinMaxScaler()
mmc.fit(X_train)

  return self.partial_fit(X, y)


MinMaxScaler(copy=True, feature_range=(0, 1))

In [22]:
X_train_mmc = mmc.transform(X_train)
X_test_mmc = mmc.transform(X_test)

In [23]:
X_train_mmc

array([[1.        , 0.33408898, 0.        ],
       [1.        , 0.78039105, 0.        ],
       [1.        , 0.14990082, 0.        ],
       ...,
       [1.        , 0.67412865, 0.6       ],
       [1.        , 0.4148359 , 0.2       ],
       [1.        , 0.39784642, 0.        ]])

In [24]:
X_train_mmc =  pd.DataFrame(X_train_mmc, columns = ["pclass",'age','parch'])
X_test_mmc =  pd.DataFrame(X_test_mmc, columns = ["pclass",'age','parch'])

In [25]:
X_train_mmc.describe().round(2)

Unnamed: 0,pclass,age,parch
count,712.0,712.0,712.0
mean,0.65,0.41,0.08
std,0.42,0.18,0.16
min,0.0,0.0,0.0
25%,0.38,0.31,0.0
50%,1.0,0.41,0.0
75%,1.0,0.49,0.0
max,1.0,1.0,1.0
