# Data Preprocessing

In [69]:
import pandas as pd
import numpy as np

In [70]:
# Loading dataset

df = pd.read_csv("./data/clean_data.csv")

In [71]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Survived      891 non-null    int64  
 1   Pclass        891 non-null    int64  
 2   Age           714 non-null    float64
 3   Fare          891 non-null    float64
 4   Family_size   891 non-null    int64  
 5   Family_type   891 non-null    object 
 6   Sex_female    891 non-null    int64  
 7   Sex_male      891 non-null    int64  
 8   Embarked_C    891 non-null    float64
 9   Embarked_Q    891 non-null    float64
 10  Embarked_S    891 non-null    float64
 11  Embarked_nan  891 non-null    float64
dtypes: float64(6), int64(5), object(1)
memory usage: 83.7+ KB


In [72]:
df.columns

Index(['Survived', 'Pclass', 'Age', 'Fare', 'Family_size', 'Family_type',
       'Sex_female', 'Sex_male', 'Embarked_C', 'Embarked_Q', 'Embarked_S',
       'Embarked_nan'],
      dtype='object')

## Filling Missing Values in Age Column

In [73]:
# Using median to fill it, because distribution of Age column is right skewed

df['Age'] = df['Age'].fillna(df['Age'].median())

In [74]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Survived      891 non-null    int64  
 1   Pclass        891 non-null    int64  
 2   Age           891 non-null    float64
 3   Fare          891 non-null    float64
 4   Family_size   891 non-null    int64  
 5   Family_type   891 non-null    object 
 6   Sex_female    891 non-null    int64  
 7   Sex_male      891 non-null    int64  
 8   Embarked_C    891 non-null    float64
 9   Embarked_Q    891 non-null    float64
 10  Embarked_S    891 non-null    float64
 11  Embarked_nan  891 non-null    float64
dtypes: float64(6), int64(5), object(1)
memory usage: 83.7+ KB


In [75]:
df.head()

Unnamed: 0,Survived,Pclass,Age,Fare,Family_size,Family_type,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S,Embarked_nan
0,0,3,22.0,7.25,2,small,0,1,0.0,0.0,1.0,0.0
1,1,1,38.0,71.2833,2,small,1,0,1.0,0.0,0.0,0.0
2,1,3,26.0,7.925,1,alone,1,0,0.0,0.0,1.0,0.0
3,1,1,35.0,53.1,2,small,1,0,0.0,0.0,1.0,0.0
4,0,3,35.0,8.05,1,alone,0,1,0.0,0.0,1.0,0.0


## Train-test Split

In [76]:
from sklearn.model_selection import train_test_split

X = df.drop('Survived', axis=1)
y = df['Survived']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [77]:
print(f"Shape of X_train : {X_train.shape}")
print(f"Shape of X_test : {X_test.shape}")
print(f"Shape of y_train : {y_train.shape}")
print(f"Shape of y_test : {y_test.shape}")

Shape of X_train : (712, 11)
Shape of X_test : (179, 11)
Shape of y_train : (712,)
Shape of y_test : (179,)


## Feature scaling and normalization
- StandardScaler
- Normalization (MinMaxScaler)
  - MinMaxScaler was tested for learning purposes. However, StandardScaler is chosen because Fare column is highly skewed and StandardScaler handles such distributions better.

### StandardScaler

In [78]:
from sklearn.preprocessing import StandardScaler

num_cols = ['Age', 'Fare', 'Family_size']

scaler = StandardScaler()

X_train[num_cols] = scaler.fit_transform(X_train[num_cols])
X_test[num_cols] = scaler.fit_transform(X_test[num_cols])

In [79]:
X_train.sample(5)

Unnamed: 0,Pclass,Age,Fare,Family_size,Family_type,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S,Embarked_nan
90,3,-0.015704,-0.47246,-0.554666,alone,0,1,0.0,0.0,1.0,0.0
872,1,0.292016,-0.53119,-0.554666,alone,0,1,0.0,0.0,1.0,0.0
737,1,0.445876,9.237724,-0.554666,alone,0,1,1.0,0.0,0.0,0.0
556,1,1.445966,0.135053,0.040096,small,1,0,1.0,0.0,0.0,0.0
370,1,-0.323424,0.440094,0.040096,small,0,1,1.0,0.0,0.0,0.0


In [80]:
X_test.sample(5)

Unnamed: 0,Pclass,Age,Fare,Family_size,Family_type,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S,Embarked_nan
820,1,1.687093,1.596841,0.930016,small,1,0,0.0,0.0,1.0,0.0
39,3,-1.225372,-0.494257,0.159308,small,1,0,1.0,0.0,0.0,0.0
319,1,0.767367,2.639106,0.930016,small,1,0,1.0,0.0,0.0,0.0
211,2,0.384148,-0.24619,-0.611399,alone,1,0,0.0,0.0,1.0,0.0
739,3,-0.152358,-0.579313,-0.611399,alone,0,1,0.0,0.0,1.0,0.0


### Saving df and Scaler Object for further use

In [81]:
X_train.to_csv("./data/scaled_df/X_train_scaled.csv", index=False)
X_test.to_csv("./data/scaled_df/X_test_scaled.csv", index=False)

y_train.to_csv("./data/scaled_df/y_train.csv", index=False)
y_test.to_csv("./data/scaled_df/y_test.csv", index=False)

In [82]:
import joblib
joblib.dump(scaler, "./data/scaled_df/standard_scaler.pkl")

['./data/scaled_df/standard_scaler.pkl']

### Normalization (MinMaxScaler)

In [83]:
from sklearn.preprocessing import MinMaxScaler

minmax = MinMaxScaler()

X_train_minmax = minmax.fit_transform(X_train[num_cols])
X_test_minmax = minmax.transform(X_test[num_cols])

In [84]:
# Converting back to DataFrame

X_train_minmax = pd.DataFrame(
    X_train_minmax,
    columns=num_cols,
    index=X_train.index
)

X_test_minmax = pd.DataFrame(
    X_test_minmax,
    columns=num_cols,
    index=X_test.index
)

In [85]:
X_train_minmax.sample(5)

Unnamed: 0,Age,Fare,Family_size
354,0.346569,0.014102,0.0
0,0.271174,0.014151,0.1
743,0.296306,0.031425,0.1
145,0.233476,0.071731,0.2
245,0.547625,0.175668,0.2


In [86]:
X_test_minmax.sample(5)

Unnamed: 0,Age,Fare,Family_size
227,0.24292,0.003217,-0.009539
312,0.311775,0.051533,0.249626
78,-0.003333,0.059264,0.249626
344,0.436968,0.018034,-0.009539
5,0.336814,0.006331,-0.009539
