## Titanic: Preprocessing

### Importing libraries

In [None]:
from IPython.display import HTML, display
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd

### Loading the dataset

In [None]:
# Read the raw CSV into the working DataFrame used by the rest of the notebook.
df = pd.read_csv(filepath_or_buffer='./data/raw_data.csv')

In [None]:
# Peek at the first five rows of the freshly loaded data.
df.head(n=5)

### Visualizing features

In [None]:
# Summary statistics of the numeric columns, transposed so each feature is a row.
numeric_summary = df.select_dtypes(include=np.number).describe()
numeric_summary.transpose()

We can safely drop the ***PassengerId*** column: it is only a row identifier.

In [None]:
# Summary of the object-dtype columns (count / unique / top / freq), one row per feature.
object_summary = df.select_dtypes(include=object).describe()
object_summary.transpose()

Judging from these statistics alone, we will also drop the ***Name***, ***Ticket*** and ***Cabin*** columns.

#### Handling NaNs

In [None]:
# Impute missing ages with the mean age.
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on the df['Age'] selection: that chained in-place form is
# deprecated in pandas 2.x and does not update `df` under Copy-on-Write.
# NOTE(review): the mean is computed over the whole dataset, i.e. before the
# train/test split performed later in this notebook.
df['Age'] = df['Age'].fillna(df['Age'].mean())

#### Sex Column

In [None]:
# Sex breakdown among the rows where Survived == 0.
survived_is_0 = df['Survived'].eq(0)
df.loc[survived_is_0, 'Sex'].value_counts()

In [None]:
# Sex breakdown among the rows where Survived == 1.
survived_is_1 = df['Survived'].eq(1)
df.loc[survived_is_1, 'Sex'].value_counts()

In [None]:
# Encode Sex as an integer flag: male -> 0, female -> 1.
# Series.map sends any value that is not a key of the mapping to NaN, so
# re-running this cell on the already-encoded column would blank it out.
sex_mapping = dict(male=0, female=1)
df['Sex'] = df['Sex'].map(sex_mapping)

#### Embarked Column

In [None]:
# Per-port share of rows with Survived == 0: the count among those rows divided
# by the overall count for each Embarked value (division aligns on the label).
embarked_totals = df['Embarked'].value_counts()
embarked_survived_0 = df.loc[df['Survived'] == 0, 'Embarked'].value_counts()
embarked_survived_0 / embarked_totals

In [None]:
# Per-port share of rows with Survived == 1: the count among those rows divided
# by the overall count for each Embarked value (division aligns on the label).
embarked_totals = df['Embarked'].value_counts()
embarked_survived_1 = df.loc[df['Survived'] == 1, 'Embarked'].value_counts()
embarked_survived_1 / embarked_totals

In [None]:
# One-hot encode Embarked as integer 0/1 columns, then drop Embarked_Q so that
# only Embarked_C and Embarked_S remain (Q is the all-zeros case).
# NOTE(review): a row whose Embarked is missing also gets 0 in every dummy
# column, so after this step it looks identical to a 'Q' row -- confirm whether
# the raw data contains such rows and impute them beforehand if so.
df = pd.get_dummies(df, columns=['Embarked'], dtype=int).drop(columns=['Embarked_Q'])

In [None]:
# Inspect the first five rows after encoding Sex and Embarked.
df.head(n=5)

### Looking at the target variable

In [None]:
# Pairwise correlations between the target and the numeric features,
# rendered as an annotated heatmap.
corr_columns = ['Survived', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare']
corr_matrix = df[corr_columns].corr()
sns.heatmap(
    corr_matrix,
    annot=True,
    fmt=".2f",
    cmap='YlOrRd',
    center=0,
    linewidths=.5,
)
plt.show()

In [None]:
# Class balance of the target: number of rows per Survived value.
survived_counts = df['Survived'].value_counts()
survived_counts

### Feature selection

In [None]:
# Columns removed before modelling; the frame is modified in place.
FEATS_TO_DROP = [
    'PassengerId',
    'Name',
    'Ticket',
    'Cabin',
]
df.drop(labels=FEATS_TO_DROP, axis='columns', inplace=True)

In [None]:
# Number of missing values remaining in each column.
missing_per_column = df.isna().sum()
missing_per_column

### Splitting & Saving data

In [None]:
def applyScaling(X_train, X_test, y_train, y_test, scaler, columns):
    """Scale selected feature columns and return one combined DataFrame.

    The scaler is fitted on the training rows only and then applied to the
    test rows, so no statistics from the test set influence the scaling.
    The inputs are left untouched: scaling happens on copies, which lets the
    same split be passed to this function several times with different
    scalers.

    Parameters
    ----------
    X_train, X_test : pandas.DataFrame
        Feature frames of the train and test split.
    y_train, y_test : pandas.Series
        Targets belonging to ``X_train`` and ``X_test``.
    scaler : object
        Transformer exposing ``fit_transform`` and ``transform``
        (scikit-learn style).
    columns : list
        Names of the columns in ``X_train`` / ``X_test`` to scale.

    Returns
    -------
    pandas.DataFrame
        Train rows followed by test rows, with the scaled features and the
        target joined column-wise on the index.
    """
    # Copy first: assigning into the caller's frames would make a second call
    # with another scaler operate on already-scaled values.
    X_train_scaled = X_train.copy()
    X_test_scaled = X_test.copy()

    # Fit on the training data only; the test data is merely transformed.
    X_train_scaled[columns] = scaler.fit_transform(X_train_scaled[columns])
    X_test_scaled[columns] = scaler.transform(X_test_scaled[columns])

    X = pd.concat([X_train_scaled, X_test_scaled])
    y = pd.concat([y_train, y_test])

    return pd.concat([X, y], axis=1)

In [None]:
from sklearn.model_selection import train_test_split

# Separate the feature matrix from the 'Survived' target column.
X = df.drop(columns='Survived')
y = df['Survived']

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=256,
)

#### Normalization

In [None]:
from sklearn.preprocessing import Normalizer

# Build the "normalized" variant of the dataset from the Fare and Age columns.
# NOTE(review): sklearn's Normalizer rescales each *row* (sample) to unit norm
# across the given columns rather than rescaling each feature -- confirm this
# is intended; per-feature range scaling would be MinMaxScaler.
normalizer = Normalizer()
df_normalized = applyScaling(
    X_train, X_test, y_train, y_test,
    normalizer,
    ['Fare', 'Age'],
)

#### Standardization

In [None]:
from sklearn.preprocessing import StandardScaler

# Build the standardized variant of the dataset from the Fare and Age columns.
standardizer = StandardScaler()
df_standardized = applyScaling(
    X_train, X_test, y_train, y_test,
    standardizer,
    ['Fare', 'Age'],
)

#### Saving scaled data

In [None]:
# Write both scaled datasets to CSV without the index column.
scaled_outputs = (
    (df_normalized, './data/normalized.csv'),
    (df_standardized, './data/standardized.csv'),
)
for scaled_frame, csv_path in scaled_outputs:
    scaled_frame.to_csv(csv_path, index=False)