In [271]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
import math

In [272]:
def preprocess(train):
    """Build the engineered feature frame from a raw passenger table.

    The input must contain the columns ``PassengerId``, ``Ticket``, ``Age``,
    ``Fare``, ``Embarked``, ``Cabin``, ``Sex``, ``SibSp``, ``Parch`` and
    ``Name``; any other column is passed through unchanged.

    Steps, in order:
      * index by ``PassengerId`` and drop ``Ticket``;
      * ``Age``: fill gaps with the median, set ages below 1 to 0, z-score;
      * ``Fare``: z-score (missing fares stay missing);
      * ``Embarked``: replaced by one indicator column per port value;
      * ``Has_cabin`` / ``People_In_Cabin``: whether a cabin is recorded and
        how many rows of this frame share that exact cabin string (1 when no
        cabin is recorded);
      * ``Cabin``: first character of the cabin string ("0" when missing),
        replaced by its position among the sorted characters present;
      * ``Sex``: True for "male", False otherwise;
      * ``Is_alone`` / ``Family_size`` from ``SibSp + Parch``, which are then
        dropped;
      * ``Name``: replaced by one indicator column per ``titles`` code.

    NOTE: the normalisation statistics and the cabin codes are derived from
    the frame passed in, so two frames processed separately are only
    comparable when their value distributions / deck sets agree.

    Parameters
    ----------
    train : pandas.DataFrame
        Raw passenger table. It is not modified.

    Returns
    -------
    pandas.DataFrame
        New frame indexed by ``PassengerId`` holding the engineered columns.

    Raises
    ------
    KeyError
        If one of the required columns is absent.
    """
    # set_index/drop return new frames, so the caller's table stays intact
    # (the former in-place set_index made a second call on the same frame
    # fail because "PassengerId" was no longer a column).
    train_df = train.set_index("PassengerId").drop(columns=["Ticket"])

    # Plain assignment instead of a chained in-place fillna, which does not
    # reliably write back to the frame under pandas Copy-on-Write.
    age = train_df["Age"].fillna(train_df["Age"].median())
    train_df["Age"] = age.mask(age < 1, 0)

    train_df["Age"] = mean_normalization(train_df, "Age")
    train_df["Fare"] = mean_normalization(train_df, "Fare")

    embarked_dummies = pd.get_dummies(train_df["Embarked"])
    train_df = pd.concat(
        [train_df.drop(columns="Embarked"), embarked_dummies], axis=1
    )

    cabin = train_df["Cabin"]
    train_df["Has_cabin"] = cabin.notna()
    # value_counts() skips missing cabins, so those rows map to NaN and fall
    # back to 1. This replaces the positional `[0]` lookup on a grouped
    # value_counts result, which depends on a deprecated integer fallback.
    train_df["People_In_Cabin"] = (
        cabin.map(cabin.value_counts()).fillna(1).astype("int64")
    )

    # Keep only the leading character of the cabin string; "0" marks "no cabin".
    deck = cabin.fillna("0").str[0]
    deck_codes = {
        label: code for code, label in enumerate(sorted(deck.unique()))
    }
    train_df["Cabin"] = deck.map(deck_codes)

    train_df["Sex"] = train_df["Sex"] == "male"

    relatives = train_df["SibSp"] + train_df["Parch"]
    train_df["Is_alone"] = relatives == 0
    train_df["Family_size"] = relatives

    # The title sits between the comma and the first period
    # ("Surname, Title. Given names"). Taking the second space-separated token
    # picked up part of the surname whenever the surname had several words.
    title_codes = train_df["Name"].map(
        lambda name: titles(
            name.split(",", 1)[-1].split(".", 1)[0].strip() + "."
        )
    )
    title_dummies = pd.get_dummies(title_codes)

    return pd.concat(
        [train_df.drop(columns=["Name", "SibSp", "Parch"]), title_dummies],
        axis=1,
    )


def mean_normalization(df,column):
    """Standardise ``df[column]`` to zero mean and unit sample standard deviation.

    The result is written back into ``df[column]`` and also returned, so the
    frame passed in is modified. Missing values stay missing; pandas skips
    them when computing the mean and standard deviation.

    When the standard deviation is zero or undefined (constant column, or
    fewer than two non-missing values) the column is only centred, giving 0
    for every present value instead of NaN/inf from the division.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column; modified in place.
    column : str
        Name of a numeric column in ``df``.

    Returns
    -------
    pandas.Series
        The normalised column.
    """
    centered = df[column] - df[column].mean()
    spread = df[column].std()
    # `not spread > 0` is true for both 0 and NaN, the two cases in which
    # dividing would wipe out the column.
    if not spread > 0:
        df[column] = centered
    else:
        df[column] = centered / spread
    return df[column]
    
def titles(x):
    """Map an honorific token to its integer category.

    'Mr.' -> 1, 'Miss.' -> 2, 'Mrs.' -> 3, 'Master.' -> 4; every other
    value falls into the catch-all category 5.
    """
    known_codes = dict(zip(('Mr.', 'Miss.', 'Mrs.', 'Master.'), range(1, 5)))
    try:
        return known_codes[x]
    except KeyError:
        # Anything outside the four listed honorifics shares one bucket.
        return 5

In [273]:
# Read the two raw CSV inputs; `target` holds the contents of test.csv.
train, target = (
    pd.read_csv(csv_path)
    for csv_path in ("Data/train.csv", "Data/test.csv")
)

In [274]:
# Run both tables through the same feature pipeline.
train = preprocess(train)
# Fares still missing after preprocessing become 0; only the "Fare" column is filled.
target = preprocess(target).fillna({"Fare": 0})
train.head()

Unnamed: 0_level_0,Survived,Pclass,Sex,Age,Fare,Cabin,C,Q,S,Has_cabin,People_In_Cabin,Is_alone,Family_size,1,2,3,4,5
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
1,0,3,True,-0.564413,-0.502163,0,0,0,1,False,1,False,1,1,0,0,0,0
2,1,1,False,0.663276,0.786404,3,1,0,0,True,1,False,1,0,0,1,0,0
3,1,3,False,-0.257491,-0.48858,0,0,0,1,False,1,True,0,0,1,0,0,0
4,1,1,False,0.433084,0.420494,3,0,0,1,True,2,False,1,0,0,1,0,0
5,0,3,True,0.433084,-0.486064,0,0,0,1,False,1,True,0,1,0,0,0,0


In [275]:
# Hold out 20% of the preprocessed training rows; the fixed seed keeps the
# split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    train.drop(columns="Survived"),
    train["Survived"],
    test_size=0.2,
    random_state=4,
)

# Write every piece to CSV, in the same order as before.
for csv_path, frame in (
    ("Preprocessed_Data/X_test.csv", X_test),
    ("Preprocessed_Data/X_train.csv", X_train),
    ("Preprocessed_Data/y_test.csv", y_test),
    ("Preprocessed_Data/y_train.csv", y_train),
    ("Preprocessed_Data/target.csv", target),
):
    frame.to_csv(csv_path)