In [122]:
import pandas as pd
from config import *
import numpy as np
import os.path as osp
import matplotlib.pyplot as plt
from math import isnan
from keras.utils import to_categorical
from sklearn.model_selection import train_test_split
%matplotlib inline

In [123]:
# Paths of the two CSV files inside the data folder.
# DATA_FLD is not defined in this notebook; presumably it is provided by the
# `from config import *` line -- confirm.
TRAIN_FILE, TEST_FILE = (
    osp.join(DATA_FLD, csv_name) for csv_name in ('train.csv', 'test.csv')
)

In [126]:
# Read the training CSV into a DataFrame; later cells take both the
# 'Survived' target and the feature columns from it.
train_df = pd.read_csv(filepath_or_buffer=TRAIN_FILE)

In [127]:
# Target column, kept separate from the feature frame.
y = train_df['Survived']

# Feature frame: drop the identifier/free-text columns and the target, then
# simplify Cabin and Age in a single chained expression.
X = (
    train_df
    .drop(['PassengerId', 'Survived', 'Name', 'Ticket'], axis=1)
    .assign(
        # Keep only the first character of the cabin string. Missing cabins
        # are filled with 'None' first, so they end up as 'N'.
        Cabin=lambda frame: frame['Cabin'].fillna('None').str[0],
        # Bucket ages into right-closed intervals (0, 18], (18, 30], (30, 50],
        # (50, 75], (75, 100] and keep the integer bucket code. Values that
        # fall in no bucket (code -1, e.g. a missing age) get the extra code 5.
        Age=lambda frame: pd.cut(
            frame['Age'],
            bins=[0, 18, 30, 50, 75, 100],
            labels=[0, 1, 2, 3, 4],
        ).cat.codes.replace(-1, 5),
    )
)
X.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
0,3,male,1,1,0,7.25,N,S
1,1,female,2,1,0,71.2833,C,C
2,3,female,1,0,0,7.925,N,S
3,1,female,2,1,0,53.1,C,S
4,3,male,2,0,0,8.05,N,S


In [104]:
# Replace each string-valued feature with its integer category code, stored
# in a new '<name>_cat' column, and drop the original text column.
# NOTE(review): `.cat.codes` yields -1 for missing values and nothing remaps
# it here (Age gets an explicit remap in the previous cell) -- confirm that is
# acceptable if Embarked contains gaps.
# This cell consumes the text columns, so X must be rebuilt by the previous
# cell before it can be run again.
X = X.assign(**{
    name + '_cat': X[name].astype('category').cat.codes
    for name in ('Sex', 'Cabin', 'Embarked')
}).drop(['Sex', 'Cabin', 'Embarked'], axis=1)
X.head()

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Sex_cat,Cabin_cat,Embarked_cat
0,3,1,1,0,7.25,1,7,2
1,1,2,1,0,71.2833,0,2,0
2,3,1,0,0,7.925,0,7,2
3,1,2,1,0,53.1,0,2,2
4,3,2,0,0,8.05,1,7,2


In [118]:
# Sanity check: list the distinct cabin codes present in the feature frame.
np.unique(X['Cabin_cat'].to_numpy())

array([0, 1, 2, 3, 4, 5, 6, 7, 8], dtype=int8)

In [120]:
# Hold out part of the labelled rows for evaluation. The split parameters are
# named so they can be tuned in one place; the fixed seed makes the split
# repeatable across runs.
# `train_test_split` is imported in the notebook's import cell.
# NOTE(review): in file order this cell sits before the Age one-hot cell, so
# the split frames carry whatever columns X has at the moment it is run --
# confirm the intended cell order.
TEST_SIZE = 0.33
RANDOM_STATE = 42
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE
)

In [117]:
# One-hot encode the integer Age bucket codes into Age_0 .. Age_<max code>
# columns and drop the original Age column.
# This cell consumes X['Age'], so X must be rebuilt by the earlier cells
# before it can be run again.
age_codes = X['Age'].to_numpy()

# Size the encoding from the largest code rather than from the number of
# distinct codes: to_categorical places the 1 at the column given by the code
# value, so a gap in the codes (e.g. an age bucket with no passengers) would
# otherwise index past the last column.
num_vals = int(age_codes.max()) + 1

# For a 1-D input to_categorical already returns an (n_rows, num_vals)
# matrix, so no extra reshape is needed.
new_cols = to_categorical(age_codes, num_vals)
headers = ['Age_{}'.format(i) for i in range(num_vals)]

# Reuse X's index so the join below lines rows up by label even when X does
# not carry a default 0..n-1 index.
n_df = pd.DataFrame(new_cols, columns=headers, index=X.index)
X = X.join(n_df).drop('Age', axis=1)
X.head()

Unnamed: 0,Pclass,SibSp,Parch,Fare,Sex_cat,Cabin_cat,Embarked_cat,Age_0,Age_1,Age_2,Age_3,Age_4,Age_5
0,3,1,0,7.25,1,7,2,0.0,1.0,0.0,0.0,0.0,0.0
1,1,1,0,71.2833,0,2,0,0.0,0.0,1.0,0.0,0.0,0.0
2,3,0,0,7.925,0,7,2,0.0,1.0,0.0,0.0,0.0,0.0
3,1,1,0,53.1,0,2,2,0.0,0.0,1.0,0.0,0.0,0.0
4,3,0,0,8.05,1,7,2,0.0,0.0,1.0,0.0,0.0,0.0


In [99]:
# Build a separate frame holding the features plus the target for the
# correlation table. `assign` returns a new DataFrame, so the 'Survived'
# label is never written into X itself (a plain `merged_df = X` would alias
# X and leak the target into the feature frame).
merged_df = X.assign(Survived=y)

# Pairwise correlation of every feature with each other and with the target.
merged_df.corr()

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Sex_cat,Cabin_cat,Embarked_cat,Survived
Pclass,1.0,0.036776,0.083081,0.018443,-0.5495,0.1319,0.742093,0.173511,-0.338481
Age,0.036776,1.0,-0.068596,-0.178423,-0.056899,0.088337,0.027942,-0.139835,-0.103917
SibSp,0.083081,-0.068596,1.0,0.414838,0.159651,-0.114631,0.041058,0.07148,-0.035322
Parch,0.018443,-0.178423,0.414838,1.0,0.216225,-0.245489,-0.031553,0.043351,0.081629
Fare,-0.5495,-0.056899,0.159651,0.216225,1.0,-0.182333,-0.525742,-0.230365,0.257307
Sex_cat,0.1319,0.088337,-0.114631,-0.245489,-0.182333,1.0,0.118635,0.118492,-0.543351
Cabin_cat,0.742093,0.027942,0.041058,-0.031553,-0.525742,0.118635,1.0,0.217467,-0.295113
Embarked_cat,0.173511,-0.139835,0.07148,0.043351,-0.230365,0.118492,0.217467,1.0,-0.176509
Survived,-0.338481,-0.103917,-0.035322,0.081629,0.257307,-0.543351,-0.295113,-0.176509,1.0


In [125]:
# Read the test CSV and preview its first five rows.
test_df = pd.read_csv(filepath_or_buffer=TEST_FILE)
test_df.head(n=5)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S
