In [2]:
import math
import numpy as np
import h5py
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow.python.framework import ops

import numpy as np
import pandas as pd
from pandas import Series

# Input data
df_train = pd.read_csv('./input/train.csv')
df_test = pd.read_csv('./input/test.csv')


def _log_fare(fare):
    """Return the natural log of a positive fare; any other value maps to 0."""
    if fare > 0:
        return np.log(fare)
    return 0


# Family: FamilySize is SibSp + Parch + 1, computed the same way for both frames.
for _frame in (df_train, df_test):
    _frame['FamilySize'] = _frame['SibSp'] + _frame['Parch'] + 1

# Fare: test rows without a fare receive the test-set median first ...
_fare_missing = df_test['Fare'].isnull()
df_test.loc[_fare_missing, 'Fare'] = df_test['Fare'].median()

# ... then both frames are moved onto the log scale, element by element.
for _frame in (df_train, df_test):
    _frame['Fare'] = _frame['Fare'].map(_log_fare)

def category_fare(x):
    """Bucket a (log-scaled) fare value into an integer category 0-4.

    Returns 0 when x equals 0; otherwise 1 for x < 2, 2 for x < 3,
    3 for x < 4, and 4 for everything else (including values that compare
    False against every bound, such as NaN).
    """
    if x == 0:
        return 0
    # Walk the upper bounds in ascending order; the first bound above x wins.
    for bucket, upper_bound in enumerate((2, 3, 4), start=1):
        if x < upper_bound:
            return bucket
    return 4
    
# Derive the Fare_cat bucket column from Fare in both frames.
for _frame in (df_train, df_test):
    _frame['Fare_cat'] = _frame['Fare'].apply(category_fare)

# Name
# The title ("Initial") is the run of letters directly before the first '.' in Name.
# A raw string keeps '\.' a regex escape instead of an invalid Python string escape.
_TITLE_PATTERN = r'([A-Za-z]+)\.'

# Rare titles are folded into a small set of groups. Titles that are not listed
# here are left unchanged.
# NOTE(review): 'Dona' is grouped with 'Mr' and 'Master' with 'Other' - confirm
# that both are intended.
_TITLE_GROUPS = {
    'Master': 'Other',
    'Mlle': 'Miss',
    'Mme': 'Miss',
    'Ms': 'Miss',
    'Dr': 'Mr',
    'Major': 'Mr',
    'Lady': 'Mrs',
    'Countess': 'Mrs',
    'Jonkheer': 'Other',
    'Col': 'Other',
    'Rev': 'Other',
    'Capt': 'Mr',
    'Sir': 'Mr',
    'Don': 'Mr',
    'Dona': 'Mr',
}

for _frame in (df_train, df_test):
    # expand=False returns a Series rather than a one-column DataFrame.
    _titles = _frame['Name'].str.extract(_TITLE_PATTERN, expand=False)
    # Assign the result back instead of calling replace(inplace=True) on a column
    # selection: the in-place form acts on a temporary object under pandas
    # Copy-on-Write and would leave the frame unchanged.
    _frame['Initial'] = _titles.replace(_TITLE_GROUPS)

# Age
# Missing ages are filled with the rounded mean age of the passengers that share
# the same title, computed separately inside each frame (train uses train means,
# test uses test means).
_IMPUTED_TITLES = ['Mr', 'Mrs', 'Miss', 'Other']

for _frame in (df_train, df_test):
    # Select the Age column before aggregating. A frame-wide groupby().mean() also
    # tries to average text columns such as Name, which raises TypeError on
    # pandas >= 2.0, and it did the work eight times over.
    _mean_age_by_title = _frame.groupby('Initial')['Age'].mean().round()

    # Only rows that lack an age and carry one of the four grouped titles are filled.
    _needs_age = _frame['Age'].isnull() & _frame['Initial'].isin(_IMPUTED_TITLES)

    # Mapping through the per-title means avoids a KeyError when one of the
    # titles does not occur in a frame.
    _frame.loc[_needs_age, 'Age'] = _frame.loc[_needs_age, 'Initial'].map(_mean_age_by_title)

def category_age(x):
    """Bucket an age into an integer category 0-5.

    Buckets are [.., 18) -> 0, [18, 27) -> 1, [27, 36) -> 2, [36, 45) -> 3,
    [45, 54) -> 4 and everything else -> 5 (including values that compare
    False against every bound, such as NaN).
    """
    # The first upper bound that exceeds x determines the bucket index.
    for bucket, upper_bound in enumerate((18, 27, 36, 45, 54)):
        if x < upper_bound:
            return bucket
    return 5
    
# Derive the Age_cat bucket column from Age in both frames.
for _frame in (df_train, df_test):
    _frame['Age_cat'] = _frame['Age'].apply(category_age)

# Integer codes for the categorical columns. Values missing from a table become NaN.
# NOTE(review): 'Master' keeps code 0 here, but the title clean-up above replaces
# 'Master' with 'Other', so code 0 is presumably never produced - confirm.
_INITIAL_CODES = {'Master': 0, 'Miss': 1, 'Mr': 2, 'Mrs': 3, 'Other': 4}
_EMBARKED_CODES = {'C': 0, 'Q': 1, 'S': 2}
_SEX_CODES = {'female': 0, 'male': 1}
_CABIN_MISSING_CODES = {True: 0, False: 1}

for _frame in (df_train, df_test):
    # Initial
    _frame['Initial'] = _frame['Initial'].map(_INITIAL_CODES)

    # Embarked: missing ports default to 'S'. The filled column is assigned back
    # rather than using fillna(inplace=True) on a column selection, which acts on
    # a temporary object under pandas Copy-on-Write and would leave the NaNs in
    # place. The same fill is applied to the test frame so both frames are
    # encoded identically.
    _frame['Embarked'] = _frame['Embarked'].fillna('S').map(_EMBARKED_CODES)

    # Sex
    _frame['Sex'] = _frame['Sex'].map(_SEX_CODES)

    # Cabin_cat: 0 when Cabin is missing, 1 when a value is present.
    _frame['Cabin_cat'] = _frame['Cabin'].isnull().map(_CABIN_MISSING_CODES)

### One-hot encoding
# Columns are expanded in this order; each new column is named '<column>_<value>'.
_ONE_HOT_COLUMNS = ['Pclass', 'Fare_cat', 'Age_cat', 'Embarked', 'Initial', 'Cabin_cat']

# dtype is pinned to an integer type because pandas >= 2.0 emits bool dummies by
# default, which would turn the mixed int/bool `.values` matrix built later into
# an object array.
df_train = pd.get_dummies(df_train, columns=_ONE_HOT_COLUMNS, dtype=np.uint8)
df_test = pd.get_dummies(df_test, columns=_ONE_HOT_COLUMNS, dtype=np.uint8)

# The two frames are encoded independently, so a category that occurs in only one
# of them would give train and test different columns. Align the test frame to the
# training columns (minus the label): categories unseen in test become all-zero
# columns, and test-only categories are dropped.
_feature_columns = [col for col in df_train.columns if col != 'Survived']
df_test = df_test.reindex(columns=_feature_columns, fill_value=0)

### Drop
# Source columns that have been replaced by engineered features, plus identifiers.
_RAW_COLUMNS = ['PassengerId', 'Name', 'Fare', 'Age', 'SibSp', 'Parch', 'Ticket', 'Cabin']

for _frame in (df_train, df_test):
    _frame.drop(_RAW_COLUMNS, axis=1, inplace=True)

# Feature matrix: every remaining column except the label.
X_train_orig = df_train.drop('Survived', axis=1).values

# Label matrix: one indicator column per distinct Survived value.
# dtype is pinned because pandas >= 2.0 returns bool dummies by default. The
# former `columns=` argument is dropped because it is ignored for array input.
temp_1 = df_train['Survived'].values
temp_2 = pd.get_dummies(temp_1, prefix='Survived', dtype=np.uint8)
Y_train_orig = temp_2.values

# Hold out the last 10% of rows (in file order, no shuffling) for validation.
cut = int(np.floor(len(Y_train_orig) * 0.9))
X_train, X_valid = X_train_orig[:cut], X_train_orig[cut:]
Y_train, Y_valid = Y_train_orig[:cut], Y_train_orig[cut:]

X_test = df_test.values

In [5]:
# Wrap the training feature matrix in a DataFrame so it can be viewed as a table.
a = pd.DataFrame(X_train_orig)

In [7]:
# Preview the first rows of the feature table.
a.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,15,16,17,18,19,20,21,22,23,24
0,1,2,0,0,1,0,1,0,0,0,...,0,0,0,1,0,1,0,0,1,0
1,0,2,1,0,0,0,0,0,0,1,...,0,1,0,0,0,0,1,0,0,1
2,0,1,0,0,1,0,0,1,0,0,...,0,0,0,1,1,0,0,0,1,0
3,0,2,1,0,0,0,0,0,1,0,...,0,0,0,1,0,0,1,0,0,1
4,1,1,0,0,1,0,0,1,0,0,...,0,0,0,1,0,1,0,0,1,0


In [8]:
# Inspect the first feature row as a raw array.
X_train_orig[0]

array([1, 2, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0,
       0, 1, 0], dtype=int64)