# Titanic Survival Prediction - NN

## Introduction

Classification use NN

## Steps
1. Load Data
2. Feature engineering
3. Train model with NN back propagation
4. Conclusion

### Load data

In [1]:
#Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
import tempfile as tempfile
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import make_scorer, accuracy_score
from sklearn.model_selection import train_test_split
import itertools

In [2]:
train = pd.read_csv("./input/train.csv")
test = pd.read_csv("./input/test.csv")

Some observations:
- We can drop Name,Ticket column
- We can drop Fare due to pclass already represent
- Cabin contains NaN value but its also potental relate to survival
- Combine column: SibSp/Parch

### Feature Engineering

In [3]:
train.tail()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.45,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0,C148,C
890,891,0,3,"Dooley, Mr. Patrick",male,32.0,0,0,370376,7.75,,Q


Check missing values

In [4]:
print(pd.isnull(train).sum())

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64


- Drop Cabin due to having many Null values


In [5]:
train.pop('Name')
train.pop('Ticket')
train.pop('Cabin')

0              NaN
1              C85
2              NaN
3             C123
4              NaN
5              NaN
6              E46
7              NaN
8              NaN
9              NaN
10              G6
11            C103
12             NaN
13             NaN
14             NaN
15             NaN
16             NaN
17             NaN
18             NaN
19             NaN
20             NaN
21             D56
22             NaN
23              A6
24             NaN
25             NaN
26             NaN
27     C23 C25 C27
28             NaN
29             NaN
          ...     
861            NaN
862            D17
863            NaN
864            NaN
865            NaN
866            NaN
867            A24
868            NaN
869            NaN
870            NaN
871            D35
872    B51 B53 B55
873            NaN
874            NaN
875            NaN
876            NaN
877            NaN
878            NaN
879            C50
880            NaN
881            NaN
882         

In [6]:
print(pd.isnull(train).sum())

PassengerId      0
Survived         0
Pclass           0
Sex              0
Age            177
SibSp            0
Parch            0
Fare             0
Embarked         2
dtype: int64


In [7]:
train["Age"].fillna(train["Age"].mean(),inplace=True) 
train=train.dropna(how='any')  

### plot data

In [8]:
train.tail()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
886,887,0,2,male,27.0,0,0,13.0,S
887,888,1,1,female,19.0,0,0,30.0,S
888,889,0,3,female,29.699118,1,2,23.45,S
889,890,1,1,male,26.0,0,0,30.0,C
890,891,0,3,male,32.0,0,0,7.75,Q


## Train data

### Build column


In [9]:
train['Embarked'].unique()

array(['S', 'C', 'Q'], dtype=object)

In [10]:
_CSV_COLUMNS = [
    'PassengerId', 'Survived', 'Pclass', 'Sex', 'Age',
    'SibSp', 'Parch', 'Fare', 'Embarked'
]
def build_model_columns():
    age = tf.feature_column.numeric_column('Age')
    pclass = tf.feature_column.numeric_column('Pclass')
    sibsp = tf.feature_column.numeric_column('SibSp')
    parch = tf.feature_column.numeric_column('Parch')
    sex = tf.feature_column.categorical_column_with_vocabulary_list(
      'Sex', ['male','female'])
    fare = tf.feature_column.numeric_column('Fare')
    embarked = tf.feature_column.categorical_column_with_vocabulary_list(
      'Embarked', ['S','C','Q'])
    base_columns = [
      pclass,sex,age ,sibsp, parch, fare,embarked
    ]
    

    return base_columns;




In [11]:
def build_estimator():
    deep_columns = build_model_columns()
    hidden_units = [100, 75, 50, 25]
    model_dir = tempfile.mkdtemp()
  # Create a tf.estimator.RunConfig to ensure the model is run on CPU, which
  # trains faster than GPU for this model.
    run_config = tf.estimator.RunConfig().replace(
      session_config=tf.ConfigProto(device_count={'GPU': 0}))
    return tf.estimator.DNNClassifier(
        model_dir=model_dir,
        feature_columns=deep_columns,
        hidden_units=hidden_units,
        config=run_config)

In [12]:
X=train.drop(['PassengerId','Survived'],axis=1)
y=train['Survived']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
# X={
#     "Age":np.array(X['Age']),
#     "Pclass":np.array(X['Pclass']),
#     "Sex":np.array(X['Sex']),
#     "SibSp":np.array(X['SibSp']),
#     "Parch":np.array(X['Parch']),
#     "Fare":np.array(X['Fare']),
#     "Embarked":np.array(X['Embarked'])
# }


In [13]:
FEATURES = ['Pclass', 'Sex', 'Age',
    'SibSp', 'Parch', 'Fare', 'Embarked']
LABEL = "Survived"
train_input_fn = tf.estimator.inputs.pandas_input_fn(
      x=pd.DataFrame({k: X_train[k].values for k in FEATURES}),
      y = pd.Series(y_train.values),
      num_epochs=None,
      shuffle=True)

In [47]:
model = build_estimator()
model.train(input_fn=train_input_fn,steps=3000)


INFO:tensorflow:Using config: {'_num_ps_replicas': 0, '_task_type': 'worker', '_keep_checkpoint_max': 5, '_session_config': device_count {
  key: "GPU"
}
, '_save_checkpoints_secs': 600, '_task_id': 0, '_save_summary_steps': 100, '_model_dir': '/var/folders/3t/47kly2mn3l39d_ncbyv_qh740000gp/T/tmp8ywososa', '_keep_checkpoint_every_n_hours': 10000, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x118e678d0>, '_num_worker_replicas': 1, '_master': '', '_log_step_count_steps': 100, '_save_checkpoints_steps': None, '_tf_random_seed': None, '_is_chief': True}
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Saving checkpoints for 1 into /var/folders/3t/47kly2mn3l39d_ncbyv_qh740000gp/T/tmp8ywososa/model.ckpt.
INFO:tensorflow:loss = 88.72288, step = 1
INFO:tensorflow:global_step/sec: 284.37
INFO:tensorflow:loss = 58.074165, step = 101 (0.353 sec)
INFO:tensorflow:global_step/sec: 370.29
INFO:tensorflow:loss = 64.11471, step = 201 (0.271

<tensorflow.python.estimator.canned.linear.LinearClassifier at 0x1163ebda0>

### Evaluate model

In [48]:
test_input_fn = tf.estimator.inputs.pandas_input_fn(
      x=pd.DataFrame({k: X_test[k].values for k in FEATURES}),
      y = pd.Series(y_test.values),
      num_epochs=1,
      shuffle=False)

In [49]:
results = model.evaluate(input_fn=test_input_fn)
for key in sorted(results):
  print('%s: %s' % (key, results[key]))

INFO:tensorflow:Starting evaluation at 2018-01-07-13:54:16
INFO:tensorflow:Restoring parameters from /var/folders/3t/47kly2mn3l39d_ncbyv_qh740000gp/T/tmp8ywososa/model.ckpt-3000
INFO:tensorflow:Finished evaluation at 2018-01-07-13:54:17
INFO:tensorflow:Saving dict for global step 3000: accuracy = 0.83707863, accuracy_baseline = 0.6404494, auc = 0.85834706, auc_precision_recall = 0.83046573, average_loss = 0.42328954, global_step = 3000, label/mean = 0.35955057, loss = 37.672768, prediction/mean = 0.3561749
accuracy: 0.83707863
accuracy_baseline: 0.6404494
auc: 0.85834706
auc_precision_recall: 0.83046573
average_loss: 0.42328954
global_step: 3000
label/mean: 0.35955057
loss: 37.672768
prediction/mean: 0.3561749


### Prediction

In [17]:
test.pop('Name')
test.pop('Ticket')
test.pop('Cabin')
test["Age"].fillna(test["Age"].mean(),inplace=True) 
test["Embarked"].fillna('S',inplace=True) 

In [18]:
test.tail()

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
413,1305,3,male,30.27259,0,0,8.05,S
414,1306,1,female,39.0,0,0,108.9,C
415,1307,3,male,38.5,0,0,7.25,S
416,1308,3,male,30.27259,0,0,8.05,S
417,1309,3,male,30.27259,1,1,22.3583,C


In [50]:
predict_input_fn = tf.estimator.inputs.pandas_input_fn(
      x=pd.DataFrame({k: test[k].values for k in FEATURES}),
      num_epochs=1,
      shuffle=False)

In [51]:
predictions = model.predict(input_fn=predict_input_fn)
predicted_survived = [p["classes"] for p in predictions]

INFO:tensorflow:Restoring parameters from /var/folders/3t/47kly2mn3l39d_ncbyv_qh740000gp/T/tmp8ywososa/model.ckpt-3000


In [52]:
passengerid = test['PassengerId']
output = pd.DataFrame({ 'PassengerId' : passengerid, 'Survived': predicted_survived})
output.to_csv('./output/submission.csv', index=False)

In [53]:
output.tail()

Unnamed: 0,PassengerId,Survived
413,1305,[b'0']
414,1306,[b'1']
415,1307,[b'0']
416,1308,[b'0']
417,1309,[b'0']
