In [1]:
import numpy as np
from sklearn import preprocessing

import tensorflow as tf
from tensorflow import keras
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import pandas as pd


In [2]:

raw_csv = pd.read_csv('train.csv')
raw_csv.drop(['Name','Ticket','Cabin'],axis=1, inplace=True)
    
embarked_dummies = pd.get_dummies(raw_csv.Embarked, prefix='Embarked')
sex_dummies = pd.get_dummies(raw_csv.Sex, prefix='Sex')
raw_csv.drop(['Sex','Embarked'],axis=1, inplace=True)

encoded_data = pd.concat([raw_csv, sex_dummies, embarked_dummies], axis=1)

encoded_data = encoded_data[['PassengerId','Pclass','Age','SibSp','Parch','Fare','Sex_female','Embarked_Q','Embarked_C','Survived']]
encoded_data.dropna(subset = ["Age"], inplace=True)

encoded_data.to_csv('encoded_data.csv',index=False, header=False)
encoded_data.head()

Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare,Sex_female,Embarked_Q,Embarked_C,Survived
0,1,3,22.0,1,0,7.25,0,0,0,0
1,2,1,38.0,1,0,71.2833,1,0,1,1
2,3,3,26.0,0,0,7.925,1,0,0,1
3,4,1,35.0,1,0,53.1,1,0,0,1
4,5,3,35.0,0,0,8.05,0,0,0,0


In [6]:
test_raw_csv = pd.read_csv('test.csv')
test_raw_csv.drop(['Name','Ticket','Cabin'],axis=1, inplace=True)
    
test_embarked_dummies = pd.get_dummies(test_raw_csv.Embarked, prefix='Embarked')
test_sex_dummies = pd.get_dummies(test_raw_csv.Sex, prefix='Sex')
test_raw_csv.drop(['Sex','Embarked'],axis=1, inplace=True)

test_encoded_data = pd.concat([test_raw_csv, test_sex_dummies, test_embarked_dummies], axis=1)

test_encoded_data = test_encoded_data[['PassengerId','Pclass','Age','SibSp','Parch','Fare','Sex_female','Embarked_Q','Embarked_C']]
# test_encoded_data.dropna(subset = ["Age"], inplace=True)
# test_encoded_data.dropna(subset = ["Fare"], inplace=True)

test_encoded_data = test_encoded_data.fillna(test_encoded_data.mean())

test_encoded_data.to_csv('test_encoded_data.csv',index=False, header=False)
test_encoded_data.head()


Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare,Sex_female,Embarked_Q,Embarked_C
0,892,3,34.5,0,0,7.8292,0,1,0
1,893,3,47.0,1,0,7.0,1,0,0
2,894,2,62.0,0,0,9.6875,0,1,0
3,895,3,27.0,0,0,8.6625,0,0,0
4,896,3,22.0,1,1,12.2875,1,0,0


In [7]:
encoded_data_csv = np.loadtxt('encoded_data.csv', delimiter = ',')

test_encoded_data_csv = np.loadtxt('test_encoded_data.csv', delimiter = ',')

unscaled_inputs_all = encoded_data_csv[:,1:-1]
targets_all = encoded_data_csv[:,-1]

test_unscaled_inputs_all = test_encoded_data_csv[:,1:]

In [8]:
scaled_inputs = preprocessing.scale(unscaled_inputs_all)
test_scaled_inputs = preprocessing.scale(test_unscaled_inputs_all)

In [9]:
samples_count = scaled_inputs.shape[0]

train_samples_count = int(.85*samples_count)
validation_samples = samples_count - train_samples_count

In [10]:
train_inputs = scaled_inputs[:train_samples_count]
train_targets= targets_all[:train_samples_count]

validation_inputs = scaled_inputs[train_samples_count:]
validation_targets = targets_all[train_samples_count:]

In [11]:
np.savez('train_data', inputs = train_inputs, targets = train_targets)
np.savez('validation_data', inputs = validation_inputs, targets = validation_targets)

np.savez('test_data', inputs = test_scaled_inputs)

In [12]:
npz = np.load('train_data.npz')
train_inputs = npz['inputs'].astype(np.float)
train_targets = npz['targets'].astype(np.int)

npz = np.load('validation_data.npz')
validation_inputs = npz['inputs'].astype(np.float)
validation_targets = npz['targets'].astype(np.int)

npz = np.load('test_data.npz')
test_inputs = npz['inputs'].astype(np.float)

In [13]:
input_size = 8
output_size = 2
hidden_layer_size = 150

model = tf.keras.Sequential([
    tf.keras.layers.Dense(hidden_layer_size, activation='relu'),
    tf.keras.layers.Dense(hidden_layer_size, activation='relu'),
    tf.keras.layers.Dense(output_size, activation='softmax')    
])

model.compile(optimizer='adam',loss='sparse_categorical_crossentropy', metrics=['accuracy'])

batch_size = 200
max_epochs = 100
early_stopping = tf.keras.callbacks.EarlyStopping(patience = 5)

model.fit(train_inputs , train_targets,
         batch_size = batch_size,
         epochs = max_epochs,
         callbacks = [early_stopping],
         validation_data=(validation_inputs,validation_targets),
         verbose = 2)

Epoch 1/100
4/4 - 0s - loss: 0.6934 - accuracy: 0.5050 - val_loss: 0.6084 - val_accuracy: 0.8056
Epoch 2/100
4/4 - 0s - loss: 0.5982 - accuracy: 0.7723 - val_loss: 0.5399 - val_accuracy: 0.8056
Epoch 3/100
4/4 - 0s - loss: 0.5418 - accuracy: 0.7970 - val_loss: 0.4883 - val_accuracy: 0.8056
Epoch 4/100
4/4 - 0s - loss: 0.5002 - accuracy: 0.8020 - val_loss: 0.4523 - val_accuracy: 0.8148
Epoch 5/100
4/4 - 0s - loss: 0.4735 - accuracy: 0.8036 - val_loss: 0.4320 - val_accuracy: 0.8148
Epoch 6/100
4/4 - 0s - loss: 0.4570 - accuracy: 0.7987 - val_loss: 0.4191 - val_accuracy: 0.8241
Epoch 7/100
4/4 - 0s - loss: 0.4477 - accuracy: 0.8053 - val_loss: 0.4136 - val_accuracy: 0.8241
Epoch 8/100
4/4 - 0s - loss: 0.4411 - accuracy: 0.8036 - val_loss: 0.4126 - val_accuracy: 0.8148
Epoch 9/100
4/4 - 0s - loss: 0.4445 - accuracy: 0.7954 - val_loss: 0.4223 - val_accuracy: 0.8148
Epoch 10/100
4/4 - 0s - loss: 0.4476 - accuracy: 0.7954 - val_loss: 0.4316 - val_accuracy: 0.8056
Epoch 11/100
4/4 - 0s - loss:

<tensorflow.python.keras.callbacks.History at 0x21a768ff448>

In [14]:
prediction = model.predict(test_inputs).round(0)
prediction

array([[1., 0.],
       [0., 1.],
       [1., 0.],
       [1., 0.],
       [0., 1.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [0., 1.],
       [1., 0.],
       [0., 1.],
       [0., 1.],
       [1., 0.],
       [1., 0.],
       [0., 1.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [0., 1.],
       [1., 0.],
       [0., 1.],
       [1., 0.],
       [0., 1.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [0., 1.],
       [1., 0.],
       [1., 0.],
       [0., 1.],
       [0., 1.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [0., 1.],
       [0., 1.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [1., 0.],
       [0., 1.],
       [0., 1.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.

In [16]:
predictions = pd.DataFrame({'PassengerId': test_encoded_data_csv[:, 0], 'Survived': prediction[:, 1]})
predictions.PassengerId = predictions.PassengerId.astype(int)
predictions.Survived = predictions.Survived.astype(int)
predictions.to_csv('Titanic_predictions.csv' , index=False)
predictions.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1
