In [11]:
import numpy as np 
import tensorflow as tf
from tensorflow import keras
import pandas as pd
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler

#Try: pd.DataFrame.to_csv(path)

In [4]:
#Setting index for easier usage
train = pd.read_csv("train.csv").set_index("PassengerId")
test = pd.read_csv("test.csv").set_index("PassengerId")

train = train.drop("Cabin", axis = 1)
test = test.drop("Cabin", axis = 1)


#Dropping cabin column as a whole and also removing rows that are missing an Age value
# train = train[train.Age.notnull()].drop("Cabin", axis = 1)
# test = test[test.Age.notnull()].drop("Cabin", axis = 1)

#Filling the two missing values in Embarked that I found on google
train.Embarked = train.Embarked.fillna("S")

#Removing ticket prefix,replacing the special value of ticket and then converting all to floats
train.Ticket = train.Ticket.map(lambda t : t.split()[-1]).replace("LINE", -1).astype("float")
test.Ticket = test.Ticket.map(lambda t : t.split()[-1]).replace("LINE", -1).astype("float")

#Replacing each value of Embarked with a respective integer
# train.Embarked = train.Embarked.replace("S", 1).replace("Q", 2).replace("C", 3)
# test.Embarked = test.Embarked.replace("S", 1).replace("Q", 2).replace("C", 3)

#Same with sex
train.Sex = train.Sex.replace("male", 1).replace("female", 0)
test.Sex = test.Sex.replace("male", 1).replace("female", 0)

dummies = pd.get_dummies(train.Pclass, prefix = "class")
dummies_embark = pd.get_dummies(train.Embarked, prefix = "loc")
train = pd.concat([train, dummies, dummies_embark], axis = 1).drop("Pclass", axis = 1).drop("Embarked", axis = 1)

dummies = pd.get_dummies(test.Pclass, prefix = "class")
dummies_embark = pd.get_dummies(test.Embarked, prefix = "loc")
test = pd.concat([test, dummies, dummies_embark], axis = 1).drop("Pclass", axis = 1).drop("Embarked", axis = 1)

#Replacing titles with dummy
def get_title(name):
    if '.' in name:
        return name.split(',')[1].split('.')[0].strip()
    else:
        return 'Unknown'

def replace_titles(x):
    title = x['Title']
    if title in ['Capt', 'Col', 'Don', 'Jonkheer', 'Major', 'Rev', 'Sir']:
        return 'Mr'
    elif title in ['the Countess', 'Mme', 'Lady']:
        return 'Mrs'
    elif title in ['Mlle', 'Ms']:
        return 'Miss'
    elif title =='Dr':
        if x['Sex']=='male':
            return 'Mr'
        else:
            return 'Mrs'
    else:
        return title
    
train['Title'] = train['Name'].map(lambda x: get_title(x))
train['Title'] = train.apply(replace_titles, axis=1)

test['Title'] = test['Name'].map(lambda x: get_title(x))
test['Title'] = test.apply(replace_titles, axis=1)

dummies = pd.get_dummies(train.Title)
test_dummies = pd.get_dummies(test.Title)

train = pd.concat([train, dummies], axis = 1).drop("Name", axis = 1).drop("Title", axis = 1)
test = pd.concat([test, test_dummies], axis =1).drop("Name", axis = 1).drop("Title", axis = 1)

scaler = StandardScaler()

ticket = train.Ticket.to_numpy().reshape(-1, 1)
fare = train.Fare.to_numpy().reshape(-1, 1)
age = train.Age.to_numpy().reshape(-1, 1)

train.Ticket = scaler.fit_transform(ticket)
train.Fare = scaler.fit_transform(fare)
train.Age = scaler.fit_transform(age)

ticket = test.Ticket.to_numpy().reshape(-1, 1)
fare = test.Fare.to_numpy().reshape(-1, 1)
age = test.Age.to_numpy().reshape(-1, 1)

test.Ticket = scaler.fit_transform(ticket)
test.Fare = scaler.fit_transform(fare)
test.Age = scaler.fit_transform(age)

#Dealing with missing ages

train.Age = train.Age.fillna(train.Age.mean())

test.Age = test.Age.fillna(test.Age.mean())

In [5]:
features = ["Sex", "Age", "SibSp", "Parch", "Ticket", "Fare", "class_1", "class_2", "class_3", "loc_C", "loc_Q", "loc_S", "Master", "Miss", "Mr", "Mrs"]

#Converting the features to numpy arrays
X = train[features].to_numpy()
X_test = test[features].to_numpy().transpose()

X_test = X_test.T

#Same with Y
Y = train["Survived"].to_numpy()
Y = Y.reshape((Y.shape[0], 1))



#Splitting into 80-20 train dev split
#TODO: MANUALLY SPLIT, MAKE THE AVG AGE DISTRIBUTION BE IN 
X, X_dev, Y, Y_dev = train_test_split(X, Y, test_size = 0.20)


#Transposing all so they have correct dimentions
X = X.astype("float32")
X_dev = X_dev.astype("float32")
Y = Y.astype("float32")
Y_dev = Y_dev.astype("float32")


In [61]:
model = tf.keras.Sequential()

init_he = tf.keras.initializers.HeNormal()
init_x = tf.keras.initializers.GlorotNormal()

#model.add(tf.keras.layers.InputLayer(input_shape = (571, 8)))
model.add(tf.keras.layers.Dense(256, activation = "relu", kernel_initializer = init_he))
model.add(tf.keras.layers.Dense(128, activation = "relu", kernel_initializer = init_he))
model.add(tf.keras.layers.Dense(64, activation = "relu", kernel_initializer = init_he))
model.add(tf.keras.layers.Dense(64, activation = "relu", kernel_initializer = init_he))
model.add(tf.keras.layers.Dense(64, activation = "relu", kernel_initializer = init_he))
model.add(tf.keras.layers.Dense(1, activation = "sigmoid", kernel_initializer = init_x))

cost = tf.keras.losses.BinaryCrossentropy(from_logits = True)
opt = tf.keras.optimizers.SGD(learning_rate = 0.001, momentum = 0.7)


model.compile(optimizer = opt, loss = cost, metrics = ["accuracy"])

model.fit(X, Y, batch_size = 8, epochs = 100)


Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


<tensorflow.python.keras.callbacks.History at 0x19ae5b19070>

In [62]:
model.evaluate(X, Y, verbose = 2)
model.evaluate(X_dev, Y_dev, verbose = 2)

23/23 - 0s - loss: 0.6115 - accuracy: 0.8483
6/6 - 0s - loss: 0.6253 - accuracy: 0.8492


[0.6252683401107788, 0.8491619825363159]

In [64]:
def write_to_csv(filename, predictions, passID):
    '''
    Writes to csv two columns of passangerID and whether they survived
    
    filename: Name of csv to write to
    prediction: npArray of prediction
    passID: npArray containing passIDs
    '''
    new_df = pd.DataFrame({"PassengerID": passID, "Survived":predictions}, index = [0] * len(passID))
    
    new_df.to_csv(filename + ".csv", index = False)
    
pred = model.predict(X_test).round().astype(int).reshape(-1,)

passID = test.index.to_numpy().astype(int)

write_to_csv("SGD", pred, passID)

In [41]:
test.index

Int64Index([ 892,  893,  894,  895,  896,  897,  898,  899,  900,  901,
            ...
            1300, 1301, 1302, 1303, 1304, 1305, 1306, 1307, 1308, 1309],
           dtype='int64', name='PassengerId', length=418)