In [393]:
import os

import pandas as pd
import torch
from torch import nn, optim
import numpy as np

In [394]:
train_data = pd.read_csv('data/train.csv')
test_data = pd.read_csv('data/test.csv')

In [395]:
train_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [396]:
y_train_df = train_data['Survived']
print(y_train_df.head())

0    0
1    1
2    1
3    1
4    0
Name: Survived, dtype: int64


In [397]:
def normalize(xi, range_min, range_max):
    return (xi - range_min) / (range_max - range_min)

In [398]:
def convert_data(dataset):

    new_data = dataset.PassengerId

    # PClass to one hot encoding
    x1_train_df = pd.get_dummies(dataset.Pclass)
    x1_train_df = x1_train_df.set_axis([f'IsPClass{i}' for i in range(1, 4)], axis=1)
    new_data = pd.concat([new_data, x1_train_df], axis=1)

    # Gender to one hot encoding
    x1_train_df = pd.get_dummies(dataset.Sex)
    x1_train_df = x1_train_df.set_axis(['Female', 'Male'], axis=1)
    new_data = pd.concat([new_data, x1_train_df], axis=1)

    # Amount of siblings to continuous variable
    x1_train_df = dataset[['SibSp']].copy()
    x1_train_df = x1_train_df.apply(lambda cell: normalize(cell, min(x1_train_df.SibSp), max(x1_train_df.SibSp)), axis=0)
    new_data = pd.concat([new_data, x1_train_df], axis=1)

    # Age
    num_age_bins = 10
    age_bin_labels = [f'Bin {i}' for i in range(num_age_bins)]
    age_bins = dataset[['Age']].copy()
    age_bins['Age'] = pd.cut(age_bins['Age'], bins=num_age_bins, include_lowest=True, labels=age_bin_labels)
    age_bins['Age'] = age_bins['Age'].cat.add_categories('missing value').fillna('missing value')
    age_bin_dummies = pd.get_dummies(age_bins)
    new_data = pd.concat([new_data, age_bin_dummies], axis=1)

    # Fare
    num_fare_bins = 25
    fare_bin_labels = [f'FBin {i}' for i in range(num_fare_bins)]
    fare_bins = dataset[['Fare']].copy()
    fare_bins['Fare']= pd.cut(fare_bins['Fare'], bins=num_fare_bins, include_lowest=True, labels=fare_bin_labels)
    fare_bin_dummies = pd.get_dummies(pd.DataFrame(fare_bins))
    new_data = pd.concat([new_data, fare_bin_dummies], axis=1)

    return new_data

In [399]:
x_train_df = convert_data(train_data)
x_test_df = convert_data(test_data)

x_train_df.head()

Unnamed: 0,PassengerId,IsPClass1,IsPClass2,IsPClass3,Female,Male,SibSp,Age_Bin 0,Age_Bin 1,Age_Bin 2,...,Fare_FBin 15,Fare_FBin 16,Fare_FBin 17,Fare_FBin 18,Fare_FBin 19,Fare_FBin 20,Fare_FBin 21,Fare_FBin 22,Fare_FBin 23,Fare_FBin 24
0,1,0,0,1,0,1,0.125,0,0,1,...,0,0,0,0,0,0,0,0,0,0
1,2,1,0,0,1,0,0.125,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,3,0,0,1,1,0,0.0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,4,1,0,0,1,0,0.125,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,0,0,1,0,1,0.0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [400]:
from sklearn.ensemble import RandomForestClassifier

print(x_test_df.columns)

features = [f for f in x_train_df.columns if f != 'PassengerId']

forest_train_x = x_train_df[features]
forest_test_x = x_test_df[features]

# Random Forest
forestModel = RandomForestClassifier(n_estimators = 100, max_depth=5, random_state=1)
forestModel.fit(forest_train_x, y_train_df)
predictions = forestModel.predict(forest_test_x)
output = pd.DataFrame({'PassengerId': x_test_df.PassengerId, 'Survived': predictions})
print(os.getcwd())
output.to_csv('data/submission.csv', index=False)


Index(['PassengerId', 'IsPClass1', 'IsPClass2', 'IsPClass3', 'Female', 'Male',
       'SibSp', 'Age_Bin 0', 'Age_Bin 1', 'Age_Bin 2', 'Age_Bin 3',
       'Age_Bin 4', 'Age_Bin 5', 'Age_Bin 6', 'Age_Bin 7', 'Age_Bin 8',
       'Age_Bin 9', 'Age_missing value', 'Fare_FBin 0', 'Fare_FBin 1',
       'Fare_FBin 2', 'Fare_FBin 3', 'Fare_FBin 4', 'Fare_FBin 5',
       'Fare_FBin 6', 'Fare_FBin 7', 'Fare_FBin 8', 'Fare_FBin 9',
       'Fare_FBin 10', 'Fare_FBin 11', 'Fare_FBin 12', 'Fare_FBin 13',
       'Fare_FBin 14', 'Fare_FBin 15', 'Fare_FBin 16', 'Fare_FBin 17',
       'Fare_FBin 18', 'Fare_FBin 19', 'Fare_FBin 20', 'Fare_FBin 21',
       'Fare_FBin 22', 'Fare_FBin 23', 'Fare_FBin 24'],
      dtype='object')
/home/buckfae/Documents/DataScience/Titanic


In [401]:
def create_nn(input_shape, hidden_shapes):
    assert len(hidden_shapes) > 0
    # Input Layer
    network_architecture = [nn.Linear(input_shape, hidden_shapes[0]), nn.ReLU()]

    # Hidden Layer
    hidden = [[nn.Linear(hidden_shapes[i], hidden_shapes[i + 1]), nn.ReLU()] for i in range(len(hidden_shapes) - 1)]
    for layer in hidden:
        for item in layer:
            network_architecture.append(item)

    # Output Layer
    network_architecture += [nn.Linear(hidden_shapes[-1], 1), nn.Sigmoid()]
    return nn.Sequential(*network_architecture)

In [402]:
# Dropping ID column
x_train_df = x_train_df.drop(columns=['PassengerId'], axis=0)
x_test_df = x_test_df.drop(columns=['PassengerId'], axis=0)
x_train_df.head()

x_train = []
y_train = []
for train_row in x_train_df.values:
    x_train.append(torch.tensor(train_row.astype(np.float32)))
for train_row in y_train_df.values:
    y_train.append(torch.tensor([train_row.astype(np.float32)]))

print(f'Input tensor features: {x_train_df.columns}')


Input tensor features: Index(['IsPClass1', 'IsPClass2', 'IsPClass3', 'Female', 'Male', 'SibSp',
       'Age_Bin 0', 'Age_Bin 1', 'Age_Bin 2', 'Age_Bin 3', 'Age_Bin 4',
       'Age_Bin 5', 'Age_Bin 6', 'Age_Bin 7', 'Age_Bin 8', 'Age_Bin 9',
       'Age_missing value', 'Fare_FBin 0', 'Fare_FBin 1', 'Fare_FBin 2',
       'Fare_FBin 3', 'Fare_FBin 4', 'Fare_FBin 5', 'Fare_FBin 6',
       'Fare_FBin 7', 'Fare_FBin 8', 'Fare_FBin 9', 'Fare_FBin 10',
       'Fare_FBin 11', 'Fare_FBin 12', 'Fare_FBin 13', 'Fare_FBin 14',
       'Fare_FBin 15', 'Fare_FBin 16', 'Fare_FBin 17', 'Fare_FBin 18',
       'Fare_FBin 19', 'Fare_FBin 20', 'Fare_FBin 21', 'Fare_FBin 22',
       'Fare_FBin 23', 'Fare_FBin 24'],
      dtype='object')


In [403]:
input_size = x_train_df.shape[1]
hidden_layers = [64, 64, 64]
epochs = 100

print(f'Input size: {input_size}')
model = create_nn(input_size, hidden_layers)
print(model)

Input size: 42
Sequential(
  (0): Linear(in_features=42, out_features=64, bias=True)
  (1): ReLU()
  (2): Linear(in_features=64, out_features=64, bias=True)
  (3): ReLU()
  (4): Linear(in_features=64, out_features=64, bias=True)
  (5): ReLU()
  (6): Linear(in_features=64, out_features=1, bias=True)
  (7): Sigmoid()
)


In [404]:
optimizer = optim.SGD(model.parameters(), lr=0.0003)
loss_func = nn.BCELoss()

for e in range(epochs + 1):

    running_loss = 0
    correct = 0
    wrong = 0

    for x, y in zip(x_train, y_train):
        optimizer.zero_grad()
        output = model(x)

        if output[0] > 0.5 and y == 1 or output[0] < 0.5 and y == 0:
            correct += 1
        else:
            wrong += 1

        loss = loss_func(output, y)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()

    if e % 10 == 0:
        avg_loss = running_loss / len(x_train)
        print(f'Episode: {e:04d}: {correct:04d} / {wrong:04d} - Loss: {avg_loss:.10f}')

Episode: 0000: 0484 / 0407 - Loss: 0.6924033796
Episode: 0010: 0549 / 0342 - Loss: 0.6642561893
Episode: 0020: 0549 / 0342 - Loss: 0.6548432316
Episode: 0030: 0549 / 0342 - Loss: 0.6433532642
Episode: 0040: 0549 / 0342 - Loss: 0.6204391499
Episode: 0050: 0590 / 0301 - Loss: 0.5748073137
Episode: 0060: 0690 / 0201 - Loss: 0.5202306847
Episode: 0070: 0704 / 0187 - Loss: 0.4829985039
Episode: 0080: 0703 / 0188 - Loss: 0.4638215694
Episode: 0090: 0710 / 0181 - Loss: 0.4518913827
Episode: 0100: 0715 / 0176 - Loss: 0.4422046982
