In [None]:
import numpy as np
import torch
from torch import nn, optim
import torch.nn.functional as F
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
# Load the training data; the path is resolved relative to the current working directory.
df = pd.read_csv('train.csv')
# Preview the first five rows to check that the file parsed as expected.
df.head(5)

In [None]:
# Summary statistics for every column; include='all' also covers the non-numeric ones.
df.describe(include='all')

In [None]:
def preprocess(data_old, training=False):
    """Select the columns used by the model and encode them numerically.

    Parameters
    ----------
    data_old : pandas.DataFrame
        Raw frame containing at least 'Pclass', 'Sex', 'Age', 'SibSp',
        'Parch', 'Fare' and 'Embarked' (plus 'Survived' when ``training``
        is true). It is not modified.
    training : bool, default False
        When true, the 'Survived' label column is kept (cast to float) as
        the last column of the result.

    Returns
    -------
    pandas.DataFrame
        New frame with 'Sex' and 'Embarked' mapped to float codes,
        'SibSp'/'Parch' cast to float, and every NaN replaced by the mean
        of its column computed on this same frame.
    """
    # get relevant columns
    columns = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']
    if training:
        columns.append('Survived')

    # Work on an explicit copy: assigning columns onto a bare column selection
    # is what makes pandas emit SettingWithCopyWarning.
    data = data_old[columns].copy()

    if training:
        data['Survived'] = data['Survived'].astype('float')

    # encode in float; values absent from a mapping become NaN and are
    # mean-filled below
    data['Sex'] = data['Sex'].map({'female': 0.0, 'male': 1.0})
    data['Embarked'] = data['Embarked'].map({'S': 0.0, 'C': 1.0, 'Q': 2.0})
    data['SibSp'] = data['SibSp'].astype('float')
    data['Parch'] = data['Parch'].astype('float')

    # fill nan with mean
    return data.fillna(data.mean())

def get_model(n_inputs=None):
    """Build the classifier: three hidden layers of 10 ReLU units, 2 outputs.

    Parameters
    ----------
    n_inputs : int, optional
        Number of input features. When omitted, the module-level
        ``input_dimensions`` is read at call time, as before.

    Returns
    -------
    torch.nn.Sequential
        Network whose output is the log-softmax over the last dimension,
        i.e. log-probabilities suitable for ``nn.NLLLoss``.
    """
    if n_inputs is None:
        n_inputs = input_dimensions

    return nn.Sequential(
        nn.Linear(n_inputs, 10),
        nn.ReLU(),
        nn.Linear(10, 10),
        nn.ReLU(),
        nn.Linear(10, 10),
        nn.ReLU(),
        nn.Linear(10, 2),
        # Explicit dim: the implicit choice is deprecated and warns on every
        # forward pass. -1 selects the class axis for both batched (N, 2) and
        # single-sample (2,) outputs, matching the old implicit behaviour.
        nn.LogSoftmax(dim=-1),
    )

def data_to_inputs(data):
    """Return the feature columns of ``data`` as a float32 tensor.

    The 'Survived' label column is left out when it is present; ``data``
    itself is not modified.
    """
    is_feature = data.columns != 'Survived'
    features = data.loc[:, is_feature]
    return torch.tensor(np.array(features), dtype=torch.float)

def data_to_labels(data):
    """Return the 'Survived' column of ``data`` as an int64 tensor."""
    survived = np.array(data['Survived'])
    return torch.tensor(survived, dtype=torch.long)

def ps_to_class(ps_out):
    """Return, for each row of ``ps_out``, the index of its largest entry.

    ``ps_out`` is indexed along dim 1, so it must be at least 2-D; the result
    is flattened to one dimension.
    """
    best = ps_out.topk(k=1, dim=1)
    return best.indices.view(-1)

def split_const_size(data, size=32):
    """Split ``data`` into consecutive positional chunks of ``size`` rows.

    Parameters
    ----------
    data : pandas.DataFrame or pandas.Series
        Object supporting ``len`` and ``.iloc`` slicing.
    size : int, default 32
        Maximum number of rows per chunk; must be at least 1.

    Returns
    -------
    list
        Chunks in order. Every chunk has ``size`` rows except possibly the
        last one; empty input yields an empty list.

    Raises
    ------
    ValueError
        If ``size`` is smaller than 1.
    """
    if size < 1:
        raise ValueError(f"size must be a positive integer, got {size!r}")
    # Stepping through the start offsets avoids the ceil() computation, which
    # previously went through `np.math` (removed in NumPy 2.0).
    return [data.iloc[start:start + size] for start in range(0, len(data), size)]

def train_loop(model, train, test, criterion, optimizer, epochs=50, trainsize=1000, testsize=1000):
    """Train ``model`` and record its progress after every epoch.

    Parameters
    ----------
    model : torch.nn.Module
        Network to optimise; it is left in eval mode on return.
    train, test : pandas.DataFrame
        Preprocessed frames that both contain a 'Survived' label column.
    criterion : callable
        Loss taking ``(model_output, labels)``.
    optimizer : torch.optim.Optimizer
        Optimizer already bound to ``model``'s parameters.
    epochs : int, default 50
        Number of passes over the training data.
    trainsize : int, default 1000
        Maximum number of (reshuffled) training rows used per epoch.
    testsize : int, default 1000
        Maximum number of leading test rows used for evaluation.

    Returns
    -------
    (list of float, list of float)
        Per-epoch mean batch loss on the training data and per-epoch accuracy
        on the test data. An entry is NaN when there was nothing to average.
    """
    train_losses = []
    test_accs = []

    # The evaluation set is the same every epoch: convert and cut it off once.
    test_inputs = data_to_inputs(test)[:testsize]
    test_labels = data_to_labels(test)[:testsize]

    for _ in range(epochs):
        model.train()
        # Reshuffle each epoch, then apply the sample cap. The cap is applied to
        # rows here (not compared with a batch index) and never overwritten, so
        # it means the same thing in every epoch.
        epoch_rows = train.sample(frac=1).iloc[:trainsize]

        running_loss = 0.0
        n_batches = 0
        for batch in split_const_size(epoch_rows):
            inputs = data_to_inputs(batch)
            labels = data_to_labels(batch)

            optimizer.zero_grad()
            loss = criterion(model(inputs), labels)
            loss.backward()
            optimizer.step()

            # .item() detaches the value; summing the tensors themselves would
            # chain every batch's autograd graph together.
            running_loss += loss.item()
            n_batches += 1

        # compare model output to targets, without tracking gradients
        model.eval()
        with torch.no_grad():
            classification = ps_to_class(model(test_inputs))
        n_correct = (classification == test_labels).sum().item()
        n_evaluated = len(test_labels)

        train_losses.append(running_loss / n_batches if n_batches else float('nan'))
        test_accs.append(n_correct / n_evaluated if n_evaluated else float('nan'))

    return train_losses, test_accs

# Build the training frame: the selected feature columns plus the 'Survived' label.
data = preprocess(df, training=True)
# Every column except the label is a model input; get_model() reads this global.
input_dimensions = data.shape[1] - 1
# Inspect the encoded, NaN-filled result.
data.head(10)

In [None]:
# Seed every RNG involved so that Restart & Run All reproduces the same split,
# initial weights and curves: pandas' sample() draws from NumPy's global state
# when no random_state is given, and torch drives the weight initialisation.
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)

# 75% of the rows for training, the remaining 25% held out for evaluation
train = data.sample(frac=0.75)
test = data.drop(train.index)

model = get_model()
criterion = nn.NLLLoss()
optimizer = optim.Adam(model.parameters(), lr=0.003)
train_losses, test_accs = train_loop(model, train, test, criterion, optimizer, epochs=100)

# visualize train loss and test acc
plt.plot(train_losses)
plt.title('Training loss per epoch')
plt.xlabel('epoch')
plt.ylabel('train NLLLoss')
plt.show()

plt.plot(test_accs)
plt.title('Hold-out accuracy per epoch')
plt.xlabel('epoch')
plt.ylabel('test accuracy')
plt.show()

In [None]:
def predict_gender_based(x):
    """Baseline rule: return 1 when ``x`` equals 'female', otherwise 0."""
    return 1 if x == 'female' else 0

In [None]:
# predict test set
test_df = pd.read_csv('test.csv')

# Baseline kept for comparison (classify by the 'Sex' column alone):
# test_df['Survived'] = test_df['Sex'].map(predict_gender_based)

# NOTE(review): preprocess() fills missing values with the means of the frame
# it is given, so these rows are filled with test-set means rather than the
# training means -- confirm that this is intended.
model.eval()
preprocessed = preprocess(test_df)
inputs = data_to_inputs(preprocessed)

# Inference only: no autograd graph is needed.
with torch.no_grad():
    ps_out = model(inputs)
classification = ps_to_class(ps_out)

# Convert the tensor to NumPy explicitly and pin the index, instead of relying
# on pandas' implicit handling of torch tensors and on default index alignment.
test_df['Survived'] = pd.Series(classification.numpy(), index=test_df.index, dtype='int')

# to submission file: passengerId,Survived
submission = test_df[['PassengerId', 'Survived']]
submission.to_csv('submission.csv', index=False)

# Last expression of the cell, so the preview is actually rendered.
test_df.head()