Use machine learning to create a model that predicts which passengers survived the Titanic shipwreck.

### Importing packages

In [29]:
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import numpy as np

import torch
from torch import nn

In [30]:
import torch
# Report the installed PyTorch build and, when CUDA is usable, describe the active GPU.
print('PyTorch version:', torch.__version__)
if torch.cuda.is_available():
  gpu_count = torch.cuda.device_count()
  print("You have %d GPUs" % gpu_count)
  gpu_id = torch.cuda.current_device()  # index of the currently selected device
  print("The selected GPU is GPU", gpu_id)
  gpu_name = torch.cuda.get_device_name(gpu_id)
  print("- Name:", gpu_name)
  gpu_properties = torch.cuda.get_device_properties(gpu_id)
  print("- All properties:", gpu_properties)

PyTorch version: 2.0.0
You have 1 GPUs
The selected GPU is GPU 0
- Name: NVIDIA GeForce RTX 3080 Ti
- All properties: _CudaDeviceProperties(name='NVIDIA GeForce RTX 3080 Ti', major=8, minor=6, total_memory=12287MB, multi_processor_count=80)


### Data loading and preprocessing

In [176]:
# Friendlier column labels that replace the headers stored in the CSV files
# (header=0 together with names= overrides the original header row).
column_names = ['PassengerId','Survived','TicketClass','Name','Sex','Age','SiblingsSpouses','ParentChildren','TicketNumber','Fare','Cabin','Embarked']
raw_data_train = pd.read_csv('train.csv', names=column_names, header=0, delimiter=",", skipinitialspace=True)

# The test file has the same layout minus the 'Survived' target column.
column_names = [name for name in column_names if name != 'Survived']
raw_data_test = pd.read_csv('test.csv', names=column_names, header=0, delimiter=",", skipinitialspace=True)

raw_data_train.tail()

Unnamed: 0,PassengerId,Survived,TicketClass,Name,Sex,Age,SiblingsSpouses,ParentChildren,TicketNumber,Fare,Cabin,Embarked
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.45,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0,C148,C
890,891,0,3,"Dooley, Mr. Patrick",male,32.0,0,0,370376,7.75,,Q


Now we'll analyze the data in the training set:

In [32]:
# Fraction of missing values per column (mean of the boolean NaN mask)
raw_data_train.isna().mean()

PassengerId        0.000000
Survived           0.000000
TicketClass        0.000000
Name               0.000000
Sex                0.000000
Age                0.198653
SiblingsSpouses    0.000000
ParentChildren     0.000000
TicketNumber       0.000000
Fare               0.000000
Cabin              0.771044
Embarked           0.002245
dtype: float64

In [33]:
# Summary statistics of the numeric columns, one row per feature.
# 'Survived' is the column we want to predict and 'PassengerId' is dropped as well,
# so only the numeric input features remain.
train_stats = (
    raw_data_train
    .describe()
    .drop(columns=["Survived", "PassengerId"])
    .transpose()
)
train_stats

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
TicketClass,891.0,2.308642,0.836071,1.0,2.0,3.0,3.0,3.0
Age,714.0,29.699118,14.526497,0.42,20.125,28.0,38.0,80.0
SiblingsSpouses,891.0,0.523008,1.102743,0.0,0.0,0.0,1.0,8.0
ParentChildren,891.0,0.381594,0.806057,0.0,0.0,0.0,0.0,6.0
Fare,891.0,32.204208,49.693429,0.0,7.9104,14.4542,31.0,512.3292


After inspecting the dataset, the following considerations can be made:
- PassengerId and TicketNumber are apparently useless features for the task, so we'll drop them
- The cabin attribute is unknown for 77% of the samples, so we'll drop the feature
- We'll drop the two samples without embarkation information
- Even though the age is unknown for almost 20% of the dataset, it is probably an important feature, so we'll drop the samples without age info for simplicity
- We must use One-Hot Encoding for the categorical features 'TicketClass', 'Sex' and 'Embarked'
- We can extract useful information about the social status of the person from the Name attribute
- Numeric features must be normalized

We define a function to preprocess raw data. It will be used to preprocess both the training set and later the test set:

In [185]:
def preprocess_raw_data(raw_data: pd.DataFrame, dropna = True, stats: pd.DataFrame = pd.DataFrame()):
  """Turn a raw Titanic frame into a purely numeric feature frame.

  The input is not modified; a processed copy is returned.

  Args:
    raw_data: frame with the renamed columns ('PassengerId', 'TicketClass',
      'Name', 'Sex', 'Age', 'SiblingsSpouses', 'ParentChildren',
      'TicketNumber', 'Fare', 'Cabin', 'Embarked', and optionally 'Survived').
    dropna: if True, rows that still contain missing values after the unused
      columns are removed get dropped. If False, missing values are imputed:
      'Embarked' with 'S', 'Age' and 'Fare' with the means found in `stats`.
    stats: statistics frame indexed by feature name with a 'mean' column
      (e.g. `df.describe().transpose()`). Only read when `dropna` is False.

  Returns:
    A new DataFrame where 'Sex', 'TicketClass', 'Embarked' and the title
    extracted from 'Name' are one-hot encoded as 0.0/1.0 columns.

  Raises:
    ValueError: if `dropna` is False and `stats` lacks the 'Age'/'Fare' means.
  """
  data = raw_data.copy()
  #Drop unnecessary features
  data = data.drop(['PassengerId','TicketNumber','Cabin'],axis=1)
  #Drop samples without age or without embarkation info if dropna == True, otherwise replace missing data
  if dropna:
    data = data.dropna()
  else:
    has_means = 'mean' in stats.columns and {'Age', 'Fare'}.issubset(stats.index)
    if not has_means:
      # Fail with an explicit message instead of an opaque KeyError from the lookup below
      raise ValueError("stats must provide the 'mean' of 'Age' and 'Fare' when dropna=False")
    data['Embarked'] = data['Embarked'].fillna('S')
    data['Age'] = data['Age'].fillna(stats.loc['Age', 'mean'])
    data['Fare'] = data['Fare'].fillna(stats.loc['Fare', 'mean'])
  #One-Hot encode categorical features (multiplying by 1.0 turns the boolean mask into floats)
  sex = data.pop('Sex')
  data['Male'] = (sex == 'male') * 1.0
  data['Female'] = (sex == 'female') * 1.0
  tClass = data.pop('TicketClass')
  data['UpperClass'] = (tClass == 1) * 1.0
  data['MiddleClass'] = (tClass == 2) * 1.0
  data['LowerClass'] = (tClass == 3) * 1.0
  embarked = data.pop('Embarked')
  data['CherbourgPort'] = (embarked == 'C') * 1.0
  data['QueenstownPort'] = (embarked == 'Q') * 1.0
  data['SouthamptonPort'] = (embarked == 'S') * 1.0
  #Extract the title from the name: the text between the first ', ' and the following '.'
  #Element-wise .str[...] indexing (instead of expand=True) also works when no rows are left.
  titles = data.pop('Name').str.split(', ').str[1].str.split('.').str[0]
  data['Mr'] = (titles == 'Mr') * 1.0
  data['Miss/Mrs/Ms/Lady'] = titles.isin(['Miss', 'Mrs', 'Ms', 'Lady', 'Mme', 'Mlle']) * 1.0
  data['Master'] = (titles == 'Master') * 1.0
  data['Rev/Dr/Major/Col/Contess/Capt/Sir/Don/Jonkheer'] = titles.isin(['Rev', 'Dr', 'Major', 'Col', 'the Countess', 'Capt', 'Sir', 'Don', 'Jonkheer']) * 1.0
  return data

def normalize_data(raw_data: pd.DataFrame, stats: pd.DataFrame):
  """Standardize the numeric features with the given statistics.

  Each of 'Age', 'SiblingsSpouses', 'ParentChildren' and 'Fare' is replaced by
  (value - mean) / std, reading mean and std from `stats`.

  Args:
    raw_data: frame containing the four numeric feature columns.
    stats: frame indexed by feature name with 'mean' and 'std' columns
      (e.g. `df.describe().transpose()`).

  Returns:
    A normalized copy; `raw_data` itself is left untouched.
  """
  # Work on a copy so the caller's frame is not silently normalized in place
  data = raw_data.copy()
  #Data normalization based only on the statistics
  for col in ['Age', 'SiblingsSpouses', 'ParentChildren', 'Fare']:
    data[col] = (data[col] - stats.loc[col, 'mean']) / stats.loc[col, 'std']
  return data

# def preprocess_train_val(raw_train_df: pd.DataFrame, raw_val_df: pd.DataFrame):
#   # Preprocess data
#   train_df = preprocess_raw_data(raw_train_df)
#   val_df = preprocess_raw_data(raw_val_df)

#   # Extract statistics for data normalization only from the training set
#   train_stats = train_df.describe()
#   train_stats = train_stats.transpose()

#   train_df = normalize_data(train_df, train_stats)
#   val_df = normalize_data(val_df, train_stats)
#   return (train_df,val_df)

# train_stats = raw_data_train.describe()
# train_stats.pop("Survived") # the column we want to predict its values
# train_stats.pop("PassengerId")
# train_stats = train_stats.transpose()
# train_data = preprocess_raw_data(raw_data_train,train_stats)
# train_data.tail()


### Training and validation extraction

In [148]:
# Hold out 20% of the labelled data for validation; random_state makes the split repeatable.
dataset = raw_data_train.copy()
train_data = dataset.sample(frac=0.8, random_state=0)
val_data = dataset.drop(train_data.index)  # every row that was not sampled for training

#(train_data, val_data) = preprocess_train_val(train_data, val_data)


# Training rows with missing values are dropped (dropna defaults to True).
train_data = preprocess_raw_data(train_data)
# Statistics come from the preprocessed training split only and are reused below
# both to impute and to normalize the validation split.
train_stats = train_data.describe().transpose()
val_data = preprocess_raw_data(val_data, dropna=False, stats=train_stats) #We don't drop samples for validation

# Both splits are scaled with the training mean/std.
train_data = normalize_data(train_data, train_stats)
val_data = normalize_data(val_data, train_stats)

# Separate the target column from the features (pop removes it from the frame).
train_labels = train_data.pop("Survived")
val_labels = val_data.pop("Survived")
# train_data.describe()

### Model definition

In [149]:
from torch.utils.data import TensorDataset
def df2tensor(df, device=None):
    """Convert a pandas DataFrame/Series into a float32 tensor.

    Args:
        df: pandas object whose `.values` is a numeric NumPy array.
        device: target device (anything accepted by `Tensor.to`). When omitted,
            CUDA is used if available, otherwise the CPU, so the helper no
            longer crashes on machines without a GPU.

    Returns:
        A `torch.float32` tensor with the same shape as `df.values`.
    """
    if device is None:
        device = 'cuda' if torch.cuda.is_available() else 'cpu'
    return torch.from_numpy(df.values).float().to(device)
# Pair each feature row with its label. Labels are cast to integer class indices.
# NOTE(review): torch.LongTensor is the CPU tensor type, so .type(torch.LongTensor)
# presumably brings the labels back to the CPU after df2tensor moved them to the GPU - confirm.
train_dataset = TensorDataset(df2tensor(train_data), df2tensor(train_labels).type(torch.LongTensor))
val_dataset = TensorDataset(df2tensor(val_data), df2tensor(val_labels).type(torch.LongTensor))

# Number of feature columns; used as the default input size of the network defined below.
num_train_samples, num_features = train_data.shape

In [150]:
class MultiLayerPerceptron(nn.Module):
    """Fully-connected network with sigmoid hidden activations.

    `forward` returns raw, unnormalized class scores (logits). The notebook
    trains with `nn.CrossEntropyLoss`, which applies log-softmax internally, so
    applying Softmax inside `forward` as well would normalize twice and flatten
    the gradients. Because softmax is monotonic, `argmax` over the logits picks
    the same class as `argmax` over the probabilities. Use `predict_proba` when
    probabilities are needed.

    Args:
        input_size: number of input features (defaults to the notebook-level
            `num_features`, read when the class is defined).
        hidden_size: sequence with the width of each hidden layer.
        output_size: number of output units (classes).
    """
    def __init__(self, input_size=num_features, hidden_size=[64, 64], output_size=1):
        # init function executed once when the nn is instantiated
        super().__init__() # execute the nn.Module init function

        # layers with trainable parameters
        # list(...) copies the (shared) default argument and also accepts tuples
        all_size = [input_size] + list(hidden_size) + [output_size] # [num_features, 64, 64, 1]
        self.linears = nn.ModuleList(
            [nn.Linear(in_size, out_size) # fully-connected layer
             for in_size, out_size in zip(all_size[:-1], all_size[1:])]
        )

        # layers without trainable parameters
        self.sigmoid = nn.Sigmoid()                      # sigmoid activation layer

        # kept for predict_proba; deliberately NOT applied in forward
        self.softmax = nn.Softmax(dim=-1)

    def forward(self, x, verbose=False):
        """Return the logits for `x`; `verbose` is accepted but unused.

        Note: `squeeze()` removes every size-1 dimension, so a mini-batch
        holding a single sample comes back without its batch dimension.
        """
        # forward function executed when an input is passed to the nn
        last_layer = len(self.linears) - 1
        for i, l in enumerate(self.linears):
            x = l(x)                     # apply the i-th fully-connected layer
            if i < last_layer:
                x = self.sigmoid(x)      # sigmoid on every layer except the output layer
        return x.squeeze()               # model output (logits)

    def predict_proba(self, x):
        """Return class probabilities (softmax over the logits) without tracking gradients."""
        with torch.no_grad():
            return self.softmax(self.forward(x))
    
# Instantiate the network with its default sizes on the GPU and print its layer layout.
model = MultiLayerPerceptron().cuda()
print(model)

MultiLayerPerceptron(
  (linears): ModuleList(
    (0): Linear(in_features=16, out_features=64, bias=True)
    (1): Linear(in_features=64, out_features=64, bias=True)
    (2): Linear(in_features=64, out_features=1, bias=True)
  )
  (sigmoid): Sigmoid()
  (softmax): Softmax(dim=-1)
)


### Model training and validation

In [221]:
def _module_device(module):
  """Return the device holding `module`'s parameters (CPU when it has none)."""
  first_param = next(module.parameters(), None)
  return torch.device('cpu') if first_param is None else first_param.device

def evaluate_loss_accuracy(lossFunction, data_loader, model=None):
  """Compute the mean loss and the accuracy of `model` over a whole data loader.

  Args:
    lossFunction: callable `(outputs, targets) -> scalar tensor`. The per-sample
      weighting below assumes it averages over the batch (the default
      reduction of `nn.CrossEntropyLoss`).
    data_loader: iterable of `(X, y)` mini-batches.
    model: network to evaluate. When omitted, the notebook-level global `model`
      is used, which keeps the old two-argument calls working.

  Returns:
    `(loss, accuracy)` as Python floats, both averaged over samples, so a
    smaller last batch does not get the same weight as a full one.

  Raises:
    ValueError: if `data_loader` yields no samples.
  """
  if model is None:
    model = globals()['model']  # backward compatibility with the global-model call style
  device = _module_device(model)

  loss_sum, correct, seen = 0.0, 0, 0
  was_training = model.training
  model.eval()
  try:
    with torch.no_grad():
      for X, y in data_loader:
        X = X.to(device)
        y = y.to(device)

        o = model(X)
        if(o.dim() == 1):
          o = o[None,:] # the model squeezes away the batch dimension of a size-1 mini-batch
        batch_len = y.shape[0]
        loss_sum += lossFunction(o, y).item() * batch_len
        correct += (o.argmax(-1) == y).sum().item()
        seen += batch_len
  finally:
    model.train(was_training) # restore whichever mode the caller had set
  if seen == 0:
    raise ValueError("data_loader yielded no samples")
  return loss_sum / seen, correct / seen

def one_epoch(model, lossFunction, optimizer, train_loader, val_loader, writer, epoch_num):
  """Train `model` for one pass over `train_loader`, then evaluate it.

  Per-batch training loss/accuracy are logged to `writer` under 'train/loss'
  and 'train/acc'; after the pass, validation loss/accuracy are logged under
  'val/loss' and 'val/acc' at the global step of the last training batch.

  Args:
    model: network to optimize; batches are moved to the device of its parameters.
    lossFunction: callable `(outputs, targets) -> scalar tensor`.
    optimizer: optimizer bound to the model parameters.
    train_loader: loader of `(X, y)` training mini-batches.
    val_loader: loader of `(X, y)` validation mini-batches.
    writer: object exposing `add_scalar(tag, value, step)`.
    epoch_num: zero-based epoch index, used to compute the global step.

  Returns:
    `(val_loss, val_accuracy, train_loss, train_accuracy)` as Python floats.
  """
  device = _module_device(model)
  i_start = epoch_num * len(train_loader)
  step = i_start # stays defined even if the training loader yields nothing
  for i, (X, y) in enumerate(train_loader):
    step = i_start + i
    X = X.to(device)
    y = y.to(device)

    optimizer.zero_grad()

    o = model(X)
    if(o.dim() == 1):
      o = o[None,:] # the model squeezes away the batch dimension of a size-1 mini-batch
    l = lossFunction(o, y)

    l.backward()
    optimizer.step()

    acc = (o.detach().argmax(-1) == y).float().mean().item()

    writer.add_scalar('train/loss', l.item(), step)
    writer.add_scalar('train/acc', acc, step)

  # Evaluate the model that was just trained (not whatever the global `model` happens to be)
  train_loss, train_accuracy = evaluate_loss_accuracy(lossFunction, train_loader, model)
  val_loss, val_accuracy = evaluate_loss_accuracy(lossFunction, val_loader, model)

  writer.add_scalar('val/loss', val_loss, step)
  writer.add_scalar('val/acc', val_accuracy, step)
  return val_loss, val_accuracy, train_loss, train_accuracy

In [None]:
import os
from torch.utils.tensorboard.writer import SummaryWriter
from tensorboard import notebook

def start_tensorboard(log_dir):
  """Create a SummaryWriter for `log_dir` and (re)launch TensorBoard in the notebook.

  Args:
    log_dir: name of the run; events are written to the "runs/<log_dir>" directory.

  Returns:
    The SummaryWriter that logs into that directory.

  Note: relies on IPython shell/line magics, so it only works when executed
  inside a notebook/IPython session.
  """
  writer = SummaryWriter(os.path.join("runs", log_dir))

  # run tensorboard in background
  # shell command: kill any tensorboard process that is already running
  ! killall tensorboard
  # load the notebook extension, then start TensorBoard on ./runs using port 8008
  %load_ext tensorboard
  %tensorboard --logdir ./runs --port=8008


  notebook.list() # View open TensorBoard instances

  return writer

experiment_name = 'second-experiment'
# Directory for the checkpoints and metric tensors of this experiment.
# exist_ok=True keeps the cell re-runnable: os.mkdir raised FileExistsError on the second run.
os.makedirs(experiment_name, exist_ok=True)
writer = start_tensorboard(experiment_name)


In [224]:
from torch.utils.data import DataLoader

# Hyper-parameters
lossFunction = nn.CrossEntropyLoss()
batch_size = 16
lr = 0.001
momentum = .9
lambda_reg = 0  # weight decay (L2 regularization) coefficient

epochs = 20000  # upper bound; early stopping is expected to end training sooner
early_stopping_patience = 30

# dataloader, network, optimizer
train_loader = DataLoader(train_dataset, shuffle=True, batch_size = batch_size)# num_workers = 1, pin_memory = True)
val_loader = DataLoader(val_dataset, shuffle=False, batch_size = batch_size)# num_workers = 1, pin_memory = True)

model = MultiLayerPerceptron(output_size=2, hidden_size=[64, 64]).cuda()
optimizer = torch.optim.SGD(model.parameters(),
                            lr=lr,
                            weight_decay=lambda_reg,
                            momentum=momentum)

# early stopping and best model saving
early_stopping_counter = early_stopping_patience
min_val_loss = 1e10

# per-epoch metric history (rows after the last executed epoch stay at zero)
val_losses = torch.zeros(epochs, 1)
val_accuracies = torch.zeros(epochs, 1)
train_losses = torch.zeros(epochs, 1)
train_accuracies = torch.zeros(epochs, 1)

for e in range(epochs):
    print("FOLD {} - EPOCH {}".format(0, e))
    val_loss, val_accuracy, train_loss, train_accuracy = one_epoch(model, lossFunction, optimizer, train_loader, val_loader, writer, e)

    # store the metrics of this epoch
    val_losses[e, 0] = val_loss
    val_accuracies[e, 0] = val_accuracy
    train_losses[e, 0] = train_loss
    train_accuracies[e, 0] = train_accuracy  # previously stored train_loss by mistake

    generalization_gap = val_loss - train_loss

    torch.save(val_losses, os.path.join(experiment_name,'val_losses.pth'))
    torch.save(val_accuracies, os.path.join(experiment_name,'val_accuracies.pth'))
    torch.save(train_losses, os.path.join(experiment_name,'train_losses.pth'))
    torch.save(train_accuracies, os.path.join(experiment_name,'train_accuracies.pth'))

    # save the best model and check the early stopping criteria
    # Remember whether this epoch improved BEFORE min_val_loss is overwritten: comparing
    # val_loss with the freshly updated minimum is always true and made the reset branch
    # below unreachable.
    improved = val_loss < min_val_loss
    if improved: # save the best model
        min_val_loss = val_loss
        early_stopping_counter = early_stopping_patience # reset early stopping counter
        # File name matches the one the evaluation cells load ('best_model.pth')
        torch.save(model.state_dict(), os.path.join(experiment_name,'best_model.pth'))
        print("- saved best model")

    if e>0: # early stopping counter update
        previous_gap = (val_losses[e-1, 0] - train_losses[e-1, 0]).item()
        if generalization_gap > previous_gap or not improved:
            early_stopping_counter -= 1 # the gap widened or the validation loss did not improve
            print(generalization_gap, early_stopping_counter)
        else:
            early_stopping_counter = early_stopping_patience # reset early stopping counter
    if early_stopping_counter <= 0: # early stopping
        break


FOLD 0 - EPOCH 0
- saved best model
FOLD 0 - EPOCH 1
0.037335991859436035 29
FOLD 0 - EPOCH 2
0.042100727558135986 28
FOLD 0 - EPOCH 3
0.04541635513305664 27
FOLD 0 - EPOCH 4
FOLD 0 - EPOCH 5
0.04704439640045166 29
FOLD 0 - EPOCH 6
FOLD 0 - EPOCH 7
0.049262046813964844 29
FOLD 0 - EPOCH 8
FOLD 0 - EPOCH 9
0.04894179105758667 29
FOLD 0 - EPOCH 10
0.04923534393310547 28
FOLD 0 - EPOCH 11
FOLD 0 - EPOCH 12
0.05005753040313721 29
FOLD 0 - EPOCH 13
FOLD 0 - EPOCH 14
0.05032014846801758 29
FOLD 0 - EPOCH 15
FOLD 0 - EPOCH 16
FOLD 0 - EPOCH 17
0.047917842864990234 29
FOLD 0 - EPOCH 18
0.04845243692398071 28
FOLD 0 - EPOCH 19
FOLD 0 - EPOCH 20
FOLD 0 - EPOCH 21
0.04908883571624756 29
FOLD 0 - EPOCH 22
FOLD 0 - EPOCH 23
0.04898726940155029 29
FOLD 0 - EPOCH 24
FOLD 0 - EPOCH 25
FOLD 0 - EPOCH 26
0.04829061031341553 29
FOLD 0 - EPOCH 27
FOLD 0 - EPOCH 28
0.04659676551818848 29
FOLD 0 - EPOCH 29
0.04839581251144409 28
FOLD 0 - EPOCH 30
FOLD 0 - EPOCH 31
0.04793393611907959 29
FOLD 0 - EPOCH 32
FO

In [220]:
from torch.utils.data import DataLoader

experiment_name = 'second-experiment'

# Rebuild the architecture and restore the best weights saved during training
model = MultiLayerPerceptron(output_size=2, hidden_size=[64,64]).cuda()
model.load_state_dict(torch.load(os.path.join(experiment_name,'best_model.pth')))
model.eval()

Y, Y_hat = [], []
with torch.no_grad():
  for X, y in DataLoader(val_dataset, shuffle=False, batch_size=64):
    Y.append(y.cpu())
    # reshape(-1) keeps one prediction per sample even when the model squeezes
    # the batch dimension of a single-sample batch
    Y_hat.append(model(X.cuda()).argmax(-1).reshape(-1).cpu())

Y = torch.cat(Y)
Y_hat = torch.cat(Y_hat)
# These are the held-out validation samples, not the Kaggle test set
print("Validation accuracy:", (Y==Y_hat).float().mean().item())

Test accuracy: 0.8089887499809265


In [213]:
from torch.utils.data import DataLoader

experiment_name = 'first-experiment'

# Rebuild the architecture, then restore the weights stored for this experiment.
checkpoint_path = os.path.join(experiment_name,'best_model.pth')
model = MultiLayerPerceptron(output_size=2, hidden_size=[64,64]).cuda()
model.load_state_dict(torch.load(checkpoint_path))

test_data = raw_data_test.copy()

# Keep the passenger ids aside: preprocessing removes that column from the features.
predictions = pd.DataFrame(columns=["PassengerId"])
predictions["PassengerId"] = test_data["PassengerId"]

# Impute (never drop) missing values, then scale, both with the training statistics.
test_data = normalize_data(
  preprocess_raw_data(test_data, dropna=False, stats = train_stats),
  train_stats,
)
test_data_tensor = df2tensor(test_data)

# Single forward pass over the whole test set; the predicted class is the highest-scoring output.
with torch.no_grad():
  Y_hat = model(test_data_tensor.cuda()).argmax(-1).cpu()
  print(Y_hat)

# Write the submission file: one predicted 'Survived' value per passenger id.
Y_hat = pd.DataFrame(Y_hat.numpy())
predictions["Survived"] = Y_hat
predictions.to_csv("predictions.csv", encoding='utf-8', index=False)

tensor([0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0,
        1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0,
        1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0,
        1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0,
        1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1,
        1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0,
        0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0,
        1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0,
        0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0,
        1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1,
        1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
        0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0,
        0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0,