Use machine learning to create a model that predicts which passengers survived the Titanic shipwreck.

### Importing packages

In [5]:
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import numpy as np

import torch
from torch import nn

In [6]:
import torch

# Report the installed PyTorch build.
print('PyTorch version:', torch.__version__)

# When CUDA is usable, describe the device PyTorch currently selects.
if torch.cuda.is_available():
  gpu_id = torch.cuda.current_device()
  print("You have %d GPUs" % torch.cuda.device_count())
  print("The selected GPU is GPU", gpu_id)
  print(
    "- Name:",
    torch.cuda.get_device_name(gpu_id),
  )
  print(
    "- All properties:",
    torch.cuda.get_device_properties(gpu_id),
  )

PyTorch version: 2.0.0
You have 1 GPUs
The selected GPU is GPU 0
- Name: NVIDIA GeForce RTX 3080 Ti
- All properties: _CudaDeviceProperties(name='NVIDIA GeForce RTX 3080 Ti', major=8, minor=6, total_memory=12287MB, multi_processor_count=80)


### Data loading and preprocessing

In [7]:
# header=0 together with names= replaces the CSV's own header row with our column names.
column_names = ['PassengerId','Survived','TicketClass','Name','Sex','Age','SiblingsSpouses','ParentChildren','TicketNumber','Fare','Cabin','Embarked']
raw_data_train = pd.read_csv('train.csv', header=0, names=column_names, delimiter=",", skipinitialspace=True)

# The test set has the same columns minus the 'Survived' label, and lives in its own file.
# (Reading train.csv with one name too few makes pandas use the first column as the
# index and shifts every remaining column by one.)
column_names = [name for name in column_names if name != 'Survived']
raw_data_test = pd.read_csv('test.csv', header=0, names=column_names, delimiter=",", skipinitialspace=True)

raw_data_train.tail()

Unnamed: 0,PassengerId,Survived,TicketClass,Name,Sex,Age,SiblingsSpouses,ParentChildren,TicketNumber,Fare,Cabin,Embarked
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.45,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0,C148,C
890,891,0,3,"Dooley, Mr. Patrick",male,32.0,0,0,370376,7.75,,Q


Now we'll analyze the data in the training set:

In [8]:
# Fraction of missing values per column (the mean of a boolean mask is the share of True entries)
raw_data_train.isna().mean()

PassengerId        0.000000
Survived           0.000000
TicketClass        0.000000
Name               0.000000
Sex                0.000000
Age                0.198653
SiblingsSpouses    0.000000
ParentChildren     0.000000
TicketNumber       0.000000
Fare               0.000000
Cabin              0.771044
Embarked           0.002245
dtype: float64

In [9]:
# Summary statistics of the numeric input features, one row per feature.
# 'Survived' is the prediction target and 'PassengerId' is only an identifier,
# so neither belongs in the feature statistics.
train_stats = (
  raw_data_train
  .describe()
  .drop(columns=["Survived", "PassengerId"])
  .transpose()
)
train_stats

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
TicketClass,891.0,2.308642,0.836071,1.0,2.0,3.0,3.0,3.0
Age,714.0,29.699118,14.526497,0.42,20.125,28.0,38.0,80.0
SiblingsSpouses,891.0,0.523008,1.102743,0.0,0.0,0.0,1.0,8.0
ParentChildren,891.0,0.381594,0.806057,0.0,0.0,0.0,0.0,6.0
Fare,891.0,32.204208,49.693429,0.0,7.9104,14.4542,31.0,512.3292


After inspecting the dataset, the following considerations can be made:
- PassengerId and TicketNumber are apparently useless features for the task, so we'll drop them
- The cabin attribute is unknown for 77% of the samples, so we'll drop the feature
- We'll drop the two samples without embarkation information
- Even though the age is unknown for almost 20% of the dataset, it is probably an important feature, so we'll keep it and, for simplicity, drop the samples without age info
- We must use One-Hot Encoding for the categorical features 'TicketClass', 'Sex' and 'Embarked'
- We can extract useful information about the social status of the person from the Name attribute
- Numeric features must be normalized

We define a function to preprocess raw data. It will be used to preprocess both the training set and later the test set:

In [10]:
def preprocess_raw_data(raw_data: pd.DataFrame):
  """Turn a raw passenger table into a purely numeric feature table.

  Steps, in order:
    1. remove the 'PassengerId', 'TicketNumber' and 'Cabin' columns;
    2. discard every row that still contains a missing value;
    3. replace 'Sex', 'TicketClass' and 'Embarked' with 0.0/1.0 indicator columns;
    4. replace 'Name' with 0.0/1.0 indicator columns for groups of titles.

  The input frame is not modified; a new frame is returned.
  """
  # drop() returns a new frame, so the caller's data stays untouched
  frame = raw_data.drop(columns=['PassengerId', 'TicketNumber', 'Cabin']).dropna()

  def add_indicators(values: pd.Series, groups: dict):
    # One float column per group: 1.0 where the value belongs to the group, else 0.0
    for column, members in groups.items():
      frame[column] = values.isin(members).astype(float)

  add_indicators(frame.pop('Sex'), {
    'Male': ['male'],
    'Female': ['female'],
  })
  add_indicators(frame.pop('TicketClass'), {
    'UpperClass': [1],
    'MiddleClass': [2],
    'LowerClass': [3],
  })
  add_indicators(frame.pop('Embarked'), {
    'CherbourgPort': ['C'],
    'QueenstownPort': ['Q'],
    'SouthamptonPort': ['S'],
  })

  # Names look like "Surname, Title. Given names": keep what sits between ', ' and '.'
  after_surname = frame.pop('Name').str.split(', ', expand=True)[1]
  titles = after_surname.str.split('.', expand=True)[0]
  add_indicators(titles, {
    'Mr': ['Mr'],
    'Miss/Mrs/Ms/Lady': ['Miss', 'Mrs', 'Ms', 'Lady', 'Mme', 'Mlle'],
    'Master': ['Master'],
    'Rev/Dr/Major/Col/Contess/Capt/Sir/Don/Jonkheer': ['Rev', 'Dr', 'Major', 'Col', 'the Countess', 'Capt', 'Sir', 'Don', 'Jonkheer'],
  })
  return frame

def normalize_data(raw_data: pd.DataFrame, stats: pd.DataFrame):
  """Standardise the numeric feature columns with externally supplied statistics.

  Args:
    raw_data: frame containing the columns 'Age', 'SiblingsSpouses',
      'ParentChildren' and 'Fare' (other columns are passed through unchanged).
    stats: frame indexed by column name with at least the columns 'mean' and
      'std' (e.g. ``df.describe().transpose()``).

  Returns:
    A new frame in which each of the four columns is replaced by
    ``(value - mean) / std``. The input frame is left unmodified.
  """
  # Work on a copy: assigning into the argument would silently rescale the caller's frame
  data = raw_data.copy()
  for col in ['Age', 'SiblingsSpouses', 'ParentChildren', 'Fare']:
    data[col] = (data[col] - stats.loc[col, 'mean']) / stats.loc[col, 'std']
  return data

def preprocess_train_val(raw_train_df: pd.DataFrame, raw_val_df: pd.DataFrame):
  """Preprocess and normalise a training/validation pair of raw frames.

  Both frames go through ``preprocess_raw_data``. The normalisation statistics
  are computed on the preprocessed training frame only and then applied to both
  frames, so no information from the validation set enters the scaling.

  Returns:
    A ``(train_df, val_df)`` tuple of normalised frames.
  """
  train_df, val_df = (preprocess_raw_data(df) for df in (raw_train_df, raw_val_df))

  # describe() yields one column per feature; transpose so features become rows
  train_stats = train_df.describe().transpose()

  return tuple(normalize_data(df, train_stats) for df in (train_df, val_df))

# train_stats = raw_data_train.describe()
# train_stats.pop("Survived") # the column we want to predict its values
# train_stats.pop("PassengerId")
# train_stats = train_stats.transpose()
# train_data = preprocess_raw_data(raw_data_train,train_stats)
# train_data.tail()


### Training and validation extraction

In [11]:
# Random 80/20 split of the labelled data; the fixed seed makes it repeatable.
dataset = raw_data_train.copy()
train_data = dataset.sample(frac=0.8, random_state=0)
val_data = dataset.drop(train_data.index)  # every row not sampled for training

# Preprocess both parts, then separate the target column from the features.
train_data, val_data = preprocess_train_val(train_data, val_data)
train_labels, val_labels = train_data.pop("Survived"), val_data.pop("Survived")
# train_data.describe()

## Model definition

In [12]:
from torch.utils.data import TensorDataset
def df2tensor(df):
    """Convert a pandas DataFrame/Series of numbers into a float32 tensor.

    The tensor is placed on the GPU when CUDA is available and stays on the
    CPU otherwise, so the notebook also runs on machines without a GPU.
    """
    tensor = torch.from_numpy(df.values).float()
    return tensor.cuda() if torch.cuda.is_available() else tensor
# Pair every feature tensor with its label tensor, first for training, then for validation.
train_dataset, val_dataset = (
    TensorDataset(df2tensor(features), df2tensor(labels))
    for features, labels in ((train_data, train_labels), (val_data, val_labels))
)

# Frame shape is (rows, columns): number of training samples and of input features.
num_train_samples, num_features = train_data.shape

In [13]:
class MultiLayerPerceptron(nn.Module):
    """Fully-connected network with sigmoid hidden activations.

    Args:
        input_size: number of input features.
        hidden_size: sequence with the width of each hidden layer.
        output_size: number of output units.

    Output activation:
        * ``output_size == 1``: sigmoid, i.e. one value in (0, 1) per sample.
          Softmax over a single unit is always exactly 1.0, which would make
          the output constant and the gradients zero.
        * ``output_size > 1``: softmax over the last dimension.
    """

    def __init__(self, input_size=num_features, hidden_size=[64, 64], output_size=1):
        # init function executed once when the nn is instantiated
        super().__init__() # execute the nn.Module init function

        # layers with trainable parameters, e.g. sizes [num_features, 64, 64, 1]
        # list(...) lets callers pass any sequence and never touches the default list
        all_size = [input_size] + list(hidden_size) + [output_size]
        self.linears = nn.ModuleList(
            nn.Linear(in_size, out_size) # fully-connected layer
            for in_size, out_size in zip(all_size[:-1], all_size[1:])
        )

        # layers without trainable parameters
        self.sigmoid = nn.Sigmoid()        # hidden activation, and output activation for a single unit
        self.softmax = nn.Softmax(dim=-1)  # output activation when there are several output units

    def forward(self, x, verbose=False):
        """Run the network on ``x``; ``verbose`` is accepted but currently unused.

        A trailing output dimension of size 1 is removed, so a batch of shape
        ``(N, input_size)`` yields ``(N,)`` when ``output_size == 1``.
        """
        last = len(self.linears) - 1
        single_output = self.linears[-1].out_features == 1
        for i, l in enumerate(self.linears):
            x = l(x)                      # apply the i-th fully-connected layer
            if i < last or single_output:
                x = self.sigmoid(x)       # hidden layers, or a single-unit output
            else:
                x = self.softmax(x)       # multi-unit output layer
        # squeeze only the output axis: a bare squeeze() would also drop the
        # batch axis whenever a batch holds a single sample
        return x.squeeze(-1)

### Model training

In [14]:
def one_epoch(model, lossFunction, optimizer, train_loader, val_loader, writer, epoch_num):
  """Train `model` for one epoch, then evaluate it on the validation data.

  Args:
    model: network to train; batches are moved to the device of its parameters.
    lossFunction: callable ``loss(prediction, target)`` used for both the
      training objective and the reported validation loss.
    optimizer: optimizer updating the parameters of `model`.
    train_loader: sized iterable of ``(X, y)`` training batches.
    val_loader: iterable of ``(X, y)`` validation batches.
    writer: object with ``add_scalar(tag, value, step)`` (e.g. a SummaryWriter).
    epoch_num: zero-based epoch index, used to compute the global logging step.

  Returns:
    ``(val_loss, val_mae)``: `lossFunction` and the mean absolute error, both
    evaluated over the whole validation set, as Python floats.
  """
  # Follow the model's device instead of assuming CUDA is present
  first_param = next(model.parameters(), None)
  device = None if first_param is None else first_param.device

  def to_device(tensor):
    return tensor if device is None else tensor.to(device)

  i_start = epoch_num * len(train_loader)
  step = i_start  # stays defined even when the training loader is empty

  model.train()
  for i, (X, y) in enumerate(train_loader):
    step = i_start + i
    X, y = to_device(X), to_device(y)

    optimizer.zero_grad()

    o = model(X)
    l = lossFunction(o, y)

    l.backward()
    optimizer.step()

    mae = (o.detach() - y.detach()).abs().mean().item()

    writer.add_scalar('train/loss', l.item(), step)
    writer.add_scalar('train/mae', mae, step)

  model.eval()
  with torch.no_grad():
    predictions, targets = [], []
    for X, y in val_loader:
      X, y = to_device(X), to_device(y)
      # atleast_1d: a model that squeezes a one-sample batch returns a 0-d tensor,
      # which cannot be concatenated
      predictions.append(torch.atleast_1d(model(X)))
      targets.append(torch.atleast_1d(y))

    predictions = torch.cat(predictions)
    targets = torch.cat(targets)

    # Same criterion as in training, so 'train/loss' and 'val/loss' are comparable
    val_loss = lossFunction(predictions, targets).item()
    val_mae = (targets - predictions).abs().mean().item()

    # logged at the step of the last training batch of this epoch
    writer.add_scalar('val/loss', val_loss, step)
    writer.add_scalar('val/mae', val_mae, step)
  return val_loss, val_mae

MultiLayerPerceptron(
  (linears): ModuleList(
    (0): Linear(in_features=16, out_features=64, bias=True)
    (1): Linear(in_features=64, out_features=64, bias=True)
    (2): Linear(in_features=64, out_features=1, bias=True)
  )
  (sigmoid): Sigmoid()
  (softmax): Softmax(dim=-1)
)


In [1]:
import os
from torch.utils.tensorboard import SummaryWriter
from tensorboard import notebook

def start_tensorboard(log_dir):
  """Create a TensorBoard writer and (re)start TensorBoard inside the notebook.

  Args:
    log_dir: name of the sub-directory of ``runs`` that the writer logs to.

  Returns:
    The ``SummaryWriter`` writing to ``runs/<log_dir>``.

  Note: relies on IPython shell/line magics, so it only works when executed by
  an IPython/Jupyter kernel, and on a system that provides ``killall``.
  """
  writer = SummaryWriter(os.path.join("runs", log_dir))

  # run tensorboard in background
  # (any tensorboard process that is already running is killed first)
  ! killall tensorboard
  %load_ext tensorboard
  %tensorboard --logdir ./runs

  notebook.list() # View open TensorBoard instances

  return writer

In [None]:
from torch.utils.data import DataLoader

# The network has a single output unit whose value lies in [0, 1] and the labels
# are 0/1, so binary cross-entropy is the matching criterion. nn.CrossEntropyLoss
# expects one score per class, i.e. inputs of shape (N, C).
lossFunction = nn.BCELoss()
batch_size = 512
lr = .01
momentum = .9
lambda_reg = 0

epochs = 200
early_stopping_patience = 40  # epochs without a new best validation loss before stopping

# dataloader, network, optimizer for each fold
# batch_size must be passed explicitly: DataLoader defaults to batches of one sample
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

model = MultiLayerPerceptron().cuda()
optimizer = torch.optim.SGD(model.parameters(),
                            lr=lr,
                            weight_decay=lambda_reg,
                            momentum=momentum)

# early stopping and best model saving
early_stopping_counter = early_stopping_patience
min_val_loss = 1e10

experiment_name = 'test'
writer = start_tensorboard(experiment_name)

# metrics and checkpoints are written into ./<experiment_name>, which must exist
os.makedirs(experiment_name, exist_ok=True)

val_losses = torch.zeros(epochs, 1)
val_accuracies = torch.zeros(epochs, 1)  # holds the second value returned by one_epoch (validation MAE)

for e in range(epochs):
    print("FOLD {} - EPOCH {}".format(0, e))
    val_loss, val_accuracy = one_epoch(model, lossFunction, optimizer, train_loader, val_loader, writer, e)

    # store the validation metrics (entries of epochs not yet run stay at zero)
    val_losses[e] = val_loss
    val_accuracies[e] = val_accuracy
    torch.save(val_losses, os.path.join(experiment_name,'val_losses.pth'))
    torch.save(val_accuracies, os.path.join(experiment_name,'val_accuracies.pth'))

    # save the best model and check the early stopping criteria
    if val_loss < min_val_loss: # new best model
        min_val_loss = val_loss
        early_stopping_counter = early_stopping_patience # reset early stopping counter
        torch.save(model.state_dict(), os.path.join(experiment_name,'fold_{}_best_model.pth'.format(0)))
        print("- saved best model")
    else:
        # Patience is measured against the best loss so far. Comparing with the
        # previous epoch instead would reset the counter on every tiny decrease,
        # even when the loss never gets back to its best value.
        early_stopping_counter -= 1

    if early_stopping_counter == 0: # early stopping
        break
