Use machine learning to create a model that predicts which passengers survived the Titanic shipwreck.

### Importing packages

In [5]:
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import numpy as np

import torch
from torch import nn

In [4]:
import torch
print('PyTorch version:', torch.__version__)
if torch.cuda.is_available():
  print("You have %d GPUs" % torch.cuda.device_count())
  gpu_id = torch.cuda.current_device()
  print("The selected GPU is GPU", gpu_id)
  print("- Name:", torch.cuda.get_device_name(gpu_id))
  print("- All properties:",torch.cuda.get_device_properties(gpu_id))

PyTorch version: 2.0.0
You have 1 GPUs
The selected GPU is GPU 0
- Name: NVIDIA GeForce RTX 3080 Ti
- All properties: _CudaDeviceProperties(name='NVIDIA GeForce RTX 3080 Ti', major=8, minor=6, total_memory=12287MB, multi_processor_count=80)


### Data loading and preprocessing

In [29]:
column_names = ['PassengerId','Survived','TicketClass','Name','Sex','Age','SiblingsSpouses','ParentChildren','TicketNumber','Fare','Cabin','Embarked']
raw_data_train = pd.read_csv('train.csv', header=0, names=column_names, delimiter=",", skipinitialspace=True) #header=0 + names allows to override headers

column_names = ['PassengerId','TicketClass','Name','Sex','Age','SiblingsSpouses','ParentChildren','TicketNumber','Fare','Cabin','Embarked']
raw_data_test = pd.read_csv('train.csv', header=0, names=column_names, delimiter=",", skipinitialspace=True) #header=0 + names allows to override headers

raw_data_train.tail()

Unnamed: 0,PassengerId,Survived,TicketClass,Name,Sex,Age,SiblingsSpouses,ParentChildren,TicketNumber,Fare,Cabin,Embarked
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.45,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0,C148,C
890,891,0,3,"Dooley, Mr. Patrick",male,32.0,0,0,370376,7.75,,Q


Now we'll analyze the data in the training set:

In [25]:
raw_data_train.isna().sum()/len(raw_data_train)

PassengerId        0.000000
Survived           0.000000
TicketClass        0.000000
Name               0.000000
Sex                0.000000
Age                0.198653
SiblingsSpouses    0.000000
ParentChildren     0.000000
TicketNumber       0.000000
Fare               0.000000
Cabin              0.771044
Embarked           0.002245
dtype: float64

In [60]:
train_stats = raw_data_train.describe()
train_stats.pop("Survived") # the column we want to predict its values
train_stats.pop("PassengerId")
train_stats = train_stats.transpose()
train_stats

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
TicketClass,891.0,2.308642,0.836071,1.0,2.0,3.0,3.0,3.0
Age,714.0,29.699118,14.526497,0.42,20.125,28.0,38.0,80.0
SiblingsSpouses,891.0,0.523008,1.102743,0.0,0.0,0.0,1.0,8.0
ParentChildren,891.0,0.381594,0.806057,0.0,0.0,0.0,0.0,6.0
Fare,891.0,32.204208,49.693429,0.0,7.9104,14.4542,31.0,512.3292


After inspecting the dataset, the following considerations can be made:
- PassengerId and TicketNumer are apparently useless features for the task so we'll drop them
- The cabin attribute is unknown for 77% of the samples, so we'll drop the feature
- We'll drop the two samples without embarkment informations
- Even if for almost 20% of the dataset the age is unknown, it is probably an important feature, so we'll drop the samples without age info for semplicity
- We must use One-Hot Encoding for the categorical features 'TicketClass', 'Sex' and 'Embarked'
- We can extract useful information about the social status of the person from the Name attribute
- Numeric features must be normalized

We define a function to preprocess raw data. It will be used to preprocess both the training set and later the test set:

In [67]:
def preprocess_raw_data(raw_data: pd.DataFrame, stats: pd.DataFrame):
  data = raw_data.copy()
  #Drop unnecessary features
  data = data.drop(['PassengerId','TicketNumber','Cabin'],axis=1)
  #Drop samples without age or without embarkment info
  data = data.dropna()
  #One-Hot encode categorical features
  sex = data.pop('Sex')
  data['Male'] = (sex == 'male') * 1.0
  data['Female'] = (sex == 'female') * 1.0
  tClass = data.pop('TicketClass')
  data['UpperClass'] = (tClass == 1) * 1.0
  data['MiddleClass'] = (tClass == 2) * 1.0
  data['LowerClass'] = (tClass == 3) * 1.0
  embarked = data.pop('Embarked')
  data['CherbourgPort'] = (embarked == 'C') * 1.0
  data['QueenstownPort'] = (embarked == 'Q') * 1.0
  data['SouthamptonPort'] = (embarked == 'S') * 1.0
  #Extract the title from the name
  titles = data.pop('Name').str.split(', ', expand=True)[1].str.split('.', expand=True)[0]
  data['Mr'] = (titles == 'Mr') * 1.0
  data['Miss/Mrs/Ms/Lady'] = ((titles == 'Miss') | (titles == 'Mrs') | (titles == 'Ms') | (titles == 'Lady') | (titles == 'Mme') | (titles == 'Mlle')) * 1.0
  data['Master'] = (titles == 'Master') * 1.0
  data['Rev/Dr/Major/Col/Contess/Capt/Sir/Don/Jonkheer'] = ((titles == 'Rev') | (titles == 'Dr') | (titles == 'Major') | (titles == 'Col') | (titles == 'the Countess') | (titles == 'Capt') | (titles == 'Sir') | (titles == 'Don') | (titles == 'Jonkheer') ) * 1.0
  #Data normalization based only on the statistics
  for col in ['Age', 'SiblingsSpouses', 'ParentChildren', 'Fare']:
    data[col] = (data[col] - stats['mean'][col]) / stats['std'][col]
  return data

def preprocess_train_val(raw_train_df: pd.DataFrame, raw_val_df: pd.DataFrame):
  # Extract statistics for data normalization only from the training set
  train_stats = raw_data_train.describe()
  train_stats = train_stats.transpose()
  # Preprocess and normalize data with the stats from the training set
  train_df = preprocess_raw_data(raw_train_df, train_stats)
  val_df = preprocess_raw_data(raw_val_df, train_stats)
  return (train_df,val_df)

train_stats = raw_data_train.describe()
train_stats.pop("Survived") # the column we want to predict its values
train_stats.pop("PassengerId")
train_stats = train_stats.transpose()
train_data = preprocess_raw_data(raw_data_train,train_stats)
train_data.tail()


Unnamed: 0,Survived,Age,SiblingsSpouses,ParentChildren,Fare,Male,Female,UpperClass,MiddleClass,LowerClass,CherbourgPort,QueenstownPort,SouthamptonPort,Mr,Miss/Mrs/Ms/Lady,Master,Rev/Dr/Major/Col/Contess/Capt/Sir/Don/Jonkheer
885,0,0.64027,-0.474279,5.729626,-0.061964,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
886,0,-0.185807,-0.474279,-0.473408,-0.386454,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
887,1,-0.736524,-0.474279,-0.473408,-0.044356,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
889,1,-0.254646,-0.474279,-0.473408,-0.044356,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
890,0,0.158392,-0.474279,-0.473408,-0.492101,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0


In [75]:
dataset = raw_data_train.copy()
train_data = dataset.sample(frac=0.8, random_state=0)
val_data = dataset.drop(train_data.index)

(train_data, val_data) = preprocess_train_val(train_data, val_data)
val_data.describe(train_data) #Perché non si sono normalizzati i dati???

Unnamed: 0,Survived,Age,SiblingsSpouses,ParentChildren,Fare,Male,Female,UpperClass,MiddleClass,LowerClass,CherbourgPort,QueenstownPort,SouthamptonPort,Mr,Miss/Mrs/Ms/Lady,Master,Rev/Dr/Major/Col/Contess/Capt/Sir/Don/Jonkheer
count,138.0,138.0,138.0,138.0,138.0,138.0,138.0,138.0,138.0,138.0,138.0,138.0,138.0,138.0,138.0,138.0,138.0
mean,0.471014,0.039464,-0.007722,0.119926,0.078253,0.630435,0.369565,0.304348,0.297101,0.398551,0.224638,0.021739,0.753623,0.565217,0.369565,0.043478,0.021739
std,0.500978,0.960775,0.703584,1.093492,0.799548,0.484445,0.484445,0.461807,0.458646,0.491383,0.418864,0.146362,0.43247,0.497534,0.484445,0.204674,0.146362
min,0.0,-1.998356,-0.474279,-0.473408,-0.51734,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,-0.581635,-0.474279,-0.473408,-0.471727,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
50%,0.0,0.020713,-0.474279,-0.473408,-0.12485,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
75%,1.0,0.60585,0.43255,0.767199,0.119146,1.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0
max,1.0,2.430103,3.153038,5.729626,3.608038,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
