In [17]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/titanic/train.csv
/kaggle/input/titanic/test.csv
/kaggle/input/titanic/gender_submission.csv


In [18]:
train = pd.read_csv("/kaggle/input/titanic/train.csv", index_col = 0)
test= pd.read_csv("/kaggle/input/titanic/test.csv", index_col = 0)
train.shape, test.shape

((891, 11), (418, 10))

In [19]:
train.head()

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [20]:
test.head()

Unnamed: 0_level_0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


# 1. Basic EDA

In [21]:
desc = pd.DataFrame(index = list(train))
desc['count'] = train.count()
desc['nunique'] = train.nunique()
desc['%unique'] = desc['nunique'] / len(train) * 100
desc['null'] = train.isnull().sum()
desc['type'] = train.dtypes
desc = pd.concat([desc, train.describe().T], axis = 1)
desc

Unnamed: 0,count,nunique,%unique,null,type,count.1,mean,std,min,25%,50%,75%,max
Survived,891,2,0.224467,0,int64,891.0,0.383838,0.486592,0.0,0.0,0.0,1.0,1.0
Pclass,891,3,0.3367,0,int64,891.0,2.308642,0.836071,1.0,2.0,3.0,3.0,3.0
Name,891,891,100.0,0,object,,,,,,,,
Sex,891,2,0.224467,0,object,,,,,,,,
Age,714,88,9.876543,177,float64,714.0,29.699118,14.526497,0.42,20.125,28.0,38.0,80.0
SibSp,891,7,0.785634,0,int64,891.0,0.523008,1.102743,0.0,0.0,0.0,1.0,8.0
Parch,891,7,0.785634,0,int64,891.0,0.381594,0.806057,0.0,0.0,0.0,0.0,6.0
Ticket,891,681,76.430976,0,object,,,,,,,,
Fare,891,248,27.833895,0,float64,891.0,32.204208,49.693429,0.0,7.9104,14.4542,31.0,512.3292
Cabin,204,147,16.498316,687,object,,,,,,,,


Based on this table, we can notice a few details:
* 'Age', 'Cabin' and 'Embarked' have null values. We have to decide whether to remove these columns or to replace null values with mean/median.
* 'Pclass' is a categorical feature, but its dtype is 'int64'. We would have to change the dtype to 'category' or 'object'.

In [22]:
train['Pclass'] = train['Pclass'].astype('category')
test['Pclass'] = test['Pclass'].astype('category')

The 'Name', 'Ticket' and 'Cabin' do not provide any useful information, so we can remove it.

In [23]:
useless_columns = ['Name', 'Ticket', 'Cabin']
train = train.drop(useless_columns, axis =1)
test = test.drop(useless_columns, axis = 1)
train.shape,test.shape

((891, 8), (418, 7))

In [24]:
X = train.drop('Survived', axis = 1)
y = train['Survived'].values

In [25]:
categorical_cols = [col for col in X.columns if X[col].nunique() < 15 and X[col].dtype in ["object","category"]]
numerical_cols = [col for col in X.columns if X[col].dtype in ['int64','float64']]
training_cols = categorical_cols + numerical_cols
training_cols

['Pclass', 'Sex', 'Embarked', 'Age', 'SibSp', 'Parch', 'Fare']

# 2. Data preprocessing

In [26]:
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split

In this section, we will have to deal with the null values and also convert the categorical data using one hot encoder

In [27]:
numerical_transformer = SimpleImputer(strategy='mean')
categorical_transformer = Pipeline(steps=[
    ("impute",SimpleImputer(strategy="most_frequent")),
    ('encode', OneHotEncoder(handle_unknown="ignore"))
])
preprocessor = ColumnTransformer(transformers=[
    ('numerical',numerical_transformer,numerical_cols),
    ('categorical',categorical_transformer,categorical_cols)
], remainder = 'passthrough')

In [28]:
X = preprocessor.fit_transform(X)
test_processed = preprocessor.fit_transform(test)
X.shape, test.shape

  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dty

((891, 12), (418, 7))

Next, we split the data into training and validation sets in the ratio of 80:20.

In [29]:
X_train, X_val, y_train, y_val = train_test_split(X,
                                                 y,
                                                 test_size = 0.2,
                                                 random_state = 42)
len(X_train), len(X_val), len(y_train), len(y_val)

(712, 179, 712, 179)

In [30]:
print(X_train[:5], X_train.shape)

[[45.5     0.      0.     28.5     1.      0.      0.      0.      1.
   0.      0.      1.    ]
 [23.      0.      0.     13.      0.      1.      0.      0.      1.
   0.      0.      1.    ]
 [32.      0.      0.      7.925   0.      0.      1.      0.      1.
   0.      0.      1.    ]
 [26.      1.      0.      7.8542  0.      0.      1.      0.      1.
   0.      0.      1.    ]
 [ 6.      4.      2.     31.275   0.      0.      1.      1.      0.
   0.      0.      1.    ]] (712, 12)


We have to standardise the input variables before putting them in the neural network.

In [31]:
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_val = sc.transform(X_val)
test_values = sc.transform(test_processed)
print(X_train[:5], X_train.shape)

[[ 1.22920747 -0.47072241 -0.47934164 -0.07868358  1.8352379  -0.51880845
  -1.1258401  -0.7243102   0.7243102  -0.46146201 -0.30335547  0.59248936]
 [-0.50350514 -0.47072241 -0.47934164 -0.37714494 -0.54488848  1.92749365
  -1.1258401  -0.7243102   0.7243102  -0.46146201 -0.30335547  0.59248936]
 [ 0.18957991 -0.47072241 -0.47934164 -0.47486697 -0.54488848 -0.51880845
   0.8882256  -0.7243102   0.7243102  -0.46146201 -0.30335547  0.59248936]
 [-0.27247679  0.37992316 -0.47934164 -0.47623026 -0.54488848 -0.51880845
   0.8882256  -0.7243102   0.7243102  -0.46146201 -0.30335547  0.59248936]
 [-1.81266577  2.93185988  2.04874166 -0.02524937 -0.54488848 -0.51880845
   0.8882256   1.38062393 -1.38062393 -0.46146201 -0.30335547  0.59248936]] (712, 12)


In [32]:
print(y[:5])

[0 1 1 1 0]


# 3. Set up PyTorch ANN

In [33]:
# Device agnostic code
import torch
device = 'cuda' if torch.cuda.is_available() else 'cpu'

device

'cuda'

In [34]:
# Transform the data into tensors
X_train = torch.tensor(X_train, dtype = torch.float32)
X_val = torch.tensor(X_val, dtype = torch.float32)
y_train = torch.tensor(y_train, dtype = torch.float32)
y_val = torch.tensor(y_val, dtype = torch.float32)
test_tensor = torch.tensor(test_values, dtype = torch.float32)
print(X_train[:5], X_train.dtype, X.shape)
print(y_train[:5], y_train.dtype, y.shape)

tensor([[ 1.2292, -0.4707, -0.4793, -0.0787,  1.8352, -0.5188, -1.1258, -0.7243,
          0.7243, -0.4615, -0.3034,  0.5925],
        [-0.5035, -0.4707, -0.4793, -0.3771, -0.5449,  1.9275, -1.1258, -0.7243,
          0.7243, -0.4615, -0.3034,  0.5925],
        [ 0.1896, -0.4707, -0.4793, -0.4749, -0.5449, -0.5188,  0.8882, -0.7243,
          0.7243, -0.4615, -0.3034,  0.5925],
        [-0.2725,  0.3799, -0.4793, -0.4762, -0.5449, -0.5188,  0.8882, -0.7243,
          0.7243, -0.4615, -0.3034,  0.5925],
        [-1.8127,  2.9319,  2.0487, -0.0252, -0.5449, -0.5188,  0.8882,  1.3806,
         -1.3806, -0.4615, -0.3034,  0.5925]]) torch.float32 (891, 12)
tensor([0., 0., 0., 0., 0.]) torch.float32 (891,)


Subclass nn.Module to create our own PyTorch model

In [35]:
from torch import nn
class TitanicClassifier(nn.Module):
    def __init__(self):
        super().__init__()
        self.layers = nn.Sequential(
            nn.Linear(in_features = 12, out_features = 100),
            nn.ReLU(),
            nn.Linear(in_features = 100, out_features = 100),
            nn.ReLU(),
            nn.Linear(in_features = 100, out_features = 100),
            nn.ReLU(),
            nn.Linear(in_features = 100, out_features = 1)
        )
    def forward(self,x):
        return self.layers(x)

# Instantiate a model and send it to the GPU
model_0 = TitanicClassifier().to(device)
model_0

TitanicClassifier(
  (layers): Sequential(
    (0): Linear(in_features=12, out_features=100, bias=True)
    (1): ReLU()
    (2): Linear(in_features=100, out_features=100, bias=True)
    (3): ReLU()
    (4): Linear(in_features=100, out_features=100, bias=True)
    (5): ReLU()
    (6): Linear(in_features=100, out_features=1, bias=True)
  )
)

Set up loss function and optimizer. Since we are dealing with a binary classification problem, we will be using nn.BCEWithLogitsLoss() as the loss function.

In [36]:
loss_fn = nn.BCEWithLogitsLoss()
optimizer = torch.optim.SGD(params = model_0.parameters(),
                           lr = 0.03)

In [37]:
# pip install torchmetrics

Create an accuracy metric to evaluate our model_0 later on

In [38]:
from torchmetrics.classification import BinaryAccuracy
accuracy = BinaryAccuracy().to(device)

# 4. Create a training and validation loop for our PyTorch Neural Network

In [39]:
# Send data to the GPU
X_train, X_val = X_train.to(device), X_val.to(device)
y_train, y_val = y_train.to(device), y_val.to(device)
test_tensor = test_tensor.to(device)

epochs = 1000

for epoch in range(epochs):
    model_0.train()
    
    y_logits = model_0(X_train).squeeze(dim=1)
    y_pred = torch.round(torch.sigmoid(y_logits))
    
    loss = loss_fn(y_logits, y_train)
    acc = accuracy(y_pred, y_train)*100
    
    optimizer.zero_grad()
    
    loss.backward()
    
    optimizer.step()
    
    model_0.eval()
    with torch.inference_mode():
        test_logits = model_0(X_val).squeeze(dim=1)
        test_pred = torch.round(torch.sigmoid(test_logits))
        
        test_loss = loss_fn(test_logits, y_val)
        test_acc = accuracy(test_pred, y_val)*100
    if epoch % 100 == 0:
        print(f"Epoch: {epoch} | Loss: {loss:.4f} | Acc: {acc:.2f}% | Test_loss: {test_loss:.4f} | Test_acc: {test_acc:.2f}%")

Epoch: 0 | Loss: 0.6842 | Acc: 62.36% | Test_loss: 0.6879 | Test_acc: 58.66%
Epoch: 100 | Loss: 0.6333 | Acc: 62.36% | Test_loss: 0.6452 | Test_acc: 58.66%
Epoch: 200 | Loss: 0.5436 | Acc: 77.39% | Test_loss: 0.5493 | Test_acc: 75.42%
Epoch: 300 | Loss: 0.4546 | Acc: 80.90% | Test_loss: 0.4543 | Test_acc: 79.89%
Epoch: 400 | Loss: 0.4248 | Acc: 82.72% | Test_loss: 0.4280 | Test_acc: 81.01%
Epoch: 500 | Loss: 0.4109 | Acc: 83.57% | Test_loss: 0.4180 | Test_acc: 82.12%
Epoch: 600 | Loss: 0.4019 | Acc: 83.71% | Test_loss: 0.4130 | Test_acc: 82.68%
Epoch: 700 | Loss: 0.3950 | Acc: 83.71% | Test_loss: 0.4111 | Test_acc: 82.12%
Epoch: 800 | Loss: 0.3896 | Acc: 83.99% | Test_loss: 0.4116 | Test_acc: 82.68%
Epoch: 900 | Loss: 0.3852 | Acc: 84.41% | Test_loss: 0.4133 | Test_acc: 83.24%


In [40]:
model_0.eval()
with torch.inference_mode():
    pred = torch.round(torch.sigmoid(model_0(test_tensor)))
final_pred = pred.squeeze(1).cpu().numpy()
final_pred = final_pred.astype(int)
final_pred[:5]

array([0, 0, 0, 0, 0])

In [41]:
output = pd.DataFrame({"PassengerId":test.index,
                      "Survived":final_pred})
output.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,0


In [42]:
output.to_csv("submission.csv", index=False)