# Prepare dataset

In [1]:
import numpy as np
import pandas as pd 

import os
print(os.listdir("../input/titanic"))

['train.csv', 'test.csv', 'gender_submission.csv']


In [2]:
# read to DataFrame
train = pd.read_csv('/kaggle/input/titanic/train.csv')
test = pd.read_csv('/kaggle/input/titanic/test.csv')
# submission = pd.read_csv('/kaggle/input/titanic/gender_submission.csv')

In [3]:
print(train.shape)
print(test.shape)

(891, 12)
(418, 11)


In [4]:
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [5]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [6]:
test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [7]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Pclass       418 non-null    int64  
 2   Name         418 non-null    object 
 3   Sex          418 non-null    object 
 4   Age          332 non-null    float64
 5   SibSp        418 non-null    int64  
 6   Parch        418 non-null    int64  
 7   Ticket       418 non-null    object 
 8   Fare         417 non-null    float64
 9   Cabin        91 non-null     object 
 10  Embarked     418 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 36.0+ KB


In [8]:
# train(target drop), test를 행 방향으로 합치기
data_set = pd.concat((train.drop(['Survived'], axis=1), test), axis=0)

In [9]:
# 사용하지 않을 features drop하기
data_set = data_set.drop(['PassengerId', 'Name', 'Sex', 'Ticket', 'Cabin', 'Embarked'], axis = 1)
# 결측값을 평균으로 채우기
data_set = data_set.fillna(data_set.mean())

In [10]:
# train set, test set

n_train = train.shape[0]
train_x, test_x = data_set[:n_train], data_set[n_train:]
train_y = train['Survived']

# numpy array
# .keys(): all the names of DataFrame columns
# values: pandas data to numpy array
train_x = train_x[train_x.keys()].values
test_x = test_x[test_x.keys()].values
train_y = train_y.values

In [11]:
print(train_x.shape)

(891, 5)


# Model

In [12]:
import torch
from torch import nn
from torch.utils.data import DataLoader

In [13]:
class SimpleNN(nn.Module):
    def __init__(self):
        super().__init__()
        self.classifier = nn.Sequential(
            # layer 1
            # features: 5
            nn.Linear(5, 128),
            # Batch Norm between Layer and Activation function (before Activiation function)
            nn.BatchNorm1d(128),
            nn.ReLU(),
            # Drop out after Activation function
            # drop prob = 0.1
            nn.Dropout(0.1),
            # layer 2
            nn.Linear(128, 256),
            nn.BatchNorm1d(256),
            nn.ReLU(),
            nn.Dropout(0.1),
            # layer 3
            nn.Linear(256, 128),
            nn.BatchNorm1d(128),
            nn.ReLU(),
            nn.Dropout(0.1),
            # output layer
            nn.Linear(128, 1),
            nn.Sigmoid()
        )

    def forward(self, x):
        # view(): reshape tensor
        x = x.view(x.size(0), -1)
        x = self.classifier(x)
        return x

# Train

In [14]:
import torch.optim as optim

simple_nn = SimpleNN()

# Optimization algorithm : Adam
optimizer = optim.Adam(simple_nn.parameters(), lr=0.01)

# Loss function: 
error = nn.BCELoss()

# minibatch size: 99
batch_size = 99
# number of minibatches 
batch_count = int(len(train_x) / batch_size)

for epoch in range(300):
    train_loss = 0
    num_right = 0
    for i in range(batch_count):
        start = i * batch_size
        end = start + batch_size
        
        # Float Tensor (minibatch)
        tensor_x = torch.Tensor(train_x[start:end])
        tensor_y = torch.Tensor(train_y[start:end]).reshape(-1, 1)
        
        optimizer.zero_grad()
        
        output = simple_nn(tensor_x)
        
        loss = error(output, tensor_y)
        loss.backward()
        optimizer.step()
        
        train_loss += loss.item() * batch_size
        # threshold: 0.5
        result = [1 if out >= 0.5 else 0 for out in output]
        num_right += np.sum(np.array(result) == train_y[start:end])
        
    train_loss = train_loss / len(train_x)
    accuracy = num_right / len(train_x)
    
    if epoch % 25 == 0:
        print('Loss: {} Accuracy: {}% Epoch:{}'.format(train_loss, accuracy, epoch))

        
print('End')

Loss: 0.6314508318901062 Accuracy: 0.6655443322109988% Epoch:0
Loss: 0.5201640692022111 Accuracy: 0.7508417508417509% Epoch:25
Loss: 0.4699500732951694 Accuracy: 0.77665544332211% Epoch:50
Loss: 0.41628362072838676 Accuracy: 0.8159371492704826% Epoch:75
Loss: 0.3736910820007324 Accuracy: 0.8327721661054994% Epoch:100
Loss: 0.33269308010737103 Accuracy: 0.8451178451178452% Epoch:125
Loss: 0.35124723778830635 Accuracy: 0.8451178451178452% Epoch:150
Loss: 0.32327385909027523 Accuracy: 0.8597081930415263% Epoch:175
Loss: 0.29045361942715114 Accuracy: 0.8843995510662177% Epoch:200
Loss: 0.24633292688263786 Accuracy: 0.8933782267115601% Epoch:225
Loss: 0.247249788708157 Accuracy: 0.9012345679012346% Epoch:250
Loss: 0.23800834516684213 Accuracy: 0.9046015712682379% Epoch:275
End


# Prediction

In [15]:
tensor_test_x = torch.Tensor(test_x)

# No gradient update
with torch.no_grad():
    test_output = simple_nn(tensor_test_x)
    
    result = np.array([1 if out >= 0.5 else 0 for out in test_output])
    submission = pd.DataFrame({'PassengerId': test['PassengerId'], 'Survived': result})
    print(submission)

     PassengerId  Survived
0            892         0
1            893         0
2            894         0
3            895         0
4            896         0
..           ...       ...
413         1305         0
414         1306         1
415         1307         0
416         1308         0
417         1309         0

[418 rows x 2 columns]
