# Import

In [None]:
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

In [None]:
# Mount Google Drive at /content/drive so the CSV files read in the
# data-loading cell (which live under /content/drive/MyDrive) are accessible.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Data Load

In [None]:
# Load the Titanic CSV files from the mounted Drive folder.
# `train` carries the 'Survived' label column; `test` has the same columns without it.
train = pd.read_csv('/content/drive/MyDrive/필사 코드/titanic_training.csv')
test = pd.read_csv('/content/drive/MyDrive/필사 코드/titanic_test.csv')
# Presumably the sample-submission template (verify).
# NOTE(review): `submission` is reassigned by the prediction cell at the end of the notebook.
submission = pd.read_csv('/content/drive/MyDrive/필사 코드/gender_submission.csv')

In [None]:
# Sanity check: (rows, columns) of the training frame.
train.shape

(891, 12)

In [None]:
# List the training columns (includes the 'Survived' label).
print(train.columns)

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')


In [None]:
# List the test columns (same as train, minus the 'Survived' label).
print(test.columns)

Index(['PassengerId', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch',
       'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')


# SimpleNN
>Batch normalization

In [None]:
class SimpleNN(nn.Module):
    """Fully connected binary classifier with batch normalization and dropout.

    The network stacks three hidden blocks of
    ``Linear -> BatchNorm1d -> ReLU -> Dropout`` (128, 256 and 128 units)
    followed by ``Linear(128, 1) -> Sigmoid``.  Because the last layer is a
    sigmoid, the output is a probability in [0, 1] and is meant to be paired
    with ``nn.BCELoss``.

    Args:
        in_features: Number of input features per sample. Defaults to 5,
            which keeps ``SimpleNN()`` identical to the original definition.
        dropout: Dropout probability applied after every hidden activation.
            Defaults to 0.1.
    """

    def __init__(self, in_features=5, dropout=0.1):
        super().__init__()
        # The layer order is unchanged from the original, so state_dict keys
        # (classifier.0.weight, ...) stay compatible with saved checkpoints.
        self.classifier = nn.Sequential(
            nn.Linear(in_features, 128),
            # Batch normalization sits between the linear layer and its activation.
            nn.BatchNorm1d(128),
            nn.ReLU(),
            # Dropout is applied after the activation.
            nn.Dropout(dropout),
            nn.Linear(128, 256),
            nn.BatchNorm1d(256),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(256, 128),
            nn.BatchNorm1d(128),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(128, 1),
            nn.Sigmoid(),
        )

    def forward(self, x):
        """Return the predicted probability for every sample in the batch.

        Args:
            x: Tensor whose first dimension is the batch dimension. All
                remaining dimensions are flattened, so after flattening each
                sample must have ``in_features`` values (e.g. a ``(99, 5)``
                mini-batch stays ``(99, 5)``).

        Returns:
            Tensor of shape ``(batch, 1)`` holding sigmoid probabilities.
            The final ``nn.Linear(128, 1)`` reduces each sample to a single
            value; the batch dimension is preserved.
        """
        # reshape (rather than view) also accepts non-contiguous inputs.
        x = x.reshape(x.size(0), -1)
        return self.classifier(x)

In [None]:
# Stack the training features (label removed) on top of the test rows so that
# both splits go through exactly the same preprocessing.
data_set = pd.concat([train.drop(columns='Survived'), test], axis=0)

# Discard the identifier, text and categorical columns; only numeric features remain.
unused_columns = ['PassengerId', 'Name', 'Sex', 'Ticket', 'Cabin', 'Embarked']
data_set = data_set.drop(columns=unused_columns)

# Replace missing values with the mean of their column (computed on the combined frame).
column_means = data_set.mean()
data_set = data_set.fillna(column_means)

# The first n_train rows came from `train`; everything after that is `test`.
n_train = len(train)

# Convert each split to a plain NumPy array for the training / prediction cells.
train_x = data_set.iloc[:n_train].to_numpy()
test_x = data_set.iloc[n_train:].to_numpy()
train_y = train['Survived'].to_numpy()

print(type(train_y))
print(data_set.columns)

<class 'numpy.ndarray'>
Index(['Pclass', 'Age', 'SibSp', 'Parch', 'Fare'], dtype='object')


In [None]:
# Seed PyTorch's RNG before the model is built so that weight initialisation
# and dropout masks are reproducible after "Restart Kernel -> Run All".
SEED = 42
torch.manual_seed(SEED)

# Re-running this cell creates a fresh, untrained model and optimizer.
simple_nn = SimpleNN()
optimizer = optim.Adam(simple_nn.parameters(), lr=0.01)
error = nn.BCELoss()  # binary cross-entropy on the model's sigmoid probabilities

batch_size = 99  # number of samples in one mini-batch
# Number of full mini-batches; integer division avoids the float round-trip of int(a / b).
batch_count = len(train_x) // batch_size

In [None]:
# Convert the training arrays to tensors once. They do not change between
# epochs, so converting every mini-batch again is wasted work.
train_x_tensor = torch.tensor(train_x, dtype=torch.float32)
train_y_tensor = torch.tensor(train_y, dtype=torch.float32).reshape(-1, 1)

# Put Dropout / BatchNorm in training mode explicitly, so the loop does not
# depend on whatever mode a previously executed cell left the model in.
simple_nn.train()

for epoch in range(300):
    train_loss = 0.0
    num_right = 0
    num_seen = 0  # samples actually processed in this epoch

    for i in range(batch_count):
        start = i * batch_size
        end = start + batch_size
        tensor_x = train_x_tensor[start:end]
        tensor_y = train_y_tensor[start:end]

        optimizer.zero_grad()
        # tensor_x has one column per feature, matching the model's input width.
        output = simple_nn(tensor_x)  # (batch, 1) probabilities
        loss = error(output, tensor_y)
        loss.backward()
        optimizer.step()

        # BCELoss returns the batch mean; weight it by the batch length so the
        # epoch figure is a per-sample average.
        n_batch = tensor_x.size(0)
        train_loss += loss.item() * n_batch

        # Threshold the probabilities at 0.5 and count matches with the labels.
        predicted = (output.detach() >= 0.5).float()
        num_right += int((predicted == tensor_y).sum().item())
        num_seen += n_batch

    # End of one epoch: normalise by the number of samples that were seen
    # (guarded so an empty epoch does not divide by zero).
    train_loss = train_loss / max(num_seen, 1)
    accuracy = num_right / max(num_seen, 1)

    if epoch % 25 == 0:
        print(f"Loss: {train_loss} Accuracy: {accuracy} Epoch: {epoch}")

print("Training Ended")

Loss: 0.20111455023288727 Accuracy: 0.9135802469135802 Epoch: 0
Loss: 0.211076520383358 Accuracy: 0.9046015712682379 Epoch: 25
Loss: 0.2037688179148568 Accuracy: 0.9135802469135802 Epoch: 50
Loss: 0.20752364065912035 Accuracy: 0.9102132435465768 Epoch: 75
Loss: 0.1875345375802782 Accuracy: 0.9135802469135802 Epoch: 100
Loss: 0.23576268057028452 Accuracy: 0.8945005611672279 Epoch: 125
Loss: 0.16080969075361887 Accuracy: 0.9337822671156004 Epoch: 150
Loss: 0.18608319097095066 Accuracy: 0.9191919191919192 Epoch: 175
Loss: 0.17840919726424748 Accuracy: 0.920314253647587 Epoch: 200
Loss: 0.175366240243117 Accuracy: 0.9225589225589226 Epoch: 225
Loss: 0.15760375228193071 Accuracy: 0.9292929292929293 Epoch: 250
Loss: 0.16110478341579437 Accuracy: 0.9236812570145904 Epoch: 275
Training Ended


In [None]:
# test_x has no labels, so this cell only produces predictions for the submission file.
tensor_test_x = torch.tensor(test_x, dtype=torch.float32)

# Inference has to run in eval mode: in training mode Dropout keeps zeroing
# activations and BatchNorm normalises with the test batch's own statistics.
# The previous mode is restored afterwards so re-running the training cell
# is not silently affected by this cell.
was_training = simple_nn.training
simple_nn.eval()
try:
    # Disable gradient tracking; no backward pass is needed for prediction.
    with torch.no_grad():
        test_output = simple_nn(tensor_test_x)
finally:
    simple_nn.train(was_training)

# Threshold the (n, 1) probabilities at 0.5 and flatten to a 1-D array of 0/1 labels.
result = (test_output >= 0.5).squeeze(1).to(torch.int64).numpy()

# The column is named 'Survived' to match the label column of the training
# data (the original spelled it 'Survivied').
submission = pd.DataFrame({'PassengerId': test['PassengerId'], 'Survived': result})
# Written to the current working directory; pass a full path to save it elsewhere.
submission.to_csv('submission.csv', index=False)