In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for path in (os.path.join(root, name) for name in names):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/titanic/train.csv
/kaggle/input/titanic/test.csv
/kaggle/input/titanic/gender_submission.csv


In [2]:
# import
import torch
import torch.nn as nn
from torch import optim
import random

In [3]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Fix the seeds so that repeated runs start from the same random state.
random.seed(777)
torch.manual_seed(777)
# `device` is a torch.device object, so comparing it with the plain string
# 'cuda' is not a reliable check; inspect its `.type` attribute instead so the
# CUDA generators are seeded on GPU runs.
if device.type == 'cuda':
    torch.cuda.manual_seed_all(777)

In [4]:
# Load the three Titanic CSV files: training set, test set and the sample
# submission file that is later reused as the output template.
train_data, test_data, submit = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/titanic/train.csv',
        '/kaggle/input/titanic/test.csv',
        '/kaggle/input/titanic/gender_submission.csv',
    )
)

In [5]:
# Split the training frame into features (x) and target (y).
# 'Survived' is the value to predict (categorical -> classification).
y_data = train_data['Survived'].values
x_data = train_data.drop(columns=['Survived'])

In [6]:
# Number of rows/columns in the training features and in the test data
for frame in (x_data, test_data):
    print(frame.shape)

(891, 11)
(418, 11)


In [7]:
# Inspect column dtypes and non-null counts of both frames.
# DataFrame.info() writes its report itself and returns None, so it is called
# directly; wrapping it in print() only appends a stray "None" line.
x_data.info()
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Pclass       891 non-null    int64  
 2   Name         891 non-null    object 
 3   Sex          891 non-null    object 
 4   Age          714 non-null    float64
 5   SibSp        891 non-null    int64  
 6   Parch        891 non-null    int64  
 7   Ticket       891 non-null    object 
 8   Fare         891 non-null    float64
 9   Cabin        204 non-null    object 
 10  Embarked     889 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 76.7+ KB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Pclass       418 non-null    int64  
 2   Nam

In [8]:
print(x_data.columns)
# Survived : 0 = No, 1 = Yes -> int
# Pclass (ticket class) : 1 = 1st, 2 = 2nd, 3 = 3rd (the closer to 1, the higher the class) -> int
# Sex : 'male', 'female' -> object :: needs to be encoded as a categorical number
# Age -> float :: contains NaN => both train and test data need preprocessing
# SibSp (total number of spouses and siblings) -> int
# Parch (total number of parents and children) -> int :: use SibSp and Parch to add the total number of family members aboard
# Ticket (ticket number) -> object :: drop, not needed for training/prediction
# Fare -> float :: test data contains NaN => test data needs preprocessing
# Cabin -> object :: contains NaN in both train and test data, but drop it as it is not needed for training/prediction
# Embarked (port of embarkation) : C = Cherbourg, Q = Queenstown, S = Southampton -> object :: train data contains NaN => needs preprocessing

Index(['PassengerId', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch',
       'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')


In [9]:
# Distinct values found in the Age and Embarked columns of the training features
for column in ('Age', 'Embarked'):
    print(x_data[column].unique())

[22.   38.   26.   35.     nan 54.    2.   27.   14.    4.   58.   20.
 39.   55.   31.   34.   15.   28.    8.   19.   40.   66.   42.   21.
 18.    3.    7.   49.   29.   65.   28.5   5.   11.   45.   17.   32.
 16.   25.    0.83 30.   33.   23.   24.   46.   59.   71.   37.   47.
 14.5  70.5  32.5  12.    9.   36.5  51.   55.5  40.5  44.    1.   61.
 56.   50.   36.   45.5  20.5  62.   41.   52.   63.   23.5   0.92 43.
 60.   10.   64.   13.   48.    0.75 53.   57.   80.   70.   24.5   6.
  0.67 30.5   0.42 34.5  74.  ]
['S' 'C' 'Q' nan]


In [10]:
# data preprocessing
## 1) Fill NaN values
### Age      -> mean Age of the rows with the same Sex (computed separately per frame)
### Fare     -> median Fare of the rows with the same Pclass (test data only)
### Embarked -> most frequent value (training data only)
# The filled column is assigned back to the frame. Calling
# fillna(inplace=True) on a column selection is a chained in-place update:
# it is deprecated in pandas 2.x and no longer modifies the parent frame once
# Copy-on-Write is enabled, which would leave the NaN values in place.
x_data["Age"] = x_data["Age"].fillna(x_data.groupby("Sex")["Age"].transform("mean"))
test_data["Age"] = test_data["Age"].fillna(test_data.groupby("Sex")["Age"].transform("mean"))
test_data["Fare"] = test_data["Fare"].fillna(test_data.groupby("Pclass")["Fare"].transform("median"))
x_data["Embarked"] = x_data["Embarked"].fillna(x_data["Embarked"].mode()[0])

## 2) Convert object columns to numbers
### Sex
sex_map = {"male": 0, "female": 1}
x_data['Sex'] = x_data['Sex'].map(sex_map)
test_data['Sex'] = test_data['Sex'].map(sex_map)
### Embarked
embarked_map = {'S':0, 'C':1, 'Q':2}
x_data['Embarked'] = x_data['Embarked'].map(embarked_map)
test_data['Embarked'] = test_data['Embarked'].map(embarked_map)

## 3) Total number of family members aboard, built from SibSp and Parch
x_data["Family"] = x_data["Parch"] + x_data["SibSp"]
test_data["Family"] = test_data["Parch"] + test_data["SibSp"]

## 4) Drop the columns judged unnecessary
x_data = x_data.drop(['PassengerId', 'Name', 'Cabin', 'Ticket', 'SibSp', 'Parch'], axis = 1)
test_data = test_data.drop(['PassengerId', 'Name', 'Cabin', 'Ticket', 'SibSp', 'Parch'], axis = 1)

# Sanity check: fail here rather than silently feeding NaN into training.
# (A value missing from sex_map / embarked_map would also show up as NaN.)
assert not x_data.isna().any().any(), "x_data still contains NaN after preprocessing"
assert not test_data.isna().any().any(), "test_data still contains NaN after preprocessing"

In [11]:
# Summary statistics of both frames after the first preprocessing pass
for frame in (x_data, test_data):
    print(frame.describe())

           Pclass         Sex         Age        Fare    Embarked      Family
count  891.000000  891.000000  891.000000  891.000000  891.000000  891.000000
mean     2.308642    0.352413   29.736034   32.204208    0.361392    0.904602
std      0.836071    0.477990   13.014897   49.693429    0.635673    1.613459
min      1.000000    0.000000    0.420000    0.000000    0.000000    0.000000
25%      2.000000    0.000000   22.000000    7.910400    0.000000    0.000000
50%      3.000000    0.000000   30.000000   14.454200    0.000000    0.000000
75%      3.000000    1.000000   35.000000   31.000000    1.000000    1.000000
max      3.000000    1.000000   80.000000  512.329200    2.000000   10.000000
           Pclass         Sex         Age        Fare    Embarked      Family
count  418.000000  418.000000  418.000000  418.000000  418.000000  418.000000
mean     2.265550    0.363636   30.272597   35.560845    0.464115    0.839713
std      0.841838    0.481622   12.634534   55.856972    0.68551

In [12]:
# Additional data preprocessing
## 1) Bucket Age into categories
def age_category(x):
    """Return the age bucket index (0-6) for ``x``.

    Buckets are closed on the right: x <= 10 -> 0, x <= 20 -> 1, x <= 30 -> 2,
    x <= 40 -> 3, x <= 50 -> 4, x <= 60 -> 5. Any value for which none of
    these comparisons holds (greater than 60, or NaN) gets 6.
    """
    for bucket, upper_bound in enumerate((10, 20, 30, 40, 50, 60)):
        if x <= upper_bound:
            return bucket
    return 6

# Replace the raw Age values with their bucket index in both frames
for frame in (x_data, test_data):
    frame["Age"] = frame['Age'].apply(age_category)

In [13]:
# Look at the distribution of the data preprocessed so far, to decide on further preprocessing
for frame in (x_data, test_data):
    print(frame.describe())

           Pclass         Sex         Age        Fare    Embarked      Family
count  891.000000  891.000000  891.000000  891.000000  891.000000  891.000000
mean     2.308642    0.352413    2.473625   32.204208    0.361392    0.904602
std      0.836071    0.477990    1.308627   49.693429    0.635673    1.613459
min      1.000000    0.000000    0.000000    0.000000    0.000000    0.000000
25%      2.000000    0.000000    2.000000    7.910400    0.000000    0.000000
50%      3.000000    0.000000    2.000000   14.454200    0.000000    0.000000
75%      3.000000    1.000000    3.000000   31.000000    1.000000    1.000000
max      3.000000    1.000000    6.000000  512.329200    2.000000   10.000000
           Pclass         Sex         Age        Fare    Embarked      Family
count  418.000000  418.000000  418.000000  418.000000  418.000000  418.000000
mean     2.265550    0.363636    2.588517   35.560845    0.464115    0.839713
std      0.841838    0.481622    1.268721   55.856972    0.68551

In [14]:
## 2) Bucket Fare into categories
def fare_category(x):
    """Return the fare bucket index (0-3) for ``x``.

    Buckets are closed on the right: x <= 10 -> 0, x <= 20 -> 1, x <= 40 -> 2.
    Any value for which none of these comparisons holds (greater than 40, or
    NaN) gets 3.
    """
    for bucket, upper_bound in enumerate((10, 20, 40)):
        if x <= upper_bound:
            return bucket
    return 3

# Replace the raw Fare values with their bucket index in both frames
for frame in (x_data, test_data):
    frame["Fare"] = frame['Fare'].apply(fare_category)

In [15]:
# The preprocessed data
for frame in (x_data, test_data):
    print(frame.describe())

           Pclass         Sex         Age        Fare    Embarked      Family
count  891.000000  891.000000  891.000000  891.000000  891.000000  891.000000
mean     2.308642    0.352413    2.473625    1.242424    0.361392    0.904602
std      0.836071    0.477990    1.308627    1.155172    0.635673    1.613459
min      1.000000    0.000000    0.000000    0.000000    0.000000    0.000000
25%      2.000000    0.000000    2.000000    0.000000    0.000000    0.000000
50%      3.000000    0.000000    2.000000    1.000000    0.000000    0.000000
75%      3.000000    1.000000    3.000000    2.000000    1.000000    1.000000
max      3.000000    1.000000    6.000000    3.000000    2.000000   10.000000
           Pclass         Sex         Age        Fare    Embarked      Family
count  418.000000  418.000000  418.000000  418.000000  418.000000  418.000000
mean     2.265550    0.363636    2.588517    1.267943    0.464115    0.839713
std      0.841838    0.481622    1.268721    1.169138    0.68551

In [16]:
# Column dtypes and non-null counts after preprocessing.
# DataFrame.info() prints its own report and returns None, so it is called
# directly; print(df.info()) would add a stray "None" line after each report.
x_data.info()
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 6 columns):
 #   Column    Non-Null Count  Dtype
---  ------    --------------  -----
 0   Pclass    891 non-null    int64
 1   Sex       891 non-null    int64
 2   Age       891 non-null    int64
 3   Fare      891 non-null    int64
 4   Embarked  891 non-null    int64
 5   Family    891 non-null    int64
dtypes: int64(6)
memory usage: 41.9 KB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 6 columns):
 #   Column    Non-Null Count  Dtype
---  ------    --------------  -----
 0   Pclass    418 non-null    int64
 1   Sex       418 non-null    int64
 2   Age       418 non-null    int64
 3   Fare      418 non-null    int64
 4   Embarked  418 non-null    int64
 5   Family    418 non-null    int64
dtypes: int64(6)
memory usage: 19.7 KB
None


In [17]:
# Convert the pandas DataFrame data to tensors on the selected device
x_data_tensor, test_data_tensor = (
    torch.FloatTensor(np.array(frame)).to(device)
    for frame in (x_data, test_data)
)
# Labels become an integer tensor reshaped to a single column (matches the data's dimensions)
label_tensor = torch.LongTensor(np.array(y_data))
y_data_tensor = label_tensor.to(device).reshape(-1, 1)

In [18]:
# Dimensions of the feature and label tensors
for tensor in (x_data_tensor, y_data_tensor):
    print(tensor.shape)
# Largest label value (labels are 0, 1)
print(y_data_tensor.max())

torch.Size([891, 6])
torch.Size([891, 1])
tensor(1, device='cuda:0')


In [19]:
# MLP implementation - fix the seed
torch.manual_seed(777)
# `device` is a torch.device object, so comparing it with the plain string
# 'cuda' is not a reliable check; inspect its `.type` attribute instead so the
# CUDA generators are seeded on GPU runs.
if device.type == 'cuda':
    torch.cuda.manual_seed_all(777)

# Training hyperparameters
learning_rate = 1e-3
training_epochs = 100

# Model definition
class MLP(torch.nn.Module):
    """Fully connected network with layer sizes 6 -> 256 -> 128 -> 64 -> 2.

    A sigmoid is applied after each of the first three linear layers; the
    output of the fourth linear layer is returned without an activation.
    """

    def __init__(self):
        super().__init__()
        self.linear1 = nn.Linear(6, 256, bias=True)
        self.linear2 = nn.Linear(256, 128, bias=True)
        self.linear3 = nn.Linear(128, 64, bias=True)
        self.linear4 = nn.Linear(64, 2, bias=True)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Pass ``x`` through the three sigmoid-activated layers, then the output layer."""
        hidden = x
        for layer in (self.linear1, self.linear2, self.linear3):
            hidden = self.sigmoid(layer(hidden))
        return self.linear4(hidden)

# Instantiate the network and move it to the selected device
model = MLP().to(device)

In [20]:
# Loss function: cross-entropy, moved to the selected device
loss_fn = nn.CrossEntropyLoss()
loss_fn = loss_fn.to(device)
# Optimization method used to learn the weights: Adam
optimizer = optim.Adam(params=model.parameters(), lr=learning_rate)

In [21]:
# train
from tqdm.notebook import tqdm
model.train()
for epoch in tqdm(range(training_epochs)):
    # Sum of the per-sample losses seen in this epoch
    total_cost = 0
    # One optimizer step per (sample, label) pair
    for X, Y in zip(x_data_tensor, y_data_tensor):
        # Reset the gradients
        optimizer.zero_grad()
        # Forward pass; the output is reshaped to (1, 2) and the label to (1,)
        # so the loss receives a batch of one sample
        y_pred = model(X).reshape(1, 2)
        Y = Y.reshape(1)

        # Compute the error
        cost = loss_fn(y_pred, Y)
        # Backpropagation
        cost.backward()
        # Update the weights
        optimizer.step()
        # Accumulate the error. The loss is detached first: adding the raw
        # `cost` tensor would keep the running sum attached to every
        # iteration's autograd graph for the whole epoch.
        total_cost += cost.detach()
    if epoch % 10 == 0:
        print('Epoch:', '%04d' % (epoch + 1), 'cost =', '{:.9f}'.format(total_cost))

print('Learning finished')


  0%|          | 0/100 [00:00<?, ?it/s]

Epoch: 0001 cost = 528.424011230
Epoch: 0011 cost = 396.360595703
Epoch: 0021 cost = 383.101043701
Epoch: 0031 cost = 371.555480957
Epoch: 0041 cost = 357.287445068
Epoch: 0051 cost = 342.325805664
Epoch: 0061 cost = 330.580108643
Epoch: 0071 cost = 317.733093262
Epoch: 0081 cost = 307.237304688
Epoch: 0091 cost = 298.842346191
Learning finished


In [22]:
# predict
# Switch the model to evaluation mode and predict on the test tensor
model.eval()
with torch.no_grad():  # gradient tracking is temporarily turned off inside this block
    logits = model(test_data_tensor)
    predict = torch.argmax(logits, dim=1)

In [23]:
# submit
# Convert the predictions to a NumPy array before storing them, so the column
# holds plain integers regardless of how the installed pandas version treats
# torch.Tensor objects on column assignment.
submit['Survived'] = predict.cpu().numpy()
# Write the submission file without the index column
submit.to_csv('result.csv', index=False)