In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the Titanic training and test splits from the Kaggle input directory.
train_data = pd.read_csv('/kaggle/input/titanic/train.csv')
test_data = pd.read_csv('/kaggle/input/titanic/test.csv')

# Preview the first five rows of each split.
for frame in (train_data, test_data):
    print(frame.head(5))

In [None]:
# Remove data that is not used as a feature.
# The test-set PassengerId is set aside first because it is needed later to
# write the result file.
pID = test_data['PassengerId']

# Name, PassengerId and Ticket are dropped from both splits, and Cabin is
# reduced to its first character only.
unused_columns = ['Name', 'PassengerId', 'Ticket']
for frame in (train_data, test_data):
    frame.drop(columns=unused_columns, inplace=True)
    frame['Cabin'] = frame['Cabin'].str.slice(stop=1)

# Preview the result.
for frame in (train_data, test_data):
    print(frame.head(5))

In [None]:
# Check NaN: print the per-column count of missing values for the training
# split, a separator line, then the same counts for the test split.
print(
    train_data.isna().sum(),
    '------------',
    test_data.isna().sum(),
    sep='\n',
)

In [None]:
# Remove NaN from Age, Cabin, Embarked and Fare.
#
# Each column is assigned back to its frame instead of calling
# `frame[col].fillna(..., inplace=True)`: that form mutates an intermediate
# Series, and under pandas Copy-on-Write it no longer updates the frame, which
# would leave the NaN values in place for the later encoding/bucketing cells.

# Training split: Age gets the column mean, Cabin and Embarked get the
# placeholder category 'N'.
train_data['Age'] = train_data['Age'].fillna(train_data['Age'].mean())
train_data['Cabin'] = train_data['Cabin'].fillna('N')
train_data['Embarked'] = train_data['Embarked'].fillna('N')

# Test split: Age gets the test-split mean, Cabin gets 'N', Fare gets 0.
test_data['Age'] = test_data['Age'].fillna(test_data['Age'].mean())
test_data['Cabin'] = test_data['Cabin'].fillna('N')
test_data['Fare'] = test_data['Fare'].fillna(0)

# Check
print(train_data.head(5))
print(test_data.head(5))

In [None]:
# Convert textual data to numeric data.
# Sex, Cabin and Embarked are the text-valued columns.
from sklearn import preprocessing

features = ['Sex', 'Cabin', 'Embarked']
for f in features :
    # One encoder per column, fitted on the test and training values together
    # so both splits share the same label -> integer mapping.
    # pd.concat replaces Series.append, which was removed in pandas 2.0.
    le = preprocessing.LabelEncoder()
    concat_series = pd.concat([test_data[f], train_data[f]])
    le = le.fit(concat_series)
    train_data[f] = le.transform(train_data[f])
    test_data[f] = le.transform(test_data[f])

# Check
print(train_data.head(5))
print(test_data.head(5))

In [None]:
# Convert numeric data to categorical data => age, fare
def age_convert(age) :
    """Map an age to an integer bucket.

    Ages of -1 or below go to bucket 0. Any other age goes to
    ``int(age/10 + 1)``, so ages 0-9 land in bucket 1, 10-19 in bucket 2,
    and so on.
    """
    return 0 if age <= -1 else int(age/10+1)

def fare_convert(fare) :
    """Map a fare to an integer bucket: the fare divided by 10, truncated by ``int``."""
    tens = fare/10
    return int(tens)

# Replace the Age and Fare columns of both splits with their bucket numbers.
for frame in (train_data, test_data):
    frame['Age'] = frame['Age'].apply(age_convert)
    frame['Fare'] = frame['Fare'].apply(fare_convert)

# Check
for frame in (train_data, test_data):
    print(frame.head(5))

In [None]:
import torch

# Split the training frame into features and target, then convert everything
# to float tensors.

# Training data: every column except 'Survived' is a feature; 'Survived' is the target.
train_x = torch.FloatTensor(train_data.drop(columns='Survived').to_numpy())
train_y = torch.FloatTensor(train_data['Survived'].to_numpy())

# Number of feature columns (the original note gives the shape as (891, 8) => 8).
input_size = train_x.shape[1]

# Test data: `test_data` is rebound from a DataFrame to a tensor here, so this
# cell cannot be re-run without re-running the cells that build the frame.
test_data = torch.FloatTensor(test_data.to_numpy())

## LogisticRegression - 76.79% acc

In [None]:
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

class LogisticRegression(nn.Module) :
    """Single linear layer followed by a sigmoid.

    Args:
        input_size: number of input features per sample.
        output_size: number of outputs per sample.
    """

    def __init__(self, input_size, output_size) :
        super().__init__()
        self.linear1 = nn.Linear(input_size, output_size)

    def forward(self, x) :
        """Return sigmoid(linear1(x)); every output value lies in (0, 1)."""
        return torch.sigmoid(self.linear1(x))

model = LogisticRegression(input_size, 1)
# Earlier notes: lr = 1e-3 was recorded as 76.55, lr = 1e-2 as 76.79.
optimizer = optim.SGD(model.parameters(), lr = 1e-2, momentum = 0.9)
n_epochs = 2000

# Full-batch training. The learning rate is raised to 0.1 at most once.
is_lr_changed = False
for epoch in range(n_epochs) :
    optimizer.zero_grad()

    # Forward pass and binary cross-entropy against the 0/1 targets.
    predict = model(train_x).squeeze()
    cost = F.binary_cross_entropy(predict, train_y)

    cost.backward()
    optimizer.step()

    # Training accuracy of the pre-update predictions, as a fraction of samples.
    predict = torch.round(predict)
    acc = (predict == train_y).sum() / len(train_x)

    print('Epoch {}/{} : Cost : {:.6f}, acc : {:.6f}'.format(
        epoch+1, n_epochs, cost.item(), acc))

    # Stop early once training accuracy exceeds 85%.
    if acc > 0.85 :
        break

    # After 100 epochs, once accuracy is above 70%, switch the learning rate to 0.1.
    if (acc > 0.7) and epoch >= 100 and not is_lr_changed :
        for g in optimizer.param_groups :
            g['lr'] = 0.1
            is_lr_changed = True
    

In [None]:
# Predict on the test tensor and write the submission file.
with torch.no_grad() :
    # Forward pass only; no gradients are needed for inference.
    predict = model(test_data)
    # Flatten to 1-D explicitly (a bare squeeze() would yield a 0-dim tensor
    # for a single-row input), then round the probabilities to 0/1 labels.
    predict = predict.reshape(-1)
    predict = torch.round(predict).long()

# Give pandas a NumPy array rather than the tensor itself: depending on the
# pandas version, a raw tensor can end up as an object column of 0-dim tensors
# that is written to the CSV as "tensor(1)". Reusing pID's index keeps the two
# columns aligned row by row in the concat below.
predict = pd.Series(predict.cpu().numpy(), index = pID.index, name = 'Survived')
save_data = pd.concat([pID, predict], axis = 1)
save_data.to_csv('lr_result.csv', index = False)

## DNN - 77.27% acc

In [None]:
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

class DNNModel(nn.Module) :
    """Three-layer fully connected network with ReLU activations and a sigmoid output.

    Layer widths: input_size -> input_size*2 -> input_size -> output_size.

    Args:
        input_size: number of input features per sample.
        output_size: number of outputs per sample.
    """

    def __init__(self, input_size, output_size) :
        super().__init__()
        hidden_size = input_size*2
        self.linear1 = nn.Linear(input_size, hidden_size)
        self.linear2 = nn.Linear(hidden_size, input_size)
        self.linear3 = nn.Linear(input_size, output_size)

    def forward(self, x) :
        """Run x through the three layers; every output value lies in (0, 1)."""
        hidden = torch.relu(self.linear1(x))
        hidden = torch.relu(self.linear2(hidden))
        return torch.sigmoid(self.linear3(hidden))

model = DNNModel(input_size, 1)
optimizer = optim.SGD(model.parameters(), lr = 1e-2, momentum = 0.9)

n_epochs = 2000

# Full-batch training. The learning rate is raised to 0.1 at most once.
is_lr_changed = False
for epoch in range(n_epochs) :
    # Forward pass and binary cross-entropy against the 0/1 targets.
    predict = model(train_x)
    predict = predict.squeeze()
    cost = F.binary_cross_entropy(predict, train_y)

    optimizer.zero_grad()
    cost.backward()
    optimizer.step()

    # Training accuracy as a fraction of samples. The 70% test below must see
    # this fraction: comparing the raw count of correct predictions against
    # 0.7 is true as soon as a single sample is right.
    predict = torch.round(predict)
    acc = (predict == train_y).sum() / len(train_x)

    print('Epoch {}/{} : Cost : {:.6f}, acc : {:.6f}'.format(
                                            epoch+1, n_epochs,
                                            cost.item(), acc))

    # Stop early once the training loss drops below 0.1.
    if cost.item() < 0.1 : break
    # After 100 epochs, once accuracy is above 70%, switch the learning rate to 0.1.
    if (acc > 0.7) and epoch >= 100 and not is_lr_changed :
        for g in optimizer.param_groups :
            g['lr'] = 0.1
        is_lr_changed = True

In [None]:
# Predict on the test tensor and write the submission file.
with torch.no_grad() :
    # Forward pass only; no gradients are needed for inference.
    predict = model(test_data)
    # Flatten to 1-D explicitly (a bare squeeze() would yield a 0-dim tensor
    # for a single-row input), then round the probabilities to 0/1 labels.
    predict = predict.reshape(-1)
    predict = torch.round(predict).long()

# Give pandas a NumPy array rather than the tensor itself: depending on the
# pandas version, a raw tensor can end up as an object column of 0-dim tensors
# that is written to the CSV as "tensor(1)". Reusing pID's index keeps the two
# columns aligned row by row in the concat below.
predict = pd.Series(predict.cpu().numpy(), index = pID.index, name = 'Survived')
save_data = pd.concat([pID, predict], axis = 1)
save_data.to_csv('dnn_result.csv', index = False)