In [7]:
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import torch
import torch.nn.functional as F
import torch.optim as optim

##########################################################
def process_data(_df):
    """Clean a raw passenger frame and encode it as numeric features.

    Works on a copy, so the caller's frame is left untouched.  The steps are:
    add ``FamilySize`` (``SibSp + Parch``); drop ``SibSp``, ``Parch``,
    ``Cabin``, ``Ticket`` and ``Name``; fill missing ``Embarked`` (mode),
    ``Fare`` (median) and ``Age`` (mean of the passenger's ``Pclass``);
    one-hot encode ``Sex`` and ``Embarked`` with the first level dropped.

    Args:
        _df: DataFrame holding at least the columns ``SibSp``, ``Parch``,
            ``Cabin``, ``Ticket``, ``Name``, ``Embarked``, ``Fare``, ``Sex``,
            ``Age`` and ``Pclass``.  Other columns are passed through.

    Returns:
        A new DataFrame in which ``Sex`` and ``Embarked`` are replaced by the
        indicator columns ``Sex_1``, ``Embarked_1`` and ``Embarked_2``.  An
        empty input is returned as an unprocessed empty copy.

    Raises:
        ValueError: from ``astype(int)`` when ``Sex`` or ``Embarked`` holds a
            value outside the mapping tables (it maps to NaN).
    """
    df = _df.copy()
    if len(df) == 0:
        return df

    df['FamilySize'] = df['SibSp'] + df['Parch']
    df = df.drop(['SibSp', 'Parch', 'Cabin', 'Ticket', 'Name'], axis=1)

    # Assign the filled column back instead of calling fillna(inplace=True)
    # on `df[col]`: that chained form works on a temporary object and does
    # not update `df` when pandas Copy-on-Write is active.
    df['Embarked'] = df['Embarked'].fillna(df['Embarked'].mode()[0])
    df['Fare'] = df['Fare'].fillna(df['Fare'].median())

    # Missing ages take the mean age of the passenger's own class.
    class_mean_age = df.groupby('Pclass')['Age'].transform('mean')
    df['Age'] = df['Age'].fillna(class_mean_age)

    sex_codes = df['Sex'].map({'male': 0, 'female': 1}).astype(int)
    port_codes = df['Embarked'].map({'C': 0, 'Q': 1, 'S': 2}).astype(int)
    # Declaring the full category set makes get_dummies emit the same
    # indicator columns even when a level is absent from this frame (e.g. a
    # small validation fold), so separately processed frames stay aligned.
    df['Sex'] = pd.Categorical(sex_codes, categories=[0, 1])
    df['Embarked'] = pd.Categorical(port_codes, categories=[0, 1, 2])

    # Pin a numeric dtype: newer pandas defaults to bool indicators, which
    # make `.values` of the mixed frame an object array.
    return pd.get_dummies(df, columns=['Sex', 'Embarked'], drop_first=True,
                          dtype=np.uint8)

##########################################################
def normalize(_df1, _df2):
    """Standardise the numeric feature columns using the first frame's statistics.

    For each of ``Pclass``, ``Age``, ``Fare`` and ``FamilySize`` the mean and
    standard deviation are computed on ``_df1`` only and applied to both
    frames, so the second frame is scaled exactly like the first.

    Args:
        _df1: Reference DataFrame; must contain the four columns above.
        _df2: Second DataFrame scaled with ``_df1``'s statistics.  It is
            left as-is when it has no rows.

    Returns:
        Tuple ``(df1, df2)`` of scaled copies; the inputs are not modified.
    """
    df1 = _df1.copy()
    df2 = _df2.copy()
    scale_second = len(df2) > 0
    for col in ['Pclass', 'Age', 'Fare', 'FamilySize']:
        mean = df1[col].mean()
        std = df1[col].std()
        # A constant column has std 0 and a single-row frame has std NaN;
        # dividing by either would fill the feature with NaN/inf.  Fall back
        # to a unit scale so the column is only centred.
        if not std > 0:
            std = 1.0
        df1[col] = (df1[col] - mean) / std
        if scale_second:
            df2[col] = (df2[col] - mean) / std
    return df1, df2

##########################################################
def df_to_tensor(_df1, _df2):
    """Turn two raw passenger frames into float32 feature/label tensors.

    Both frames go through ``process_data`` and are then scaled by
    ``normalize`` with the first frame's statistics.

    Args:
        _df1: Raw frame; must contain ``PassengerId`` and ``Survived``.
        _df2: Second raw frame with the same columns, or an empty frame.

    Returns:
        Tuple ``(x1, y1, x2, y2)``.  ``x*`` hold every processed column
        except ``PassengerId`` and ``Survived``; ``y*`` hold ``Survived`` as
        a single-column 2-D tensor.  ``x2`` and ``y2`` are empty tensors
        when ``_df2`` has no rows.
    """
    df1 = process_data(_df1)
    df2 = process_data(_df2)
    df1, df2 = normalize(df1, df2)

    def as_float_tensor(frame):
        # Cast explicitly: a frame mixing float and bool/integer columns may
        # otherwise come out of pandas as an object array that torch rejects.
        return torch.tensor(frame.to_numpy(dtype=np.float32),
                            dtype=torch.float32)

    id_and_label = ['PassengerId', 'Survived']
    features1 = df1.drop(id_and_label, axis=1)
    x1 = as_float_tensor(features1)
    y1 = as_float_tensor(df1[['Survived']])

    x2 = torch.empty(0, dtype=torch.float32)
    y2 = torch.empty(0, dtype=torch.float32)
    if len(df2) > 0:
        # Give the second frame exactly the first frame's feature columns, in
        # the same order; an indicator column it lacks is filled with 0.
        features2 = df2.drop(id_and_label, axis=1).reindex(
            columns=features1.columns, fill_value=0)
        x2 = as_float_tensor(features2)
        y2 = as_float_tensor(df2[['Survived']])
    return x1, y1, x2, y2

##########################################################
def evaluate(x, y, W, b):
    """Return the binary cross-entropy of the logistic model on ``(x, y)``.

    Args:
        x: Feature tensor.
        y: Target tensor with the same shape as ``sigmoid(x.matmul(W) + b)``.
        W: Weight tensor, applied as ``x.matmul(W)``.
        b: Bias tensor added to the linear term.

    Returns:
        The loss as a Python float, or ``np.nan`` when ``x`` is empty or
        ``x`` and ``y`` differ in length (e.g. no validation set was given).
    """
    if len(x) == 0 or len(x) != len(y):
        return np.nan
    # Pure measurement: skip autograd bookkeeping even though W and b may
    # have requires_grad=True.
    with torch.no_grad():
        pred = torch.sigmoid(x.matmul(W) + b)
        return F.binary_cross_entropy(pred, y).item()

##########################################################
def logistic_regression(x_train, y_train, x_valid, y_valid, lr, wd):
    """Fit a logistic-regression model with full-batch SGD.

    The number of epochs and the number of progress reports are read from
    the module-level settings ``epochs`` and ``log_freq``.

    Args:
        x_train: Training feature tensor, one row per sample.
        y_train: Training target tensor matching the prediction shape.
        x_valid: Validation features (may be empty).
        y_valid: Validation targets (may be empty).
        lr: Learning rate passed to ``optim.SGD``.
        wd: Weight decay passed to ``optim.SGD``.

    Returns:
        Tuple ``(W, b, cost_valid)``: the trained weight and bias tensors and
        the final validation cost as returned by ``evaluate``.
    """
    # Report roughly `log_freq` times over the run.  Clamp the interval to at
    # least 1 so `epochs < log_freq` does not end in a modulo by zero; a
    # non-positive `log_freq` turns progress output off.
    log_every = max(1, epochs // log_freq) if log_freq > 0 else 0

    # requires_grad=True makes autograd track these tensors so that
    # backward() computes their gradients.
    W = torch.zeros((x_train.shape[1], 1), requires_grad=True)
    b = torch.zeros(1, requires_grad=True)
    optimizer = optim.SGD([W, b], lr=lr, weight_decay=wd)

    for epoch in range(1, epochs + 1):
        pred = torch.sigmoid(x_train.matmul(W) + b)
        cost = F.binary_cross_entropy(pred, y_train)
        # PyTorch accumulates gradients across backward() calls, so the
        # previous gradients must be reset before computing new ones.
        optimizer.zero_grad()
        # Back-propagate: stores d(cost)/d(tensor) in the .grad of every
        # tensor that has requires_grad=True.
        cost.backward()
        # Update each tensor from its .grad value.
        optimizer.step()

        if log_every and epoch % log_every == 0:
            cost_valid = evaluate(x_valid, y_valid, W, b)
            print('Train Cost: {:.4f} || Valid Cost: {:.4f}'
                  .format(cost.item(), cost_valid))

    print('-'*40)
    cost_valid = evaluate(x_valid, y_valid, W, b)
    return W, b, cost_valid
    
##########################################################
def kfold_cv(df, lr, wd, k=7):
    """Estimate the validation cost with k-fold cross-validation.

    The frame is cut into ``k`` consecutive folds of ``len(df) // k`` rows.
    Each fold is held out once while a model is trained on all other rows;
    rows past the last full fold are never held out.

    Args:
        df: Raw passenger frame (shuffle it beforehand if order matters).
        lr: Learning rate forwarded to ``logistic_regression``.
        wd: Weight decay forwarded to ``logistic_regression``.
        k: Number of folds.

    Returns:
        The mean of the per-fold validation costs.
    """
    fold_size = df.shape[0] // k
    fold_errors = []
    for fold_no in range(k):
        start = fold_no * fold_size
        stop = start + fold_size
        held_out = df[start:stop]
        remaining = df.drop(df.index[start:stop])
        tensors = df_to_tensor(remaining, held_out)
        _, _, fold_error = logistic_regression(*tensors, lr, wd)
        fold_errors.append(fold_error)
    return sum(fold_errors, 0.0) / k

##########################################################
# ---- Settings -----------------------------------------------------------
df_train = pd.read_csv('train.csv').sample(frac=1)  # shuffle the rows
lr = 0.02        # SGD learning rate
wd = 0.002       # SGD weight decay
epochs = 6000    # read by logistic_regression
log_freq = 5     # number of progress lines printed per training run

# Optional: uncomment to estimate the validation cost by cross-validation
# and stop before the final fit.
#cv_error = kfold_cv(df_train, lr, wd)
#print('cv_error: {:.4f}'.format(cv_error))
#sys.exit(0)

# ---- Final fit on the whole training set (no validation split) ----------
df_valid = pd.DataFrame()
x_train, y_train, x_valid, y_valid = df_to_tensor(df_train, df_valid)
W, b, error = logistic_regression(x_train, y_train, x_valid, y_valid, lr, wd)

# ---- Predict on the test set ---------------------------------------------
df_test = pd.read_csv('test.csv')
# df_to_tensor expects a 'Survived' column; the test labels are unknown.
df_test['Survived'] = np.nan
x_train, y_train, x_test, y_test = df_to_tensor(df_train, df_test)

pred = torch.sigmoid(x_test.matmul(W) + b).detach().numpy()
# Threshold all probabilities at once instead of writing the frame row by row.
output = pd.DataFrame({
    'PassengerId': df_test['PassengerId'],
    'Survived': (pred[:, 0] > 0.5).astype(int),
})

output.to_csv('submission.csv', index=False)
print('Fin')


Train Cost: 0.4691 || Valid Cost: nan
Train Cost: 0.4491 || Valid Cost: nan
Train Cost: 0.4438 || Valid Cost: nan
Train Cost: 0.4420 || Valid Cost: nan
Train Cost: 0.4413 || Valid Cost: nan
----------------------------------------
Fin
