# 두번째 수행평가

In [1]:
#Titanic Data
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt

from scipy import stats
from sklearn.preprocessing import MinMaxScaler

In [2]:
train_data = pd.read_csv('../data/titanic/train.csv')
test_data = pd.read_csv('../data/titanic/test.csv') # PassengerId = 892부터 시작

# train, test 통합
raw_data = pd.concat([train_data, test_data], axis=0, ignore_index='True')

## 1. Data Preprocessing

-  PassengerId : 종속변수에 영향 X
- Survived : 종속변수 0,1 이므로 그냥 사용해도 된다
- Pclass : 1,2,3 영향 있다
- Name : 이름 자체는 생존여부에 영향을 미치지 않는다
- Sex : 성별 문자를 숫자로 변경해야 함
- Age : 나이에 결측치가 있다 Miss인 사람의 평균으로 or 전체 사람 평균, 0-99 범위를 가지는 실수값, 0-10 : 소아(0), 11-25 : 청년(1), 26-49 : 중년(2), 50- : 노년(3)
- Ticket : 굳이 포함시키지 않아도 될것 같음
- Fare : 여러 요소에 영향을 받음(class, 탑승지역...)
- Cabin : 객실번호
- Embarked : 데이터 분석해보면 생존여부에 영향이 있다

In [3]:
# 결측치 확인
print(raw_data.isnull().sum())
# Age에 263건, Cabin에 1014건, Embarked에 2건, Survived는 test_data

PassengerId       0
Survived        418
Pclass            0
Name              0
Sex               0
Age             263
SibSp             0
Parch             0
Ticket            0
Fare              1
Cabin          1014
Embarked          2
dtype: int64


In [4]:
raw_data.drop(['Cabin', 'Ticket'], axis=1, inplace=True)

In [5]:
display(raw_data[raw_data['Embarked'].isnull()])
display(raw_data[raw_data['Fare']==80.0])
# Embarked가 NaN인 두 건에 대해서 요금을 이용해 Embarked를 찾으려 했으나 
# 요금이 80인 사람도 유일하게 이 두명이라 이 데이터는 포기해야함

# Embarked 결측치 제거
raw_data.dropna(subset=['Embarked'], inplace=True)
display(raw_data)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Embarked
61,62,1.0,1,"Icard, Miss. Amelie",female,38.0,0,0,80.0,
829,830,1.0,1,"Stone, Mrs. George Nelson (Martha Evelyn)",female,62.0,0,0,80.0,


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Embarked
61,62,1.0,1,"Icard, Miss. Amelie",female,38.0,0,0,80.0,
829,830,1.0,1,"Stone, Mrs. George Nelson (Martha Evelyn)",female,62.0,0,0,80.0,


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Embarked
0,1,0.0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,7.2500,S
1,2,1.0,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,71.2833,C
2,3,1.0,3,"Heikkinen, Miss. Laina",female,26.0,0,0,7.9250,S
3,4,1.0,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,53.1000,S
4,5,0.0,3,"Allen, Mr. William Henry",male,35.0,0,0,8.0500,S
...,...,...,...,...,...,...,...,...,...,...
1304,1305,,3,"Spector, Mr. Woolf",male,,0,0,8.0500,S
1305,1306,,1,"Oliva y Ocana, Dona. Fermina",female,39.0,0,0,108.9000,C
1306,1307,,3,"Saether, Mr. Simon Sivertsen",male,38.5,0,0,7.2500,S
1307,1308,,3,"Ware, Mr. Frederick",male,,0,0,8.0500,S


In [6]:
# Name에서 먼저 공백 (' ')을 제거하고
# 그 다음 이름 분리하고 (, 을 기준으로 앞이 이름 뒤가 성)
# 그 다음 성의 앞부분을 분리(. 을 기준 뒤가 성 앞이 Mr, Mrs Miss)
print('Mr' in raw_data['Name'][5])

True


In [7]:
for i in raw_data.index:
    name_list = raw_data['Name'][i]
    
    if 'Mr' in name_list:
        raw_data.loc[i, ["Name_m"]] = 'Mr'
        
    elif 'Mrs' in name_list:
        raw_data.loc[i, ["Name_m"]] = 'Mrs'
        
    elif 'Miss' in name_list:
        raw_data.loc[i, ["Name_m"]] = 'Miss'
        
    else:
        if raw_data.loc[i,'Sex'] == 'male':
            raw_data.loc[i, ["Name_m"]] = 'Mr'
        else:
            # 일단 여성을 Miss로 취급
            raw_data.loc[i, ["Name_m"]] = 'Miss'

In [8]:
# 나이 결측치 처리
for i in raw_data.loc[raw_data['Age'].isnull()].index:
    if raw_data.loc[i,'Name_m'] == 'Mr':
        # 남성이면 남성의 평균으로
        raw_data.loc[i, ['Age']] = raw_data.loc[raw_data['Sex']=='male','Age'].mean()
    
    elif raw_data.loc[i,'Name_m'] == 'Mrs':
        # Mrs면 Mrs의 평균으로
        raw_data.loc[i, ['Age']] = raw_data.loc[raw_data['Name_m']=='Mrs','Age'].mean()
    
    elif raw_data.loc[i,'Name_m'] == 'Miss':
        # Miss면 Miss의 평균으로
        raw_data.loc[i, ['Age']] = raw_data.loc[raw_data['Name_m']=='Miss','Age'].mean()

In [9]:
# 나이 binning
raw_data.loc[raw_data['Age'] < 10, 'Age'] = 0  # 10살 미만 0
raw_data.loc[(raw_data['Age'] < 25) & (raw_data['Age'] >= 10), 'Age'] = 1  # 10-25살 1
raw_data.loc[(raw_data['Age'] < 40) & (raw_data['Age'] >= 20), 'Age'] = 2  # 25-40 2
raw_data.loc[(raw_data['Age'] < 60) & (raw_data['Age'] >= 40), 'Age'] = 3  # 40-60 3
raw_data.loc[raw_data['Age'] >= 60, 'Age'] = 4  # 60이상 3

In [10]:
# 나이 mapping 처리 test중
'''
name_mapping = {'Mr': raw_data.loc[raw_data['Sex']=='male','Age'].mean(),
                'Mrs': raw_data.loc[raw_data['Name_m']=='Mrs','Age'].mean(),
                'Miss': raw_data.loc[raw_data['Name_m']=='Miss','Age'].mean()}

raw_data.loc[raw_data['Age'].isnull(), 'Name_m'].map(name_mapping)
'''

"\nname_mapping = {'Mr': raw_data.loc[raw_data['Sex']=='male','Age'].mean(),\n                'Mrs': raw_data.loc[raw_data['Name_m']=='Mrs','Age'].mean(),\n                'Miss': raw_data.loc[raw_data['Name_m']=='Miss','Age'].mean()}\n\nraw_data.loc[raw_data['Age'].isnull(), 'Name_m'].map(name_mapping)\n"

In [11]:
# 성별 전처리 남성 0, 여성 1
gender_mapping = {'male': 0, 'female': 1}
raw_data['Sex'] = raw_data['Sex'].map(gender_mapping)

In [12]:
# Embarked 전처리 C = 0, Q = 1, S = 2 
Embarked_mapping = {'S':0, 'C':1, 'Q':2}
raw_data['Embarked'] = raw_data['Embarked'].map(Embarked_mapping)

In [13]:
# 전처리를 위해 사용한 Name, Name_m column들 제거
raw_data.drop(['Name', 'Name_m'], axis=1, inplace=True)

In [14]:
display(raw_data)

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,1,0.0,3,0,1.0,1,0,7.2500,0
1,2,1.0,1,1,2.0,1,0,71.2833,1
2,3,1.0,3,1,2.0,0,0,7.9250,0
3,4,1.0,1,1,2.0,1,0,53.1000,0
4,5,0.0,3,0,2.0,0,0,8.0500,0
...,...,...,...,...,...,...,...,...,...
1304,1305,,3,0,2.0,0,0,8.0500,0
1305,1306,,1,1,2.0,0,0,108.9000,1
1306,1307,,3,0,2.0,0,0,7.2500,0
1307,1308,,3,0,2.0,0,0,8.0500,0


In [16]:
# train data만 보기
display(raw_data.loc[raw_data['PassengerId'] <= 891].corr())

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
PassengerId,1.0,-0.005028,-0.03533,-0.043136,0.041897,-0.057686,-0.001657,0.012703,-0.030555
Survived,-0.005028,1.0,-0.335549,0.541585,-0.103833,-0.03404,0.083151,0.25529,0.108669
Pclass,-0.03533,-0.335549,1.0,-0.127741,-0.30477,0.081656,0.016824,-0.548193,0.043835
Sex,-0.043136,0.541585,-0.127741,1.0,-0.161508,0.116348,0.247508,0.179958,0.118593
Age,0.041897,-0.103833,-0.30477,-0.161508,1.0,-0.249861,-0.196523,0.065092,-0.035057
SibSp,-0.057686,-0.03404,0.081656,0.116348,-0.249861,1.0,0.414542,0.160887,-0.060606
Parch,-0.001657,0.083151,0.016824,0.247508,-0.196523,0.414542,1.0,0.217532,-0.07932
Fare,0.012703,0.25529,-0.548193,0.179958,0.065092,0.160887,0.217532,1.0,0.063462
Embarked,-0.030555,0.108669,0.043835,0.118593,-0.035057,-0.060606,-0.07932,0.063462,1.0


- 우선 Survived와 상관관계가 높은건 Sex, Pclass와 Fare
- Pclass와 Fare간의 상관관계가 있다 - Class가 작아지면(높은 Class면) Fare가 높아진다
- 이 Fare는 SibSp, Parchd와 상관관계가 있다 - 함께 탄 사람이 많을 수록 Fare가 높아진다
- Pclass와 SibSp, Parchd간의 상관관계는 없다
- 따라서 Fare는 Pclass, SibSp, Parchd에 영향을 받은 변수이다(?)
- 하지만 Survived와 상관관계를 살펴보면 Pclass가 더 상관계수가 크다
- 그럼 Pclass만을 사용할 것인가? 아님 상관계수는 작지만 다양한 정보를 가지고 있을 수 있는 Fare만 사용할 것인가?
- 아님 Pclass와 SibSp, Parchd를 Fare 대신으로 사용할 것인가?
- 이 3가지 모두 해보면 될것 같긴함
---
## 2-1. Fare만 사용하기

In [17]:
data_f = raw_data.drop(['PassengerId', 'Pclass', 'SibSp', 'Parch'], axis=1, inplace=False)

x_data_f = data_f.drop(['Survived'], axis=1, inplace=False).values.reshape(-1,4)
t_data_f = data_f['Survived'].values.reshape(-1,1)

In [18]:
# 정규화
scaler_x = MinMaxScaler()
scaler_x.fit(x_data_f)
scaled_x_data_f = scaler_x.transform(x_data_f)

# train & test split
train_x_data_f = scaled_x_data_f[:889]
train_t_data_f = t_data_f[:889]

test_x_data_f = scaled_x_data_f[889:]
test_t_data_f = t_data_f[889:]

In [20]:
x_dim = train_x_data_f.shape[1]

# placehlode
X = tf.placeholder(shape=[None, x_dim], dtype=tf.float32)
T = tf.placeholder(shape=[None, 1], dtype=tf.float32)

# W & b
W = tf.Variable(tf.random.normal([x_dim, 1]), name='weight')
b = tf.Variable(tf.random.normal([1]), name='bias')

# H
logit = tf.matmul(X, W) + b
H = tf.sigmoid(logit)

# loss func
loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=logit, 
                                                              labels=T))

# train
train = tf.train.GradientDescentOptimizer(learning_rate=1e-4).minimize(loss)

# sess
sess = tf.Session()
sess.run(tf.global_variables_initializer())

In [21]:
# learning
for step in range(300000):
    
    _,W_val,b_val,loss_val = sess.run([train,W,b,loss], 
                                      feed_dict={X : train_x_data_f, T : train_t_data_f})
    
    if step % 30000 == 0:
        print('W : {}, b : {}, loss : {}'.format(W_val,b_val,loss_val))

W : [[ 1.6345788 ]
 [ 0.78721493]
 [-0.07858507]
 [-1.4335763 ]], b : [0.7426499], loss : 0.9696022272109985
W : [[ 1.5390911 ]
 [ 0.37501135]
 [-0.10086065]
 [-1.4617058 ]], b : [-0.10108343], loss : 0.6584292054176331
W : [[ 1.5631591 ]
 [ 0.1736108 ]
 [-0.09578206]
 [-1.4079874 ]], b : [-0.49255726], loss : 0.5893247127532959
W : [[ 1.6409975 ]
 [ 0.06765655]
 [-0.07867505]
 [-1.3267978 ]], b : [-0.68267065], loss : 0.5686644315719604
W : [[ 1.7332091 ]
 [ 0.00278025]
 [-0.05700057]
 [-1.2389625 ]], b : [-0.7889252], loss : 0.5578556656837463
W : [[ 1.8229762 ]
 [-0.04293587]
 [-0.03374409]
 [-1.1531318 ]], b : [-0.8584123], loss : 0.5501924753189087
W : [[ 1.904915  ]
 [-0.07851711]
 [-0.01007202]
 [-1.070652  ]], b : [-0.9102211], loss : 0.5441778302192688
W : [[ 1.977353  ]
 [-0.10791667]
 [ 0.01357458]
 [-0.9930941 ]], b : [-0.95243275], loss : 0.5393405556678772
W : [[ 2.041028  ]
 [-0.13302357]
 [ 0.03702807]
 [-0.9199607 ]], b : [-0.9885967], loss : 0.5353761315345764
W : [[ 

In [27]:
# Accuarcy
# 정답
submission = pd.read_csv('../data/titanic/gender_submission.csv')
data = submission['Survived'].values.reshape(-1,1)

# predict용 node
predict = tf.cast(H > 0.5, dtype=tf.float32)

# predict와 val_t_data와 비교
correct = tf.equal(predict, T)

# 정확도 계산 node
accuracy = tf.reduce_mean(tf.cast(correct, dtype=tf.float32))

accuracy_val = sess.run(accuracy, feed_dict={X: test_x_data_f, T:data})
print('Model의 정확도 : {}'.format(accuracy_val))

Model의 정확도 : 0.9976076483726501


In [26]:
# 예측결과 저장
submission['Survived'] = sess.run(predict, feed_dict={X: test_x_data_f})
submission.to_csv('../data/titanic/submission.csv', index=False)