# 두번째 수행평가

In [1]:
#Titanic Data
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt

from scipy import stats
from sklearn.preprocessing import MinMaxScaler

In [2]:
train_data = pd.read_csv('../data/titanic/train.csv')
display(train_data)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


## 1. Data Preprocessing

-  PassengerId : 종속변수에 영향 X
- Survived : 종속변수 0,1 이므로 그냥 사용해도 된다
- Pclass : 1,2,3 영향 있다
- Name : 이름 자체는 생존여부에 영향을 미치지 않는다
- Sex : 성별 문자를 숫자로 변경해야 함
- Age : 나이에 결측치가 있다 Miss인 사람의 평균으로 or 전체 사람 평균, 0-99 범위를 가지는 실수값, 0-10 : 소아(0), 11-25 : 청년(1), 26-49 : 중년(2), 50- : 노년(3)
- Ticket : 굳이 포함시키지 않아도 될것 같음
- Fare : 여러 요소에 영향을 받음(class, 탑승지역...)
- Cabin : 객실번호
- Embarked : 데이터 분석해보면 생존여부에 영향이 있다

In [3]:
# 결측치 확인
print(train_data.isnull().sum())

# Age에 177건, Cabin에 687건, Embarked에 2건

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64


In [4]:
len(train_data['Name'].unique())
# 이름이 같은 사람은 없음

891

In [5]:
len(train_data['Ticket'].unique())
# Ticket은 겹치는게 있음 - 같이 탑승한 사람끼리 겹치는가?

681

In [6]:
len(train_data['Fare'].unique())
# Fare도 겹치는게 있는데, 여긴 영향 받는게 많아서 따로 봐야함

248

In [7]:
# Cabin, Ticket 제외
train_data.drop(['Cabin', 'Ticket'], axis=1, inplace=True)

In [8]:
display(train_data[train_data['Embarked'].isnull()])
display(train_data[train_data['Fare']==80.0])
# Embarked가 NaN인 두 건에 대해서 요금을 이용해 Embarked를 찾으려 했으나 
# 요금이 80인 사람도 유일하게 이 두명이라 이 데이터는 포기해야함

# Embarked 결측치 제거
train_data.dropna(subset=['Embarked'], inplace=True)
display(train_data)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Embarked
61,62,1,1,"Icard, Miss. Amelie",female,38.0,0,0,80.0,
829,830,1,1,"Stone, Mrs. George Nelson (Martha Evelyn)",female,62.0,0,0,80.0,


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Embarked
61,62,1,1,"Icard, Miss. Amelie",female,38.0,0,0,80.0,
829,830,1,1,"Stone, Mrs. George Nelson (Martha Evelyn)",female,62.0,0,0,80.0,


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,7.2500,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,71.2833,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,7.9250,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,53.1000,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,8.0500,S
...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,13.0000,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,30.0000,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,23.4500,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,30.0000,C


In [9]:
# Name에서 먼저 공백 (' ')을 제거하고
# 그 다음 이름 분리하고 (, 을 기준으로 앞이 이름 뒤가 성)
# 그 다음 성의 앞부분을 분리(. 을 기준 뒤가 성 앞이 Mr, Mrs Miss)
print('Mr' in train_data['Name'][5])

True


In [10]:
for i in train_data.index:
    name_list = train_data['Name'][i]
    
    if 'Mr' in name_list:
        train_data.loc[i, ["Name_m"]] = 'Mr'
        
    elif 'Mrs' in name_list:
        train_data.loc[i, ["Name_m"]] = 'Mrs'
        
    elif 'Miss' in name_list:
        train_data.loc[i, ["Name_m"]] = 'Miss'
        
    else:
        if train_data.loc[i,'Sex'] == 'male':
            train_data.loc[i, ["Name_m"]] = 'Mr'
        else:
            # 일단 여성을 Miss로 취급
            train_data.loc[i, ["Name_m"]] = 'Miss'

In [11]:
# 나이 결측치 처리
for i in train_data.loc[train_data['Age'].isnull()].index:
    if train_data.loc[i,'Name_m'] == 'Mr':
        # 남성이면 남성의 평균으로
        train_data.loc[i, ['Age']] = train_data.loc[train_data['Sex']=='male','Age'].mean()
    
    elif train_data.loc[i,'Name_m'] == 'Mrs':
        # Mrs면 Mrs의 평균으로
        train_data.loc[i, ['Age']] = train_data.loc[train_data['Name_m']=='Mrs','Age'].mean()
    
    elif train_data.loc[i,'Name_m'] == 'Miss':
        # Miss면 Miss의 평균으로
        train_data.loc[i, ['Age']] = train_data.loc[train_data['Name_m']=='Miss','Age'].mean()

In [12]:
# 성별 전처리 남성 0, 여성 1
for i in train_data.index:
    
    if train_data.loc[i,'Sex'] == 'male':
        train_data.loc[i,'Sex'] = 0
        
    else:
        train_data.loc[i,'Sex'] = 1

In [13]:
# Embarked 전처리 C = 0, Q = 1, S = 2 
for i in train_data.index:
    
    if train_data.loc[i,'Embarked'] == 'C':
        train_data.loc[i,'Embarked'] = 0
        
    elif train_data.loc[i,'Embarked'] == 'Q':
        train_data.loc[i,'Embarked'] = 1
        
    else:
        train_data.loc[i,'Embarked'] = 2

In [14]:
# 전처리를 위해 사용한 Name, Name_m column들 제거
train_data.drop(['Name', 'Name_m'], axis=1, inplace=True)

In [15]:
train_data

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,1,0,3,0,22.0,1,0,7.2500,2
1,2,1,1,1,38.0,1,0,71.2833,0
2,3,1,3,1,26.0,0,0,7.9250,2
3,4,1,1,1,35.0,1,0,53.1000,2
4,5,0,3,0,35.0,0,0,8.0500,2
...,...,...,...,...,...,...,...,...,...
886,887,0,2,0,27.0,0,0,13.0000,2
887,888,1,1,1,19.0,0,0,30.0000,2
888,889,0,3,1,22.0,1,2,23.4500,2
889,890,1,1,0,26.0,0,0,30.0000,0


In [18]:
display(train_data.corr())

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
PassengerId,1.0,-0.005028,-0.03533,0.034633,-0.057686,-0.001657,0.012703
Survived,-0.005028,1.0,-0.335549,-0.089708,-0.03404,0.083151,0.25529
Pclass,-0.03533,-0.335549,1.0,-0.338842,0.081656,0.016824,-0.548193
Age,0.034633,-0.089708,-0.338842,1.0,-0.244649,-0.181038,0.091161
SibSp,-0.057686,-0.03404,0.081656,-0.244649,1.0,0.414542,0.160887
Parch,-0.001657,0.083151,0.016824,-0.181038,0.414542,1.0,0.217532
Fare,0.012703,0.25529,-0.548193,0.091161,0.160887,0.217532,1.0


- 우선 Survived와 상관관계가 높은건 Pclass와 Fare
- Pclass와 Fare간의 상관관계가 있다 - Class가 작아지면(높은 Class면) Fare가 높아진다
- 이 Fare는 SibSp, Parchd와 상관관계가 있다 - 함께 탄 사람이 많을 수록 Fare가 높아진다
- Pclass와 SibSp, Parchd간의 상관관계는 없다
- 따라서 Fare는 Pclass, SibSp, Parchd에 영향을 받은 변수이다(?)
- 하지만 Survived와 상관관계를 살펴보면 Pclass가 더 상관계수가 크다
- 그럼 Pclass만을 사용할 것인가? 아님 상관계수는 작지만 다양한 정보를 가지고 있을 수 있는 Fare만 사용할 것인가?
- 아님 Pclass와 SibSp, Parchd를 Fare 대신으로 사용할 것인가?
- 이 3가지 모두 해보면 될것 같긴함
---
## 2-1. Fare만 사용하기

In [32]:
train_data_f = train_data[['Survived', 'Sex', 'Age', 'Fare', 'Embarked']].copy()

x_data = train_data_f[['Sex', 'Age', 'Fare', 'Embarked']].values
t_data = train_data_f['Survived'].values.reshape(-1,1)

In [33]:
# 정규화
scaler_x = MinMaxScaler()
scaler_x.fit(x_data)
scaled_x_data = scaler_x.transform(x_data)

In [34]:
x_dim = x_data.shape[1]

# placehlode
X = tf.placeholder(shape=[None, x_dim], dtype=tf.float32)
T = tf.placeholder(shape=[None, 1], dtype=tf.float32)

# W & b
W = tf.Variable(tf.random.normal([x_dim, 1]), name='weight')
b = tf.Variable(tf.random.normal([1]), name='bias')

# H
logit = tf.matmul(X, W) + b
H = tf.sigmoid(logit)

# loss func
loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=logit, 
                                                              labels=T))

# train
train = tf.train.GradientDescentOptimizer(learning_rate=1e-4).minimize(loss)

# sess
sess = tf.Session()
sess.run(tf.global_variables_initializer())

In [35]:
# learning
for step in range(300000):
    
    _,W_val,b_val,loss_val = sess.run([train,W,b,loss], 
                                      feed_dict={X : scaled_x_data, T : t_data})
    
    if step % 30000 == 0:
        print('W : {}, b : {}, loss : {}'.format(W_val,b_val,loss_val))

W : [[ 0.26131278]
 [-0.08479415]
 [ 1.9319526 ]
 [ 1.0636238 ]], b : [-0.7617254], loss : 0.7290471792221069
W : [[ 0.4796221 ]
 [-0.20472774]
 [ 1.940749  ]
 [ 0.6727854 ]], b : [-1.0434504], loss : 0.6223654747009277
W : [[ 0.729001  ]
 [-0.24399799]
 [ 1.9597892 ]
 [ 0.48227617]], b : [-1.111597], loss : 0.5867899060249329
W : [[ 0.9561722 ]
 [-0.26248255]
 [ 1.9807792 ]
 [ 0.35260287]], b : [-1.1278691], loss : 0.5635631084442139
W : [[ 1.152707  ]
 [-0.27464354]
 [ 1.9986606 ]
 [ 0.24865592]], b : [-1.130858], loss : 0.5468807816505432
W : [[ 1.3208792 ]
 [-0.2842459 ]
 [ 2.015358  ]
 [ 0.16018336]], b : [-1.130858], loss : 0.5346922278404236
W : [[ 1.4645789 ]
 [-0.29234415]
 [ 2.029663  ]
 [ 0.08325367]], b : [-1.1278062], loss : 0.5257132649421692
W : [[ 1.5876188 ]
 [-0.29948092]
 [ 2.0439682 ]
 [ 0.01531545]], b : [-1.1242299], loss : 0.5190215110778809
W : [[ 1.6935188 ]
 [-0.3057394 ]
 [ 2.0582733 ]
 [-0.04478059]], b : [-1.1206536], loss : 0.5139859914779663
W : [[ 1.7848

In [41]:
# prediction
test_data = pd.read_csv('../data/titanic/test.csv')

In [42]:
# 똑같이 데이터 정제
print(test_data.isnull().sum())

# Cabin, Ticket 제외
test_data.drop(['Cabin', 'Ticket'], axis=1, inplace=True)

# Fare 결측치 제거
test_data.dropna(subset=['Embarked'], inplace=True)

PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64


In [43]:
# 나이 결측치 처리용 cloumn 만들기
for i in test_data.index:
    name_list = test_data['Name'][i]
    
    if 'Mr' in name_list:
        test_data.loc[i, ["Name_m"]] = 'Mr'
        
    elif 'Mrs' in name_list:
        test_data.loc[i, ["Name_m"]] = 'Mrs'
        
    elif 'Miss' in name_list:
        test_data.loc[i, ["Name_m"]] = 'Miss'
        
    else:
        if test_data.loc[i,'Sex'] == 'male':
            test_data.loc[i, ["Name_m"]] = 'Mr'
        else:
            # 일단 여성을 Miss로 취급
            test_data.loc[i, ["Name_m"]] = 'Miss'
            
# 나이 결측치 처리
for i in test_data.loc[test_data['Age'].isnull()].index:
    if test_data.loc[i,'Name_m'] == 'Mr':
        # 남성이면 남성의 평균으로
        test_data.loc[i, ['Age']] = test_data.loc[test_data['Sex']=='male','Age'].mean()
    
    elif test_data.loc[i,'Name_m'] == 'Mrs':
        # Mrs면 Mrs의 평균으로
        test_data.loc[i, ['Age']] = test_data.loc[test_data['Name_m']=='Mrs','Age'].mean()
    
    elif test_data.loc[i,'Name_m'] == 'Miss':
        # Miss면 Miss의 평균으로
        test_data.loc[i, ['Age']] = test_data.loc[test_data['Name_m']=='Miss','Age'].mean()

In [44]:
# 성별 전처리 남성 0, 여성 1
for i in test_data.index:
    
    if test_data.loc[i,'Sex'] == 'male':
        test_data.loc[i,'Sex'] = 0
        
    else:
        test_data.loc[i,'Sex'] = 1
        
        
# Embarked 전처리 C = 0, Q = 1, S = 2 
for i in test_data.index:
    
    if test_data.loc[i,'Embarked'] == 'C':
        test_data.loc[i,'Embarked'] = 0
        
    elif test_data.loc[i,'Embarked'] == 'Q':
        test_data.loc[i,'Embarked'] = 1
        
    else:
        test_data.loc[i,'Embarked'] = 2
        
# 전처리를 위해 사용한 Name, Name_m column들 제거
test_data.drop(['Name', 'Name_m'], axis=1, inplace=True)

In [45]:
display(test_data)

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,892,3,0,34.500000,0,0,7.8292,1
1,893,3,1,47.000000,1,0,7.0000,2
2,894,2,0,62.000000,0,0,9.6875,1
3,895,3,0,27.000000,0,0,8.6625,2
4,896,3,1,22.000000,1,1,12.2875,2
...,...,...,...,...,...,...,...,...
413,1305,3,0,30.272732,0,0,8.0500,2
414,1306,1,1,39.000000,0,0,108.9000,0
415,1307,3,0,38.500000,0,0,7.2500,2
416,1308,3,0,30.272732,0,0,8.0500,2


In [47]:
# 데이터 정규화
test_data_f = test_data[['Sex', 'Age', 'Fare', 'Embarked']].copy()
test_data_set = test_data_f[['Sex', 'Age', 'Fare', 'Embarked']].values
display(test_data_f.describe())
display(train_data_f.describe())

Unnamed: 0,Age,Fare
count,418.0,417.0
mean,29.977181,35.627188
std,12.727216,55.907576
min,0.17,0.0
25%,22.039846,7.8958
50%,30.0,14.4542
75%,35.75,31.5
max,76.0,512.3292


Unnamed: 0,Survived,Age,Fare
count,889.0,889.0,889.0
mean,0.382452,29.504642,32.096681
std,0.48626,13.065737,49.697504
min,0.0,0.42,0.0
25%,0.0,22.0,7.8958
50%,0.0,30.0,14.4542
75%,1.0,35.0,31.0
max,1.0,80.0,512.3292


In [None]:
scaler_x.fit(test_data_set)
scaled_test_data = scaler_x.transform(x_data)

scaled_test_data = scaler_x.transform(test_data)
print(sess.run(H, feed_dict={X : scaled_test_data}))