## 用 Keras 建構模型來預測 Titanic 資料集，並將結果與 HW03 比較

### 1.資料缺失值處理與特徵選取(同HW3)

In [1]:
import pandas as pd
import numpy as np

%matplotlib inline

In [99]:
# Load the training set, the test set and the sample-submission file.
train, test, submit = (
    pd.read_csv(csv_name)
    for csv_name in ("train.csv", "test.csv", 'gender_submission.csv')
)

#### 觀察資料

In [100]:
# Preview the first rows of the training data.
train.head() 

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [101]:
train.info() # Age, Cabin and Embarked have missing values

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB


In [102]:
test.info() # Age, Fare and Cabin have missing values

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
PassengerId    418 non-null int64
Pclass         418 non-null int64
Name           418 non-null object
Sex            418 non-null object
Age            332 non-null float64
SibSp          418 non-null int64
Parch          418 non-null int64
Ticket         418 non-null object
Fare           417 non-null float64
Cabin          91 non-null object
Embarked       418 non-null object
dtypes: float64(2), int64(4), object(5)
memory usage: 36.0+ KB


In [103]:
# Summary statistics (count, mean, std, quartiles) for the numeric training columns.
train.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [104]:
# Summary statistics (count, mean, std, quartiles) for the numeric test columns.
test.describe()

Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare
count,418.0,418.0,332.0,418.0,418.0,417.0
mean,1100.5,2.26555,30.27259,0.447368,0.392344,35.627188
std,120.810458,0.841838,14.181209,0.89676,0.981429,55.907576
min,892.0,1.0,0.17,0.0,0.0,0.0
25%,996.25,1.0,21.0,0.0,0.0,7.8958
50%,1100.5,3.0,27.0,0.0,0.0,14.4542
75%,1204.75,3.0,39.0,1.0,0.0,31.5
max,1309.0,3.0,76.0,8.0,9.0,512.3292


#### 選特徵&補缺值

In [105]:
# Keep the labels aside before the feature tables are merged.
Y_train = train['Survived']
# Stack train and test so that imputation and encoding are applied to both in
# one pass; the combined table is split apart again before training.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0, so use
# pd.concat, the supported equivalent. sort=False keeps the column order as-is.
X_train = pd.concat([train, test], sort=False)
X_train.shape

(1309, 12)

In [106]:
# Combine the two relative-count columns (SibSp + Parch) into one new feature.
X_train['Family'] = X_train['SibSp'] + X_train['Parch']

# The source columns are no longer needed. Name the axis explicitly with
# ``columns=``: passing it positionally (drop('SibSp', 1)) was deprecated in
# pandas 1.3 and raises a TypeError from pandas 2.0 onwards.
X_train.drop(columns=['SibSp', 'Parch'], inplace=True)

In [107]:
# Feature selection: columns judged simple and important, plus those already
# analysed by the reference site.
selected_features = ['Pclass', 'Age', 'Family', 'Fare', 'Sex', 'Embarked']

# Fill missing values with simple rules (or following the reference site).
# TODO: analyse the data properly and handle each column individually.
# The filled column is assigned back rather than using fillna(inplace=True) on
# X_train[col]: that form mutates an intermediate selection, which emits a
# chained-assignment warning in pandas 2.x and leaves X_train unchanged once
# Copy-on-Write is enabled.
# NOTE(review): X_train here is the combined train+test table, so the Age mean
# and Fare median are computed over both partitions.
X_train['Age'] = X_train['Age'].fillna(X_train['Age'].mean())
X_train['Fare'] = X_train['Fare'].fillna(X_train['Fare'].median())
X_train['Embarked'] = X_train['Embarked'].fillna("C")

In [108]:
# Integer-encode the categorical columns via pandas category codes. Each column
# stays a single column of codes (label encoding); no dummy columns are created.
for _col in ('Pclass', 'Sex', 'Embarked'):
    X_train[_col] = X_train[_col].astype('category').cat.codes

In [109]:
# Prepare for training: split the combined table back into its train and test
# partitions. The boundary comes from the training set's own length instead of
# the hard-coded 891, so the split stays correct if the input file changes.
n_train = len(train)
X_test = X_train.iloc[n_train:]   # must be taken before X_train is rebound
X_train = X_train.iloc[:n_train]

X_train.shape, Y_train.shape, X_test.shape

((891, 11), (891,), (418, 11))

In [110]:
# Restrict both partitions to the chosen feature columns, then preview the result.
X_train, X_test = (frame[selected_features] for frame in (X_train, X_test))
X_train.head()

Unnamed: 0,Pclass,Age,Family,Fare,Sex,Embarked
0,2,22.0,1,7.25,1,2
1,0,38.0,1,71.2833,0,0
2,2,26.0,0,7.925,0,2
3,0,35.0,1,53.1,0,2
4,2,35.0,0,8.05,1,2


### 2.DecisionTree(同HW3)

In [111]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_graphviz

# Fit a decision tree with its depth capped at 3 on the training features and labels.
tree_clf = DecisionTreeClassifier(max_depth=3)
tree_clf.fit(X_train, Y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=3,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [112]:
# Predict labels for the test features with the fitted tree.
# NOTE(review): `predict` is not referenced again in the visible part of this notebook.
predict = tree_clf.predict(X_test)

#### 成績
![img](https://i.imgur.com/D8Ue5wD.jpg)

### 3.Keras
#### [參考資料1.將TensorFlow與Keras安裝在Windows系統](http://tensorflowkeras.blogspot.com/2017/08/tensorflowkeraswindows_29.html)
#### [參考資料2.Keras深度學習(Deep Learning) 預測鐵達尼號旅客生存機率](http://tensorflowkeras.blogspot.com/2017/09/kerasdeep-learning_32.html)

In [113]:
from keras.models import Sequential
from keras.optimizers import Adam 
from keras.layers import Dense, Activation

In [114]:
# Build the model: start from an empty Sequential container.
model = Sequential()

In [115]:
# First layer: 20 ReLU units. The input width is read from the feature matrix
# instead of hard-coding 6, so it always matches the columns passed to fit().
model.add(Dense(units=20, input_dim=X_train.shape[1], kernel_initializer='uniform', activation='relu'))
# Second layer: 30 ReLU units.
model.add(Dense(units=30, kernel_initializer='uniform', activation='relu'))
# Output layer: a single sigmoid unit.
model.add(Dense(units=1, kernel_initializer='uniform', activation='sigmoid'))

In [116]:
# Compile the model for training: binary cross-entropy loss, the Adam optimizer,
# and accuracy reported as a metric. No training happens in this call.
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [123]:
# Train for 50 epochs, holding out 20% of the samples as a validation split.
model.fit(X_train, Y_train, validation_split=0.2, epochs=50)

Train on 712 samples, validate on 179 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x1d716d0f5c0>

In [124]:
# Evaluate the model.
# NOTE(review): this scores the same X_train/Y_train that were passed to fit(),
# so the number is a training-set accuracy, not a held-out test score.
# The model was compiled with metrics=['accuracy'], so evaluate() returns
# [loss, accuracy]; acc[1] is the accuracy.
acc = model.evaluate(X_train, Y_train)
acc[1]



0.8226711562051784

In [125]:
# Predict on the test set: threshold the model outputs at 0.5 to obtain 0/1
# labels, then flatten the result into a 1-D array.
survived_predict = (model.predict(X_test) > 0.5).astype(int).flatten()
survived_predict

array([0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0,
       1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1,
       1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1,
       1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0,
       1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
       0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,

In [126]:
# Write the predictions in the required submission format
# (columns PassengerId and Survived, no index column).
submission = pd.DataFrame({
    'PassengerId': test['PassengerId'],
    'Survived': survived_predict,
})
submission.to_csv('submission.csv', index=False)

#### 成績 >> 相較HW3決策樹結果沒有改善
![img](https://i.imgur.com/7OKadBp.jpg)