# 使用分類演算法預測鐵達尼號生存率
 *黃啟維*
### 讀入資料 
 使用 pandas 套件的 read_csv() 方法讀入資料集。   

In [4]:
import pandas as pd

# Remote CSV that holds the Kaggle Titanic training set.
url = "https://storage.googleapis.com/py_ds_basic/kaggle_titanic_train.csv"

# Download and parse the file into a DataFrame.
titanic = pd.read_csv(filepath_or_buffer=url)

# Preview the first five rows (the default of head()); the Cabin column
# already shows missing values (NaN).
titanic.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


### 了解變數意義

針對資料集中看不懂的變數(例如:Pclass)，可到[Data - Titanic: Machine Learning from Disaster](https://www.kaggle.com/c/titanic/data)了解變數的意義。

In [5]:
# Descriptive statistics of the training DataFrame.
titanic.describe() # Age has a lower count than the other columns, so it contains NaN



Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,,0.0,0.0,7.9104
50%,446.0,0.0,3.0,,0.0,0.0,14.4542
75%,668.5,1.0,3.0,,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


## 填補年齡欄位的遺漏值
使用中位數來填補 Age 變數的遺漏值。


In [6]:
import numpy as np
import matplotlib.pyplot as plt

age_column = titanic["Age"]

# nanmedian ignores NaN entries, whereas a plain median would be affected by them.
age_median = np.nanmedian(age_column)

# Where the age is missing use the median, otherwise keep the recorded value.
missing_age = age_column.isnull()
new_Age = np.where(missing_age, age_median, age_column)

# Write the imputed values back and print the refreshed summary statistics.
titanic["Age"] = new_Age
summary = titanic.describe()
print(summary)

# Histogram of Age after imputation.
plt.hist(titanic["Age"])
plt.show()


       PassengerId    Survived      Pclass         Age       SibSp  \
count   891.000000  891.000000  891.000000  891.000000  891.000000   
mean    446.000000    0.383838    2.308642   29.361582    0.523008   
std     257.353842    0.486592    0.836071   13.019697    1.102743   
min       1.000000    0.000000    1.000000    0.420000    0.000000   
25%     223.500000    0.000000    2.000000   22.000000    0.000000   
50%     446.000000    0.000000    3.000000   28.000000    0.000000   
75%     668.500000    1.000000    3.000000   35.000000    1.000000   
max     891.000000    1.000000    3.000000   80.000000    8.000000   

            Parch        Fare  
count  891.000000  891.000000  
mean     0.381594   32.204208  
std      0.806057   49.693429  
min      0.000000    0.000000  
25%      0.000000    7.910400  
50%      0.000000   14.454200  
75%      0.000000   31.000000  
max      6.000000  512.329200  


In [7]:
# Count passengers per port of embarkation; "S" turns out to be the most frequent.
embarked_counts = titanic["Embarked"].value_counts()
print(embarked_counts)

# Replace missing ports with "S" and keep the recorded value everywhere else.
embarked = titanic["Embarked"]
new_Embarked = np.where(embarked.isnull(), "S", embarked)


S    644
C    168
Q     77
Name: Embarked, dtype: int64


### 建立dummy variables(擬變數)
因為我們使用的 Logistic 迴歸無法直接處理類別型變數(例如 Sex、Embarked)，因此須先將這些變數轉為模型可辨識的數值(此處以 LabelEncoder 將每個類別對應到一個整數編碼)。

In [8]:
from sklearn.preprocessing import LabelEncoder

# A single encoder instance is re-fitted for each column in turn, so after this
# cell it only remembers the labels of the last column it was fitted on.
label_encoder = LabelEncoder()

# fit_transform learns the distinct labels and returns their integer codes.
encoded_Sex = label_encoder.fit_transform(titanic.Sex)
encoded_Embarked = label_encoder.fit_transform(new_Embarked)

# Show the encoded arrays.
for encoded in (encoded_Sex, encoded_Embarked):
    print(encoded)


[1 0 0 0 1 1 1 1 0 0 0 0 1 1 0 0 1 1 0 0 1 1 0 1 0 0 1 1 0 1 1 0 0 1 1 1 1
 1 0 0 0 0 1 0 0 1 1 0 1 0 1 1 0 0 1 1 0 1 0 1 1 0 1 1 1 1 0 1 0 1 1 0 1 1
 1 1 1 1 1 0 1 1 0 1 0 0 1 1 0 1 1 1 1 1 1 1 1 1 0 1 0 1 1 1 1 1 0 1 1 0 1
 0 1 0 0 1 1 1 1 0 1 1 1 0 1 1 1 1 0 1 1 1 0 0 1 1 0 1 1 1 0 0 0 1 1 1 1 0
 1 1 1 0 1 1 1 1 0 1 1 1 1 0 1 1 1 1 0 0 1 1 1 1 0 1 1 1 1 0 1 1 0 1 1 1 0
 1 0 1 1 1 0 1 0 1 0 0 1 1 0 0 1 1 1 1 1 0 1 1 0 1 1 0 1 1 1 0 0 1 0 1 1 1
 1 1 1 1 1 1 1 0 0 1 1 0 1 0 1 0 1 1 0 0 1 1 1 1 0 0 1 1 1 0 1 1 0 0 0 0 0
 0 1 1 1 1 0 1 1 1 0 0 1 1 0 1 0 0 0 1 1 0 1 1 1 1 1 1 1 1 1 0 0 0 1 0 1 1
 1 0 1 0 0 1 1 0 1 1 0 0 1 0 0 0 0 1 1 0 0 1 0 0 1 1 0 0 1 0 1 0 0 0 0 1 1
 1 0 1 1 0 1 1 1 0 1 1 1 0 0 0 1 1 1 1 1 1 1 1 0 0 0 0 1 1 0 1 1 1 0 0 0 0
 1 1 1 1 0 0 0 1 1 1 0 0 1 0 1 1 1 0 1 0 1 1 1 0 0 1 0 1 1 0 1 1 0 1 0 1 1
 1 1 0 1 1 0 1 1 0 0 0 1 0 1 1 1 0 1 1 0 0 1 1 1 0 0 1 1 0 0 0 1 1 0 1 1 0
 1 1 0 1 0 1 1 1 1 1 1 1 1 0 0 1 1 1 1 1 1 1 1 1 1 0 1 1 0 0 0 1 1 1 1 0 1
 1 1 0 1 0 0 1 1 1 1 1 1 

### 建立X與y
Scikit-learn 完全使用矩陣運算。
$$y = Xw$$
使用Pclass,Sex,Age,Embarked做預測

In [15]:
# One entry per feature: passenger class, encoded sex, imputed age, encoded port.
feature_rows = [titanic.Pclass, encoded_Sex, new_Age, encoded_Embarked]

# The list yields one row per feature; transposing gives one row per passenger.
titanic_X = pd.DataFrame(feature_rows).transpose()

# Target vector: the Survived column.
titanic_y = titanic["Survived"]

### 切分訓練與測試資料
切分70%資料建立模型(訓練資料)，再用30%資料驗證模型(測試資料)

In [16]:
# train_test_split lives in sklearn.model_selection; the old
# sklearn.cross_validation module was removed in scikit-learn 0.20.
# The fallback keeps the cell working on installs that predate model_selection.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

# Hold out 30% of the rows for testing and train on the remaining 70%.
# No random_state is passed, so the split (and the resulting accuracy) changes
# from run to run.
train_X, test_X, train_y, test_y = train_test_split(titanic_X, titanic_y, test_size=0.3)

## 建立Logistic迴歸模型
使用Pclass, Sex, Age, Embarked預測

In [17]:
from sklearn import linear_model, metrics

# Create the classifier and fit it on the training split (fit returns the estimator itself).
logistic_regr = linear_model.LogisticRegression().fit(X=train_X, y=train_y)

# Predict survival for the held-out rows.
test_y_predicted = logistic_regr.predict(X=test_X)

# Fraction of held-out rows whose prediction matches the true label.
accuracy = metrics.accuracy_score(y_true=test_y, y_pred=test_y_predicted)
print(accuracy)

0.772388059701


## 將模型應用在沒有答案的資料

In [18]:

# Load the unlabeled Kaggle test set with read_csv.
url = "https://storage.googleapis.com/py_ds_basic/kaggle_titanic_test.csv"
submit = pd.read_csv(url)

# Apply the same preprocessing that was used for the training data.
# The encoders are fitted on the TRAINING columns and only transform() is used
# here, so every category gets the integer code the model was trained on.
# Re-fitting on the test data could assign different codes if the set of
# categories differs. transform() raises ValueError on a label that was never
# seen during training instead of silently mis-encoding it.
sex_encoder = LabelEncoder().fit(titanic.Sex)
embarked_encoder = LabelEncoder().fit(new_Embarked)

# Missing ports are filled with "S", exactly as was done for the training set.
submit_Embarked = np.where(submit.Embarked.isnull(), "S", submit.Embarked)

submit_encoded_Sex = sex_encoder.transform(submit.Sex)
submit_encoded_Embarked = embarked_encoder.transform(submit_Embarked)

# Impute missing ages with the training median (nanmedian ignores NaN), so the
# test rows are filled with the same value the model saw during training.
age_median = np.nanmedian(titanic.Age)
submit_new_Age = np.where(submit.Age.isnull(), age_median, submit.Age)

# Same feature order and construction as titanic_X.
submit_X = pd.DataFrame([submit.Pclass,
                         submit_encoded_Sex,
                         submit_new_Age,
                         submit_encoded_Embarked
]).T

# Predict and print the result for inspection.
submit_y = logistic_regr.predict(submit_X)
print(submit_y)

# Assemble the submission table: one prediction per PassengerId.
submit_dict = {
    "PassengerId": submit["PassengerId"],
    "Survived": submit_y
}
submit_df = pd.DataFrame(submit_dict)

# Write the table to CSV without the DataFrame index column.
submit_df.to_csv("submit.csv", index=False)

[0 0 0 0 1 0 1 0 1 0 0 0 1 0 1 1 0 0 1 1 0 0 1 1 1 0 1 0 0 0 0 0 1 1 1 0 1
 1 0 0 0 0 0 1 1 0 0 0 1 0 0 0 1 1 0 0 0 0 0 1 0 0 0 1 1 1 1 0 1 1 1 0 1 1
 1 1 0 1 0 1 0 0 0 0 0 0 1 1 1 0 1 0 1 0 1 0 1 0 1 0 1 0 0 0 1 0 0 0 0 0 0
 1 1 1 1 0 0 1 1 1 1 0 1 0 0 1 0 1 0 0 0 0 1 0 0 0 0 0 1 0 1 1 0 0 0 0 0 0
 0 0 1 0 0 0 0 0 1 1 0 1 1 0 1 0 0 1 0 0 1 1 0 0 0 0 0 1 1 0 1 1 0 1 1 0 1
 0 1 0 1 0 0 0 0 0 0 0 1 1 0 1 1 0 0 1 0 1 1 0 1 0 0 0 0 1 0 0 1 0 1 0 1 0
 1 0 1 1 0 1 0 0 0 1 0 0 1 0 0 0 1 1 1 1 0 0 0 0 1 0 1 1 1 0 1 0 0 0 0 0 1
 0 0 0 1 1 0 0 0 0 1 0 0 0 1 1 0 1 0 0 0 0 1 0 1 1 1 0 0 0 0 0 0 1 0 0 0 0
 1 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 1 1 1 0 0 0 0 0 0 0 0 1 0 1 0 0 0 1 1 0
 1 0 0 0 0 0 0 0 0 0 1 0 1 0 1 0 1 1 0 0 0 1 0 1 0 0 1 0 1 1 0 1 1 0 1 1 0
 0 1 0 0 1 1 1 0 0 0 0 0 1 1 0 1 0 0 0 0 0 1 0 0 0 1 0 1 0 0 1 0 1 0 0 0 0
 0 1 1 1 1 1 0 1 0 0 0]
