# Titanic: Machine Learning from Disaster
### _陳佳吟 Chia-Yin Chen_
### _2017-11-05_
---

# 1 Introduction
這份報告專注在機器學習模型的選擇及參數上的調整，使用Python的Scikit-Learn套件來實作。在這邊將用我已在R中做過[資料預處理(特徵工程及遺漏值處理的部分)](https://chiayinchen.github.io/Kaggle/titanic/analysis_with_randomForest)的titanic資料集，運用機器學習(machine learning)技術來預測鐵達尼號沉船後哪些乘客會存活下來。
# 2 Feature selection

In [1]:
import numpy as np
import pandas as pd

# 資料讀取
full_data = pd.read_csv("data\\titanic_data.csv")

full_data["travel_group"] = full_data["travel_group"].astype(str)
print(full_data.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 20 columns):
PassengerId     1309 non-null int64
Survived        891 non-null float64
Pclass          1309 non-null int64
Name            1309 non-null object
Sex             1309 non-null object
Age             1309 non-null float64
SibSp           1309 non-null int64
Parch           1309 non-null int64
Ticket          1309 non-null object
Fare            1309 non-null float64
Cabin           295 non-null object
Embarked        1309 non-null object
Set             1309 non-null object
Title           1309 non-null object
Child           1309 non-null object
identity        1309 non-null object
family_size     1309 non-null int64
family_type     1309 non-null object
travel_group    1309 non-null object
group_size      1309 non-null int64
dtypes: float64(3), int64(6), object(11)
memory usage: 204.6+ KB
None


In [2]:
# 轉換類別型變數為數值（One-hot encoding）
dummies_variables = pd.get_dummies(full_data.loc[:, ["Sex", "Embarked", "Title", "Child", "family_type"]]).astype("int64")

full_data = full_data.join(dummies_variables)
full_data = full_data.drop(["Sex", "Embarked", "Title", "Child", "family_type"], axis=1)

# 整理 features 跟 target
train = full_data[full_data.Set == "Train"]
train["Survived"] = train["Survived"].astype(int)
train["Survived"] = train["Survived"].astype(str)
test = full_data[full_data.Set == "Test"]

# define training and testing sets
X_train = train.drop(["PassengerId", "Name", "Age", "SibSp", "Parch", "Ticket", "Cabin", "Set", "identity", "family_size", "travel_group", "Survived"], axis = 1)
Y_train = train["Survived"]
X_test = test.drop(["PassengerId", "Name", "Age", "SibSp", "Parch", "Ticket", "Cabin", "Set", "identity", "family_size", "travel_group", "Survived"], axis = 1)

X_train.shape, Y_train.shape, X_test.shape

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # Remove the CWD from sys.path while we load stuff.


((891, 20), (891,), (418, 20))

# 3 Classifier Comparison
## 3.1 支持向量機 (Support Vector Machine)

In [3]:
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score, ShuffleSplit

## Support Vector Machine
#svm_kernels = ["linear", "rbf"]

#for kernel in svm_kernels:
#    svm = SVC(kernel = kernel)
#    #cv = ShuffleSplit(n_splits = 10, test_size = 0.3, random_state = 87)
#    svm_acc = cross_val_score(svm, X_train, Y_train, cv = 5, scoring = "accuracy").mean()
#    print("[%s] accurary: %.2f%%" % (kernel, svm_acc * 100))


In [4]:
#svm = SVC(kernel = "linear")
#svm.fit(X_train, Y_train)
#Y_pred = svm.predict(X_test)

#submission = pd.DataFrame({
#    "PassengerId": test["PassengerId"],
#    "Survived": Y_pred
#})
#submission.to_csv("result\\svm_submission.csv", index = False)

In [5]:
from sklearn.metrics import make_scorer, accuracy_score
from sklearn.model_selection import GridSearchCV

svm = SVC()

parameters = [{"C": [1, 10, 100, 1000], "kernel": ["linear"]},
              {"C": [1, 10, 100, 1000], "gamma": [0.01, 1, 10, 100], "kernel": ["rbf"]}]
acc_scorer = make_scorer(accuracy_score)
svm_grid = GridSearchCV(svm, parameters, scoring = acc_scorer)
svm_grid = svm_grid.fit(X_train, Y_train)
print(svm_grid.best_estimator_)

SVC(C=100, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)


In [6]:
print(svm_grid.best_score_)
print(svm_grid.best_params_)

0.83164983165
{'C': 100, 'kernel': 'linear'}


In [7]:
svm = svm_grid.best_estimator_
svm.fit(X_train, Y_train)
Y_pred = svm.predict(X_test)

submission = pd.DataFrame({
    "PassengerId": test["PassengerId"],
    "Survived": Y_pred
})
submission.to_csv("result\\svm_grid_submission.csv", index = False)

## 3.2 羅吉斯迴歸 (Logistic Regression)

In [8]:
from sklearn.linear_model import LogisticRegression

# Logistic Regression
lr = LogisticRegression()
lr.fit(X_train, Y_train)
Y_pred = lr.predict(X_test)
print("Accuracy: %.2f%%" % (lr.score(X_train, Y_train) * 100))

Accuracy: 82.94%


In [9]:
submission = pd.DataFrame({
    "PassengerId": test["PassengerId"],
    "Survived": Y_pred
})
submission.to_csv("result\\lr_submission.csv", index = False)

## 3.3 K最近鄰 (K-Nearest Neighbor)

In [10]:
from sklearn.neighbors import KNeighborsClassifier

# k-Nearest Neighbor
ks = [3, 5, 7, 9, 11]
for k in ks:
    knn = KNeighborsClassifier(n_neighbors = k)
    knn_acc = cross_val_score(knn, X_train, Y_train, cv = 5, scoring = "accuracy").mean()
    print("[n_neighbors = %i] accurary: %.2f%%" % (k, knn_acc * 100))

[n_neighbors = 3] accurary: 78.57%
[n_neighbors = 5] accurary: 78.35%
[n_neighbors = 7] accurary: 79.92%
[n_neighbors = 9] accurary: 79.58%
[n_neighbors = 11] accurary: 77.45%


In [11]:
knn = KNeighborsClassifier(n_neighbors = 7)
knn.fit(X_train, Y_train)
Y_pred = knn.predict(X_test)

submission = pd.DataFrame({
    "PassengerId": test["PassengerId"],
    "Survived": Y_pred
})
submission.to_csv("result\\knn_submission.csv", index = False)

## 3.4 決策樹 (Decision Tree)

In [12]:
from sklearn.tree import DecisionTreeClassifier

# Decision Tree
criterions = ["gini", "entropy"]
max_depths = [2, 3, 4, 5]
for max_depth in max_depths:
    for criterion in criterions:
        tree = DecisionTreeClassifier(criterion = criterion, max_depth = max_depth)
        tree_acc = cross_val_score(tree, X_train, Y_train, cv = 5, scoring = "accuracy").mean()
        print("[%s with max_depth = %i] accurary: %.2f%%" % (criterion, max_depth, tree_acc * 100))

[gini with max_depth = 2] accurary: 78.34%
[entropy with max_depth = 2] accurary: 78.91%
[gini with max_depth = 3] accurary: 82.61%
[entropy with max_depth = 3] accurary: 83.17%
[gini with max_depth = 4] accurary: 82.38%
[entropy with max_depth = 4] accurary: 83.05%
[gini with max_depth = 5] accurary: 79.91%
[entropy with max_depth = 5] accurary: 80.25%


In [13]:
tree = DecisionTreeClassifier(criterion = "entropy", max_depth = 3)
tree.fit(X_train, Y_train)
Y_pred = tree.predict(X_test)

submission = pd.DataFrame({
    "PassengerId": test["PassengerId"],
    "Survived": Y_pred
})
submission.to_csv("result\\tree_submission.csv", index = False)

## 3.5 隨機森林 (Random Forest)

In [14]:
from sklearn.ensemble import RandomForestClassifier

## Random Forests
#estimators = [100, 200, 300, 500, 700, 1000]
#criterions = ["gini", "entropy"]
#max_depths = [2, 3, 4, 5]

#for criterion in criterions:
#    for estimator in estimators:
#        rf = RandomForestClassifier(max_depth = 3, n_estimators = estimator, criterion = criterion)
#        rf_acc = cross_val_score(rf, X_train, Y_train, cv = 5, scoring = "accuracy").mean()
#        print("[%s with %i estimator] accurary: %.2f%%" % (criterion, estimator, rf_acc * 100))


In [15]:
from sklearn.metrics import make_scorer, accuracy_score
from sklearn.model_selection import GridSearchCV

forest = RandomForestClassifier()

parameters = {"n_estimators": [50, 100, 200, 300], 
              "criterion": ["entropy", "gini"],
              "max_depth": [3, 5, 10, 15, 20], 
              "min_samples_split": [2, 5, 10, 15, 20]
             }
acc_scorer = make_scorer(accuracy_score)
rf_grid = GridSearchCV(forest, parameters, scoring = acc_scorer)
rf_grid = rf_grid.fit(X_train, Y_train)
print(rf_grid.best_estimator_)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=10, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=20, min_weight_fraction_leaf=0.0,
            n_estimators=300, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)


In [16]:
print(rf_grid.best_score_)
print(rf_grid.best_params_)

0.835016835017
{'criterion': 'entropy', 'max_depth': 10, 'min_samples_split': 20, 'n_estimators': 300}


In [17]:
rf = rf_grid.best_estimator_
rf.fit(X_train, Y_train)
Y_pred = rf.predict(X_test)

submission = pd.DataFrame({
    "PassengerId": test["PassengerId"],
    "Survived": Y_pred
})
submission.to_csv("result\\rf_grid_submission.csv", index = False)