In [1]:
# D26_RandomForest_SVM
# 隨機森林 (Random Forest) 與 支持向量機 (Support vector machine, SVM)

In [16]:
# ================= 隨機森林 (Random Forest) ======================
"""
隨機森林演算法會對資料從列方向（觀測值方向）與欄方向（變數方向）進行 Bootstrap sampling，
得到不同的訓練資料，然後根據這些訓練資料得到一系列的決策樹分類器，假如產生了 5 個決策樹分類器，
它們對某個觀測值的預測結果分別為 1, 0, 1, 1, 1，那麼隨機森林演算法的輸出結果就會是 1，
這個過程與 Bagging 演算法相同，同樣稱為基本分類器的投票。
隨機森林演算法在面對變數具有多元共線性或者不平衡資料（Unbalanced data）的情況時，
是倍受青睞的演算法。
""";

In [8]:
import numpy as np
import pandas as pd
from sklearn import model_selection, ensemble, preprocessing, metrics

# 載入資料
data = "/Users/Ensyuan/IThelp/tonykuoyj/titanic_train.csv"
titanic_train = pd.read_csv(data)

# 填補遺漏值
age_median = np.nanmedian(titanic_train['Age'])
new_Age = np.where(titanic_train['Age'].isnull(), age_median, titanic_train['Age'])
titanic_train['Age'] = new_Age

# 創造 dummy variables # ??
label_encoder = preprocessing.LabelEncoder()
encoder_Sex = label_encoder.fit_transform(titanic_train['Sex'])

# 建立訓練與測試資料
titanic_X = pd.DataFrame([titanic_train['Pclass'],
                        encoder_Sex,
                        titanic_train['Age']
]).T

titanic_y = titanic_train['Survived']
train_X, test_X, train_y, test_y = model_selection.train_test_split(titanic_X, titanic_y, test_size = 0.3)

# 建立 random forest 模型
forest = ensemble.RandomForestClassifier(n_estimators = 100)
forest_fit = forest.fit(train_X, train_y)


# 預測
test_y_predicted = forest.predict(test_X)

# 績效
accuracy = metrics.accuracy_score(test_y, test_y_predicted)
print(accuracy)

0.8059701492537313


In [13]:
# ============== 支持向量機 (Support vector machine, SVM) ===================

"""
資料科學家將分類器在訓練樣本可能過度配適(overfitting)的風險稱為 Empirical risk，
分類器的推廣能力不足的風險稱為 Generalization risk，兩者的總和即為結構風險，
而 支持向量機 就是在兩者之間取得最佳平衡點，進而得到一個在訓練資料績效不錯，
亦能推廣適用的類似模型。
""";


In [33]:
import numpy as np
import pandas as pd
from sklearn import model_selection, svm, preprocessing, metrics

# 載入資料
data = "/Users/Ensyuan/IThelp/tonykuoyj/titanic_train.csv"
titanic_train = pd.read_csv(data)

# 填補遺漏值
age_median = np.nanmedian(titanic_train['Age'])
new_Age = np.where(titanic_train['Age'].isnull(), age_median, titanic_train['Age'])
titanic_train['Age'] = new_Age

# 創造 dummy variables # ??
label_encoder = preprocessing.LabelEncoder()
encoded_Sex = label_encoder.fit_transform(titanic_train['Sex'])

# 建立訓練與測試資料
titanic_X = pd.DataFrame([titanic_train['Pclass'],
                        encoder_Sex,
                        titanic_train['Age']
]).T

titanic_y = titanic_train['Survived']
train_X, test_X, train_y, test_y = model_selection.train_test_split(titanic_X, titanic_y, test_size = 0.3)

# 建立 SVC 模型
svc = svm.SVC()
svc_fit = svc.fit(train_X, train_y)

# 預測
test_y_predicted = svc.predict(test_X)

# 績效
accuracy = metrics.accuracy_score(test_y, test_y_predicted)
print(accuracy)

0.6305970149253731


In [17]:
# ============== AUC (Area Under Curve) ===================
"""
AUC 是一個常見指標，
它同時考慮假警報率（False alarm rate）與命中率（True positive rate），
AUC 愈接近 1，就表示分類效果愈好；愈接近 0.5 就表示分類效果愈不好。
"""


'\nAUC 是一個常見指標，\n它同時考慮假警報率（False alarm rate）與命中率（True positive rate），\nAUC 愈接近 1，就表示分類效果愈好；愈接近 0.5 就表示分類效果愈不好。\n'

In [22]:
# ============== 隨機森林分類器的 AUC ===================

import numpy as np
import pandas as pd
from sklearn import model_selection, ensemble, preprocessing, metrics

# 載入資料
data = "/Users/Ensyuan/IThelp/tonykuoyj/titanic_train.csv"
titanic_train = pd.read_csv(data)

# 填補遺漏值
age_median = np.nanmedian(titanic_train['Age'])
new_Age = np.where(titanic_train['Age'].isnull(), age_median, titanic_train['Age'])
titanic_train['Age'] = new_Age

# 創造 dummy variables # ??
label_encoder = preprocessing.LabelEncoder()
encoder_Sex = label_encoder.fit_transform(titanic_train['Sex'])

# 建立訓練與測試資料
titanic_X = pd.DataFrame([titanic_train['Pclass'],
                        encoder_Sex,
                        titanic_train['Age']
]).T

titanic_y = titanic_train['Survived']
train_X, test_X, train_y, test_y = model_selection.train_test_split(titanic_X, titanic_y, test_size = 0.3)

# 建立 random forest 模型
forest = ensemble.RandomForestClassifier(n_estimators = 100)
forest_fit = forest.fit(train_X, train_y)


# 預測
test_y_predicted = forest.predict(test_X)

# 績效 1
accuracy = metrics.accuracy_score(test_y, test_y_predicted)
print(accuracy)

# 績效 2
fpr, tpr, thresholds = metrics.roc_curve(test_y, test_y_predicted)
auc = metrics.auc(fpr, tpr)
print(auc)


0.832089552238806
0.8228797543113631


In [37]:
# ============== 支持向量機分類器的 AUC ===================

# 載入資料
data = "/Users/Ensyuan/IThelp/tonykuoyj/titanic_train.csv"
titanic_train = pd.read_csv(data)

# 填補遺漏值
age_median = np.nanmedian(titanic_train['Age'])
new_Age = np.where(titanic_train['Age'].isnull(), age_median, titanic_train['Age'])
titanic_train['Age'] = new_Age

# 創造 dummy variables # ??
label_encoder = preprocessing.LabelEncoder()
encoded_Sex = label_encoder.fit_transform(titanic_train['Sex'])

# 建立訓練與測試資料
titanic_X = pd.DataFrame([titanic_train['Pclass'],
                        encoder_Sex,
                        titanic_train['Age']
]).T

titanic_y = titanic_train['Survived']
train_X, test_X, train_y, test_y = model_selection.train_test_split(titanic_X, titanic_y, test_size = 0.3)

# 建立 SVC 模型
svc = svm.SVC()
svc_fit = svc.fit(train_X, train_y)

# 預測
test_y_predicted = svc.predict(test_X)

# 績效 1
accuracy = metrics.accuracy_score(test_y, test_y_predicted)
print(accuracy)

# 績效 2
fpr, tpr, thresholds = metrics.roc_curve(test_y, test_y_predicted)
auc = metrics.auc(fpr, tpr)
print(auc)

0.6268656716417911
0.5141666666666668
