# [範例重點]
了解隨機森林的建模方法及其中超參數的意義

In [1]:
from sklearn import datasets, metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

In [2]:
# Load the iris dataset.
iris = datasets.load_iris()

# Split into training and test sets: 25% is held out for testing, and the
# fixed random_state keeps the split identical across runs.
x_train, x_test, y_train, y_test = train_test_split(iris.data, iris.target, test_size=0.25, random_state=4)

# Build the model: 20 trees, each limited to a maximum depth of 4.
# random_state seeds the forest's own randomness (bootstrap sampling and the
# feature subset tried at each split); without it the accuracy and feature
# importances reported below change on every "Restart & Run All".
clf = RandomForestClassifier(n_estimators=20, max_depth=4, random_state=4)

# Train the model on the training split.
clf.fit(x_train, y_train)

# Predict class labels for the held-out test split.
y_pred = clf.predict(x_test)

In [3]:
# Fraction of test samples whose predicted label equals the true label.
acc = metrics.accuracy_score(y_true=y_test, y_pred=y_pred)
print("Accuracy: ", acc)

Accuracy:  0.9736842105263158


In [4]:
# Show the column names so the importances printed later can be matched to features.
print(iris["feature_names"])

['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']


In [5]:
# One importance value per input feature, in the same order as the feature names.
importances = clf.feature_importances_
print("Feature importance: ", importances)

Feature importance:  [0.11181223 0.03744344 0.3662185  0.48452583]


# [作業重點]
確保你了解隨機森林模型中每個超參數的意義，並觀察調整超參數對結果的影響

# 作業
1. 試著調整 RandomForestClassifier(...) 中的參數，並觀察是否會改變結果？
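2. 改用其他資料集 (boston, wine)，並與回歸模型與決策樹的結果進行比較（注意：`load_boston` 已自 scikit-learn 1.2 起移除，回歸問題可改用 `fetch_california_housing` 或 `load_diabetes`，並搭配 `RandomForestRegressor`）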

In [6]:
# Load the breast cancer dataset.
breast_cancer = datasets.load_breast_cancer()

# Split into training and test sets: 25% is held out for testing, and the
# fixed random_state keeps the split identical across runs.
x_train, x_test, y_train, y_test = train_test_split(breast_cancer.data, breast_cancer.target, test_size=0.25, random_state=4)

# Build the model.
# max_features: 'auto' was deprecated in scikit-learn 1.1 and removed in 1.3,
# where it raises a parameter-validation error. For classifiers 'auto' meant
# sqrt(n_features), so 'sqrt' keeps the same behaviour on old and new versions.
# random_state: seeds bootstrap sampling and per-split feature selection so
# the reported accuracy and feature importances are reproducible.
clf = RandomForestClassifier(n_estimators=20,
                             criterion='gini',
                             max_features='sqrt',
                             max_depth=10,
                             min_samples_split=5,
                             min_samples_leaf=2,
                             random_state=4)

# Train the model on the training split.
clf.fit(x_train, y_train)

# Predict class labels for the held-out test split.
y_pred = clf.predict(x_test)

In [7]:
# Fraction of test samples whose predicted label equals the true label.
acc = metrics.accuracy_score(y_true=y_test, y_pred=y_pred)
print("Accuracy: ", acc)

Accuracy:  0.9300699300699301


In [8]:
# Show the column names so the importances printed later can be matched to features.
print(breast_cancer["feature_names"])

['mean radius' 'mean texture' 'mean perimeter' 'mean area'
 'mean smoothness' 'mean compactness' 'mean concavity'
 'mean concave points' 'mean symmetry' 'mean fractal dimension'
 'radius error' 'texture error' 'perimeter error' 'area error'
 'smoothness error' 'compactness error' 'concavity error'
 'concave points error' 'symmetry error' 'fractal dimension error'
 'worst radius' 'worst texture' 'worst perimeter' 'worst area'
 'worst smoothness' 'worst compactness' 'worst concavity'
 'worst concave points' 'worst symmetry' 'worst fractal dimension']


In [9]:
# One importance value per input feature, in the same order as the feature names.
importances = clf.feature_importances_
print("Feature importance: ", importances)

Feature importance:  [3.38247231e-02 1.23729891e-02 6.12130170e-02 2.47660931e-02
 3.99564967e-03 2.44595958e-03 9.44910068e-03 4.55070846e-02
 1.72716224e-03 9.89439864e-04 4.37764682e-03 3.05317808e-03
 8.03479376e-03 1.14986037e-02 2.21618214e-03 5.17764836e-03
 1.80074572e-02 1.90504805e-03 1.68971754e-04 1.62637173e-03
 1.72403810e-01 3.62068553e-02 1.82653656e-01 1.47504387e-01
 5.90728504e-03 2.20582518e-02 1.65641183e-02 1.52876071e-01
 7.28762328e-03 4.18082293e-03]
