# Lab03: Decision Trees & Random Forests
<hr>

110062802 呂宸漢


## 1. Load Data and Make Predictions with a Random Forest


In [1]:
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Load the breast-cancer dataset once. The Bunch is kept around because a later
# cell reads init_data['feature_names']; X and y are taken from the same object
# instead of calling the loader a second time.
init_data = load_breast_cancer()
X, y = init_data.data, init_data.target

# Hold out 30% of the samples for testing (fixed seed for reproducibility).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

# Train a RandomForestClassifier on all features.
forest = RandomForestClassifier(criterion='entropy',
                                n_estimators=200,
                                random_state=1,
                                n_jobs=2)
forest.fit(X_train, y_train)

# Score the held-out set once and reuse the value for both report lines.
y_pred = forest.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print('Accuracy: %.2f' % accuracy)
# Accuracy divided by the number of input features (X_test.shape[1]).
print('Accuracy per feature: %.2f' % (accuracy / X_test.shape[1]))


Accuracy: 0.98
Accuracy per feature: 0.03


## 2. Get Feature Importances


In [2]:
import numpy as np

# Importances reported by the fitted forest, and the matching feature names.
importances = forest.feature_importances_
feature_names = init_data['feature_names']

# Feature positions ordered from most to least important.
indices = np.argsort(importances)[::-1]

# Print a 1-based ranking: rank, name left-aligned in 30 columns, importance.
for rank, idx in enumerate(indices, start=1):
    print("%2d) %-*s %f" % (rank, 30, feature_names[idx], importances[idx]))


 1) worst concave points           0.141849
 2) mean concave points            0.117697
 3) worst radius                   0.110919
 4) worst perimeter                0.105243
 5) mean concavity                 0.077163
 6) worst area                     0.073364
 7) area error                     0.041760
 8) mean perimeter                 0.036943
 9) mean radius                    0.035625
10) worst concavity                0.035205
11) mean area                      0.032890
12) worst texture                  0.022736
13) mean texture                   0.018584
14) worst compactness              0.016786
15) radius error                   0.015408
16) worst symmetry                 0.014770
17) worst smoothness               0.014235
18) mean compactness               0.010085
19) perimeter error                0.010040
20) worst fractal dimension        0.009015
21) concave points error           0.008085
22) compactness error              0.007420
23) mean smoothness             

## 3. Reduce Features


In [3]:
from sklearn.feature_selection import SelectFromModel

# Wrap the already-fitted forest (prefit=True, so it is not refit here) and
# keep only the columns whose importance reaches the 0.117 threshold.
sfm = SelectFromModel(forest, threshold=0.117, prefit=True)
X_train_reduced, X_test_reduced = (sfm.transform(part) for part in (X_train, X_test))

# The retained columns are the highest-importance ones, so the first n_kept
# entries of the descending ranking are exactly the selected features.
n_kept = X_train_reduced.shape[1]
for rank, idx in enumerate(indices[:n_kept], start=1):
    print("%2d) %-*s %f" % (rank, 30, feature_names[idx], importances[idx]))


 1) worst concave points           0.141849
 2) mean concave points            0.117697


## 4. Use Reduced Features to Train Model


In [4]:
# Fit a fresh forest with the same hyper-parameters rather than refitting
# `forest` in place. `sfm` wraps `forest` with prefit=True and an earlier cell
# reads forest.feature_importances_, so overwriting it with a model trained on
# the reduced columns would break re-running those cells.
forest_reduced = RandomForestClassifier(**forest.get_params())
forest_reduced.fit(X_train_reduced, y_train)

# Score the held-out set once and reuse the value for both report lines.
y_pred = forest_reduced.predict(X_test_reduced)
accuracy_reduced = accuracy_score(y_test, y_pred)
print('Accuracy: %.2f' % accuracy_reduced)
# Accuracy divided by the number of retained features.
print('Accuracy per feature: %.2f' % (accuracy_reduced / X_test_reduced.shape[1]))


Accuracy: 0.89
Accuracy per feature: 0.44
