## 随机森林训练登月数据

* RandomForestClassifier
* RandomForestRegressor

In [10]:
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_moons
import numpy as np

In [4]:
X, y = make_moons(n_samples=500, noise=0.30, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [6]:
from sklearn.ensemble import RandomForestClassifier

rf_clf = RandomForestClassifier(n_estimators=500, max_leaf_nodes=16, n_jobs=-1)
rf_clf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=16, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=500,
                       n_jobs=-1, oob_score=False, random_state=None, verbose=0,
                       warm_start=False)

In [8]:
y_pred_rf = rf_clf.predict(X_test)

In [11]:
np.sum(y_test == y_pred_rf) / len(y_test) # 计算准确率

0.92

## 极端随机树

* ExtraTreesClassifier
* 极端随机树在分裂时对每个特征随机使用阈值，而非搜索得出最佳阈值

In [12]:
from sklearn.ensemble import ExtraTreesClassifier

et_clf = ExtraTreesClassifier(n_estimators=500, max_leaf_nodes=16, n_jobs=-1)
et_clf.fit(X_train, y_train)

ExtraTreesClassifier(bootstrap=False, ccp_alpha=0.0, class_weight=None,
                     criterion='gini', max_depth=None, max_features='auto',
                     max_leaf_nodes=16, max_samples=None,
                     min_impurity_decrease=0.0, min_impurity_split=None,
                     min_samples_leaf=1, min_samples_split=2,
                     min_weight_fraction_leaf=0.0, n_estimators=500, n_jobs=-1,
                     oob_score=False, random_state=None, verbose=0,
                     warm_start=False)

In [15]:
y_pred_et = et_clf.predict(X_test)

In [16]:
np.sum(y_test == y_pred_et) / len(y_test) # 计算准确率

0.92

## 特征重要性

* feature_importances_ 属性
* Sklearn 通过含该特征的树节点平均减少不纯度的程度衡量特征重要性。

In [17]:
from sklearn.datasets import load_iris

iris = load_iris()
rf_clf = RandomForestClassifier(n_estimators=500, n_jobs=-1)
rf_clf.fit(iris['data'], iris['target'])

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=500,
                       n_jobs=-1, oob_score=False, random_state=None, verbose=0,
                       warm_start=False)

In [18]:
for name, score in zip(iris['feature_names'], rf_clf.feature_importances_):
    print(name, score)

sepal length (cm) 0.09845848526823775
sepal width (cm) 0.024858802760713526
petal length (cm) 0.42708361042260257
petal width (cm) 0.4495991015484461


In [19]:
rf_clf.feature_importances_

array([0.09845849, 0.0248588 , 0.42708361, 0.4495991 ])