## 构建随机森林回归模型


导入工具库

In [20]:
import pandas as pd
from sklearn import preprocessing
from sklearn.ensemble import RandomForestRegressor
import numpy as np

加载数据

In [21]:

from sklearn.datasets import load_boston

In [22]:
# Load the Boston housing dataset object; its .feature_names, .data, .target
# and .DESCR attributes are read in the cells below.
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in
# 1.2 -- confirm the installed version still provides it before re-running.
boston_house = load_boston()

In [23]:
# Pull the pieces used later out of the loaded dataset object.
# Feature matrix and regression targets used for fitting and prediction:
boston_features = boston_house.data
boston_target = boston_house.target
# Column names of the feature matrix (displayed in the next cell):
boston_feature_name = boston_house.feature_names

In [24]:
boston_feature_name

array(['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD',
       'TAX', 'PTRATIO', 'B', 'LSTAT'], dtype='<U7')

查看数据描述

In [25]:
print(boston_house.DESCR)

.. _boston_dataset:

Boston house prices dataset
---------------------------

**Data Set Characteristics:**  

    :Number of Instances: 506 

    :Number of Attributes: 13 numeric/categorical predictive. Median Value (attribute 14) is usually the target.

    :Attribute Information (in order):
        - CRIM     per capita crime rate by town
        - ZN       proportion of residential land zoned for lots over 25,000 sq.ft.
        - INDUS    proportion of non-retail business acres per town
        - CHAS     Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
        - NOX      nitric oxides concentration (parts per 10 million)
        - RM       average number of rooms per dwelling
        - AGE      proportion of owner-occupied units built prior to 1940
        - DIS      weighted distances to five Boston employment centres
        - RAD      index of accessibility to radial highways
        - TAX      full-value property-tax rate per $10,000
        - PTRATIO  pu

In [26]:
boston_features

array([[6.3200e-03, 1.8000e+01, 2.3100e+00, ..., 1.5300e+01, 3.9690e+02,
        4.9800e+00],
       [2.7310e-02, 0.0000e+00, 7.0700e+00, ..., 1.7800e+01, 3.9690e+02,
        9.1400e+00],
       [2.7290e-02, 0.0000e+00, 7.0700e+00, ..., 1.7800e+01, 3.9283e+02,
        4.0300e+00],
       ...,
       [6.0760e-02, 0.0000e+00, 1.1930e+01, ..., 2.1000e+01, 3.9690e+02,
        5.6400e+00],
       [1.0959e-01, 0.0000e+00, 1.1930e+01, ..., 2.1000e+01, 3.9345e+02,
        6.4800e+00],
       [4.7410e-02, 0.0000e+00, 1.1930e+01, ..., 2.1000e+01, 3.9690e+02,
        7.8800e+00]])

In [27]:
boston_target

array([24. , 21.6, 34.7, 33.4, 36.2, 28.7, 22.9, 27.1, 16.5, 18.9, 15. ,
       18.9, 21.7, 20.4, 18.2, 19.9, 23.1, 17.5, 20.2, 18.2, 13.6, 19.6,
       15.2, 14.5, 15.6, 13.9, 16.6, 14.8, 18.4, 21. , 12.7, 14.5, 13.2,
       13.1, 13.5, 18.9, 20. , 21. , 24.7, 30.8, 34.9, 26.6, 25.3, 24.7,
       21.2, 19.3, 20. , 16.6, 14.4, 19.4, 19.7, 20.5, 25. , 23.4, 18.9,
       35.4, 24.7, 31.6, 23.3, 19.6, 18.7, 16. , 22.2, 25. , 33. , 23.5,
       19.4, 22. , 17.4, 20.9, 24.2, 21.7, 22.8, 23.4, 24.1, 21.4, 20. ,
       20.8, 21.2, 20.3, 28. , 23.9, 24.8, 22.9, 23.9, 26.6, 22.5, 22.2,
       23.6, 28.7, 22.6, 22. , 22.9, 25. , 20.6, 28.4, 21.4, 38.7, 43.8,
       33.2, 27.5, 26.5, 18.6, 19.3, 20.1, 19.5, 19.5, 20.4, 19.8, 19.4,
       21.7, 22.8, 18.8, 18.7, 18.5, 18.3, 21.2, 19.2, 20.4, 19.3, 22. ,
       20.3, 20.5, 17.3, 18.8, 21.4, 15.7, 16.2, 18. , 14.3, 19.2, 19.6,
       23. , 18.4, 15.6, 18.1, 17.4, 17.1, 13.3, 17.8, 14. , 14.4, 13.4,
       15.6, 11.8, 13.8, 15.6, 14.6, 17.8, 15.4, 21

### 构建模型

In [28]:
help(RandomForestRegressor)

Help on class RandomForestRegressor in module sklearn.ensemble._forest:

class RandomForestRegressor(ForestRegressor)
 |  A random forest regressor.
 |  
 |  A random forest is a meta estimator that fits a number of classifying
 |  decision trees on various sub-samples of the dataset and uses averaging
 |  to improve the predictive accuracy and control over-fitting.
 |  The sub-sample size is controlled with the `max_samples` parameter if
 |  `bootstrap=True` (default), otherwise the whole dataset is used to build
 |  each tree.
 |  
 |  Read more in the :ref:`User Guide <forest>`.
 |  
 |  Parameters
 |  ----------
 |  n_estimators : int, default=100
 |      The number of trees in the forest.
 |  
 |      .. versionchanged:: 0.22
 |         The default value of ``n_estimators`` changed from 10 to 100
 |         in 0.22.
 |  
 |  criterion : {"mse", "mae"}, default="mse"
 |      The function to measure the quality of a split. Supported criteria
 |      are "mse" for the mean squared er

In [29]:
# Build a random forest of 5 trees and fit it on the full feature matrix and
# targets in one step; fit() hands back the estimator, which is kept as `rgs`.
rgs = RandomForestRegressor(n_estimators=5).fit(boston_features, boston_target)

In [30]:
rgs

RandomForestRegressor(n_estimators=5)

In [36]:
# Predict on the same samples the forest was fitted on, so the error reported
# below is a training-set error, not an estimate of generalization error.
pred = rgs.predict(boston_features)
# Show every prediction next to its true target value.
for predicted, actual in zip(pred, boston_target):
    print(predicted, '-->', actual)
# The square root of the mean squared error is the ROOT mean squared error
# (RMSE), so it is labelled "均方根误差" rather than "均方误差" (MSE).
print("均方根误差", np.sqrt(((pred - boston_target) ** 2).mean()))

26.580000000000002 --> 24.0
22.059999999999995 --> 21.6
34.5 --> 34.7
34.18 --> 33.4
36.620000000000005 --> 36.2
27.22 --> 28.7
22.1 --> 22.9
23.000000000000004 --> 27.1
16.56 --> 16.5
18.880000000000003 --> 18.9
18.94 --> 15.0
18.9 --> 18.9
21.7 --> 21.7
20.060000000000002 --> 20.4
18.2 --> 18.2
19.9 --> 19.9
22.5 --> 23.1
17.72 --> 17.5
21.5 --> 20.2
18.6 --> 18.2
13.24 --> 13.6
19.6 --> 19.6
16.68 --> 15.2
14.14 --> 14.5
15.6 --> 15.6
14.780000000000001 --> 13.9
16.48 --> 16.6
14.800000000000002 --> 14.8
19.6 --> 18.4
21.64 --> 21.0
14.079999999999998 --> 12.7
17.5 --> 14.5
13.2 --> 13.2
13.84 --> 13.1
13.440000000000001 --> 13.5
19.859999999999996 --> 18.9
19.48 --> 20.0
21.080000000000002 --> 21.0
22.78 --> 24.7
30.660000000000004 --> 30.8
33.24 --> 34.9
29.440000000000005 --> 26.6
24.34 --> 25.3
24.22 --> 24.7
20.660000000000004 --> 21.2
19.58 --> 19.3
19.9 --> 20.0
19.4 --> 16.6
15.559999999999999 --> 14.4
19.939999999999998 --> 19.4
19.7 --> 19.7
20.9 --> 20.5
24.88000000000000

### 使用决策树比较

In [32]:
from sklearn import tree

In [33]:
# Single decision-tree regressor with default settings, used for comparison
# with the random forest fitted earlier.
dtr = tree.DecisionTreeRegressor()
# Fit on the same features/targets. This call is the cell's last expression,
# so its return value is what the notebook displays.
dtr.fit(boston_features,boston_target)

DecisionTreeRegressor()

In [37]:
# Predict on the same samples the decision tree was fitted on, so the error
# reported below is a training-set error, not a generalization estimate.
pred = dtr.predict(boston_features)
# Show every prediction next to its true target value.
for predicted, actual in zip(pred, boston_target):
    print(predicted, '-->', actual)
# The square root of the mean squared error is the ROOT mean squared error
# (RMSE), so it is labelled "均方根误差" rather than "均方误差" (MSE).
print("均方根误差", np.sqrt(((pred - boston_target) ** 2).mean()))

24.0 --> 24.0
21.6 --> 21.6
34.7 --> 34.7
33.4 --> 33.4
36.2 --> 36.2
28.7 --> 28.7
22.9 --> 22.9
27.1 --> 27.1
16.5 --> 16.5
18.9 --> 18.9
15.0 --> 15.0
18.9 --> 18.9
21.7 --> 21.7
20.4 --> 20.4
18.2 --> 18.2
19.9 --> 19.9
23.1 --> 23.1
17.5 --> 17.5
20.2 --> 20.2
18.2 --> 18.2
13.6 --> 13.6
19.6 --> 19.6
15.2 --> 15.2
14.5 --> 14.5
15.6 --> 15.6
13.9 --> 13.9
16.6 --> 16.6
14.8 --> 14.8
18.4 --> 18.4
21.0 --> 21.0
12.7 --> 12.7
14.5 --> 14.5
13.2 --> 13.2
13.1 --> 13.1
13.5 --> 13.5
18.9 --> 18.9
20.0 --> 20.0
21.0 --> 21.0
24.7 --> 24.7
30.8 --> 30.8
34.9 --> 34.9
26.6 --> 26.6
25.3 --> 25.3
24.7 --> 24.7
21.2 --> 21.2
19.3 --> 19.3
20.0 --> 20.0
16.6 --> 16.6
14.4 --> 14.4
19.4 --> 19.4
19.7 --> 19.7
20.5 --> 20.5
25.0 --> 25.0
23.4 --> 23.4
18.9 --> 18.9
35.4 --> 35.4
24.7 --> 24.7
31.6 --> 31.6
23.3 --> 23.3
19.6 --> 19.6
18.7 --> 18.7
16.0 --> 16.0
22.2 --> 22.2
25.0 --> 25.0
33.0 --> 33.0
23.5 --> 23.5
19.4 --> 19.4
22.0 --> 22.0
17.4 --> 17.4
20.9 --> 20.9
24.2 --> 24.2
21.7 -