# 决策树解决回归问题

In [1]:
import matplotlib.pyplot as plt
import numpy as np
from sklearn import datasets

In [2]:
# 数据加载
boston = datasets.load_boston()
x = boston.data
y = boston.target

In [3]:
# 数据split
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state = 666)

## 使用基本的sklearn中的decision tree regression

In [4]:
from sklearn.tree import DecisionTreeRegressor

dt_reg = DecisionTreeRegressor()
dt_reg.fit(x_train, y_train)
print("train score : ", dt_reg.score(x_train, y_train))
print("test score : ", dt_reg.score(x_test, y_test))

train score :  1.0
test score :  0.5950249082432981


## 参数调节

In [10]:
from sklearn.model_selection import GridSearchCV

grid_param = [
    {
        "max_depth":[i for i in range(1,20)],
        "max_leaf_nodes":[i for i in range(3,12)],
        "max_features":[i for i in range(3,12)],
        "min_samples_split":[i for i in range(3,12)],
        "min_samples_leaf":[i for i in range(3,12)],
        "min_weight_fraction_leaf":np.arange(0, 0.5, 0.05)
    }
]

In [11]:
dt_reg_search = DecisionTreeRegressor()
grid_search = GridSearchCV(dt_reg_search, param_grid=grid_param, n_jobs=-1, verbose=1, cv=5)
grid_search.fit(x_train, y_train)

Fitting 5 folds for each of 1246590 candidates, totalling 6232950 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=-1)]: Done  38 tasks      | elapsed:    9.3s
[Parallel(n_jobs=-1)]: Done 8354 tasks      | elapsed:   23.1s
[Parallel(n_jobs=-1)]: Done 48354 tasks      | elapsed:   33.4s
[Parallel(n_jobs=-1)]: Done 104354 tasks      | elapsed:   48.6s
[Parallel(n_jobs=-1)]: Done 176354 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 264354 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 368354 tasks      | elapsed:  1.9min
[Parallel(n_jobs=-1)]: Done 488354 tasks      | elapsed:  2.5min
[Parallel(n_jobs=-1)]: Done 624354 tasks      | elapsed:  3.1min
[Parallel(n_jobs=-1)]: Done 776354 tasks      | elapsed:  3.7min
[Parallel(n_jobs=-1)]: Done 944354 tasks      | elapsed:  4.5min
[Parallel(n_jobs=-1)]: Done 1128354 tasks      | elapsed:  5.3min
[Parallel(n_jobs=-1)]: Done 1328354 tasks      | elapsed:  6.2min
[Parallel(n_jobs=-1)]: Done 1544354 tasks      | elapsed:  7.2min
[Parallel(n_jobs=

GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=DecisionTreeRegressor(criterion='mse', max_depth=None,
                                             max_features=None,
                                             max_leaf_nodes=None,
                                             min_impurity_decrease=0.0,
                                             min_impurity_split=None,
                                             min_samples_leaf=1,
                                             min_samples_split=2,
                                             min_weight_fraction_leaf=0.0,
                                             presort=False, random_state=None,
                                             splitter='best'),
             iid='warn', n_jobs=-1,
             param_grid=..., 12,
                                        13, 14, 15, 16, 17, 18, 19],
                          'max_features': [3, 4, 5, 6, 7, 8, 9, 10, 11],
                          'max_leaf_nod

In [12]:
grid_search.best_params_

{'max_depth': 13,
 'max_features': 10,
 'max_leaf_nodes': 10,
 'min_samples_leaf': 3,
 'min_samples_split': 3,
 'min_weight_fraction_leaf': 0.0}

In [13]:
grid_search.best_score_

0.7957836976796565

In [29]:
grid_search.best_estimator_.score(x_test, y_test)

0.546597031652077

# 总结：

* 1、决策树可以做回归
* 2、决策树做回归效果不尽人意