# 决策树(回归)的应用

In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
# 导入房价预测数据集
from sklearn.datasets.california_housing import fetch_california_housing

# 实例化数据集类并命名
housing = fetch_california_housing()
# 查看训练集，目标集数据维度
housing.data.shape , housing.target.shape, type(housing.data), type(housing.target)
# df = pd.DataFrame(housing.data, columns = housin)

((20640, 8), (20640,), numpy.ndarray, numpy.ndarray)

In [2]:
from sklearn import tree
# 实例化决策树回归模型实例并命名
dtr = tree.DecisionTreeRegressor(max_depth=2)
# 把数据及喂给模型
dtr.fit(housing.data[:,[6,7]], housing.target)

DecisionTreeRegressor(criterion='mse', max_depth=2, max_features=None,
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           presort=False, random_state=None, splitter='best')

In [3]:
# 可视化展示，需要首先安装graphviz
import graphviz
dot_data = tree.export_graphviz(dtr, 
                               feature_names =housing.feature_names[6:9],
                               filled=True,
                               impurity=False,
                               rounded=True)
graph1 = graphviz.Source(dot_data)
graph1.render('housing1')

'housing1.pdf'

# 随机森林

In [4]:
# 首先对数据进行数据拆分
from sklearn.model_selection import train_test_split
# 用拆包的方法接收train_test_split函数的结果, test_size指的是取测试集占训练集的比例
data_train,data_test,target_train,target_test = \
    train_test_split(housing.data, housing.target, test_size=0.1, random_state=23)

#### 下面创建3个模型
##### 1.单树DT
##### 2.随机森林RF
##### 3.自动优化参数的随机森林RF-grid
##### 分别用以上分割的数据集来进行准确率得分计算（保证用同一随机数种子来确保数据来自同一份split数据）

In [5]:
"""单树模型（回归）"""

# 实例化一个新的树模型来记录决策树（单树）的训练分数,这里保持随机数种子是23不变
dtr_test = tree.DecisionTreeRegressor(random_state=23)
# 喂入分割好的训练集
dtr_test.fit(data_train,target_train)
# 用测试集计算准确率
dtr_test.score(data_test,target_test)

0.6274613003586686

In [6]:
"""
随机森林模型（回归）

随机：1.在训练集中有放回地、随机地选取一定比例的样本进行模型训练
     2.在训练的过程中随机选取一定比例的特征（不可重复）
森林：以以上的12步选取的树为元素，建立多棵树，最后的模型参数采取所有树中的平均值
"""
from sklearn.ensemble import RandomForestRegressor
# 实例化模型并命名（保持随机数种子random_state不变,树数量n_estimators默认为10棵）
rfr = RandomForestRegressor(random_state=23)
# 把训练数据喂给随机森林模型
rfr.fit(data_train,target_train)
# 用测试集计算准确率
rfr.score(data_test,target_test)



0.7742669601400837

In [7]:
"""
优化参数的随机森林（树的数量，节点分裂最小样本数）

GridSearchCV是一个通过内置for循环来帮助选择模型参数的工具

"""
from sklearn.model_selection import GridSearchCV
# 把参数及其候选值通过dict的格式传递给一个变量tree_parm_grid
tree_parm_grid = {'min_samples_split':list((3,6,9)), 'n_estimators':list((10,50,100))}
# 实例化并命名grid模型
grid = GridSearchCV(RandomForestRegressor(random_state=23),param_grid = tree_parm_grid,cv=5,return_train_score=False)
# 把训练集喂给grid模型
grid.fit(data_train,target_train,)
# 打印分数列表、最优参数及其分数
grid.cv_results_, grid.best_params_, grid.best_score_

({'mean_fit_time': array([0.93880816, 4.6374886 , 9.39000931, 0.89419236, 4.45629401,
         8.70843906, 0.89559312, 4.19346924, 8.17265029]),
  'std_fit_time': array([0.01030952, 0.07521436, 0.19227996, 0.04398108, 0.05589763,
         0.04018129, 0.12316048, 0.07159344, 0.08660677]),
  'mean_score_time': array([0.01040692, 0.04643183, 0.09466386, 0.00840607, 0.04002652,
         0.07965245, 0.00760608, 0.03602409, 0.06924553]),
  'std_score_time': array([0.00149791, 0.0008019 , 0.00280169, 0.00049015, 0.00178934,
         0.00492536, 0.0004906 , 0.00167525, 0.0014702 ]),
  'param_min_samples_split': masked_array(data=[3, 3, 3, 6, 6, 6, 9, 9, 9],
               mask=[False, False, False, False, False, False, False, False,
                     False],
         fill_value='?',
              dtype=object),
  'param_n_estimators': masked_array(data=[10, 50, 100, 10, 50, 100, 10, 50, 100],
               mask=[False, False, False, False, False, False, False, False,
                     F

In [8]:
# 把分数最好的参数扔给随机森林模型
rfr_best = RandomForestRegressor(min_samples_split=3,n_estimators=100,random_state=23)
rfr_best.fit(data_train,target_train)
# rfr_best.score(data_test,target_test)
target_test_pred = rfr_best.predict(data_test)

# 查看残差平方均值MSE(The mean squared error)
print("残差平方均值MSE: %.2f" % mean_squared_error(target_test, target_test_pred))

# Explained variance score: 1 is perfect prediction 
#  R2 决定系数（拟合优度）
# 模型越好：r2→1
# 模型越差：r2→0
print('拟合优度R2: %.2f' % r2_score(target_test,target_test_pred))


NameError: name 'mean_squared_error' is not defined

In [None]:
# 显示最佳模型rfr_best的参数parameters
pd.Series(rfr_best.feature_importances_,index=housing.feature_names)

In [None]:
"""数据可视化"""

import matplotlib.pyplot as plt
import numpy as np
y_importances = rfr_best.feature_importances_
x_importances = housing.feature_names
y_pos = np.arange(len(x_importances))
# 横向柱状图
plt.barh(y_pos, y_importances, align='center')
plt.yticks(y_pos, x_importances)
plt.xlabel('Importances')
plt.xlim(0,1)
plt.title('Features Importances')
plt.show()