## 数据准备

In [1]:
import numpy as np

from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split

# Fixed seed so that "Restart Kernel -> Run All" reproduces the same split and
# the same downstream numbers. Seeding the global NumPy state also covers the
# later cells that draw from np.random without a seed of their own.
SEED = 42
np.random.seed(SEED)

# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in
# 1.2 -- this cell needs scikit-learn < 1.2 or a replacement dataset.
data=load_boston()
X,Y=data.data,data.target
del data

X_train,X_test,Y_train,Y_test=train_test_split(X,Y,test_size=0.2,random_state=SEED)
print(X_train.shape,X_test.shape,Y_train.shape,Y_test.shape)

# Stack X and Y column-wise (target in the last column) for easier handling
training_data=np.c_[X_train,Y_train]
testing_data=np.c_[X_test,Y_test]

# print(training_data.shape,testing_data.shape)

(404, 13) (102, 13) (404,) (102,)


## 模型基础
RF跟普通树模型的区别很明显也很简单：每棵树在一个随机抽样的子数据集上训练，并且每次分裂时只在一个随机子空间上做test。为了简便，这里没有在每次分裂时重新抽取特征，而是在抽样数据子集时同时为整棵树随机选取$\lfloor\sqrt{m}\rfloor$个子特征（$m$为特征总数）。

注意，为了保持训练与预测时数据的一致性，这里没有丢弃未抽到的特征，而是将未抽到的特征列全部置零，相当于做了一个掩盖操作，不过弊端就是内存占用大。

In [2]:
def RandomPatches(data, random_state=None):
    '''
    Random sampling helper: samples rows and features at the same time.

    The last column of ``data`` is treated as the target and is never masked.
    ``int(sqrt(n_features))`` feature columns are drawn without replacement and
    kept; every other feature column is set to zero (not dropped), so the
    result keeps the column layout of ``data``. Rows are drawn with
    replacement (bootstrap), ``n_samples`` of them.

    Parameters
    ----------
    data : 2-D array-like, shape (n_samples, n_features + 1)
        Feature columns followed by one target column.
    random_state : None, int, or object with a ``choice`` method, optional
        ``None`` (default) draws from the global ``np.random`` state, exactly
        like the original implementation. An int seeds a private
        ``np.random.RandomState``. An existing ``RandomState``/``Generator``
        is used as-is.

    Returns
    -------
    numpy.ndarray
        New array with the same shape as ``data``; ``data`` itself is left
        untouched.
    '''
    data = np.asarray(data)
    n_samples, n_columns = data.shape
    n_features = n_columns - 1    # last column is the target

    if random_state is None:
        rng = np.random
    elif hasattr(random_state, 'choice'):
        rng = random_state
    else:
        rng = np.random.RandomState(random_state)

    # Draw order (features first, then rows) is kept from the original so the
    # default path yields the same result under the same global seed.
    random_f_idx = rng.choice(
        n_features, size=int(np.sqrt(n_features)), replace=False)
    random_data_idx = rng.choice(n_samples, size=n_samples, replace=True)

    # Boolean mask of the columns to zero out: every feature that was not drawn.
    masked_cols = np.ones(n_columns, dtype=bool)
    masked_cols[random_f_idx] = False
    masked_cols[-1] = False    # never mask the target column

    # Integer-array indexing returns a copy, so no explicit np.copy is needed.
    sub_data = data[random_data_idx]
    sub_data[:, masked_cols] = 0
    return sub_data

# RandomPatches(training_data)

然后就可以实现一个简单的串行版本RF模型了。

In [6]:
import os
os.sys.path.append(os.path.dirname(os.path.abspath('.')))

from tree.DecisionTreeRegressor import DecisionTreeRegressor

def RF_Reg(data,n_estimators=5):
    """
    Train a simple, serial random-forest regressor.

    Each of the ``n_estimators`` trees is a fresh ``DecisionTreeRegressor``
    fitted on its own ``RandomPatches(data)`` sample, using every column but
    the last as features and the last column as the target.

    Returns the list of fitted trees, in training order.
    """
    def _grow_tree():
        # Build the estimator first, then draw its private random patch.
        estimator = DecisionTreeRegressor()
        patch = RandomPatches(data)
        features, target = patch[:, :-1], patch[:, -1]
        estimator.fit(features, target)
        return estimator

    return [_grow_tree() for _ in range(n_estimators)]

trees=RF_Reg(training_data)
# print(trees)

[<tree.DecisionTreeRegressor.DecisionTreeRegressor object at 0x000002E0D1245B70>, <tree.DecisionTreeRegressor.DecisionTreeRegressor object at 0x000002E0D1245B38>, <tree.DecisionTreeRegressor.DecisionTreeRegressor object at 0x000002E0D1245C18>, <tree.DecisionTreeRegressor.DecisionTreeRegressor object at 0x000002E0D1245BA8>, <tree.DecisionTreeRegressor.DecisionTreeRegressor object at 0x000002E0B2CDADA0>]


串行预测

In [7]:
def predict(X_test, trees):
    """
    Collect the prediction of every tree for ``X_test``.

    Each element of ``trees`` only needs a ``predict(X_test)`` method. The
    per-tree results are stacked and transposed, so when every tree returns a
    1-D array of length n_samples the result has shape (n_samples, n_trees).
    No averaging is done here.
    """
    per_tree = []
    for estimator in trees:
        per_tree.append(estimator.predict(X_test))
    stacked = np.array(per_tree)
    return np.transpose(stacked)    # raw, un-averaged predictions
#     return np.mean(raw_pred,axis=1)    # 返回均化结果

def _mse(pred, truth):
    """Mean squared error between a prediction vector and the ground truth."""
    return np.mean(np.square(pred - truth))


Y_pred = predict(testing_data[:, :-1], trees)

# Report the test-set MSE of each individual tree
for i in range(len(trees)):
    cur_pred = Y_pred[:, i]
    print('tree_{} MSE:{}'.format(i, _mse(cur_pred, Y_test)))

# Report the MSE of the forest (equal-weight mean over all trees)
mean_pred = np.mean(Y_pred,axis=1)
print('rf MSE:{}'.format(_mse(mean_pred, Y_test)))

tree_0 MSE:46.83130390406163
tree_1 MSE:57.09848394901172
tree_2 MSE:50.00339508202974
tree_3 MSE:23.285920485462704
tree_4 MSE:33.48145667501045
rf MSE:20.062179981348898


多次运行可以发现，RF的MSE有时甚至还会劣于单棵CART树，这是因为这里做的是等权重的均化。实际上应该为每一棵树的预测结果赋一个权重系数，MSE低的树预测结果权重大。此外，这里每棵树在整个训练过程中都只能使用固定的$\lfloor\sqrt{m}\rfloor$个特征（而不是每次分裂时重新抽取），并且只用了5棵树，这同样会拉大误差。下面使用sklearn中的RF作对比：

In [13]:
from sklearn.ensemble import RandomForestRegressor
# Reference model: scikit-learn's random forest with the same number of trees
rf_reg = RandomForestRegressor(
    n_estimators=5,
    min_samples_split=5,
    min_samples_leaf=5,
)
rf_reg.fit(X_train, Y_train)
Y_pred = rf_reg.predict(X_test)

sklearn_mse = np.mean(np.square(Y_pred - Y_test))
print('sklearn MSE:{}'.format(sklearn_mse))

sklearn MSE:12.277904301558097


差得有点大，嗯，预测结果得做一下加权才行，待完善……