## 数据准备

In [1]:
import numpy as np

from sklearn.datasets import load_breast_cancer
# Load the breast-cancer dataset and keep only the feature matrix and labels.
data = load_breast_cancer()
X = data.data
Y = data.target
del data    # the container object is not needed once X and Y are extracted

from sklearn.model_selection import train_test_split
# Hold out 20% of the samples for testing.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2)
print(X_train.shape, X_test.shape, Y_train.shape, Y_test.shape)

# Append the labels as the last column so that features and labels
# can be resampled together as a single array.
training_data = np.column_stack((X_train, Y_train))
testing_data = np.column_stack((X_test, Y_test))

# print(training_data.shape,testing_data.shape)

(455, 30) (114, 30) (455,) (114,)


## 模型基础
RF与普通树模型的区别很简单：每棵树都在一个随机抽样（bootstrap）得到的子数据集上训练，并且每次分裂时只在一个随机特征子空间中寻找最优划分。为了简便，这里没有在每次分裂时重新抽取特征，而是在为每棵树抽样数据子集的同时，一次性随机选取$\sqrt{m}$个特征（$m$为特征总数）。

注意，为了保持训练与预测时数据的一致性，这里没有丢弃未抽到的特征，而是将未抽到的特征列全部置零，相当于做了一个掩盖操作，不过弊端就是内存占用大。

In [2]:
def RandomPatches(data, rng=None):
    '''
    Draw one "random patch" of the data: a bootstrap sample of the rows in
    which only a random subset of the feature columns keeps its values.

    Parameters
    ----------
    data : 2-D ndarray
        The last column is treated as the label column and is never masked;
        every other column is a feature.
    rng : optional
        Any object exposing ``choice(a, size=..., replace=...)``, e.g.
        ``np.random.default_rng(seed)`` or ``np.random.RandomState(seed)``.
        When omitted, the global ``np.random`` state is used.

    Returns
    -------
    ndarray
        New array with the same shape as ``data``. Rows are drawn with
        replacement; ``int(sqrt(n_features))`` randomly chosen feature
        columns keep their values and all other feature columns are set to
        zero. ``data`` itself is not modified.
    '''
    if rng is None:
        rng = np.random

    n_samples, n_features = data.shape
    n_features -= 1    # the last column holds the labels

    kept_f_idx = rng.choice(
        n_features, size=int(np.sqrt(n_features)), replace=False)
    # Feature columns that were not drawn. The label column has index
    # n_features, which lies outside arange(n_features), so it is never masked.
    mask_f_idx = np.setdiff1d(np.arange(n_features), kept_f_idx)

    random_data_idx = rng.choice(n_samples, size=n_samples, replace=True)
    # Integer-array indexing already yields a fresh array, so the caller's
    # data stays untouched and no explicit copy is required.
    sub_data = data[random_data_idx]
    sub_data[:, mask_f_idx] = 0    # zero out the feature columns that were not drawn
    return sub_data

# RandomPatches(training_data)

然后就可以实现一个简单的串行版本RF模型了。

In [3]:
import os
os.sys.path.append(os.path.dirname(os.path.abspath('.')))

from tree.DecisionTreeClassifier import DecisionTreeClassifier

def RF_Clf(data,n_estimators=5):
    """
    Train ``n_estimators`` decision trees, each on its own random patch of
    ``data``, and return them as a list in training order.

    The last column of ``data`` is split off and passed to ``fit`` as the
    label vector; the remaining columns are passed as the features.
    """
    def _grow_one():
        # Build the estimator first, then draw its private patch of the data.
        estimator = DecisionTreeClassifier()
        patch = RandomPatches(data)
        features, labels = patch[:, :-1], patch[:, -1]
        estimator.fit(features, labels)
        return estimator

    return [_grow_one() for _ in range(n_estimators)]

# Train a forest on the training split using RF_Clf's default of 5 trees.
trees=RF_Clf(training_data)
# print(trees)

串行预测

In [4]:
from scipy import stats    # numpy未提供mode方法，借助scipy


def predict(X_test, trees):
    """
    Collect the raw predictions of every tree for ``X_test``.

    Each element of ``trees`` must expose ``predict(X_test)``. The per-tree
    results are stacked and transposed, so entry ``[j, i]`` of the returned
    array is tree ``i``'s prediction for sample ``j``. No voting is done
    here; the caller aggregates the columns.
    """
    per_tree = [estimator.predict(X_test) for estimator in trees]
    return np.transpose(np.array(per_tree))

Y_pred = predict(testing_data[:, :-1], trees)

# Accuracy of every individual tree (column i holds tree i's predictions).
for i in range(len(trees)):
    cur_pred = Y_pred[:, i]
    print('tree_{} acc:{}'.format(i, np.sum(cur_pred == Y_test)/len(Y_test)))

# Accuracy of the forest after majority voting across the trees.
# The mode is taken along axis=1 (one vote per sample). Depending on the
# SciPy version the result has shape (n,) or (n, 1), so it is flattened
# rather than indexed with [0][0], which fails on versions where mode()
# returns a scalar for 1-D input.
vote_pred = np.ravel(stats.mode(Y_pred, axis=1)[0])
print('rf acc:{}'.format(np.sum(vote_pred == Y_test)/len(Y_test)))

# trees[1].predict(testing_data[:,:-1])

tree_0 acc:0.8508771929824561
tree_1 acc:0.9473684210526315
tree_2 acc:0.8947368421052632
tree_3 acc:0.9473684210526315
tree_4 acc:0.9298245614035088
rf acc:0.956140350877193


通过上述结果可以看到(可多次运行)，RF投票后的准确率通常不低于其中任意一棵单独的CART树（由于抽样是随机的，这一点并非每次运行都严格成立）。然后对比sklearn中的RF准确率：

In [13]:
from sklearn.ensemble import RandomForestClassifier
# Reference run: scikit-learn's random forest with the same number of trees.
rf_clf = RandomForestClassifier(n_estimators=5)
rf_clf.fit(X_train, Y_train)
Y_pred = rf_clf.predict(X_test)
# Accuracy is the fraction of test samples predicted correctly.
print('sklearn acc:{}'.format(np.mean(Y_pred == Y_test)))

sklearn acc:0.9298245614035088


由于没做过Python开发，对Python的并发操作并不是很了解，这里还未实现RF的并行训练，待后期补充。