In [20]:
import numpy as np
import pandas as pd
import random 
from sklearn.model_selection import train_test_split,KFold
from sklearn.tree import DecisionTreeClassifier
from sklearn.decomposition import PCA
from sklearn import datasets
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier

In [3]:
def get_random_subset(iterable,k):
    subsets = [] #存放
    iteration = 0
    np.random.shuffle(iterable)
    subset = 0
    limit = len(iterable)/k #分成多組具有k個特徵的特徵子集
    while iteration < limit:
        if k <= len(iterable):
            subset = k
        else:
            subset = len(iterable)
        subsets.append(iterable[-subset:])
        del iterable[-subset:]
        iteration+=1
    return subsets

In [4]:
def rotation_forest(X,Y,n_tree,k,max_depth):
    r_matrices = []
    models = []
    for tree in range(n_tree):
        x,_,_,_ = train_test_split(X,Y,test_size=0.25) #保留75%的資料
        feature_index = list(range(X.shape[1]))
        k_subset = get_random_subset(feature_index,k) #每個子集有k個特徵，每個子集特徵不重複
        rotation_matrix = np.zeros((X.shape[1],X.shape[1]),dtype=float) #建立n*n大小的矩陣，n為樣本數
        for each_subset in k_subset:
            pca = PCA()
            X_subset = x.iloc[:,each_subset]
            pca.fit(X_subset)
            for i in range(0,len(pca.components_)):
                for j in range(0,len(pca.components_)):
                    rotation_matrix[each_subset[i],each_subset[j]] = pca.components_[i,j]

        x_transformed = X.dot(rotation_matrix)
        model = DecisionTreeClassifier(max_depth = max_depth)
        model.fit(x_transformed,Y) 
        models.append(model) #存放每個樹的模型
        r_matrices.append(rotation_matrix) #存放每個樹的旋轉矩陣

    return models,r_matrices

In [5]:
def model_predict(models,r_matrices,x):
    predicted_ys = []
    for i,model in enumerate(models):
        x_mod =  x.dot(r_matrices[i])
        predicted_y = model.predict(x_mod) #每個樹的預測值
        predicted_ys.append(predicted_y) #存放所有樹的預測值
    
    predicted_matrix = np.asmatrix(predicted_ys)
    prediction = pd.DataFrame(predicted_matrix).T
    final_prediction = []

    for i in range(len(x)):
        outcomes = list( prediction.iloc[i,:].values )
        vote_result = max(set(outcomes), key=outcomes.count) 
        final_prediction.append(vote_result)
    return final_prediction

In [21]:
iris = datasets.load_iris()
df = pd.DataFrame(iris.data, columns=iris.feature_names)
df['label'] = iris.target
df['label'] = df['label'].astype(int)
X = df.drop(['label'],axis=1)
Y = df['label']

In [28]:
kf = KFold(n_splits=5, shuffle=True, random_state=10)
cnt = 1
rf_accuracy_set ,rotf_accuracy_set= [],[]
for train_index, valid_index in kf.split(X, Y):
    print('fold',cnt)
    train_x = X.iloc[train_index,:]
    train_y = df.iloc[train_index,:]['label']
    valid_x = X.iloc[valid_index,:]
    valid_y = df.iloc[valid_index,:]['label']

    model,r_matrice = rotation_forest(X = train_x ,Y = train_y,n_tree=100,k=3,max_depth=10)
    rot_pred = model_predict(model,r_matrice,valid_x)
    rotf_accuracy_set.append(accuracy_score(valid_y,rot_pred))

    rf = RandomForestClassifier(max_depth = 10 ,n_estimators= 100)
    rf.fit(train_x,train_y)
    rf_pred = rf.predict(valid_x)
    rf_accuracy_set.append(accuracy_score(valid_y,rf_pred))
    cnt += 1
    
print("Random forest 5-fold average accuracy" , np.mean(rf_accuracy_set) )
print("Rotation forest 5-fold average accuracy" , np.mean(rotf_accuracy_set) )

fold 1
fold 2
fold 3
fold 4
fold 5
Random forest 5-fold average accuracy 0.9533333333333334
Rotation forest 5-fold average accuracy 0.9666666666666668
