In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
import math
warnings.filterwarnings('ignore')

from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.preprocessing import MinMaxScaler,StandardScaler
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from scipy.stats import shapiro,f_oneway,kruskal

# Load the raw grid dataset. 'stab' is the regression target; 'stab' and 'stabf' are
# both removed from the predictor frame.
data = pd.read_csv("Electrical Grid.csv")

data_y = data['stab']
data_x = data.drop(columns=['stab','stabf'])

# Rescale every predictor with a MinMaxScaler fitted on the full dataset (i.e. before
# the cross-validation splits made later) and keep the original column names.
scaler = MinMaxScaler()
scaled_x = pd.DataFrame(scaler.fit_transform(data_x), columns=data_x.columns)

In [None]:
# Result tables, one per swept hyper-parameter. The tuning cells below write the
# Shapiro-Wilk p-value of each candidate value into these frames.
depth_data, tree_data, max_feature_data = (
    pd.read_csv(csv_name)
    for csv_name in ("depth.csv", "tree.csv", "max feature.csv")
)
bootstrap_size_data, batch_data, iteration_data = (
    pd.read_csv(csv_name)
    for csv_name in ("bootstrap size.csv", "batch size.csv", "iteration.csv")
)

# Summary tables: best value per hyper-parameter and the group-comparison p-values.
best_parameter = pd.read_csv("bast parameter.csv")
anova_data = pd.read_csv("anova result.csv")

In [None]:
def normal_test(data):
    """Return the Shapiro-Wilk normality p-value of ``data``, rounded to 3 decimals.

    Parameters
    ----------
    data : array-like
        Sample passed unchanged to ``scipy.stats.shapiro``.

    Returns
    -------
    The p-value of the test after ``np.round(..., 3)``; the test statistic is discarded.
    """
    _, shapiro_p = shapiro(data)
    rounded_p = np.round(shapiro_p, 3)
    return rounded_p

In [None]:
# Candidate values for each swept hyper-parameter, keyed by the sweep name.
# Key order matters: the tuning loops iterate these dicts in insertion order.
rf_paremater_set = {
    "trees": [100, 300, 500, 700],
    "sampling_sizes": [0.5, 0.7, 0.9],
    'depths': [10, 20, 30, 40, 50],
    'features': [4, 6, 8, 10, 12],
}

rotf_paremater_set = {
    "trees": [100, 300, 500, 700],
    "sampling_sizes": [0.5, 0.7, 0.9],
    'depths': [10, 20, 30, 40, 50],
}

spca_rotf_paremater_set = {
    "trees": [300, 500, 700, 900],
    "sampling_sizes": [0.5, 0.7, 0.9],
    'depths': [10, 20, 30, 40, 50],
    'batch': [3, 5, 7, 9],
    # NOTE(review): this key ends with a space ('iterations ') - confirm that whatever
    # consumes this dict expects it before normalising the name.
    'iterations ': [100, 300, 500, 700],
}

# Random Forest (RF)

In [None]:
def rf_kfold(tree,depth,max_sample,feature,seed):
    """Score a RandomForestRegressor with shuffled 10-fold cross-validation.

    Reads the module-level ``scaled_x`` (predictors) and ``data_y`` (target).

    Parameters
    ----------
    tree : int
        Passed as ``n_estimators``.
    depth : int or None
        Passed as ``max_depth``.
    max_sample : int or float
        Passed as ``max_samples``.
    feature : int, float, str or None
        Passed as ``max_features``. The legacy value ``'auto'`` is translated to
        ``None`` (see the comment in the body).
    seed : int
        Random state of the fold shuffle and of the forest itself, so that repeated
        calls with the same arguments return the same scores.

    Returns
    -------
    list
        One RMSE per fold (10 values), in fold order.
    """
    # scikit-learn 1.3 removed max_features='auto'. For regressors it meant "use every
    # feature", which None expresses on old and new versions alike.
    max_features = None if isinstance(feature, str) and feature == 'auto' else feature

    RMSE_set_rf = []
    kf = KFold(n_splits=10, shuffle=True, random_state=seed)
    for train_index, valid_index in kf.split(scaled_x, data_y):
        train_x = scaled_x.iloc[train_index]
        train_y = data_y.iloc[train_index]
        valid_x = scaled_x.iloc[valid_index]
        valid_y = data_y.iloc[valid_index]

        # random_state was previously omitted, so only the fold split honoured `seed`
        # and the bootstrap / feature sampling differed on every run.
        forest = RandomForestRegressor(
            n_estimators=tree,
            max_depth=depth,
            max_samples=max_sample,
            max_features=max_features,
            n_jobs=-1,
            random_state=seed,
        )
        rf_pred = forest.fit(train_x, train_y).predict(valid_x)
        RMSE_set_rf.append(np.sqrt(mean_squared_error(valid_y, rf_pred)))
    return RMSE_set_rf

In [None]:
# One-at-a-time hyper-parameter sweep for the plain random forest.
#
# For every candidate value of every swept parameter the loop
#   1. collects the 10 fold RMSEs from rf_kfold,
#   2. stores the Shapiro-Wilk p-value of those RMSEs in the parameter's own table,
#   3. compares the candidates' RMSE groups (Kruskal-Wallis or one-way ANOVA) and
#      stores the p-value in anova_data,
#   4. stores the candidate with the lowest mean RMSE in best_parameter.
rf_row = 0  # row of the result tables written by this sweep

# sweep name -> (rf_kfold argument being varied,
#                arguments held fixed,
#                table receiving the per-candidate normality p-values,
#                column of anova_data / best_parameter,
#                whether the group test is chosen from the normality p-values)
rf_sweeps = {
    'trees': ('tree', dict(depth=None, max_sample=0.7, feature=12, seed=47),  # seeds tried: 23 31 47 52
              tree_data, 'trees', True),
    'sampling_sizes': ('max_sample', dict(tree=300, depth=20, feature=4, seed=47),
                       bootstrap_size_data, 'bootstrap size', True),
    # feature=None replaces the former 'auto' (removed in scikit-learn 1.3); both mean
    # "use every feature" for regressors.
    'depths': ('depth', dict(tree=300, max_sample=0.7, feature=None, seed=42),
               depth_data, 'max depth', False),
    'features': ('feature', dict(tree=300, depth=30, max_sample=0.7, seed=42),
                 max_feature_data, 'max feature', False),
}

rf_fold_rmse = {}
for param_name, parameter in rf_paremater_set.items():
    print('================',param_name,'================')
    if param_name not in rf_sweeps:
        continue
    swept_arg, fixed_args, p_table, result_col, check_normality = rf_sweeps[param_name]

    fold_rmse = []
    best_rmse, best_para = float('inf'), None
    min_normality_p = float('inf')
    for para in parameter:
        parm_set_2 = rf_kfold(**{swept_arg: para}, **fixed_args)
        normality_p = normal_test(parm_set_2)
        print(para,normality_p,parm_set_2)

        # Track the candidate with the lowest mean RMSE.
        mean_rmse = np.mean(parm_set_2)
        if mean_rmse < best_rmse:
            best_para, best_rmse = para, mean_rmse
        fold_rmse.append(parm_set_2)

        # Record the normality p-value. Write through the frame itself:
        # frame[col].iloc[i] = v is chained assignment and may only modify a copy.
        p_table.iloc[rf_row, p_table.columns.get_loc(str(para))] = normality_p
        min_normality_p = min(min_normality_p, normality_p)

    # NOTE(review): only 'trees' and 'sampling_sizes' fall back to Kruskal-Wallis when a
    # group fails the normality test; 'depths' and 'features' always use one-way ANOVA,
    # as in the original cell - confirm this difference is intended.
    if check_normality and min_normality_p < 0.05:
        test_stat,anova_p = kruskal(*fold_rmse)
        print('kruskal_oneway:',test_stat,anova_p)
    elif check_normality:
        test_stat,anova_p = f_oneway(*fold_rmse)
        print('f_oneway:',test_stat,anova_p)
    else:
        anova_result = f_oneway(*fold_rmse)
        print(anova_result)
        test_stat,anova_p = anova_result

    # Record the group-comparison p-value and the best candidate for this parameter.
    anova_data.iloc[rf_row, anova_data.columns.get_loc(result_col)] = np.round(anova_p,3)
    best_parameter.iloc[rf_row, best_parameter.columns.get_loc(result_col)] = best_para
    rf_fold_rmse[param_name] = fold_rmse

# Per-candidate fold RMSEs and their means, under the names used by the original cell.
tree_rmse = rf_fold_rmse['trees']
sample_rmse = rf_fold_rmse['sampling_sizes']
depth_rmse = rf_fold_rmse['depths']
feature_rmse = rf_fold_rmse['features']

tree_result = [np.mean(rmse) for rmse in tree_rmse]
bootstrap_result = [np.mean(rmse) for rmse in sample_rmse]
depth_result = [np.mean(rmse) for rmse in depth_rmse]
feature_result = [np.mean(rmse) for rmse in feature_rmse]

print(tree_result)
print(bootstrap_result)
print(depth_result)
print(feature_result)

100 0.365 [0.011957633267011464, 0.011816719016102302, 0.011816374254953807, 0.0115751927958254, 0.011883251048778163, 0.011704967605190951, 0.011897351007726195, 0.011810037824681438, 0.011553465513726012, 0.011769632382042452]
300 0.046 [0.011821636198734551, 0.011835353276957127, 0.01174495338871394, 0.011491388017077064, 0.01177933161232889, 0.011557129350539685, 0.011797840903428131, 0.011615135362457408, 0.011502829765341645, 0.011788256916596882]
500 0.061 [0.01174678215489854, 0.011780438733438313, 0.011719019652598157, 0.011517342519258867, 0.011732047460743652, 0.011477023387818691, 0.011836157901903701, 0.011536893152410785, 0.01149468618712398, 0.011792094729034454]
700 0.107 [0.011755563967699544, 0.0117148009325921, 0.011704120879376955, 0.011419787389937841, 0.011720892618359307, 0.011532435915159026, 0.011817635430019756, 0.011480624313067083, 0.011474235460417181, 0.011774654693396721]
kruskal_oneway: 6.604390243902429 0.08563528359403802
0.5 0.924 [0.01290833000116219

# Trees Weighting Random Forest (TWRF)

In [None]:
def TWRF_kfold(tree,depth,max_sample,feature,seed):
    """Score Trees_Weighting_Random_Forest with shuffled 10-fold cross-validation.

    Reads the module-level ``scaled_x`` (predictors) and ``data_y`` (target).

    Parameters
    ----------
    tree : int
        Number of trees in the ensemble.
    depth : int or None
        Maximum depth of every tree.
    max_sample : int or float
        ``train_size`` of the per-tree random split.
    feature : int, float, str or None
        ``max_features`` of every tree.
    seed : int
        Random state of the fold shuffle and of the ensemble, so that repeated calls
        with the same arguments return the same scores.

    Returns
    -------
    list
        One RMSE per fold (10 values), in fold order.
    """
    RMSE_set_twrf = []
    kf = KFold(n_splits=10, shuffle=True, random_state=seed)
    for train_index, valid_index in kf.split(scaled_x, data_y):
        train_x = scaled_x.iloc[train_index]
        train_y = data_y.iloc[train_index]
        valid_x = scaled_x.iloc[valid_index]
        valid_y = data_y.iloc[valid_index]

        TWRF = Trees_Weighting_Random_Forest(
            train_x, train_y, valid_x,
            max_depth=depth, sample_size=max_sample, n_trees=tree, n_features=feature,
            random_state=seed,
        )
        RMSE_set_twrf.append(np.sqrt(mean_squared_error(valid_y, TWRF)))

    return RMSE_set_twrf

def Trees_Weighting_Random_Forest(training_x , training_y, test_x, max_depth, sample_size, n_trees, n_features, random_state=None):
    """Predict ``test_x`` with an ensemble of decision trees weighted by held-out MSE.

    Every tree is fitted on a random ``sample_size`` share of the training data and
    scored (MSE) on the remaining rows. The scores are normalised to sum to 1 and used
    as the weights of the trees' test predictions.

    NOTE(review): the weights are proportional to each tree's held-out MSE, so trees
    with a larger error receive a larger weight. Kept as in the original; confirm this
    is intended rather than an inverse-error weighting.

    Parameters
    ----------
    training_x, training_y
        Training predictors and target.
    test_x
        Rows to predict.
    max_depth : int or None
        ``max_depth`` of every tree.
    sample_size : int or float
        ``train_size`` handed to ``train_test_split`` for every tree.
    n_trees : int
        Number of trees; must be at least 1.
    n_features : int, float, str or None
        ``max_features`` of every tree. The legacy value ``'auto'`` is translated to
        ``None`` (see the comment in the body).
    random_state : int or None, optional
        Seed of the per-tree splits and trees. ``None`` (the default) keeps the previous
        behaviour of leaving the randomness to scikit-learn's defaults.

    Returns
    -------
    list of float
        One weighted prediction per row of ``test_x``.

    Raises
    ------
    ValueError
        If ``n_trees`` is smaller than 1.
    """
    if n_trees < 1:
        raise ValueError("n_trees must be at least 1")

    # scikit-learn 1.3 removed max_features='auto'. For regressors it meant "use every
    # feature", which None expresses on old and new versions alike.
    max_features = None if isinstance(n_features, str) and n_features == 'auto' else n_features

    # One shared generator: its state advances on every use, so each tree gets a
    # different split while the whole ensemble stays reproducible.
    rng = None if random_state is None else np.random.RandomState(random_state)

    held_out_mse = []
    tree_predictions = []
    for _ in range(n_trees):
        # Randomly pick the share of rows this tree is trained on.
        train_x, valid_x, train_y, valid_y = train_test_split(
            training_x, training_y, train_size=sample_size, random_state=rng)

        DT = DecisionTreeRegressor(max_depth=max_depth, max_features=max_features, random_state=rng)
        DT.fit(train_x, train_y)

        held_out_mse.append(mean_squared_error(valid_y, DT.predict(valid_x)))
        tree_predictions.append(DT.predict(test_x))

    # Normalise once, after all trees are built (this used to be redone per tree).
    weights = np.asarray(held_out_mse) / np.sum(held_out_mse)

    # (n_trees,) @ (n_trees, n_test) -> weighted sum over trees for every test row.
    return (weights @ np.asarray(tree_predictions)).tolist()

In [None]:
# One-at-a-time hyper-parameter sweep for the trees-weighting random forest.
#
# For every candidate value of every swept parameter the loop
#   1. collects the 10 fold RMSEs from TWRF_kfold,
#   2. stores the Shapiro-Wilk p-value of those RMSEs in the parameter's own table,
#   3. compares the candidates' RMSE groups (Kruskal-Wallis or one-way ANOVA) and
#      stores the p-value in anova_data,
#   4. stores the candidate with the lowest mean RMSE in best_parameter.
num = 6  # row of the result tables written by this sweep

# sweep name -> (TWRF_kfold argument being varied,
#                arguments held fixed,
#                table receiving the per-candidate normality p-values,
#                column of anova_data / best_parameter,
#                whether the group test is chosen from the normality p-values)
twrf_sweeps = {
    'trees': ('tree', dict(depth=None, max_sample=0.7, feature=12, seed=47),  # seeds tried: 23 31 47 52
              tree_data, 'trees', True),
    'sampling_sizes': ('max_sample', dict(tree=300, depth=20, feature=12, seed=47),
                       bootstrap_size_data, 'bootstrap size', True),
    # feature=None replaces the former 'auto' (removed in scikit-learn 1.3); both mean
    # "use every feature" for regressors.
    'depths': ('depth', dict(tree=300, max_sample=0.7, feature=None, seed=42),
               depth_data, 'max depth', False),
    'features': ('feature', dict(tree=300, depth=30, max_sample=0.7, seed=42),
                 max_feature_data, 'max feature', False),
}

twrf_fold_rmse = {}
for param_name, parameter in rf_paremater_set.items():
    print('================',param_name,'================')
    if param_name not in twrf_sweeps:
        continue
    swept_arg, fixed_args, p_table, result_col, check_normality = twrf_sweeps[param_name]

    fold_rmse = []
    best_rmse, best_para = float('inf'), None
    min_normality_p = float('inf')
    for para in parameter:
        parm_set_2 = TWRF_kfold(**{swept_arg: para}, **fixed_args)
        normality_p = normal_test(parm_set_2)
        print(para,normality_p,parm_set_2)

        # Track the candidate with the lowest mean RMSE.
        mean_rmse = np.mean(parm_set_2)
        if mean_rmse < best_rmse:
            best_para, best_rmse = para, mean_rmse
        fold_rmse.append(parm_set_2)

        # Record the normality p-value. Write through the frame itself:
        # frame[col].iloc[i] = v is chained assignment and may only modify a copy.
        p_table.iloc[num, p_table.columns.get_loc(str(para))] = normality_p
        min_normality_p = min(min_normality_p, normality_p)

    # NOTE(review): only 'trees' and 'sampling_sizes' fall back to Kruskal-Wallis when a
    # group fails the normality test; 'depths' and 'features' always use one-way ANOVA,
    # as in the original cell - confirm this difference is intended.
    if check_normality and min_normality_p < 0.05:
        test_stat,anova_p = kruskal(*fold_rmse)
        print('kruskal_oneway:',test_stat,anova_p)
    elif check_normality:
        test_stat,anova_p = f_oneway(*fold_rmse)
        print('f_oneway:',test_stat,anova_p)
    else:
        anova_result = f_oneway(*fold_rmse)
        print(anova_result)
        test_stat,anova_p = anova_result

    # Record the group-comparison p-value and the best candidate for this parameter.
    anova_data.iloc[num, anova_data.columns.get_loc(result_col)] = np.round(anova_p,3)
    best_parameter.iloc[num, best_parameter.columns.get_loc(result_col)] = best_para
    twrf_fold_rmse[param_name] = fold_rmse

# Per-candidate fold RMSEs and their means, under the names used by the original cell.
tree_rmse = twrf_fold_rmse['trees']
sample_rmse = twrf_fold_rmse['sampling_sizes']
depth_rmse = twrf_fold_rmse['depths']
feature_rmse = twrf_fold_rmse['features']

tree_result = [np.mean(rmse) for rmse in tree_rmse]
bootstrap_result = [np.mean(rmse) for rmse in sample_rmse]
depth_result = [np.mean(rmse) for rmse in depth_rmse]
feature_result = [np.mean(rmse) for rmse in feature_rmse]

print(tree_result)
print(bootstrap_result)
print(depth_result)
print(feature_result)

100 0.537 [0.012013170124137709, 0.011906800939276478, 0.011746788122548727, 0.011730708750787609, 0.011914772063426485, 0.011780762150451023, 0.011942307252324971, 0.011665893234568575, 0.01154575033586865, 0.01193168348142115]
300 0.958 [0.011829681996027498, 0.011787558707512096, 0.011859354534901327, 0.011730287656669666, 0.012031733291698187, 0.011727267783458806, 0.011926009921666056, 0.011573728186959092, 0.011620643280726246, 0.011945642578063872]
500 0.607 [0.011828194667142772, 0.011772956930445045, 0.011831656726780267, 0.011603411353642437, 0.011993569470439882, 0.011692283591906123, 0.01189062983702028, 0.011577147466866491, 0.01163855619282826, 0.011960277045973878]
700 0.377 [0.011782720732406498, 0.011783746449972966, 0.011771671824958951, 0.011664000221204167, 0.011920651007541746, 0.011726674191771211, 0.01189112237326094, 0.011484857835008296, 0.011506783116800558, 0.01195318625451897]
f_oneway: 0.40906555953670615 0.7474298419216996
0.5 0.083 [0.011891770316059315, 

# Strength Random Forest (SRF)

In [None]:
def SRF_kfold(tree,depth,max_sample,feature,seed):
    """Score Strength_Random_Forest with shuffled 10-fold cross-validation.

    Reads the module-level ``scaled_x`` (predictors) and ``data_y`` (target).

    Parameters
    ----------
    tree : int
        Number of trees in the ensemble.
    depth : int or None
        Maximum depth of every tree.
    max_sample : int or float
        ``train_size`` of the per-tree random split.
    feature : int, float, str or None
        ``max_features`` of every tree.
    seed : int
        Random state of the fold shuffle and of the ensemble, so that repeated calls
        with the same arguments return the same scores.

    Returns
    -------
    list
        One RMSE per fold (10 values), in fold order.
    """
    RMSE_set_srf = []
    kf = KFold(n_splits=10, shuffle=True, random_state=seed)
    for train_index, valid_index in kf.split(scaled_x, data_y):
        train_x = scaled_x.iloc[train_index]
        train_y = data_y.iloc[train_index]
        valid_x = scaled_x.iloc[valid_index]
        valid_y = data_y.iloc[valid_index]

        SRF = Strength_Random_Forest(
            train_x, train_y, valid_x,
            max_depth=depth, sample_size=max_sample, n_trees=tree, n_features=feature,
            random_state=seed,
        )
        RMSE_set_srf.append(np.sqrt(mean_squared_error(valid_y, SRF)))

    return RMSE_set_srf

def Strength_Random_Forest(training_x , training_y, test_x, max_depth, sample_size, n_trees, n_features, random_state=None):
    """Predict ``test_x`` with an ensemble of decision trees scaled by their strength.

    Every tree is fitted on a random ``sample_size`` share of the training data. Its
    strength is the mean of ``exp(-|prediction - target|)`` over the remaining rows.
    The returned prediction for a test row is the mean over trees of
    ``tree prediction * tree strength``.

    NOTE(review): the result is not divided by the sum of the strengths, and a strength
    is at most 1, so the output is scaled down relative to a normalised weighted
    average. Kept as in the original; confirm this is intended.

    Parameters
    ----------
    training_x, training_y
        Training predictors and target.
    test_x
        Rows to predict.
    max_depth : int or None
        ``max_depth`` of every tree.
    sample_size : int or float
        ``train_size`` handed to ``train_test_split`` for every tree.
    n_trees : int
        Number of trees; must be at least 1.
    n_features : int, float, str or None
        ``max_features`` of every tree. The legacy value ``'auto'`` is translated to
        ``None`` (see the comment in the body).
    random_state : int or None, optional
        Seed of the per-tree splits and trees. ``None`` (the default) keeps the previous
        behaviour of leaving the randomness to scikit-learn's defaults.

    Returns
    -------
    list of float
        One strength-scaled prediction per row of ``test_x``.

    Raises
    ------
    ValueError
        If ``n_trees`` is smaller than 1.
    """
    if n_trees < 1:
        raise ValueError("n_trees must be at least 1")

    # scikit-learn 1.3 removed max_features='auto'. For regressors it meant "use every
    # feature", which None expresses on old and new versions alike.
    max_features = None if isinstance(n_features, str) and n_features == 'auto' else n_features

    # One shared generator: its state advances on every use, so each tree gets a
    # different split while the whole ensemble stays reproducible.
    rng = None if random_state is None else np.random.RandomState(random_state)

    strength_set = []
    test_predicted_ys = []
    for _ in range(n_trees):
        # Randomly pick the share of rows this tree is trained on.
        train_x, valid_x, train_y, valid_y = train_test_split(
            training_x, training_y, train_size=sample_size, random_state=rng)
        DT = DecisionTreeRegressor(max_depth=max_depth, max_features=max_features, random_state=rng)
        DT.fit(train_x, train_y)

        # Confidence of one held-out row is exp(-|error|); the tree's strength is the
        # mean confidence. np.asarray lets the target be a Series or a plain array.
        margin = np.abs(DT.predict(valid_x) - np.asarray(valid_y))
        strength_set.append(np.mean(np.exp(-margin)))

        test_predicted_ys.append(DT.predict(test_x))

    # Scale every tree's predictions (one row per tree) by that tree's strength, then
    # average over the trees for each test row.
    scaled_predictions = np.asarray(test_predicted_ys) * np.asarray(strength_set)[:, np.newaxis]
    return scaled_predictions.mean(axis=0).tolist()

In [None]:
# One-at-a-time hyper-parameter sweep for the strength random forest.
#
# For every candidate value of every swept parameter the loop
#   1. collects the 10 fold RMSEs from SRF_kfold,
#   2. stores the Shapiro-Wilk p-value of those RMSEs in the parameter's own table,
#   3. compares the candidates' RMSE groups (Kruskal-Wallis or one-way ANOVA) and
#      stores the p-value in anova_data,
#   4. stores the candidate with the lowest mean RMSE in best_parameter.
#
# This cell previously called TWRF_kfold everywhere, so the SRF row of the result
# tables was filled with trees-weighting results; it now evaluates SRF_kfold.
num = 3  # row of the result tables written by this sweep

# sweep name -> (SRF_kfold argument being varied,
#                arguments held fixed,
#                table receiving the per-candidate normality p-values,
#                column of anova_data / best_parameter,
#                whether the group test is chosen from the normality p-values)
srf_sweeps = {
    'trees': ('tree', dict(depth=None, max_sample=0.7, feature=12, seed=47),  # seeds tried: 23 31 47 52
              tree_data, 'trees', True),
    'sampling_sizes': ('max_sample', dict(tree=300, depth=20, feature=12, seed=47),
                       bootstrap_size_data, 'bootstrap size', True),
    # feature=None replaces the former 'auto' (removed in scikit-learn 1.3); both mean
    # "use every feature" for regressors.
    'depths': ('depth', dict(tree=300, max_sample=0.7, feature=None, seed=42),
               depth_data, 'max depth', False),
    'features': ('feature', dict(tree=300, depth=30, max_sample=0.7, seed=42),
                 max_feature_data, 'max feature', False),
}

srf_fold_rmse = {}
for param_name, parameter in rf_paremater_set.items():
    print('================',param_name,'================')
    if param_name not in srf_sweeps:
        continue
    swept_arg, fixed_args, p_table, result_col, check_normality = srf_sweeps[param_name]

    fold_rmse = []
    best_rmse, best_para = float('inf'), None
    min_normality_p = float('inf')
    for para in parameter:
        parm_set_2 = SRF_kfold(**{swept_arg: para}, **fixed_args)
        normality_p = normal_test(parm_set_2)
        print(para,normality_p,parm_set_2)

        # Track the candidate with the lowest mean RMSE.
        mean_rmse = np.mean(parm_set_2)
        if mean_rmse < best_rmse:
            best_para, best_rmse = para, mean_rmse
        fold_rmse.append(parm_set_2)

        # Record the normality p-value. Write through the frame itself:
        # frame[col].iloc[i] = v is chained assignment and may only modify a copy.
        p_table.iloc[num, p_table.columns.get_loc(str(para))] = normality_p
        min_normality_p = min(min_normality_p, normality_p)

    # NOTE(review): only 'trees' and 'sampling_sizes' fall back to Kruskal-Wallis when a
    # group fails the normality test; 'depths' and 'features' always use one-way ANOVA,
    # as in the original cell - confirm this difference is intended.
    if check_normality and min_normality_p < 0.05:
        test_stat,anova_p = kruskal(*fold_rmse)
        print('kruskal_oneway:',test_stat,anova_p)
    elif check_normality:
        test_stat,anova_p = f_oneway(*fold_rmse)
        print('f_oneway:',test_stat,anova_p)
    else:
        anova_result = f_oneway(*fold_rmse)
        print(anova_result)
        test_stat,anova_p = anova_result

    # Record the group-comparison p-value and the best candidate for this parameter.
    anova_data.iloc[num, anova_data.columns.get_loc(result_col)] = np.round(anova_p,3)
    best_parameter.iloc[num, best_parameter.columns.get_loc(result_col)] = best_para
    srf_fold_rmse[param_name] = fold_rmse

# Per-candidate fold RMSEs and their means, under the names used by the original cell.
tree_rmse = srf_fold_rmse['trees']
sample_rmse = srf_fold_rmse['sampling_sizes']
depth_rmse = srf_fold_rmse['depths']
feature_rmse = srf_fold_rmse['features']

tree_result = [np.mean(rmse) for rmse in tree_rmse]
bootstrap_result = [np.mean(rmse) for rmse in sample_rmse]
depth_result = [np.mean(rmse) for rmse in depth_rmse]
feature_result = [np.mean(rmse) for rmse in feature_rmse]

print(tree_result)
print(bootstrap_result)
print(depth_result)
print(feature_result)

100 0.683 [0.011823254219488065, 0.011919221891035264, 0.011897707938725653, 0.011774322645500076, 0.01201010070962436, 0.011756940562924643, 0.011971514620222237, 0.01162372417646224, 0.011702198068090388, 0.011962342513005193]
300 0.649 [0.011864201290861296, 0.011786899372150574, 0.011834700061749867, 0.011664298945124966, 0.011978425019831259, 0.011695940248138574, 0.011886829882686106, 0.01149446603627862, 0.011579632779732865, 0.011890431629316882]
500 0.786 [0.011840667191075335, 0.01185073756447224, 0.011783493937587392, 0.011612748275932054, 0.011949854173442156, 0.011725523230848784, 0.011820465013315332, 0.011629981902920023, 0.01153834479287051, 0.011984653458980313]
700 0.398 [0.011817658367488744, 0.011795867352860472, 0.011794523322880834, 0.011635345310345453, 0.011970613546518714, 0.011726235317189224, 0.011933946552715262, 0.011515616316295468, 0.011530527367098762, 0.011985119649999073]
f_oneway: 0.6013153201196215 0.6183633995001558
0.5 0.035 [0.011873637487697219, 

In [None]:
# Write the updated result tables back to the CSV files they were loaded from,
# without the index column.
for result_table, csv_name in (
    (depth_data, "depth.csv"),
    (tree_data, "tree.csv"),
    (max_feature_data, "max feature.csv"),
    (bootstrap_size_data, "bootstrap size.csv"),
    (best_parameter, "bast parameter.csv"),
    (anova_data, "anova result.csv"),
):
    result_table.to_csv(csv_name, index=False)