In [1]:
import pathlib
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
ROOT_DIR = pathlib.Path().resolve().parent
ERR_DIR = ROOT_DIR / 'results' / 'error_array'

In [3]:
DATA_NAMES = [
    'synthetic',
    'Mpg',
    'abalone',
    'automobile',
    'cpu',
    'liver',
    'servo',
    'student',
    'wine_quality',
]

In [1]:
MODELNAME_LIST = [
    'MTGB100_depth3',
    'MTGB100_depth5',
    'MTGB100_depth15',
    'GB100_depth3',
    'GB100_depth5',
    'GB100_depth15',
    ]

In [5]:
ERR_DIR

PosixPath('/Users/1nao/GitHub/Gradient_Boosting_Meta_Tree/results/error_array')

In [6]:
# load the error arrays
err_dict = {}
for data_name in DATA_NAMES:
    err_dict[data_name] = {}
    for modelname in MODELNAME_LIST:
        try:
            err_dict[data_name][modelname] = {
                'train_mse_arr': np.load(ERR_DIR / f'{data_name}_{modelname}_train_mse.npy'),
                'test_mse_arr': np.load(ERR_DIR / f'{data_name}_{modelname}_test_mse.npy'),
                }
        except:
            print(f'{data_name}_{modelname} does not exist.')
            err_dict[data_name][modelname] = None

In [7]:
# calculate the average and standard error of the mean
res_dict = {}
for data_name in DATA_NAMES:
    res_dict[data_name] = {}
    for modelname in MODELNAME_LIST:
        if err_dict[data_name][modelname] is None:
            res_dict[data_name][modelname] = None
            continue
        res_dict[data_name][modelname] = {
            'train_ms_avg': np.mean(err_dict[data_name][modelname]['train_mse_arr']),
            'train_mse_sem': np.std(err_dict[data_name][modelname]['train_mse_arr']) / np.sqrt(len(err_dict[data_name][modelname]['train_mse_arr'])),
            'test_mse_avg': np.mean(err_dict[data_name][modelname]['test_mse_arr']),
            'test_mse_sem': np.std(err_dict[data_name][modelname]['test_mse_arr']) / np.sqrt(len(err_dict[data_name][modelname]['test_mse_arr'])),
        }

In [8]:
# create a dataframe
# multicolumn of model and train/test mse/sem
model_list = []
for modelname in MODELNAME_LIST:
    model_list += [modelname]*4
mse_sem_list = ['train_mse_avg', 'train_mse_sem', 'test_mse_avg', 'test_mse_sem']
mse_sem_list = mse_sem_list * len(MODELNAME_LIST)
multi_columns = pd.MultiIndex.from_tuples(list(zip(model_list, mse_sem_list)))
df1 = pd.DataFrame(columns=multi_columns, index=DATA_NAMES)

for data_name in DATA_NAMES:
    for modelname in MODELNAME_LIST:
        if res_dict[data_name][modelname] is None:
            continue
        df1.loc[data_name, (modelname, 'train_mse_avg')] = res_dict[data_name][modelname]['train_ms_avg']
        df1.loc[data_name, (modelname, 'train_mse_sem')] = res_dict[data_name][modelname]['train_mse_sem']
        df1.loc[data_name, (modelname, 'test_mse_avg')] = res_dict[data_name][modelname]['test_mse_avg']
        df1.loc[data_name, (modelname, 'test_mse_sem')] = res_dict[data_name][modelname]['test_mse_sem']
df1

Unnamed: 0_level_0,MTGB100_depth3,MTGB100_depth3,MTGB100_depth3,MTGB100_depth3,MTGB100_depth5,MTGB100_depth5,MTGB100_depth5,MTGB100_depth5,MTGB100_depth15,MTGB100_depth15,...,GB100_depth3,GB100_depth3,GB100_depth5,GB100_depth5,GB100_depth5,GB100_depth5,GB100_depth15,GB100_depth15,GB100_depth15,GB100_depth15
Unnamed: 0_level_1,train_mse_avg,train_mse_sem,test_mse_avg,test_mse_sem,train_mse_avg,train_mse_sem,test_mse_avg,test_mse_sem,train_mse_avg,train_mse_sem,...,test_mse_avg,test_mse_sem,train_mse_avg,train_mse_sem,test_mse_avg,test_mse_sem,train_mse_avg,train_mse_sem,test_mse_avg,test_mse_sem
synthetic,0.083931,0.000681,0.219799,0.005657,0.139907,0.001841,0.195868,0.004173,0.083931,0.000681,...,0.343076,0.008529,0.055167,0.000513,0.161541,0.001497,0.035901,0.000425,0.21832,0.005888
Mpg,0.054719,0.000587,0.149934,0.00987,0.039592,0.000474,0.151148,0.011506,0.028372,0.000301,...,0.150332,0.010652,0.0,0.0,0.158844,0.009072,0.0,0.0,0.221483,0.015625
abalone,0.212894,0.002198,0.509555,0.009982,0.140417,0.002158,0.531685,0.0082,0.062034,0.000356,...,0.527353,0.00977,0.040851,0.000462,0.573839,0.008586,0.0,0.0,0.697888,0.013066
automobile,0.032296,0.000794,0.241296,0.035566,0.021816,0.000714,0.235646,0.028721,0.01975,0.000498,...,0.249778,0.047066,0.0,0.0,0.265317,0.036579,0.0,0.0,0.342059,0.049637
cpu,0.022579,0.000434,0.129173,0.020397,0.01797,0.000436,0.144998,0.025602,0.018132,0.000473,...,0.111945,0.018215,0.003011,0.000297,0.148009,0.032531,0.003011,0.000297,0.163203,0.028709
liver,0.142805,0.003246,1.100917,0.065315,0.101732,0.001931,1.082714,0.068062,0.073203,0.00087,...,1.199874,0.055715,0.0,0.0,1.150269,0.051022,0.0,0.0,1.590716,0.111241
servo,0.050759,0.001104,0.126409,0.026612,0.0347,0.000585,0.129125,0.041708,0.031382,0.000263,...,0.107246,0.026742,0.0,0.0,0.112905,0.04294,0.0,0.0,0.157185,0.060996
student,0.142334,0.001735,0.943903,0.032178,0.092852,0.001313,1.025132,0.030029,0.075709,0.000495,...,1.064209,0.038763,0.0,0.0,1.08782,0.025594,0.0,0.0,1.452249,0.059186
wine_quality,0.313948,0.002161,0.619211,0.011387,0.139346,0.003209,0.591242,0.011476,0.057918,0.000343,...,0.619368,0.012779,0.06411,0.000665,0.598546,0.012706,0.0,0.0,0.636302,0.014833


In [None]:
# create a dataframe with the diffrent multi columns

mse_sem_list = []
for val in ['train_mse_avg', 'train_mse_sem', 'test_mse_avg', 'test_mse_sem']:
    mse_sem_list += [val]*len(MODELNAME_LIST)
model_list = MODELNAME_LIST * 4
multi_columns = pd.MultiIndex.from_tuples(list(zip(mse_sem_list,model_list)))
df2 = pd.DataFrame(columns=multi_columns, index=DATA_NAMES)

for data_name in DATA_NAMES:
    for modelname in MODELNAME_LIST:
        if res_dict[data_name][modelname] is None:
            continue
        df2.loc[data_name, ('train_mse_avg', modelname)] = res_dict[data_name][modelname]['train_ms_avg']
        df2.loc[data_name, ('train_mse_sem', modelname)] = res_dict[data_name][modelname]['train_mse_sem']
        df2.loc[data_name, ('test_mse_avg', modelname)] = res_dict[data_name][modelname]['test_mse_avg']
        df2.loc[data_name, ('test_mse_sem', modelname)] = res_dict[data_name][modelname]['test_mse_sem']
df2

Unnamed: 0_level_0,train_mse_avg,train_mse_avg,train_mse_avg,train_mse_avg,train_mse_avg,train_mse_avg,train_mse_sem,train_mse_sem,train_mse_sem,train_mse_sem,...,test_mse_avg,test_mse_avg,test_mse_avg,test_mse_avg,test_mse_sem,test_mse_sem,test_mse_sem,test_mse_sem,test_mse_sem,test_mse_sem
Unnamed: 0_level_1,MTGB100_depth3,MTGB100_depth5,MTGB100_depth15,GB100_depth3,GB100_depth5,GB100_depth15,MTGB100_depth3,MTGB100_depth5,MTGB100_depth15,GB100_depth3,...,MTGB100_depth15,GB100_depth3,GB100_depth5,GB100_depth15,MTGB100_depth3,MTGB100_depth5,MTGB100_depth15,GB100_depth3,GB100_depth5,GB100_depth15
synthetic,0.083931,0.139907,0.083931,0.258823,0.055167,0.035901,0.000681,0.001841,0.000681,0.006686,...,0.219799,0.343076,0.161541,0.21832,0.005657,0.004173,0.005657,0.008529,0.001497,0.005888
Mpg,0.054719,0.039592,0.028372,0.001448,0.0,0.0,0.000587,0.000474,0.000301,5.4e-05,...,0.153591,0.150332,0.158844,0.221483,0.00987,0.011506,0.010362,0.010652,0.009072,0.015625
abalone,0.212894,0.140417,0.062034,0.190312,0.040851,0.0,0.002198,0.002158,0.000356,0.001562,...,0.592919,0.527353,0.573839,0.697888,0.009982,0.0082,0.00931,0.00977,0.008586,0.013066
automobile,0.032296,0.021816,0.01975,1e-06,0.0,0.0,0.000794,0.000714,0.000498,0.0,...,0.267247,0.249778,0.265317,0.342059,0.035566,0.028721,0.037128,0.047066,0.036579,0.049637
cpu,0.022579,0.01797,0.018132,0.003075,0.003011,0.003011,0.000434,0.000436,0.000473,0.000295,...,0.145782,0.111945,0.148009,0.163203,0.020397,0.025602,0.024671,0.018215,0.032531,0.028709
liver,0.142805,0.101732,0.073203,0.018459,0.0,0.0,0.003246,0.001931,0.00087,0.000536,...,1.190832,1.199874,1.150269,1.590716,0.065315,0.068062,0.080992,0.055715,0.051022,0.111241
servo,0.050759,0.0347,0.031382,0.00288,0.0,0.0,0.001104,0.000585,0.000263,0.000271,...,0.144345,0.107246,0.112905,0.157185,0.026612,0.041708,0.051507,0.026742,0.04294,0.060996
student,0.142334,0.092852,0.075709,0.017898,0.0,0.0,0.001735,0.001313,0.000495,0.000734,...,1.109965,1.064209,1.08782,1.452249,0.032178,0.030029,0.033131,0.038763,0.025594,0.059186
wine_quality,0.313948,0.139346,0.057918,0.310661,0.06411,0.0,0.002161,0.003209,0.000343,0.001598,...,0.614622,0.619368,0.598546,0.636302,0.011387,0.011476,0.012382,0.012779,0.012706,0.014833


In [10]:
RESULT_DIR = ROOT_DIR / 'results'
df2.to_csv(RESULT_DIR / 'boosting_experiment_results.csv')