In [1]:
import glob
import os
import time
import warnings

import numpy as np
import pandas as pd
from joblib import Parallel, delayed

from datasets import *
from benchmarks import *

In [2]:
# --- Experiment size -------------------------------------------------------
reps = 10        # repetitions per method; each one uses its index as random_state
num_cores = 10   # number of cores for parallel computing
datanum = 10_000 # sample count handed to the data generators
d_seq = [20]     # input dimensions to sweep over

# --- Number of ridge terms -------------------------------------------------
nterms_max = 10                  # maximum number of ridge terms
slfn_nterms_list = [10, 30, 50]  # `nterms` candidates passed to the SLFN benchmark
aim_nterms_list = [3, 5, 7, 10]  # `nterms` candidates passed to the AIM benchmarks

# --- Spline / regularisation settings --------------------------------------
knot_num = 10
knot_dist = 'quantile'
reg_lambda = [0.2, 0.3, 0.4]
reg_gamma = "GCV"  # NOTE(review): not referenced by the cells visible here - confirm it is still needed

In [3]:
# Each simulation case is declared once, next to its generator, so the two
# parallel lists below cannot drift out of alignment.
_simulation_cases = [
    ('Case1', data_generator1),
    ('Case2', data_generator2),
    ('Case3', data_generator3),
    ('Case4', data_generator4),
    ('Case5', data_generator5),
    ('Case6', data_generator6),
]

# Display names and generator callables, index-aligned.
data_set_list = [case_name for case_name, _ in _simulation_cases]
data_loader_list = [case_loader for _, case_loader in _simulation_cases]

In [4]:
# Benchmark driver.
#
# For every input dimension in `d_seq` and every selected simulation case this
# cell runs `reps` seeded repetitions of each method (skipping any method
# whose stacked results are already cached as .npy under ./results/DiffDim/),
# summarises the repetitions as per-method mean/std, and appends one
# LaTeX-formatted table per reported metric to the `results_*_latex_list`
# lists consumed by the cells below.

results_train_latex_list = []
results_test_latex_list = []
results_tune_time_latex_list = [] # time including tuning cost
results_optparam_time_latex_list = [] # time for the optimal set of hyper-parameters
results_ortho_latex_list = []
results_nterms_latex_list = []

# One entry per benchmarked method, in both execution order and column order:
# (column label, `batch_parallel` method name, method-specific keyword
#  arguments, fallback n_jobs to retry with if the first attempt fails, or None).
benchmark_specs = [
    ("SeqStein-NonOrtho", "SeqStein",
     dict(knot_num=knot_num, reg_lambda=reg_lambda, ortho_enhance=[False], nterms=nterms_max), None),
    ("SeqStein-Ortho", "SeqStein",
     dict(knot_num=knot_num, reg_lambda=reg_lambda, ortho_enhance=[True], nterms=nterms_max), None),
    ("AIMLow", "AIM",
     dict(knot_dist=knot_dist, knot_num=knot_num, optlevel='low', nterms=aim_nterms_list), None),
    ("AIMHigh", "AIM",
     dict(knot_dist=knot_dist, knot_num=knot_num, optlevel='high', nterms=aim_nterms_list), None),
    ("SLFN", "SLFN", dict(nterms=slfn_nterms_list), None),
    # ExNN is retried with fewer workers when the full-width run fails.
    ("ExNN", "ExNN", dict(nterms=nterms_max), 3),
    ("MLP", "MLP", dict(), None),
]

# Row labels of the per-repetition statistics, in the order they are stacked.
stat_index = ["train_metric", "validation_metric", "test_metric",
              'alltune_time_cost', 'optparam_time_cost', 'ortho_measure',
              'nterms']

# (statistic reported, value written to the "Task" column, list receiving the table)
metric_specs = [
    ("train_metric", 'Regression', results_train_latex_list),
    ("test_metric", 'Regression', results_test_latex_list),
    ("alltune_time_cost", 'Time', results_tune_time_latex_list),
    ("optparam_time_cost", 'Time', results_optparam_time_latex_list),
    ("ortho_measure", 'Ortho', results_ortho_latex_list),
    ("nterms", 'Nterms', results_nterms_latex_list),
]


def _stat_path(folder, label):
    """Return the cache file for `label`, e.g. 'SeqStein-Ortho' -> folder + 'seqstein_ortho_stat.npy'."""
    return folder + label.lower().replace('-', '_') + '_stat.npy'


def _run_repetitions(method, data_loader, n_jobs, **method_kwargs):
    """Run `reps` repetitions of one method in parallel, seeding repetition i with random_state=i."""
    return Parallel(n_jobs=n_jobs)(
        delayed(batch_parallel)(method, data_loader, random_state=random_state, **method_kwargs)
        for random_state in range(reps))


def _ensure_cached(path, label, method, data_loader, fallback_jobs, **method_kwargs):
    """Compute and save the stacked repetition results for one method unless `path` already exists."""
    if os.path.exists(path):
        return
    try:
        runs = _run_repetitions(method, data_loader, num_cores, **method_kwargs)
    except Exception as err:
        # Only ordinary failures trigger the retry; KeyboardInterrupt and
        # SystemExit propagate instead of silently restarting a long run.
        if fallback_jobs is None:
            raise
        print(label, "failed with", num_cores, "workers:", repr(err),
              "- retrying with", fallback_jobs, "workers.")
        runs = _run_repetitions(method, data_loader, fallback_jobs, **method_kwargs)
    np.save(path, pd.concat(runs).values)


def _metric_row(stat, metric, task_label, dataset_name, n_samples, n_features):
    """Return the one-row table of `metric` across methods, tagged with data set metadata."""
    row = stat[[metric]].T
    row["Dataset"] = dataset_name
    row["#Samples"] = n_samples
    row["#Features"] = n_features
    row["Task"] = task_label
    return row.set_index("Dataset")


for d in d_seq:

    # Per-metric rows collected over the data sets of this dimension.
    rows_by_metric = {metric: [] for metric, _, _ in metric_specs}

    # only use the first data generator for illustration
    for vb, data_loader in zip(data_set_list[:1], data_loader_list[:1]):

        # The data is generated here only to report its shape in the tables.
        _, _, train_x, test_x, *_unused = data_loader(datanum=datanum, d=d, rand_seed=0)
        print('---------------------- Dimension:',d,'DataSet:',vb,'----------------------')

        folder = "./results/DiffDim/" + str(d) + '/' + vb + "/"
        os.makedirs(folder, exist_ok=True)

        # `start` is taken once, so the printed time costs are cumulative.
        start = time.time()
        for label, method, method_kwargs, fallback_jobs in benchmark_specs:
            _ensure_cached(_stat_path(folder, label), label, method, data_loader, fallback_jobs,
                           data_type='simulation', d=d, datanum=datanum, **method_kwargs)
            print(label.replace('-', ' ') + " Finished!", "Time Cost ",
                  np.round(time.time() - start, 2), " Seconds!")

        # Always read back from disk so fresh and cached runs follow the same path.
        method_stats = {label: np.load(_stat_path(folder, label))
                        for label, _, _, _ in benchmark_specs}
        print('Load the Savings.')

        with warnings.catch_warnings():
            print('Generate the statistics.')
            # nanmean/nanstd warn on all-NaN columns (e.g. statistics a method does not report).
            warnings.simplefilter("ignore", category=RuntimeWarning)
            summary = {}
            for label, values in method_stats.items():
                summary[label + "-Mean"] = np.nanmean(np.vstack(values), 0)
            for label, values in method_stats.items():
                summary[label + "-Std"] = np.nanstd(np.vstack(values), 0)
            stat = pd.DataFrame(summary, index=stat_index).T

        n_samples = train_x.shape[0] + test_x.shape[0]
        n_features = train_x.shape[1]
        for metric, task_label, _ in metric_specs:
            rows_by_metric[metric].append(
                _metric_row(stat, metric, task_label, vb, n_samples, n_features))
        print('Generate and append the results for data sets.')

    # Format every table first, then publish, so a formatting failure cannot
    # leave the result lists partially updated for this dimension.
    latex_by_metric = {metric: gen_reg_latex_results(pd.concat(rows_by_metric[metric]), 3)
                       for metric, _, _ in metric_specs}
    for metric, _, latex_list in metric_specs:
        latex_list.append(latex_by_metric[metric])
    print('Generate and append the results for dimension.')

---------------------- Dimension: 20 DataSet: Case1 ----------------------
SeqStein NonOrtho Finished! Time Cost  15.39  Seconds!
SeqStein Ortho Finished! Time Cost  24.98  Seconds!
AIMLow Finished! Time Cost  37.62  Seconds!
AIMHigh Finished! Time Cost  145.4  Seconds!
SLFN Finished! Time Cost  281.63  Seconds!
ExNN Finished! Time Cost  5409.4  Seconds!
MLP Finished! Time Cost  5428.1  Seconds!
Load the Savings.
Generate the statistics.
Generate and append the results for data sets.
Generate and append the results for dimension.


In [5]:
# Training error: one row per (#Features, Dataset), saved to CSV and displayed.
results_train_df = (
    pd.concat(results_train_latex_list)
    .reset_index()
    .set_index(['#Features','Dataset'])
)
results_train_df.to_csv('./results/test_train_diffd.csv')
results_train_df

Unnamed: 0_level_0,Unnamed: 1_level_0,SeqStein-NonOrtho,SeqStein-Ortho,AIMLow,AIMHigh,SLFN,MLP,ExNN,#Samples
#Features,Dataset,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
20,Case1,1.099$\pm$0.010,1.136$\pm$0.016,1.361$\pm$0.166,$\mathbf{0.994}$$\pm$0.006,1.021$\pm$0.012,1.012$\pm$0.018,1.004$\pm$0.008,10000


In [6]:
# Test error: one row per (#Features, Dataset), saved to CSV and displayed.
results_test_df = (
    pd.concat(results_test_latex_list)
    .reset_index()
    .set_index(['#Features','Dataset'])
)
results_test_df.to_csv('./results/test_test_diffd.csv')
results_test_df

Unnamed: 0_level_0,Unnamed: 1_level_0,SeqStein-NonOrtho,SeqStein-Ortho,AIMLow,AIMHigh,SLFN,MLP,ExNN,#Samples
#Features,Dataset,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
20,Case1,1.104$\pm$0.021,1.135$\pm$0.023,1.376$\pm$0.178,1.017$\pm$0.010,1.049$\pm$0.026,1.036$\pm$0.011,$\mathbf{1.007}$$\pm$0.021,10000


In [7]:
# Orthogonality measure: one row per (#Features, Dataset), saved to CSV and displayed.
results_ortho_df = (
    pd.concat(results_ortho_latex_list)
    .reset_index()
    .set_index(['#Features','Dataset'])
)
results_ortho_df.to_csv('./results/test_ortho_diffd.csv')
results_ortho_df

Unnamed: 0_level_0,Unnamed: 1_level_0,SeqStein-NonOrtho,SeqStein-Ortho,AIMLow,AIMHigh,SLFN,MLP,ExNN,#Samples
#Features,Dataset,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
20,Case1,0.341$\pm$0.084,0.081$\pm$0.014,0.241$\pm$0.057,0.287$\pm$0.053,0.295$\pm$0.049,nan$\pm$nan,$\mathbf{0.002}$$\pm$0.001,10000


In [8]:
# Time including the tuning cost: one row per (#Features, Dataset), saved to CSV and displayed.
results_tune_time_df = (
    pd.concat(results_tune_time_latex_list)
    .reset_index()
    .set_index(['#Features','Dataset'])
)
results_tune_time_df.to_csv('./results/test_tune_time_diffd.csv')
results_tune_time_df

Unnamed: 0_level_0,Unnamed: 1_level_0,SeqStein-NonOrtho,SeqStein-Ortho,AIMLow,AIMHigh,SLFN,MLP,ExNN,#Samples
#Features,Dataset,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
20,Case1,8.173$\pm$1.517,$\mathbf{7.152}$$\pm$1.679,9.959$\pm$1.227,93.785$\pm$9.254,124.351$\pm$10.449,10.223$\pm$2.735,1281.998$\pm$249.476,10000


In [9]:
# Time for the optimal set of hyper-parameters: one row per (#Features, Dataset),
# saved to CSV and displayed.
results_optparam_time_df = (
    pd.concat(results_optparam_time_latex_list)
    .reset_index()
    .set_index(['#Features','Dataset'])
)
results_optparam_time_df.to_csv('./results/test_optparam_time_diffd.csv')
results_optparam_time_df

Unnamed: 0_level_0,Unnamed: 1_level_0,SeqStein-NonOrtho,SeqStein-Ortho,AIMLow,AIMHigh,SLFN,MLP,ExNN,#Samples
#Features,Dataset,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
20,Case1,8.173$\pm$1.517,7.152$\pm$1.679,$\mathbf{2.751}$$\pm$0.987,46.898$\pm$13.529,36.599$\pm$8.035,10.223$\pm$2.735,195.649$\pm$52.485,10000


In [10]:
# Number of ridge terms: one row per (#Features, Dataset), saved to CSV and displayed.
results_nterms_df = (
    pd.concat(results_nterms_latex_list)
    .reset_index()
    .set_index(['#Features','Dataset'])
)
results_nterms_df.to_csv('./results/test_nterms_diffd.csv')
results_nterms_df

Unnamed: 0_level_0,Unnamed: 1_level_0,SeqStein-NonOrtho,SeqStein-Ortho,AIMLow,AIMHigh,SLFN,MLP,ExNN,#Samples
#Features,Dataset,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
20,Case1,7.800$\pm$1.833,6.900$\pm$2.256,7.300$\pm$2.900,9.400$\pm$1.200,14.000$\pm$8.000,nan$\pm$nan,$\mathbf{4.300}$$\pm$0.900,10000
