In [None]:
#The target variable for base-learning is assumed to be the last column.
#Date-type features cannot be handled automatically.
#One-hot encoding cannot be reversed automatically.
#Int-type encodings cannot be converted to enum-type encodings automatically.
#The first line of each dataset is assumed to be the header.

In [None]:
import csv
import pandas as pd
import numpy as np
import os
import arff
import re
import codecs
import sklearn.tree as tr
from sklearn import preprocessing
from keras.models import load_model
import joblib
import h2o
import time

In [None]:
################ Import data & Set algorithm ################

In [None]:
#Master path: every other path used in this notebook is derived from this folder.
path_to_dataset_folder = '/home/User/Desktop/example_system'

#Set algorithm: selects which base-learner's meta-models are loaded and used.
algorithm = 'RF' #Options: 'GLM', 'RF', 'XGboost'.

print_model_params = True #Used to print info of the used meta-models.

#Set h2o cluster options.
h2o_mem = 64 #NOTE(review): unit is not visible in this part of the file (presumably GB) - confirm.
h2o_threads =-1 #-1: max available cores.

In [None]:
# All working locations live under the master folder; most of them under its DATA sub-folder.
data_root = '/'.join([path_to_dataset_folder, 'DATA'])

# Input datasets to be processed.
new_dataset_path = '/'.join([path_to_dataset_folder, 'test_dataset'])
# Intermediate artefacts.
first_nan_treat_path = '/'.join([data_root, '1st_NaN_Treatment'])
CSV_path = '/'.join([data_root, 'CSVs'])
# Meta-features are extracted from the NaN-treated datasets.
target_test_path = first_nan_treat_path
tree_meta_feature_path = '/'.join([data_root, 'tree_metafeatures_for_test_CSVs'])
manual_meta_feature_path = '/'.join([data_root, 'manual_metafeatures_for_test_CSVs'])
metaModel_data_path = '/'.join([data_root, 'metaModel_Datasets'])
# Stored meta-models, their standardisation statistics and the final results.
model_path = '/'.join([path_to_dataset_folder, 'final_models'])
regularization_path = '/'.join([path_to_dataset_folder, 'regularization'])
save_to_path = '/'.join([path_to_dataset_folder, 'final_results_base_learning'])

In [None]:
################ Create paths ################

In [None]:
# Create every working directory up front.
# os.makedirs(..., exist_ok=True) also creates missing parent folders and does
# nothing when the directory already exists, so this cell can be re-run safely
# and does not depend on the order in which the directories are created.
for meth in ['drop', 'mean', 'median']:
    os.makedirs(tree_meta_feature_path + '/' + meth, exist_ok=True)
    os.makedirs(manual_meta_feature_path + '/' + meth, exist_ok=True)
for al in ['GLM', 'RF', 'XGboost']:
    os.makedirs(save_to_path + '/' + al, exist_ok=True)
for directory in [first_nan_treat_path, CSV_path, target_test_path, metaModel_data_path]:
    os.makedirs(directory, exist_ok=True)

In [None]:
################ Clear last Data ################

In [None]:
def clear_files(path):
    """Delete every file directly inside ``path``.

    Real sub-directories are skipped instead of aborting the whole cleanup:
    ``os.remove`` raises when it is handed a directory. Symbolic links are
    still removed (only the link itself, never its target).

    Parameters
    ----------
    path : str
        Directory whose files are removed. Must exist.
    """
    for entry in os.listdir(path):
        full_path = os.path.join(path, entry)
        if os.path.isdir(full_path) and not os.path.islink(full_path):
            continue
        os.remove(full_path)

In [None]:
# Empty every working directory so artefacts of a previous run cannot leak into this one.
# 1) Meta-feature folders hold one sub-folder per NaN-handling method.
for path in [tree_meta_feature_path, manual_meta_feature_path]:
    for method in ['drop', 'mean', 'median']:
        clear_files(path + '/' + method)
# 2) The results folder holds one sub-folder per base-learning algorithm.
for al in ['GLM', 'RF', 'XGboost']:
    clear_files(save_to_path + '/' + al)
# 3) The remaining folders are flat.
for path in [metaModel_data_path, CSV_path, first_nan_treat_path]:
    clear_files(path)

In [None]:
################ Preprocessing ################

In [None]:
#Preprocessing functions
def checkTargetDir(full_name):
    """Return the ``DATA/CSVs`` directory that belongs to ``full_name``, creating it if needed.

    The directory is located two levels above the file name, i.e. for
    ``<root>/<folder>/<file>`` the result is ``<root>/DATA/CSVs``.
    ``<root>/DATA`` itself must already exist.
    """
    root_parts = full_name.split('/')[:-2]
    writeDir = '/'.join(root_parts) + '/DATA/CSVs'
    if os.path.isdir(writeDir):
        return writeDir
    os.mkdir(writeDir)
    return writeDir

def tabAndWhite(full_name):
    """Write a whitespace-normalised copy of a text file and return its path.

    Every line is stripped at both ends and each run of tabs/spaces is
    collapsed into a single space. The copy is written next to the original
    as ``v2_<original file name>`` (UTF-8), with lines joined by ``'\\n'`` and
    no trailing newline.

    Parameters
    ----------
    full_name : str
        '/'-separated path of the UTF-8 encoded source file.

    Returns
    -------
    str
        Path of the normalised copy.
    """
    path_parts = full_name.split('/')
    print('Dealing with tabs and whitespaces in: ' + path_parts[-1])
    with codecs.open(full_name, 'r', 'utf-8') as curFile:
        data = curFile.readlines()
    # One pass: a run of tabs and/or spaces becomes exactly one space.
    cleaned = [re.sub(r'[ \t]+', ' ', x.strip()) for x in data]
    tempFile = '/'.join(path_parts[0:-1]) + '/v2_' + path_parts[-1]
    with codecs.open(tempFile, 'w', 'utf-8') as curFile:
        # Write the whole text at once (the original wrote it character by character).
        curFile.write('\n'.join(cleaned))
    return tempFile

def process_csv_txt_data(original_name):
    """Convert a delimited text file (txt/csv/data/dat) into a CSV in the ``DATA/CSVs`` folder.

    The file is first whitespace-normalised into a temporary ``v2_`` copy
    (see ``tabAndWhite``), the delimiter is sniffed by pandas (``sep=None``),
    and the result is written as ``<name without extension>.csv``.

    Parameters
    ----------
    original_name : str
        '/'-separated path of the source file; its first line is the header.
    """
    filename = original_name.split('/')[-1]
    print('Processing ' + filename)
    full_name = tabAndWhite(original_name)
    try:
        with codecs.open(full_name, 'r', 'utf-8') as curFile:
            # Delimiter sniffing is only supported by the python engine; requesting it
            # explicitly avoids the implicit fallback from the C engine.
            df = pd.read_table(curFile, header = 0, sep = None, engine = 'python')
    finally:
        # Always remove the temporary copy, even when parsing fails, so it is not
        # left behind in the input folder.
        os.remove(full_name)
    writeDir = checkTargetDir(full_name)
    df.to_csv(os.path.join(writeDir,'.'.join(filename.split('.')[0:-1]) +'.csv'),index =False)
    print('----------------------------------------------------------------------')
    
def process_excel(full_name):
    """Convert the Excel file ``full_name`` into a CSV in the ``DATA/CSVs`` folder.

    The output keeps the source file name with its extension replaced by ``.csv``.
    """
    filename = full_name.split('/')[-1]
    print('Processing ' + filename)
    stem = '.'.join(filename.split('.')[0:-1])
    df = pd.read_excel(full_name)
    target = os.path.join(checkTargetDir(full_name), stem + '.csv')
    df.to_csv(target, index=False)
    print('----------------------------------------------------------------------')

def process_arff(full_name):
    """Convert the ARFF file ``full_name`` into a CSV in the ``DATA/CSVs`` folder.

    Only the ``'data'`` section of the loaded ARFF is kept, so the resulting
    columns carry pandas' default integer labels rather than attribute names.

    Parameters
    ----------
    full_name : str
        '/'-separated path of the UTF-8 encoded ARFF file.
    """
    # Fix: derive the file name locally; the original relied on a global
    # ``filename`` that only exists while the preprocessing loop is running.
    filename = full_name.split('/')[-1]
    print('Processing ' + filename)
    with codecs.open(full_name,encoding = 'utf-8') as curFile:
        df = arff.load(curFile)
    df = pd.DataFrame(df['data'])
    writeDir = checkTargetDir(full_name)
    df.to_csv(os.path.join(writeDir,'.'.join(filename.split('.')[0:-1]) +'.csv'),index =False)
    print('----------------------------------------------------------------------')

In [None]:
#Preprocessing
#Creating "files" as a one time assignment of listdir so as to
#avoid processing later-created-temp files as well.
# Snapshot the folder content once, so temporary files created while
# processing are not picked up by the loop.
files = os.listdir(new_dataset_path)

# Map every supported file suffix to the function that converts it to CSV.
converters = {}
for text_suffix in ["txt","csv","data","dat"]:
    converters[text_suffix] = process_csv_txt_data
for excel_suffix in ["xls", "xlsx"]:
    converters[excel_suffix] = process_excel
converters["arff"] = process_arff

for filename in files:
    full_name = new_dataset_path +'/' + filename
    suf = filename.split('.')[-1]
    convert = converters.get(suf)
    if convert is None:
        print('Unsupported file: ' + filename + ' ...ignoring...')
        continue
    convert(full_name)

In [None]:
def linesWithNan(df):
    """Return how many rows of ``df`` contain at least one missing value."""
    has_nan = df.isna().sum(axis=1) > 0
    return int(has_nan.sum())

#Performs the 1st NaN treatment of the dataset. Any feature with at least 20% missing
#values is discarded. After all such features have been discarded, if the lines with at least one
#missing value are at most 10% of the original number of lines, they are dropped. The resulting
#datasets are saved in the "1st_NaN_Treatment" folder of the "../DATAsets/DATA" dir.
def first_nan_treatment(path,exclusions = None, feature_threshold = 0.20,
                        lines_with_nan_threshold = 0.10, visual_info = True):
    """Apply the first NaN treatment to every CSV in ``path``.

    For each file:
      1. every feature whose share of missing values is >= ``feature_threshold``
         is dropped;
      2. if the rows that still contain a missing value are at most
         ``lines_with_nan_threshold`` of the original row count, they are dropped.
    The result is written to the global ``first_nan_treat_path`` under the same
    file name.

    Parameters
    ----------
    path : str
        Folder holding the comma-separated input files (first line = header).
    exclusions : str or iterable of str, optional
        File name(s) to skip. ``None`` skips nothing.
    feature_threshold : float
        Minimum share of missing values for a feature to be discarded.
    lines_with_nan_threshold : float
        Maximum share of rows with missing values for those rows to be dropped.
    visual_info : bool
        Print a summary for every file that was changed or that still has many
        rows with missing values.
    """
    # Handle the exclusions explicitly instead of handing ``None`` to np.setdiff1d.
    if exclusions is None:
        excluded = set()
    elif isinstance(exclusions, str):
        excluded = {exclusions}
    else:
        excluded = set(exclusions)

    for f in os.listdir(path):
        if f in excluded:
            continue
        df = pd.read_table(path + '/' + f, header = 0, sep = ',')
        lines = df.shape[0]
        features = df.shape[1]

        #Nans on Feature basis
        nanPerFeature = df.isna().sum(axis=0)
        indecies_to_drop = [i for i, npf in enumerate(nanPerFeature)
                            if npf / lines >= feature_threshold]
        df.drop(df.columns[indecies_to_drop],axis=1,inplace=True)

        #Nans on Line basis (the ratio is relative to the original number of lines)
        lines_with_nan = linesWithNan(df)
        if (lines_with_nan / lines) <= lines_with_nan_threshold:
            df.dropna(axis = 0, how='any',inplace=True)
        new_lines_with_nan = linesWithNan(df)
        if visual_info:
            if features != df.shape[1] or lines != df.shape[0] or new_lines_with_nan/df.shape[0] >=lines_with_nan_threshold:
                print('File:' + f)
                print('Lines:' + str(lines) + ' --> ' +str(df.shape[0]))
                print('Features: ' +str(features) + ' --> '+str(df.shape[1]))
                print('Lines with NaNs: ' + str(lines_with_nan) + ' ('+
                      str(round(lines_with_nan / lines * 10000)/100) + '%) --> ' +
                      str(new_lines_with_nan)+ '('+ str(round(new_lines_with_nan /
                                                              df.shape[0] * 10000)/100) + '%)')
                print('NaNs per Feature:\n' + str(df.isna().sum(axis=0)))
                print('---------------------------------------------')
        df.to_csv(first_nan_treat_path + '/' + f, index= False)

In [None]:
#Perform the NaN treatment on every CSV found in CSV_path. The treated copies are
#written by first_nan_treatment to first_nan_treat_path; a per-file summary is printed.
first_nan_treatment(CSV_path, visual_info=True)

In [None]:
#Re-open every dataset produced by the 1st NaN treatment and remove the columns that hold
#a single unique value, keeping the names of the remaining columns. Files are overwritten in place.
for file in os.listdir(first_nan_treat_path):
    csv_file = first_nan_treat_path + '/'+ file
    df = pd.read_table(csv_file,sep=',')
    drops = []
    for feat in df.columns:
        if len(df.loc[:,feat].unique()) != 1:
            continue
        if not drops:
            #First constant column of this file: show the data before anything is dropped.
            print('Processing file: ' + file)
            print(df.head())
        print('Found one! It is column: ' + feat)
        print('Unique value = ', df.loc[0,feat])
        drops.append(feat)
    #A first column named '0' marks a header made of plain running numbers;
    #such a header is renumbered after the drop so it stays 0..n-1.
    flag = df.columns[0] == '0'
    df.drop(drops, axis = 1, inplace = True)
    if flag:
        df.columns = range(df.shape[1])
    if drops:
        print(df.head())
    df.to_csv(csv_file,sep=',',index= False)

In [None]:
################ Extraction of Metafeatures ################

In [None]:
################ Tree metafeatures ################

In [None]:
#Builds a dictionary that maps each leaf node's id to its branch length, so there are as many key:value
#entries as there are leaves. That dictionary is the global variable "branch_lengths".
branch_lengths = {}
def get_branches_len(input_tree_, start_node, counter):
    """Record the branch length of every leaf below ``start_node`` in the global ``branch_lengths``.

    ``counter`` is the length already accumulated on the way to ``start_node``.
    Leaves are visited left subtree first; a node counts as a leaf when its
    left child id is -1.
    """
    global branch_lengths
    pending = [(start_node, counter)]
    while pending:
        node, length = pending.pop()
        left_child = input_tree_.children_left[node]
        if left_child == -1:
            branch_lengths[node] = length
            continue
        # Push the right child first so that the left child is handled first.
        pending.append((input_tree_.children_right[node], length + 1))
        pending.append((left_child, length + 1))
        
#Returns a list of ints, containing the node ids of the leaves.
def find_leaves(input_tree_):
    """Return the ids of all leaf nodes (nodes whose left child id is -1), in ascending order."""
    return [node for node in range(input_tree_.node_count)
            if input_tree_.children_left[node] == -1]

#Returns a dictionary with level-id int keys maped to the nodes on that level. e.g. 0:0, 1:[1,2] for a simple 2-level tree
#root + 2 children nodes.
def get_levels(input_tree_):
    """Return a dict that maps each depth (0 = root) to the list of node ids found at that depth.

    Within one level, all left children are listed before all right children.
    """
    lefts = input_tree_.children_left
    rights = input_tree_.children_right
    levels = {0:[0]}
    for depth in range(1, input_tree_.max_depth + 1):
        parents = levels[depth - 1]
        next_level = [lefts[p] for p in parents if lefts[p] != -1]
        next_level += [rights[p] for p in parents if rights[p] != -1]
        levels[depth] = next_level
    return levels

#Returns a dictionary with the features as keys, and their frequency of appearance as values.
def get_feature_freq(input_tree_):
    """Count, per feature id, how many internal (non-leaf) nodes use that feature."""
    freqs = {}
    for node in range(input_tree_.node_count):
        if input_tree_.children_left[node] == -1:
            continue  # leaves do not use a feature
        used_feature = input_tree_.feature[node]
        freqs[used_feature] = freqs.get(used_feature, 0) + 1
    return freqs

#The next 2 functions' code was found at: https://www.geeksforgeeks.org/diameter-of-a-binary-tree-in-on-a-new-method/
#and tweaked slightly to match the needs of this work.

# Function to find height of a tree  
def height(tree_, root, ans):
    """Return the height (in nodes) of the subtree rooted at ``root``.

    As a side effect ``ans[0]`` is raised to the largest value of
    ``left_height + right_height + 1`` seen at any visited node, i.e. the
    diameter of the subtree measured in nodes. A node id of -1 denotes
    "no node" and has height 0.
    """
    if root == -1:
        return 0

    left_height = height(tree_, tree_.children_left[root], ans)
    right_height = height(tree_, tree_.children_right[root], ans)

    # Longest path that passes through this node.
    through_root = 1 + left_height + right_height
    if through_root > ans[0]:
        ans[0] = through_root

    taller = left_height if left_height >= right_height else right_height
    return 1 + taller
  
# Computes the diameter of binary  
# tree with given root.  
def diameter(tree_, root):
    """Return the diameter, counted in nodes, of the binary tree rooted at ``root``.

    A negative ``root`` yields 0.
    """
    if root < 0:
        return 0
    best = [-999999999999]  # very small start value; updated in place by height()
    height(tree_, root, best)
    return best[0]

#Returns the 14-element feature vector (list) of the given dataFrame (df). Also if "visual_tree" is True,
#then the tree is ploted, default value: False.
def metafeatures(df, visual_tree = False):
    """Fit a regression tree on ``df`` and return its 14 structural meta-features.

    All columns but the last are used as inputs and the last column as target.

    Parameters
    ----------
    df : pandas.DataFrame
        Fully numeric frame without missing values (see ``tree_ready``).
    visual_tree : bool
        Plot the fitted tree when True.

    Returns
    -------
    list
        [diameter, height, node count, leaf count,
         max/mean/std of nodes per level,
         min/mean/std of branch length,
         max/min/mean/std of feature usage frequency]
    """
    meta_vector = []
    x = df.loc[:,df.columns[0:-1]]
    y = df.loc[:,df.columns[-1]]
    regr = tr.DecisionTreeRegressor(random_state = 8328, min_impurity_decrease = 1e-06)
    regr.fit(x,y)
    if visual_tree:
        tr.plot_tree(regr)

    #1 Tree width (diameter)
    meta_vector.append(diameter(regr.tree_,0))
    #2 Tree height
    meta_vector.append(regr.tree_.max_depth)
    #3 Total number of nodes
    meta_vector.append(regr.tree_.node_count)
    #4 Total number of leaves
    meta_vector.append(len(find_leaves(regr.tree_)))

    levels = get_levels(regr.tree_)
    nodes_per_level = [len(levels[i]) for i in range(len(levels))]
    #5 Maximun nodes per level
    meta_vector.append(np.max(nodes_per_level))
    #6 Mean number of nodes per level
    meta_vector.append(np.mean(nodes_per_level))
    #7 Standard deviation of nodes per level
    meta_vector.append(np.std(nodes_per_level))

    global branch_lengths
    branch_lengths = {} #(re)initialise the global variable to hold the branches and their lengths
    get_branches_len(regr.tree_,0,0)
    length_per_branch = list(branch_lengths.values())
    #(8) Longest branch's length| Not used because it's always the same as #2 Tree height
    #8 Shortest branch's length
    meta_vector.append(np.min(length_per_branch))
    #9 Mean length of branches
    meta_vector.append(np.mean(length_per_branch))
    #10 Standard deviation of length of branches
    meta_vector.append(np.std(length_per_branch))

    freqs = list(get_feature_freq(regr.tree_).values())
    if len(freqs) == 0:
        # A tree without any split (root only) uses no feature at all; report
        # zero frequencies instead of letting np.max/np.min fail on an empty list.
        freqs = [0]
    #11 Maximum frequency of feature appearance
    meta_vector.append(np.max(freqs))
    #12 Minimum frequency of feature appearance
    meta_vector.append(np.min(freqs))
    #13 Mean frequency of feature appearance
    meta_vector.append(np.mean(freqs))
    #14 Standard deviation of frequency of feature appearance
    meta_vector.append(np.std(freqs))

    return meta_vector

#This function prepares the dataframe for metafeature extraction.
#First, a missing-value imputation is performed according to the "method" argument:
# drop = all instances with at least one missing value are dropped,
# mean = missing values are assigned the mean value of each numerical feature,
# median = missing values are assigned the median value of each numerical feature.
#For "mean" and "median", missing values in categorical features are filled with the
#mode of each feature, i.e. the value most frequently seen.
#Returns the processed dataframe and the NaNs-per-line metric.
def tree_ready(df, method = 'drop'):
    """Impute missing values and label-encode categorical columns of ``df`` (in place).

    Parameters
    ----------
    df : pandas.DataFrame
        Dataset to prepare; it is modified in place and also returned.
    method : {'drop', 'mean', 'median'}
        'drop' removes every row with a missing value. 'mean'/'median' fill
        numerical columns with that statistic and object-typed columns with
        their mode.

    Returns
    -------
    (pandas.DataFrame, float) or (None, None)
        The prepared frame and the mean number of NaNs per row measured before
        imputation. ``(None, None)`` is returned when the data contains missing
        values and ``method`` is not recognised.
    """
    total_nans = df.isna().sum().sum()
    mean_nans_per_line = total_nans / df.shape[0]
    # Pair every column with its dtype directly; integer look-ups such as
    # ``df.dtypes[i]`` are label-based on a label-indexed Series in current pandas.
    feats_to_encode = [feat for feat, dtype in zip(df.columns, df.dtypes) if dtype == 'object']
    if total_nans > 0:
        if method == 'drop':
            df.dropna(axis = 0, how = 'any', inplace = True)
        elif method in ('mean', 'median'):
            cols = [col for col in df.columns if col not in feats_to_encode]
            if method == 'mean':
                numeric_fill = {col: df.loc[:,col].mean() for col in cols}
            else:
                numeric_fill = {col: df.loc[:,col].median() for col in cols}
            df.fillna(numeric_fill, inplace = True)
            # Categorical features get their most frequent value.
            df.fillna({col: df.loc[:,col].mode().iloc[0] for col in feats_to_encode}, inplace = True)
        else:
            print('Non identifiable method provided!')
            return None, None

    for x in feats_to_encode:
        le = preprocessing.LabelEncoder()
        df.loc[:,x] = le.fit_transform(df.loc[:,x])
    return df , mean_nans_per_line

In [None]:
#This creates and processes the tree to get the metafeatures. The metafeature vector is saved in
#a txt file located in "../DATAsets/DATA/tree_metafeatures_for_test_CSVs" dir.
#One file is written per dataset and NaN-handling method; the last value is the NaNs-per-line metric.
datasets = os.listdir(target_test_path)
start_time = time.time()
for file in datasets:
    print('proccessing file: ' + file)
    print('Algorithm:',algorithm)
    for method in ['drop','mean','median']:
        out_file = tree_meta_feature_path + '/' + method + '/' + file
        if os.path.isfile(out_file):
            continue
        print('method: ' + method)
        df = pd.read_table(target_test_path + '/'+ file,sep=',')
        df, MNpL = tree_ready(df, method)
        # Compute the whole vector BEFORE opening the output file: if the extraction
        # fails, no empty file is left behind that the isfile() guard above would
        # silently accept on the next run.
        values = [str(x) for x in metafeatures(df)]
        values.append(str(MNpL))
        with open(out_file,'w' ) as f:
            f.write(','.join(values))
mfe_time = time.time() - start_time

In [None]:
################ Manual metafeatures ################

In [None]:
#Definitions of functions for >>>MANUAL<<< computation of general and statistical
#type metafeatures of the given dataset.

def append_mean_and_std(vec, values):
    """Append ``np.mean(values)`` and ``np.std(values)`` to the list ``vec`` (in place)."""
    vec.append(np.mean(values))
    vec.append(np.std(values))

def manual_mfe_extraction(data):
    """Compute 30 general and statistical meta-features of a dataset.

    All columns of ``data`` except the last one (the target) are analysed.

    Parameters
    ----------
    data : pandas.DataFrame
        Dataset with the target in the last column. Object-typed columns are
        treated as categorical, all others as numeric.

    Returns
    -------
    list
        [nr_inst, nr_attr, inst/attr, nr_cat, nr_num, cat/num,
         mean & std of: classes per categorical feature, |pairwise correlation|,
         |pairwise covariance|, kurtosis, mean absolute deviation, max, mean,
         median, min, std, skewness, variance].

    Raises
    ------
    ZeroDivisionError
        If there are no features at all or no numeric features.
    """
    meta_vector = []
    # Fix: use the ``data`` argument; the original read a global ``df`` instead.
    x = data.loc[:,data.columns[0:-1]]

    # ------------ General metafeatures ------------
    # 1. number of instances
    nr_inst = x.shape[0]
    meta_vector.append(nr_inst)
    # 2. number of features
    nr_attr = x.shape[1]
    meta_vector.append(nr_attr)
    # 3. ratio between instances and features
    inst_to_attr = nr_inst / nr_attr
    meta_vector.append(inst_to_attr)

    # Number of distinct values of every categorical (object-typed) feature.
    # Columns and dtypes are paired directly instead of using integer look-ups
    # on label-indexed Series.
    uniques_values_per_feature = x.apply(pd.Series.nunique)
    classes = [n_unique for dtype, n_unique in zip(x.dtypes, uniques_values_per_feature)
               if dtype == 'object']
    # 4. number of categorical features
    nr_cat = len(classes)
    # 5. number of numeric features
    nr_num = nr_attr - nr_cat
    meta_vector.append(nr_cat)
    meta_vector.append(nr_num)
    # 6. categorical to numerical ratio
    cat_to_num = nr_cat / nr_num
    meta_vector.append(cat_to_num)
    # 7. & 8. number of distinct classes (mean and std)
    if len(classes) == 0:
        classes = [0]
    append_mean_and_std(meta_vector, classes)

    # ------------ Statistical metafeatures ------------
    # Restrict correlation/covariance/MAD to numeric columns explicitly; recent
    # pandas versions no longer drop non-numeric columns silently.
    numeric = x.select_dtypes(include = ['number', 'bool'])
    cor_matrix = numeric.corr(method = 'pearson').abs()
    cov_matrix = numeric.cov().abs()
    cor_flat = []
    cov_flat = []
    nr_numeric_cols = cor_matrix.shape[0]
    for i in range(nr_numeric_cols):
        for j in range(i + 1, nr_numeric_cols):   # upper triangle only
            cor_flat.append(cor_matrix.iloc[i,j])
            cov_flat.append(cov_matrix.iloc[i,j])
    if len(cor_flat) == 0: #if there are less than 2 numeric features in dataset.
        cor_flat = [0]
        cov_flat = [0]
    # 9. & 10. absolute pairwise correlations (mean and std)
    append_mean_and_std(meta_vector, cor_flat)
    # 11. & 12. absolute pairwise covariances (mean and std)
    append_mean_and_std(meta_vector, cov_flat)
    # 13. & 14. kurtosis of features (mean and std)
    kurt = x.kurtosis(axis = 0, numeric_only = True)
    append_mean_and_std(meta_vector, kurt)
    # 15. & 16. mean absolute deviation of features (mean and std)
    # DataFrame.mad() was removed in pandas 2.0; this is the same statistic
    # (mean of the absolute deviations from the column mean).
    mad = (numeric - numeric.mean(axis = 0)).abs().mean(axis = 0)
    append_mean_and_std(meta_vector, mad)
    # 17. & 18. maximum value of features (mean and std)
    maxes = x.max(axis = 0, numeric_only = True)
    append_mean_and_std(meta_vector, maxes)
    # 19. & 20. mean value of features (mean and std)
    means = x.mean(axis = 0, numeric_only = True)
    append_mean_and_std(meta_vector, means)
    # 21. & 22. median of features (mean and std)
    medians = x.median(axis = 0, numeric_only = True)
    append_mean_and_std(meta_vector, medians)
    # 23. & 24. minimum value of features (mean and std)
    mins = x.min(axis = 0, numeric_only = True)
    append_mean_and_std(meta_vector, mins)
    # 25. & 26. standard deviation of value of features (mean and std)
    stds = x.std(axis = 0, numeric_only = True)
    append_mean_and_std(meta_vector, stds)
    # 27. & 28. skewness of features (mean and std)
    skews = x.skew(axis = 0, numeric_only = True)
    append_mean_and_std(meta_vector, skews)
    # 29. & 30. variance of features (mean and std)
    variances = x.var(axis = 0, numeric_only = True)
    append_mean_and_std(meta_vector, variances)

    return meta_vector

In [None]:
# The manual metafeature vector of every dataset is saved in a txt file located in
# "../DATAsets/DATA/manual_metafeatures_for_test_CSVs/<method>".
# NOTE(review): the dataset is read as-is for every method, i.e. no NaN imputation is
# applied here, so the three files of a dataset are identical - confirm this is intended.
datasets = os.listdir(target_test_path)
start_time = time.time()
for file in datasets:
    print('proccessing file: ' + file)
    for method in ['drop','mean','median']:
        out_file = manual_meta_feature_path + '/' + method + '/' + file
        if os.path.isfile(out_file):
            continue
        print('method: ' + method)
        df = pd.read_table(target_test_path + '/'+ file,sep=',')
        # Fix: do not store the result under the name ``metafeatures``; that would
        # overwrite the tree-metafeature function of the same name and break
        # re-running the tree metafeature cell.
        manual_vector = manual_mfe_extraction(df)

        # The vector is complete before the file is opened, so a failed
        # extraction never leaves an empty output file behind.
        with open(out_file,'w' ) as f:
            f.write(','.join(str(x) for x in manual_vector))
mfe_time = mfe_time + time.time() - start_time

In [None]:
################ Meta learning process ################

In [None]:
def do_predict_RF(x_test, y_cols, estimator):
    """Return a list with ``y_cols`` entries, each the result of ``estimator.predict(x_test)``.

    The estimator is queried once per entry with the same input.
    """
    return [estimator.predict(x_test) for _ in range(y_cols)]

#Combines the extracted metafeatures and ground truth of the dataset into a new dataset.
#It contains the 16 metafeatures + 2 (3 for xgboost) ground truth hyperparameters.

def build_input_vec(tree_meta_feature_path, manual_meta_feature_path = None,
                     algo = 'RF', method = 'drop', eco = True):
    """Assemble the meta-model input row from the stored meta-feature files.

    Every file in ``<tree_meta_feature_path>/<method>`` is read as a single
    15-column row; when ``manual_meta_feature_path`` is given, the 30 manual
    meta-features of the same file are appended column-wise. Only the row of
    the LAST file listed is kept; it is written to
    ``<metaModel_data_path>/<algo>_<method>.csv`` (``metaModel_data_path`` is a
    global) and returned.

    Parameters
    ----------
    tree_meta_feature_path : str
        Root folder of the tree meta-feature files.
    manual_meta_feature_path : str, optional
        Root folder of the manual meta-feature files; ``None`` = tree features only.
    algo : str
        Algorithm name, used only for the output file name.
    method : str
        NaN-handling method sub-folder to read from.
    eco : bool
        Unused; kept for backward compatibility.

    Returns
    -------
    pandas.DataFrame

    Raises
    ------
    FileNotFoundError
        If the method folder contains no meta-feature file.
    """
    tree_names = ['TreeDiam', 'TreeHeight', 'TotalNodes', 'TotalLeaves',
                  'maxNodePerLevel', 'meanNodePerLevel', 'stdNodePerLevel',
                  'ShortBranch', 'meanBranch', 'stdBranch','maxFeatureFreq',
                  'minFeatureFreq', 'meanFeatureFreq', 'stdFeatureFreq','NaNsPerLine'
                 ]
    manual_names = ['nr_inst', 'nr_attr', 'inst_to_attr', 'nr_cat', 'nr_num', 'cat_to_num',
                    'nr_class_mean', 'nr_class_std', 'cor_mean', 'cor_std', 'cov_mean', 'cov_std',
                    'kurtosis_mean', 'kurtosis_std', 'mad_mean', 'mad_std', 'max_mean', 'max_std',
                    'mean_mean', 'mean_std', 'median_mean', 'median_std', 'min_mean', 'min_std',
                    'std_mean', 'std_std', 'skew_mean', 'skew_std','var_mean', 'var_std'
                   ]
    tree_dir = tree_meta_feature_path + '/' + method
    line = None
    for f in os.listdir(tree_dir):
        line = pd.read_csv(tree_dir + '/' + f, names = tree_names, header = None, sep=',')
        if manual_meta_feature_path is not None:
            line2 = pd.read_csv(manual_meta_feature_path + '/' + method + '/' + f,
                                names = manual_names, header = None, sep=',')
            line = pd.concat([line, line2], ignore_index = False, axis = 1)
    if line is None:
        # Fail with a clear message instead of an UnboundLocalError further down.
        raise FileNotFoundError('No meta-feature files found in ' + tree_dir)
    line.to_csv(metaModel_data_path + '/' + algo + '_' + method + '.csv', sep =',', index = False)
    return line

def regularize_input(data, algo, method, regularization_path):
    """Standardise ``data`` with the stored per-column means and standard deviations.

    The statistics are read from
    ``<regularization_path>/regularization_means_<algo>_<method>.csv`` and
    ``.../regularization_stds_<algo>_<method>.csv`` (first data row of each).
    Every column ``c`` of ``data`` becomes ``(c - mean_c) / std_c``; ``data``
    is modified in place.

    Returns
    -------
    numpy.ndarray
        The standardised values (``data.values``).

    Raises
    ------
    KeyError
        If a column of ``data`` is missing from the statistics files.
    """
    suffix = algo + '_' + method + '.csv'
    regularization_means = pd.read_csv(regularization_path + '/regularization_means_' + suffix)
    regularization_stds = pd.read_csv(regularization_path + '/regularization_stds_' + suffix)
    for j in data.columns:
        # Whole-column arithmetic: works for any row index and replaces the column,
        # so integer columns become float without element-wise dtype-changing writes.
        data[j] = (data[j] - regularization_means.loc[0,j]) / regularization_stds.loc[0,j]
    return data.values

def fix_preds_calcs(param_vector, single_pred):
    """Return the element of ``param_vector`` that is closest to ``single_pred``.

    Ties are resolved in favour of the element that appears first. An empty
    ``param_vector`` yields 0.

    The previous implementation compared against a fixed start distance of
    100000 and therefore returned 0 (which is not part of the grid) whenever
    the prediction was at least that far from every candidate.
    """
    return min(param_vector, key = lambda candidate: abs(candidate - single_pred), default = 0)

def fix_preds_RF(preds, algo, index, lambdas):
    """Snap raw predictions to the hyperparameter grid of ``algo``'s parameter number ``index``.

    Each prediction is replaced by the nearest grid value (see
    ``fix_preds_calcs``). The second GLM parameter (index 1) has no grid and
    is returned unchanged. Unknown ``algo``/``index`` combinations give an
    empty list. ``lambdas`` is not used.
    """
    grids = {
        ('GLM', 0): [x / 1000 for x in range(0, 1100, 125)],
        ('RF', 0): [25,50,75,100,200,300,400,500],
        ('RF', 1): [20,40,60,80],
        ('XGboost', 0): [25,50,100,200],
        ('XGboost', 1): [6,10,15],
        ('XGboost', 2): [.6,.7,.8,.9],
    }
    key = (algo, index)
    positions = range(len(preds))
    if key == ('GLM', 1):
        return [preds[i] for i in positions]
    if key not in grids:
        return []
    grid = grids[key]
    return [fix_preds_calcs(grid, preds[i]) for i in positions]

In [None]:
# Meta-learning step: load the stored meta-models of the selected algorithm, predict the
# hyperparameters with the RF-type and the NN-type meta-models, and combine both predictions.
start_time = time.time()

#Load meta-models
# File names are split on '_': the first part is the model type prefix ('RF' is loaded with
# joblib, anything else with keras), the third part is the base algorithm and the last part
# identifies the predicted parameter.
# NOTE(review): NN_model is only bound when a non-'RF' file for this algorithm exists.
RF_models = {}
for i in os.listdir(model_path):
    model_name = i.split('/')[-1].split('.')[0]
    prefix = model_name.split('_')[0]
    algo = model_name.split('_')[2]
    if algo == algorithm:
        if prefix == 'RF':
            param = model_name.split('_')[-1]
            key = algo + '_' + param
            RF_models[key] = joblib.load(model_path + '/'+ i)
        else:
            NN_model = load_model(model_path + '/'+ i)

#Prepare input data for RF-type meta-models
# meta[k] / method[k]: meta-feature set and NaN method used by the meta-model of parameter k+1.
if algorithm == 'GLM':
    meta = ['all', 'tree']
    method = ['drop', 'median']
elif algorithm == 'RF':
    meta = ['tree', 'all']
    method = ['drop', 'drop']
elif algorithm == 'XGboost':
    meta = ['all', 'all', 'all']
    method = ['mean', 'drop', 'drop']
    # NOTE(review): this print only runs in the XGboost branch - possibly an indentation slip.
    print('Algorithm:',algorithm)

#Do predictions for RF-type meta-models
RF_preds = []
print('----- RF predictions -----')
for i in range(len(method)):
    if print_model_params:
        print('NaN method: ' + method[i])
    if meta[i] == 'all':
        if print_model_params:
            print('Meta-features used: 45')
        inp_vector = build_input_vec(tree_meta_feature_path,
                                     manual_meta_feature_path = manual_meta_feature_path,
                                     algo = algorithm, method = method[i]
                                    )            
    elif meta[i] == 'tree':
        if print_model_params:
            print('Meta-features used: 15')
        inp_vector = build_input_vec(tree_meta_feature_path,
                                     algo = algorithm, method = method[i]
                                    )
            
    # Models are keyed '<algorithm>_param<k>' with k starting at 1.
    model_key = algorithm + '_param' + str(i+1)
    # do_predict_RF returns len(method) predictions of the same model; entry i, first row, is kept.
    batch_preds = do_predict_RF(inp_vector, len(method), RF_models[model_key])
    RF_preds.append(batch_preds[i][0])
print('RF results:',RF_preds)
print('\n')
        
#Do predictions for NN-type meta-models
print('----- NN predictions -----')
#Print model info
# The settings below are only printed, except 'meta' and 'method', which select the NN input.
# NOTE(review): half_on_second is not set in the XGboost branch (it is not read in this cell).
standardize = 'regularize'
if algorithm == 'GLM':
    meta = 'tree'
    standardize_y = False
    two_layers = False
    half_on_second = False
    method = 'drop'
    hidden = 5
elif algorithm == 'RF':
    meta = 'tree'
    standardize_y = False
    two_layers = True
    half_on_second = False
    method = 'mean'
    hidden = 10
elif algorithm =='XGboost':
    meta = 'tree'
    standardize_y = False
    two_layers = False
    method = 'drop'
    hidden = 5        

if print_model_params:
    print('Metafeature set:',meta)
    print('Standardization:',standardize)
    print('Hidden neurons:', hidden)
    print('2nd hidden layer:',two_layers)
    print('Nan method:',method)
    
#Prepare input data
if meta == 'all':
    inp_vector = build_input_vec(tree_meta_feature_path,
                                 manual_meta_feature_path = manual_meta_feature_path,
                                 algo = algorithm, method = method
                                )
elif meta == 'tree':
    inp_vector = build_input_vec(tree_meta_feature_path,
                                 algo = algorithm, method = method
                                )
# Standardise the input with the stored statistics, then keep the prediction of the first row.
reg_inp_vector = regularize_input(inp_vector, algorithm, method, regularization_path)
NN_preds = NN_model.predict(reg_inp_vector, verbose = 0)[0]
NN_preds = NN_preds.tolist()
print('NN results:', NN_preds)
print('\n')

#Combine predictions
# Weighted sum of the RF and NN predictions per parameter; the result is then snapped to the
# parameter grid by fix_preds_RF, except for the second GLM parameter, which is kept as-is.
print('----- Combined predictions -----')
RF_combine_weight = 0.5
NN_combine_weight = 0.5
y_cols = len(NN_preds)
combined_preds = []
print('Combined predictions:')
for i in range(y_cols):
    true_preds = RF_preds[i] * RF_combine_weight + NN_preds[i] * NN_combine_weight
    if algorithm == 'GLM' and i == 1:
        true_preds_fixed = [true_preds]
    else:
        temp = []
        temp.append(true_preds)
        true_preds_fixed = fix_preds_RF(temp, algorithm, i, lambdas = None)
    combined_preds.append(true_preds_fixed[0])
print('Combined preds:',combined_preds)

pred_time = time.time() - start_time

In [None]:
def GLM_gTruth_oneByOne(file, source, save_to_path, alpha, lam):
    """Train one cross-validated h2o GLM on ``source/file`` and store its results.

    The last column is the response, all others are predictors. A CSV with
    alpha, the lambda reported in the model summary, the cross-validated RMSE
    and the elapsed time is written to ``<save_to_path>/GLM``; the model's
    MOJO (with the genmodel jar) is downloaded to the same folder.
    """
    print('processing file: ' + file + ' | alpha:' + str(alpha) + ' | lambda:' + str(lam))
    data = h2o.import_file(source + '/' + file, header = 1)
    columns = data.columns
    x, y = columns[0:-1], columns[-1]

    glm_options = dict(model_id = 'GLMestimator',
                       seed = 888,
                       alpha = alpha,
                       lambda_ = lam,
                       lambda_search = False,
                       nlambdas = 100,
                       standardize = True,
                       nfolds = 10,
                       keep_cross_validation_models = False)

    # The measured time covers estimator construction and training.
    started = time.time()
    model = h2o.estimators.glm.H2OGeneralizedLinearEstimator(**glm_options)
    model.train(x=x, y=y, training_frame=data)
    elapsed = time.time() - started

    # The lambda is parsed from the 'regularization' text of the model summary.
    regularization_text = model.summary()['regularization'][0]
    lambda_param = float(regularization_text.split('= ')[-1].split(' ')[0])
    params = h2o.H2OFrame({'alpha':alpha,
                           'lambda':lambda_param,
                           'rmse': model.rmse(xval=True),
                           'calc_time':elapsed
                          })
    dataset_name = '.'.join(file.split('.')[0:-1])
    h2o.download_csv(params, save_to_path + '/GLM/' + dataset_name +
                    '_alpha' + str(alpha) + '_lambda' + str(lambda_param) +'.csv')
    model.download_mojo(path = save_to_path + '/GLM/', get_genmodel_jar = True)

def RF_gTruth_oneByOne(file, source, save_to_path, ntrees, depth):
    """Train an H2O random forest on one dataset with fixed hyper-parameters and save the results.

    The dataset is imported from ``source + '/' + file`` (first line is the
    header). Its last column is used as the target and all other columns as
    predictors. The model is trained with 10-fold cross-validation and a
    fixed seed. A one-row CSV holding ntrees, max_depth, the cross-validated
    RMSE and the training time (seconds) is written under
    ``save_to_path + '/RF/'`` and the model MOJO, together with the genmodel
    jar, is downloaded to the same folder.

    Parameters
    ----------
    file : str
        File name of the dataset inside ``source``.
    source : str
        Folder that contains ``file``.
    save_to_path : str
        Root output folder; results are written to its ``RF`` sub-folder.
    ntrees : int
        Value passed to the estimator as ``ntrees``.
    depth : int
        Value passed to the estimator as ``max_depth``.

    Returns
    -------
    None
    """
    print('processing file: ' + file + ' | ntrees:' + str(ntrees) + ', Tree depth:' + str(depth))
    data = h2o.import_file(source + '/' + file, header = 1)
    y = data.columns[-1]
    x = data.columns[0:-1]

    train_start = time.time() #Start counting training time

    model = h2o.estimators.H2ORandomForestEstimator(model_id = 'RFestimator',
                                                    seed = 888,
                                                    ntrees = ntrees,
                                                    max_depth = depth,
                                                    nfolds = 10,
                                                    keep_cross_validation_models = False)
    model.train(x=x, y=y, training_frame=data)

    calc_time = time.time() - train_start #Stop timing

    params = h2o.H2OFrame({'ntrees':ntrees,
                           'max_depth':depth,
                           'rmse': model.rmse(xval=True),
                           'calc_time':calc_time
                          })

    #splitext keeps the full name when the file has no extension; joining the
    #split parts returned an empty stem in that case.
    dataset_name = os.path.splitext(file)[0]
    h2o.download_csv(params, save_to_path + '/RF/' + dataset_name +
                     '_ntr' + str(ntrees) + '_dpth' + str(depth)+'.csv')
    model.download_mojo(path = save_to_path + '/RF/', get_genmodel_jar = True)

def XGboost_gTruth_oneByOne(file, source, save_to_path, ntrees, depth, colsample_bytree, method = 'cpu'):
    """Train an H2O XGBoost model on one dataset with fixed hyper-parameters and save the results.

    The dataset is imported from ``source + '/' + file`` (first line is the
    header). Its last column is used as the target and all other columns as
    predictors. The model is trained with 10-fold cross-validation and a
    fixed seed. A one-row CSV holding ntrees, max_depth, cols_per_tree, the
    cross-validated RMSE and the training time (seconds) is written under
    ``save_to_path + '/XGboost/'`` and the model MOJO, together with the
    genmodel jar, is downloaded to the same folder.

    Parameters
    ----------
    file : str
        File name of the dataset inside ``source``.
    source : str
        Folder that contains ``file``.
    save_to_path : str
        Root output folder; results are written to its ``XGboost`` sub-folder.
    ntrees : int
        Value passed to the estimator as ``ntrees``.
    depth : int
        Value passed to the estimator as ``max_depth``.
    colsample_bytree : float
        Value passed to the estimator as ``col_sample_rate_per_tree``.
    method : str, optional
        Value passed to the estimator as ``backend`` (default ``'cpu'``).

    Returns
    -------
    None
    """
    print('processing file: ' + file + ' | ntrees:' + str(ntrees) + ', Tree depth:' + str(depth)+
         ', colsPerTree:' + str(colsample_bytree))
    data = h2o.import_file(source + '/' + file, header = 1)
    y = data.columns[-1]
    x = data.columns[0:-1]

    train_start = time.time() #Start counting training time

    model = h2o.estimators.xgboost.H2OXGBoostEstimator(model_id = 'XGboostEstimator',
                                                       seed = 888,
                                                       backend= method,
                                                       ntrees = ntrees,
                                                       max_depth = depth,
                                                       col_sample_rate_per_tree = colsample_bytree,
                                                       nfolds = 10,
                                                       keep_cross_validation_models = False)
    model.train(x=x, y=y, training_frame=data)

    calc_time = time.time() - train_start #Stop timing

    params = h2o.H2OFrame({'ntrees':ntrees,
                           'max_depth':depth,
                           'cols_per_tree': colsample_bytree,
                           'rmse': model.rmse(xval=True),
                           'calc_time':calc_time
                          })

    #splitext keeps the full name when the file has no extension; joining the
    #split parts returned an empty stem in that case.
    dataset_name = os.path.splitext(file)[0]
    h2o.download_csv(params, save_to_path + '/XGboost/' + dataset_name +
                     '_ntr' + str(ntrees) + '_dpth' + str(depth) + '_colsPerTree' + str(colsample_bytree) + '.csv')
    model.download_mojo(path = save_to_path + '/XGboost/', get_genmodel_jar = True)

In [None]:
#Fail early on an unsupported algorithm instead of starting the cluster and training nothing.
if algorithm not in ('GLM', 'RF', 'XGboost'):
    raise ValueError("Unknown algorithm '" + str(algorithm) + "'. Options: 'GLM', 'RF', 'XGboost'.")

h2o.init(max_mem_size = h2o_mem, nthreads = h2o_threads)

#Train the selected base learner on every file in target_test_path, using the
#hyper-parameter values stored in combined_preds.
try:
    datasets = os.listdir(target_test_path)
    for file_name in datasets:
        h2o.remove_all() #Drop everything left in the cluster by the previous dataset.
        if algorithm == 'GLM':
            GLM_gTruth_oneByOne(file_name, target_test_path, save_to_path, combined_preds[0], combined_preds[1])
        elif algorithm == 'RF':
            RF_gTruth_oneByOne(file_name, target_test_path, save_to_path, combined_preds[0], combined_preds[1])
        elif algorithm == 'XGboost':
            XGboost_gTruth_oneByOne(file_name, target_test_path, save_to_path, combined_preds[0], combined_preds[1], combined_preds[2])
finally:
    #Always clean up and stop the cluster, even when training raised.
    h2o.remove_all()
    h2o.cluster().shutdown()

#Calculate method time for the test dataset.
#The result CSVs are read by column name ('calc_time', 'rmse'), so the values do
#not depend on the column order of the file.
#NOTE(review): when the folder holds several result CSVs (more than one dataset,
#or files left by earlier runs) only the last one in sorted order is reported - confirm intent.
train_time = None
rmse = None
datasets = sorted(os.listdir(save_to_path + '/' + algorithm))
for name in datasets:
    if name.split('.')[-1] == "csv":
        df = pd.read_csv(save_to_path + '/' + algorithm + '/' + name, header = 0, sep = ',')
        train_time = df['calc_time'].iloc[0]
        rmse = df['rmse'].iloc[0]

if train_time is None:
    raise FileNotFoundError('No result CSV found in ' + save_to_path + '/' + algorithm)

method_time = mfe_time + pred_time + train_time
print('Mfe_time:', mfe_time)
print('Prediction_time:', pred_time)
print('Train_time:', train_time)
print('\n>>> Total_method_time:' + str(method_time)+'s <<<')
print('\n+++ method rmse:'+ str(rmse) + ' +++')