In [None]:
import os
import pandas as pd
import numpy as np
import time
import math
import random
import matplotlib.pyplot as plt
import keras.optimizers as opts
from keras.models import Sequential
from keras.models import load_model
from keras.layers import Dense
from keras.callbacks import EarlyStopping
from keras.callbacks import ModelCheckpoint
from keras.wrappers.scikit_learn import KerasRegressor
from sklearn import tree as sktr
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestRegressor
from sklearn import metrics
import joblib
#Locations of every input/output folder used by this notebook.
#All of them live under one root folder; edit only "path_to_dataset_folder" to relocate the data.
path_to_dataset_folder = 'C:/Users/Dimiexe/Desktop/DATAsets'
#Per-dataset inputs and the generated meta-model datasets.
tree_meta_feature_path = '/'.join([path_to_dataset_folder, 'DATA', 'tree_metafeatures_for_test_CSVs'])
ground_truth_path = '/'.join([path_to_dataset_folder, 'DATA', 'ground_truth'])
metaModel_data_path = '/'.join([path_to_dataset_folder, 'DATA', 'metaModel_Datasets'])
#Column statistics read by the standardization helpers.
regularization_path = '/'.join([path_to_dataset_folder, 'regularization'])
normalization_path = '/'.join([path_to_dataset_folder, 'normalization'])
#Saved meta-models.
model_path = '/'.join([path_to_dataset_folder, 'final_models'])
manual_meta_feature_path = '/'.join([path_to_dataset_folder, 'DATA', 'manual_metafeatures_for_test_CSVs'])

In [None]:
#Combines the extracted metafeatures and ground truth for each dataset into a new dataset.
#Every row holds the 15 tree metafeatures (plus the 30 manual ones when requested) followed by
#the 2 (3 for XGboost) ground truth hyperparameters, and represents a different dataset.
#Returns that unified dataset.
def build_dataSet_DL(tree_meta_feature_path, ground_truth_path, manual_meta_feature_path = None,
                     algo = 'RF', method = 'drop', eco = True, save_path = None):
    """Build (and save as CSV) the meta-model dataset for one algorithm / NaN-handling method.

    Parameters
    ----------
    tree_meta_feature_path : str
        Folder holding one sub-folder per NaN method with one single-row CSV per dataset.
    ground_truth_path : str
        Folder holding one sub-folder per algorithm with a CSV named like the metafeature file.
    manual_meta_feature_path : str or None
        Optional folder (same layout as the tree one) with the 30 manual metafeatures.
    algo : {'GLM', 'RF', 'XGboost'}
        Base algorithm whose ground-truth hyperparameters are appended.
    method : str
        Name of the NaN-handling sub-folder to read.
    eco : bool
        For 'RF' the first 3 ground-truth columns are skipped, for 'XGboost' the first 4.
        Ignored for 'GLM'.
    save_path : str or None
        Folder the CSV is written to; defaults to the notebook-level "metaModel_data_path".

    Returns
    -------
    pandas.DataFrame, or None (after printing a message) when "algo" is not supported.

    Raises
    ------
    ValueError
        When the metafeature folder contains no files.
    """
    tree_columns = ['TreeDiam', 'TreeHeight', 'TotalNodes', 'TotalLeaves',
                    'maxNodePerLevel', 'meanNodePerLevel', 'stdNodePerLevel',
                    'ShortBranch', 'meanBranch', 'stdBranch','maxFeatureFreq',
                    'minFeatureFreq', 'meanFeatureFreq', 'stdFeatureFreq','NaNsPerLine'
                   ]
    manual_columns = ['nr_inst', 'nr_attr', 'inst_to_attr', 'nr_cat', 'nr_num', 'cat_to_num',
                      'nr_class_mean', 'nr_class_std', 'cor_mean', 'cor_std', 'cov_mean', 'cov_std',
                      'kurtosis_mean', 'kurtosis_std', 'mad_mean', 'mad_std', 'max_mean', 'max_std',
                      'mean_mean', 'mean_std', 'median_mean', 'median_std', 'min_mean', 'min_std',
                      'std_mean', 'std_std', 'skew_mean', 'skew_std','var_mean', 'var_std'
                     ]
    #algo -> (ground-truth columns skipped when eco is set, number of target columns kept)
    truth_layout = {'GLM': (0, 2), 'RF': (3, 2), 'XGboost': (4, 3)}
    #Validate before touching the file system so an invalid name is reported instead of
    #surfacing as a missing-folder error.
    if algo not in truth_layout:
        print("\n>>> Algorithm '" + algo + "' not valid. <<<")
        return None
    eco_offset, n_targets = truth_layout[algo]
    first_target = eco_offset if eco else 0

    #NOTE(review): the row order follows os.listdir, which is not guaranteed to be stable across
    #platforms; the hard-coded train/test row indices used later depend on this order.
    rows = []
    for f in os.listdir(tree_meta_feature_path + '/' + method):
        line = pd.read_csv(tree_meta_feature_path + '/' + method + '/' + f,
                           names = tree_columns, header = None, sep=',')
        if manual_meta_feature_path is not None:
            manual = pd.read_csv(manual_meta_feature_path + '/' + method + '/' + f,
                                 names = manual_columns, header = None, sep=',')
            line = pd.concat([line, manual], ignore_index = False, axis = 1)
        truth = pd.read_csv(ground_truth_path + '/' + algo + '/' + f, header = 0, sep=',')
        targets = truth.iloc[:, first_target:first_target + n_targets]
        rows.append(pd.concat([line, targets], ignore_index = False, axis = 1))

    if not rows:
        raise ValueError("No metafeature files found in '" + tree_meta_feature_path + '/' + method + "'.")
    #Single concat instead of growing the frame one row at a time.
    df = pd.concat(rows, ignore_index = True, axis = 0)
    if save_path is None:
        save_path = metaModel_data_path
    df.to_csv(save_path + '/' + algo + '_' + method + '.csv', sep =',', index = False)
    return df

In [None]:
def regularize_dataset(data, algo, method, regularize_y, regularization_path):
    """Standardize columns of "data" as (x - mean) / std using statistics stored on disk.

    The statistics are read from "regularization_means_<algo>_<method>.csv" and
    "regularization_stds_<algo>_<method>.csv" inside "regularization_path"; their first row is
    looked up by column name.

    Parameters
    ----------
    data : pandas.DataFrame
        Metafeature columns followed by the target columns (last 2 for 'RF'/'GLM', last 3 for
        'XGboost').
    regularize_y : bool
        When True every column is standardized, otherwise the trailing target columns are left
        untouched.

    Returns
    -------
    pandas.DataFrame
        A standardized copy; the frame passed in is not modified, so re-running a cell cannot
        standardize the same data twice.

    Raises
    ------
    ValueError
        When "regularize_y" is False and "algo" is not one of 'RF', 'GLM', 'XGboost'.
    """
    if regularize_y:
        columns = list(data.columns)
    elif algo in ['RF', 'GLM']:
        columns = list(data.columns[0:-2])
    elif algo == 'XGboost':
        columns = list(data.columns[0:-3])
    else:
        raise ValueError("Algorithm '" + str(algo) + "' not valid.")
    regularization_means = pd.read_csv(regularization_path + '/regularization_means_' + algo + '_' + method + '.csv')
    regularization_stds = pd.read_csv(regularization_path + '/regularization_stds_' + algo + '_' + method + '.csv')

    result = data.copy()
    if columns:
        means = regularization_means.loc[0, columns].astype(float)
        stds = regularization_stds.loc[0, columns].astype(float)
        #Column-wise arithmetic aligned by column name; independent of the row index labels.
        result[columns] = (data[columns] - means) / stds
    return result

def normalize_dataset(data, algo, method, normalize_y, normalization_path):
    """Min-max scale columns of "data" as (x - min) / (max - min) using statistics stored on disk.

    The statistics are read from "normalizaton_mins_<algo>_<method>.csv" and
    "normalizaton_maxes_<algo>_<method>.csv" inside "normalization_path" (the file names are
    spelled exactly like that); their first row is looked up by column name.

    Parameters
    ----------
    data : pandas.DataFrame
        Metafeature columns followed by the target columns (last 2 for 'RF'/'GLM', last 3 for
        'XGboost').
    normalize_y : bool
        When True every column is scaled, otherwise the trailing target columns are left
        untouched.

    Returns
    -------
    pandas.DataFrame
        A scaled copy; the frame passed in is not modified.

    Raises
    ------
    ValueError
        When "normalize_y" is False and "algo" is not one of 'RF', 'GLM', 'XGboost'.
    """
    if normalize_y:
        columns = list(data.columns)
    elif algo in ['RF', 'GLM']:
        columns = list(data.columns[0:-2])
    elif algo == 'XGboost':
        columns = list(data.columns[0:-3])
    else:
        raise ValueError("Algorithm '" + str(algo) + "' not valid.")
    normal_mins = pd.read_csv(normalization_path + '/normalizaton_mins_' + algo + '_' + method + '.csv')
    normal_maxes = pd.read_csv(normalization_path + '/normalizaton_maxes_' + algo + '_' + method + '.csv')

    result = data.copy()
    if columns:
        mins = normal_mins.loc[0, columns].astype(float)
        maxes = normal_maxes.loc[0, columns].astype(float)
        #Column-wise arithmetic aligned by column name; independent of the row index labels.
        result[columns] = (data[columns] - mins) / (maxes - mins)
    return result

def deregularize_preds(preds, algo, method, regularization_path):
    """Undo target standardization: x * std + mean on the trailing prediction columns.

    The statistics come from the last 2 ('RF'/'GLM') or 3 ('XGboost') columns of the first row
    of "regularization_means_<algo>_<method>.csv" / "regularization_stds_<algo>_<method>.csv",
    matched by position with the last columns of "preds".

    Parameters
    ----------
    preds : numpy.ndarray
        2-D array of predictions, one row per sample.

    Returns
    -------
    numpy.ndarray
        A de-standardized copy of "preds" (the input array is left untouched).

    Raises
    ------
    ValueError
        When "algo" is not one of 'RF', 'GLM', 'XGboost'.
    """
    if algo in ['RF','GLM']:
        n_targets = 2
    elif algo == 'XGboost':
        n_targets = 3
    else:
        raise ValueError("Algorithm '" + str(algo) + "' not valid.")
    regularization_means = pd.read_csv(regularization_path + '/regularization_means_' + algo + '_' + method + '.csv')
    regularization_stds = pd.read_csv(regularization_path + '/regularization_stds_' + algo + '_' + method + '.csv')
    means = regularization_means.iloc[0, -n_targets:].to_numpy(dtype = float)
    stds = regularization_stds.iloc[0, -n_targets:].to_numpy(dtype = float)

    result = np.array(preds, copy = True)
    if not np.issubdtype(result.dtype, np.floating):
        #Integer input would silently truncate the rescaled values.
        result = result.astype(float)
    result[:, -n_targets:] = result[:, -n_targets:] * stds + means
    return result

def denormalize_preds(preds, algo, method, normalization_path):
    """Undo target min-max scaling: x * (max - min) + min on the trailing prediction columns.

    The statistics come from the last 2 ('RF'/'GLM') or 3 ('XGboost') columns of the first row
    of "normalizaton_mins_<algo>_<method>.csv" / "normalizaton_maxes_<algo>_<method>.csv"
    (file names spelled exactly like that), matched by position with the last columns of "preds".

    Parameters
    ----------
    preds : numpy.ndarray
        2-D array of predictions, one row per sample.

    Returns
    -------
    numpy.ndarray
        A rescaled copy of "preds" (the input array is left untouched).

    Raises
    ------
    ValueError
        When "algo" is not one of 'RF', 'GLM', 'XGboost'.
    """
    if algo in ['RF','GLM']:
        n_targets = 2
    elif algo == 'XGboost':
        n_targets = 3
    else:
        raise ValueError("Algorithm '" + str(algo) + "' not valid.")
    normal_mins = pd.read_csv(normalization_path + '/normalizaton_mins_' + algo + '_' + method + '.csv')
    normal_maxes = pd.read_csv(normalization_path + '/normalizaton_maxes_' + algo + '_' + method + '.csv')
    mins = normal_mins.iloc[0, -n_targets:].to_numpy(dtype = float)
    maxes = normal_maxes.iloc[0, -n_targets:].to_numpy(dtype = float)

    result = np.array(preds, copy = True)
    if not np.issubdtype(result.dtype, np.floating):
        #Integer input would silently truncate the rescaled values.
        result = result.astype(float)
    result[:, -n_targets:] = result[:, -n_targets:] * (maxes - mins) + mins
    return result

# Standardizes the data and splits it into predictors (x) and targets (y).
# Standardization options:
# 'regularize' -> regularize_dataset (mean = 0, std = 1)
# 'normalize'  -> normalize_dataset (range of values in [0,1])
# anything else leaves the values as they are and prints a warning.
def Data_prep_NN(data, algo, method, regularization_path, normalization_path, standardize = 'regularize', regularize_y = False, normalize_y = False):
    """Return (x, y) numpy arrays for the NN meta-model, or None for an unsupported "algo".

    The targets are the last 2 columns for 'RF'/'GLM' and the last 3 for 'XGboost'; every
    other column is a predictor.
    """
    if standardize == 'normalize':
        data = normalize_dataset(data, algo, method, normalize_y, normalization_path)
    elif standardize == 'regularize':
        data = regularize_dataset(data, algo, method, regularize_y, regularization_path)
    else:
        print('No regularization performed. Results will be sub-optimal.')

    matrix = data.values
    if algo == 'XGboost':
        n_targets = 3
    elif algo in ['RF', 'GLM']:
        n_targets = 2
    else:
        print('\n>>> Not supported algorithm. Returning None! <<<')
        return None
    predictors = matrix[:, 0:-n_targets]
    targets = matrix[:, -n_targets:]
    return predictors, targets

def Data_prep_RF(data, algo):
    """Split "data" into predictor and target DataFrames for the RF meta-model.

    The targets are the last 2 columns for 'RF'/'GLM' and the last 3 for 'XGboost'.
    Returns (x, y), or None (after printing a message) for any other "algo".
    """
    if algo == 'XGboost':
        n_targets = 3
    elif algo in ['RF', 'GLM']:
        n_targets = 2
    else:
        print('\n>>> Not supported algorithm. Returning None! <<<')
        return None
    predictors = data.iloc[:, 0:-n_targets]
    targets = data.iloc[:, -n_targets:]
    return predictors, targets

In [None]:
#Returns the row positions of a random train/test split made according to the provided ratio.
def split_train_test_indices(dataset, split_ratio = .15):
    """Randomly partition the row positions of "dataset" into train and test lists.

    Parameters
    ----------
    dataset : object with a ".shape" attribute (e.g. pandas.DataFrame)
        Only the number of rows, "dataset.shape[0]", is used.
    split_ratio : float
        Fraction of rows assigned to the test set; the resulting count is rounded and clamped
        to [0, number of rows], so an out-of-range ratio can no longer loop forever.

    Returns
    -------
    (train_indices, test_indices) : tuple of lists of int
        "train_indices" is in random order, "test_indices" is ascending. Together they cover
        every row position exactly once.
    """
    import random
    rows = dataset.shape[0]
    test_rows = min(max(round(rows * split_ratio), 0), rows)
    train_rows = rows - test_rows

    #Draw the train rows without replacement in one call instead of rejection sampling.
    train_indices = random.sample(range(rows), train_rows)

    #Everything that was not drawn forms the test set (set lookup keeps this linear).
    drawn = set(train_indices)
    test_indices = [j for j in range(rows) if j not in drawn]
    return train_indices, test_indices

def split_train_test(dataset, split_ratio = .15):
    """Randomly split "dataset" into train and test DataFrames.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Rows are selected by position, so any index (not only 0..n-1) is supported.
    split_ratio : float
        Fraction of rows assigned to the test set; the resulting count is rounded and clamped
        to [0, number of rows].

    Returns
    -------
    (train, test) : tuple of pandas.DataFrame
        Both frames get a fresh 0..n-1 index. Train rows are in random order, test rows keep
        their original relative order.
    """
    import random
    rows = len(dataset)
    test_rows = min(max(round(rows * split_ratio), 0), rows)
    train_rows = rows - test_rows

    #Draw the train rows without replacement in one call instead of rejection sampling.
    train_indices = random.sample(range(rows), train_rows)
    train = dataset.iloc[train_indices, :].reset_index(drop=True)

    #Everything that was not drawn forms the test set (set lookup keeps this linear).
    drawn = set(train_indices)
    test_indices = [j for j in range(rows) if j not in drawn]
    test = dataset.iloc[test_indices, :].reset_index(drop=True)
    return train, test

In [None]:
def fix_preds_calcs(param_vector, single_pred):
    """Return the element of "param_vector" that is closest to "single_pred".

    Ties are resolved in favour of the element that appears first. An empty "param_vector"
    yields 0. There is no upper bound on the distance: the nearest candidate is returned no
    matter how far it is from the prediction.
    """
    return min(param_vector, key = lambda candidate: abs(candidate - single_pred), default = 0)

def fix_preds_RF(preds, algo, index, lambdas):
    """Snap every prediction of one hyperparameter to its nearest allowed (quantized) value.

    Parameters
    ----------
    preds : sequence of numbers
        Raw predictions for a single hyperparameter.
    algo : {'GLM', 'RF', 'XGboost'}
        Base algorithm the hyperparameter belongs to.
    index : int
        Position of the hyperparameter among the algorithm's targets (0, 1, and 2 for
        'XGboost' only).
    lambdas : iterable or None
        Only read for algo 'GLM' with index 1: its unique values form the candidate grid.

    Returns
    -------
    list
        One quantized value per prediction; an empty list when the (algo, index) pair is
        not recognised.
    """
    n_preds = len(preds)
    #Fixed candidate grids, keyed by (algorithm, target position).
    fixed_grids = {
        ('GLM', 0): [step / 1000 for step in range(0, 1100, 125)],
        ('RF', 0): [25,50,75,100,200,300,400,500],
        ('RF', 1): [20,40,60,80],
        ('XGboost', 0): [25,50,100,200],
        ('XGboost', 1): [6,10,15],
        ('XGboost', 2): [.6,.7,.8,.9],
    }
    if algo == 'GLM' and index == 1:
        #The lambda grid is data dependent: the unique values passed in by the caller.
        candidates = list(set(lambdas))
    elif (algo, index) in fixed_grids:
        candidates = fixed_grids[(algo, index)]
    else:
        return []
    return [fix_preds_calcs(candidates, preds[position]) for position in range(n_preds)]

def fix_preds_NN(preds, algo, lambdas):
    """Snap every column of the NN predictions to the nearest allowed hyperparameter value.

    Parameters
    ----------
    preds : 2-D array indexable as preds[row, column]
        One row per sample, one column per hyperparameter of "algo".
    algo : {'GLM', 'RF', 'XGboost'}
        Selects the candidate grids (2 columns for 'GLM'/'RF', 3 for 'XGboost').
    lambdas : iterable
        Only read for 'GLM': its unique values form the candidate grid of the second column.

    Returns
    -------
    numpy.ndarray
        The quantized predictions, taken from a DataFrame with columns 'A', 'B' (and 'C');
        an array with no columns when "algo" is not recognised.
    """
    n_rows = len(preds)
    #One candidate grid per prediction column, in column order.
    if algo == 'GLM':
        grids = [[step / 1000 for step in range(0, 1100, 125)], list(set(lambdas))]
    elif algo == 'RF':
        grids = [[25,50,75,100,200,300,400,500], [20,40,60,80]]
    elif algo == 'XGboost':
        grids = [[25,50,100,200], [6,10,15], [.6,.7,.8,.9]]
    else:
        grids = []

    quantized = pd.DataFrame({})
    for label, (column, grid) in zip('ABC', enumerate(grids)):
        quantized[label] = [fix_preds_calcs(grid, preds[row, column]) for row in range(n_rows)]
    return quantized.values

In [None]:
def do_fit_predict_RF(algo, x_train, y_train, x_test, y_test, estimator, plot_true_preds = True):
    """Fit "estimator" once per target column, predict the test set, report and plot the result.

    For every target column the estimator is re-fitted on that column, the test predictions
    are snapped to the allowed hyperparameter values with fix_preds_RF, the RMSE of the
    quantized predictions is printed, and ground truth / quantized / raw predictions are
    drawn on one subplot per column.

    Parameters
    ----------
    algo : {'GLM', 'RF', 'XGboost'}
    x_train, y_train, x_test, y_test : pandas.DataFrame
        Predictors and targets; "y_train" and "y_test" share the same column layout.
    estimator : object with fit(X, y) and predict(X)
        It is left fitted on the LAST target column when the function returns.
    plot_true_preds : bool
        Also draw the raw (non-quantized) predictions.

    Returns
    -------
    list
        The raw predictions, one entry per target column.
    """
    y_cols = len(y_train.columns)
    #squeeze=False keeps the axes 2-D so a single target column can be indexed as well.
    fig, axes = plt.subplots(1, y_cols, figsize=(20, 5), squeeze=False)
    a = axes[0]
    x = range(len(y_test))
    predictions = []
    for target_column in range(y_cols):
        estimator.fit(x_train,y_train.iloc[:,target_column])
        y_pred = estimator.predict(x_test)
        predictions.append(y_pred)
        #Plotting ground truth
        a[target_column].plot(x,y_test.iloc[:,target_column],'bo')
        #Rounding to quantized values. The GLM lambda grid is data dependent, so it is built
        #from every lambda value seen in the train and test targets.
        if algo == 'GLM' and target_column == 1:
            y_lambda_values = pd.concat([y_train.iloc[:,1],y_test.iloc[:,1]], axis=0)
            preds_fixed = fix_preds_RF(y_pred, algo, target_column, y_lambda_values.values)
        else:
            preds_fixed = fix_preds_RF(y_pred, algo, target_column, lambdas = None)
        #RMSE as sqrt(MSE): the "squared = False" keyword is not available in every
        #scikit-learn release.
        rmse = float(np.sqrt(metrics.mean_squared_error(y_test.iloc[:,target_column], preds_fixed)))
        print('RMSE for hyperparam "' + y_test.columns[target_column] + '": '+ str(rmse))
        #Plotting quantized predictions
        a[target_column].plot(x,preds_fixed,'yx')
        #Plotting true predictions
        if plot_true_preds:
            a[target_column].plot(x,y_pred,'r+')
            a[target_column].legend(['ground truth', 'quantized preds', 'real preds'], loc='upper left')
        else:
            a[target_column].legend(['ground truth', 'quantized preds'], loc='upper left')
    plt.show()
    return predictions

In [None]:
def do_predict_RF(algo, x_train, y_train, x_test, y_test, estimator, plot_true_preds = True):
    """Predict the test set with an already fitted "estimator", report and plot the result.

    The estimator is NOT re-fitted, so the same raw predictions are compared against every
    target column: they are snapped to that column's allowed values with fix_preds_RF, the
    RMSE of the quantized predictions is printed, and ground truth / quantized / raw
    predictions are drawn on one subplot per column.
    NOTE(review): the visible caller loads one model per hyperparameter and keeps only the
    matching entry of the returned list, so the numbers printed for the other columns are
    presumably not meaningful - confirm.

    Parameters
    ----------
    algo : {'GLM', 'RF', 'XGboost'}
    x_train, y_train, x_test, y_test : pandas.DataFrame
        "x_train" is unused; "y_train" supplies the number of target columns and, for GLM,
        part of the lambda grid.
    estimator : fitted object with predict(X)
    plot_true_preds : bool
        Also draw the raw (non-quantized) predictions.

    Returns
    -------
    list
        One entry per target column; every entry is the same raw prediction array.
    """
    y_cols = len(y_train.columns)
    #squeeze=False keeps the axes 2-D so a single target column can be indexed as well.
    fig, axes = plt.subplots(1, y_cols, figsize=(20, 5), squeeze=False)
    a = axes[0]
    x = range(len(y_test))
    predictions = []
    #The estimator does not change inside the loop, so one predict call is enough.
    y_pred = estimator.predict(x_test)
    for target_column in range(y_cols):
        predictions.append(y_pred)
        #Plotting ground truth
        a[target_column].plot(x,y_test.iloc[:,target_column],'bo')
        #Rounding to quantized values. The GLM lambda grid is data dependent, so it is built
        #from every lambda value seen in the train and test targets.
        if algo == 'GLM' and target_column == 1:
            y_lambda_values = pd.concat([y_train.iloc[:,1],y_test.iloc[:,1]], axis=0)
            preds_fixed = fix_preds_RF(y_pred, algo, target_column, y_lambda_values.values)
        else:
            preds_fixed = fix_preds_RF(y_pred, algo, target_column, lambdas = None)
        #RMSE as sqrt(MSE): the "squared = False" keyword is not available in every
        #scikit-learn release.
        rmse = float(np.sqrt(metrics.mean_squared_error(y_test.iloc[:,target_column], preds_fixed)))
        print('RMSE for hyperparam "' + y_test.columns[target_column] + '": '+ str(rmse))
        #Plotting quantized predictions
        a[target_column].plot(x,preds_fixed,'yx')
        #Plotting true predictions
        if plot_true_preds:
            a[target_column].plot(x,y_pred,'r+')
            a[target_column].legend(['ground truth', 'quantized preds', 'real preds'], loc='upper left')
        else:
            a[target_column].legend(['ground truth', 'quantized preds'], loc='upper left')
    plt.show()
    return predictions

In [None]:
#Builds and compiles (but does not train) the deep learning estimator for the specified algorithm.
#The code below is based on the code found at:
#https://machinelearningmastery.com/regression-tutorial-keras-deep-learning-library-python/
#and is tweaked according to the needs of this project.

# Factory for the base model.
def create_model(input_dim = 45, algo = 'GLM', hidden = 5, two_layers = False, half_on_second = False):
    """Return a compiled Keras Sequential regressor, or None for an unsupported "algo".

    Parameters
    ----------
    input_dim : int
        Number of input features.
    algo : {'GLM', 'RF', 'XGboost'}
        Decides the width of the output layer: 2 units for 'GLM'/'RF', 3 for 'XGboost'.
    hidden : int
        Units of the first (ReLU) hidden layer.
    two_layers : bool
        Add a second hidden layer. NOTE(review): that layer is created without an activation
        argument - confirm this is intended.
    half_on_second : bool
        Give the second hidden layer round(hidden/2) units instead of "hidden".
    """
    network = Sequential()
    network.add(Dense(hidden, input_dim=input_dim, kernel_initializer='normal', activation='relu'))
    if two_layers:
        second_width = round(hidden/2) if half_on_second else hidden
        network.add(Dense(second_width, kernel_initializer='normal'))

    # One output unit per predicted hyperparameter.
    output_units = {'GLM': 2, 'RF': 2, 'XGboost': 3}.get(algo)
    if output_units is None:
        print("\n>>> Algorithm '" + algo + "' not valid. <<<")
        return None
    network.add(Dense(output_units, kernel_initializer='normal'))

    # Mean squared error loss, Adam optimizer with a fixed learning rate.
    network.compile(loss = 'mse', optimizer = opts.Adam(learning_rate=0.01))
    return network

In [None]:
#Run this once for each cross validation AND prediction session, as the row
#indices are stored in "train_indices" and "test_indices" and are the same
#for both the cross-validation and the fit processes.

#True: reuse the fixed split below. False: draw a new random split.
manual_indices = True

split_ratio = .15 #Fraction of the rows held out for testing.
kfold = KFold(n_splits=10)
#When a new split is drawn, the dataset is built once only to get its dimensions. We need the
#number of rows, not columns, so any "algo" is applicable. We don't need the values of the rows,
#just the rows, so any "method" is also applicable, as is the option for eco_best or absolute_best.
#Also we don't care about the number of metafeatures contained in the dataset, as they
#make up the columns, not the rows.
#Get the indices for test and train instances based on the extracted dataset dimensions.
if manual_indices:
    train_indices = [28,58,31,29,20,147,157,155,62,159,137,112,118,134,164,90,142,19,60,161,94,18,84,67,56,61,74,30,52,2,117,1,126,65,70,163,73,114,27,139,63,89,154,166,46,6,37,116,8,173,140,13,122,39,135,174,14,36,24,104,124,130,16,141,7,149,167,79,43,69,53,9,77,12,98,22,165,93,85,11,100,108,3,4,145,123,144,10,119,105,129,92,80,33,71,99,107,25,148,97,106,42,109,82,158,51,152,50,168,40,0,146,86,153,41,156,170,136,35,57,143,44,83,102,54,26,177,131,169,120,138,81,68,133,175,132,32,45,78,76,48,59,15,88,113,64,96,171,128,66,75]
    test_indices = [5,17,21,23,34,38,47,49,55,72,87,91,95,101,103,110,111,115,121,125,127,150,151,160,162,172,176]
else:
    dataset = build_dataSet_DL(tree_meta_feature_path,
                               ground_truth_path,
                               algo = 'RF', method = 'drop', eco = True)
    #Use the configured ratio instead of repeating the literal, so editing "split_ratio"
    #above actually changes the split.
    train_indices, test_indices = split_train_test_indices(dataset, split_ratio = split_ratio)
#Rows are selected with these lists later on, so an overlap would leak test rows into training.
assert not set(train_indices) & set(test_indices), 'train and test indices overlap'
print()

In [None]:
#Fit final meta-models and make predictions.
#For every base algorithm this cell: (1) optionally loads the saved meta-models, (2) runs one RF
#meta-model per hyperparameter, (3) runs the NN meta-model and (4) blends both sets of predictions.
#It relies on names defined in earlier cells (paths, train_indices/test_indices and the helpers).
RF_combine_weight = .5                      #Weight of the RF predictions in the blend.
NN_combine_weight = 1 - RF_combine_weight   #Weight of the NN predictions in the blend.
plot_true_preds = True       #Also plot the raw (non-quantized) predictions.
print_model_params = True    #Print the configuration of every meta-model.
load_models = True           #True: use the models saved under model_path. False: train them here.
RF_models = {}               #'<algo>_param<k>' -> loaded RF meta-model.
RF_preds = {}                #algo -> list with one raw RF prediction array per hyperparameter.
NN_preds = {}                #algo -> 2-D array of raw NN predictions.
final_preds = {}             #algo -> list with one quantized blended prediction list per hyperparameter.
times = {}                   #algo -> mean RF time per hyperparameter + NN time, in seconds.

for algo in ['GLM', 'RF', 'XGboost']:
##################### Load models #####################
    if load_models:
        #File names are split on '_': part 0 is the model family and part 2 the algorithm, with
        #the last part naming the parameter for RF models. This matches 'NN_model_<algo>.h5'
        #(written by the checkpoint below) and 'RF_model_<algo>_param<k>.joblib' (see the
        #commented-out joblib.dump further down).
        #NOTE(review): any other file in model_path with fewer than three '_' separated parts
        #raises IndexError here.
        for i in os.listdir(model_path):
            model_name = i.split('/')[-1].split('.')[0]
            prefix = model_name.split('_')[0]
            algorithm = model_name.split('_')[2]
            if algorithm == algo:
                if prefix == 'RF':
                    param = model_name.split('_')[-1]
                    key = algorithm + '_' + param
                    #NOTE(review): joblib.load unpickles the file; only load models you trust.
                    RF_models[key] = joblib.load(model_path + '/'+ i)
                else:
                    #NOTE(review): NN_model is not read anywhere below; the NN model is loaded
                    #again from its fixed path before predicting.
                    NN_model = load_model(model_path + '/'+ i)    
    
##################### RF metamodel section #####################
    print('------ Algo = ' + algo + ' ------')
    eco = True  #All final RF models use eco forest option.
    #One (metafeature set, NaN method) configuration per predicted hyperparameter.
    if algo == 'GLM':
        meta = ['all', 'tree']
        method = ['drop', 'median']
    elif algo == 'RF':
        meta = ['tree', 'all']
        method = ['drop', 'drop']
    elif algo == 'XGboost':
        meta = ['all', 'all', 'all']
        method = ['mean', 'drop', 'drop']
    
    mean_time = 0
    intermediate_times = []
    intermediate_preds = []
    for i in range(len(method)):
        print('Examined parameter:', i+1)
        if print_model_params:
            print('NaN method: ' + method[i])
        if meta[i] == 'all':
            if print_model_params:
                print('Meta-features used: 45')
            dataset = build_dataSet_DL(tree_meta_feature_path,
                                       ground_truth_path,
                                       manual_meta_feature_path = manual_meta_feature_path,
                                       algo = algo, method = method[i], eco = eco
                                      )            
        elif meta[i] == 'tree':
            if print_model_params:
                print('Meta-features used: 15')
            dataset = build_dataSet_DL(tree_meta_feature_path,
                                       ground_truth_path,
                                       algo = algo, method = method[i], eco = eco
                                      )
        #Creating train and test datasets according to predefined row indices.
        train = dataset.loc[train_indices,:]
        train.reset_index(drop=True, inplace = True)
        test = dataset.loc[test_indices,:]
        test.reset_index(drop=True, inplace = True)
        
        x_train, y_train = Data_prep_RF(train, algo)
        x_test, y_test = Data_prep_RF(test, algo)
                
        print('RF meta-model results:')
        start_time = time.time()
        if not load_models:
            regr = RandomForestRegressor(random_state = 8328,
                                         min_impurity_decrease = 1e-06,
                                         max_depth = 10,
                                         n_estimators = 1000,
                                         max_features = .8
                                        )
            batch_preds = do_fit_predict_RF(algo = algo, x_train = x_train, y_train = y_train,
                                            x_test = x_test, y_test = y_test,
                                            estimator = regr, plot_true_preds = plot_true_preds
                                           )
        else:
            #Saved RF models are keyed by algorithm and 1-based parameter number.
            model_key = algo + '_param' + str(i+1)
            regr = RF_models[model_key]
            batch_preds = do_predict_RF(algo = algo, x_train = x_train, y_train = y_train,
                                        x_test = x_test, y_test = y_test,
                                        estimator = regr, plot_true_preds = plot_true_preds
                                       )
        intermediate_times.append(time.time() - start_time)
        #Each configuration is kept only for the hyperparameter it was chosen for.
        intermediate_preds.append(batch_preds[i])
        #joblib.dump(regr, model_path + '/RF_model_' + algo + '_param' + str(i+1) + '.joblib')
        print(' ')
    mean_time = np.mean(intermediate_times)
    RF_preds[algo] = intermediate_preds
    
    ##################### NN metamodel section #####################    
    #Setting params for each algorithm's model
    #From here on "meta" and "method" are plain strings (they were lists in the RF section).
    standardize = 'regularize'
    if algo == 'GLM':
        meta = 'tree'
        standardize_y = False
        two_layers = False
        half_on_second = False
        method = 'drop'
        hidden = 5
    elif algo == 'RF':
        meta = 'tree'
        standardize_y = False
        two_layers = True
        half_on_second = False
        method = 'mean'
        hidden = 10
    elif algo =='XGboost':
        #NOTE(review): half_on_second is not set in this branch; the value left over from the
        #previous loop iteration is the one passed to create_model below.
        meta = 'tree'
        standardize_y = False
        two_layers = False
        method = 'drop'
        hidden = 5
        
    #Print model info
    if print_model_params:
        print('Metafeature set:',meta)
        print('Standardization:',standardize)
        if standardize == 'regularize':
            print('regularize_y:',standardize_y)
        elif standardize == 'normalize':
            print('normalize_y:',standardize_y)
        print('Hidden neurons:', hidden)
        print('2nd hidden layer:',two_layers)
        if two_layers:
            print('2nd hidden has half neurons:',half_on_second)
        print('Algo:',algo)
        print('Nan method:',method)
    
    #Prepare the data
    #("eco" is not passed here, so build_dataSet_DL uses its default value.)
    if meta == 'all':
        input_dim = 45
        dataset = build_dataSet_DL(tree_meta_feature_path,
                                   ground_truth_path,
                                   manual_meta_feature_path = manual_meta_feature_path,
                                   algo = algo, method = method
                                  )
    elif meta == 'tree':
        input_dim = 15
        dataset = build_dataSet_DL(tree_meta_feature_path,
                                   ground_truth_path,
                                   algo = algo, method = method
                                  )
    #Creating train and test datasets according to predefined row indices. They
    #are the same as the RF metamodel ones.
    train = dataset.loc[train_indices,:]
    train.reset_index(drop=True, inplace = True)
    test = dataset.loc[test_indices,:]
    test.reset_index(drop=True, inplace = True)
    x_train, y_train = Data_prep_NN(train, algo, method,
                                    regularization_path, normalization_path,
                                    standardize = standardize,
                                    regularize_y = standardize_y,
                                    normalize_y = standardize_y
                                   )
    #The test targets are never standardized (regularize_y / normalize_y keep their False
    #defaults), so y_test stays on the original hyperparameter scale.
    x_test, y_test = Data_prep_NN(test, algo, method,
                                  regularization_path, normalization_path,
                                  standardize = standardize
                                 )
    
    #Create the model
    #(It is replaced by the model loaded from disk before predicting.)
    model = create_model(input_dim = input_dim, algo = algo, hidden = hidden,
                         two_layers = two_layers, half_on_second = half_on_second
                        )
    
    print('NN meta-model results:')
    
    #Do fit and predictions session
    es = EarlyStopping(monitor='val_loss',
                       mode='min',
                       patience = 400,
                       verbose = 1
                      )
    #The checkpoint writes to the same file that is loaded back right after training.
    mc = ModelCheckpoint(model_path + '/NN_model_' + algo + '.h5',
                         monitor='val_loss',
                         mode='min',
                         verbose=0,
                         save_best_only=True
                        )
    
    start_time = time.time()
    if not load_models:
        history = model.fit(x = x_train, y = y_train,
                            validation_split = 0.15,
                            callbacks = [es, mc],
                            epochs = 2000,
                            batch_size = 5,
                            verbose = 0
                           )
    model = load_model(model_path + '/NN_model_' + algo + '.h5')
    NN_preds[algo] = model.predict(x_test, verbose = 0)
    #Reported time = mean RF time per hyperparameter + NN (training, if any, plus prediction) time.
    mean_time = mean_time + (time.time() - start_time)
    times[algo] = mean_time
    #Destandardize predictions if y was standardized.
    if standardize == 'regularize' and standardize_y:
        NN_preds[algo] = deregularize_preds(NN_preds[algo], algo, method, regularization_path)
    elif standardize == 'normalize' and standardize_y:        
        NN_preds[algo] = denormalize_preds(NN_preds[algo], algo, method, normalization_path)
    
    #Plot the results
    y_cols = NN_preds[algo].shape[1]
    fig,a =  plt.subplots(1,y_cols,figsize=(20, 5))
    x = range(NN_preds[algo].shape[0])        
    #For GLM the lambda grid used for quantization is built from all train and test lambdas.
    y_lambda_values = []
    if algo == 'GLM':
        y_lambda_values = np.concatenate([y_train[:,1],y_test[:,1]], axis=0)
    preds_fixed = fix_preds_NN(NN_preds[algo], algo, y_lambda_values)
    temp_rmse = []  #NOTE(review): never read in this cell.
    for target_column in range(y_cols):
        #Plot ground truth
        a[target_column].plot(x,y_test[:,target_column],'bo')
        a[target_column].plot(x,preds_fixed[:,target_column],'yx')
        if plot_true_preds:
            a[target_column].plot(x,NN_preds[algo][:,target_column],'r+')
            a[target_column].legend(['ground truth', 'quantized preds', 'real preds'], loc='upper left')
        else:
            a[target_column].legend(['ground truth', 'quantized preds'], loc='upper left')                        
    
        #Calculate rmse separately for each parameter (printed rounded to 4 decimals)
        temp2 = math.sqrt(np.mean(np.square(preds_fixed[:,target_column] - y_test[:,target_column])))
        print('RMSE for paramenter ' + str(target_column+1) + ': '+ str(round(10000*temp2)/10000))
    plt.show()
    print(' ')
                                   
    ##################### Combining the results #####################
    #Weighted average of the raw RF and NN predictions, then quantized like the RF ones.
    #y_test here is the NN section's test target array.
    combined_preds = []
    fig,a =  plt.subplots(1,y_cols,figsize=(20, 5))
    print('Combined predictions:')
    print('Elapsed time:',times[algo])
    for i in range(y_cols):
        true_preds = RF_preds[algo][i] * RF_combine_weight + NN_preds[algo][:,i] * NN_combine_weight
        a[i].plot(x,y_test[:,i],'bo')
        #-------------------- Used for rounding to quantized values ---------------------#
        if algo == 'GLM' and i == 1:                                                     #
            true_preds_fixed = fix_preds_RF(true_preds, algo, i, y_lambda_values)        #
        else:                                                                            #
            true_preds_fixed = fix_preds_RF(true_preds, algo, i, lambdas = None)         #
        #--------------------------------------------------------------------------------#
        a[i].plot(x,true_preds_fixed,'yx')
        if plot_true_preds:
            a[i].plot(x,true_preds,'r+')
            a[i].legend(['ground truth', 'quantized preds', 'real preds'], loc='upper left')
        else:
            a[i].legend(['ground truth', 'quantized preds'], loc='upper left')
        combined_preds.append(true_preds_fixed)
        temp2 = math.sqrt(np.mean(np.square(true_preds_fixed - y_test[:,i])))
        print('RMSE for parameter ' + str(i+1) + ': '+ str(round(10000*temp2)/10000))
    plt.show()
    final_preds[algo] = combined_preds

In [None]:
import h2o
#Folders used by the base-learning runs, plus the list of file names to process.
target_test_path = '/'.join([path_to_dataset_folder, 'DATA', 'Target_feature_test_CSVs'])
save_to_path = '/'.join([path_to_dataset_folder, 'final_results_base_learning'])
#File names found in the 'drop' tree-metafeature folder.
tree_names = os.listdir('/'.join([tree_meta_feature_path, 'drop']))

In [None]:
def GLM_gTruth_oneByOne(file, source, save_to_path, alpha, lam):
    """Train one 10-fold cross-validated H2O GLM on a single dataset and save its score.

    The last column of the imported file is used as the response and all other
    columns as predictors.

    Parameters
    ----------
    file : str
        Name of the dataset file inside ``source``.
    source : str
        Folder that contains ``file``.
    save_to_path : str
        Root output folder; the result CSV is written into its ``GLM`` subfolder.
    alpha
        Value handed to the estimator's ``alpha`` argument.
    lam
        Value handed to the estimator's ``lambda_`` argument.

    Returns
    -------
    None
        A one-row CSV (alpha, lambda, rmse, calc_time) is written to
        ``<save_to_path>/GLM/<file stem>_alpha<alpha>_lambda<lambda>.csv``.
    """
    print('processing file: ' + file + ' | alpha:' + str(alpha) + ' | lambda:' + str(lam))
    data = h2o.import_file(source + '/' + file, header = 1)
    y = data.columns[-1]
    x = data.columns[0:-1]

    # Time estimator construction + training, exactly as before.
    train_start = time.time()

    model = h2o.estimators.glm.H2OGeneralizedLinearEstimator(model_id = 'GLMesti',
                                                             seed = 888,
                                                             alpha = alpha,
                                                             lambda_ = lam,
                                                             lambda_search = False,
                                                             nlambdas = 100,
                                                             standardize = True,
                                                             nfolds = 10,
                                                             keep_cross_validation_models = False)
    model.train(x=x, y=y, training_frame=data)

    elapsed = time.time() - train_start

    # Read lambda back from the model summary rather than trusting `lam`: take the text
    # after the last '= ' in the 'regularization' entry, up to the next space.
    # NOTE(review): this depends on the exact wording of H2O's summary string.
    lambda_param = float(model.summary()['regularization'][0].split('= ')[-1].split(' ')[0])

    # Key order matters: the evaluation cell reads rmse / calc_time as the last two columns.
    params = h2o.H2OFrame({'alpha':alpha,
                           'lambda':lambda_param,
                           'rmse': model.rmse(xval=True),
                           'calc_time':elapsed
                          })

    # splitext keeps the whole name when `file` has no extension; the previous
    # split('.')/join produced an empty stem in that case.
    stem = os.path.splitext(file)[0]
    h2o.download_csv(params, save_to_path + '/GLM/' + stem +
                     '_alpha' + str(alpha) + '_lambda' + str(lambda_param) + '.csv')


In [None]:
def RF_gTruth_oneByOne(file, source, save_to_path, ntrees, depth):
    """Train one 10-fold cross-validated H2O random forest on a single dataset and save its score.

    The last column of the imported file is used as the response and all other
    columns as predictors.

    Parameters
    ----------
    file : str
        Name of the dataset file inside ``source``.
    source : str
        Folder that contains ``file``.
    save_to_path : str
        Root output folder; the result CSV is written into its ``RF`` subfolder.
    ntrees
        Value handed to the estimator's ``ntrees`` argument.
    depth
        Value handed to the estimator's ``max_depth`` argument.

    Returns
    -------
    None
        A one-row CSV (ntrees, max_depth, rmse, calc_time) is written to
        ``<save_to_path>/RF/<file stem>_ntr<ntrees>_dpth<depth>.csv``.
    """
    print('processing file: ' + file + ' | ntrees:' + str(ntrees) + ', Tree depth:' + str(depth))
    data = h2o.import_file(source + '/' + file, header = 1)
    y = data.columns[-1]
    x = data.columns[0:-1]

    # Time estimator construction + training, exactly as before.
    train_start = time.time()

    model = h2o.estimators.H2ORandomForestEstimator(model_id = 'RFesti',
                                                    seed = 888,
                                                    ntrees = ntrees,
                                                    max_depth = depth,
                                                    nfolds = 10,
                                                    keep_cross_validation_models = False)
    model.train(x=x, y=y, training_frame=data)

    elapsed = time.time() - train_start

    # Key order matters: the evaluation cell reads rmse / calc_time as the last two columns.
    params = h2o.H2OFrame({'ntrees':ntrees,
                           'max_depth':depth,
                           'rmse': model.rmse(xval=True),
                           'calc_time':elapsed
                          })

    # splitext keeps the whole name when `file` has no extension; the previous
    # split('.')/join produced an empty stem in that case.
    stem = os.path.splitext(file)[0]
    h2o.download_csv(params, save_to_path + '/RF/' + stem +
                     '_ntr' + str(ntrees) + '_dpth' + str(depth) + '.csv')

In [None]:
def XGboost_gTruth_oneByOne(file, source, save_to_path, ntrees, depth, colsample_bytree, method = 'cpu'):
    """Train one 10-fold cross-validated H2O XGBoost model on a single dataset and save its score.

    The last column of the imported file is used as the response and all other
    columns as predictors.

    Parameters
    ----------
    file : str
        Name of the dataset file inside ``source``.
    source : str
        Folder that contains ``file``.
    save_to_path : str
        Root output folder; the result CSV is written into its ``XGboost`` subfolder.
    ntrees
        Value handed to the estimator's ``ntrees`` argument.
    depth
        Value handed to the estimator's ``max_depth`` argument.
    colsample_bytree
        Value handed to the estimator's ``col_sample_rate_per_tree`` argument.
    method : str, optional
        Value handed to the estimator's ``backend`` argument (default ``'cpu'``).

    Returns
    -------
    None
        A one-row CSV (ntrees, max_depth, cols_per_tree, rmse, calc_time) is written to
        ``<save_to_path>/XGboost/<file stem>_ntr<ntrees>_dpth<depth>_colsPerTree<colsample_bytree>.csv``.
    """
    print('processing file: ' + file + ' | ntrees:' + str(ntrees) + ', Tree depth:' + str(depth)+
         ', colsPerTree:' + str(colsample_bytree))
    data = h2o.import_file(source + '/' + file, header = 1)
    y = data.columns[-1]
    x = data.columns[0:-1]

    # Time estimator construction + training, exactly as before.
    train_start = time.time()

    model = h2o.estimators.xgboost.H2OXGBoostEstimator(model_id = 'XGesti',
                                                       seed = 888,
                                                       backend= method,
                                                       ntrees = ntrees,
                                                       max_depth = depth,
                                                       col_sample_rate_per_tree = colsample_bytree,
                                                       nfolds = 10,
                                                       keep_cross_validation_models = False)
    # (The stray debugging print that used to sit here has been removed.)
    model.train(x=x, y=y, training_frame=data)

    elapsed = time.time() - train_start

    # Key order matters: the evaluation cell reads rmse / calc_time as the last two columns.
    params = h2o.H2OFrame({'ntrees':ntrees,
                           'max_depth':depth,
                           'cols_per_tree': colsample_bytree,
                           'rmse': model.rmse(xval=True),
                           'calc_time':elapsed
                          })

    # splitext keeps the whole name when `file` has no extension; the previous
    # split('.')/join produced an empty stem in that case.
    stem = os.path.splitext(file)[0]
    h2o.download_csv(params, save_to_path + '/XGboost/' + stem +
                     '_ntr' + str(ntrees) + '_dpth' + str(depth) +
                     '_colsPerTree' + str(colsample_bytree) + '.csv')

In [None]:
# Start (or attach to) the H2O backend that the *_gTruth_oneByOne helpers run on.
# NOTE(review): H2O documents an integer max_mem_size as gigabytes (64 GB here) — confirm
# that this and nthreads = 6 fit the machine the notebook is executed on.
h2o.init(max_mem_size = 64, nthreads = 6)

In [None]:
# Re-train every base learner on each test-set dataset using the hyperparameters the
# meta-model predicted for it (final_preds); each helper writes one result CSV per dataset.

# algorithm name -> (helper that trains/scores one model, number of predicted hyperparameters)
runners = {'GLM': (GLM_gTruth_oneByOne, 2),
           'RF': (RF_gTruth_oneByOne, 2),
           'XGboost': (XGboost_gTruth_oneByOne, 3)}

# Test-set files in tree_names order; position k here lines up with the k-th prediction.
test_files = [file_name for ind, file_name in enumerate(tree_names) if ind in test_indices]

for algor, (run_one, param_num) in runners.items():
    # One sequence of predicted values per hyperparameter.
    params = [final_preds[algor][i] for i in range(param_num)]
    for flag, file_name in enumerate(test_files):
        # Start every run from an empty H2O cluster and clean up once afterwards
        # (the back-to-back duplicate remove_all() call was redundant).
        h2o.remove_all()
        run_one(file_name, target_test_path, save_to_path, *[p[flag] for p in params])
        h2o.remove_all()

In [None]:
# Drop every object held by the H2O cluster, then stop the cluster.
# NOTE(review): h2o.shutdown() is documented as deprecated in favour of
# h2o.cluster().shutdown() — confirm against the installed H2O version.
h2o.remove_all()
h2o.shutdown()

In [None]:
#For the test-set datasets, total the computation time of the ground-truth runs and of the
#runs made with the predicted hyperparameters, and collect the RMSE of both per algorithm.
gTruth_rmses = {}
final_rmses = {}
for algor in ['GLM','RF', 'XGboost']:
    # NOTE(review): result files are matched to datasets purely by position in this listing;
    # this assumes os.listdir returns them in the same order as tree_names — confirm.
    temp_names = os.listdir(save_to_path + '/' + algor)
    rmses_gTruth = []
    rmses_final = []
    method_time = 0
    gTruthTime = 0
    flag = -1
    for ind,name in enumerate(tree_names):
        if ind not in test_indices:
            continue
        flag += 1

        # Ground-truth file of this dataset.
        # presumably its last two columns are (rmse, calc_time), like the result CSVs — verify.
        df = pd.read_csv(ground_truth_path + '/' + algor + '/' + name, header = 0, sep = ',')
        gTruthTime += df.iloc[0,-1]
        rmses_gTruth.append(df.iloc[0,-2])

        # Result of the run with the predicted hyperparameters. Every algorithm, GLM included,
        # is read from the folder that was listed above and that the *_gTruth_oneByOne helpers
        # write to (GLM used to be read from a hard-coded, malformed 'C:Users/...' path).
        df = pd.read_csv(save_to_path + '/' + algor + '/' + temp_names[flag], header = 0, sep = ',')
        method_time += df.iloc[0,-1]
        rmses_final.append(df.iloc[0,-2])
    gTruth_rmses[algor] = rmses_gTruth
    final_rmses[algor] = rmses_final
    print(algor+' grid_time:',gTruthTime)
    print(algor + ' method_time:',(method_time))
    print('\n', times[algor])

In [None]:
#Save the final predictions to a CSV file so that they are available for later work.
#One column per predicted hyperparameter: two for GLM and RF, three for XGboost.
dddf = pd.DataFrame({'GLM_1':final_preds['GLM'][0], 'GLM_2':final_preds['GLM'][1],
                     'RF_1':final_preds['RF'][0], 'RF_2':final_preds['RF'][1],
                     'XGboost_1':final_preds['XGboost'][0],
                     'XGboost_2':final_preds['XGboost'][1],
                     'XGboost_3':final_preds['XGboost'][2]})
#Written next to the base-learning results; derived from save_to_path instead of repeating
#the absolute folder, so it follows path_to_dataset_folder when that is changed.
dddf.to_csv(save_to_path + '/final_preds.csv', index = False)