In [None]:
import os
import time
import pandas as pd
import numpy as np
import h2o
# Root folder under which every input and output directory of this notebook lives.
master_path = 'C:/Users/Dimiexe/Desktop'
# Folder listed by the driver cells: every file in it is treated as one dataset.
target_test_path = '/'.join([master_path, 'DATAsets', 'DATA', 'Target_feature_test_CSVs'])
# Output root; the per-algorithm result folders (RF, GLM, XGboost, *_one_by_one) sit below it.
ground_truth_path = '/'.join([master_path, 'DATAsets', 'DATA', 'ground_truth'])
# Folder that receives a second copy of every partial XGBoost result.
all_partials_path = '/'.join([master_path, 'from_helper'])

In [None]:
# Start (or connect to) the H2O cluster that every modelling cell below relies on.
# NOTE(review): an integer max_mem_size is presumably read by H2O as gigabytes
# (i.e. 64 GB here) - confirm against the installed h2o version.
h2o.init(max_mem_size = 64, nthreads = 6)

In [None]:
def RF_gTruth_oneByOne(file, source, gTrouth_path, ntrees, depth):
    """Cross-validate one Random Forest configuration on one dataset.

    The file ``source/file`` is imported into H2O; its last column is used as
    the response and all preceding columns as predictors. A single forest is
    trained with 10-fold cross-validation, and a one-row CSV containing the
    hyperparameters, the cross-validated RMSE and the elapsed wall-clock time
    (seconds, estimator construction plus training) is downloaded into
    ``gTrouth_path/RF_one_by_one``.

    Parameters
    ----------
    file : str
        Name of the dataset file inside ``source``.
    source : str
        Folder that contains ``file``.
    gTrouth_path : str
        Folder whose ``RF_one_by_one`` sub-folder receives the result file.
    ntrees : int
        Forwarded to the estimator as ``ntrees``.
    depth : int
        Forwarded to the estimator as ``max_depth``.
    """
    print('processing file: ' + file + ' | ntrees:' + str(ntrees) + ', Tree depth:' + str(depth))

    frame = h2o.import_file(source + '/' + file, header = 1)
    column_names = frame.columns
    predictors, response = column_names[0:-1], column_names[-1]

    # Wall-clock timing covers both building the estimator and training it.
    timer_begin = time.time()
    forest = h2o.estimators.H2ORandomForestEstimator(
        model_id = 'RFesti',
        seed = 888,
        ntrees = ntrees,
        max_depth = depth,
        nfolds = 10,
        keep_cross_validation_models = False,
    )
    forest.train(x=predictors, y=response, training_frame=frame)
    elapsed = time.time() - timer_begin

    # One-row frame; the key order fixes the column order of the CSV.
    result_row = h2o.H2OFrame({
        'ntrees': ntrees,
        'max_depth': depth,
        'rmse': forest.rmse(xval=True),
        'calc_time': elapsed,
    })

    # Dataset name without its last extension, followed by the hyperparameter tags.
    dataset_stem = '.'.join(file.split('.')[0:-1])
    result_name = dataset_stem + '_ntr' + str(ntrees) + '_dpth' + str(depth) + '.csv'
    h2o.download_csv(result_row, gTrouth_path + '/RF_one_by_one/' + result_name)

In [None]:
def GLM_gTruth_oneByOne(file, source, gTrouth_path, alpha):
    """Cross-validate one GLM (fixed alpha, lambda search) on one dataset.

    The file ``source/file`` is imported into H2O; its last column is used as
    the response and all preceding columns as predictors. A GLM is trained
    with 10-fold cross-validation and an internal lambda search over 100
    lambdas. The lambda value is parsed out of the 'regularization' text of
    the model summary, and a one-row CSV with alpha, lambda, the
    cross-validated RMSE and the elapsed wall-clock time (seconds) is
    downloaded into ``gTrouth_path/GLM_one_by_one``.

    Parameters
    ----------
    file : str
        Name of the dataset file inside ``source``.
    source : str
        Folder that contains ``file``.
    gTrouth_path : str
        Folder whose ``GLM_one_by_one`` sub-folder receives the result file.
    alpha : float
        Forwarded to the estimator as ``alpha``.
    """
    print('processing file: ' + file + ' | alpha:' + str(alpha))

    frame = h2o.import_file(source + '/' + file, header = 1)
    column_names = frame.columns
    predictors, response = column_names[0:-1], column_names[-1]

    # Wall-clock timing covers both building the estimator and training it.
    timer_begin = time.time()
    glm = h2o.estimators.glm.H2OGeneralizedLinearEstimator(
        model_id = 'GLMesti',
        seed = 888,
        alpha = alpha,
        lambda_search = True,
        nlambdas = 100,
        standardize = True,
        nfolds = 10,
        keep_cross_validation_models = False,
    )
    glm.train(x=predictors, y=response, training_frame=frame)
    elapsed = time.time() - timer_begin

    # The number is whatever follows the last '= ' in the regularization text,
    # cut at the next space.
    regularization_text = glm.summary()['regularization'][0]
    after_last_equals = regularization_text.split('= ')[-1]
    lambda_param = float(after_last_equals.split(' ')[0])

    # One-row frame; the key order fixes the column order of the CSV.
    result_row = h2o.H2OFrame({
        'alpha': alpha,
        'lambda': lambda_param,
        'rmse': glm.rmse(xval=True),
        'calc_time': elapsed,
    })

    # Dataset name without its last extension, followed by the hyperparameter tags.
    dataset_stem = '.'.join(file.split('.')[0:-1])
    result_name = dataset_stem + '_alpha' + str(alpha) + '_lambda' + str(lambda_param) + '.csv'
    h2o.download_csv(result_row, gTrouth_path + '/GLM_one_by_one/' + result_name)

In [None]:
def XGboost_gTruth_oneByOne(file, source, gTrouth_path, ntrees, depth, colsample_bytree, method = 'cpu'):
    """Cross-validate one XGBoost configuration on one dataset.

    The file ``source/file`` is imported into H2O; its last column is used as
    the response and all preceding columns as predictors. One model is
    trained with 10-fold cross-validation, and a one-row CSV with the
    hyperparameters, the cross-validated RMSE and the elapsed wall-clock time
    (seconds) is downloaded twice: into ``gTrouth_path/XGboost_one_by_one``
    and into the notebook-level ``all_partials_path`` folder.

    Parameters
    ----------
    file : str
        Name of the dataset file inside ``source``.
    source : str
        Folder that contains ``file``.
    gTrouth_path : str
        Folder whose ``XGboost_one_by_one`` sub-folder receives the result file.
    ntrees : int
        Forwarded to the estimator as ``ntrees``.
    depth : int
        Forwarded to the estimator as ``max_depth``.
    colsample_bytree : float
        Forwarded to the estimator as ``col_sample_rate_per_tree``.
    method : str, default 'cpu'
        Forwarded to the estimator as ``backend``.
    """
    print('processing file: ' + file + ' | ntrees:' + str(ntrees) + ', Tree depth:' + str(depth)+
         ', colsPerTree:' + str(colsample_bytree))

    frame = h2o.import_file(source + '/' + file, header = 1)
    column_names = frame.columns
    predictors, response = column_names[0:-1], column_names[-1]

    # Wall-clock timing covers both building the estimator and training it.
    timer_begin = time.time()
    booster = h2o.estimators.xgboost.H2OXGBoostEstimator(
        model_id = 'XGesti',
        seed = 888,
        backend = method,
        ntrees = ntrees,
        max_depth = depth,
        col_sample_rate_per_tree = colsample_bytree,
        nfolds = 10,
        keep_cross_validation_models = False,
    )
    booster.train(x=predictors, y=response, training_frame=frame)
    elapsed = time.time() - timer_begin

    # One-row frame; the key order fixes the column order of the CSV.
    result_row = h2o.H2OFrame({
        'ntrees': ntrees,
        'max_depth': depth,
        'cols_per_tree': colsample_bytree,
        'rmse': booster.rmse(xval=True),
        'calc_time': elapsed,
    })

    # Dataset name without its last extension, followed by the hyperparameter tags.
    dataset_stem = '.'.join(file.split('.')[0:-1])
    result_name = (dataset_stem + '_ntr' + str(ntrees) + '_dpth' + str(depth)
                   + '_colsPerTree' + str(colsample_bytree) + '.csv')

    # Same result goes to the per-algorithm folder first, then to the partials folder.
    h2o.download_csv(result_row, gTrouth_path + '/XGboost_one_by_one/' + result_name)
    h2o.download_csv(result_row, all_partials_path + '/' + result_name)

In [None]:
def find_eco_index(indices, data):
    """Return the index label of the smallest model among the given rows.

    Model size is measured as ``ntrees * max_depth``. Among the rows of
    ``data`` addressed by ``indices``, the label with the smallest size is
    returned; when several rows share that size, the one that comes first in
    ``indices`` wins.

    Parameters
    ----------
    indices : iterable
        Index labels of ``data`` to choose from.
    data : pandas.DataFrame
        Frame with (at least) the columns 'ntrees' and 'max_depth'.

    Returns
    -------
    The element of ``indices`` whose row has the minimal ``ntrees * max_depth``.

    Raises
    ------
    ValueError
        If ``indices`` is empty, so there is nothing to choose from.
    """
    candidates = list(indices)
    if not candidates:
        # Previously this surfaced as an UnboundLocalError on the return statement.
        raise ValueError('find_eco_index needs at least one candidate index')

    def model_size(ind):
        return data.loc[ind, 'ntrees'] * data.loc[ind, 'max_depth']

    # min() keeps the first of several equal minima, matching the original
    # strict ">" comparison.
    return min(candidates, key=model_size)

In [None]:
def combine_results_RF(source_path, visualInfo = False):
    """Merge all partial Random Forest results in a folder into one summary CSV.

    Every file in ``source_path`` is read as a comma-separated table with the
    columns 'ntrees', 'max_depth', 'rmse' and 'calc_time'. Two hyperparameter
    combinations are selected:

    * best: the lowest rmse (ties resolved towards the smallest
      ``ntrees * max_depth``);
    * eco: the smallest ``ntrees * max_depth`` among all rows whose rmse is
      at most 1% above the best rmse.

    A one-row summary (best/eco parameters and rmse, plus the summed
    'calc_time') is written to ``source_path/../RF/<dataset>.csv``, where
    ``<dataset>`` is the part of a partial file name before '_ntr'. All
    partial files in ``source_path`` are then deleted.

    Parameters
    ----------
    source_path : str
        Folder containing the partial result files.
    visualInfo : bool, default False
        When True, print the first 10 combined rows.

    Raises
    ------
    ValueError
        If ``source_path`` contains no files.
    """
    # os.listdir returns names in arbitrary order; sorting makes the row order
    # (and therefore tie-breaking) reproducible.
    files = sorted(os.listdir(source_path))
    if not files:
        raise ValueError('No partial result files found in ' + source_path)

    # Read everything first and concatenate once instead of growing the frame per file.
    data = pd.concat([pd.read_csv(os.path.join(source_path, name)) for name in files],
                     ignore_index=True)
    if visualInfo:
        print(data.head(10))

    best_rmse = data['rmse'].min()
    # Even if several combinations reach the same minimum rmse, the smallest
    # forest among them is chosen.
    best_index = find_eco_index(data[data['rmse'] == best_rmse].index, data)

    # Every combination with an rmse up to 1% above the best one is
    # considered, and the one giving the smallest forest is chosen.
    eco_index = find_eco_index(data[data['rmse'] <= best_rmse * 1.01].index, data)

    summary = pd.DataFrame({'ntrees_best':[data.loc[best_index,'ntrees']],
                            'max_depth_best':[data.loc[best_index,'max_depth']],
                            'rmse_best':[data.loc[best_index,'rmse']],
                            'ntrees_eco':[data.loc[eco_index,'ntrees']],
                            'max_depth_eco':[data.loc[eco_index,'max_depth']],
                            'rmse_eco':[data.loc[eco_index,'rmse']],
                            'calc_time':[data['calc_time'].sum()]
                           })

    # NOTE(review): the dataset name is taken from a single file name, so this
    # assumes all partial files in the folder belong to the same dataset.
    dataset_name = files[-1].split('_ntr')[0]
    summary.to_csv(source_path + '/../RF/' + dataset_name + '.csv', index=False)

    # Partial files are only removed after the summary has been written.
    for name in files:
        os.remove(source_path + '/' + name)

In [None]:
def combine_results_GLM(source_path, visualInfo = False):
    """Merge all partial GLM results in a folder into one summary CSV.

    Every file in ``source_path`` is read as a comma-separated table with the
    columns 'alpha', 'lambda', 'rmse' and 'calc_time'. The row with the lowest
    rmse is selected; when several rows tie, the one whose row number is
    closest to 4.5 wins (the later row on equal distance).

    A one-row summary (best alpha, lambda and rmse, plus the summed
    'calc_time') is written to ``source_path/../GLM/<dataset>.csv``, where
    ``<dataset>`` is the part of a partial file name before '_alpha'. All
    partial files in ``source_path`` are then deleted.

    Parameters
    ----------
    source_path : str
        Folder containing the partial result files.
    visualInfo : bool, default False
        When True, print the first 10 combined rows.

    Raises
    ------
    ValueError
        If ``source_path`` contains no files, or no row has a usable rmse.
    """
    # os.listdir returns names in arbitrary order; sorting makes the row order
    # (which the tie-break below depends on) reproducible.
    files = sorted(os.listdir(source_path))
    if not files:
        raise ValueError('No partial result files found in ' + source_path)

    # Read everything first and concatenate once instead of growing the frame per file.
    data = pd.concat([pd.read_csv(os.path.join(source_path, name), header = 0)
                      for name in files],
                     ignore_index=True)
    if visualInfo:
        print(data.head(10))

    best_rmse = data['rmse'].min()
    tied_rows = list(data[data['rmse'] == best_rmse].index)
    if not tied_rows:
        raise ValueError('No row with a valid rmse found in ' + source_path)

    # Tie-break between equally good rows: prefer the row nearest to 4.5.
    # The original loop tracked this choice in "flag" but then assigned the
    # loop variable, so the last tied row always won; the tracked choice is
    # what gets used now. Iterating in reverse keeps the "<=" semantics (on
    # equal distance the later row wins), since min() returns the first minimum.
    # NOTE(review): 4.5 is presumably meant as the centre of the alpha grid - confirm.
    preferred_row = 4.5
    best_index = min(tied_rows[::-1], key=lambda row: abs(preferred_row - row))
    print('Best index:' + str(best_index))

    summary = pd.DataFrame({'alpha_best':[data.loc[best_index,'alpha']],
                            'lambda_best':[data.loc[best_index,'lambda']],
                            'rmse_best':[data.loc[best_index,'rmse']],
                            'calc_time':[data['calc_time'].sum()]
                           })

    # NOTE(review): the dataset name is taken from a single file name, so this
    # assumes all partial files in the folder belong to the same dataset.
    dataset_name = files[-1].split('_alpha')[0]
    summary.to_csv(source_path + '/../GLM/' + dataset_name + '.csv', index=False)

    # Partial files are only removed after the summary has been written.
    for name in files:
        os.remove(source_path + '/' + name)

In [None]:
def combine_results_XGboost(source_path, visualInfo = False):
    """Merge all partial XGBoost results in a folder into one summary CSV.

    Every file in ``source_path`` is read as a comma-separated table with the
    columns 'ntrees', 'max_depth', 'cols_per_tree', 'rmse' and 'calc_time'.
    Two hyperparameter combinations are selected:

    * best: the lowest rmse (ties resolved towards the smallest
      ``ntrees * max_depth``);
    * eco: the smallest ``ntrees * max_depth`` among all rows whose rmse is
      at most 1% above the best rmse.

    A one-row summary (best/eco parameters and rmse, plus the summed
    'calc_time') is written to ``source_path/../XGboost/<dataset>.csv``,
    where ``<dataset>`` is the part of a partial file name before '_ntr'.
    All partial files in ``source_path`` are then deleted.

    Parameters
    ----------
    source_path : str
        Folder containing the partial result files.
    visualInfo : bool, default False
        When True, print the first 10 combined rows.

    Raises
    ------
    ValueError
        If ``source_path`` contains no files.
    """
    # os.listdir returns names in arbitrary order; sorting makes the row order
    # (and therefore tie-breaking) reproducible.
    files = sorted(os.listdir(source_path))
    if not files:
        raise ValueError('No partial result files found in ' + source_path)

    # Read everything first and concatenate once instead of growing the frame per file.
    data = pd.concat([pd.read_csv(os.path.join(source_path, name)) for name in files],
                     ignore_index=True)
    if visualInfo:
        print(data.head(10))

    best_rmse = data['rmse'].min()
    # Even if several combinations reach the same minimum rmse, the smallest
    # model (ntrees * max_depth) among them is chosen.
    best_index = find_eco_index(data[data['rmse'] == best_rmse].index, data)

    # Every combination with an rmse up to 1% above the best one is
    # considered, and the one giving the smallest model is chosen.
    eco_index = find_eco_index(data[data['rmse'] <= best_rmse * 1.01].index, data)

    summary = pd.DataFrame({'ntrees_best':[data.loc[best_index,'ntrees']],
                            'max_depth_best':[data.loc[best_index,'max_depth']],
                            'cols_per_tree_best':[data.loc[best_index,'cols_per_tree']],
                            'rmse_best':[data.loc[best_index,'rmse']],
                            'ntrees_eco':[data.loc[eco_index,'ntrees']],
                            'max_depth_eco':[data.loc[eco_index,'max_depth']],
                            'cols_per_tree_eco':[data.loc[eco_index,'cols_per_tree']],
                            'rmse_eco':[data.loc[eco_index,'rmse']],
                            'calc_time':[data['calc_time'].sum()]
                           })

    # NOTE(review): the dataset name is taken from a single file name, so this
    # assumes all partial files in the folder belong to the same dataset.
    dataset_name = files[-1].split('_ntr')[0]
    summary.to_csv(source_path + '/../XGboost/' + dataset_name + '.csv', index=False)

    # Partial files are only removed after the summary has been written.
    for name in files:
        os.remove(source_path + '/' + name)

In [None]:
from shutil import copyfile
def help_combine(file_name, source, destination, algo):
    """Copy the partial result files of one dataset from ``source`` to ``destination``.

    A file in ``source`` belongs to the dataset when the part of its name
    before the algorithm marker ('_ntr' for 'XGBoost' and 'RF', '_alpha' for
    'GLM') equals ``file_name`` without its last extension. For any other
    ``algo`` value nothing is copied.

    Parameters
    ----------
    file_name : str
        Dataset file name, including its extension.
    source : str
        Folder searched for partial result files.
    destination : str
        Folder the matching files are copied into (same file names).
    algo : str
        'XGBoost', 'RF' or 'GLM'.
    """
    available = os.listdir(source)

    # Pick the token that separates the dataset name from the hyperparameter tags.
    if algo in ['XGBoost', 'RF']:
        marker = '_ntr'
    elif algo == 'GLM':
        marker = '_alpha'
    else:
        return

    wanted_stem = '.'.join(file_name.split('.')[0:-1])
    for name in available:
        if name.split(marker)[0] != wanted_stem:
            continue
        # Copy into destination (== Algo_one_by_one folder)
        copyfile(source + '/' + name, destination + '/' + name)

In [None]:
# Manually implemented Random Forest grid search.
# For every dataset that has no summary file in <ground_truth>/RF yet, each
# (ntrees, max_depth) combination is cross-validated one at a time, and the
# partial results are then merged into a single summary file.
param_ntrees = [25,50,75,100,200,300,400,500]
param_maxDepth = [20,40,60,80]
# Dataset file names listed here are skipped in addition to the finished ones.
extra_ignore = []
alreadyDone = os.listdir(ground_truth_path + '/RF')
datasets = os.listdir(target_test_path)
# Remaining datasets = all datasets minus finished/ignored ones, processed in sorted order.
for f in np.sort(np.setdiff1d(datasets, alreadyDone + extra_ignore, assume_unique=True)):
    for nt in param_ntrees:
        for dpth in param_maxDepth:
            # Clear the H2O cluster before (twice) and after every combination.
            h2o.remove_all()
            h2o.remove_all()
            # "file_name" is the partial-result file this combination would
            # produce; if it already exists the combination has been tested
            # before and is skipped.
            file_name = '.'.join(f.split('.')[0:-1]) + '_ntr' + str(nt) + '_dpth' + str(dpth) + '.csv'
            if not os.path.isfile(ground_truth_path + '/RF_one_by_one/' + file_name):
                RF_gTruth_oneByOne(f, target_test_path, ground_truth_path,nt,dpth)
            h2o.remove_all()
    # Merge this dataset's partial files into <ground_truth>/RF and delete them.
    combine_results_RF(ground_truth_path + '/RF_one_by_one', visualInfo = True)

In [None]:
# Manually implemented Generalised Linear Model grid search.
# Alpha grid: 0.0, 0.125, ..., 1.0 (9 values).
y = range(0, 1100, 125)
param_alpha = [x / 1000 for x in y]
# Lambda is not part of the grid: for each alpha the best lambda is searched
# internally by the estimator (lambda_search = True).

alreadyDone = os.listdir(ground_truth_path + '/GLM')
datasets = os.listdir(target_test_path)
# Only datasets without a summary file in <ground_truth>/GLM are processed.
for f in np.setdiff1d(datasets, alreadyDone, assume_unique=True):
    # NOTE(review): start_time is assigned here but not read anywhere in this cell.
    start_time = time.time()
    for A in param_alpha:
        # Clear the H2O cluster before (twice) and after every alpha value.
        h2o.remove_all()
        h2o.remove_all()
        GLM_gTruth_oneByOne(f , target_test_path, ground_truth_path,A)
        h2o.remove_all()
    # Merge this dataset's partial files into <ground_truth>/GLM and delete them.
    combine_results_GLM(ground_truth_path + '/GLM_one_by_one', visualInfo = True)

In [None]:
# Manually implemented XGBoost grid search.
# For every dataset that has no summary file in <ground_truth>/XGboost yet,
# each (ntrees, max_depth, cols-per-tree) combination is cross-validated one
# at a time, and the partial results are then merged into a single summary file.
param_ntrees = [25,50,100,200]
param_maxDepth = [6,10,15]
colsample_bytree = [.6,.7,.8,.9]
cpu_gpu_pick = 20971520  #Size of file, in bytes, over which the gpu method is picked. 20MB
alreadyDone = os.listdir(ground_truth_path + '/XGboost')
# Dataset file names listed here are skipped in addition to the finished ones.
extra_ignore = ['processed_Houston%humidity.csv']
datasets = os.listdir(target_test_path)
# Remaining datasets = all datasets minus finished/ignored ones, processed in sorted order.
for f in np.sort(np.setdiff1d(datasets, alreadyDone + extra_ignore, assume_unique=True)):
    f_size = os.path.getsize(target_test_path + '/' + f)
    print(f_size)
    # Bring back any partial results of this dataset stored in the partials
    # folder, so the existence check below can skip them.
    help_combine(f, all_partials_path, ground_truth_path + '/XGboost_one_by_one/', algo = 'XGBoost')
    for nt in param_ntrees:
        for dpth in param_maxDepth:
            for colsPerTree in colsample_bytree:
                # Clear the H2O cluster before (twice) and after every combination.
                h2o.remove_all()
                h2o.remove_all()
                # Partial-result file this combination would produce; if it
                # already exists the combination is skipped.
                file_name = '.'.join(f.split('.')[0:-1]) + '_ntr' + str(nt) + '_dpth' + str(dpth) + '_colsPerTree' + str(colsPerTree) + '.csv'
                if not os.path.isfile(ground_truth_path + '/XGboost_one_by_one/' + file_name):
                    # CPU backend for files below the size threshold or for
                    # depths above 16; GPU backend otherwise.
                    if f_size < cpu_gpu_pick or dpth > 16:
                        XGboost_gTruth_oneByOne(f , target_test_path, ground_truth_path,
                                                nt, dpth, colsPerTree, 'cpu')
                    else:
                        XGboost_gTruth_oneByOne(f , target_test_path, ground_truth_path,
                                                nt, dpth, colsPerTree, 'gpu')
                h2o.remove_all()
    # Merge this dataset's partial files into <ground_truth>/XGboost and delete them.
    combine_results_XGboost(ground_truth_path + '/XGboost_one_by_one', visualInfo = True)

In [None]:
# Final clean-up: remove everything still held by the H2O cluster, then shut the cluster down.
h2o.remove_all()
h2o.cluster().shutdown()