In [1]:
import sys

# Put the project-local directories on the module search path (order preserved).
sys.path.extend(['./helpers', './optimizers', './h2o'])

In [2]:
# External libraries
import pandas as pd
import numpy as np
import importlib
import matplotlib.pyplot as plt
import networkx as nx
from deap import gp

# Custom functions and classes
from pymoo_optimizer import PymooOptimizer
import benchmark as Benchmark
import data_provider as dp
import feature_evolver as fe
import h2o_wrapper as h2o_wrapper

In [3]:
import warnings
warnings.filterwarnings("ignore", module='h2o')

In [4]:
# Load the 'breast' dataset, already split into train / test / validation parts.
# NOTE(review): 0.8 and 0.1 look like split fractions -- confirm against data_provider.
(
    x_train, x_test, x_val,
    y_train, y_test, y_val,
) = dp.get_train_test_validation_data('breast', 0.8, 0.1)

In [5]:
# Feature selection on the original columns. `solution` is used further down as
# a column indexer (x_train.loc[:, solution]); the second return value is discarded.
# NOTE(review): 64 and 100 are passed positionally -- presumably population size
# and number of generations; confirm against PymooOptimizer.optimize.
optimizer = PymooOptimizer(x_train, x_val, y_train, y_val)
solution, _ = optimizer.optimize(
    64,
    100,
    Benchmark.dtree_accuracy,
    verbose=False,
    repeats=5,
)

In [6]:
def optimizer_constructor(x_train, x_val, y_train, y_val):
    """Build a PymooOptimizer for the given train/validation split."""
    return PymooOptimizer(x_train, x_val, y_train, y_val)


# Evolve a new feature set; the call returns the transformed train, validation
# and test feature frames (in that order).
x_train_3, x_val_3, x_test_3 = fe.evolve_new_feature_set(
    optimizer_constructor,
    x_train, x_val, y_train, y_val, x_test,
    verbose=False,
    heuristics=Benchmark.dtree_accuracy,
    epochs=50,
    repeats=5,
    min_features=1,
    max_features=10,
)


In [7]:
import h2o

h2o.init(verbose=False)


def _as_h2o_frame(features_df, labels):
    """Join features and labels column-wise and wrap the result in an H2OFrame."""
    return h2o.H2OFrame(pd.concat([features_df, labels], axis=1))


# 1) Original data: every input column.
train = _as_h2o_frame(x_train, y_train)
test = _as_h2o_frame(x_test, y_test)
val = _as_h2o_frame(x_val, y_val)

target = y_train.name
features = x_train.columns.tolist()

# 2) Feature selection: only the columns picked by `solution`.
train2 = _as_h2o_frame(x_train.loc[:, solution], y_train)
test2 = _as_h2o_frame(x_test.loc[:, solution], y_test)
val2 = _as_h2o_frame(x_val.loc[:, solution], y_val)

features2 = x_train.loc[:, solution].columns.tolist()

# 3) Feature construction: the evolved feature frames.
train3 = _as_h2o_frame(x_train_3, y_train)
test3 = _as_h2o_frame(x_test_3, y_test)
val3 = _as_h2o_frame(x_val_3, y_val)

features3 = x_train_3.columns.tolist()

In [8]:
# Percentage of columns dropped relative to the original feature list.
fs_reduction = 100 - (len(features2) / len(features) * 100)
fc_reduction = 100 - (len(features3) / len(features) * 100)

print(f'Optimized dataset is {fs_reduction:.1f}% smaller than original dataset.')
print(f'Newly created dataset is {fc_reduction:.1f}% smaller than original dataset.')

Optimized dataset is 70.0% smaller than original dataset.
Newly created dataset is 86.7% smaller than original dataset.


In [9]:
# Run the gradient-boosting wrapper on each feature set in turn:
# original columns, selected columns, evolved columns.
accuracy_gbe, fs_accuracy_gbe, fc_accuracy_gbe = (
    h2o_wrapper.gradient_boost(cols, target, train_fr, val_fr, test_fr, y_test)
    for cols, train_fr, val_fr, test_fr in (
        (features, train, val, test),
        (features2, train2, val2, test2),
        (features3, train3, val3, test3),
    )
)

print(
    'After optimization, H2O Gradient Boosting Estimator accuracy changed from '
    f'{np.round(accuracy_gbe * 100, 2)}% to {np.round(fs_accuracy_gbe * 100, 2)}%\n'
)
print('Accuracy on the evolved dataset:', np.round(fc_accuracy_gbe * 100, 2), '%')

After optimization, H2O Gradient Boosting Estimator accuracy changed from 93.86% to 94.74%

Accuracy on the evolved dataset: 95.61 %


In [10]:
# Run the random-forest wrapper on each feature set in turn:
# original columns, selected columns, evolved columns.
accuracy_rf, fs_accuracy_rf, fc_accuracy_rf = (
    h2o_wrapper.random_forest(cols, target, train_fr, val_fr, test_fr, y_test)
    for cols, train_fr, val_fr, test_fr in (
        (features, train, val, test),
        (features2, train2, val2, test2),
        (features3, train3, val3, test3),
    )
)

print(
    'After optimization, H2O Random Forest accuracy changed from '
    f'{np.round(accuracy_rf * 100, 2)}% to {np.round(fs_accuracy_rf * 100, 2)}%\n'
)
print('Accuracy on the evolved dataset:', np.round(fc_accuracy_rf * 100, 2), '%')

After optimization, H2O Random Forest accuracy changed from 93.86% to 95.61%

Accuracy on the evolved dataset: 96.49 %


In [11]:
# Run the generalized-linear-model wrapper on each feature set in turn:
# original columns, selected columns, evolved columns.
accuracy_gl, fs_accuracy_gl, fc_accuracy_gl = (
    h2o_wrapper.generalized_linear(cols, target, train_fr, val_fr, test_fr, y_test)
    for cols, train_fr, val_fr, test_fr in (
        (features, train, val, test),
        (features2, train2, val2, test2),
        (features3, train3, val3, test3),
    )
)

print(
    'After optimization, H2O Generalized Linear Model accuracy changed from '
    f'{np.round(accuracy_gl * 100, 2)}% to {np.round(fs_accuracy_gl * 100, 2)}%\n'
)
print('Accuracy on the evolved dataset:', np.round(fc_accuracy_gl * 100, 2), '%')

After optimization, H2O Generalized Linear Model accuracy changed from 87.72% to 90.35%

Accuracy on the evolved dataset: 87.72 %


In [12]:
# NOTE(review): the result names say "irlsm", but these calls are identical to
# the plain generalized_linear calls made earlier in the notebook -- no solver
# argument is passed, and the printed numbers match that earlier cell exactly.
# Confirm how h2o_wrapper is meant to select the IRLSM solver and pass it here.
accuracy_gl_irlsm, fs_accuracy_gl_irlsm, fc_accuracy_gl_irlsm = (
    h2o_wrapper.generalized_linear(cols, target, train_fr, val_fr, test_fr, y_test)
    for cols, train_fr, val_fr, test_fr in (
        (features, train, val, test),
        (features2, train2, val2, test2),
        (features3, train3, val3, test3),
    )
)

print(
    'After optimization, H2O Generalized Linear Model accuracy changed from '
    f'{np.round(accuracy_gl_irlsm * 100, 2)}% to {np.round(fs_accuracy_gl_irlsm * 100, 2)}%\n'
)
print('Accuracy on the evolved dataset:', np.round(fc_accuracy_gl_irlsm * 100, 2), '%')

After optimization, H2O Generalized Linear Model accuracy changed from 87.72% to 90.35%

Accuracy on the evolved dataset: 87.72 %


In [13]:
# Run the AutoML wrapper on each feature set in turn:
# original columns, selected columns, evolved columns.
accuracy_aml, fs_accuracy_aml, fc_accuracy_aml = (
    h2o_wrapper.auto_ml(cols, target, train_fr, val_fr, test_fr, y_test)
    for cols, train_fr, val_fr, test_fr in (
        (features, train, val, test),
        (features2, train2, val2, test2),
        (features3, train3, val3, test3),
    )
)

print(
    'After optimization, H2O AutoML accuracy changed from '
    f'{np.round(accuracy_aml * 100, 2)}% to {np.round(fs_accuracy_aml * 100, 2)}%\n'
)
print('Accuracy on the evolved dataset:', np.round(fc_accuracy_aml * 100, 2), '%')

After optimization, H2O AutoML accuracy changed from 93.86% to 92.11%

Accuracy on the evolved dataset: 95.61 %
