In [5]:
import sys
sys.path.append('./helpers')
sys.path.append('./optimizers')

In [6]:
# External libraries
import pandas as pd
import numpy as np


# Custom functions and classes
from pymoo_optimizer import PymooOptimizer
import benchmark as Benchmark
import data_provider as dp

In [7]:
x_train, x_test, x_val, y_train, y_test, y_val = dp.get_train_test_validation_data('breast', 0.8, 0.1)

In [8]:
optimizer = PymooOptimizer(x_train, x_val, y_train, y_val)
solution, score = optimizer.optimize(64, 100, Benchmark.dtree_accuracy, verbose = False)

In [9]:
raw_score = np.round(Benchmark.dtree_accuracy(x_train, x_test, y_train, y_test) * 100, 2)
optimized_score = np.round(Benchmark.dtree_accuracy(x_train.loc[:, solution], x_test.loc[:, solution], y_train, y_test) * 100, 2)
print(f'After optimization, decision tree accuracy changed from {raw_score}% to {optimized_score}%\n\n' +
        f'Optimal contains {np.round((x_train.shape[1] - np.sum(solution)) / x_train.shape[1] * 100, 1)}% less columns than the original dataset,' +
        f'reducing from {x_train.shape[1]} to {np.sum(solution)}\n\nSelected columns are: {", ".join(x_train.columns[solution].tolist())}')


After optimization, decision tree accuracy changed from 90.53% to 94.12%

Optimal contains 70.0% less columns than the original dataset,reducing from 30 to 9

Selected columns are: area1, radius2, texture2, perimeter2, radius3, texture3, area3, smoothness3, concave_points3


In [10]:
# Evolve a new feature
x_train_2 = x_train
x_test_2 = x_test
x_val_2 = x_val
best_feature_func = optimizer.evolve_new_feature(epochs=50, heuristics=Benchmark.dtree_accuracy)

# Apply the evolved feature to the training and test sets
x_train_2["evolved_feature"] = x_train_2.apply(lambda row: best_feature_func(*row), axis=1)
x_test_2["evolved_feature"] = x_test_2.apply(lambda row: best_feature_func(*row), axis=1)
x_val_2["evolved_feature"] = x_val_2.apply(lambda row: best_feature_func(*row), axis=1)

# Evaluate the model with the new feature added
score_with_new_feature = np.round(Benchmark.dtree_accuracy(x_train_2, x_test_2, y_train, y_test) * 100, 2)
optimizer_2 = PymooOptimizer(x_train_2, x_val_2, y_train, y_val)
solution, score = optimizer_2.optimize(64, 100, Benchmark.dtree_accuracy, verbose = False)
optimized_score_with_new_feature = np.round(Benchmark.dtree_accuracy(x_train_2.loc[:, solution], x_test_2.loc[:, solution], y_train, y_test) * 100, 2)
print(f'After feature creation, decision tree accuracy set to {score_with_new_feature}%\n\n' +
      f'After feature creation and selection, decision tree accuracy set to {optimized_score_with_new_feature}%\n\n' +
        f'Optimal contains {np.round((x_train.shape[1] - np.sum(solution)) / x_train.shape[1] * 100, 1)}% less columns than the original dataset,' +
        f'reducing from {x_train.shape[1]} to {np.sum(solution)}\n\nSelected columns are: {", ".join(x_train.columns[solution].tolist())}')

gen	nevals	avg     	std       	min     	max     
0  	50    	0.884802	0.00397692	0.876441	0.895739
1  	40    	0.884932	0.00487651	0.861905	0.895739
2  	35    	0.885794	0.00504312	0.869925	0.896992
3  	32    	0.887183	0.00507975	0.879198	0.905764
4  	25    	0.888501	0.00584527	0.878947	0.905764
5  	29    	0.889343	0.00786971	0.868922	0.912281
6  	36    	0.891128	0.0105379 	0.866416	0.922807
7  	27    	0.892306	0.0132203 	0.837093	0.922807
8  	27    	0.894647	0.0117049 	0.876692	0.922807
9  	24    	0.896867	0.0123342 	0.87995 	0.922807
10 	28    	0.8999  	0.00962589	0.881454	0.922807
11 	33    	0.895333	0.0129919 	0.849875	0.922807
12 	32    	0.898366	0.0140454 	0.85589 	0.922807
13 	37    	0.899388	0.0147215 	0.870426	0.922807
14 	36    	0.901218	0.0137928 	0.875439	0.923308
15 	26    	0.907554	0.0139791 	0.87193 	0.925063
16 	32    	0.905614	0.015529  	0.874687	0.925063
17 	28    	0.907584	0.0144898 	0.861404	0.925063
18 	32    	0.904817	0.0149507 	0.878446	0.925063
19 	31    	0.908932	

In [11]:
import h2o
from sklearn.model_selection import train_test_split

h2o.init()

train = h2o.H2OFrame(pd.concat([x_train, y_train], axis=1))
test = h2o.H2OFrame(pd.concat([x_test, y_test], axis=1))
val = h2o.H2OFrame(pd.concat([x_val, y_val], axis=1))

target = y_train.name
features = x_train.columns.tolist()

train2 = h2o.H2OFrame(pd.concat([x_train_2.loc[:, solution], y_train], axis=1))
test2 = h2o.H2OFrame(pd.concat([x_test_2.loc[:, solution], y_test], axis=1))
val2 = h2o.H2OFrame(pd.concat([x_val_2.loc[:, solution], y_val], axis=1))

features2 = x_train_2.loc[:, solution].columns.tolist()

Checking whether there is an H2O instance running at http://localhost:54321..... not found.
Attempting to start a local H2O server...
; Java HotSpot(TM) Client VM (build 25.431-b10, mixed mode)
  Starting server from C:\Users\Tristan\AppData\Local\Programs\Python\Python311\Lib\site-packages\h2o\backend\bin\h2o.jar
  Ice root: C:\Users\Tristan\AppData\Local\Temp\tmp86jgmgtb
  JVM stdout: C:\Users\Tristan\AppData\Local\Temp\tmp86jgmgtb\h2o_Tristan_started_from_python.out
  JVM stderr: C:\Users\Tristan\AppData\Local\Temp\tmp86jgmgtb\h2o_Tristan_started_from_python.err


  Please download the latest 64-bit Java SE JDK from Oracle.

  warn("  You have a 32-bit version of Java. H2O works best with 64-bit Java.\n"


  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321 ... successful.


0,1
H2O_cluster_uptime:,01 secs
H2O_cluster_timezone:,Europe/Belgrade
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.46.0.6
H2O_cluster_version_age:,15 days
H2O_cluster_name:,H2O_from_python_Tristan_lkse80
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,247.5 Mb
H2O_cluster_total_cores:,16
H2O_cluster_allowed_cores:,16


Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%


In [12]:
from h2o.estimators import H2OGradientBoostingEstimator

gbm = H2OGradientBoostingEstimator(
    ntrees=100,
    max_depth=5,
    min_rows=10,
    learn_rate=0.1,
    seed=623
)

gbm.train(x=features, y=target, training_frame=train, validation_frame=val)

predictions = np.round(gbm.predict(test).as_data_frame().values.flatten())
accuracy = np.sum(predictions == y_test.values) / len(y_test.values)

gbm2 = H2OGradientBoostingEstimator(
    ntrees=100,
    max_depth=5,
    min_rows=10,
    learn_rate=0.1,
    seed=623
)

gbm2.train(x=features2, y=target, training_frame=train2, validation_frame=val2)

predictions2 = np.round(gbm2.predict(test2).as_data_frame().values.flatten())
accuracy2 = np.sum(predictions2 == y_test.values) / len(y_test.values)

gbm Model Build progress: |



██████████████████████████████████████████████████████| (done) 100%
gbm prediction progress: |███████████████████████████████████████████████████████| (done) 100%
gbm Model Build progress: |




██████████████████████████████████████████████████████| (done) 100%
gbm prediction progress: |███████████████████████████████████████████████████████| (done) 100%





In [13]:
print(f'After optimization, H2O Gradient Boosting Estimator accuracy changed from {accuracy}% to {accuracy2}%\n')

After optimization, H2O Gradient Boosting Estimator accuracy changed from 0.9210526315789473% to 0.9385964912280702%



In [14]:
from h2o.estimators.random_forest import H2ORandomForestEstimator

rf = H2ORandomForestEstimator(
    ntrees=100,
    max_depth=20,
    min_rows=5,
    seed=623
)

rf.train(x=features, y=target, training_frame=train, validation_frame=val)

predictions = np.round(rf.predict(test).as_data_frame().values.flatten())
accuracy = np.sum(predictions == y_test.values) / len(y_test.values)

rf2 = H2ORandomForestEstimator(
    ntrees=100,
    max_depth=20,
    min_rows=5,
    seed=623
)

rf2.train(x=features2, y=target, training_frame=train2, validation_frame=val2)

predictions2 = np.round(rf2.predict(test2).as_data_frame().values.flatten())
accuracy2 = np.sum(predictions2 == y_test.values) / len(y_test.values)

drf Model Build progress: |



██████████████████████████████████████████████████████| (done) 100%
drf prediction progress: |███████████████████████████████████████████████████████| (done) 100%
drf Model Build progress: |




██████████████████████████████████████████████████████| (done) 100%
drf prediction progress: |███████████████████████████████████████████████████████| (done) 100%





In [15]:
print(f'After optimization, H2O Random Forest Estimator accuracy changed from {accuracy}% to {accuracy2}%\n')

After optimization, H2O Random Forest Estimator accuracy changed from 0.9298245614035088% to 0.9298245614035088%



In [20]:
from h2o.estimators.glm import H2OGeneralizedLinearEstimator

glm = H2OGeneralizedLinearEstimator(
    alpha=0.5,
    lambda_=0.001,
    seed=623
)

glm.train(x=features, y=target, training_frame=train, validation_frame=val)

predictions = np.round(glm.predict(test).as_data_frame().values.flatten())
accuracy = np.sum(predictions == y_test.values) / len(y_test.values)

glm2 = H2OGeneralizedLinearEstimator(
    alpha=0.5,
    lambda_=0.001,
    seed=623
)

glm2.train(x=features2, y=target, training_frame=train2, validation_frame=val2)

predictions2 = np.round(glm2.predict(test2).as_data_frame().values.flatten())
accuracy2 = np.sum(predictions2 == y_test.values) / len(y_test.values)



glm Model Build progress: |██████████████████████████████████████████████████████| (done) 100%
glm prediction progress: |███████████████████████████████████████████████████████| (done) 100%





glm Model Build progress: |██████████████████████████████████████████████████████| (done) 100%
glm prediction progress: |███████████████████████████████████████████████████████| (done) 100%





In [21]:
print(f'After optimization, H2O Generalized Linear Estimator accuracy changed from {accuracy}% to {accuracy2}%\n')

After optimization, H2O Generalized Linear Estimator accuracy changed from 0.9035087719298246% to 0.9298245614035088%



In [33]:
from h2o.estimators.glm import H2OGeneralizedLinearEstimator

sgd = H2OGeneralizedLinearEstimator(
    solver="irlsm",
    alpha=0.001,
    lambda_=0.0001,
    seed=623
)

sgd.train(x=features, y=target, training_frame=train, validation_frame=val)

predictions = np.round(sgd.predict(test).as_data_frame().values.flatten())
accuracy = np.sum(predictions == y_test.values) / len(y_test.values)

sgd2 = H2OGeneralizedLinearEstimator(
    solver="irlsm",
    alpha=0.001,
    lambda_=0.0001,
    seed=623
)

sgd2.train(x=features2, y=target, training_frame=train2, validation_frame=val2)

predictions2 = np.round(sgd2.predict(test2).as_data_frame().values.flatten())
accuracy2 = np.sum(predictions2 == y_test.values) / len(y_test.values)



glm Model Build progress: |██████████████████████████████████████████████████████| (done) 100%
glm prediction progress: |███████████████████████████████████████████████████████| (done) 100%





glm Model Build progress: |██████████████████████████████████████████████████████| (done) 100%
glm prediction progress: |███████████████████████████████████████████████████████| (done) 100%





In [34]:
print(f'After optimization, H2O Generalized Linear Estimator accuracy changed from {accuracy}% to {accuracy2}%\n')

After optimization, H2O Generalized Linear Estimator accuracy changed from 0.8508771929824561% to 0.9210526315789473%



In [35]:
from h2o.automl import H2OAutoML

aml = H2OAutoML(
    max_models=25,
    max_runtime_secs_per_model=30,
    seed=623,
    balance_classes=True,
    class_sampling_factors=[0.5, 1.25]
)

aml.train(x=features, y=target, training_frame=train, validation_frame=val)

predictions = np.round(aml.predict(test).as_data_frame().values.flatten())
accuracy = np.sum(predictions == y_test.values) / len(y_test.values)

aml2 = H2OAutoML(
    max_models=25,
    max_runtime_secs_per_model=30,
    seed=623,
    balance_classes=True,
    class_sampling_factors=[0.5, 1.25]
)

aml2.train(x=features2, y=target, training_frame=train2, validation_frame=val2)

predictions2 = np.round(aml2.predict(test2).as_data_frame().values.flatten())
accuracy2 = np.sum(predictions2 == y_test.values) / len(y_test.values)

AutoML progress: |
01:12:32.415: User specified a validation frame with cross-validation still enabled. Please note that the models will still be validated using cross-validation only, the validation frame will be used to provide purely informative validation metrics on the trained models.
01:12:36.858: AutoML: XGBoost is not available; skipping it.
01:12:46.49: _response param, We have detected that your response column has only 2 unique values (0/1). If you wish to train a binary model instead of a regression model, convert your target column to categorical before training.
01:13:04.802: _response param, We have detected that your response column has only 2 unique values (0/1). If you wish to train a binary model instead of a regression model, convert your target column to categorical before training.
01:13:04.802: _min_rows param, The dataset size is too small to split for min_rows=100.0: must have at least 200.0 (weighted) rows, but have only 56.0.
01:13:04.802: _response param, We





01:13:14.538: User specified a validation frame with cross-validation still enabled. Please note that the models will still be validated using cross-validation only, the validation frame will be used to provide purely informative validation metrics on the trained models.
01:13:14.538: AutoML: XGBoost is not available; skipping it.
01:13:14.538: _response param, We have detected that your response column has only 2 unique values (0/1). If you wish to train a binary model instead of a regression model, convert your target column to categorical before training.
01:13:14.601: _response param, We have detected that your response column has only 2 unique values (0/1). If you wish to train a binary model instead of a regression model, convert your target column to categorical before training.
01:13:14.601: _min_rows param, The dataset size is too small to split for min_rows=100.0: must have at least 200.0 (weighted) rows, but have only 56.0.
01:13:14.601: _response param, We have detected th




In [36]:
print(f'After optimization, H2O AutoML accuracy changed from {accuracy}% to {accuracy2}%\n')

After optimization, H2O AutoML accuracy changed from 0.9298245614035088% to 0.9298245614035088%

