In [1]:
import sys
sys.path.append('./helpers')
sys.path.append('./optimizers')

In [2]:
# External libraries
import pandas as pd
import numpy as np


# Custom functions and classes
from pymoo_optimizer import PymooOptimizer
import benchmark as Benchmark
import data_provider as dp

In [3]:
# Load the 'breast' dataset as six parts, returned in the order
# train / test / validation for x, then train / test / validation for y.
# NOTE(review): 0.8 and 0.1 are presumably split fractions — confirm against data_provider.
(
    x_train, x_test, x_val,
    y_train, y_test, y_val,
) = dp.get_train_test_validation_data('breast', 0.8, 0.1)

In [4]:
# Build the optimizer from the train and validation splits, then run it with the
# decision-tree accuracy benchmark. `solution` is used further down as a column
# mask on the feature frames.
# NOTE(review): 64 and 100 look like search-budget settings (e.g. population size /
# number of generations) — confirm against PymooOptimizer.optimize.
optimizer = PymooOptimizer(
    x_train,
    x_val,
    y_train,
    y_val,
)
solution, score = optimizer.optimize(
    64,
    100,
    Benchmark.dtree_accuracy,
    verbose=False,
)

In [5]:
# Compare decision-tree accuracy on the test split using all columns versus
# only the columns selected by `solution`. Scores are percentages rounded to 2 dp.
raw_score = np.round(Benchmark.dtree_accuracy(x_train, x_test, y_train, y_test) * 100, 2)
optimized_score = np.round(
    Benchmark.dtree_accuracy(x_train.loc[:, solution], x_test.loc[:, solution], y_train, y_test) * 100,
    2,
)

# Column statistics are computed once and reused in the report.
n_columns = x_train.shape[1]
n_selected = np.sum(solution)
reduction_pct = np.round((n_columns - n_selected) / n_columns * 100, 1)
selected_names = ", ".join(x_train.columns[solution].tolist())

# The second fragment ends with ", " so it no longer runs straight into "reducing"
# (the message used to read "dataset,reducing").
print(f'After optimization, decision tree accuracy changed from {raw_score}% to {optimized_score}%\n\n' +
      f'Optimal contains {reduction_pct}% less columns than the original dataset, ' +
      f'reducing from {n_columns} to {n_selected}\n\nSelected columns are: {selected_names}')


After optimization, decision tree accuracy changed from 91.23% to 93.86%

Optimal contains 60.0% less columns than the original dataset,reducing from 30 to 12

Selected columns are: texture1, perimeter1, smoothness1, concave_points1, texture2, perimeter2, area2, smoothness2, concavity2, radius3, perimeter3, concave_points3


In [6]:
# Evolve a new feature
# Work on copies: a plain assignment would only alias the original frames, so adding
# the column below would also modify x_train / x_test / x_val. Re-running the cell
# would then pass the extra column into best_feature_func(*row) as an additional argument.
x_train_2 = x_train.copy()
x_test_2 = x_test.copy()
x_val_2 = x_val.copy()
best_feature_func = optimizer.evolve_new_feature(epochs=50, heuristics=Benchmark.dtree_accuracy)

# Apply the evolved feature to the training, test and validation sets.
# Each row is unpacked positionally into the evolved function.
x_train_2["evolved_feature"] = x_train_2.apply(lambda row: best_feature_func(*row), axis=1)
x_test_2["evolved_feature"] = x_test_2.apply(lambda row: best_feature_func(*row), axis=1)
x_val_2["evolved_feature"] = x_val_2.apply(lambda row: best_feature_func(*row), axis=1)

# Evaluate the model with the new feature added
score_with_new_feature = np.round(Benchmark.dtree_accuracy(x_train_2, x_test_2, y_train, y_test) * 100, 2)

# Re-run the selection on the augmented frames.
# Note: this rebinds `solution` and `score` from the earlier optimization cell.
optimizer_2 = PymooOptimizer(x_train_2, x_val_2, y_train, y_val)
solution, score = optimizer_2.optimize(64, 100, Benchmark.dtree_accuracy, verbose=False)
optimized_score_with_new_feature = np.round(
    Benchmark.dtree_accuracy(x_train_2.loc[:, solution], x_test_2.loc[:, solution], y_train, y_test) * 100,
    2,
)

# The mask was produced from x_train_2, so counts and column names must be read
# from x_train_2 as well (x_train no longer carries the evolved column).
n_columns_2 = x_train_2.shape[1]
n_selected_2 = np.sum(solution)
reduction_pct_2 = np.round((n_columns_2 - n_selected_2) / n_columns_2 * 100, 1)
selected_names_2 = ", ".join(x_train_2.columns[solution].tolist())

# The third fragment ends with ", " so it no longer runs straight into "reducing".
print(f'After feature creation, decision tree accuracy set to {score_with_new_feature}%\n\n' +
      f'After feature creation and selection, decision tree accuracy set to {optimized_score_with_new_feature}%\n\n' +
      f'Optimal contains {reduction_pct_2}% less columns than the original dataset, ' +
      f'reducing from {n_columns_2} to {n_selected_2}\n\nSelected columns are: {selected_names_2}')

gen	nevals	avg     	std      	min     	max     
0  	50    	0.885464	0.0237514	0.842105	0.937343
1  	30    	0.894386	0.0231121	0.842105	0.937343
2  	34    	0.895439	0.0209692	0.857143	0.929825
3  	35    	0.896441	0.0192604	0.859649	0.929825
4  	30    	0.896742	0.0238029	0.857143	0.947368
5  	37    	0.89584 	0.0217982	0.842105	0.947368
6  	30    	0.898847	0.020556 	0.842105	0.929825
7  	29    	0.903008	0.0227519	0.842105	0.944862
8  	31    	0.903409	0.0268543	0.842105	0.944862
9  	19    	0.909875	0.0254702	0.842105	0.944862
10 	36    	0.905965	0.0284093	0.842105	0.944862
11 	34    	0.915088	0.0242611	0.842105	0.9599  
12 	31    	0.918847	0.0269881	0.857143	0.9599  
13 	28    	0.919298	0.0272065	0.842105	0.9599  
14 	34    	0.91604 	0.0302054	0.857143	0.9599  
15 	30    	0.917544	0.0295452	0.849624	0.957393
16 	30    	0.925113	0.0244468	0.842105	0.9599  
17 	35    	0.922206	0.0291157	0.857143	0.962406
18 	26    	0.933283	0.0244069	0.857143	0.962406
19 	24    	0.938747	0.0250283	0.859649	0