In [8]:
import sys
sys.path.append('./helpers')
sys.path.append('./optimizers')

In [9]:
# External libraries
import pandas as pd
import numpy as np


# Custom functions and classes
from pymoo_optimizer import PymooOptimizer
import benchmark as Benchmark
import data_provider as dp

In [10]:
x_train, x_test, x_val, y_train, y_test, y_val = dp.get_train_test_validation_data('breast', 0.8, 0.1)

In [11]:
optimizer = PymooOptimizer(x_train, x_val, y_train, y_val)
solution, score = optimizer.optimize(64, 100, Benchmark.dtree_accuracy, verbose = False)

In [12]:
raw_score = np.round(Benchmark.dtree_accuracy(x_train, x_test, y_train, y_test) * 100, 2)
optimized_score = np.round(Benchmark.dtree_accuracy(x_train.loc[:, solution], x_test.loc[:, solution], y_train, y_test) * 100, 2)
print(f'After optimization, decision tree accuracy changed from {raw_score}% to {optimized_score}%\n\n' +
        f'Optimal contains {np.round((x_train.shape[1] - np.sum(solution)) / x_train.shape[1] * 100, 1)}% less columns than the original dataset,' +
        f'reducing from {x_train.shape[1]} to {np.sum(solution)}\n\nSelected columns are: {", ".join(x_train.columns[solution].tolist())}')


After optimization, decision tree accuracy changed from 88.95% to 89.3%

Optimal contains 83.3% less columns than the original dataset,reducing from 30 to 5

Selected columns are: texture1, concavity1, concave_points1, concavity2, compactness3


In [13]:
# Calculates multiple solutions, returning the best one and the average score
# If the average score is worse than previous, it means that there probably was no true optimization
def get_average_heuristics(optimizer, heuristics, tries = 1):
    scores = []
    solutions = []
    for _ in range(tries):
        solution, score = optimizer.optimize(64, 50, heuristics, verbose = False)
        solutions.append(solution)
        scores.append(score)
    return solutions[scores.index(np.max(score))], np.mean(scores)


In [14]:
# Evolve a new feature
x_train_2 = x_train
x_test_2 = x_test
x_val_2 = x_val
best_feature_func = optimizer.evolve_new_feature(epochs=50, heuristics=Benchmark.dtree_accuracy)

# Apply the evolved feature to the training and test sets
x_train_2["evolved_feature"] = x_train_2.apply(lambda row: best_feature_func(*row), axis=1)
x_test_2["evolved_feature"] = x_test_2.apply(lambda row: best_feature_func(*row), axis=1)
x_val_2["evolved_feature"] = x_val_2.apply(lambda row: best_feature_func(*row), axis=1)

# Evaluate the model with the new feature added
score_with_new_feature = np.round(Benchmark.dtree_accuracy(x_train_2, x_test_2, y_train, y_test) * 100, 2)
optimizer_2 = PymooOptimizer(x_train_2, x_val_2, y_train, y_val)
solution, score = optimizer_2.optimize(64, 100, Benchmark.dtree_accuracy, verbose = False)
optimized_score_with_new_feature = np.round(Benchmark.dtree_accuracy(x_train_2.loc[:, solution], x_test_2.loc[:, solution], y_train, y_test) * 100, 2)
print(f'After feature creation, decision tree accuracy set to {score_with_new_feature}%\n\n' +
      f'After feature creation and selection, decision tree accuracy set to {optimized_score_with_new_feature}%\n\n' +
        f'Optimal contains {np.round((x_train.shape[1] - np.sum(solution)) / x_train.shape[1] * 100, 1)}% less columns than the original dataset,' +
        f'reducing from {x_train.shape[1]} to {np.sum(solution)}\n\nSelected columns are: {", ".join(x_train.columns[solution].tolist())}')

gen	nevals	avg     	std       	min     	max     
0  	50    	0.883699	0.00336996	0.876692	0.890226
1  	31    	0.884291	0.00353212	0.876692	0.892481
2  	37    	0.884361	0.00413283	0.876692	0.896491
3  	27    	0.886586	0.00398748	0.876692	0.896491
4  	29    	0.88594 	0.00514884	0.874436	0.896491
5  	33    	0.887398	0.00683729	0.878947	0.920551
6  	35    	0.886932	0.00795525	0.876692	0.920551
7  	33    	0.88783 	0.00745422	0.878947	0.905764
8  	28    	0.891454	0.00925382	0.878947	0.92005 
9  	34    	0.891494	0.00995393	0.878947	0.92005 
10 	34    	0.891353	0.0104901 	0.876692	0.92005 
11 	29    	0.892466	0.0126996 	0.878947	0.92005 
12 	30    	0.895479	0.0141236 	0.874436	0.92406 
13 	27    	0.901935	0.0135275 	0.878947	0.92406 
14 	32    	0.897454	0.0139553 	0.876692	0.92406 
15 	31    	0.897554	0.0131554 	0.878446	0.92406 
16 	25    	0.898972	0.0138097 	0.876692	0.92005 
17 	31    	0.897253	0.0139073 	0.878947	0.92005 
18 	37    	0.896762	0.0144168 	0.874436	0.92005 
19 	31    	0.895754	

In [15]:
x_train.drop(columns=["evolved_feature"], inplace=True)
x_test.drop(columns=["evolved_feature"], inplace=True)
x_val.drop(columns=["evolved_feature"], inplace=True)

In [20]:
# Evolve a new set of features
x_train_2 = pd.DataFrame({})
x_test_2 = pd.DataFrame({})
x_val_2 = pd.DataFrame({})

old_accuracy = 0
new_accuracy = 1
feature_count = 0

while new_accuracy - old_accuracy > 0:
    old_accuracy = new_accuracy
    feature_count += 1
    optimizer = PymooOptimizer(pd.concat([x_train, x_train_2], axis=1), pd.concat([x_val, x_val_2], axis=1), y_train, y_val)
    best_feature_func = optimizer.evolve_new_feature(epochs=50, heuristics=Benchmark.dtree_accuracy, verbose=False, target_train = x_train_2, target_test = x_val_2)

    x_train_2[f'evolved_feature_{feature_count}'] = pd.concat([x_train, x_train_2], axis=1).apply(lambda row: best_feature_func(*row), axis=1)
    x_test_2[f'evolved_feature_{feature_count}'] = pd.concat([x_test, x_test_2], axis=1).apply(lambda row: best_feature_func(*row), axis=1)
    x_val_2[f'evolved_feature_{feature_count}'] = pd.concat([x_val, x_val_2], axis=1).apply(lambda row: best_feature_func(*row), axis=1)
    
    new_accuracy = np.round(Benchmark.dtree_accuracy(x_train_2, x_test_2, y_train, y_test) * 100, 2)

score = np.round(Benchmark.dtree_accuracy(x_train, x_test, y_train, y_test) * 100, 2)
print(f'After feature creation, decision tree accuracy was equal to {new_accuracy}%\n\n' +
        f'With previous data, it was {score}%\n\n' +
        f'Optimal contains {100 - np.round(x_train_2.shape[1] / x_train.shape[1] * 100, 1)}% less columns than the original dataset,')

94.74 1
94.74 94.74
After feature creation, decision tree accuracy was equal to 94.74%

With previous data, it was 89.12%

Optimal contains 93.3% less columns than the original dataset,
