In [1]:
import os
import random

import numpy as np
import pandas as pd
from sklearn import datasets
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from tabulate import tabulate

from raw_python.Bagging import create_models, create_bags, evaluate
from raw_python.BaggingSA import BaggingSA
from raw_python.DatasetsHandle import get_dataset

# Seed both NumPy's global RNG and Python's built-in `random` module with the
# same fixed value.
seed = 41
np.random.seed(seed)
random.seed(seed)

In [2]:
# Experiment configuration.
dataset_name = 'pima'  # name handed to get_dataset
k_cross = 3            # number of cross-validation folds
reps = 3               # repetitions per fold

# Settings looked up by key in evaluate_bagging ('n_trees') and
# evaluate_bagging_sa (all keys, forwarded to BaggingSA).
params = dict(
    T0=2,
    cooling_method='geometric',
    alpha=0.995,
    max_iterations=2000,
    feature_mutation_chance=0.25,
    test_split_amount=5,
    n_trees=10,
)

In [3]:
def evaluate_bagging(X_train, y_train, X_test, y_test):
    """Score a plain bagging ensemble on one train/test split.

    Creates ``params['n_trees']`` bags from the training data with
    ``create_bags``, turns them into models with ``create_models`` and
    returns the value ``evaluate`` reports for those models on the test data.
    """
    models = create_models(
        bags=create_bags(X=X_train, y=y_train, bags_amount=params['n_trees'])
    )
    return evaluate(X=X_test, y=y_test, models=models)

def evaluate_bagging_sa(X_train, y_train, X_test, y_test):
    """Run BaggingSA on one train/test split and score the resulting models.

    The BaggingSA settings are read from the module-level ``params`` dict.
    The test split is passed to ``run`` as ``X_for_test`` / ``y_for_test``
    together with ``fun_monitor`` as the monitoring callback.

    Returns:
        tuple: ``(accuracy, models, fitness)`` where ``models`` and
        ``fitness`` come from ``BaggingSA.run`` and ``accuracy`` is the
        result of ``evaluate`` for those models on the test split.
    """
    sa_option_names = (
        'T0', 'cooling_method', 'alpha', 'max_iterations',
        'n_trees', 'feature_mutation_chance', 'test_split_amount',
    )
    sa_options = {name: params[name] for name in sa_option_names}

    optimizer = BaggingSA(X=X_train, y=y_train, **sa_options)
    models, fitness = optimizer.run(
        X_for_test=X_test,
        y_for_test=y_test,
        monitor_fun=fun_monitor,
        get_fitness=True,
    )
    return evaluate(X_test, y_test, models=models), models, fitness

def fun_monitor(iteration, T, best_fitness, fitness, new_fitness, accuracy):
    """Monitoring callback passed to ``BaggingSA.run``; currently produces no output.

    A missing accuracy (``None``) is replaced by 0.0, which is what the
    disabled progress line below would need in order to format it.
    Always returns ``None``.
    """
    accuracy = 0.0 if accuracy is None else accuracy
    # Disabled progress line:
    # print(f"    I: {iteration}, T: {T:.3f}, Best fitness: {best_fitness:.3f}, Fitness: {fitness:.3f}, New fitness: {new_fitness:.3f}, Accuracy: {accuracy:.3f}")

In [4]:
# Cross-validated comparison of plain bagging against BaggingSA.
result = []
X, y = get_dataset(dataset_name)

# Only the first half of the samples is used in this experiment.
X = X[:len(X) // 2]
y = y[:len(y) // 2]

# NOTE: no shuffling is applied, so every fold is a contiguous slice of the
# data in the order returned by get_dataset. If shuffling is wanted, apply a
# single permutation of the row indices to both X and y before splitting.
sub_groups_X = np.array_split(np.array(X), k_cross)
sub_groups_y = np.array_split(np.array(y), k_cross)

# Create the output directory up front: without it the first to_csv call
# would fail only after a full annealing run has already been spent.
results_path = './../res/test_bagging.csv'
os.makedirs(os.path.dirname(results_path), exist_ok=True)

for k in range(k_cross):
    # Fold k is held out for testing; the remaining folds form the training set.
    X_train = np.concatenate(sub_groups_X[:k] + sub_groups_X[k+1:])
    y_train = np.concatenate(sub_groups_y[:k] + sub_groups_y[k+1:])
    X_test = sub_groups_X[k]
    y_test = sub_groups_y[k]
    for r in range(reps):
        acc_bagging = evaluate_bagging(X_train, y_train, X_test, y_test)
        accuracy, models, fitness = evaluate_bagging_sa(X_train, y_train, X_test, y_test)
        result.append([k+1, r+1, fitness, accuracy, acc_bagging])
        print(f"Fold {k+1}/{k_cross} | Rep {r+1}/{reps} | Fitness: {fitness:.3f} | Accuracy: {accuracy:.3f} | Bagging: {acc_bagging:.3f}")
        # The CSV is rewritten after every repetition, so the file always
        # holds all results collected so far.
        df = pd.DataFrame(result, columns=['Fold', 'Rep', 'Fitness', 'Accuracy', 'Bagging'])
        df.to_csv(results_path, index=False)

   Acc: 0.5873, Dis: 0.0375, Com: 0.0000 => Fit: 0.6248
   Acc: 0.5873, Dis: 0.0375, Com: 0.0007 => Fit: 0.6241
   Acc: 0.5564, Dis: 0.0380, Com: 0.0013 => Fit: 0.5931
   Acc: 0.6027, Dis: 0.0414, Com: 0.0020 => Fit: 0.6421
   Acc: 0.5857, Dis: 0.0403, Com: 0.0022 => Fit: 0.6238
   Acc: 0.5054, Dis: 0.0406, Com: 0.0028 => Fit: 0.5432
   Acc: 0.5394, Dis: 0.0409, Com: 0.0023 => Fit: 0.5779
   Acc: 0.5873, Dis: 0.0405, Com: 0.0021 => Fit: 0.6257
   Acc: 0.5734, Dis: 0.0401, Com: 0.0021 => Fit: 0.6113
   Acc: 0.5687, Dis: 0.0376, Com: 0.0023 => Fit: 0.6040
   Acc: 0.6553, Dis: 0.0388, Com: 0.0022 => Fit: 0.6918
   Acc: 0.6058, Dis: 0.0382, Com: 0.0024 => Fit: 0.6416
   Acc: 0.6522, Dis: 0.0381, Com: 0.0026 => Fit: 0.6877
   Acc: 0.5857, Dis: 0.0402, Com: 0.0024 => Fit: 0.6235
   Acc: 0.5718, Dis: 0.0405, Com: 0.0022 => Fit: 0.6101
   Acc: 0.6383, Dis: 0.0378, Com: 0.0025 => Fit: 0.6735
   Acc: 0.6367, Dis: 0.0388, Com: 0.0025 => Fit: 0.6730
   Acc: 0.6553, Dis: 0.0388, Com: 0.0026 => Fit:

In [5]:
# Load the results written by the cross-validation cell and summarise them.
df = pd.read_csv('./../res/test_bagging.csv')

tmp = df.copy().round(3)
# tabulate reads a DataFrame through `.values`, which upcasts the integer
# Fold/Rep columns to float (they were printed as "1.0", "2.0", ...).
# An object-dtype copy keeps every cell's own type, so they print as 1, 2, ...
print(tabulate(tmp.astype(object), headers='keys', tablefmt='pretty', showindex=False))

# Column means are taken from the unrounded values.
fitness_mean = df['Fitness'].mean()
acc_mean = df['Accuracy'].mean()
bagging_mean = df['Bagging'].mean()

print(f"Fitness mean:  {fitness_mean:.3f}")
print(f"Accuracy mean: {acc_mean:.3f}")
print(f"Bagging mean:  {bagging_mean:.3f}")

+------+-----+---------+----------+---------+
| Fold | Rep | Fitness | Accuracy | Bagging |
+------+-----+---------+----------+---------+
| 1.0  | 1.0 |  0.783  |   0.68   |  0.68   |
| 1.0  | 2.0 |  0.715  |  0.695   |  0.648  |
| 1.0  | 3.0 |  0.799  |  0.703   |  0.758  |
| 2.0  | 1.0 |  0.783  |   0.68   |  0.719  |
| 2.0  | 2.0 |  0.769  |  0.703   |  0.672  |
| 2.0  | 3.0 |  0.814  |  0.688   |  0.711  |
| 3.0  | 1.0 |  0.781  |  0.688   |  0.68   |
| 3.0  | 2.0 |  0.719  |  0.602   |  0.688  |
| 3.0  | 3.0 |  0.77   |  0.695   |  0.68   |
+------+-----+---------+----------+---------+
Fitness mean:  0.770
Accuracy mean: 0.681
Bagging mean:  0.693


    +------+-----+---------+----------+---------+
    | Fold | Rep | Fitness | Accuracy | Bagging |
    +------+-----+---------+----------+---------+
    | 1.0  | 1.0 |  0.776  |  0.758   |  0.695  |
    | 1.0  | 2.0 |  0.714  |  0.688   |  0.711  |
    | 1.0  | 3.0 |  0.688  |  0.617   |  0.688  |
    | 2.0  | 1.0 |  0.755  |  0.641   |  0.656  |
    | 2.0  | 2.0 |  0.819  |  0.734   |  0.727  |
    | 2.0  | 3.0 |  0.684  |  0.727   |  0.742  |
    | 3.0  | 1.0 |  0.755  |  0.633   |  0.672  |
    | 3.0  | 2.0 |  0.754  |  0.703   |  0.633  |
    | 3.0  | 3.0 |  0.792  |  0.664   |  0.734  |
    +------+-----+---------+----------+---------+
    Fitness mean:  0.749
    Accuracy mean: 0.685
    Bagging mean:  0.695

    accuracy = accuracy *0.85
    disagreement = disagreement*0.1
    complexity = complexity*0.05
            
    fitness = accuracy + disagreement - complexity