In [1]:
import os
import random

import numpy as np
import pandas as pd
from sklearn import datasets
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from tabulate import tabulate

from raw_python.Bagging import create_models, create_bags, evaluate
from raw_python.BaggingRandom import BaggingRandom
from raw_python.BaggingSA import BaggingSA
from raw_python.DatasetsHandle import get_dataset

# One shared seed for the two global generators used in this notebook
# (Python's `random` and NumPy's legacy global RNG).
seed = 41
random.seed(seed)
np.random.seed(seed)

In [2]:
# Experiment configuration: which dataset to load and how many folds /
# repetitions per fold the cross-validation loop runs.
dataset_name = 'digits'
k_cross = 5
reps = 5

# Hyper-parameters read by the evaluate_* helpers below.
params = dict(
    # Forwarded to BaggingSA only (exact semantics live in raw_python.BaggingSA;
    # T0/alpha/cooling_method presumably describe the temperature schedule - TODO confirm).
    T0=2,
    cooling_method='geometric',
    alpha=0.995,
    max_iterations=2000,
    feature_mutation_chance=0.25,
    # Forwarded to both BaggingSA and BaggingRandom.
    test_split_amount=5,
    # Forwarded to BaggingSA only.
    theta=0.85,
    beta=0.1,
    gamma=0.05,
    # Ensemble size used by all three methods.
    n_trees=10,
    # Forwarded to BaggingRandom only.
    pop_size=1000,
)

In [None]:


from sklearn.ensemble import BaggingClassifier


def evaluate_bagging(X_train, y_train, X_test, y_test, random_state=None):
    """Fit scikit-learn's ``BaggingClassifier`` baseline and score it on a held-out split.

    The ensemble size comes from ``params['n_trees']``.

    Args:
        X_train, y_train: data the ensemble is fitted on.
        X_test, y_test: data the fitted ensemble is scored on.
        random_state: value forwarded to ``BaggingClassifier(random_state=...)``.
            When omitted (``None``) the notebook-wide ``seed`` is used, which is
            the historical behaviour. Note that with a fixed value every call on
            the same split returns the same score, so repeated runs of the
            baseline carry no extra information; pass a per-repetition value
            (e.g. ``seed + r``) to get independent repetitions.

    Returns:
        The value of ``model.score(X_test, y_test)``.
    """
    if random_state is None:
        # Backward-compatible default: same seed as before this parameter existed.
        random_state = seed
    model = BaggingClassifier(n_estimators=params['n_trees'], random_state=random_state)
    model.fit(X_train, y_train)
    return model.score(X_test, y_test)

def evaluate_bagging_random(X_train, y_train, X_test, y_test):
    """Build an ensemble with ``BaggingRandom`` and score it on the held-out split.

    ``BaggingRandom`` receives the training data followed by
    ``params['n_trees']``, ``params['test_split_amount']`` and
    ``params['pop_size']`` (positionally, in that order). The models returned
    by its ``run()`` are scored with ``evaluate`` on ``(X_test, y_test)``.

    Returns:
        Whatever ``evaluate`` returns for the produced models (used as an
        accuracy by the caller).
    """
    random_search = BaggingRandom(
        X_train,
        y_train,
        params['n_trees'],
        params['test_split_amount'],
        params['pop_size'],
    )
    return evaluate(X_test, y_test, models=random_search.run())
    

def evaluate_bagging_sa(X_train, y_train, X_test, y_test):
    """Run ``BaggingSA`` on the training split and score the resulting ensemble.

    All ``BaggingSA`` options are taken from the module-level ``params`` dict
    and forwarded by keyword. ``fun_monitor`` is installed as the progress
    callback.

    NOTE(review): the held-out split is handed to ``run()`` as
    ``X_for_test`` / ``y_for_test``. Presumably it is only used to report
    accuracy to the monitor - confirm it does not influence model selection,
    otherwise the accuracy returned here is optimistically biased.

    Returns:
        A ``(accuracy, models, fitness)`` tuple: the ``evaluate`` score on
        ``(X_test, y_test)``, the models returned by ``run()``, and the fitness
        value returned by ``run()``.
    """
    sa_option_names = (
        'T0', 'cooling_method', 'alpha', 'max_iterations', 'n_trees',
        'feature_mutation_chance', 'test_split_amount', 'theta', 'beta', 'gamma',
    )
    sa_options = {name: params[name] for name in sa_option_names}

    annealer = BaggingSA(X=X_train, y=y_train, **sa_options)
    models, fitness = annealer.run(
        X_for_test=X_test,
        y_for_test=y_test,
        monitor_fun=fun_monitor,
        get_fitness=True,
    )
    return evaluate(X_test, y_test, models=models), models, fitness

def fun_monitor(iteration, T, best_fitness, fitness, new_fitness, accuracy):
    """Progress callback passed to ``BaggingSA.run``; currently silent.

    The arguments describe one optimisation step. A missing ``accuracy``
    (``None``) is replaced by ``0.0`` so that the trace line below can be
    formatted when it is re-enabled. Returns ``None``.
    """
    accuracy = 0.0 if accuracy is None else accuracy
    # Uncomment to trace every iteration:
    # print(f"    I: {iteration}, T: {T:.3f}, Best fitness: {best_fitness:.3f}, Fitness: {fitness:.3f}, New fitness: {new_fitness:.3f}, Accuracy: {accuracy:.3f}")

In [None]:
# Repeated k-fold comparison of sklearn Bagging, BaggingRandom and BaggingSA.
result = []
results_path = './../res/test_bagging.csv'
# Create the output folder before any model is trained: the first checkpoint is
# only written after a full (slow) set of fits, so a missing folder would
# otherwise discard that work.
os.makedirs(os.path.dirname(results_path), exist_ok=True)

X, y = get_dataset(dataset_name)
# Convert before indexing so the integer-array indexing below selects rows
# regardless of the container type get_dataset returns.
X = np.asarray(X)
y = np.asarray(y)

# Shuffle samples and labels with the same permutation (uses NumPy's global RNG).
random_indices = np.arange(X.shape[0])
np.random.shuffle(random_indices)
X = X[random_indices]
y = y[random_indices]

# Folds are contiguous slices of the shuffled data (not stratified by class).
sub_groups_X = np.array_split(X, k_cross)
sub_groups_y = np.array_split(y, k_cross)

for k in range(k_cross):
    # Fold k is held out; the remaining folds are concatenated for training.
    X_train = np.concatenate(sub_groups_X[:k] + sub_groups_X[k+1:])
    y_train = np.concatenate(sub_groups_y[:k] + sub_groups_y[k+1:])
    X_test = sub_groups_X[k]
    y_test = sub_groups_y[k]
    for r in range(reps):
        acc_bagging = evaluate_bagging(X_train, y_train, X_test, y_test)
        acc_bagging_random = evaluate_bagging_random(X_train, y_train, X_test, y_test)
        accuracy, models, fitness = evaluate_bagging_sa(X_train, y_train, X_test, y_test)

        result.append([k+1, r+1, fitness, accuracy, acc_bagging, acc_bagging_random])
        print(f"Fold {k+1:2}/{k_cross:2} | Rep {r+1:2}/{reps:2} | Fitness: {fitness:.3f} | Accuracy: {accuracy:.3f} | Bagging: {acc_bagging:.3f}, BaggingRandom: {acc_bagging_random:.3f}")
        # Checkpoint after every repetition so an interrupted run keeps its partial results.
        df = pd.DataFrame(result, columns=['Fold', 'Rep', 'Fitness', 'Accuracy', 'Bagging', 'BaggingRandom'])
        df.to_csv(results_path, index=False)

Fold  1/ 5 | Rep  1/ 5 | Fitness: 0.808 | Accuracy: 0.939 | Bagging: 0.947, BaggingRandom: 0.917
Fold  1/ 5 | Rep  2/ 5 | Fitness: 0.808 | Accuracy: 0.933 | Bagging: 0.947, BaggingRandom: 0.950
Fold  1/ 5 | Rep  3/ 5 | Fitness: 0.796 | Accuracy: 0.936 | Bagging: 0.947, BaggingRandom: 0.950
Fold  1/ 5 | Rep  4/ 5 | Fitness: 0.796 | Accuracy: 0.942 | Bagging: 0.947, BaggingRandom: 0.936
Fold  1/ 5 | Rep  5/ 5 | Fitness: 0.800 | Accuracy: 0.950 | Bagging: 0.947, BaggingRandom: 0.925
Fold  2/ 5 | Rep  1/ 5 | Fitness: 0.796 | Accuracy: 0.961 | Bagging: 0.947, BaggingRandom: 0.944
Fold  2/ 5 | Rep  2/ 5 | Fitness: 0.797 | Accuracy: 0.950 | Bagging: 0.947, BaggingRandom: 0.947
Fold  2/ 5 | Rep  3/ 5 | Fitness: 0.797 | Accuracy: 0.958 | Bagging: 0.947, BaggingRandom: 0.947
Fold  2/ 5 | Rep  4/ 5 | Fitness: 0.783 | Accuracy: 0.947 | Bagging: 0.947, BaggingRandom: 0.956
Fold  2/ 5 | Rep  5/ 5 | Fitness: 0.798 | Accuracy: 0.958 | Bagging: 0.947, BaggingRandom: 0.956
Fold  3/ 5 | Rep  1/ 5 | Fitne

In [None]:
# Reload the per-run results written by the cross-validation cell.
df = pd.read_csv('./../res/test_bagging.csv')

# Rounded copy for display only; `df` keeps full precision.
print(tabulate(df.round(3), headers='keys', tablefmt='pretty', showindex=False))

# Mean of each metric over all (fold, repetition) rows.
summary_columns = ('Fitness', 'Accuracy', 'Bagging', 'BaggingRandom')
fitness_mean, acc_mean, bagging_mean, bagging_random_mean = (
    df[column].mean() for column in summary_columns
)

print(
    f"Fitness mean:  {fitness_mean:.3f}\n"
    f"Accuracy mean: {acc_mean:.3f}\n"
    f"Bagging mean:  {bagging_mean:.3f}\n"
    f"Bagging Random mean:  {bagging_random_mean:.3f}"
)

+------+-----+---------+----------+---------+---------------+
| Fold | Rep | Fitness | Accuracy | Bagging | BaggingRandom |
+------+-----+---------+----------+---------+---------------+
| 1.0  | 1.0 |   0.8   |  0.947   |  0.947  |     0.942     |
| 1.0  | 2.0 |  0.793  |  0.917   |  0.947  |     0.933     |
| 2.0  | 1.0 |  0.803  |  0.958   |  0.947  |     0.944     |
| 2.0  | 2.0 |  0.806  |  0.956   |  0.947  |     0.956     |
| 3.0  | 1.0 |  0.787  |  0.891   |  0.919  |     0.928     |
| 3.0  | 2.0 |  0.808  |  0.914   |  0.919  |     0.894     |
| 4.0  | 1.0 |  0.797  |  0.928   |  0.916  |     0.955     |
| 4.0  | 2.0 |  0.816  |  0.925   |  0.916  |     0.933     |
| 5.0  | 1.0 |  0.805  |  0.942   |  0.942  |     0.928     |
| 5.0  | 2.0 |  0.823  |  0.936   |  0.942  |     0.933     |
+------+-----+---------+----------+---------+---------------+
Fitness mean:  0.804
Accuracy mean: 0.931
Bagging mean:  0.934
Bagging Random mean:  0.935


    +------+-----+---------+----------+---------+---------------+
    | Fold | Rep | Fitness | Accuracy | Bagging | BaggingRandom |
    +------+-----+---------+----------+---------+---------------+
    | 1.0  | 1.0 |  0.822  |  0.797   |  0.793  |     0.817     |
    | 1.0  | 2.0 |  0.828  |  0.793   |  0.807  |     0.823     |
    | 1.0  | 3.0 |  0.847  |  0.783   |  0.83   |     0.813     |
    | 2.0  | 1.0 |  0.803  |  0.839   |  0.849  |     0.876     |
    | 2.0  | 2.0 |  0.822  |  0.853   |  0.883  |     0.873     |
    | 2.0  | 3.0 |  0.795  |  0.873   |  0.89   |     0.89      |
    | 3.0  | 1.0 |  0.828  |  0.866   |  0.839  |     0.846     |
    | 3.0  | 2.0 |  0.824  |  0.806   |  0.803  |     0.839     |
    | 3.0  | 3.0 |  0.86   |  0.856   |  0.846  |     0.803     |
    +------+-----+---------+----------+---------+---------------+
    Fitness mean:  0.826
    Accuracy mean: 0.830
    Bagging mean:  0.838
    Bagging Random mean:  0.842

In [None]:
#statistic tests
# One-sided paired comparison: is BaggingSA ('Accuracy') better than sklearn
# Bagging ('Bagging') on the same fold/repetition rows of `df`?
from scipy.stats import ttest_rel, wilcoxon, mannwhitneyu, shapiro

significance_level = 0.05

# Per-column Shapiro-Wilk results, reported for information only.
# The Shapiro-Wilk statistic is W, not a t-statistic.
t1_start, p1 = shapiro(df['Bagging'])
t2_start, p2 = shapiro(df['Accuracy'])

print(f"Shapiro Bagging: W-statistic = {t1_start:.3f}, p-value = {p1:.3f}")
print(f"Shapiro BaggingSA: W-statistic = {t2_start:.3f}, p-value = {p2:.3f}")

# The paired t-test assumes the *paired differences* are normally distributed,
# so the choice between the t-test and Wilcoxon is made on the differences
# rather than on each column separately.
paired_diff = df['Accuracy'] - df['Bagging']
w_diff, p_diff = shapiro(paired_diff)
print(f"Shapiro paired differences: W-statistic = {w_diff:.3f}, p-value = {p_diff:.3f}")

# NOTE(review): rows that share a fold are evaluated on the same test split, so
# they are not independent observations - treat the p-value as indicative.
if p_diff > significance_level:
    t_stat, p_value = ttest_rel(df['Accuracy'], df['Bagging'], alternative='greater')
    print(f"t-test: t-statistic = {t_stat:.3f}, p-value = {p_value:.3f}")
else:
    w_stat, p_value = wilcoxon(df['Accuracy'], df['Bagging'], alternative='greater')
    print(f"Wilcoxon: w-statistic = {w_stat:.3f}, p-value = {p_value:.3f}")

txt = 'BaggingSA is greater than Bagging' if p_value < significance_level else 'BaggingSA is not greater than Bagging'
print(txt)

Wilcoxon: w-statistic = 18.000, p-value = 0.516
BaggingSA is not greater than Bagging
