In [1]:
from Bagging import create_models, create_bags, evaluate
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn import datasets
from sklearn.tree import DecisionTreeClassifier
import numpy as np
import pandas as pd
import random
from BaggingSA import BaggingSA
from typing import Literal, Tuple
from Bagging import predict
import sklearn
from scipy.stats import spearmanr, kendalltau, pearsonr



In [2]:
# --- Experiment configuration ------------------------------------------------
# One seed drives both RNG sources used in this notebook.
seed = 42
random.seed(seed)
np.random.seed(seed)

# Number of cross-validation folds each dataset is split into.
k_cross = 10

# Hyper-parameter grids swept by the experiment loop; each value is forwarded
# to BaggingSA under the keyword of the same (singular) name.
fitness_accuracy_diversity_ratios = [.25, .5, .75]
feature_mutation_chances = [.1, .2, .3]

# Dataset keys understood by get_dataset().
# NOTE(review): this rebinds the name `datasets` that the import cell bound to
# the sklearn.datasets module; later code must use `sklearn.datasets`.
datasets = ['digits', 'wine', 'breast_cancer', 'pima']

In [3]:


def get_dataset(dataset_name: str) -> Tuple[np.ndarray, np.ndarray]:
    """Load one of the supported datasets as a feature matrix and label vector.

    Args:
        dataset_name: One of 'digits', 'wine', 'breast_cancer' or 'pima'.

    Returns:
        A pair ``(X, y)``. For the three scikit-learn datasets these are the
        ``data`` and ``target`` attributes of the loaded bunch; for 'pima' the
        CSV is read and the last column is used as ``y``, all others as ``X``.

    Raises:
        ValueError: If ``dataset_name`` is not one of the supported keys.
    """
    # The bundled scikit-learn loaders are all named ``load_<key>``.
    if dataset_name in ('digits', 'wine', 'breast_cancer'):
        bunch = getattr(sklearn.datasets, f"load_{dataset_name}")()
        return bunch.data, bunch.target

    if dataset_name == 'pima':
        frame = pd.read_csv("./../datasets/pima.csv")
        features = frame.iloc[:, :-1].values
        labels = frame.iloc[:, -1].values
        return features, labels

    raise ValueError("Unsupported dataset")

In [4]:

def evaluate_bagging_sa(X_train, y_train, X_test, y_test, fitness_accuracy_diversity_ratio, feature_mutation_chance) -> "Tuple[BaggingSA, float, float]":
    """Run one BaggingSA optimisation on the training split and score it on the test split.

    Args:
        X_train, y_train: Training features/labels handed to ``BaggingSA``.
        X_test, y_test: Held-out features/labels. They are passed to
            ``BaggingSA.run`` (as ``X_for_test`` / ``y_for_test``) and used for
            the final ``evaluate`` call.
        fitness_accuracy_diversity_ratio: Forwarded to ``BaggingSA`` unchanged.
        feature_mutation_chance: Forwarded to ``BaggingSA`` unchanged.

    Returns:
        ``(bagging_sa, accuracy, fitness)``: the optimiser instance, the value
        returned by ``evaluate`` for the resulting models on the test split,
        and the fitness returned by ``BaggingSA.run(get_fitness=True)``.
    """
    # Annealing settings are fixed for the whole sweep; only the two swept
    # parameters vary between calls.
    bagging_sa = BaggingSA(X=X_train, y=y_train,
                            T0=2.0, cooling_method='geometric', alpha=0.995, max_iterations=2000, n_trees=10,
                            fitness_accuracy_diversity_ratio=fitness_accuracy_diversity_ratio,
                            feature_mutation_chance=feature_mutation_chance, test_split_amount=2)
    # fun_monitor records per-callback fitness/accuracy into module-level
    # accumulators that the caller reads after this function returns.
    models, fitness = bagging_sa.run(monitor_fun=fun_monitor, get_fitness=True, X_for_test=X_test, y_for_test=y_test)
    accuracy = evaluate(X=X_test, y=y_test, models=models)
    return bagging_sa, accuracy, fitness
    
def fun_monitor(iteration, T, best_fitness, fitness, new_fitness, accuracy):
    """Monitoring callback passed to ``BaggingSA.run``.

    Side effects on module-level state:
        * adds ``abs(accuracy - fitness)`` to ``acc_fitness_difference``;
        * appends ``new_fitness`` to ``fits`` and ``accuracy`` to ``accs``.

    Every 100th iteration a progress line with the temperature and best
    fitness is printed.

    Args:
        iteration: Iteration counter supplied by the optimiser.
        T: Temperature value supplied by the optimiser.
        best_fitness: Best fitness so far, as supplied by the optimiser.
        fitness: Fitness value compared against ``accuracy`` for the gap.
        new_fitness: Fitness value recorded for the correlation analysis.
        accuracy: Accuracy value supplied by the optimiser.
    """
    # Only the accumulator is rebound here; ``fits`` and ``accs`` are mutated
    # in place and therefore need no ``global`` declaration.
    global acc_fitness_difference

    # NOTE(review): the gap uses ``fitness`` while the recorded series uses
    # ``new_fitness`` - confirm this asymmetry is intended.
    acc_fitness_difference += abs(accuracy - fitness)

    fits.append(new_fitness)
    accs.append(accuracy)

    if iteration % 100 == 0:
        print(f"    Iteration: {iteration}, T: {T:.2f}, Best fitness: {best_fitness:.4f}")

# --- Parameter sweep ---------------------------------------------------------
# For every dataset, every (fadr, fmc) combination is evaluated on each of the
# k_cross folds. One result row is produced per run and the CSV is rewritten
# after every run, so partial results survive an interrupted sweep.

# Accumulators filled by fun_monitor during a run; reset before each run below.
acc_fitness_difference = 0.0
fits = []
accs = []

# One row per (dataset, fold, fadr, fmc) run.
result = []
result_columns = ["dataset", "kCrossIndex", "fadr", "fmc", "accuracy", "correlation", "spearmanP", "fitness", "accFitnessDifference"]

print(f"Start at {pd.Timestamp.now()}")
for dataset in datasets:
    X, y = get_dataset(dataset)

    # Shuffle rows once per dataset so the contiguous folds below are random;
    # the same folds are reused for every parameter combination.
    random_indices = np.arange(X.shape[0])
    np.random.shuffle(random_indices)
    X = X[random_indices]
    y = y[random_indices]

    sub_groups_X = np.array_split(np.array(X), k_cross)
    sub_groups_y = np.array_split(np.array(y), k_cross)

    for fadr in fitness_accuracy_diversity_ratios:
        for fmc in feature_mutation_chances:
            for k in range(k_cross):
                print(f"[Dataset: {dataset}, FADR: {fadr}, FMC: {fmc}, k: {k}]")

                if k_cross == 1:
                    # A single "fold" cannot be held out from itself; fall
                    # back to a plain 80/20 split.
                    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=seed)
                else:
                    # Fold k is the test split; all remaining folds form the
                    # training split.
                    X_train = np.concatenate(sub_groups_X[:k] + sub_groups_X[k+1:])
                    y_train = np.concatenate(sub_groups_y[:k] + sub_groups_y[k+1:])
                    X_test = sub_groups_X[k]
                    y_test = sub_groups_y[k]

                # Fresh accumulators for this run (written by fun_monitor).
                acc_fitness_difference = 0.0
                fits = []
                accs = []

                bagging_sa, accuracy, fitness = evaluate_bagging_sa(X_train, y_train, X_test, y_test, fadr, fmc)

                # Rank correlation between the recorded fitness and accuracy
                # series of this run.
                spearman_corr, spearman_p = spearmanr(fits, accs)

                # Turn the accumulated gap into a mean per monitor callback.
                # The number of callbacks actually observed is used instead of
                # the configured max_iterations, so the mean stays correct
                # even if the optimiser reports fewer (or more) callbacks.
                monitor_calls = len(accs)
                if monitor_calls:
                    acc_fitness_difference /= monitor_calls

                result.append([dataset, k, fadr, fmc, accuracy, spearman_corr, spearman_p, fitness, acc_fitness_difference])

                # Checkpoint all results collected so far.
                df = pd.DataFrame(result, columns=result_columns)
                df.to_csv("./../res/bagging_sa_params.csv", index=False)
                print(f"    Accuracy: {accuracy:.4f}")


Start at 2025-04-20 13:25:55.258857
[Dataset: digits, FADR: 0.25, FMC: 0.1, k: 0]
    Iteration: 100, T: 1.22, Best fitness: 0.9308
    Iteration: 200, T: 0.74, Best fitness: 0.9598
    Iteration: 300, T: 0.45, Best fitness: 0.9605
    Iteration: 400, T: 0.27, Best fitness: 0.9620
    Iteration: 500, T: 0.16, Best fitness: 0.9620
    Iteration: 600, T: 0.10, Best fitness: 0.9675
    Iteration: 700, T: 0.06, Best fitness: 0.9675
    Iteration: 800, T: 0.04, Best fitness: 0.9675
    Iteration: 900, T: 0.02, Best fitness: 0.9675
    Iteration: 1000, T: 0.01, Best fitness: 0.9675
    Iteration: 1100, T: 0.01, Best fitness: 0.9675
    Iteration: 1200, T: 0.00, Best fitness: 0.9696
    Iteration: 1300, T: 0.00, Best fitness: 0.9696
    Iteration: 1400, T: 0.00, Best fitness: 0.9697
    Iteration: 1500, T: 0.00, Best fitness: 0.9706
    Iteration: 1600, T: 0.00, Best fitness: 0.9706
    Iteration: 1700, T: 0.00, Best fitness: 0.9720
    Iteration: 1800, T: 0.00, Best fitness: 0.9720
    Itera

In [5]:
# Reload the per-run results from disk so this cell does not depend on the
# (long-running) sweep cell having been executed in the current kernel.
df = pd.read_csv("./../res/bagging_sa_params.csv")

# Build the named-aggregation spec: the grouping keys are carried through via
# "first", every metric gets a "<metric>Mean" and "<metric>Std" column.
group_keys = ["dataset", "fadr", "fmc"]
metric_columns = ["accuracy", "correlation", "fitness", "accFitnessDifference"]

agg_spec = {key: (key, "first") for key in group_keys}
for metric in metric_columns:
    agg_spec[f"{metric}Mean"] = (metric, "mean")
    agg_spec[f"{metric}Std"] = (metric, "std")

grouped = df.groupby(group_keys)
df_agg = grouped.agg(**agg_spec).reset_index(drop=True)
df_agg.to_csv("./../res/bagging_sa_params_aggregated.csv", index=False)

print(df_agg.head())

         dataset  fadr  fmc  accuracyMean  accuracyStd  correlationMean  \
0  breast_cancer  0.25  0.1      0.938503     0.036262        -0.031509   
1  breast_cancer  0.25  0.2      0.948997     0.025532         0.017565   
2  breast_cancer  0.25  0.3      0.959586     0.023450         0.043112   
3  breast_cancer  0.50  0.1      0.952600     0.033101         0.083860   
4  breast_cancer  0.50  0.2      0.943766     0.048075         0.091310   

   correlationStd  fitnessMean  fitnessStd  accFitnessDifferenceMean  \
0        0.181013     0.994621    0.003092                  0.207948   
1        0.082498     0.995376    0.003089                  0.196295   
2        0.064445     0.996373    0.003340                  0.173900   
3        0.132727     0.987785    0.013147                  0.127019   
4        0.134745     0.951314    0.123533                  0.167811   

   accFitnessDifferenceStd  
0                 0.055888  
1                 0.059634  
2                 0.063581  
