In [5]:
from Bagging import create_models, create_bags, evaluate
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn import datasets
from sklearn.tree import DecisionTreeClassifier
import numpy as np
import pandas as pd
import random
from BaggingSA import BaggingSA
from typing import Literal, Tuple
from Bagging import predict
import sklearn
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier


In [6]:
# Single seed for the whole notebook: it feeds NumPy's and the stdlib's global
# RNGs below and is also read as `random_state` by the library bagging baseline.
seed = 42

# Number of cross-validation folds and the ensemble sizes to try.
k_cross = 10
n_trees = [10]
# NOTE(review): this list rebinds the name `datasets`, which the import cell
# bound to the `sklearn.datasets` module. The loaders still work because they
# are reached as `sklearn.datasets.load_*`, but the bare name is now a list.
datasets = ['digits']

# Settings keyed by dataset name (presumably consumed by BaggingSA; they are
# not read anywhere in the cells visible here). Every dataset shares the same
# values except for the accuracy/diversity ratio and the feature mutation
# chance, so only those two are listed per dataset.
bagging_sa_params = {
    dataset_name: {
        'T0': 2,
        'cooling_method': 'geometric',
        'alpha': 0.995,
        'max_iterations': 2000,
        'fitness_accuracy_diversity_ratio': accuracy_diversity_ratio,
        'feature_mutation_chance': mutation_chance,
        'test_split_amount': 5,
    }
    for dataset_name, accuracy_diversity_ratio, mutation_chance in (
        ('wine', 0.75, 0.2),
        ('breast_cancer', 0.5, 0.3),
        ('pima', 0.75, 0.3),
        ('digits', 0.75, 0.3),
    )
}

# Seed both global generators so shuffles and sampling repeat across runs.
np.random.seed(seed)
random.seed(seed)

In [7]:
def get_dataset(dataset_name: str) -> Tuple[np.ndarray, np.ndarray]:
    """Load a dataset by name and return it as ``(X, y)``.

    Supported names:
        * ``'digits'``, ``'wine'``, ``'breast_cancer'`` - loaded through the
          matching ``sklearn.datasets.load_*`` function; ``X`` is the loader's
          ``data`` attribute and ``y`` its ``target`` attribute.
        * ``'pima'`` - read from ``./../datasets/pima.csv`` (relative to the
          working directory); the last column is used as ``y`` and every other
          column as ``X``.

    Raises:
        ValueError: if ``dataset_name`` is none of the names above.
    """
    # The CSV-backed dataset has its own layout, so handle it first.
    if dataset_name == 'pima':
        frame = pd.read_csv("./../datasets/pima.csv")
        return frame.iloc[:, :-1].values, frame.iloc[:, -1].values

    # The remaining datasets all expose `.data` / `.target`.
    if dataset_name == 'digits':
        bunch = sklearn.datasets.load_digits()
    elif dataset_name == 'wine':
        bunch = sklearn.datasets.load_wine()
    elif dataset_name == 'breast_cancer':
        bunch = sklearn.datasets.load_breast_cancer()
    else:
        raise ValueError("Unsupported dataset")
    return bunch.data, bunch.target

In [8]:



def get_measures(predictions, y_test):
    """Score predictions against the true labels.

    Args:
        predictions: sequence of predicted labels.
        y_test: sequence of true labels; must have the same length as
            ``predictions``.

    Returns:
        A tuple ``(accuracy, correct_pred_amount, wrong_pred_amount)`` where
        ``accuracy`` is ``correct / (correct + wrong)``. For empty input the
        result is ``(0.0, 0, 0)`` rather than a division by zero.

    Raises:
        ValueError: if ``predictions`` and ``y_test`` differ in length, since
            comparing them position by position would otherwise silently
            ignore part of the data.
    """
    total = len(predictions)
    if total != len(y_test):
        raise ValueError(
            f"predictions and y_test must have the same length, "
            f"got {total} and {len(y_test)}"
        )
    if total == 0:
        # Nothing to score; avoid dividing by zero.
        return 0.0, 0, 0

    # Count matches position by position; the generator yields plain ints so
    # the counts stay Python ints whatever the element type is.
    correct_pred_amount = sum(
        1 for predicted, actual in zip(predictions, y_test) if predicted == actual
    )
    wrong_pred_amount = total - correct_pred_amount
    accuracy = correct_pred_amount / total
    return accuracy, correct_pred_amount, wrong_pred_amount
    

def evaluate_bagging(X_train, y_train, X_test, y_test, n_trees: int) -> Tuple[float, int, int]:
    """Train the project's own bagging implementation and score it.

    Requests ``n_trees`` bags from ``create_bags`` for ``X_train``, fits the
    models with ``create_models``, predicts ``X_test`` with ``predict`` and
    scores the predictions against ``y_test`` with ``get_measures``.

    Returns:
        ``(accuracy, correct_pred_amount, wrong_pred_amount)`` as produced by
        ``get_measures``.
    """
    ensemble = create_models(
        X=X_train,
        y=y_train,
        bags=create_bags(X_train, bags_amount=n_trees),
    )
    return get_measures(predict(X=X_test, models=ensemble), y_test)
  
    
def evaluate_bagging_lib(X_train, y_train, X_test, y_test, n_trees: int):
    """Train scikit-learn's ``BaggingClassifier`` and score it.

    The ensemble uses ``n_trees`` decision trees (``max_depth=None``,
    ``min_samples_split=2``) and takes its ``random_state`` from the
    module-level ``seed``.

    Returns:
        ``(accuracy, correct_pred_amount, wrong_pred_amount)`` as produced by
        ``get_measures`` for the predictions on ``X_test``.
    """
    base_tree = DecisionTreeClassifier(max_depth=None, min_samples_split=2)
    ensemble = BaggingClassifier(
        estimator=base_tree,
        n_estimators=n_trees,
        random_state=seed,
    )
    ensemble.fit(X_train, y_train)
    return get_measures(ensemble.predict(X_test), y_test)


# One record (dict) per evaluated (dataset, n_trees, fold) combination, so the
# per-fold metrics are still available after the loop, not only on stdout.
result = []
print(f"Start at {pd.Timestamp.now()}")
for dataset in datasets:
    X, y = get_dataset(dataset)

    # Shuffle samples and labels with the same permutation so the contiguous
    # folds produced below are random subsets.
    random_indices = np.arange(X.shape[0])
    np.random.shuffle(random_indices)
    X = X[random_indices]
    y = y[random_indices]

    # k_cross roughly equal folds; array_split tolerates sizes that do not
    # divide evenly.
    sub_groups_X = np.array_split(np.array(X), k_cross)
    sub_groups_y = np.array_split(np.array(y), k_cross)

    for n_tree in n_trees:
        for k in range(k_cross):
            # Fold k is held out for testing; all other folds form the training set.
            X_train = np.concatenate(sub_groups_X[:k] + sub_groups_X[k+1:])
            y_train = np.concatenate(sub_groups_y[:k] + sub_groups_y[k+1:])
            X_test = sub_groups_X[k]
            y_test = sub_groups_y[k]

            # `dt_*` holds the scikit-learn BaggingClassifier baseline,
            # `bagging_*` the project's own implementation.
            dt_acc, dt_correct, dt_wrong = evaluate_bagging_lib(X_train, y_train, X_test, y_test, n_trees=n_tree)
            bagging_acc, bagging_correct, bagging_wrong = evaluate_bagging(X_train, y_train, X_test, y_test, n_trees=n_tree)

            result.append({
                'dataset': dataset,
                'n_trees': n_tree,
                'fold': k + 1,
                'bagging_lib_accuracy': dt_acc,
                'bagging_lib_correct': dt_correct,
                'bagging_lib_wrong': dt_wrong,
                'bagging_accuracy': bagging_acc,
                'bagging_correct': bagging_correct,
                'bagging_wrong': bagging_wrong,
            })

            print(f"    Dataset: {dataset}, n_trees: {n_tree}, k: {k+1}/{k_cross} >> Bagging LIB: {dt_acc:.4f}, Bagging: {bagging_acc:.4f}")
            

Start at 2025-04-22 12:45:31.978223
    Dataset: digits, n_trees: 10, k: 1/10 >> Bagging LIB: 0.9333, Bagging: 0.8222
    Dataset: digits, n_trees: 10, k: 2/10 >> Bagging LIB: 0.9556, Bagging: 0.8333
    Dataset: digits, n_trees: 10, k: 3/10 >> Bagging LIB: 0.9333, Bagging: 0.8611
    Dataset: digits, n_trees: 10, k: 4/10 >> Bagging LIB: 0.9333, Bagging: 0.8778
    Dataset: digits, n_trees: 10, k: 5/10 >> Bagging LIB: 0.9333, Bagging: 0.7889
    Dataset: digits, n_trees: 10, k: 6/10 >> Bagging LIB: 0.8556, Bagging: 0.8000
    Dataset: digits, n_trees: 10, k: 7/10 >> Bagging LIB: 0.9667, Bagging: 0.8444
    Dataset: digits, n_trees: 10, k: 8/10 >> Bagging LIB: 0.9050, Bagging: 0.8771
    Dataset: digits, n_trees: 10, k: 9/10 >> Bagging LIB: 0.9274, Bagging: 0.8324
    Dataset: digits, n_trees: 10, k: 10/10 >> Bagging LIB: 0.9497, Bagging: 0.8268
