In [1]:
import os
import random
from typing import Literal, Tuple

import numpy as np
import pandas as pd
import sklearn
from sklearn import datasets
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

from Bagging import create_models, create_bags, evaluate
from Bagging import predict
from BaggingSA import BaggingSA


In [2]:
# --- Experiment configuration -------------------------------------------------
# Seed applied below to both NumPy's and Python's global random generators.
seed = 42

# Number of cross-validation folds and the ensemble sizes to compare.
k_cross = 10
n_trees = [10, 20, 30, 40, 50]

# NOTE: this rebinds `datasets`, which the import cell bound to the
# sklearn.datasets module; the loaders are reached via `sklearn.datasets`.
datasets = ['digits', 'wine', 'breast_cancer', 'pima']

# BaggingSA hyper-parameters per dataset. Only the accuracy/diversity ratio and
# the feature mutation chance differ between datasets; everything else is shared.
bagging_sa_params = {
    dataset_key: {
        'T0': 2,
        'cooling_method': 'geometric',
        'alpha': 0.995,
        'max_iterations': 2000,
        'fitness_accuracy_diversity_ratio': diversity_ratio,
        'feature_mutation_chance': mutation_chance,
        'test_split_amount': 5,
    }
    for dataset_key, diversity_ratio, mutation_chance in (
        ('wine', 0.75, 0.2),
        ('breast_cancer', 0.5, 0.3),
        ('pima', 0.75, 0.3),
        ('digits', 0.75, 0.3),
    )
}

# Seed both global generators so a fresh Restart & Run All starts from the same state.
random.seed(seed)
np.random.seed(seed)

In [3]:
def get_dataset(dataset_name: str) -> Tuple[np.ndarray, np.ndarray]:
    """Load one of the supported datasets as a ``(X, y)`` pair.

    Args:
        dataset_name: One of ``'digits'``, ``'wine'``, ``'breast_cancer'``
            (loaded through ``sklearn.datasets``) or ``'pima'`` (read from
            ``./../datasets/pima.csv``, relative to the working directory).

    Returns:
        The feature values and the target values. For ``'pima'`` the last CSV
        column is used as the target and all preceding columns as features.

    Raises:
        ValueError: If ``dataset_name`` is not one of the supported names.
    """
    # Datasets bundled with scikit-learn, mapped to the name of their loader.
    bundled_loaders = (
        ('digits', 'load_digits'),
        ('wine', 'load_wine'),
        ('breast_cancer', 'load_breast_cancer'),
    )
    for known_name, loader_name in bundled_loaders:
        if dataset_name == known_name:
            bunch = getattr(sklearn.datasets, loader_name)()
            return bunch.data, bunch.target

    if dataset_name == 'pima':
        frame = pd.read_csv("./../datasets/pima.csv")
        features = frame.iloc[:, :-1].values
        labels = frame.iloc[:, -1].values
        return features, labels

    raise ValueError("Unsupported dataset")

In [4]:



def get_measures(predictions, y_test):
    """Count correct and wrong predictions and derive the accuracy.

    Args:
        predictions: Sized iterable of predicted labels.
        y_test: Sized iterable of true labels, compared with ``predictions``
            position by position.

    Returns:
        A tuple ``(accuracy, correct_pred_amount, wrong_pred_amount)`` where
        ``accuracy`` is ``correct / (correct + wrong)``. For empty input the
        result is ``(0.0, 0, 0)`` instead of a ``ZeroDivisionError``.

    Raises:
        ValueError: If ``predictions`` and ``y_test`` differ in length, which
            would otherwise score only a prefix or fail with an ``IndexError``.
    """
    total = len(predictions)
    if total != len(y_test):
        raise ValueError(
            f"predictions and y_test must have the same length, got {total} and {len(y_test)}"
        )

    # Iterate positionally so label-indexed containers are compared in order.
    correct_pred_amount = sum(
        1 for predicted, actual in zip(predictions, y_test) if predicted == actual
    )
    wrong_pred_amount = total - correct_pred_amount
    accuracy = correct_pred_amount / total if total else 0.0
    return accuracy, correct_pred_amount, wrong_pred_amount
    
def evaluate_rf(X_train, y_train, X_test, y_test, n_trees: int) -> Tuple[float, int, int]:
    """Fit a random forest on the training fold and score it on the test fold.

    The forest uses ``n_trees`` estimators and the module-level ``seed`` as its
    ``random_state``.

    Returns:
        ``(accuracy, correct_pred_amount, wrong_pred_amount)`` as computed by
        ``get_measures``.
    """
    forest = RandomForestClassifier(n_estimators=n_trees, random_state=seed)
    forest.fit(X_train, y_train)
    return get_measures(forest.predict(X_test), y_test)

def evaluate_bagging(X_train, y_train, X_test, y_test, n_trees: int) -> Tuple[float, int, int]:
    """Train a plain bagging ensemble and score it on the test fold.

    Uses the project's ``Bagging`` helpers: ``create_bags`` is asked for
    ``n_trees`` bags over ``X_train``, ``create_models`` builds the models from
    those bags, and ``predict`` produces the ensemble predictions for ``X_test``.

    Returns:
        ``(accuracy, correct_pred_amount, wrong_pred_amount)`` as computed by
        ``get_measures``.
    """
    ensemble = create_models(
        X=X_train,
        y=y_train,
        bags=create_bags(X_train, bags_amount=n_trees),
    )
    ensemble_predictions = predict(X=X_test, models=ensemble)
    return get_measures(ensemble_predictions, y_test)
  
    
def evaluate_bagging_sa(X_train, y_train, X_test, y_test, n_trees: int, params: dict) -> Tuple[float, int, int]:
    """Run BaggingSA on the training fold and score the result on the test fold.

    Args:
        X_train, y_train: Training fold handed to ``BaggingSA``.
        X_test, y_test: Held-out fold used for scoring.
        n_trees: Ensemble size forwarded to ``BaggingSA``.
        params: Mapping that must contain the keys ``'T0'``,
            ``'cooling_method'``, ``'alpha'``, ``'max_iterations'``,
            ``'fitness_accuracy_diversity_ratio'``,
            ``'feature_mutation_chance'`` and ``'test_split_amount'``; each is
            forwarded to ``BaggingSA`` under the same keyword. Other keys are
            ignored.

    Returns:
        ``(accuracy, correct_pred_amount, wrong_pred_amount)`` as computed by
        ``get_measures``.

    Raises:
        KeyError: If one of the required keys is missing from ``params``.
    """
    required_options = (
        'T0',
        'cooling_method',
        'alpha',
        'max_iterations',
        'fitness_accuracy_diversity_ratio',
        'feature_mutation_chance',
        'test_split_amount',
    )
    # Look every option up before constructing the optimiser so a missing key
    # surfaces as a KeyError first.
    sa_options = {option: params[option] for option in required_options}

    optimizer = BaggingSA(X=X_train, y=y_train, n_trees=n_trees, **sa_options)
    # The value returned by run() is what predict() receives as `models`.
    ensemble = optimizer.run()
    return get_measures(predict(X=X_test, models=ensemble), y_test)
    
def evaluate_decision_tree(X_train, y_train, X_test, y_test):
    """Fit a single decision tree as the baseline and score it on the test fold.

    The tree is created with default settings; unlike ``evaluate_rf`` no
    ``random_state`` is passed.

    Returns:
        ``(accuracy, correct_pred_amount, wrong_pred_amount)`` as computed by
        ``get_measures``.
    """
    tree = DecisionTreeClassifier()
    tree.fit(X_train, y_train)
    return get_measures(tree.predict(X_test), y_test)


# One row per (dataset, n_trees, fold); accumulates over every dataset processed.
result = []
result_columns = [
    'dataset', 'nTrees', 'kCrossIndex', 'dtAccuracy', 'baggingAccuracy', 'baggingSAAccuracy',
    'dtCorrectPred', 'baggingCorrectPred', 'baggingSACorrectPred',
    'dtWrongPred', 'baggingWrongPred', 'baggingSAWrongPred'
]

# Create the output directory up front so a missing folder cannot abort the run
# only after the first (slow) fold has been computed.
os.makedirs('./../res', exist_ok=True)

print(f"Start at {pd.Timestamp.now()}")
for dataset in datasets:
    X, y = get_dataset(dataset)
    # Looked up once per dataset; a missing configuration fails before any training.
    pars = bagging_sa_params[dataset]

    # Shuffle samples once per dataset, then cut them into k_cross folds that
    # are reused for every ensemble size.
    random_indices = np.arange(X.shape[0])
    np.random.shuffle(random_indices)
    X = X[random_indices]
    y = y[random_indices]

    sub_groups_X = np.array_split(np.array(X), k_cross)
    sub_groups_y = np.array_split(np.array(y), k_cross)

    for n_tree in n_trees:
        for k in range(k_cross):
            # Fold k is held out for testing; the remaining folds form the training set.
            X_train = np.concatenate(sub_groups_X[:k] + sub_groups_X[k+1:])
            y_train = np.concatenate(sub_groups_y[:k] + sub_groups_y[k+1:])
            X_test = sub_groups_X[k]
            y_test = sub_groups_y[k]

            # The single-tree baseline does not depend on n_tree but is refit for
            # every (n_tree, fold) pair, as in the original experiment.
            dt_acc, dt_correct, dt_wrong = evaluate_decision_tree(X_train, y_train, X_test, y_test)
            bagging_acc, bagging_correct, bagging_wrong = evaluate_bagging(X_train, y_train, X_test, y_test, n_trees=n_tree)
            bagging_sa_acc, bagging_sa_correct, bagging_sa_wrong = evaluate_bagging_sa(X_train, y_train, X_test, y_test, n_trees=n_tree, params=pars)

            print(f"    Dataset: {dataset}, n_trees: {n_tree}, k: {k+1}/{k_cross} >> DT: {dt_acc:.4f}, Bagging: {bagging_acc:.4f}, BaggingSA: {bagging_sa_acc:.4f}")

            result.append([
                dataset, n_tree, k+1, dt_acc, bagging_acc, bagging_sa_acc,
                dt_correct, bagging_correct, bagging_sa_correct,
                dt_wrong, bagging_wrong, bagging_sa_wrong
            ])

            # `df` keeps every row collected so far (all datasets).
            df = pd.DataFrame(result, columns=result_columns)

            # Checkpoint after every fold so an interrupted run keeps its progress.
            # Only the current dataset's rows go into its file; previously the
            # cumulative frame was written, so each per-dataset CSV also held the
            # rows of every dataset processed before it.
            df[df['dataset'] == dataset].to_csv(f'./../res/accuracy_comparison_{dataset}.csv', index=False)

Start at 2025-04-22 12:38:57.578721
    Dataset: digits, n_trees: 10, k: 1/10 >> DT: 0.8833, Bagging: 0.8222, BaggingSA: 0.8722
    Dataset: digits, n_trees: 10, k: 2/10 >> DT: 0.8167, Bagging: 0.8944, BaggingSA: 0.9000


KeyboardInterrupt: 