In [1]:
from Bagging import create_models, create_bags, evaluate
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn import datasets
from sklearn.tree import DecisionTreeClassifier
import numpy as np
import pandas as pd
import random
from BaggingSA import BaggingSA
from typing import Literal, Tuple
from Bagging import predict



In [None]:
seed = 42

k_cross = 10
n_trees = [10, 20, 30, 40, 50]
datasets = ['wine', 'abalone', 'breast_cancer', 'pima', 'digits']


np.random.seed(seed)
random.seed(seed)

In [3]:
import sklearn


def get_dataset(dataset_name: str) -> Tuple[np.ndarray, np.ndarray]:
    if dataset_name == 'digits':
        data = sklearn.datasets.load_digits()
        X = data.data
        y = data.target
        
    elif dataset_name == 'wine':
        data = sklearn.datasets.load_wine()
        X = data.data
        y = data.target
    
    elif dataset_name == 'abalone':
        column_names = ["sex", "length", "diameter", "height", "whole weight", 
                "shucked weight", "viscera weight", "shell weight", "rings"]
        data = pd.read_csv("./../datasets/abalone.data", names=column_names)
        for label in "MFI":
            data[label] = data["sex"] == label
        del data["sex"]
        y = data.rings.values
        del data["rings"]
        X = data.values.astype(float)
    
    elif dataset_name == 'breast_cancer':
        data = sklearn.datasets.load_breast_cancer()
        X = data.data
        y = data.target
        
    elif dataset_name == 'pima':
        data = pd.read_csv("./../datasets/pima.csv")
        X = data.iloc[:, :-1].values
        y = data.iloc[:, -1].values
    
    else:
        raise ValueError("Unsupported dataset")
    return X, y

In [None]:
def get_measures(predictions, y_test):
    correct_pred_amount = 0
    wrong_pred_amount = 0
    accuracy = 0
    for i in range(len(predictions)):
        if predictions[i] == y_test[i]:
            correct_pred_amount += 1
        else:
            wrong_pred_amount += 1
    accuracy = correct_pred_amount / (correct_pred_amount + wrong_pred_amount)
    return accuracy, correct_pred_amount, wrong_pred_amount
    

def evaluate_bagging(X_train, y_train, X_test, y_test, n_trees: int) -> Tuple[float, int, int]:
    bags = create_bags(X_train, bags_amount=n_trees)
    models = create_models(X=X_train, y=y_train, bags=bags)
    predictions = predict(X=X_test, models=models)
    accuracy, correct_pred_amount, wrong_pred_amount = get_measures(predictions, y_test)
    return accuracy, correct_pred_amount, wrong_pred_amount
    
def evaluate_bagging_sa(X_train, y_train, X_test, y_test, n_trees: int) -> Tuple[float, int, int]: 
    bagging_sa = BaggingSA(X=X_train, y=y_train,
                            T0=10, cooling_method='geometric', alpha=0.99, max_iterations=2500, n_trees=n_trees,
                            fitness_accuracy_disagreement_ratio=0.8,
                            feature_mutation_chance=0.3, test_split_amount=10)
    models = bagging_sa.run()
    predictions = predict(X=X_test, models=models)
    accuracy, correct_pred_amount, wrong_pred_amount = get_measures(predictions, y_test)
    return accuracy, correct_pred_amount, wrong_pred_amount
    
def evaluate_decision_tree(X_train, y_train, X_test, y_test):
    model = DecisionTreeClassifier()
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy, correct_pred_amount, wrong_pred_amount = get_measures(y_pred, y_test)
    return accuracy, correct_pred_amount, wrong_pred_amount


result = []
print(f"Start at {pd.Timestamp.now()}")
for dataset in datasets:
    X, y = get_dataset(dataset)       
    sub_groups_X = np.array_split(np.array(X), k_cross)
    sub_groups_y = np.array_split(np.array(y), k_cross) 
         
    for n_tree in n_trees:
        for k in range(k_cross):
            X_train = np.concatenate(sub_groups_X[:k] + sub_groups_X[k+1:])
            y_train = np.concatenate(sub_groups_y[:k] + sub_groups_y[k+1:])
            X_test = sub_groups_X[k]
            y_test = sub_groups_y[k]
            
            dt_acc, dt_correct, dt_wrong = evaluate_decision_tree(X_train, y_train, X_test, y_test)
            bagging_acc, bagging_correct, bagging_wrong = evaluate_bagging(X_train, y_train, X_test, y_test, n_trees=n_tree)
            bagging_sa_acc, bagging_sa_correct, bagging_sa_wrong = evaluate_bagging_sa(X_train, y_train, X_test, y_test, n_trees=n_tree)
            
            print(f"Dataset: {dataset}, n_trees: {n_tree}, k: {k}/{k_cross} >> DT: {dt_acc:.4f}, Bagging: {bagging_acc:.4f}, BaggingSA: {bagging_sa_acc:.4f}")
            
            result.append([
                dataset, n_tree, k, dt_acc, bagging_acc, bagging_sa_acc,
                dt_correct, bagging_correct, bagging_sa_correct,
                dt_wrong, bagging_wrong, bagging_sa_wrong
            ])
            
            df = pd.DataFrame(result, columns=[
                'dataset', 'nTrees', 'kCrossIndex', 'dtAccuracy', 'baggingAccuracy', 'baggingSAAccuracy',
                'dtCorrectPred', 'baggingCorrectPred', 'baggingSACorrectPred',
                'dtWrongPred', 'baggingWrongPred', 'baggingSAWrongPred'
            ])
            
            df.to_csv(f'./../res_article/accuracy_comparison_{dataset}.csv', index=False)                   
                
            
df_aggregated = df.groupby(['dataset', 'nTrees']).agg(
    dataset=('dataset', 'first'),
    nTrees=('nTrees', 'first'),
    
    dtAccuracy=('dtAccuracy', 'mean'),
    dtAccuracyStd=('dtAccuracy', 'std'),
    baggingAccuracy=('baggingAccuracy', 'mean'),
    baggingAccuracyStd=('baggingAccuracy', 'std'),
    baggingSAAccuracy=('baggingSAAccuracy', 'mean'),
    baggingSAAccuracyStd=('baggingSAAccuracy', 'std'),
    
    dtCorrectPred=('dtCorrectPred', 'mean'),
    dtCorrectPredStd=('dtCorrectPred', 'std'),
    baggingCorrectPred=('baggingCorrectPred', 'mean'),
    baggingCorrectPredStd=('baggingCorrectPred', 'std'),
    baggingSACorrectPred=('baggingSACorrectPred', 'mean'),
    baggingSACorrectPredStd=('baggingSACorrectPred', 'std'),
    
    dtWrongPred=('dtWrongPred', 'mean'),
    dtWrongPredStd=('dtWrongPred', 'std'),
    baggingWrongPred=('baggingWrongPred', 'mean'),
    baggingWrongPredStd=('baggingWrongPred', 'std'),
    baggingSAWrongPred=('baggingSAWrongPred', 'mean'),
    baggingSAWrongPredStd=('baggingSAWrongPred', 'std')
).reset_index(drop=True)
df_aggregated.to_csv('./../res_article/results_comparison_aggregated.csv', index=False)
print(df_aggregated)        

Start at 2025-04-14 11:35:41.045230
Dataset: pima, n_trees: 10, k: 0/10 >> DT: 0.6623, Bagging: 0.5844, BaggingSA: 0.6494


KeyboardInterrupt: 