In [98]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import numpy as np
import random
from typing import Tuple

seed = 42
bags_amount = 10
bag_size_ratio = 0.7

t0 = 10
alpha = 0.1
max_iter = 100


np.random.seed(seed)
random.seed(seed)


In [99]:
def create_random_bag(
    data: Tuple[np.ndarray, np.ndarray],
    bag_size_ratio: float = 0.7) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
    
    X, y = data
    bag_size = int(len(X) * bag_size_ratio)
    idx = np.random.choice(len(X), bag_size, replace=True)
    tmpX, tmpy = X[idx], y[idx]

    # Add randomness to feature selection
    min_features = max(2, int(np.sqrt(X.shape[1]) * 0.8))  # Ensure at least 2 features
    max_features = min(X.shape[1], int(np.sqrt(X.shape[1]) * 1.2))  
    features_amount = np.random.randint(min_features, max_features + 1)  

    features_idx = np.random.choice(X.shape[1], features_amount, replace=False)
    return tmpX[:, features_idx], tmpy, features_idx  # Return selected features too

    
def get_neighbor_bag(
    data: Tuple[np.ndarray, np.ndarray],
    bag: Tuple[np.ndarray, np.ndarray, np.ndarray]) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:

    X, y = data
    X_bag, y_bag, features_idx = bag  
    
    new_X_bag = np.copy(X_bag)
    new_y_bag = np.copy(y_bag)

    num_swaps = max(1, len(X_bag) // 10)
    for _ in range(num_swaps):
        index_from = random.randint(0, len(new_X_bag) - 1)
        index_to = random.randint(0, len(X) - 1)

        new_X_bag[index_from] = X[index_to, features_idx]
        new_y_bag[index_from] = y[index_to]

    return new_X_bag, new_y_bag, features_idx



def create_model(
    bag: Tuple[np.ndarray, np.ndarray, np.ndarray]) -> Tuple[DecisionTreeClassifier, np.ndarray]:
    X, y, features_idx = bag
    clf = DecisionTreeClassifier()
    clf.fit(X, y)
    return clf, features_idx


def evaluate_model(
    clf: DecisionTreeClassifier,
    data: Tuple[np.ndarray, np.ndarray]) -> float:
    X, y = data
    y_pred = clf.predict(X)
    return accuracy_score(y, y_pred)

def evaluate_ensemble(
    clfs: list[Tuple[DecisionTreeClassifier, np.ndarray]],
    data: Tuple[np.ndarray, np.ndarray]) -> float:
    X, y = data
    y_pred = np.zeros(len(y))

    for clf, features_idx in clfs:
        sub_X = X[:, features_idx]
        y_pred += clf.predict(sub_X)

    y_pred = [1 if x > len(clfs) / 2 else 0 for x in y_pred]
    return accuracy_score(y, y_pred)


def get_new_temperature(
    t: float,
    alpha: float) -> float:
    return t * (1 - alpha)

def get_acceptance_probability(delta_fitness:float ,t: float) -> float:
    return np.exp(-delta_fitness / t)

def SA_Bagging(
    data: Tuple[np.ndarray, np.ndarray],
    bags_amount: int, 
    bag_size_ratio: float, 
    t0: float, 
    alpha: float, 
    max_iter: int) -> np.ndarray:
    
    cur_temp = t0
    cur_iter = 0
    cur_bags = [create_random_bag(data, bag_size_ratio) for _ in range(bags_amount)]
    models = [create_model(bag) for bag in cur_bags]
    cur_fitness = evaluate_ensemble(models, data)
    
    best_bags = cur_bags.copy()
    best_fitness = cur_fitness
    
    print(f"Initial fitness: {cur_fitness}")
    
    while cur_iter < max_iter:
        new_bags = [get_neighbor_bag(data, bag) for bag in cur_bags]
        new_models = [create_model(bag) for bag in new_bags]
        new_fitness = evaluate_ensemble(new_models, data)
        delta_fitness = new_fitness - cur_fitness


        if new_fitness > best_fitness:
            best_bags = new_bags
            best_fitness = new_fitness

        if delta_fitness > 0 or get_acceptance_probability(delta_fitness, cur_temp) > random.random():
            cur_bags = new_bags
            cur_fitness = new_fitness


        print(f"Iteration: {cur_iter}, Fitness: {cur_fitness}, Best fitness: {best_fitness}")
        cur_temp = get_new_temperature(cur_temp, alpha)
        cur_iter += 1

        
    print(f"Best fitness: {best_fitness}")
    return best_bags

In [100]:
data = load_iris()
X = data.data
y = data.target

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=seed)
train_data = (X_train, y_train)
test_data = (X_test, y_test)

best_bags = SA_Bagging(train_data, bags_amount, bag_size_ratio, t0, alpha, max_iter)

best_models = [create_model(bag) for bag in best_bags]
accuracy = evaluate_ensemble(best_models, test_data)
print(f"Accuracy: {accuracy}")

Initial fitness: 0.6476190476190476
Iteration: 0, Fitness: 0.6476190476190476, Best fitness: 0.6476190476190476
Iteration: 1, Fitness: 0.6476190476190476, Best fitness: 0.6476190476190476
Iteration: 2, Fitness: 0.6476190476190476, Best fitness: 0.6476190476190476
Iteration: 3, Fitness: 0.6476190476190476, Best fitness: 0.6476190476190476
Iteration: 4, Fitness: 0.6476190476190476, Best fitness: 0.6476190476190476
Iteration: 5, Fitness: 0.6476190476190476, Best fitness: 0.6476190476190476
Iteration: 6, Fitness: 0.6476190476190476, Best fitness: 0.6476190476190476
Iteration: 7, Fitness: 0.6476190476190476, Best fitness: 0.6476190476190476
Iteration: 8, Fitness: 0.6476190476190476, Best fitness: 0.6476190476190476
Iteration: 9, Fitness: 0.6476190476190476, Best fitness: 0.6476190476190476
Iteration: 10, Fitness: 0.6476190476190476, Best fitness: 0.6476190476190476
Iteration: 11, Fitness: 0.6476190476190476, Best fitness: 0.6476190476190476
Iteration: 12, Fitness: 0.6476190476190476, Best f

In [101]:
data = load_iris()
X = data.data
y = data.target

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
train_data = (X_train, y_train)
test_data = (X_test, y_test)

decision_tree = DecisionTreeClassifier()
decision_tree.fit(X_train, y_train)
y_pred = decision_tree.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")

Accuracy: 1.0
