In [9]:
# using genetic algorithms to optimize simple machine learning models, such 
# as the random forest classifier, to implement brain tumor classification
# using tpot runs thru many models to find the best one
# The dataset used for this model is images of brain tumors available on Kaggle
# https://www.kaggle.com/datasets/masoudnickparvar/brain-tumor-mri-dataset
# https://www.geeksforgeeks.org/random-forest-for-image-classification-using-opencv/

In [10]:
import numpy as np
import matplotlib.pyplot as plt
import csv
from leap_ec import Individual, ops, util, Representation
from leap_ec.algorithm import generational_ea
from leap_ec.decoder import IdentityDecoder
from leap_ec.problem import ScalarProblem
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
import cv2
import os
from PIL import Image
from sklearn.preprocessing import LabelEncoder
from toolz import pipe
import time 

In [11]:
# --- Genetic Algorithm Parameters ---
POP_SIZE = 2
GENERATIONS = 10
MUTATION_RATE = 0.1
CROSSOVER_RATE = 0.5
TOURNAMENT_SIZES = 5

# --- Hyperparameter search grid ---
n_estimators_options = list(range(50, 55, 5))
max_depth_options = list(range(1, 6, 5))  

# Load Dataset
def load_data(folder_path):
    X = []
    y = []
    for class_name in os.listdir(folder_path):
        class_path = os.path.join(folder_path, class_name)
        if os.path.isdir(class_path):
            for file_name in os.listdir(class_path):
                file_path = os.path.join(class_path, file_name)
                try:
                    img = Image.open(file_path).convert('L') 
                    img = img.resize((64, 64))  
                    img_array = np.array(img).flatten()
                    X.append(img_array)
                    y.append(class_name)
                except Exception as e:
                    print(f"Error loading {file_path}: {e}")
    return np.array(X), np.array(y)

X_train, y_train = load_data('brain_tumor_mri/Training')
X_test, y_test = load_data('brain_tumor_mri/Testing')

# Encode Labels
le = LabelEncoder()
y_train_encoded = le.fit_transform(y_train)
y_test_encoded = le.transform(y_test)

In [12]:
fitness_cache = {}
class RandomForestGAProblem(ScalarProblem):
    def __init__(self):
        super().__init__(maximize=True)

    def evaluate(self, individual):
        n_estimators = n_estimators_options[int(individual[0]) % len(n_estimators_options)]
        max_depth = max_depth_options[int(individual[1]) % len(max_depth_options)]

        key = (n_estimators, max_depth)
        print(f"Evaluating key: {key}")
        
        if key in fitness_cache:
            return fitness_cache[key]
        
        clf = RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth, random_state=42, n_jobs=1)
        start = time.time()
        clf.fit(X_train, y_train)
        print(f"Training time: {time.time() - start:.2f} seconds")
        
        y_pred = clf.predict(X_test)
        acc = accuracy_score(y_test, y_pred)

        fitness_cache[key] = acc
        return acc

In [13]:
# --- Initialization function (systematic) ---
genome_counter = 0
def initialize_hyperparameters():
    global genome_counter
    i = genome_counter % len(n_estimators_options)
    j = genome_counter % len(max_depth_options)
    genome_counter += 1
    return [i, j]

# --- Custom crossover and mutation operators ---
def uniform_crossover(population, p=0.5):
    offspring = []
    it = iter(population)
    for parent1 in it:
        try:
            parent2 = next(it)
        except StopIteration:
            offspring.append(parent1)
            break
        child1_genome = []
        child2_genome = []
        for gene1, gene2 in zip(parent1.genome, parent2.genome):
            if np.random.rand() < p:
                child1_genome.append(gene2)
                child2_genome.append(gene1)
            else:
                child1_genome.append(gene1)
                child2_genome.append(gene2)
        offspring.append(parent1.__class__(child1_genome))
        offspring.append(parent2.__class__(child2_genome))
    return offspring

def mutate_integer(population, search_space, p=0.1):
    for individual in population:
        for i, s in enumerate(search_space):
            if np.random.rand() < p:
                individual.genome[i] = np.random.randint(0, len(s))
        yield individual

In [14]:
search_space = [range(len(n_estimators_options)), range(len(max_depth_options))]

# --- Create initial population ---
parents = Individual.create_population(
    POP_SIZE,
    initialize=initialize_hyperparameters,
    decoder=IdentityDecoder(),
    problem=RandomForestGAProblem()
)

In [15]:
parents = Individual.evaluate_population(parents)
generation_counter = util.inc_generation()
seen_combinations = set()
best_accuracies = []
log_rows = []

Evaluating key: (50, 1)
Training time: 0.55 seconds
Evaluating key: (50, 1)


In [16]:
while generation_counter.generation() < GENERATIONS:
    print(f"Working on generation {generation_counter.generation() + 1}")
    offspring = pipe(parents,
                lambda pop: ops.tournament_selection(pop, k=TOURNAMENT_SIZES),
                ops.clone,
                lambda pop: uniform_crossover(pop, p=CROSSOVER_RATE),
                lambda pop: mutate_integer(pop, search_space, p=MUTATION_RATE),
                ops.evaluate,
                ops.pool(size=len(parents))
    )
    print (f"done piping generation {generation_counter.generation() + 1}")

    best = max(offspring, key=lambda ind: ind.fitness)
    avg_accuracy = np.mean([ind.fitness for ind in offspring])

    best_n_estimators = n_estimators_options[int(best.genome[0]) % len(n_estimators_options)]
    best_max_depth = max_depth_options[int(best.genome[1]) % len(max_depth_options)]

    all_n_estimators = [n_estimators_options[int(ind.genome[0]) % len(n_estimators_options)] for ind in offspring]
    all_max_depths = [max_depth_options[int(ind.genome[1]) % len(max_depth_options)] for ind in offspring]
    avg_n_estimators = np.mean(all_n_estimators)
    avg_max_depth = np.mean(all_max_depths)
    error = 1 - best.fitness

    for ind in offspring:
        n_estimators = n_estimators_options[int(ind.genome[0]) % len(n_estimators_options)]
        max_depth = max_depth_options[int(ind.genome[1]) % len(max_depth_options)]
        seen_combinations.add((n_estimators, max_depth))

    best_accuracies.append(best.fitness)

    print(f"Generation {generation_counter.generation()}:\n"
          f"    Best Accuracy: {best.fitness:.4f}\n"
          f"    Best Hyperparameters: n_estimators={best_n_estimators}, max_depth={best_max_depth}\n"
          f"    Unique hyperparameter combinations explored so far: {len(seen_combinations)} / 310\n")
    
    log_rows.append([
        generation_counter.generation(),
        best.fitness,
        avg_accuracy,
        best_n_estimators,
        best_max_depth,
        avg_n_estimators,
        avg_max_depth,
        error
    ])

    parents = offspring
    generation_counter()

Working on generation 1


KeyboardInterrupt: 

In [None]:
# --- Plot Best Accuracy per Generation ---
plt.plot(range(GENERATIONS), best_accuracies, marker='o')
plt.title('Best Accuracy per Generation')
plt.xlabel('Generation')
plt.ylabel('Accuracy')
plt.grid()
plt.show()

In [None]:
# --- Save results to CSV ---
with open("RF_LEAP_GA_results.csv", "w", newline="") as f:
    writer = csv.writer(f)
    writer.writerow(["Generation", "Best_Accuracy", "Average_Accuracy", "Best_n_estimators", "Best_max_depth", "Avg_n_estimators", "Avg_max_depth", "Error"])
    writer.writerows(log_rows)