In [3]:
# Using genetic algorithms to optimize simple machine learning models, such
# as the random forest classifier, to implement brain tumor classification.
# This file does not use TPOT; it only uses one specific, hand-written genetic algorithm.
# The dataset used for this model is images of brain tumors available on Kaggle
# https://www.kaggle.com/datasets/masoudnickparvar/brain-tumor-mri-dataset
# https://medium.com/@ela.markovic/feature-selection-using-genetic-algorithm-complete-beginner-friendly-guide-198496393728
# https://www.geeksforgeeks.org/random-forest-for-image-classification-using-opencv/


In [6]:
# import libraries
import os

import cv2
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.figure_factory as ff
import seaborn as sns
from PIL import Image
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder

In [7]:
# Load Dataset
def load_data(folder_path, image_size=(64, 64)):
    """Load a folder-per-class image dataset as flattened grayscale vectors.

    Every immediate sub-directory of ``folder_path`` is treated as one class;
    the sub-directory name becomes the label of each image inside it. Entries
    of ``folder_path`` that are not directories are ignored.

    Args:
        folder_path: Directory containing one sub-directory per class.
        image_size: ``(width, height)`` each image is resized to before it is
            flattened. Defaults to ``(64, 64)``.

    Returns:
        A tuple ``(X, y)`` of NumPy arrays. ``X`` holds one flattened image
        per row (``width * height`` values each) and ``y`` the matching class
        names. Both arrays are empty when no image could be loaded.

    Files that cannot be opened or decoded are reported with ``print`` and
    skipped, so a single corrupt file does not abort the whole load.
    """
    X = []
    y = []
    # os.listdir() returns entries in arbitrary order. Sorting makes the sample
    # order (and therefore any seeded model trained on it) reproducible.
    for class_name in sorted(os.listdir(folder_path)):
        class_path = os.path.join(folder_path, class_name)
        if not os.path.isdir(class_path):
            continue
        for file_name in sorted(os.listdir(class_path)):
            file_path = os.path.join(class_path, file_name)
            try:
                # The context manager closes the underlying file even when
                # decoding fails part-way through.
                with Image.open(file_path) as img:
                    gray = img.convert('L').resize(image_size)  # 'L' = grayscale
                    pixels = np.array(gray).flatten()
            except Exception as e:
                print(f"Error loading {file_path}: {e}")
                continue
            # Append features and label together so X and y stay aligned.
            X.append(pixels)
            y.append(class_name)
    return np.array(X), np.array(y)

# The training and testing splits live in separate sub-folders of the dataset.
TRAIN_DIR = 'brain_tumor_mri/Training'
TEST_DIR = 'brain_tumor_mri/Testing'

X_train, y_train = load_data(TRAIN_DIR)
X_test, y_test = load_data(TEST_DIR)

In [8]:
# Encode Labels
# The class-name -> integer mapping is learned from the training labels only
# and then applied unchanged to both splits.
le = LabelEncoder()
le.fit(y_train)
y_train_encoded = le.transform(y_train)
y_test_encoded = le.transform(y_test)

In [9]:
# Hyperparameter ranges (both bounds are inclusive: the sampling code draws
# from [MIN, MAX + 1) with np.random.randint)
N_ESTIMATORS_MIN = 50
N_ESTIMATORS_MAX = 200
MAX_DEPTH_MIN = 1
MAX_DEPTH_MAX = 50

# Genetic Algorithm parameters
POP_SIZE = 100         # number of genomes in each generation
GENERATIONS = 10       # number of evolution rounds run by the main loop
MUTATION_RATE = 0.1    # probability that mutate() changes a genome
CROSSOVER_RATE = 0.5   # probability that crossover() recombines two parents
TOURNAMENT_SIZES = 5   # competitors drawn per tournament in tournament_selection()

# Reproducibility: the GA draws from NumPy's global RNG, so seed it once here
# to make a Restart & Run All produce the same populations every time.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

In [10]:
# Initialize random population
def _random_genome():
    """Draw one [n_estimators, max_depth] genome from the configured inclusive ranges."""
    return [
        np.random.randint(N_ESTIMATORS_MIN, N_ESTIMATORS_MAX + 1),
        np.random.randint(MAX_DEPTH_MIN, MAX_DEPTH_MAX + 1),
    ]


population = [_random_genome() for _ in range(POP_SIZE)]

In [12]:
# Scores of genomes that were already evaluated, keyed by
# (n_estimators, max_depth). Evaluation is deterministic (fixed data and
# random_state=42), so a repeated genome would retrain an identical forest.
# Re-running this cell resets the cache; do that after reloading the data.
_FITNESS_CACHE = {}


def eval_genome(genome):
    """Return the fitness (accuracy) of one hyperparameter genome.

    Args:
        genome: Sequence whose first element is ``n_estimators`` and whose
            second element is ``max_depth``; both are coerced with ``int``.

    Returns:
        Accuracy of a ``RandomForestClassifier`` trained on
        ``X_train`` / ``y_train_encoded`` and scored on
        ``X_test`` / ``y_test_encoded``.

    NOTE(review): fitness is measured on the test split, so the selected
    hyperparameters are tuned against the same data whose accuracy is later
    reported. Consider a validation split or cross-validation.
    """
    n_estimators = int(genome[0])
    max_depth = int(genome[1])
    key = (n_estimators, max_depth)
    if key not in _FITNESS_CACHE:
        clf = RandomForestClassifier(
            n_estimators=n_estimators,
            max_depth=max_depth,
            random_state=42,
        )
        clf.fit(X_train, y_train_encoded)
        y_pred = clf.predict(X_test)
        _FITNESS_CACHE[key] = accuracy_score(y_test_encoded, y_pred)
    return _FITNESS_CACHE[key]

def tournament_selection(pop, fitnesses):
    """Pick POP_SIZE parents by repeated tournaments.

    For each slot, TOURNAMENT_SIZES distinct genomes are drawn at random from
    ``pop`` and the one with the highest fitness wins. A copy of the winner is
    stored, so later in-place edits do not touch ``pop``.

    Args:
        pop: List of genomes (each a list).
        fitnesses: Fitness values aligned index-by-index with ``pop``.

    Returns:
        A list of POP_SIZE copied genomes.
    """
    scores = np.asarray(fitnesses)
    winners = []
    for _ in range(POP_SIZE):
        entrants = np.random.choice(len(pop), TOURNAMENT_SIZES, replace=False)
        # argmax returns the first maximum, so ties go to the earliest entrant drawn
        champion = entrants[scores[entrants].argmax()]
        winners.append(pop[champion].copy())
    return winners

def crossover(parent1, parent2):
    """Recombine two list genomes with single-point crossover.

    With probability CROSSOVER_RATE a cut point in ``[1, len(parent1))`` is
    drawn and the tails of the two parents are swapped. Otherwise the children
    are plain copies of the parents.

    Args:
        parent1: First parent genome (a list).
        parent2: Second parent genome (a list).

    Returns:
        A tuple ``(child1, child2)`` of new lists.
    """
    do_cross = np.random.rand() < CROSSOVER_RATE
    if not do_cross:
        return parent1.copy(), parent2.copy()

    cut = np.random.randint(1, len(parent1))
    head1, tail1 = parent1[:cut], parent1[cut:]
    head2, tail2 = parent2[:cut], parent2[cut:]
    return head1 + tail2, head2 + tail1

def mutate(genome):
    """Possibly re-draw one gene of ``genome`` in place.

    With probability MUTATION_RATE one of the two genes is chosen at random
    and replaced by a fresh value from its configured inclusive range
    (gene 0: n_estimators, gene 1: max_depth).

    Args:
        genome: Two-element list ``[n_estimators, max_depth]``; modified in place.

    Returns:
        The same ``genome`` object that was passed in.
    """
    should_mutate = np.random.rand() < MUTATION_RATE
    if not should_mutate:
        return genome

    gene = np.random.choice([0, 1])
    if gene == 0:
        position, low, high = 0, N_ESTIMATORS_MIN, N_ESTIMATORS_MAX
    else:
        position, low, high = 1, MAX_DEPTH_MIN, MAX_DEPTH_MAX
    genome[position] = np.random.randint(low, high + 1)
    return genome

In [None]:
# GA main loop
# Each generation: score every genome, log the best/average accuracy, pick
# parents by tournament, then build the next population from crossover + mutation.
# NOTE(review): the whole population is replaced by children every generation
# (no elitism), so the best genome of a generation is not guaranteed to survive.
best_accuracies = []  # To track best accuracy over generations

for gen in range(GENERATIONS):
    generation_number = gen + 1  # 1-based, used in both progress messages
    print(f"Evaluating Generation {generation_number}")
    fitnesses = [eval_genome(g) for g in population]
    best_accuracy = np.max(fitnesses)
    avg_accuracy = np.mean(fitnesses)
    best_accuracies.append(best_accuracy)

    print(f"Generation {generation_number}: Best Accuracy = {best_accuracy:.4f}, Avg Accuracy = {avg_accuracy:.4f}")

    # Selection
    selected_population = tournament_selection(population, fitnesses)

    # Crossover and Mutation
    next_population = []
    for i in range(0, POP_SIZE, 2):
        parent1 = selected_population[i]
        # The modulo wraps to the first parent when POP_SIZE is odd
        parent2 = selected_population[(i + 1) % POP_SIZE]
        child1, child2 = crossover(parent1, parent2)
        child1 = mutate(child1)
        child2 = mutate(child2)
        next_population.extend([child1, child2])

    # Update the population (the slice drops the surplus child when POP_SIZE is odd)
    population = next_population[:POP_SIZE]

In [None]:
# Final evaluation
# Score every genome of the last population and keep the highest-scoring one.
final_fitnesses = list(map(eval_genome, population))
best_idx = np.argmax(final_fitnesses)
best_genome = population[best_idx]

print(
    "\nBest hyperparameters found:",
    f"n_estimators: {best_genome[0]}, max_depth: {best_genome[1]}",
    f"Test accuracy: {final_fitnesses[best_idx]:.4f}",
    sep="\n",
)

# Train final model
# Refit a forest with the winning hyperparameters so the fitted estimator is
# available as `best_model`, then score it on the test split.
final_params = {
    "n_estimators": int(best_genome[0]),
    "max_depth": int(best_genome[1]),
    "random_state": 42,
}
best_model = RandomForestClassifier(**final_params)
best_model.fit(X_train, y_train_encoded)
y_pred_final = best_model.predict(X_test)

print("\nFinal Model Evaluation:")
print(f"Accuracy: {accuracy_score(y_test_encoded, y_pred_final):.4f}")

# (Optional) Plotting Accuracy Progress Over Generations
# matplotlib.pyplot is already imported as `plt` in the notebook's import cell,
# so it is not re-imported here.
# Generations are numbered from 1 on the x-axis.
generation_numbers = list(range(1, len(best_accuracies) + 1))

fig, ax = plt.subplots()
ax.plot(generation_numbers, best_accuracies)
ax.set_xlabel('Generation')
ax.set_ylabel('Best Accuracy')
ax.set_title('Best Accuracy Over Generations')
ax.grid(True)
plt.show()