In [1]:
#apply ga in medical dataset
import numpy as np
import random
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Load the Breast Cancer dataset
data = load_breast_cancer()
X, y = data.data, data.target

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Genetic Algorithm Parameters
POPULATION_SIZE = 10
MUTATION_RATE = 0.1
GENERATIONS = 10

# Fitness function (using a Random Forest classifier)
def calculate_fitness(features, X_train, X_test, y_train, y_test):
    if not features:  # If features list is empty, return a very low accuracy
        return 0.0

    clf = RandomForestClassifier(n_estimators=50, random_state=42)
    clf.fit(X_train[:, features], y_train)
    predictions = clf.predict(X_test[:, features])
    return accuracy_score(y_test, predictions)

# Initialization of the population
population = [random.sample(range(X_train.shape[1]), k=5) for _ in range(POPULATION_SIZE)]

# Main Genetic Algorithm loop
for generation in range(GENERATIONS):
    fitness_scores = [calculate_fitness(features, X_train, X_test, y_train, y_test) for features in population]

    # Select top individuals (features) based on their fitness scores
    selected_indices = np.argsort(fitness_scores)[-POPULATION_SIZE:]
    selected_population = [population[i] for i in selected_indices]

    # Create new individuals through crossover and mutation
    new_population = []

    for _ in range(POPULATION_SIZE // 2):
        parent1 = random.choice(selected_population)
        parent2 = random.choice(selected_population)

        # Ensure parent1 is not empty
        while not parent1:
            parent1 = random.choice(selected_population)

        # Ensure crossover point allows at least one element from parent1
        crossover_point = random.randint(0, len(parent1) - 1)

        # Ensure child features do not have duplicates
        child1 = parent1[:crossover_point] + [f for f in parent2 if f not in parent1[crossover_point:]]
        child2 = parent2[:crossover_point] + [f for f in parent1 if f not in parent2[crossover_point:]]

        # Remove duplicates in child features
        child1 = list(dict.fromkeys(child1))
        child2 = list(dict.fromkeys(child2))

        # Ensure child features are within the valid range
        child1 = [f if 0 <= f < X_train.shape[1] else random.randint(0, X_train.shape[1] - 1) for f in child1]
        child2 = [f if 0 <= f < X_train.shape[1] else random.randint(0, X_train.shape[1] - 1) for f in child2]

        # Mutation
        child1 = [f if random.random() > MUTATION_RATE else random.randint(0, X_train.shape[1] - 1) for f in child1]
        child2 = [f if random.random() > MUTATION_RATE else random.randint(0, X_train.shape[1] - 1) for f in child2]

        new_population.extend([child1, child2])

    population = new_population

# Get the best individual (features) from the final population
best_indices = np.argsort(fitness_scores)[-1]
best_features = list(dict.fromkeys(population[best_indices]))

# Check if best_features is not empty before fitting the model
if best_features:
    # Train a model using the best features and evaluate it
    clf = RandomForestClassifier(n_estimators=50, random_state=42)
    clf.fit(X_train[:, best_features], y_train)
    predictions = clf.predict(X_test[:, best_features])
    accuracy = accuracy_score(y_test, predictions)

    print(f"Best features: {best_features}")
    print(f"Accuracy with selected features: {accuracy}")
else:
    print("No features selected.")


Best features: [11, 21]
Accuracy with selected features: 0.6929824561403509
