In [5]:
import numpy as np
import random
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.neighbors import KNeighborsClassifier
import matplotlib.pyplot as plt

# Load dataset
data = load_breast_cancer()
X, y = data.data, data.target

# Genetic Algorithm Parameters
population_size = 20
num_generations = 10
mutation_rate = 0.1

# Manually initialize population
def initialize_population():
    population = []
    for _ in range(population_size):
        individual = {
            "features": [random.randint(0, 1) for _ in range(X.shape[1])],   # Binary mask for features
            "n_neighbors": random.randint(1, 20)                             # Random number of neighbors for KNN
        }
        population.append(individual)
    return population

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Manually compute Euclidean distance
def euclidean_distance(x1, x2):
    return math.sqrt(sum((a - b) ** 2 for a, b in zip(x1, x2)))

# Manual K-Nearest Neighbors
def knn_predict(X_train, y_train, X_test, n_neighbors):
    predictions = []
    for test_sample in X_test:
        # Compute distances to all training samples
        distances = [(euclidean_distance(test_sample, train_sample), y) 
                     for train_sample, y in zip(X_train, y_train)]
        # Sort by distance and get top n_neighbors
        nearest_neighbors = sorted(distances, key=lambda x: x[0])[:n_neighbors]
        # Get most common class in neighbors
        labels = [label for _, label in nearest_neighbors]
        prediction = max(set(labels), key=labels.count)
        predictions.append(prediction)
    return predictions

# Manually calculate accuracy
def accuracy_manual(y_true, y_pred):
    correct = sum(1 for true, pred in zip(y_true, y_pred) if true == pred)
    return correct / len(y_true)

# Fitness function (Manual cross-validation with accuracy calculation)
def fitness(individual):
    selected_features = [i for i, bit in enumerate(individual["features"]) if bit == 1]
    if len(selected_features) == 0:
        return 0  # Avoid individuals with no features selected
    X_selected = X[:, selected_features]
    y_pred = knn_predict(X_train, y_train, X_test, individual["n_neighbors"])
    return accuracy_manual(y_test, y_pred)

# Selection
def select(population):
    population.sort(key=lambda ind: fitness(ind), reverse=True)
    return population[:population_size//2]   # Top half for breeding

# Crossover
def crossover(parent1, parent2):
    child = {"features": [], "n_neighbors": 0}
    # Uniform crossover for features
    for bit1, bit2 in zip(parent1["features"], parent2["features"]):
        child["features"].append(bit1 if random.random() > 0.5 else bit2)
    # Blend crossover for n_neighbors
    child["n_neighbors"] = (parent1["n_neighbors"] + parent2["n_neighbors"]) // 2
    return child

# Mutation
def mutate(individual):
    # Mutate features
    for i in range(len(individual["features"])):
        if random.random() < mutation_rate:
            individual["features"][i] = 1 - individual["features"][i]
    # Mutate n_neighbors
    if random.random() < mutation_rate:
        individual["n_neighbors"] = random.randint(1, 20)

# Main GA Loop
population = initialize_population()
best_scores = []

for generation in range(num_generations):
    # Evaluate fitness
    scores = [fitness(ind) for ind in population]
    best_scores.append(max(scores))
    print(f"Generation {generation + 1}, Best Score: {max(scores)}")

    # Selection
    selected = select(population)

    # Create new population with crossover and mutation
    next_population = []
    while len(next_population) < population_size:
        parent1, parent2 = random.sample(selected, 2)
        child = crossover(parent1, parent2)
        mutate(child)
        next_population.append(child)

    # Replace population with the new generation
    population = next_population

# Find best individual
best_individual = max(population, key=fitness)
print("Best Feature Set:", [i for i, bit in enumerate(best_individual["features"]) if bit == 1])
print("Best n_neighbors:", best_individual["n_neighbors"])
print("Best Accuracy:", fitness(best_individual))

# Plot accuracy progression (requires matplotlib but done manually here for completeness)
for i, score in enumerate(best_scores):
    print(f"Generation {i + 1}: Accuracy {score * 100:.2f}%")
    
    
# Plot accuracy progression
plt.plot(best_scores)
plt.xlabel("Generation")
plt.ylabel("Accuracy")
plt.title("Accuracy Progression Over Generations")
plt.show()    


Generation 1, Best Score: 0.9824561403508771
Generation 2, Best Score: 0.9824561403508771
Generation 3, Best Score: 0.9824561403508771
Generation 4, Best Score: 0.9824561403508771
Generation 5, Best Score: 0.9824561403508771
Generation 6, Best Score: 0.9824561403508771
Generation 7, Best Score: 0.9824561403508771
Generation 8, Best Score: 0.9824561403508771
Generation 9, Best Score: 0.9824561403508771
Generation 10, Best Score: 0.9824561403508771
Best Feature Set: [0, 1, 4, 10, 12, 19, 20, 21, 22, 28]
Best n_neighbors: 11
Best Accuracy: 0.9824561403508771
Generation 1: Accuracy 98.25%
Generation 2: Accuracy 98.25%
Generation 3: Accuracy 98.25%
Generation 4: Accuracy 98.25%
Generation 5: Accuracy 98.25%
Generation 6: Accuracy 98.25%
Generation 7: Accuracy 98.25%
Generation 8: Accuracy 98.25%
Generation 9: Accuracy 98.25%
Generation 10: Accuracy 98.25%
