This notebook combines evolutionary programming, in the form of the NEAT algorithm, with Stochastic Gradient Descent (SGD). Adding SGD lets each model learn from the dataset during its 'life', before its fitness score is calculated, so a model no longer has to stumble upon the correct weights through random mutation alone.

This should also remove the need for speciation since speciation's primary goal is to protect mutations. Additionally, the mutation probabilities can be bumped up to maximum since there is no need for weight mutations.

In [None]:
import torch, torch.nn as nn
import networkx as nx
import numpy as np
import pandas as pd
import math, random, statistics, itertools

%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

Create the dataset loader and other utilities for evaluating the model. If the model can learn on this data, push toward other tests.

In [None]:
# @title The Iris Flowers Dataset
class IrisFlowers:
    """In-memory Iris dataset that serves random rows and random batches.

    The CSV must contain a ``variety`` column with the class names. Those
    names are replaced by integer indices, and the two lookup tables
    ``variety_to_index`` / ``index_to_variety`` record the encoding.

    NOTE(review): ``batched_sample`` assumes the first four columns are the
    features and the fifth column is ``variety`` -- confirm against the CSV.
    """

    def __init__(self, filename):
        """Read ``filename``, encode the labels and shuffle the rows once."""
        self.file = filename
        df = pd.read_csv(self.file)

        # Sort the class names so the label encoding is the same on every run;
        # iterating a bare set of strings depends on per-process hash seeding.
        varieties = sorted(set(df["variety"]))
        self.variety_to_index = {name: index for index, name in enumerate(varieties)}
        self.index_to_variety = {index: name for index, name in enumerate(varieties)}

        df["variety"] = df["variety"].map(self.variety_to_index)
        df = df.sample(frac=1)

        self.df = df
        self.df_np = self.df.to_numpy()
        self.length = len(df)

    def single_sample(self) -> np.ndarray:
        """Return one uniformly chosen row (features followed by the label)."""
        return self.df_np[random.randrange(self.length)]

    def batched_sample(self, batch_size: int) -> "tuple[torch.Tensor, torch.Tensor]":
        """Return ``(features, labels)`` for up to ``batch_size`` distinct random rows.

        Rows are drawn without replacement, so at most ``self.length`` rows
        come back even when ``batch_size`` is larger.
        """
        # Shuffle row indices rather than a copy of the whole table.
        chosen = np.random.permutation(self.length)[:batch_size]
        sample = self.df_np[chosen]
        return torch.from_numpy(sample[:, :4]), torch.from_numpy(sample[:, 4])

# Load the Iris data once; the training and fitness helpers below sample from this module-level instance.
dataset = IrisFlowers("./iris.csv")

In [None]:
# @title The Innovation Database Class.
class InnovationDatabase:
    """Two-way registry between edges and their innovation numbers.

    ``n2i`` maps an edge (any non-``int`` hashable, e.g. an ``(in, out)`` tuple)
    to its innovation number and ``i2n`` maps the number back to the edge.
    ``innovation_number`` is the next number that ``innovation`` hands out.
    """

    def __init__(self):
        self.i2n = {}
        self.n2i = {}
        self.innovation_number = 1

    def __setitem__(self, key, value):
        """Register the pair ``key <-> value`` in both directions.

        An ``int`` key is treated as an innovation number, anything else as an edge.

        Raises:
            ValueError: if either side of the pair is already registered, so an
                existing entry can never be overwritten or left half-updated.
        """
        if key in self.i2n or key in self.n2i:
            raise ValueError(f"Not Updating Database. Key {key} already exists.")
        # Also refuse a value that is already in use: accepting it would
        # overwrite the reverse map and break the one-to-one pairing.
        if value in self.i2n or value in self.n2i:
            raise ValueError(f"Not Updating Database. Value {value} already exists.")
        if isinstance(key, int):
            self.i2n[key] = value
            self.n2i[value] = key
        else:
            self.n2i[key] = value
            self.i2n[value] = key

    def __getitem__(self, key):
        """Look up an edge by innovation number (``int`` key) or a number by edge."""
        if isinstance(key, int):
            return self.i2n[key]
        return self.n2i[key]

    def __str__(self):
        return str(self.i2n)

    def innovation(self, edge):
        """Return the innovation number of ``edge``, assigning the next free one if it is new."""
        if edge not in self.n2i:
            self[edge] = self.innovation_number
            self.innovation_number += 1
        return self[edge]

    def reverse_innovation(self, innovation_number):
        """Return the edge registered under ``innovation_number``, or ``None`` if unknown."""
        return self.i2n.get(innovation_number)

# Single shared registry: the Network class below looks up edge innovation numbers through this instance.
database = InnovationDatabase()

Create the class that defines each individual in a species. The individual class is built around a `torch.nn.Module` and an `nx.DiGraph`. The network is evaluated by traversing the DiGraph, and during backpropagation the computation graph constructed by that traversal is used to update the weights of the model. Modifying the DiGraph is how mutations are applied to the genome of the individual.

The below implementation has the following features:
    - Given a graph, the class adapts itself to that network.
    - Returning a copy of the graph as a numpy array rather than as a torch array (to make sure the parameters are immutable).
    - Evaluation
    - Mutations (new edge, new vertex).

In [None]:
# @title The Individual Class that controls all the evaluation, learning, and mutations.
class Network(nn.Module):
    """A trainable feed-forward network whose topology lives in an ``nx.DiGraph``.

    Each node carries a ``"node"`` attribute (``"input"``, ``"output"`` or
    ``"hidden"``); each edge carries ``"enabled"`` and ``"i_n"`` (its innovation
    number). Every node owns a scalar bias stored in ``self.params`` under
    ``str(node)`` and every edge a scalar weight under ``f"{src}_{dst}"``.

    Args:
        inputs: number of input nodes (ids ``1 .. inputs``).
        outputs: number of output nodes (ids ``inputs + 1 .. inputs + outputs``).
        graph: optional genome in the format produced by ``get_graph``. When
            given, the network adopts it instead of building the default
            fully connected input -> output topology.
    """

    def __init__(self, inputs: int, outputs: int, graph: nx.DiGraph = None):
        super().__init__()

        self.graph = nx.DiGraph()
        self.inputs_and_outputs = (inputs, outputs)
        self.params = nn.ParameterDict()

        if graph is not None:
            self.set_graph(graph)
            return

        input_ids = range(1, inputs + 1)
        output_ids = range(inputs + 1, inputs + outputs + 1)

        self.graph.add_nodes_from((i, {"node": "input"}) for i in input_ids)
        self.graph.add_nodes_from((o, {"node": "output"}) for o in output_ids)

        # One bias per node, created exactly once.
        for node in itertools.chain(input_ids, output_ids):
            self.params[str(node)] = nn.Parameter(torch.normal(0, 1, size=()))

        # Fully connect every input to every output.
        for i in input_ids:
            for o in output_ids:
                self.params[f"{i}_{o}"] = nn.Parameter(torch.normal(0, 1, size=()))
                self.graph.add_edge(i, o, enabled=True, i_n=database.innovation((i, o)))

    def _weight_copy(self, key):
        """Return the parameter stored under ``key`` as an independent NumPy array."""
        # ``.numpy()`` shares memory with the tensor, so copy explicitly.
        return self.params[key].detach().cpu().numpy().copy()

    @staticmethod
    def _as_parameter(weight):
        """Wrap ``weight`` (array, tensor or float) in a parameter that owns its storage."""
        # ``torch.as_tensor`` may alias the caller's buffer; clone to break the link.
        return nn.Parameter(torch.as_tensor(weight).detach().clone())

    def get_graph(self):
        """Return the genome as a new ``nx.DiGraph`` with weights copied to NumPy arrays.

        The returned graph shares no storage with this network, so editing or
        training from it never changes this network's parameters.
        """
        graph = nx.DiGraph()

        for node, attrs in self.graph.nodes(data=True):
            graph.add_node(node, weight=self._weight_copy(str(node)), node=attrs["node"])

        for i, o, attrs in self.graph.edges(data=True):
            graph.add_edge(
                i,
                o,
                weight=self._weight_copy(f"{i}_{o}"),
                enabled=attrs["enabled"],
                i_n=attrs["i_n"],
            )

        return graph

    def set_graph(self, graph):
        """Discard the current topology and parameters and adopt ``graph``.

        ``graph`` must provide ``weight`` and ``node`` on every node and
        ``weight``, ``enabled`` and ``i_n`` on every edge. Weights are copied, so
        the new parameters do not alias the arrays or tensors held by ``graph``.
        """
        self.params.clear()
        self.graph = nx.DiGraph()

        for node, attrs in graph.nodes(data=True):
            self.params[str(node)] = self._as_parameter(attrs["weight"])
            self.graph.add_node(node, node=attrs["node"])

        for i, o, attrs in graph.edges(data=True):
            self.params[f"{i}_{o}"] = self._as_parameter(attrs["weight"])
            self.graph.add_edge(i, o, enabled=attrs["enabled"], i_n=attrs["i_n"])

    @staticmethod
    def get_inputs_and_outputs(graph):
        """Return ``(input_ids, output_ids)`` of ``graph`` as two sets."""
        inputs, outputs = set(), set()
        # Scan every node instead of stopping at the first hidden one, so the
        # result does not depend on the order nodes were inserted.
        for node, attrs in graph.nodes(data=True):
            kind = attrs["node"]
            if kind == "input":
                inputs.add(node)
            elif kind == "output":
                outputs.add(node)
        return inputs, outputs

    def forward(self, x):
        """Evaluate the network on a batch and return softmax class probabilities.

        Args:
            x: 2-D tensor; column ``k`` feeds the input node with the ``k``-th
                smallest id.

        Returns:
            Tensor of shape ``(x.shape[0], number_of_outputs)`` whose column ``k``
            belongs to the output node with the ``k``-th smallest id.
        """
        inputs, outputs = Network.get_inputs_and_outputs(self.graph)
        # Sets give no ordering guarantee; sort so the column <-> node mapping is fixed.
        input_ids, output_ids = sorted(inputs), sorted(outputs)
        values = {node: x[:, column] for column, node in enumerate(input_ids)}

        # Seed the work list with every node fed directly by an input.
        pending = []
        for i in input_ids:
            for s in self.graph.successors(i):
                if self.graph.edges[(i, s)]["enabled"] and s not in pending:
                    pending.append(s)

        while pending:
            node = pending.pop(0)

            # A node may be queued more than once; evaluate it only once.
            if node in values:
                continue

            parents = {
                parent
                for parent in self.graph.predecessors(node)
                if self.graph.edges[(parent, node)]["enabled"]
            }
            missing = parents - values.keys()

            if missing:
                # Not ready yet: queue the unevaluated parents and retry this node later.
                pending.extend((missing | {node}) - set(pending))
                continue

            successors = {
                child
                for child in self.graph.successors(node)
                if self.graph.edges[(node, child)]["enabled"]
            } - set(pending)

            # Out-of-place sums keep autograd and broadcasting straightforward.
            value = 0
            for parent in parents:
                value = value + self.params[f"{parent}_{node}"] * values[parent]
            value = value + self.params[f"{node}"]
            values[node] = torch.relu(value)

            pending.extend(successors)

        output = torch.zeros((x.shape[0], len(output_ids)))
        for column, node in enumerate(output_ids):
            output[:, column] = values[node]
        return torch.softmax(output, dim=-1)

    def mutate_vertex(self):
        """Split a random enabled edge by inserting a new hidden node.

        The chosen edge is disabled and replaced by ``in -> new`` and
        ``new -> out`` edges with freshly sampled weights. Does nothing when the
        genome has no enabled edge.
        """
        self.zero_grad()
        genome = self.get_graph()
        enabled_edges = [edge for edge in genome.edges if genome.edges[edge]["enabled"]]
        if not enabled_edges:
            return

        in_node, out_node = random.choice(enabled_edges)
        genome.edges[(in_node, out_node)]["enabled"] = False

        # The next free integer id becomes the new hidden node.
        new_node = max(genome.nodes) + 1
        genome.add_node(new_node, weight=torch.normal(0, 1, size=()), node="hidden")
        # Draw every new weight from torch so one seed controls all of them.
        genome.add_edges_from([
            (in_node, new_node, {"weight": torch.normal(0, 1, size=()),
                                 "enabled": True,
                                 "i_n": database.innovation((in_node, new_node))}),
            (new_node, out_node, {"weight": torch.normal(0, 1, size=()),
                                  "enabled": True,
                                  "i_n": database.innovation((new_node, out_node))}),
        ])
        self.set_graph(genome)

    def mutate_connection(self):
        """Enable or add one random connection that keeps the graph acyclic.

        Candidates run from any non-output node to any non-input node, excluding
        self-loops, edges that are already enabled, and edges whose target is an
        ancestor of their source. A disabled edge that is picked is re-enabled;
        otherwise a new edge with a random weight is added. Does nothing (and
        returns ``None``) when no candidate exists.
        """
        genome = self.get_graph()
        inputs, outputs = Network.get_inputs_and_outputs(genome)
        nodes = set(genome.nodes)

        already_enabled = {edge for edge in genome.edges if genome.edges[edge]["enabled"]}
        ancestors_of = {}  # per-source cache so ancestors are computed once per node
        candidates = []
        for source, target in itertools.product(nodes - outputs, nodes - inputs):
            if source == target or (source, target) in already_enabled:
                continue
            if source not in ancestors_of:
                ancestors_of[source] = nx.ancestors(genome, source)
            if target not in ancestors_of[source]:
                candidates.append((source, target))

        if not candidates:
            return

        # Sort so the pick depends only on the random seed, not on set ordering.
        random_gene = random.choice(sorted(candidates))
        if random_gene in genome.edges:
            genome.edges[random_gene]["enabled"] = True
        else:
            genome.add_edge(
                *random_gene,
                weight=torch.normal(0, 1, size=()),
                enabled=True,
                i_n=database.innovation(random_gene),
            )
        self.set_graph(genome)

    def plot(self, with_weights=False):
        """Draw the genome: nodes coloured by role, edges green if enabled and red if not.

        Args:
            with_weights: when true, label each edge with its weight rounded to
                two decimals.
        """
        palette = {"hidden": "blue", "input": "green", "output": "red"}
        node_colors = [palette[kind] for _, kind in self.graph.nodes(data="node")]
        edge_colors = [
            "green" if enabled else "red"
            for _, _, enabled in self.graph.edges(data="enabled")
        ]
        pos = nx.circular_layout(self.graph)

        nx.draw(self.graph, pos=pos, with_labels=True, node_color=node_colors, edge_color=edge_colors)
        if with_weights:
            edge_labels = {
                (i, o): round(self.params[f"{i}_{o}"].item(), 2)
                for i, o in self.graph.edges
            }
            nx.draw_networkx_edge_labels(self.graph, pos=pos, edge_labels=edge_labels, font_color="green")
        plt.show()

    def clone(self):
        """Return a new ``Network`` with the same topology and independent copies of all weights."""
        inputs, outputs = self.inputs_and_outputs
        return Network(inputs, outputs, self.get_graph())

Perform tests to see if the graphs work.

In [None]:
# Graph Creation: 4 input nodes fully connected to 3 output nodes.
net = Network(4, 3)

# Graph Mutations: alternate connection and vertex mutations to grow the topology.
for _ in range(10):
    net.mutate_connection()
    net.mutate_vertex()

# Graph Evaluation: push one random batch of 16 rows through the mutated network.
x, y = dataset.batched_sample(16)
print(f"Input Shape: {x.shape}")
print(f"Output Shape: {net(x).shape}")

# Plotting
net.plot()

## Generation

As pointed out before, the process in each generation goes as follows, with hyperparameters:

$$
\mu = \text{Population Size}\\
\Delta = \text{Maximum Age of an individual}\\
\lambda = \text{Number of offsprings}
$$

During each generation, given $\mu$ individuals, the individuals first generate offspring. The offspring are then mutated. The offspring go through a learning phase where they are trained using SGD for a maximum of $\Delta$ iterations. Finally, the fitness score is calculated for each individual in the population and the offsprings and the best are kept for the next generation.

First things first, test the algorithm without crossover.

In [None]:
mu = 10  # Population size.
delta = 200  # Number of SGD iterations in each individual's learning phase.
lmbda = 10  # Number of offspring generated per generation.
batch_size = 32  # Batch size used for SGD.
learning_rate = 1e-4  # Learning rate passed to the optimizer.
generations = 10  # Number of generations to run.

In [None]:
def learn(net):
    """Train ``net`` in place for ``delta`` Adam steps and return the per-step losses.

    Reads the module-level ``dataset``, ``batch_size``, ``learning_rate`` and
    ``delta``.

    Args:
        net: model whose forward pass returns class probabilities (the
            ``Network`` class ends its forward pass with a softmax).

    Returns:
        List of ``delta`` Python floats, one loss value per iteration.
    """
    optimizer = torch.optim.Adam(params=net.parameters(), lr=learning_rate)
    criterion = nn.NLLLoss()
    losses = []
    for _ in range(delta):
        optimizer.zero_grad()
        x, y = dataset.batched_sample(batch_size)
        y = y.to(torch.long)

        probabilities = net(x)
        # The network already outputs softmax probabilities. CrossEntropyLoss
        # expects raw logits and would apply a second softmax, flattening the
        # gradients; take the log ourselves and use the negative log-likelihood.
        # The clamp keeps log() finite when a probability underflows to zero.
        loss = criterion(torch.log(probabilities.clamp_min(1e-12)), y)

        loss.backward()
        optimizer.step()

        losses.append(float(loss.item()))

    return losses

def fitness(net, sample_size=None):
    """Return the classification accuracy of ``net`` on one random batch.

    Args:
        net: model mapping a feature batch to per-class scores.
        sample_size: number of rows to evaluate on. Defaults to the
            module-level ``batch_size`` so that ``fitness(net)`` works.

    Returns:
        Fraction of correctly classified rows, as a Python float in ``[0, 1]``.
    """
    if sample_size is None:
        sample_size = batch_size

    x, y = dataset.batched_sample(sample_size)
    # Scoring needs no gradients; skip building the autograd graph.
    with torch.no_grad():
        predictions = torch.argmax(net(x), dim=-1)

    # Average over the rows actually returned: the dataset can hand back fewer
    # than ``sample_size`` rows, and dividing by the request would understate accuracy.
    return float((predictions == y).float().mean())

def fitness_population(population, sample_size=None):
    """Return a NumPy array holding the fitness score of every individual.

    Args:
        population: iterable of models accepted by ``fitness``.
        sample_size: rows evaluated per individual; defaults to the
            module-level ``batch_size``.
    """
    if sample_size is None:
        sample_size = batch_size
    # ``fitness`` needs the sample size; calling it with the model alone raises TypeError.
    return np.array([fitness(individual, sample_size) for individual in population])

def generate_offsprings(population, mutate_vertex_prob=0.3):
    """Create ``lmbda`` mutated clones of randomly chosen parents.

    Args:
        population: sequence of individuals exposing ``clone``,
            ``mutate_vertex`` and ``mutate_connection``.
        mutate_vertex_prob: chance that a child receives a vertex mutation;
            otherwise it receives a connection mutation.

    Returns:
        List of ``lmbda`` new individuals; the parents are left untouched.
    """
    children = []

    for _ in range(lmbda):
        child = random.choice(population).clone()

        # Exactly one kind of mutation is applied to each child.
        if random.random() < mutate_vertex_prob:
            child.mutate_vertex()
        else:
            child.mutate_connection()

        children.append(child)

    return children

In [None]:
# Build the initial population, train every individual, then report (max, mean, min) fitness.
population = [Network(4, 3) for _ in range(mu)]
losses = [learn(i) for i in population]
fitness_scores = fitness_population(population)
np.max(fitness_scores), np.mean(fitness_scores), np.min(fitness_scores)

In [None]:
# @title The process for one iteration.
# Only the freshly mutated offspring are trained here.
offsprings = generate_offsprings(population)
offspring_losses = [learn(individual) for individual in offsprings]

# Rank parents and offspring together and keep the mu fittest.
combined = population + offsprings
# fitness() needs the evaluation sample size; use the shared batch size.
combined_fitness = [fitness(individual, batch_size) for individual in combined]
sorted_fitness_indices = np.flip(np.argsort(combined_fitness))

new_population = [combined[i] for i in sorted_fitness_indices[:mu]]

fitness_scores = fitness_population(new_population)
np.max(fitness_scores), np.mean(fitness_scores), np.min(fitness_scores)

## Complete Algorithm

The algorithm below implements all of these ideas: each new generation is produced by mutation only, and SGD is used for learning within each generation. Since the existing population was already trained in a previous generation, it does not need to be trained again; in each generation only the newly mutated offspring are trained. If a mutation works, training should leave the offspring with the optimal weight values for its new topology.

In [None]:
# Hyper parameters
mu = 15 # Population Size
delta = 300 # The Gestation Period (learning phase) of each individual.
lmbda = 10 # The number of offsprings generated
batch_size = 64 # The batch size used for SGD and fitness score calculation.
learning_rate = 1e-4 # The learning rate for SGD.
generations = 30 # The number of generations to perform the evolution for.
scaling_factor = 10 # The scaling factor for the sine curve.

# Per-generation probability of a vertex mutation, following a sine curve.
# sin(...) + 1 lies in [0, 2]; halve it so every entry is a valid probability.
# Without the rescale the value is >= 1 for all 30 generations, so a
# connection mutation would never be chosen.
mutate_vertex_probs = (np.sin(np.arange(generations) / scaling_factor) + 1) / 2

# Keep track of some stuff.
generational_losses = []
generational_fitness = []

population = [Network(4, 3) for _ in range(mu)]
losses = np.array([learn(i) for i in population]) # The initial population has to learn too.
fitness_scores = fitness_population(population)

generational_losses.append(np.mean(losses))
generational_fitness.append((np.max(fitness_scores), np.mean(fitness_scores), np.min(fitness_scores)))

print(generational_fitness)

In [None]:
for g in range(generations):

    # Only the new offspring are trained; survivors keep the weights they already learned.
    offsprings = generate_offsprings(population, mutate_vertex_prob=mutate_vertex_probs[g])
    offspring_losses = [learn(individual) for individual in offsprings]

    # Find the mu best amongst the offspring and the population.
    combined = population + offsprings
    # fitness() needs the evaluation sample size; use the shared batch size.
    combined_fitness = [fitness(individual, batch_size) for individual in combined]
    sorted_fitness_indices = np.flip(np.argsort(combined_fitness))
    survivors = sorted_fitness_indices[:mu]
    new_population = [combined[i] for i in survivors]

    fitness_scores = fitness_population(new_population)
    generational_losses.append(np.mean(offspring_losses))
    generational_fitness.append((np.max(fitness_scores), np.mean(fitness_scores), np.min(fitness_scores)))

    population = new_population

    # In `combined`, indices 0 .. mu-1 are the old population and mu onwards are
    # offspring, so an offspring index satisfies `>= mu` (not `> mu`).
    replacements = np.sum(survivors >= mu)
    # True when the top-ranked survivor is one of this generation's offspring.
    max_replaced = survivors[0] >= mu

    print(f"Generation: {g + 1} / {generations}: {generational_fitness[-1]}\tReplacements: {replacements}, Max Replace: {max_replaced}")

In [None]:

# One row per recorded generation (including the initial population), one column per statistic.
fitness_scores = pd.DataFrame(generational_fitness, columns=["Max", "Mean", "Min"])
ax = fitness_scores.plot()
plt.xlabel("Generation #")
plt.ylabel("Fitness Score (out of 1)")
plt.title("Fitness Scores Over Generations")

# population[0] is the top-ranked survivor of the last generation; keep its genome.
best_individual = population[0].get_graph()
# population[0].plot()