# Import

In [16]:
import numpy as np
from random import choice, random, randint, sample
import copy

# Operations

In [17]:
OPERATIONS = [
    (np.add, 2, "({} + {})"),
    (np.subtract, 2, "({} - {})"),
    (np.divide, 2, "({} / {})"),
    (np.sin, 1, "sin({})"),
]

# Genothype definition

Let's consider the genotype as a list of elements with the following structure: [operator, first operand, secondo operand].
In this way we are creating a recursive function that computes a valid formula and represent it as a list (example: [ "+", ["sin", "x[0]" ], "x[1]"]).

In [18]:
def random_program(depth, input_dim):
    if depth == 0 or random() < 0.3:
        return f"x[{randint(0, input_dim - 1)}]" #new leaf node

    op, arity, symbol = choice(OPERATIONS)
    children = [random_program(depth - 1, input_dim) for _ in range(arity)]
    return [symbol] + children  

In [19]:
random_program(1, 4)

['({} - {})', 'x[0]', 'x[3]']

Now we need a function that given the genotype provide us with the output provided by the predicted function. This function must receive the input vector to perform his operation.

In [20]:
def evaluate_program(program, x):
    if isinstance(program, str):  # Leaf node
        return x[int(program[2:-1])]  # extract the value
    elif isinstance(program, list): 
        op = next(op for op, _, symbol in OPERATIONS if symbol == program[0])
        args = [evaluate_program(child, x) for child in program[1:]]
        try:
            return op(*args)
        except ZeroDivisionError:
            return np.inf

In [21]:
x = [2,3,4, 6]
program = ['({} / {})', 'x[1]', 'x[0]']
evaluate_program(program, x)

np.float64(1.5)

As you may notice this function verify with the function __isinstance(element, type)__ if "element" is an instance of the "type", with the objective of understanding if it is a __leaf node__. If this is the case we simply extract the value out of it, otherwise we still need to invoke the function recursively.

## Fitness function

For now, simply consider the fitness function of a solution as it's mean square error compared to the expected results.

In [22]:
def fitness_function(program, x, y):
    predictions = np.array([evaluate_program(program, x_row) for x_row in x.T])
    return np.mean((predictions - y) ** 2)

In [23]:
x = np.random.uniform(-1, 1, size=(2, 1000))  # 2 feature, 1000 esempi
y = x[0] + np.sin(x[1]) / 5
#fitness_function(program, x, y)

## Tweak function

There is a lot of __variability__ that has to be considered for the tweak function. We can now imagine to implement a recursive function that receives the program (which indicates the current function that we are using for the task), the number of dimensions for the input and the maximum depth allowed for the tweaked solution.

Recursively, if we end up into a leaf node, or if the current solution is still a list but with 0.3 probability, we simply generate a new sub-program.

Otherwise, we simply invoke the same function for a random index.

In [24]:
def mutate_program(program, input_dim, depth=3):
    if random() < 0.3 or not isinstance(program, list):  
        return random_program(depth, input_dim)
    idx = randint(1, len(program) - 1)
    program[idx] = mutate_program(program[idx], input_dim, depth - 1)
    return program

Other function

In [None]:
def mutate(program, input_dim, max_depth=3):
    """Mutazione di un programma."""
    mutant = copy.deepcopy(program)

    def get_subtree_points(prog):
        return [i for i, node in enumerate(prog) if isinstance(node, list)]

    points = get_subtree_points(program)
    if not points:
        return random_program(max_depth, input_dim)  # Ritorna un nuovo programma se non ci sono punti

    point = choice(points)

    # Genera un nuovo sottoalbero casuale
    new_subtree = random_program(randint(1, max_depth), input_dim)
    mutant[point] = new_subtree

    return mutant

## Crossover

We can use a croossover function that receives only 2 parents and, if one of them is a leaf program simply return casually one of the 2 programs (avoiding to perform the operation for programs with no childrens). Otherwise, select random indexes for both the parents and combine the first part of the tree with the second part of the tree of the 2 parents, returning a new individual.

In [25]:
def crossover(parent1, parent2, max_depth=3):
    """Crossover tra due programmi."""
    child1, child2 = copy.deepcopy(parent1), copy.deepcopy(parent2)

    if len(parent1) < 2 or len(parent2) < 2:
        return parent1, parent2  # Evita crossover se i programmi sono troppo piccoli

    # Punti di crossover validi
    def get_subtree_points(prog):
        return [i for i, node in enumerate(prog) if isinstance(node, list)]

    points1 = get_subtree_points(parent1)
    points2 = get_subtree_points(parent2)

    if not points1 or not points2:
        return parent1, parent2  # Nessun punto valido per il crossover

    point1 = choice(points1)
    point2 = choice(points2)

    # Scambia i sottoalberi
    subtree1 = parent1[point1]
    subtree2 = parent2[point2]

    child1[point1], child2[point2] = subtree2, subtree1

    return child1, child2

## Genetic algorithm

### Parameter selection

In [32]:
generations=50
population_size=100
offspring_size = 50

### Data selection

In [27]:
x = np.random.uniform(-1, 1, size=(2, 1000))  # 2 feature, 1000 esempi
y = x[0] + np.sin(x[1]) / 5

In [None]:
input_dim = x.shape[0]
population = [random_program(3, input_dim) for _ in range(population_size)]

for gen in range(generations):
    fitness = np.array([fitness_function(prog, x, y) for prog in population])

    # Seleziona la metà migliore della popolazione
    sorted_indices = fitness.argsort()
    population = [population[i] for i in sorted_indices[:population_size // 2]]
    fitness = fitness[sorted_indices[:population_size // 2]]

    print(f"Generazione {gen + 1}, miglior fitness: {fitness[0]:.6f}")

    # Nuova generazione con crossover e mutazione
    offspring = []
    while len(offspring) < offspring_size:
        parent1 = choice(population)
        parent2 = choice(population)
        child1, child2 = crossover(parent1, parent2)
        offspring.append(mutate_program(child1, input_dim))
        if len(offspring) < population_size:
            offspring.append(mutate_program(child2, input_dim))

    population.append(offspring)
    #best 100:
    sorted_indices = fitness.argsort()
    population = [population[i] for i in sorted_indices[:population_size]]

# Ritorna il migliore
best_program = population[0]
best_fitness = fitness_function(best_program, x, y)
print("Miglior programma:", best_program, "; Fitness:", best_fitness)

  return op(*args)


In [30]:

# Parametri del GP
generations = 50
population_size = 100
p_crossover = 0.7
p_mutation = 0.3
n_trial = 5
max_depth = 3

x = np.random.uniform(-1, 1, size=(2, 1000))  # 2 feature, 1000 esempi
y = x[0] + np.sin(x[1]) / 5

input_dim = x.shape[0]
population = [random_program(3, input_dim) for _ in range(population_size)]

for gen in range(generations):
    fitness = np.array([fitness_function(prog, x, y) for prog in population])

    # Seleziona la metà migliore della popolazione
    sorted_indices = fitness.argsort()
    population = [population[i] for i in sorted_indices[:population_size // 2]]
    fitness = fitness[sorted_indices[:population_size // 2]]

    print(f"Generazione {gen + 1}, miglior fitness: {fitness[0]:.6f}")

    next_population = [copy.deepcopy(ind) for ind in population]
    ori_expr = [str(ind) for ind in next_population]

    # Crossover
    for i in range(1, len(next_population), 2):
        if random() < p_crossover:
            c1_updated, c2_updated = False, False
            for _ in range(n_trial):
                if c1_updated and c2_updated:
                    break
                c1, c2 = crossover(next_population[i - 1], next_population[i])
                if n_trial <= 1:
                    next_population[i - 1], next_population[i] = c1, c2
                    c1_updated, c2_updated = True, True
                else:
                    if not c1_updated and str(c1) not in ori_expr:
                        next_population[i - 1] = c1
                        ori_expr.append(str(c1))
                        c1_updated = True
                    if not c2_updated and str(c2) not in ori_expr:
                        next_population[i] = c2
                        ori_expr.append(str(c2))
                        c2_updated = True

    # Mutazione
    for i in range(len(next_population)):
        if random() < p_mutation:
            for _ in range(n_trial):
                mutant = mutate(next_population[i], input_dim)
                if n_trial <= 1 or str(mutant) not in ori_expr:
                    next_population[i] = mutant
                    ori_expr.append(str(mutant))
                    break

    # Aggiorna la popolazione
    population = next_population

# Ritorna il migliore
best_program = population[0]
best_fitness = fitness_function(best_program, x, y)
print("Miglior programma:", best_program, "; Fitness:", best_fitness)


  return op(*args)


Generazione 1, miglior fitness: 0.004515
Generazione 2, miglior fitness: 0.010800
Generazione 3, miglior fitness: 0.010800
Generazione 4, miglior fitness: 0.010800
Generazione 5, miglior fitness: 0.010800
Generazione 6, miglior fitness: 0.010800
Generazione 7, miglior fitness: 0.010800
Generazione 8, miglior fitness: 0.010800
Generazione 9, miglior fitness: 0.010800
Generazione 10, miglior fitness: 0.010800


  return op(*args)


Generazione 11, miglior fitness: 0.010800
Generazione 12, miglior fitness: 0.010800
Generazione 13, miglior fitness: 0.014799
Generazione 14, miglior fitness: 0.014799
Generazione 15, miglior fitness: 0.012341
Generazione 16, miglior fitness: 0.006348
Generazione 17, miglior fitness: 0.006348
Generazione 18, miglior fitness: 0.010800


  return op(*args)


Generazione 19, miglior fitness: 0.012341
Generazione 20, miglior fitness: 0.014799
Generazione 21, miglior fitness: 0.014799
Generazione 22, miglior fitness: 0.010800
Generazione 23, miglior fitness: 0.010800
Generazione 24, miglior fitness: 0.010800
Generazione 25, miglior fitness: 0.010800
Generazione 26, miglior fitness: 0.010800
Generazione 27, miglior fitness: 0.010800
Generazione 28, miglior fitness: 0.010800
Generazione 29, miglior fitness: 0.010800
Generazione 30, miglior fitness: 0.010800
Generazione 31, miglior fitness: 0.010800
Generazione 32, miglior fitness: 0.010800
Generazione 33, miglior fitness: 0.010800
Generazione 34, miglior fitness: 0.010800
Generazione 35, miglior fitness: 0.010800
Generazione 36, miglior fitness: 0.014799
Generazione 37, miglior fitness: 0.010800
Generazione 38, miglior fitness: 0.010800
Generazione 39, miglior fitness: 0.007088
Generazione 40, miglior fitness: 0.010800
Generazione 41, miglior fitness: 0.010800
Generazione 42, miglior fitness: 0

  return op(*args)


Generazione 48, miglior fitness: 0.010800
Generazione 49, miglior fitness: 0.010800
Generazione 50, miglior fitness: 0.010800
Miglior programma: ['({} + {})', ['({} - {})', 'x[0]', 'x[0]'], 'x[0]'] ; Fitness: 0.010799566030572581
