# Import

In [211]:
import numpy as np
from random import choice, random, randint, sample
import copy
from dataclasses import dataclass

# Operations

In [212]:
# Primitive set used to build expression trees.
# Each entry is (numpy callable, arity, format template). The template string
# is also what a genome stores as the operator of a node, so it serves both as
# the operator's identifier and as the pattern used to pretty-print it.
OPERATIONS = [
    # Binary operators
    (np.add, 2, "({} + {})"),
    (np.subtract, 2, "({} - {})"),
    (np.divide, 2, "({} / {})"),
    (np.multiply, 2, "({} * {})"),
    # Unary operators
    (np.sin, 1, "sin({})"),
    (np.cos, 1, "cos({})"),
    (np.tan, 1, "tan({})"),
    (np.exp, 1, "exp({})"),
    (np.log, 1, "log({})"),
    (np.sqrt, 1, "sqrt({})"),
    (np.square, 1, "square({})"),
]

# Genotype definition

Let's consider the genotype as a nested list with the following structure: [operator, first operand, second operand] (unary operators have a single operand).
In this way a recursive function can build a valid formula and represent it as a list. Operators are stored as their format template, so an example genome is `["({} + {})", ["sin({})", "x[0]"], "x[1]"]`, which reads as `(sin(x[0]) + x[1])`.

In [213]:
def random_program(depth, input_dim, unary=False):
    """Build a random expression tree with at most ``depth`` operator levels.

    A leaf is produced when ``depth`` is 0, and otherwise with probability 0.2.
    A leaf is either an integer constant in [2, 10] rendered as a string
    (probability 0.4) or a variable reference ``"x[i]"`` with
    ``0 <= i < input_dim``. When ``unary`` is true, i.e. the parent node is a
    one-argument operator, the leaf is always a variable, never a constant.

    An internal node is the list ``[template, child, ...]`` where ``template``
    is the format string of an entry drawn uniformly from ``OPERATIONS``.
    """
    stop_here = depth == 0 or random() < 0.2
    if stop_here:
        # The draw happens before the flag is looked at, so one random number
        # is consumed whether or not constants are allowed.
        constant_drawn = random() < 0.4
        if constant_drawn and not unary:
            return str(randint(2, 10))
        return f"x[{randint(0, input_dim - 1)}]"

    _fn, n_args, template = choice(OPERATIONS)
    # Children of a one-argument operator must not be bare constants.
    parent_is_unary = n_args == 1
    node = [template]
    for _ in range(n_args):
        node.append(random_program(depth - 1, input_dim, parent_is_unary))
    return node

## Individual

In [214]:
@dataclass
class Individual:
    """A member of the population: a program together with its cached fitness."""
    # Program tree: a nested list, or a single string when the program is a leaf.
    genome: list
    # Fitness of the genome (selection sorts ascending, so lower is better);
    # stays None until it has been evaluated.
    fitness : float = None

## Transform the program into a human readable function

In [215]:
def program_to_string(program):
    """Render a program tree as a human-readable formula.

    Leaves (strings) are returned unchanged. For a list node the operator
    template stored in position 0 is looked up in ``OPERATIONS`` and filled
    with the rendered children.

    Raises:
        ValueError: if the node's operator is not present in ``OPERATIONS``.
    """
    if isinstance(program, str):
        return program

    if isinstance(program, list):
        for _fn, _arity, template in OPERATIONS:
            if template == program[0]:
                break
        else:
            raise ValueError(f"Not known operation: {program[0]}")

        rendered = []
        for child in program[1:]:
            rendered.append(program_to_string(child))
        return template.format(*rendered)


Now we need a function that, given the genotype, returns the output of the formula it encodes. This function must receive the input vector in order to perform its computation.

In [216]:
def evaluate_program(program, x):
    """Evaluate a program tree on one input vector.

    Args:
        program: a leaf (``"x[i]"`` variable reference, a numeric constant
            written as a string, or a plain number) or a list node
            ``[template, child, ...]`` whose template identifies an entry of
            ``OPERATIONS``.
        x: indexable input vector; ``"x[i]"`` reads ``x[i]``.

    Returns:
        The value computed by the program for ``x``.

    Raises:
        ValueError: if a node uses an operator that is not in ``OPERATIONS``
            (same error type as ``program_to_string``), or a constant string
            is not a number.
        TypeError: if a node is neither a string, a number nor a list.
    """
    if isinstance(program, str):  # Leaf node: variable or constant
        if program[0] == 'x':
            # "x[12]" -> 12
            return x[int(program[2:-1])]
        try:
            return int(program)
        except ValueError:
            # Accept non-integer constants such as "2.5" as well.
            return float(program)

    if isinstance(program, (int, float)):
        # Numeric leaves are accepted as-is.
        return program

    if isinstance(program, list):
        # A default is passed to next() so that an unknown operator cannot
        # leak a bare StopIteration to the caller.
        op = next((fn for fn, _, symbol in OPERATIONS if symbol == program[0]), None)
        if op is None:
            raise ValueError(f"Not known operation: {program[0]}")
        args = [evaluate_program(child, x) for child in program[1:]]
        try:
            return op(*args)
        except ZeroDivisionError:
            # NumPy ufuncs normally report division by zero with a warning and
            # an inf result; this guard covers callables that raise instead.
            return np.inf

    raise TypeError(f"Unsupported program node: {program!r}")

As you may notice, this function uses __isinstance(element, type)__ to check whether "element" is an instance of "type", in order to understand whether it is a __leaf node__. If it is, we simply extract its value; otherwise we invoke the function recursively on its children.

## Fitness function

For now, simply consider the fitness of a solution to be its mean squared error with respect to the expected results.

In [217]:
# Scoring function that, per the original note, matches the one used by the professors.
def fitness_evaluation_result(program, x, y):
    """Return 100 times the mean squared error of ``program`` on ``(x, y)``.

    ``x`` is iterated through its transpose, so each evaluated input vector is
    one column of ``x``; ``y`` holds the matching targets.
    """
    outputs = [evaluate_program(program, point) for point in x.T]
    residuals = np.array(outputs) - y
    mean_squared = np.mean(np.square(residuals))
    return float(mean_squared * 100)

def fitness_function(program, x, y):
    """Return the plain mean squared error of ``program`` on ``(x, y)``.

    Each column of ``x`` is evaluated as one input vector and compared with
    the corresponding entry of ``y``.
    """
    outputs = np.array([evaluate_program(program, point) for point in x.T])
    errors = outputs - y
    return np.mean(errors ** 2)

In [218]:
# Synthetic sanity-check data: 2 features (rows), 1000 examples (columns).
x = np.random.uniform(-1, 1, size=(2, 1000))
# Target: the first feature plus a scaled sine of the second one.
y = x[0] + np.sin(x[1]) / 5
# Example call, left disabled:
#fitness_evaluation_result(program, x, y)

## Tweak function

There is a lot of __variability__ to consider in the tweak function. We implement it as a recursive function that receives the program (the formula currently used for the task), the number of input dimensions, and the maximum depth allowed for the newly generated sub-program.

At each step, if we have reached a leaf node, or with probability 0.3 even when the current node is still a list, we simply generate a new sub-program in its place.

Otherwise, we simply invoke the same function for a random index.

In [219]:
def mutate_program(program, input_dim, depth=3):
    """Return a mutated copy of ``program``; the input tree is left untouched.

    Walking down from the root, each visited node is replaced by a fresh
    random sub-program with probability 0.3, or unconditionally when it is a
    leaf; otherwise the walk continues into one randomly chosen child.

    Args:
        program: program tree (nested list) or leaf (string).
        input_dim: number of input variables available to new sub-programs.
        depth: depth budget for a replacement generated at this level; it
            shrinks by one per level descended and never goes below 0.

    Returns:
        A new tree that shares no list objects with ``program``.
    """
    # A negative budget would never hit random_program's ``depth == 0`` stop
    # condition, so clamp it: trees deeper than the budget then get leaves.
    depth = max(depth, 0)
    # A list holding only an operator has no child to descend into, so it is
    # regenerated as well.
    if random() < 0.3 or not isinstance(program, list) or len(program) < 2:
        return random_program(depth, input_dim)

    # Work on a copy so the caller's tree (and anything sharing its sub-lists)
    # is never modified.
    mutant = copy.deepcopy(program)
    idx = randint(1, len(program) - 1)
    mutant[idx] = mutate_program(program[idx], input_dim, depth - 1)
    return mutant

Other function (FIGP based)

In [220]:
def mutate_program(program, input_dim, max_depth=3):
    """Subtree mutation: replace one random list-valued child of the root.

    Only the direct elements of ``program`` that are themselves lists are
    candidates. When there is none, a completely new program of depth
    ``max_depth`` is returned instead. The input is not modified; the result
    is a deep copy with the chosen child replaced by a new random subtree whose
    depth is drawn from ``[1, max_depth]``.
    """
    candidate_slots = [pos for pos, node in enumerate(program) if isinstance(node, list)]
    if len(candidate_slots) == 0:
        # Nothing to replace: generate a new program
        return random_program(max_depth, input_dim)

    slot = choice(candidate_slots)
    # New subtree
    replacement = random_program(randint(1, max_depth), input_dim)

    mutant = copy.deepcopy(program)
    mutant[slot] = replacement
    return mutant


Other function (FIGP based)

In [221]:
def mutate(program, input_dim, max_depth=3):
    """Return a mutated version of ``program`` without modifying it.

    One of the root's direct children that is a list is picked at random and
    replaced by a new random subtree (depth drawn from ``[1, max_depth]``).
    If the root has no list-valued child, a brand-new random program of depth
    ``max_depth`` is returned.
    """
    def list_child_positions(tree):
        positions = []
        for position, element in enumerate(tree):
            if isinstance(element, list):
                positions.append(position)
        return positions

    replaceable = list_child_positions(program)
    if replaceable:
        target = choice(replaceable)
        # Generate a new random subtree
        fresh_depth = randint(1, max_depth)
        offspring = copy.deepcopy(program)
        offspring[target] = random_program(fresh_depth, input_dim)
        return offspring

    # No mutation point available: return a new program
    return random_program(max_depth, input_dim)

## Crossover

We can use a crossover function that receives exactly 2 parents. If either parent has no sub-tree among its direct children (for example because it is a leaf program), the two parents are returned unchanged, which avoids performing the operation on programs with no children. Otherwise, one sub-tree is selected at random in each parent and the two sub-trees are swapped, returning two new individuals.

In [222]:
def crossover(parent1, parent2, max_depth=3):
    """Subtree crossover between two programs.

    One list-valued direct child is chosen at random in each parent and the
    two are swapped. Both parents are left untouched and the returned children
    share no list objects with them.

    Args:
        parent1, parent2: program trees (nested lists) or leaves (strings).
        max_depth: accepted for interface compatibility; not used.

    Returns:
        ``(child1, child2)``. When either parent is shorter than 2 elements or
        has no list-valued child, the children are plain copies of the parents.
    """
    child1, child2 = copy.deepcopy(parent1), copy.deepcopy(parent2)

    if len(parent1) < 2 or len(parent2) < 2:
        return child1, child2  # Programs too small for a crossover

    # Valid crossover points: positions holding a subtree
    def get_subtree_points(prog):
        return [i for i, node in enumerate(prog) if isinstance(node, list)]

    points1 = get_subtree_points(child1)
    points2 = get_subtree_points(child2)

    if not points1 or not points2:
        return child1, child2  # No valid crossover point

    point1 = choice(points1)
    point2 = choice(points2)

    # Swap the subtrees of the copies, not of the parents, so that no child
    # ends up holding a reference into a parent's tree.
    child1[point1], child2[point2] = child2[point2], child1[point1]

    return child1, child2

New crossover (FIGP based):

In [223]:
def crossover(parent1, parent2):
    """Single-point subtree crossover between two programs.

    A crossover point is a direct child of the root that is itself a list.
    One point is drawn at random in each parent and the two subtrees are
    exchanged. The parents are never modified, and the children do not share
    any list object with them.

    Returns:
        ``(child1, child2)``. If either parent has no crossover point the
        children are unmodified copies of the parents.
    """
    child1, child2 = copy.deepcopy(parent1), copy.deepcopy(parent2)

    # Identify valid crossover points
    def get_subtree_points(prog):
        return [i for i, node in enumerate(prog) if isinstance(node, list)]

    points1 = get_subtree_points(child1)
    points2 = get_subtree_points(child2)

    if not points1 or not points2:
        return child1, child2  # No valid point: hand back copies of the parents

    point1 = choice(points1)
    point2 = choice(points2)

    # Exchange the selected subtrees between the copies. Taking them from the
    # parents instead would leave each child aliasing a parent's sub-list, so
    # a later in-place edit of a child would silently change the parent.
    child1[point1], child2[point2] = child2[point2], child1[point1]

    return child1, child2


## Genetic algorithm

### Data loading

In [224]:
# Load one benchmark problem: use problem_X, with X going from 0 to 8.
problem = np.load('data/problem_6.npz')
x = problem['x']
y = problem['y']
# Print the array shapes as a quick sanity check.
print(x.shape)
print(y.shape)

(2, 5000)
(5000,)


Let's call the previous fitness function "MSE":

In [225]:
def mse(program, x, y):
    """Return 100 times the mean squared error of ``program`` on ``(x, y)``.

    Every column of ``x`` is fed to the program as one input vector and the
    result is compared with the matching entry of ``y``.
    """
    predicted = np.array([evaluate_program(program, column) for column in x.T])
    squared_error = np.square(predicted - y)
    return float(100 * np.mean(squared_error))

In [226]:
def depth(program):
    """Return the depth of a program tree.

    A leaf (string) has depth 1; a list node is one level deeper than its
    deepest child (element 0 is the operator and is not counted as a child).
    """
    if isinstance(program, str):
        return 1
    if isinstance(program, list):
        child_depths = [depth(node) for node in program[1:]]
        return max(child_depths) + 1

Let's change the fitness function in a way that we penalize the more complex functions:

In [227]:
# Funzione di fitness avanzata ispirata a deep_based_FGP_NLS.py
import math


def fitness_function(program, x, y):
    """Score a program: scaled error plus a complexity penalty (lower is better).

    The score is ``100 * MSE + 0.01 * depth(program)``, where the MSE is taken
    over the columns of ``x`` against ``y``.

    Returns:
        The score, or ``np.inf`` when the program yields NaN/inf predictions,
        a non-finite error, or raises during evaluation (in which case the
        error is also printed).
    """
    try:
        # Evaluate the program once; this pass is by far the most expensive step.
        predictions = np.array([evaluate_program(program, x_row) for x_row in x.T])
        if np.any(np.isnan(predictions)) or np.any(np.isinf(predictions)):
            return np.inf  # Penalise invalid programs

        # Same quantity as mse(program, x, y), computed from the predictions
        # already available instead of evaluating the whole dataset again.
        error = float(100 * np.mean(np.square(predictions - y)))
        if not math.isfinite(error):
            return np.inf

        # Penalty that grows with the depth of the program
        complexity_penalty = depth(program) * 0.01
        fitness = error + complexity_penalty

    except Exception as e:
        # Penalise programs whose evaluation fails
        print(f"Errore nella valutazione del programma: {e}")
        return np.inf

    return fitness

## Tournament selection for parents (default tournament size tau = 15)

In [228]:
# A "fitness hole" was suggested as potentially useful here, so it is implemented:
# the tournament winner is usually the best contender, but occasionally a worse one.
def tournament_selection(population, tau=15):
    """Pick a parent genome by tournament selection with a fitness hole.

    ``tau`` distinct individuals are sampled (capped at the population size)
    and ranked by ascending fitness. With probability 0.9 the best one wins;
    otherwise one of the remaining contenders is picked uniformly at random.

    Args:
        population: sequence of individuals exposing ``fitness`` and ``genome``.
        tau: tournament size.

    Returns:
        The ``genome`` of the selected individual (the object itself, not a copy).
    """
    tau = min(tau, len(population))  # not needed in theory
    tournament_indices = np.random.choice(len(population), tau, replace=False)

    contenders = sorted(
        (population[index] for index in tournament_indices),
        key=lambda ind: ind.fitness,
    )

    # With a single contender there is no "worse" individual to fall back on;
    # drawing one would call randint(1, 0) and raise.
    if len(contenders) == 1 or random() < 0.9:
        return contenders[0].genome

    # Select one among the second best and the worst
    return contenders[randint(1, len(contenders) - 1)].genome


## Simulated annealing

In [229]:
import math


def simulated_annealing(initial_program, x, y, max_iterations=500, initial_temperature=100, cooling_rate=0.95):
    """Refine a program with simulated annealing and return the best one seen.

    Each iteration mutates the current program, always accepts the candidate
    if its fitness is lower, and otherwise accepts it with probability
    ``exp(-delta / temperature)``. The temperature is multiplied by
    ``cooling_rate`` after every iteration, and the search stops after
    ``max_iterations`` iterations or once the temperature drops below 1e-3.
    """
    # Current state of the walk and the best state seen so far
    state, state_score = initial_program, fitness_function(initial_program, x, y)
    champion, champion_score = state, state_score

    temp = initial_temperature
    for _ in range(max_iterations):
        # Propose a neighbour by mutation
        proposal = mutate_program(state, x.shape[0])
        proposal_score = fitness_function(proposal, x, y)

        delta = proposal_score - state_score

        # Improvements are always taken; worsening moves only with a
        # probability that shrinks as the temperature decreases.
        accept = delta < 0 or random() < math.exp(-delta / temp)
        if accept:
            state, state_score = proposal, proposal_score
            if state_score < champion_score:
                champion, champion_score = state, state_score

        # Cool down, and stop once the temperature is too low
        temp *= cooling_rate
        if temp < 1e-3:
            break

    return champion

### Parameters

In [230]:
# GP parameters
# The population size was reduced to see faster how the algorithm evolves over generations.
# Number of generations run by the main loop.
generations = 100
# The population is truncated to this size after each generation.
population_size = 200
# Probability that a breeding step uses crossover; otherwise a single parent is mutated.
p_crossover = 0.6
# Probability of additionally mutating each child produced by crossover.
p_mutation = 0.4
# Probability of also adding a tweaked copy of the first crossover child.
tweak_probability = 0.2
# Depth passed to random_program for the initial population.
max_depth = 3 #fixed or problem dependant?
# Number of best individuals carried over unchanged to the next generation.
elite_size = 5
offspring_size = population_size # Number of individuals generated per generation

Let's create a new tweak function that modifies a program by wrapping one of its leaf nodes in a unary operation (only if the leaf is a variable, not a constant value).

In [231]:
def tweak_program_2(program):
    """Return a copy of ``program`` with one leaf possibly wrapped in a unary operator.

    A leaf is chosen uniformly at random. If it is a variable (a string
    starting with ``"x"``) it is replaced by ``[op, leaf]`` with ``op`` drawn
    from sin, cos, tan, log and sqrt; if it is a constant, or the program is
    itself a single leaf, nothing is changed.

    The input program is never modified: all edits happen on a deep copy, so
    genomes that share sub-lists with ``program`` are unaffected.

    Returns:
        The tweaked copy (equal to ``program`` when no change was applied).
    """
    # Collect the index path from the root to every leaf
    def get_leaf_indices(node, path=()):
        if isinstance(node, str) or isinstance(node, (int, float)):
            # Leaf node (variable or constant)
            return [path]
        elif isinstance(node, list) and len(node) > 1:
            # Valid internal node: explore its children recursively
            indices = []
            for i, child in enumerate(node[1:], start=1):
                indices.extend(get_leaf_indices(child, path + (i,)))
            return indices
        return []

    tweaked = copy.deepcopy(program)

    leaf_indices = get_leaf_indices(tweaked)
    if not leaf_indices:
        return tweaked  # No leaf found: nothing to change

    # Pick one leaf at random
    selected_leaf_path = choice(leaf_indices)

    # An empty path means the root itself is the leaf; there is no parent
    # list in which it could be replaced.
    if not selected_leaf_path:
        return tweaked

    # Walk down to the parent of the selected leaf
    parent = tweaked
    for idx in selected_leaf_path[:-1]:
        parent = parent[idx]

    slot = selected_leaf_path[-1]
    leaf = parent[slot]
    # Only variables are wrapped; constants are left as they are
    if isinstance(leaf, str) and leaf.startswith("x"):
        unary_operator = choice(["sin({})", "cos({})", "tan({})", "log({})", "sqrt({})"])
        parent[slot] = [unary_operator, leaf]

    return tweaked


In [232]:
def safe_copy(obj):
    """Return a shallow copy of a program.

    Strings are immutable and are returned as they are; lists are copied one
    level deep, so nested sub-lists are still shared with the original.

    Raises:
        TypeError: for anything that is neither a string nor a list.
    """
    if isinstance(obj, str):
        return obj
    if isinstance(obj, list):
        return obj.copy()
    raise TypeError("Tipo non supportato: solo stringhe o liste sono consentite.")

In [None]:
# Initialise the population
# Number of input variables available to the programs (first dimension of x).
input_dim = x.shape[0]
population = [Individual(genome=random_program(max_depth, input_dim)) for _ in range(population_size)]
# Evaluate every individual once up front so the population can be sorted by fitness.
for i in population:
    i.fitness=fitness_function(i.genome, x, y)

print(population)

# Main loop over the generations
def run_genetic_algorithm():
    """Evolve the global ``population`` and return the best genome found.

    Every generation keeps the ``elite_size`` best individuals and fills the
    rest of the next population either by crossover of two tournament winners
    (each child optionally mutated, plus an occasional tweaked copy of the
    first child) or by mutating a single tournament winner. Duplicate genomes
    are then removed and the population is truncated to ``population_size``.

    The run stops early when the best individual reaches an error of exactly 0.

    Reads the module-level GP parameters and the data ``x``/``y``, and rebinds
    the module-level ``population``. Progress is printed once per generation.

    Returns:
        The genome of the best individual.
    """
    global population

    # Floating-point warnings are silenced only while evolving; the context
    # manager restores the previous numpy error settings on every exit path
    # instead of leaving them changed for the rest of the session.
    with np.errstate(all='ignore'):
        for gen in range(generations):
            population.sort(key=lambda i: i.fitness)
            mse_to_print = mse(population[0].genome, x, y)

            # Stop as soon as the training data is fitted exactly
            if mse_to_print == 0.0:
                print('Best program found with mse=0')
                return population[0].genome

            print(f"Generazione {gen + 1}, miglior fitness: {mse_to_print:.6f}")
            # population is already sorted, so the best individual comes first
            print(f"Best formula: {program_to_string(population[0].genome)}")

            # Build the next generation, starting from the elite
            next_population = []
            next_population.extend(population[:elite_size])
            while len(next_population) < offspring_size:
                if random() < p_crossover:
                    # Crossover of two tournament winners. The genomes are
                    # deep-copied first so that no offspring can share a
                    # sub-list with an individual of the current population.
                    parent1, parent2 = tournament_selection(population), tournament_selection(population)
                    child1, child2 = crossover(copy.deepcopy(parent1), copy.deepcopy(parent2))

                    if random() < p_mutation:
                        child1 = mutate(child1, input_dim)
                    if random() < p_mutation:
                        child2 = mutate(child2, input_dim)

                    next_population.append(Individual(genome=child1, fitness=fitness_function(child1, x, y)))
                    if len(next_population) < offspring_size:
                        next_population.append(Individual(genome=child2, fitness=fitness_function(child2, x, y)))
                    if random() < tweak_probability and len(next_population) < offspring_size:
                        # The tweak works on its own deep copy because child1
                        # has already been stored together with its fitness.
                        new_ind = tweak_program_2(copy.deepcopy(child1))
                        next_population.append(Individual(genome=new_ind, fitness=fitness_function(new_ind, x, y)))

                else:
                    # Mutate a parent directly (mutate returns a new tree)
                    parent = tournament_selection(population)
                    mutant = mutate(parent, input_dim)
                    next_population.append(Individual(genome=mutant, fitness=fitness_function(mutant, x, y)))

            # Remove duplicates, keyed on the genome; the first occurrence
            # (the elite copy, if any) is the one that is kept.
            unique_population = {}
            for ind in next_population:
                serialized = str(ind.genome)
                if serialized not in unique_population:
                    unique_population[serialized] = ind

            # Every individual already carries the fitness of its final
            # genome, and genomes are never edited after evaluation, so no
            # second evaluation pass is needed before sorting.
            population = list(unique_population.values())
            population.sort(key=lambda i: i.fitness)
            population = population[:population_size]

        # Identify the best program
        population.sort(key=lambda i: i.fitness)
        best_program = population[0]
        best_mse = mse(best_program.genome, x, y)

    print("Miglior programma:", best_program.genome, "; Fitness:", best_mse)
    return best_program.genome

# Run the evolution and keep the returned genome for the next cell.
best_program = run_genetic_algorithm()

In [None]:
# Display the best genome as a readable formula.
program_to_string(best_program)