# Import

In [94]:
import numpy as np
from random import choice, random, randint, sample
import copy

# Operations

In [95]:
OPERATIONS = [
    (np.add, 2, "({} + {})"),
    (np.subtract, 2, "({} - {})"),
    (np.divide, 2, "({} / {})"),
    (np.sin, 1, "sin({})"),
]

# Genothype definition

Let's consider the genotype as a list of elements with the following structure: [operator, first operand, secondo operand].
In this way we are creating a recursive function that computes a valid formula and represent it as a list (example: [ "+", ["sin", "x[0]" ], "x[1]"]).

In [96]:
def random_program(depth, input_dim):
    if depth == 0 or random() < 0.3:
        return f"x[{randint(0, input_dim - 1)}]" #new leaf node

    op, arity, symbol = choice(OPERATIONS)
    children = [random_program(depth - 1, input_dim) for _ in range(arity)]
    return [symbol] + children  

In [97]:
random_program(3, 4)

'x[1]'

## Transform the program into a human readable function

In [98]:
def program_to_string(program):
    if isinstance(program, str):  # leaf
        return program  
    elif isinstance(program, list):  
        try:
            _, _, symbol = next((op, arity, s) for op, arity, s in OPERATIONS if s == program[0])
        except StopIteration:
            raise ValueError(f"Not known operation: {program[0]}")
        
        children = [program_to_string(child) for child in program[1:]]
        
        return symbol.format(*children)


Now we need a function that given the genotype provide us with the output provided by the predicted function. This function must receive the input vector to perform his operation.

In [99]:
def evaluate_program(program, x):
    if isinstance(program, str):  # Leaf node
        return x[int(program[2:-1])]  # extract the value
    elif isinstance(program, list): 
        op = next(op for op, _, symbol in OPERATIONS if symbol == program[0])
        args = [evaluate_program(child, x) for child in program[1:]]
        try:
            return op(*args)
        except ZeroDivisionError:
            return np.inf

In [100]:
x = [2,3,4, 6]
program = ['({} / {})', 'x[1]', 'x[0]']
evaluate_program(program, x)

np.float64(1.5)

As you may notice this function verify with the function __isinstance(element, type)__ if "element" is an instance of the "type", with the objective of understanding if it is a __leaf node__. If this is the case we simply extract the value out of it, otherwise we still need to invoke the function recursively.

## Fitness function

For now, simply consider the fitness function of a solution as it's mean square error compared to the expected results.

In [101]:
def fitness_function(program, x, y):
    predictions = np.array([evaluate_program(program, x_row) for x_row in x.T])
    return np.mean((predictions - y) ** 2)

In [102]:
x = np.random.uniform(-1, 1, size=(2, 1000))  # 2 feature, 1000 esempi
y = x[0] + np.sin(x[1]) / 5
#fitness_function(program, x, y)

## Tweak function

There is a lot of __variability__ that has to be considered for the tweak function. We can now imagine to implement a recursive function that receives the program (which indicates the current function that we are using for the task), the number of dimensions for the input and the maximum depth allowed for the tweaked solution.

Recursively, if we end up into a leaf node, or if the current solution is still a list but with 0.3 probability, we simply generate a new sub-program.

Otherwise, we simply invoke the same function for a random index.

In [103]:
def mutate_program(program, input_dim, depth=3):
    if random() < 0.3 or not isinstance(program, list):  
        return random_program(depth, input_dim)
    idx = randint(1, len(program) - 1)
    program[idx] = mutate_program(program[idx], input_dim, depth - 1)
    return program

Other function (FIGP based)

In [104]:
def mutate_program(program, input_dim, max_depth=3):
    """Mutazione per rimpiazzare un sottoalbero casuale."""
    mutant = copy.deepcopy(program)
    
    def get_subtree_points(prog):
        return [i for i, node in enumerate(prog) if isinstance(node, list)]
    
    points = get_subtree_points(mutant)
    if not points:
        return random_program(max_depth, input_dim)  # Genera nuovo programma
    
    point = choice(points)
    new_subtree = random_program(randint(1, max_depth), input_dim)  # Nuovo sottoalbero
    mutant[point] = new_subtree
    
    return mutant


Other function (FIGP based)

In [105]:
def mutate(program, input_dim, max_depth=3):
    """Mutazione di un programma."""
    mutant = copy.deepcopy(program)

    def get_subtree_points(prog):
        return [i for i, node in enumerate(prog) if isinstance(node, list)]

    points = get_subtree_points(program)
    if not points:
        return random_program(max_depth, input_dim)  # Ritorna un nuovo programma se non ci sono punti

    point = choice(points)

    # Genera un nuovo sottoalbero casuale
    new_subtree = random_program(randint(1, max_depth), input_dim)
    mutant[point] = new_subtree

    return mutant

## Crossover

We can use a croossover function that receives only 2 parents and, if one of them is a leaf program simply return casually one of the 2 programs (avoiding to perform the operation for programs with no childrens). Otherwise, select random indexes for both the parents and combine the first part of the tree with the second part of the tree of the 2 parents, returning a new individual.

In [106]:
def crossover(parent1, parent2, max_depth=3):
    """Crossover tra due programmi."""
    child1, child2 = copy.deepcopy(parent1), copy.deepcopy(parent2)

    if len(parent1) < 2 or len(parent2) < 2:
        return parent1, parent2  # Evita crossover se i programmi sono troppo piccoli

    # Punti di crossover validi
    def get_subtree_points(prog):
        return [i for i, node in enumerate(prog) if isinstance(node, list)]

    points1 = get_subtree_points(parent1)
    points2 = get_subtree_points(parent2)

    if not points1 or not points2:
        return parent1, parent2  # Nessun punto valido per il crossover

    point1 = choice(points1)
    point2 = choice(points2)

    # Scambia i sottoalberi
    subtree1 = parent1[point1]
    subtree2 = parent2[point2]

    child1[point1], child2[point2] = subtree2, subtree1

    return child1, child2

New crossover (FIGP based):

In [107]:
def crossover(parent1, parent2):
    """Crossover a singolo punto tra due programmi."""
    child1, child2 = copy.deepcopy(parent1), copy.deepcopy(parent2)
    
    # Identifica punti di crossover validi
    def get_subtree_points(prog):
        return [i for i, node in enumerate(prog) if isinstance(node, list)]
    
    points1 = get_subtree_points(parent1)
    points2 = get_subtree_points(parent2)
    
    if not points1 or not points2:
        return parent1, parent2  # Nessun punto valido, restituisci genitori
    
    point1 = choice(points1)
    point2 = choice(points2)
    
    # Scambia sottoalberi selezionati
    subtree1 = parent1[point1]
    subtree2 = parent2[point2]
    child1[point1], child2[point2] = subtree2, subtree1
    
    return child1, child2


## Genetic algorithm

### Parameter selection

In [108]:
generations=100
population_size=100
offspring_size = 50

### Data selection

In [109]:
def true_f(x: np.ndarray) -> np.ndarray:
    return x[0] + np.sin(x[1]) / 5

TEST_SIZE = 10_000
TRAIN_SIZE = 1000

x_validation = np.vstack(
    [
        np.random.random_sample(size=TEST_SIZE) * 2 * np.pi - np.pi,
        np.random.random_sample(size=TEST_SIZE) * 2 - 1,
    ]
)
y_validation = true_f(x_validation)
train_indexes = np.random.choice(TEST_SIZE, size=TRAIN_SIZE, replace=False)
x_train = x_validation[:, train_indexes]
y_train = y_validation[train_indexes]

x = x_train
y = y_train

Let's change the fitness function in a way that we penalize the more complex functions:

In [110]:
# Funzione di fitness avanzata ispirata a deep_based_FGP_NLS.py
def fitness_function(program, x, y):
    """
    Valuta la fitness di un programma.
    """
    try:
        # Valutazione del programma
        predictions = np.array([evaluate_program(program, x_row) for x_row in x.T])
        if np.any(np.isnan(predictions)) or np.any(np.isinf(predictions)):
            return np.inf  # Penalizza programmi invalidi

        # Calcolo dell'errore
        error = np.mean((predictions - y) ** 2)  # Errore quadratico medio

        # Aggiungi penalità basata sulla complessità del programma
        complexity_penalty = len(program) * 0.01
        fitness = error + complexity_penalty

    except Exception as e:
        # Penalizza programmi che generano errori
        print(f"Errore nella valutazione del programma: {e}")
        return np.inf

    return fitness

Let's call the previous fitness function "MSE":

In [111]:
def mse(program, x, y):
    predictions = np.array([evaluate_program(program, x_row) for x_row in x.T])
    return np.mean((predictions - y) ** 2)

### Parameters

In [112]:
# Parametri del GP
generations = 50
population_size = 1000
p_crossover = 0.7
p_mutation = 0.3
max_depth = 3
offspring_size = population_size  # Numero di discendenti generati per generazione

In [113]:
# Inizializza popolazione
input_dim = x.shape[0]
population = [random_program(3, input_dim) for _ in range(population_size)]

# Loop principale per le generazioni
def run_genetic_algorithm():
    global population

    for gen in range(generations):
        # Calcola la fitness per ogni individuo
        fitness = np.array([fitness_function(prog, x, y) for prog in population])
        mse_value = np.array([mse(prog, x, y) for prog in population])

        # Mantieni l'elite (migliori individui)
        sorted_indices = fitness.argsort()
        elite = [population[i] for i in sorted_indices[:2]]  # Manteniamo 2 migliori
        best_fitness = fitness[sorted_indices[0]]
        mse_to_print = mse_value[sorted_indices[0]]

        print(f"Generazione {gen + 1}, miglior fitness: {mse_to_print:.6f}")

        # Crea la nuova generazione
        next_population = []

        while len(next_population) < offspring_size:
            if random() < p_crossover and len(population) > 1:
                # Crossover
                parent1, parent2 = choice(population), choice(population)
                child1, child2 = crossover(parent1, parent2)

                if random() < p_mutation:
                    child1 = mutate(child1, input_dim)
                if random() < p_mutation:
                    child2 = mutate(child2, input_dim)

                next_population.append(child1)
                if len(next_population) < offspring_size:
                    next_population.append(child2)
            else:
                # Mutazione diretta di un genitore
                parent = choice(population)
                mutant = mutate(parent, input_dim)
                next_population.append(mutant)

        # Combina elite e nuova generazione
        next_population.extend(elite)

        # Rimuovi duplicati
        unique_population = {}
        for prog in next_population:
            serialized = str(prog)
            if serialized not in unique_population:
                unique_population[serialized] = prog

        # Aggiorna la popolazione con individui unici
        population = list(unique_population.values())[:population_size]

    # Identifica il miglior programma
    fitness = np.array([fitness_function(prog, x, y) for prog in population])
    mse_value = np.array([mse(prog, x, y) for prog in population])
    sorted_indices = fitness.argsort()
    best_program = population[sorted_indices[0]]
    best_fitness = fitness[sorted_indices[0]]
    best_mse = mse_value[sorted_indices[0]]

    print("Miglior programma:", best_program, "; Fitness:", best_mse)
    return best_program

best_program = run_genetic_algorithm()

  return op(*args)
  return op(*args)
  return op(*args)


Generazione 1, miglior fitness: 0.003147
Generazione 2, miglior fitness: 0.003147
Generazione 3, miglior fitness: 0.003147
Generazione 4, miglior fitness: 0.003147
Generazione 5, miglior fitness: 0.003147
Generazione 6, miglior fitness: 0.001860
Generazione 7, miglior fitness: 0.001860
Generazione 8, miglior fitness: 0.001860
Generazione 9, miglior fitness: 0.001860
Generazione 10, miglior fitness: 0.001860
Generazione 11, miglior fitness: 0.001860
Generazione 12, miglior fitness: 0.001860
Generazione 13, miglior fitness: 0.001860
Generazione 14, miglior fitness: 0.001860
Generazione 15, miglior fitness: 0.001860


  return op(*args)


Generazione 16, miglior fitness: 0.001860
Generazione 17, miglior fitness: 0.001860
Generazione 18, miglior fitness: 0.001860
Generazione 19, miglior fitness: 0.001860
Generazione 20, miglior fitness: 0.001860
Generazione 21, miglior fitness: 0.001860
Generazione 22, miglior fitness: 0.001860
Generazione 23, miglior fitness: 0.001860
Generazione 24, miglior fitness: 0.001860
Generazione 25, miglior fitness: 0.001860
Generazione 26, miglior fitness: 0.001860
Generazione 27, miglior fitness: 0.001860
Generazione 28, miglior fitness: 0.001860
Generazione 29, miglior fitness: 0.001860
Generazione 30, miglior fitness: 0.001860
Generazione 31, miglior fitness: 0.001860
Generazione 32, miglior fitness: 0.001860
Generazione 33, miglior fitness: 0.001860
Generazione 34, miglior fitness: 0.001860
Generazione 35, miglior fitness: 0.001860
Generazione 36, miglior fitness: 0.001860
Generazione 37, miglior fitness: 0.001860
Generazione 38, miglior fitness: 0.001860
Generazione 39, miglior fitness: 0

  return op(*args)


Generazione 40, miglior fitness: 0.001860
Generazione 41, miglior fitness: 0.001860
Generazione 42, miglior fitness: 0.001860
Generazione 43, miglior fitness: 0.001860
Generazione 44, miglior fitness: 0.001860
Generazione 45, miglior fitness: 0.001860
Generazione 46, miglior fitness: 0.001860
Generazione 47, miglior fitness: 0.001860
Generazione 48, miglior fitness: 0.001860
Generazione 49, miglior fitness: 0.001860
Generazione 50, miglior fitness: 0.001860
Miglior programma: ['({} - {})', ['({} + {})', 'x[1]', 'x[0]'], ['sin({})', ['sin({})', 'x[1]']]] ; Fitness: 0.0018602010032673015


In [114]:
program_to_string(best_program)

'((x[1] + x[0]) - sin(sin(x[1])))'