In [1]:
import math
import numpy as np
from tqdm import tqdm

In [2]:
# Warning settings: silence every NumPy floating-point error category.
# Invalid operations then simply yield nan or inf, which the rest of the code handles.
# A global setting is used because np.errstate slows down the code.
np.seterr(divide="ignore", over="ignore", under="ignore", invalid="ignore")

{'divide': 'warn', 'over': 'warn', 'under': 'ignore', 'invalid': 'warn'}

In [3]:
# Configuration
TRAIN_TEST_RATIO = 0.6  # fraction of the shuffled samples assigned to the training split
PROBLEM_NUMBER = 7      # selects which data/problem_<N>.npz archive is loaded

### Data Loading and Data Preprocessing

In [4]:
# Read the inputs and targets of the selected problem; the context manager
# closes the .npz archive as soon as both arrays have been extracted
with np.load(f'data/problem_{PROBLEM_NUMBER}.npz') as problem:
    x_1, y_1 = problem['x'], problem['y']

# Shuffle the samples: columns of x and entries of y are reordered with the same permutation
permutation = np.random.permutation(len(y_1))
x_1, y_1 = x_1[:, permutation], y_1[permutation]

# Determine train/test split sizes
problem_len = len(y_1)
train_size = int(TRAIN_TEST_RATIO * problem_len)

# Split data: the first train_size shuffled samples train, the remaining ones test
x_train, x_test = x_1[:, :train_size], x_1[:, train_size:]
y_train, y_test = y_1[:train_size], y_1[train_size:]

# Print dataset information
print(f"Problem number: {PROBLEM_NUMBER}, variables: {x_1.shape[0]}, train size: {train_size}, test size: {problem_len-train_size}")
print(f"Training data: x shape {x_train.shape}, y shape {y_train.shape}")
print(f"Testing data: x shape {x_test.shape}, y shape {y_test.shape}")



Problem number: 7, variables: 2, train size: 3000, test size: 2000
Training data: x shape (2, 3000), y shape (3000,)
Testing data: x shape (2, 2000), y shape (2000,)


### Numpy functions definition

In [5]:
# NumPy functions taking a single operand.
# The order is kept stable in case other code refers to operators by position.
unary_ops = [
    # sign and magnitude
    np.negative, np.abs,
    # root, exponential, logarithm
    np.sqrt, np.exp, np.log,
    # trigonometric
    np.sin, np.cos, np.tan,
    # inverse trigonometric
    np.arcsin, np.arccos, np.arctan,
    # hyperbolic
    np.sinh, np.cosh, np.tanh,
    # powers and inverse
    np.square, np.cbrt, np.reciprocal,
    # rounding
    np.ceil, np.floor,
]

# NumPy functions taking two operands (same ordering caveat as above).
binary_ops = [
    # arithmetic
    np.add, np.subtract, np.multiply, np.divide,
    # exponentiation
    np.power,
    # element-wise selection
    np.maximum, np.minimum,
    # remainder
    np.mod,
]

### Symbolic Regression Class

In [6]:
from tree import Tree

class SymbolicRegression:
    def __init__(self, population_per_island,island_num, max_generations, mutation_rate, elitism_size, grow_full_ratio,max_mutations,migration_rate,collapse_rate):
        self.population_per_island = population_per_island
        self.island_num = island_num
        self.max_generations = max_generations
        self.mutation_rate = mutation_rate
        self.elitism_size = elitism_size
        self.grow_full_ratio = grow_full_ratio
        self.max_mutations = max_mutations
        self.unary_ops = unary_ops
        self.binary_ops = binary_ops
        self.migration_rate=migration_rate
        self.best_fitness_history = []
     
        self.population = [None] * island_num
        self.collapse_rate=collapse_rate

        for j in range(island_num):
            self.population[j] = np.array([
                Tree("grow") if i < int(population_per_island * self.grow_full_ratio) else Tree("full") for i in range(population_per_island)
            ])

        
         


    # Parents selection methods
    def select_parents_fitness_proportional(self, n_elems=2, epsilon=1e-10,island=0):
        """
        Fitness proportional selection method.
        Randomly selects n_elems individuals based on their fitness.
        Individuals with lower fitness have an higher probability to be selected.
        Premature convergence if few individuals have significantly better fitness than others.
        n_elems: number of elements to select.
        epsilon: small value to avoid division by zero.
        island: on which island the selection should be done.
        """
        fitnesses = [tree.fitness for tree in self.population[island]]
        inverted_fitnesses = [1 / (fitness + epsilon) for fitness in fitnesses]  # avoid division by zero
        probabilities = inverted_fitnesses / sum(inverted_fitnesses)
        parent1, parent2 = np.random.choice(self.population[island], size=n_elems, p=probabilities, replace=False)
        return parent1, parent2
    
    def select_parents_lexicase(self, population, num_parents,island_num=0):
        selected_parents = []
        test_cases = list(range(len(population[island_num].fitness)))  # assuming fitness is a list of test case results

        while len(selected_parents) < num_parents:
            eligible = population[island_num][:]
            np.random.shuffle(test_cases)  # Randomize order of test cases

            for test in test_cases:
                min_fitness = min(individual.fitness[test] for individual in eligible)
                eligible = [individual for individual in eligible if individual.fitness[test] == min_fitness]

                if len(eligible) <= 1:
                    break

            selected_parents.append(eligible[0])  # Select one of the remaining eligible individuals randomly

        return selected_parents

    def select_parents_rank_based(self, n_elems=2,island=0,exponential=False):
        """
        Rank-based selection method.
        Assigns probabilities based on inversed ranks instead of absolute fitness values.
        n_elems: number of elements to select.
        island: on which island the selection should be done.
        exponential: if True, the ranks are raised to the power of the exponential parameter.
        """
        fitnesses = np.array([tree.fitness for tree in self.population[island]])
        ranks = np.argsort(fitnesses)
        if exponential:
            ranks = ranks**exponential
        inversed_ranks = len(fitnesses) - ranks
        probabilities = inversed_ranks / np.sum(inversed_ranks)
        return np.random.choice(self.population[island], n_elems, p=probabilities, replace=False)
    
    def select_parents_tournament(self,island=0):
        """
        Tournament selection method.
        Randomly selects a subset of the population and selects the best individual from the subset.
        """
        tournament_size = 5
        tournament = list(np.random.choice(self.population[island], tournament_size, replace=True))
        tournament.sort(key=lambda x: x.fitness)
        return tournament[0], tournament[1]
    
    def select_parents(self, island, method="rank"):
        """
        Select parents based on the specified method.
        island: on which island the selection should be done
        method: which parent selection method to use. Default is "rank". Options are "rank", "fitness_proportional", "tournament".
        """
        match method:
            case "rank":
                return self.select_parents_rank_based(island=island)
            case "fitness_proportional":
                return self.select_parents_fitness_proportional(island=island)
            case "tournament":
                return self.select_parents_tournament(island=island)
            case _:
                return self.select_parents_rank_based(island=island)
            

    
    # Mutation methods
    def mutate(self, tree):
        if np.random.rand() < 0.5:
            tree.mutate_subtree()
        else:
            mutations = np.random.randint(1, self.max_mutations+1)
            tree.mutate_single_node(num_mutations=mutations)
        
       


    # Offsprings generation via mutation and crossover
    def offspring_generation(self,island):
        new_population = np.array([])

        # Elitism   
        elite_individuals = self.population[island][:self.elitism_size]
        new_population = elite_individuals

        # Main loop
        while len(new_population) < self.population_per_island//2: 
            parent1, parent2 = self.select_parents(island=island)
            # generate offsprings (one in mutation, two in crossover)
            offsprings = np.array([])

            if(np.random.rand() < self.mutation_rate):
                parent_clone = parent1.copy_tree()
                self.mutate(parent_clone)
              
                parent_clone.compute_fitness()
                #if the fitness is valid 
                if(parent_clone.fitness is not np.inf and parent_clone.fitness is not np.nan):
                    #to not re-add the same tree if the mutation was not possible (e.g. the tree is already a leaf and other edge cases)
                    if( parent_clone.fitness!=parent1.fitness):
                        offsprings = np.append(offsprings, [parent_clone])

            else:    
                offspring1, offspring2 = parent1.crossover(parent2)
                if(offspring1 is not None and offspring2 is not None):
               
                    offspring1.compute_fitness()
                    offspring2.compute_fitness()

                    offsprings = np.append(offsprings, [offspring1, offspring2])

            # Collapse branch
            for offsp in offsprings:
                if(np.random.rand() < self.collapse_rate):
                    #clone the tree and collapse the branch
                    tree_clone = offsp.copy_tree()
                
                    Tree.collapse_branch(tree_clone.root,force_collapse=True)
                    tree_clone.compute_fitness()
                    #if the fitness is not nan or inf after collapsing
                    if(tree_clone.fitness is not None or tree_clone.fitness is not np.inf and tree_clone.fitness is not np.nan):
                            offsp = tree_clone

            new_population = np.concatenate((new_population, offsprings))
                
        return new_population
    
    # Genetic Algorithm: Evolutionary Process
    def evolve(self,verbose=False,use_std_operators=False):
        best_tree_island = np.full(self.island_num, None, dtype=object)
        best_fitness_island = np.full(self.island_num, np.inf)
        global_best_fitness = np.inf
        global_best_tree = None
        take_over = np.full(self.island_num, False)
        # self.population_per_island.sort(key=lambda x: x.fitness) 
        #numpy sort of population over fitness
        for i in range(self.island_num):
            self.population[i].sort()
      


        for generation in tqdm(range(self.max_generations)):
           
            for i in range(self.island_num):
                if take_over[i]:
                    # print(f"Takeover at {generation} gen,island: {i}")
                    self.population[i] = np.unique(self.population[i])
                    new_trees = np.array([Tree("grow") for _ in range(self.population_per_island-len(self.population[i]))])
                 
                    self.population[i] = np.concatenate((self.population[i],new_trees))
                    self.population[i].sort()
                

                if np.random.rand()<self.migration_rate and self.island_num>1:
                    #pick a random island to migrate to 
                    island_to_migrate = np.random.randint(0,self.island_num)
                    while i == island_to_migrate:
                        island_to_migrate = np.random.randint(0,self.island_num)
                        
                    #select a random number from 0 to the population size of the current island

                    random_index = np.random.randint(0,len(self.population[i]))
                    
                    
                    self.population[island_to_migrate]=np.append(self.population[island_to_migrate],self.population[i][random_index])
                    #remove the tree from the current island
                    self.population[i]=np.delete(self.population[i],random_index)
                    

                    if(verbose):
                        print(f"Migration at {generation} gen from {i} to {island_to_migrate}")


                new_population=self.offspring_generation(island=i)
                    
                self.population[i]=np.concatenate((self.population[i],new_population))
                self.population[i].sort()
                
                generation_best_fitness_island = self.population[i][0].fitness

                if generation_best_fitness_island < best_fitness_island[i]:
                    best_fitness_island[i] = generation_best_fitness_island
                    best_tree_island[i] = self.population[i][0]
                    

                    if(best_fitness_island[i] < global_best_fitness):
                        global_best_fitness = best_fitness_island[i]
                        global_best_tree = best_tree_island[i]
                        self.best_fitness_history.append((best_fitness_island[i], generation))


                #trim the population to the best island_population
                self.population[i] = self.population[i][:self.population_per_island]

                
                
                n_best = [elem for elem in self.population[i] if elem.fitness == self.population[i][0].fitness]
                take_over[i] = False
                if len(n_best) > 0.5 * self.population_per_island:
                        take_over[i] = True
                        # print(f"Takeover at {generation} gen")     
                if(generation%100==0 and verbose):
                        print(f"Generation {generation + 1}, Island: {i}, Best Fitness: {best_fitness_island[i]}, Best Formula: {best_tree_island[i].to_np_formula(use_std_operators=use_std_operators)}")
            
                if global_best_fitness <= 1e-33:
                    break   

            if(generation%100==0 and not verbose):
                print(f"Generation {generation + 1}, Best Fitness: {global_best_fitness}, Best Formula: {global_best_tree.to_np_formula(use_std_operators=use_std_operators)}")

        return global_best_tree, global_best_fitness

### Problem Definition and Symbolic Regression Initialization

In [7]:
# --ISLAND SETTINGS--
ISLAND_NUM = 4
ISLAND_POPULATION = 70
MIGRATION_RATE = 0.0005

# --TREE SETTINGS--
TREE_MAX_DEPTH = 4
TREE_SPAWN_DEPTH = 3  # max depth at which trees are spawned; they can later grow up to TREE_MAX_DEPTH

# --GENETIC ALGORITHM SETTINGS--
MAX_GENERATIONS = 200000
ELITISM_SIZE = 1
VAR_NUM = x_train.shape[0]
CONST_RANGE = 10  # Constants will be in the range [-CONST_RANGE, CONST_RANGE]
MAX_MUTATIONS = 3  # Maximum number of mutations in a single mutation operation
MUTATION_RATE = 0.35
GROW_FULL_RATIO = 0.95
COLLAPSE_RATE = 0.3

# --DEPTH CHECK--
# Raise the spawn depth to at least ceil(log2(number of variables)), then keep the max depth above it
min_depth_data = math.ceil(math.log(VAR_NUM, 2))
if min_depth_data > TREE_SPAWN_DEPTH:
    TREE_SPAWN_DEPTH = min_depth_data
    print(f"Spawn depth too low, set to {TREE_SPAWN_DEPTH}")

if TREE_SPAWN_DEPTH > TREE_MAX_DEPTH:
    TREE_MAX_DEPTH = TREE_SPAWN_DEPTH+1
    print(f"Max depth too low, set to {TREE_MAX_DEPTH}")

# Share the operators, limits and datasets with the Tree class, then build the regressor
Tree.set_params(unary_ops, binary_ops, VAR_NUM, CONST_RANGE,TREE_MAX_DEPTH,TREE_SPAWN_DEPTH, x_train, y_train, x_test, y_test)
regressor = SymbolicRegression(
    population_per_island=ISLAND_POPULATION,
    island_num=ISLAND_NUM,
    max_generations=MAX_GENERATIONS,
    mutation_rate=MUTATION_RATE,
    elitism_size=ELITISM_SIZE,
    grow_full_ratio=GROW_FULL_RATIO,
    max_mutations=MAX_MUTATIONS,
    migration_rate=MIGRATION_RATE,
    collapse_rate=COLLAPSE_RATE,
)
#print the trees


### Algorithm Execution and Results Computation

In [8]:

# Execute the algorithm.
# use_std_operators=True prints formulas with standard operators (+,-,*,/);
# verbose=True prints the best tree of each island every 100 generations.
best_tree, best_fitness = regressor.evolve(use_std_operators=False,verbose=True)

# best_fitness is the value returned by evolve (fitness reached during training)
print(f"\nTrain Fitness: {best_fitness}")
# Recompute the fitness with test="test"; the result is read back from best_tree.fitness below
# (presumably evaluated on the x_test/y_test split given to Tree.set_params - confirm in tree.py)
best_tree.compute_fitness(test="test")




print(f"Test Fitness: {best_tree.fitness}")
print(f"Train-Test Discrepancy: {best_fitness-best_tree.fitness}")
# Recompute once more with test="all" (presumably on the whole dataset - confirm in tree.py);
# from here on best_tree.fitness no longer holds the training fitness
best_tree.compute_fitness(test="all")
print(f"Global Fitness: {best_tree.fitness}")


# Print the history of global-best improvements and the best formula in both notations
print(f"Best Fitness History: {[tup[0] for tup in regressor.best_fitness_history]}, changed {len(regressor.best_fitness_history)} times\n")

print(f"Best Formula: {best_tree.to_np_formula(use_std_operators=False)}\n")
print(f"Best Formula with standard operators: {best_tree.to_np_formula(use_std_operators=True)}")



  0%|          | 0/200000 [00:00<?, ?it/s]


TypeError: Tree.__init__() got an unexpected keyword argument 'max_depth'

### Best Tree Drawing

In [None]:
# Collapse the branches on a copy, so the evolved tree is kept if collapsing breaks the formula
tree_clone = best_tree.copy_tree()
Tree.collapse_branch(tree_clone.root,0,force_collapse=True)
tree_clone.compute_fitness()
# Accept the collapsed tree only when its fitness is set and finite.
# np.isfinite is a numeric test: identity checks against np.inf / np.nan miss computed values.
collapsed_fitness = tree_clone.fitness
if collapsed_fitness is not None and np.isfinite(collapsed_fitness):
    best_tree = tree_clone


print(f"Collapsed formula: {best_tree.to_np_formula(use_std_operators=False)}\n") #use_std_operators=True to use standard operators (+,-,*,/)
print("Tree drawing (after collapsing):")
best_tree.add_drawing()


In [None]:
# Plot how the global best fitness improved over the generations
import matplotlib.pyplot as plt

# Each history entry is a (fitness, generation) tuple
improvement_generations = [entry[1] for entry in regressor.best_fitness_history]
improvement_values = [entry[0] for entry in regressor.best_fitness_history]

fig, ax = plt.subplots()
ax.plot(improvement_generations, improvement_values)
ax.set_xlabel('Generation')
ax.set_ylabel('Best Fitness')
ax.set_title('Best Fitness over Generations')
plt.show()

In [106]:
# Testing: create a tree from a numpy formula, collapse its branches, compute the fitness and print the formula
# tree2 = Tree.create_tree_from_np_formula("np.add(np.arctan(np.add(np.cos(np.add(x[0], -2.643791764058358)), np.absolute(np.add(x[0], -3.1193709482751033)))), np.add(np.add(np.add(np.cos(x[1]), np.cos(x[1])), np.add(np.cos(x[1]), 2.1089875923126202)), np.add(np.add(np.cos(x[1]), np.cos(x[1])), np.add(np.cos(x[1]), np.cos(x[1])))))")

# Tree.collapse_branch(tree2.root,0,force_collapse=True)
# tree2.compute_fitness(test="all")
# print(f"Fitness: {tree2.fitness}")
# tree2.add_drawing()

# print(tree2.to_np_formula())


### Tests


In [221]:
import numpy as np


#PASTE HERE THE sXXXXX.py FILE SO WE CAN TEST IT BEFORE SUBMITTING IT


# f0 is provided by the professor
def f0(x: np.ndarray) -> np.ndarray:
    """Constant predictor: ignores x and returns the scalar 0."""
    constant_prediction = 0
    return constant_prediction




def f1(x: np.ndarray) -> np.ndarray:
    """Formula for problem 1: the sine of the first input variable."""
    first_variable = x[0]
    return np.sin(first_variable)


def f2(x: np.ndarray) -> np.ndarray:
    """Constant predictor for problem 2: ignores x and returns the scalar 0 (presumably a placeholder)."""
    constant_prediction = 0
    return constant_prediction


def f3(x: np.ndarray) -> np.ndarray:
    """Constant predictor for problem 3: ignores x and returns the scalar 0 (presumably a placeholder)."""
    constant_prediction = 0
    return constant_prediction


def f4(x: np.ndarray) -> np.ndarray:
    """Formula for problem 4, built from x[0] and the cosine of x[1]."""
    cos_x1 = np.cos(x[1])
    # The three terms below keep the exact operation order of the evolved expression
    scaled_inverse = 0.23024387669493285 * (9.49261846513043 / cos_x1)
    shifted_inverse = (1.0918467832820689 / cos_x1) - -6.9747658112794575
    correction = (0.009416198631637944 * (9.671215081247784 / cos_x1)) * (x[0] - (0.2702890211737155 * cos_x1))
    return (cos_x1 * ((scaled_inverse + shifted_inverse) - correction))

def f5(x: np.ndarray) -> np.ndarray:
    """Formula for problem 5: scaled x[0] divided by the hyperbolic cosine of a shifted x[1]."""
    scaled_x0 = x[0] / -1.1029017263054506
    shift = 9.856509033734056 * -2.594225902466807
    return (scaled_x0 / np.cosh((shift + x[1])))


def f6(x: np.ndarray) -> np.ndarray:
    """Formula for problem 6: a linear combination of x[1] and x[0]."""
    x1_term = (x[1] / 0.6163310509098459) / 0.9575008504359739
    x0_term = x[0] / 1.4398424197751116
    return (x1_term - x0_term)


def f7(x: np.ndarray) -> np.ndarray: 
    """
    Formula for problem 7 (it is stored at index 7 of the `functions` list).
    Reads only x[0] and x[1]. The expression is presumably machine-generated by the
    symbolic regression run, so it is kept verbatim on a single line.
    """
    return  np.exp(np.square((0.19412533614260724 * (((x[1] * x[0]) * (1.2981600350564866 - np.sin(((0.7048732443752681 - (x[1] * x[0])) * -0.12545609726467288)))) + ((((0.9401730004516422 / np.power(0.9190484173090112, np.remainder(x[0], 0.17908034227130276))) * np.power(np.power(1.0075774564033242, np.remainder(x[0], -0.5453290513341242)), ((x[1] + -0.5453290513341242) * x[0]))) - np.absolute(np.arctan(np.arctan((x[0] - x[1]))))) * ((1.7743339625324417 * (0.5437591283769042 + np.power(1.0272695518587667, np.remainder(-1.646242327114337, x[1])))) + np.exp((np.power(0.9704417384263868, np.remainder(x[0], 0.9659013507711744)) + np.power(0.9561606006321369, (x[0] - x[1]))))))))))


def f8(x: np.ndarray) -> np.ndarray:
    """
    Formula for problem 8 (it is stored at index 8 of the `functions` list).
    Reads x[0] through x[5]. The expression is presumably machine-generated by the
    symbolic regression run, so it is kept verbatim on a single line.
    """
    return np.minimum(((((np.sinh(x[5]) * np.maximum(9.280652473985764, ((x[0] / x[0]) - (x[1] - 0.4059194895949467)))) * np.maximum(np.maximum(np.minimum(np.cosh(x[5]), 18.44238954070858), -1.9818022307528853), (np.minimum(17.6972179719256, np.square(x[5])) + np.minimum(np.square(x[5]), np.absolute(x[5]))))) + np.minimum((np.minimum(np.sinh(x[4]), 17.6972179719256) * 36.72752662020359), (np.minimum(36.33547306342092, (np.sinh(x[5]) + np.cosh(x[5]))) + ((np.sinh(x[4]) * -4.070986083664829) * 8.977972163360176)))) - np.minimum(np.maximum(np.remainder((np.maximum(0.4059194895949467, (x[3] - 0.7506252992942812)) + (np.remainder(x[4], -4.109412935321963) + 3.025150828938454)), np.minimum(np.minimum(np.sinh(x[4]), 2.6207832272206506), (np.minimum(-9.174199129628121, x[2]) + -17.86111661118901))), (np.maximum((np.remainder(x[4], -4.109412935321963) * 97.8102957631067), -82.98976582413408) + ((np.maximum(-9.174199129628121, x[3]) * 97.8102957631067) / ((5.980665508500083 / x[3]) * 1.0638269899324038)))), np.remainder(np.maximum((100.01994107697637 / (x[3] - 2.1332481808545545)), -0.09228238684307954), (np.minimum(36.33547306342092, (-4.109412935321963 * np.maximum(-9.174199129628121, x[3]))) * 17.361335511767116)))), ((((np.minimum((np.maximum(-9.174199129628121, x[3]) + np.sinh(x[5])), (np.cosh(x[5]) + -1.4254744449215995)) * (np.minimum(9.720508873329518, np.square(x[5])) + np.minimum(7.990198483701521, np.absolute(x[5])))) + (np.maximum((44.82781934259213 / np.absolute(x[5])), (np.absolute(x[5]) + 6.685971936727686)) * np.maximum((np.sinh(x[5]) + np.cos(x[5])), 1.0072334326500667))) * 8.680667755883558) - np.minimum(np.square(np.maximum((-1.4904791954484304 * np.sinh(x[5])), (np.minimum(17.6972179719256, np.cosh(x[5])) + np.minimum(np.sinh(x[4]), 23.467486856065708)))), np.square(np.minimum(np.minimum(-5.420044949989919, np.minimum(np.square(x[5]), np.square(x[5]))), (-33.91491388432617 + np.maximum(-4.067873225649736, (x[3] - 3.025150828938454))))))))


# Candidate formulas indexed by problem number (functions[i] is evaluated on problem i)
functions = [f0, f1, f2, f3, f4, f5, f6, f7, f8]

# Alternative formula kept for reference (not executed):
# np_form = "np.divide(np.divide(x[0],-1.1029017263054506),np.divide(np.cosh(np.add(np.multiply(9.856509033734056,-2.594225902466807),x[1])))"
# NumPy-style formula string to simplify: parse it into a tree, collapse its branches,
# and print the result with standard operators
np_form = "np.multiply(np.cos(x[1]), np.subtract(np.add(np.multiply(np.remainder(-8.239662110483511, np.power(2.3116869488022758, -1.242914485095067)), np.divide(9.49261846513043, np.cos(x[1]))), np.subtract(np.divide(1.0918467832820689, np.cos(x[1])), np.minimum(-0.07343659820862669, -6.9747658112794575))), np.multiply(np.multiply(np.multiply(np.exp(-6.570662608077371), 6.721684507957864), np.divide(9.671215081247784, np.cos(x[1]))), np.subtract(x[0], np.multiply(np.exp(-1.3082634437139973), np.cos(x[1]))))))"
tree = Tree.create_tree_from_np_formula(np_form)
# The value returned by collapse_branch is used as the root of a new tree below
# (presumably the collapsed root node - confirm in tree.py)
np_form_collapsed = Tree.collapse_branch(tree.root,0,force_collapse=True)
# Wrap the collapsed root in an empty tree so it can be converted back to a formula string
new_tree = Tree("full",empty=True)
new_tree.root = np_form_collapsed
np_form_final = new_tree.to_np_formula(use_std_operators=True)
print(np_form_final)


# Score every formula f1..f8 on the full dataset of its own problem
for i in range(1, 9):
    # Load the data (the archive is closed as soon as both arrays are read)
    with np.load(f'../data/problem_{i}.npz') as problem:
        x, y = problem['x'], problem['y']

    # Evaluate the formula and compute the mean squared error against the targets
    y_pred = functions[i](x)
    residuals = y - y_pred
    mse = np.mean(np.square(residuals))
    print("Problem", i, "MSE:", mse)




(np.cos(x[1]) * (((0.23024387669493285 * (9.49261846513043 / np.cos(x[1]))) + ((1.0918467832820689 / np.cos(x[1])) - -6.9747658112794575)) - ((0.009416198631637944 * (9.671215081247784 / np.cos(x[1]))) * (x[0] - (0.2702890211737155 * np.cos(x[1]))))))
Problem 1 MSE: 7.125940794232773e-34
Problem 2 MSE: 29616986382722.125
Problem 3 MSE: 3011.296290597625
Problem 4 MSE: 3.7640592662693526e-06
Problem 5 MSE: 1.967033114810461e-18
Problem 6 MSE: 3.0878043331576044e-14
Problem 7 MSE: 30.341754456981853
Problem 8 MSE: 6842.140380297294
