#### Import required libraries


In [147]:
import numpy as np
import random
import math
from sklearn.model_selection import train_test_split

#### Loading the data as x and y

In [190]:
# Load the dataset. NpzFile keeps the archive open until it is closed, so the
# arrays are read inside a context manager that releases the file handle.
with np.load('../data/problem_6.npz') as data:
    x_real = data['x']
    y_real = data['y']
print(f"x_real shape: {x_real.shape}")
print(f"y_real shape: {y_real.shape}")

x_real shape: (2, 5000)
y_real shape: (5000,)


#### Splitting the data into two sets


In [191]:
# Hold out 20% of the samples for testing. x_real is transposed so that the
# splitter receives one sample per row; the fixed seed keeps the split
# reproducible between runs.
x_train, x_test, y_train, y_test = train_test_split(
    x_real.T,
    y_real,
    test_size=0.2,
    random_state=42,
)

#### defining constants, functions and variables

In [192]:
# Binary operators that generated formulas may use.
OPERATORS = [
    '+',
    '-',
    '*',
    '/',
]
# Names of the unary functions that generated formulas may use.
FUNCTIONS = [
    'sin',
    'cos',
    'log',
    'tan',
    'exp',
    'sqrt',
]
# One variable name per row of x_real: "x0", "x1", ...
VARIABLES = ["x" + str(index) for index in range(x_real.shape[0])]
# FUNCTIONS = [np.sin, np.cos, np.log, np.tan, np.exp, np.sqrt]
def random_constant():
    """Return a random constant drawn uniformly from [-5, 5], rounded to 2 decimals."""
    raw_value = random.uniform(-5, 5)
    return round(raw_value, 2)

#### Generate Random formula

In [213]:
def generate_random_expr(max_depth, prob_function=0.7):
    """
    Build a random formula string that uses every entry of VARIABLES.

    The variables are first shuffled and folded pairwise into a single
    parenthesised expression with random binary operators. That core is then
    wrapped ``max_depth`` times; each wrapping layer is either a unary
    function call (with probability ``prob_function``) or a binary operation
    with a randomly chosen variable or constant on one side.

    Parameters:
        max_depth: number of wrapping layers added around the core expression.
        prob_function: probability that a layer is a function call rather
            than a binary operation.

    Returns:
        The generated formula as a string.
    """
    # Fold the shuffled variables into one expression containing all of them.
    pending = VARIABLES.copy()
    random.shuffle(pending)
    while len(pending) > 1:
        lhs = pending.pop()
        rhs = pending.pop()
        operator = random.choice(OPERATORS)
        pending.append(f"({lhs} {operator} {rhs})")
    core = pending[0]

    def wrap(levels_left):
        # Innermost level: the core expression itself.
        if levels_left == 0:
            return core

        if random.random() < prob_function:
            # Function layer.
            func_name = random.choice(FUNCTIONS)
            inner = wrap(levels_left - 1)
            return f"{func_name}({inner})"

        # Operator layer: decide operator, extra term and side before
        # recursing, so the sequence of random draws stays fixed.
        operator = random.choice(OPERATORS)
        extra_term = random.choice(VARIABLES + [random_constant()])
        term_goes_right = random.random() < 0.5
        inner = wrap(levels_left - 1)
        if term_goes_right:
            return f"({inner} {operator} {extra_term})"
        return f"({extra_term} {operator} {inner})"

    return wrap(max_depth)

#### Transform and evaluation of the formula

In [195]:

# Safe versions of functions (handles edge cases like negative inputs to log and sqrt)
def safe_log(x):
    """Return the natural log of x, or None when x is zero or negative."""
    # The comparison is a plain truth test, so x is expected to be a scalar.
    return None if x <= 0 else np.log(x)

def safe_sqrt(x):
    """Return the square root of x, or None when x is negative."""
    # The comparison is a plain truth test, so x is expected to be a scalar.
    return None if x < 0 else np.sqrt(x)

# Thin wrappers around the remaining numpy functions (sin, cos, tan, exp).
def safe_sin(x):
    """Return the sine of x; no input checking is performed."""
    sine_value = np.sin(x)
    return sine_value

def safe_cos(x):
    """Return the cosine of x; no input checking is performed."""
    cosine_value = np.cos(x)
    return cosine_value

def safe_tan(x):
    """Return the tangent of x; no input checking is performed."""
    tangent_value = np.tan(x)
    return tangent_value

def safe_exp(x):
    """Return e raised to the power x; no input checking is performed."""
    exp_value = np.exp(x)
    return exp_value

# Map the bare function names used in generated formulas to evaluable names.
def transform_formula(formula):
    """
    Rewrite bare function names in a formula to the names used at eval time.

    ``log``, ``sqrt`` and ``tan`` become ``safe_log``, ``safe_sqrt`` and
    ``safe_tan``; ``sin``, ``cos`` and ``exp`` become ``np.sin``, ``np.cos``
    and ``np.exp``.

    Only whole, unqualified names are rewritten. A name that is part of a
    longer identifier or that follows a dot (``safe_log``, ``np.sin``) is left
    alone, so transforming an already-transformed formula is a no-op.

    Parameters:
        formula: the formula string. Any non-string value is returned
            unchanged.

    Returns:
        The rewritten formula string, or the input itself if it is not a
        string.
    """
    import re  # local import keeps this block self-contained

    if not isinstance(formula, str):
        return formula

    replacements = {
        "log": "safe_log",
        "sqrt": "safe_sqrt",
        "sin": "np.sin",
        "cos": "np.cos",
        "tan": "safe_tan",
        "exp": "np.exp",
    }
    # Lookbehind rejects a preceding word character or dot; lookahead rejects
    # a following word character. Together they match standalone names only.
    pattern = r"(?<![\w.])(" + "|".join(replacements) + r")(?!\w)"
    return re.sub(pattern, lambda match: replacements[match.group(1)], formula)

def evaluate_expr(expr, x_values):
    """
    Evaluate a transformed formula string for one sample.

    The i-th entry of ``x_values`` is bound to the name ``x{i}`` (``x0``,
    ``x1``, ...), so formulas over any number of input variables can be
    evaluated. ``np``, ``safe_log``, ``safe_sqrt`` and ``safe_tan`` are also
    made available to the formula.

    Parameters:
        expr: formula string, as produced by transform_formula.
        x_values: sequence holding the variable values of a single sample.

    Returns:
        The value of the formula. This can be None when the outermost call is
        safe_log or safe_sqrt on an invalid input. ``np.nan`` is returned when
        evaluation raises any exception.
    """
    # Formulas are generated with spaces around operators; strip them first.
    if isinstance(expr, str):
        expr = expr.replace(" ", "")

    try:
        namespace = {f"x{index}": value for index, value in enumerate(x_values)}
        namespace.update({
            "np": np,
            "safe_log": safe_log,
            "safe_sqrt": safe_sqrt,
            "safe_tan": safe_tan,
        })
        # NOTE(review): eval runs arbitrary Python. That is acceptable only
        # while formulas come from this program's own generator and mutation
        # operators; never pass externally supplied strings here.
        return eval(expr, namespace)
    except Exception:
        # Deliberate best-effort: an invalid formula is scored as NaN so the
        # caller can filter it out instead of aborting the run.
        return np.nan



#### Fitness computation (MSE)

In [196]:


def compute_fitness(expr, x_values, y_values, complexity_weight=0.01):
    """
    Compute the mean squared error of a formula over a set of samples.

    Parameters:
        expr: transformed formula string accepted by evaluate_expr.
        x_values: iterable of samples; each sample is passed to evaluate_expr
            as the variable values for one prediction.
        y_values: target values, one per sample.
        complexity_weight: accepted for interface compatibility; it is not
            used in the score at present.

    Returns:
        The mean squared error between predictions and ``y_values``. The
        result is inf or nan when any prediction is invalid or non-finite.
    """
    y_pred = []
    # Iterate the samples that were passed in, not a module-level dataset.
    for sample in x_values:
        prediction = evaluate_expr(expr, sample)
        # evaluate_expr can yield None for an invalid log/sqrt; score it as inf.
        y_pred.append(float('inf') if prediction is None else prediction)
    y_pred = np.array(y_pred)
    return np.mean((y_pred - y_values) ** 2)


### fitness filter

In [197]:
def filter_valid_fitness(fitness_scores, population):
    """
    Keep only the individuals whose fitness is a finite number.

    Parameters:
        fitness_scores: fitness value of each individual, in population order.
        population: the individuals, indexable by position.

    Returns:
        A tuple ``(valid_population, valid_fitness_scores)`` holding the
        surviving individuals and their scores in the original order.
    """
    kept_individuals, kept_scores = [], []

    for position, score in enumerate(fitness_scores):
        # NaN and +/-inf mark formulas that failed to evaluate cleanly.
        if not np.isfinite(score):
            continue
        kept_individuals.append(population[position])
        kept_scores.append(score)

    return kept_individuals, kept_scores

### Selection


##### Tournament


In [198]:
def tournament_selection(population, fitness_scores, num_parents, tournament_size=3):
    """
    Pick parents by repeated tournaments; the lowest fitness wins each one.

    Parameters:
        population: candidate individuals.
        fitness_scores: fitness of each individual, aligned with population.
        num_parents: number of tournaments to run.
        tournament_size: contenders sampled (without replacement) for each
            tournament; capped at the population size.

    Returns:
        List of winners, one per tournament. An individual can appear more
        than once. The list is empty when the population is empty.
    """
    pool_size = len(population)
    # A tournament cannot draw more contenders than there are individuals.
    tournament_size = min(tournament_size, pool_size)

    winners = []
    for _ in range(num_parents):
        contenders = random.sample(range(pool_size), tournament_size)

        if not contenders:
            print("Error: Tournament scores list is empty.")
            continue

        # The first contender holding the minimum score wins the round.
        champion = min(contenders, key=lambda idx: fitness_scores[idx])
        winners.append(population[champion])

    return winners


In [199]:
def run_selection(population_size, fitness_scores, population):
    """
    Select parents for the next step with tournament selection.

    Individuals whose fitness is not finite are discarded first. Half of
    ``population_size`` parents are then drawn from the remaining ones.

    Parameters:
        population_size: nominal population size; ``population_size // 2``
            parents are selected.
        fitness_scores: fitness of each individual, aligned with population.
        population: candidate individuals.

    Returns:
        List of selected parents. The list is empty when no individual has a
        finite fitness, so callers can always iterate or concatenate the
        result.
    """
    valid_population, valid_fitness_scores = filter_valid_fitness(fitness_scores, population)

    if not valid_population:
        print("No valid fitness scores. Exiting the selection process.")
        return []

    num_parents = population_size // 2  # Half of the population is selected

    if len(valid_population) < 3:
        print("Not enough individuals for tournament selection. Reducing tournament size to match population.")

    # Default tournament size is 3, shrunk when fewer individuals survive.
    tournament_size = min(3, len(valid_population))
    return tournament_selection(
        valid_population,
        valid_fitness_scores,
        num_parents,
        tournament_size=tournament_size,
    )

#### Crossover of 2 parents

In [200]:
import ast
import random


def parse_formula_to_tree(formula):
    """
    Turn a formula string into its AST.

    The string is parsed as a single expression and the root expression node
    (without the enclosing ast.Expression wrapper) is returned.
    """
    wrapper = ast.parse(formula, mode='eval')
    return wrapper.body

def tree_to_formula(tree):
    """
    Render an AST node back into formula source text.
    """
    formula_text = ast.unparse(tree)
    return formula_text


In [201]:
def _child_subtrees(node):
    """Return the operand subtrees of a node; empty for leaves."""
    if isinstance(node, ast.BinOp):
        return [node.left, node.right]
    if isinstance(node, ast.UnaryOp):
        return [node.operand]
    if isinstance(node, ast.Call):
        # Function applications such as sin(...) carry their operands in args.
        return list(node.args)
    return []


def select_random_subtree(tree):
    """
    Randomly select a subtree from the given AST node.

    At every node that has operands (binary operation, unary operation or
    function call) the walk stops with probability 0.5 and otherwise
    continues into a randomly chosen operand. Leaves are returned directly.

    Parameters:
        tree: root AST node to choose from.

    Returns:
        The selected node, which is an object inside ``tree`` (not a copy).
    """
    children = _child_subtrees(tree)
    if not children or random.random() < 0.5:
        return tree
    return select_random_subtree(random.choice(children))


def replace_subtree(tree, target, replacement):
    """
    Replace a target subtree in the tree with the replacement subtree.

    The target is located by object identity. Binary operations, unary
    operations and the positional arguments of function calls are searched.
    ``tree`` is modified in place.

    Parameters:
        tree: root AST node to search.
        target: the node object to replace.
        replacement: the node to put in its place.

    Returns:
        ``replacement`` when ``tree`` itself is the target, otherwise
        ``tree``.
    """
    if tree is target:
        return replacement
    if isinstance(tree, ast.BinOp):
        tree.left = replace_subtree(tree.left, target, replacement)
        tree.right = replace_subtree(tree.right, target, replacement)
    elif isinstance(tree, ast.UnaryOp):
        tree.operand = replace_subtree(tree.operand, target, replacement)
    elif isinstance(tree, ast.Call):
        tree.args = [replace_subtree(arg, target, replacement) for arg in tree.args]
    return tree

def crossover(parent1, parent2):
    """
    Produce two offspring formulas by swapping a random subtree of each parent.

    Both parents are parsed into ASTs, one subtree is picked in each, and the
    picked subtrees are exchanged. The resulting trees are rendered back to
    formula strings.

    Returns:
        A tuple ``(offspring1, offspring2)`` of formula strings.
    """
    first_tree = parse_formula_to_tree(parent1)
    second_tree = parse_formula_to_tree(parent2)

    # Pick the exchange points; first parent first, so the sequence of random
    # draws is fixed.
    piece_from_first = select_random_subtree(first_tree)
    piece_from_second = select_random_subtree(second_tree)

    # Exchange the pieces.
    child_tree_a = replace_subtree(first_tree, piece_from_first, piece_from_second)
    child_tree_b = replace_subtree(second_tree, piece_from_second, piece_from_first)

    return tree_to_formula(child_tree_a), tree_to_formula(child_tree_b)


In [202]:
def crossover_population(population, num_to_select):
    """
    Sample parents from the population and cross them over in pairs.

    ``num_to_select`` parents are drawn without replacement. Consecutive
    parents are paired; when the count is odd the last parent is left out.
    Every pair contributes two children.

    Returns:
        List of offspring formulas.
    """
    chosen = random.sample(population, num_to_select)

    # Pairing even-indexed with odd-indexed parents drops an unpaired last one.
    children = []
    for first_parent, second_parent in zip(chosen[0::2], chosen[1::2]):
        first_child, second_child = crossover(first_parent, second_parent)
        children += [first_child, second_child]

    return children


#### Mutation in the formula

In [203]:
import random
import ast

def mutate_formula(formula, mutation_rate=0.3):
    """
    Mutate a formula string with probability ``mutation_rate``.

    When the random draw exceeds ``mutation_rate``, or the formula cannot be
    parsed, the formula is returned unchanged. Otherwise it is parsed to an
    AST, passed through mutate_tree and rendered back to a string.
    """
    # Most calls leave the formula untouched.
    if random.random() > mutation_rate:
        return formula

    try:
        parsed = ast.parse(formula, mode='eval')
    except Exception as e:
        print(f"Error parsing formula '{formula}' to AST: {e}")
        return formula

    return ast.unparse(mutate_tree(parsed, mutation_rate, VARIABLES))

def mutate_tree(tree, mutation_rate, variables):
    """
    Mutate an AST in place and return it.

    Each mutable node is mutated independently with probability
    ``mutation_rate``:

    * constants get a new value drawn uniformly from [-10, 10];
    * binary operations get a new operator from +, -, *, /;
    * function calls get a new function name from sin, cos, tan, log, exp,
      sqrt;
    * variable names get a new name from ``variables``.

    The name node that is the function of a call is never treated as a
    variable, so ``sin(x0)`` cannot turn into ``x1(x0)``. If no node was
    mutated, one mutable node is chosen at random and mutated, provided the
    tree has any.

    Parameters:
        tree: AST to mutate (an ast.Expression or any expression node).
        mutation_rate: per-node mutation probability.
        variables: variable names to choose from. When None or empty,
            variable nodes are not mutated.

    Returns:
        The same ``tree`` object, after mutation.
    """
    function_names = ["sin", "cos", "tan", "log", "exp", "sqrt"]
    operator_types = [ast.Add, ast.Sub, ast.Mult, ast.Div]
    can_mutate_variables = variables is not None and len(variables) > 0

    nodes = list(ast.walk(tree))
    # ast.walk also yields the Name node used as a call's function; remember
    # those so they are not renamed as if they were variables.
    callee_ids = {id(node.func) for node in nodes if isinstance(node, ast.Call)}

    def is_mutable(node):
        # Only nodes that apply_mutation can actually change are candidates.
        if isinstance(node, (ast.Constant, ast.BinOp)):
            return True
        if isinstance(node, ast.Call):
            return isinstance(node.func, ast.Name)
        if isinstance(node, ast.Name):
            return can_mutate_variables and id(node) not in callee_ids
        return False

    def apply_mutation(node):
        # Change one node according to its type.
        if isinstance(node, ast.Constant):
            node.value = random.uniform(-10, 10)
        elif isinstance(node, ast.BinOp):
            node.op = random.choice(operator_types)()
        elif isinstance(node, ast.Call):
            node.func.id = random.choice(function_names)
        elif isinstance(node, ast.Name):
            node.id = random.choice(variables)

    candidates = [node for node in nodes if is_mutable(node)]

    mutated_any = False
    for node in candidates:
        if random.random() < mutation_rate:
            apply_mutation(node)
            mutated_any = True

    # Force one mutation, restricted to nodes that can really be changed.
    if not mutated_any and candidates:
        apply_mutation(random.choice(candidates))

    return tree



In [None]:
# Function to mutate a formula by incorporating a random formula
def mutate_with_random_formula(best_formula, random_formula):
    """
    Mutate the best formula by adding one additive term of a random formula.

    ``random_formula`` is split at ``+`` signs that sit outside all
    parentheses, so each component is a balanced sub-expression. One
    component is chosen at random and appended to ``best_formula`` with
    `` + ``.

    Parameters:
        best_formula: the formula to extend.
        random_formula: the formula to take a component from.

    Returns:
        The extended formula string, or ``best_formula`` unchanged when
        ``random_formula`` has no non-blank component.
    """
    # Split on '+' only at parenthesis depth zero; a plain str.split would cut
    # through nested sub-expressions and leave unbalanced fragments.
    components = []
    depth = 0
    start = 0
    for position, char in enumerate(random_formula):
        if char == "(":
            depth += 1
        elif char == ")":
            depth -= 1
        elif char == "+" and depth == 0:
            components.append(random_formula[start:position])
            start = position + 1
    components.append(random_formula[start:])

    candidates = [part.strip() for part in components if part.strip()]
    if not candidates:
        return best_formula

    random_component = random.choice(candidates)
    return f"{best_formula} + {random_component}"

#### Genetic algorithm

In [215]:
# Driver for the genetic algorithm: generate formulas, score them, select
# parents, mutate them, and select again from parents plus mutants.
#
# Parameters
num_generations = 1
population_size = 20
# NOTE(review): num_to_select is not referenced by the active code below; only
# the disabled crossover step would need it.
num_to_select = 10  # Half of the population is selected for the next generation
mutation_rate = 0.3



# Print each formula
# for i, formula in enumerate(population):
#     print(f"Formula {i + 1}: {formula}")


# Best formula and fitness score across all generations.
# NOTE(review): the active code never updates these two; the bookkeeping that
# did so is commented out further down.
best_formula = None
best_fitness = float('inf')  # Set to infinity for minimization, or -inf for maximization

# Best fitness score per generation, for logging and debugging.
# NOTE(review): nothing is appended to this list by the active code.
best_fitness_scores_per_generation = []

for generation in range(num_generations):
    print(f"\nGeneration {generation + 1}/{num_generations}")

    # Initial population
    # NOTE(review): the population is regenerated from scratch in every
    # iteration, so with num_generations > 1 the parents selected at the end
    # of one generation are not carried into the next.
    population = [generate_random_expr(max_depth=4) for _ in range(population_size)]

    # 1. Evaluate fitness scores for the current population. Formulas are
    #    transformed to their evaluable form first; `population` keeps the
    #    untransformed strings.
    transformed_formula = [transform_formula(formula) for formula in population]
    fitness_scores = [compute_fitness(formula, x_train, y_train) for formula in transformed_formula]

    # Print the fitness scores (optional, for debugging)
    for i, (formula, score) in enumerate(zip(population, fitness_scores)):
        print(f"Formula {i + 1}: {formula} | Fitness = {score}")

    # Track the best fitness score in this generation (optional)
    # best_fitness_scores_per_generation.append(min(fitness_scores))  # Assuming lower fitness is better (in case of minimization)
 
    # 2. Select tournament parents from the untransformed population.
    tournament_parents = run_selection(population_size, fitness_scores, population)
    # Print tournament parents
    # for i, formula in enumerate(tournament_parents):
    #     print(f"tournment selected parent Formula {i + 1}: {formula}")
    # Fitness is recomputed for the selected parents rather than looked up in
    # fitness_scores.
    tournament_fitness_scores = [compute_fitness(transform_formula(formula), x_train, y_train) for formula in tournament_parents]
    
    for i, (formula, score) in enumerate(zip(tournament_parents, tournament_fitness_scores)):
        print(f"Tournment Formula {i + 1}: {formula} | Fitness = {score}")

    # 3. Crossover to generate offspring (currently disabled).
    # offspring = crossover_population(tournament_parents, int(len(tournament_parents)))
    # #fitness of offsprings
    # offspring_fitness_scores = [compute_fitness(transform_formula(formula), x_train, y_train) for formula in offspring]
    # # Print offspring after crossover (optional)
    # print("\nOffspring after crossover:")
    # for i, (formula, score) in enumerate(zip(offspring, offspring_fitness_scores)):
    #     print(f"offspring Formula {i + 1}: {formula} | Fitness = {score}")

    # 4. Mutation. With crossover disabled, mutation is applied directly to
    #    the selected parents.
    mutated_offspring = [mutate_formula(child, mutation_rate) for child in tournament_parents]

    mutate_fitness_scores = [compute_fitness(transform_formula(formula), x_train, y_train) for formula in mutated_offspring]
    # Print the mutated formulas with their fitness (optional)
    print("mutation:")
    for i, (formula, score) in enumerate(zip(mutated_offspring, mutate_fitness_scores)):
        print(f"Mutate Formula {i + 1}: {formula} | Fitness = {score}")

#     # 5. **Evaluate Fitness for Mutated Offspring**
#     mutated_fitness_scores = [compute_fitness(transform_formula(formula), x_train, y_train) for formula in mutated_offspring]
#     # print("\nFitness scores for mutated offspring:")
#     # for i, (formula, score) in enumerate(zip(mutated_offspring, mutated_fitness_scores)):
#     #     print(f"Mutated Child {i}: Fitness = {score}")

    # 6. Combine parents and mutated offspring, keeping the two lists aligned.
    combined_population = tournament_parents + mutated_offspring
    combined_fitness_scores = tournament_fitness_scores + mutate_fitness_scores

    print("\nCombined population:")
    for i, (formula, score) in enumerate(zip(combined_population, combined_fitness_scores)):
        print(f"combined Formula {i + 1}: {formula} | Fitness = {score}")

    # Sanity check: every individual must have exactly one score.
    assert len(combined_population) == len(combined_fitness_scores), "Population and fitness scores do not match in length!"

    # 7. Select the individuals for the next generation from the combined pool.
    next_generation_parents = run_selection(population_size, combined_fitness_scores, combined_population)

    # Fitness of the next-generation parents (recomputed, for printing only).
    nextgen_fitness_scores = [compute_fitness(transform_formula(formula), x_train, y_train) for formula in next_generation_parents]
    print("\nSelected parents for the next generation:")
    for i, (formula, score) in enumerate(zip(next_generation_parents, nextgen_fitness_scores)):
        print(f"next Formula {i + 1}: {formula} | Fitness = {score}")

    # Mutate the best formula from generation 1 using a random formula from generation 2
    # mutated_formula = mutate_with_random_formula(gen1_best_formula, random_formula_gen2)

#     # 8. **Generate the Next Generation Population**
#     offspring_next_gen = crossover_population(next_generation_parents, population_size // 2)

#     # print("\nOffspring after crossover (next generation):")
#     # for i, child in enumerate(offspring_next_gen, start=1):
#     #     print(f"Child {i}: {child}")

#     # 9. **Mutation on the Next Generation**
#     mutated_next_gen = [mutate_formula(child, mutation_rate) for child in offspring_next_gen]

#     # print("\nOffspring after crossover and mutation (next generation):")
#     # for i, child in enumerate(mutated_next_gen, start=1):
#     #     print(f"Child {i}: {child}")

#     # **Update Population for the Next Generation**
#     population = mutated_next_gen  # Update population to the next generation

#     # Combine fitness scores of parents and offspring
#     combined_fitness_scores_all = fitness_scores + mutated_fitness_scores
#     combined_population_all = population + mutated_offspring
#     combined_population_all=transform_formula(combined_population_all)
#     # Find the index of the best fitness score
#     current_best_fitness = min(combined_fitness_scores_all)

#     # If the current best fitness is better than the previous best, update
#     if current_best_fitness < best_fitness:
#         best_fitness = current_best_fitness
#         # Find the index of the formula with the best fitness score
#         best_index = combined_fitness_scores_all.index(best_fitness)
#         best_formula = combined_population_all[best_index]

# # After all generations, track the best fitness score seen across all generations
# print("\nBest formula across all generations:")
# print(f"Best Fitness = {best_fitness}")
# print(f"Best Formula = {best_formula}")

# # If you'd like to see the best fitness score across generations
# print("\nBest fitness scores across generations:")
# for i, score in enumerate(best_fitness_scores_per_generation):
#     print(f"Generation {i + 1}: Best Fitness = {score}")



Generation 1/1


  mse = np.mean((y_pred - y_values) ** 2)


Formula 1: sin(exp(exp(((x0 / x1) / 4.73)))) | Fitness = nan
Formula 2: log(log((x1 * sqrt((x1 * x0))))) | Fitness = nan
Formula 3: exp(exp(tan(exp((x1 * x0))))) | Fitness = inf
Formula 4: (x0 / tan((sin((x0 / x1)) / x1))) | Fitness = 62028.213620174654
Formula 5: log(cos(sin(log((x1 / x0))))) | Fitness = nan
Formula 6: cos(log(cos(log((x0 / x1))))) | Fitness = nan
Formula 7: sqrt(cos(cos(log((x1 + x0))))) | Fitness = nan
Formula 8: (x0 * log(log(exp((x1 / x0))))) | Fitness = nan
Formula 9: log(log(sqrt(log((x1 - x0))))) | Fitness = nan
Formula 10: exp(exp((tan((x1 - x0)) - x0))) | Fitness = inf
Formula 11: log((x1 + sin(((x1 * x0) + x1)))) | Fitness = inf
Formula 12: cos(sin(sin(tan((x1 * x0))))) | Fitness = 33.47834032704054
Formula 13: (x1 - sqrt(tan(exp((x1 * x0))))) | Fitness = nan
Formula 14: (sqrt(sin(((x0 * x1) / x0))) * x1) | Fitness = nan
Formula 15: sin(cos(sin(cos((x1 * x0))))) | Fitness = 32.41775560695773
Formula 16: log(cos(sin((x1 * (x1 / x0))))) | Fitness = 24.84012315

In [208]:
# def run_evolution(num_generations, population_size, mutation_rate, tournament_size, max_depth):
#     best_formula = None
#     best_fitness = float('inf')  # Initialize with infinity for minimization
#     best_fitness_scores_per_generation = []

#     for generation in range(num_generations):
#         print(f"\nGeneration {generation + 1}/{num_generations}")

#         # 1. **Initialize Population** with a mix of new random formulas and selected parents
#         population = [generate_random_expr(max_depth) for _ in range(population_size)]

#         # 2. **Evaluate Fitness Scores** for Current Population
#         transformed_population = [transform_formula(formula) for formula in population]
#         fitness_scores = [compute_fitness(formula, x_train, y_train) for formula in transformed_population]

#         # Print the formulas and their fitness scores
#         for i, (formula, score) in enumerate(zip(population, fitness_scores)):
#             print(f"Formula {i + 1}: {formula} | Fitness = {score}")

#         # 3. **Select Parents for Crossover** using tournament selection
#         tournament_parents = run_selection(population_size, fitness_scores, population)

#         # Print the tournament selected parents
#         tournament_fitness_scores = [compute_fitness(transform_formula(formula), x_train, y_train) for formula in tournament_parents]
#         for i, (formula, score) in enumerate(zip(tournament_parents, tournament_fitness_scores)):
#             print(f"Tournament Formula {i + 1}: {formula} | Fitness = {score}")

#         # 4. **Crossover to Generate Offspring**
#         offspring = crossover_population(tournament_parents, len(tournament_parents) // 2)

#         # 5. **Mutation on the Offspring**
#         mutated_offspring = [mutate_formula(child, mutation_rate) for child in offspring]

#         # Print mutated offspring
#         mutate_fitness_scores = [compute_fitness(transform_formula(formula), x_train, y_train) for formula in mutated_offspring]
#         print("Mutated Offspring:")
#         for i, (formula, score) in enumerate(zip(mutated_offspring, mutate_fitness_scores)):
#             print(f"Mutated Formula {i + 1}: {formula} | Fitness = {score}")

#         # 6. **Generate New Random Formulas** for diversity
#         num_random = population_size // 4  # You can adjust how many new random formulas you want
#         new_random_formulas = [generate_random_expr(max_depth) for _ in range(num_random)]

#         # 7. **Combine Parents, Offspring, and New Random Formulas**
#         combined_population = tournament_parents + mutated_offspring + new_random_formulas

#         # 8. **Evaluate Fitness Scores for the Combined Population**
#         combined_transformed_population = [transform_formula(formula) for formula in combined_population]
#         combined_fitness_scores = [compute_fitness(formula, x_train, y_train) for formula in combined_transformed_population]

#         # Print combined population with fitness scores
#         print("\nCombined Population:")
#         for i, (formula, score) in enumerate(zip(combined_population, combined_fitness_scores)):
#             print(f"Formula {i + 1}: {formula} | Fitness = {score}")

#         # 9. **Select the Best Individuals for the Next Generation**
#         next_generation_parents = run_selection(population_size, combined_fitness_scores, combined_population)

#         # Track best fitness in this generation
#         nextgen_fitness_scores = [compute_fitness(transform_formula(formula), x_train, y_train) for formula in next_generation_parents]
#         print("\nNext Generation Selected Parents:")
#         for i, (formula, score) in enumerate(zip(next_generation_parents, nextgen_fitness_scores)):
#             print(f"Next Gen Formula {i + 1}: {formula} | Fitness = {score}")

#         # **Update the Best Formula Across Generations**
#         current_best_fitness = min(nextgen_fitness_scores)
#         if current_best_fitness < best_fitness:
#             best_fitness = current_best_fitness
#             best_formula = next_generation_parents[nextgen_fitness_scores.index(best_fitness)]

#         # Track the best fitness per generation
#         best_fitness_scores_per_generation.append(best_fitness)

#     print("\nBest Formula Across All Generations:")
#     print(f"Best Fitness = {best_fitness}")
#     print(f"Best Formula = {best_formula}")
    
#     # Return the best formula after all generations
#     return best_formula

# # Run the evolutionary process
# best_formula = run_evolution(num_generations=50, population_size=20, mutation_rate=0.3, tournament_size=3, max_depth=4)
