In [1]:
import numpy 
import pandas 
import sympy as sp
import scipy 
import random
import copy
import tqdm

In [None]:
class GP:
    def __init__(self, data: list):
        """
        :param data: variables and last one is answer 
        """
        self.data = data
        self.max_variables = len(data[0]) - 1
        self.functions = ["+", "-", "/", "*", "cos", "sin", "tan", "max", "min", "log", "exp", "%", "**", "sqrt", "abs", "floor", "ceil"]
        self.mutation_operations = ["add_variable {number}", "add function", "add constant"]
        self.unary_functions = ["cos", "sin", "tan", "log", "exp", "sqrt", "abs", "floor", "ceil"]
        
    def create_tree(self, max_depth=0):
        """Randomly create a tree."""
        if max_depth == 0:
            if random.random() > 0.5:
                return f"x{random.randint(1, self.max_variables)}"
            else:
                return random.uniform(-10, 10)  
        else:
            func = random.choice(self.functions)
            if func in self.unary_functions:
                return [func, self.create_tree(max_depth - 1)]
            else:
                return [func, self.create_tree(max_depth - 1), self.create_tree(max_depth - 1)]
    
    def tree_to_sympy(self, tree):
        """Convert tree to sympy expression."""
        if isinstance(tree, str):  
            return sp.Symbol(tree)
        elif isinstance(tree, (int, float)):  
            return sp.Float(tree)
        elif isinstance(tree, list):  
            func = tree[0]
            args = [self.tree_to_sympy(arg) for arg in tree[1:]]
            
            operator_map = {
                "+": sp.Add,
                "*": sp.Mul,
                "**": sp.Pow,
                "%": sp.Mod,
                "max": sp.Max,
                "min": sp.Min
            }
            
            if func in operator_map:
                return operator_map[func](*args)
            elif func == "-":
                return sp.Add(args[0], -args[1])  
            elif func == "/":
                return sp.Mul(args[0], sp.Pow(args[1], -1))  

            return getattr(sp, func)(*args) if hasattr(sp, func) else sp.Function(func)(*args)

    def is_valid_expression(self, sympy_expr):
        """Check if a sympy expression is valid."""
        try:
            test_values = {f"x{i}": 1 for i in range(1, self.max_variables + 1)}
            value = sympy_expr.subs(test_values).evalf()
            return not value.has(sp.S.ComplexInfinity) and not value.is_infinite
        except Exception:
            return False  


    def evaluate_tree(self, tree):
        """Evaluate the tree across the dataset."""
        try:
            sympy_expr = self.tree_to_sympy(tree)
            if not self.is_valid_expression(sympy_expr):
                return float('inf')
            f = sp.lambdify([f"x{i}" for i in range(1, self.max_variables + 1)], sp.sympify(sympy_expr), "numpy")
            scores = []
            for row in self.data:
                inputs, target = row[:-1], row[-1]
                try:
                    prediction = f(*inputs)
                    if numpy.isnan(prediction):
                        raise Exception
                    scores.append(abs(prediction - target))  
                except Exception:
                    scores.append(float('inf'))  
            return numpy.mean(scores)
        except Exception:
            return float('inf')  

    
    def mutate_tree(self, tree):
        """Mutate the tree."""
        func = random.choice(self.functions)
        if(func in self.unary_functions):
            return [func, tree]
        
        mutation_type = random.choice(["add_variable", "add_constant"])
        addition = f"x{random.randint(1, self.max_variables)}" if mutation_type == "add_variable" else random.uniform(-10, 10)
        return [func, tree, addition]

    def crossover_trees(self, tree1, tree2):
        """Perform crossover between two trees by swapping random subtrees."""
        def select_random_subtree(tree, path=None):
            """Randomly select a subtree in the tree."""
            if path is None:
                path = []
            if not isinstance(tree, list):
                return tree, path  
            if random.random() < 0.5 or len(tree) == 1:  
                return tree, path
            else:
                child_index = random.randint(0, len(tree) - 1)
                return select_random_subtree(tree[child_index], path + [child_index])

        def replace_subtree(tree, path, new_subtree):
            """Replace a subtree at the given path with a new subtree."""
            if not path:  
                return new_subtree
            subtree = tree
            for i in path[:-1]:  
                subtree = subtree[i]
            subtree[path[-1]] = new_subtree
            return tree

        subtree1, path1 = select_random_subtree(tree1)
        subtree2, path2 = select_random_subtree(tree2)

        new_tree1 = copy.deepcopy(tree1)
        new_tree2 = copy.deepcopy(tree2)
        new_tree1 = replace_subtree(new_tree1, path1, subtree2)
        new_tree2 = replace_subtree(new_tree2, path2, subtree1)

        return new_tree1, new_tree2


    def run(self, population_size, offspring_size, num_generations):
        """Run the genetic programming algorithm."""
        assert offspring_size >= population_size

        population = [self.create_tree() for _ in range(population_size)]
        best_tree = None 
        best_score = float('inf')

        def tournament_selection(scored_population, tournament_size):
            """Perform tournament selection."""
            selected = []
            for _ in range(population_size // 2):  
                tournament = random.sample(scored_population, tournament_size)
                winner = min(tournament, key=lambda x: x[1])  
                selected.append(winner[0])
            return selected

        for generation in tqdm.tqdm(range(num_generations)):
            try:
                print(f"Generation {generation + 1}")

                scored_population = [(tree, self.evaluate_tree(tree)) for tree in population]
                scored_population.sort(key=lambda x: x[1])

                selected = tournament_selection(scored_population, 3)
                
                offspring = []
                while len(offspring) < offspring_size:
                    parent1, parent2 = random.sample(selected, 2)
                    child1, child2 = copy.deepcopy(parent1), copy.deepcopy(parent2)
                    child1, child2 = self.crossover_trees(child1, child2)
                    child1 = self.mutate_tree(child1)
                    child2 = self.mutate_tree(child2)
                    if child1 in offspring or child2 in offspring:
                        continue
                    offspring.extend([child1, child2])

                offspring_evaluation = [(tree, self.evaluate_tree(tree)) for tree in offspring]
                sorted_offspring = sorted(offspring_evaluation, key=lambda x: x[1])
                population = selected + [tree for tree, _ in sorted_offspring if tree not in selected]
                population.sort(key=lambda x: self.evaluate_tree(x))
                population = population[:population_size]
                top1score = self.evaluate_tree(population[0])
                if top1score < best_score:
                    best_tree, best_score = population[0], top1score
            except KeyboardInterrupt:
                break

        final_population = [(tree, self.evaluate_tree(tree)) for tree in population]
        final_population.sort(key=lambda x: x[1])
        print(f"Best tree: {best_tree} with score: {best_score}")
        return best_tree, best_score

        

In [10]:
def open_dataset(file_path):
    table = []
    with open(file_path, 'r') as file:
        rows = file.readlines()

    for row in rows:
        values = list(map(float, row.split()))
        table.append(values)

    return table

import warnings
warnings.filterwarnings('ignore')

def run_gp(problem_name):
    train_data = open_dataset("train/" + problem_name + ".txt")
    validate_data = open_dataset("validate/" + problem_name + ".txt")
    gp = GP(train_data)
    best_tree, best_score = gp.run(50, 700, 10)
    print(f"Best tree: {best_tree} with score: {best_score}")
    scores_for_validation = []
    for data in validate_data:
        argument = data[:-1]
        target = data[-1]
        sympy_expr = gp.tree_to_sympy(best_tree)
        f = sp.lambdify([f"x{i}" for i in range(1, gp.max_variables + 1)], sp.sympify(sympy_expr), "numpy")
        prediction = f(*argument)
        scores_for_validation.append(abs(prediction - target))
    print(f"Mean score for validation: {numpy.mean(scores_for_validation)}")

In [13]:
run_gp("easy-I.12.5")

  0%|          | 0/10 [00:00<?, ?it/s]

Generation 1


 10%|█         | 1/10 [03:13<29:02, 193.56s/it]

Generation 2


 20%|██        | 2/10 [04:00<14:20, 107.51s/it]

Generation 3


 30%|███       | 3/10 [06:20<14:15, 122.22s/it]

Generation 4


 40%|████      | 4/10 [09:01<13:44, 137.43s/it]

Generation 5


 50%|█████     | 5/10 [10:42<10:22, 124.52s/it]

Generation 6


 60%|██████    | 6/10 [12:33<07:58, 119.66s/it]

Generation 7


 70%|███████   | 7/10 [15:52<07:16, 145.59s/it]

Generation 8


 80%|████████  | 8/10 [17:21<04:15, 127.59s/it]

Generation 9


 90%|█████████ | 9/10 [19:44<02:12, 132.64s/it]

Generation 10


100%|██████████| 10/10 [24:54<00:00, 149.42s/it]


Best tree: ['+', ['*', 'x1', 'x2'], 'x3'] with score: 1.6801948735754466e-07
Best tree: ['+', ['*', 'x1', 'x2'], 'x3'] with score: 1.6801948735754466e-07
Mean score for validation: 1.5515631046720755e-07


In [4]:
data_test = open_dataset("train/easy-I.12.5.txt")
gp = GP(data_test)
best_tree, best_score = gp.run(50, 700, 10)
#should be [*, x1, x2] and x3 ignored
sympy_expr = gp.tree_to_sympy(best_tree)
f = sp.lambdify([f"x{i}" for i in range(1, len(data_test[0]))], sp.sympify(sympy_expr), "numpy")
print(f(*data_test[0][:-1]), data_test[0][-1])

  0%|          | 0/10 [00:00<?, ?it/s]

Generation 1


 10%|█         | 1/10 [03:33<32:03, 213.68s/it]

Generation 2


 20%|██        | 2/10 [04:55<18:07, 135.93s/it]

Generation 3


 30%|███       | 3/10 [06:10<12:36, 108.13s/it]

Generation 4


 40%|████      | 4/10 [07:38<10:00, 100.13s/it]

Generation 5


 50%|█████     | 5/10 [08:15<06:27, 77.41s/it] 

['+', ['+', ['+', ['max', ['*', 'x2', 'x1'], ['*', 'x2', 'x1']], 'x3'], 'x3'], 'x3']
['+', ['*', 'x2', 'x1'], 'x3']
['+', ['max', ['*', 'x2', 'x1'], ['*', 'x2', 'x1']], 'x3']
['+', ['max', ['*', 'x2', 'x1'], ['*', 'x2', 'x1']], 'x3']
['*', 'x2', 'x1']
['*', 'x2', 'x1']
['*', 'x1', 'x2']
['*', 'x2', 'x1']
['*', 'x2', 'x1']
['*', 'x1', 'x2']
['*', 'x1', 'x2']
['-', ['+', ['max', ['*', 'x2', 'x1'], ['*', 'x2', 'x1']], 'x3'], 'x3']
['-', ['*', 'x1', 'x2'], 'x3']
['-', ['*', 'x2', 'x1'], 'x3']
['-', ['-', ['*', 'x1', 'x2'], 'x3'], 'x3']
['-', ['-', ['*', 'x1', 'x2'], 'x3'], 'x3']
['+', ['*', 'x1', 'x2'], -0.13806016506189245]
['+', ['floor', ['floor', ['*', 'x1', 'x2']]], 0.9420711064528717]
['+', ['floor', ['floor', ['*', 'x1', 'x2']]], 0.9420711064528717]
['-', ['+', ['floor', ['floor', ['*', 'x1', 'x2']]], 0.9420711064528717], 'x3']
['-', ['floor', ['+', ['floor', ['floor', ['*', 'x1', 'x2']]], 0.9420711064528717]], -0.023130017440378126]
['-', ['floor', ['+', ['floor', ['floor', ['*', '

 60%|██████    | 6/10 [11:03<07:13, 108.31s/it]

Generation 7


 70%|███████   | 7/10 [11:33<04:08, 82.79s/it] 

Generation 8


 80%|████████  | 8/10 [11:58<02:08, 64.18s/it]

Generation 9


 90%|█████████ | 9/10 [12:15<00:49, 49.41s/it]

Generation 10


100%|██████████| 10/10 [13:53<00:00, 83.38s/it]


Best tree: ['/', ['+', ['+', ['*', 'x2', ['+', ['+', ['*', 'x1', 'x2'], 'x3'], 'x3']], 'x3'], 'x3'], 'x2'] with score: 1.6801948735230928e-07
-0.09634144647592272 -0.09634145349264145


In [5]:
data_test = open_dataset("train/easy-I.18.12.txt")
gp = GP(data_test)
best_tree, best_score = gp.run(50, 700, 10)
#should be [*, x1, x2, [sin, x4]] and x3 ignored
sympy_expr = gp.tree_to_sympy(best_tree)
f = sp.lambdify([f"x{i}" for i in range(1, len(data_test[0]))], sp.sympify(sympy_expr), "numpy")
print(f(*data_test[0][:-1]), data_test[0][-1])

  0%|          | 0/10 [00:00<?, ?it/s]

Generation 1


 10%|█         | 1/10 [02:25<21:49, 145.47s/it]

Generation 2


 20%|██        | 2/10 [05:31<22:35, 169.42s/it]

Generation 3


 30%|███       | 3/10 [07:11<16:04, 137.75s/it]

Generation 4


 40%|████      | 4/10 [07:47<09:44, 97.45s/it] 

Generation 5


 50%|█████     | 5/10 [09:41<08:38, 103.64s/it]

['ceil', ['*', ['*', ['sin', 'x4'], 'x2'], 'x1']]
['ceil', ['*', ['-', ['*', ['sin', ['sin', 'x4']], 'x1'], 'x3'], 'x2']]
['ceil', ['*', ['-', ['*', ['sin', ['sin', 'x4']], 'x1'], 'x3'], 'x2']]
['*', ['max', ['*', ['sin', ['sin', ['sin', 'x4']]], 'x1'], -3.0834215952820436], 'x2']
['max', ['*', ['-', ['*', ['sin', ['sin', 'x4']], 'x1'], 'x3'], 'x2'], -3.9749103970376742]
['ceil', ['min', ['*', ['*', ['sin', 'x4'], 'x2'], 'x1'], 7.279882934158493]]
['min', ['ceil', ['*', ['-', ['*', ['sin', ['sin', 'x4']], 'x1'], 'x3'], 'x2']], 9.108018525093051]
['min', ['ceil', ['*', ['-', ['*', ['sin', ['sin', 'x4']], 'x1'], 'x3'], 'x2']], 7.382936752647609]
['/', ['*', ['ceil', ['*', ['-', ['*', ['sin', ['sin', 'x4']], 'x1'], 'x3'], 'x2']], 'x1'], 6.632369660367594]
['floor', ['min', ['*', ['sin', ['sin', 'x4']], 'x1'], ['ceil', ['*', ['-', ['*', ['sin', ['sin', 'x4']], 'x1'], 'x3'], 'x2']]]]
['-', ['ceil', ['*', ['-', ['*', ['sin', ['sin', 'x4']], 'x1'], 'x3'], 'x2']], -1.662732713404285]
['+', ['+

 60%|██████    | 6/10 [10:59<06:18, 94.59s/it] 

Generation 7


 70%|███████   | 7/10 [14:32<06:40, 133.61s/it]

Generation 8


 80%|████████  | 8/10 [16:37<04:21, 130.82s/it]

Generation 9


 90%|█████████ | 9/10 [18:19<02:01, 121.66s/it]

Generation 10


100%|██████████| 10/10 [19:10<00:00, 115.09s/it]


Best tree: ['+', ['*', ['*', ['*', ['*', ['sin', 'x4'], 'x2'], 'x1']]], 'x3'] with score: 2.1928592598622905e-06
-0.07179479421010175 -0.07179489731788635


In [14]:
data_test = open_dataset("train/medium-I-12.11.txt")
gp = GP(data_test)
best_tree, best_score = gp.run(50, 700, 10)
#should be [*, [/, 3, 2], [*, x1, x2]]
sympy_expr = gp.tree_to_sympy(best_tree)
f = sp.lambdify([f"x{i}" for i in range(1, len(data_test[0]))], sp.sympify(sympy_expr), "numpy")
print(f(*data_test[0][:-1]), data_test[0][-1])

  0%|          | 0/10 [00:00<?, ?it/s]

Generation 1


 10%|█         | 1/10 [00:31<04:46, 31.88s/it]

Generation 2


 20%|██        | 2/10 [03:55<17:42, 132.86s/it]

Generation 3


 30%|███       | 3/10 [05:28<13:24, 114.87s/it]

Generation 4


 40%|████      | 4/10 [07:32<11:48, 118.14s/it]

Generation 5


 50%|█████     | 5/10 [08:58<08:53, 106.60s/it]

Generation 6


 60%|██████    | 6/10 [09:56<06:00, 90.07s/it] 

Generation 7


 70%|███████   | 7/10 [11:50<04:54, 98.07s/it]

Generation 8


 80%|████████  | 8/10 [13:49<03:29, 104.57s/it]

Generation 9


 90%|█████████ | 9/10 [15:48<01:49, 109.21s/it]

Generation 10


100%|██████████| 10/10 [18:26<00:00, 110.63s/it]


Best tree: ['*', ['ceil', 'x2'], 'x1'] with score: 5.418158217819037
-1.2815875740797322 -1.261507643692644


In [20]:
run_gp("medium-I-12.11")

  0%|          | 0/10 [00:00<?, ?it/s]

Generation 1


 10%|█         | 1/10 [00:13<02:05, 13.94s/it]

Generation 2


 20%|██        | 2/10 [01:06<04:55, 36.92s/it]

Generation 3


 30%|███       | 3/10 [03:11<09:00, 77.16s/it]

Generation 4


 40%|████      | 4/10 [05:22<09:49, 98.31s/it]

Generation 5


 50%|█████     | 5/10 [07:46<09:32, 114.55s/it]

Generation 6


 60%|██████    | 6/10 [08:48<06:27, 96.93s/it] 

Generation 7


 70%|███████   | 7/10 [11:06<05:31, 110.39s/it]

Generation 8


 80%|████████  | 8/10 [14:59<04:58, 149.18s/it]

Generation 9


 90%|█████████ | 9/10 [17:31<02:30, 150.09s/it]

Generation 10


100%|██████████| 10/10 [18:55<00:00, 113.54s/it]


Best tree: ['+', ['*', 'x1', 'x2'], 'x1'] with score: 5.480249074508971
Best tree: ['+', ['*', 'x1', 'x2'], 'x1'] with score: 5.480249074508971
Mean score for validation: 6.97018856450081


In [19]:
run_gp("medium-I-39.10")

  0%|          | 0/10 [00:00<?, ?it/s]

Generation 1


 10%|█         | 1/10 [00:12<01:50, 12.28s/it]

Generation 2


 20%|██        | 2/10 [00:26<01:47, 13.39s/it]

Generation 3


 30%|███       | 3/10 [02:08<06:18, 54.02s/it]

Generation 4


 40%|████      | 4/10 [05:45<11:49, 118.21s/it]

Generation 5


 50%|█████     | 5/10 [07:32<09:30, 114.06s/it]

Generation 6


 60%|██████    | 6/10 [10:28<09:01, 135.38s/it]

Generation 7


 70%|███████   | 7/10 [12:31<06:33, 131.32s/it]

Generation 8


 80%|████████  | 8/10 [14:14<04:04, 122.29s/it]

Generation 9


 90%|█████████ | 9/10 [16:26<02:05, 125.34s/it]

Generation 10


100%|██████████| 10/10 [18:06<00:00, 108.63s/it]


Best tree: ['abs', ['+', ['*', ['-', ['max', ['-', ['floor', ['-', ['ceil', ['min', ['+', ['*', 'x2', 'x1'], 6.307646796365486], 'x1']], 'x2']], 0.7836555580220388], ['floor', ['*', ['ceil', ['abs', ['floor', ['-', ['floor', ['min', ['+', ['*', 'x2', 'x1'], 6.307646796365486], 'x1']], 0.5029912753360737]]]], 1.2713396393142258]]], 2.39924835390317], 1.2713396393142258], -2.126492181146345]] with score: 7.043828512620394
Best tree: ['abs', ['+', ['*', ['-', ['max', ['-', ['floor', ['-', ['ceil', ['min', ['+', ['*', 'x2', 'x1'], 6.307646796365486], 'x1']], 'x2']], 0.7836555580220388], ['floor', ['*', ['ceil', ['abs', ['floor', ['-', ['floor', ['min', ['+', ['*', 'x2', 'x1'], 6.307646796365486], 'x1']], 0.5029912753360737]]]], 1.2713396393142258]]], 2.39924835390317], 1.2713396393142258], -2.126492181146345]] with score: 7.043828512620394
Mean score for validation: 7.096589173671177
