In [1]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from gplearn.genetic import SymbolicRegressor


In [2]:
# Load the regression problem from its .npz archive. np.load returns a lazy
# NpzFile that keeps the underlying file open, so read both arrays inside a
# `with` block to guarantee the handle is released afterwards.
with np.load('data/problem_4.npz') as problem:
    # Transposed so that samples run along the first axis, which is the
    # orientation train_test_split and the estimator consume downstream.
    x = problem['x'].T
    y = problem['y']

# Show the array shapes as a quick sanity check on the loaded data.
x.shape, y.shape

((5000, 2), (5000,))

In [3]:
# Hold out 20% of the samples for validation; the fixed seed keeps the split
# reproducible across runs.
x_train, x_valid, y_train, y_valid = train_test_split(
    x, y, test_size=0.2, random_state=42
)

# Hyperparameters of the genetic-programming search, collected in one mapping
# so they can be inspected or tuned without touching the constructor call.
gp_params = {
    'population_size': 2000,
    'generations': 20,
    'stopping_criteria': 0.01,
    'function_set': ('add', 'sub', 'mul', 'div', 'sqrt', 'log', 'sin', 'cos', 'tan'),
    # The four genetic-operator probabilities below sum to 0.95.
    'p_crossover': 0.7,
    'p_subtree_mutation': 0.1,
    'p_hoist_mutation': 0.05,
    'p_point_mutation': 0.1,
    'max_samples': 0.9,
    'verbose': 1,
    'parsimony_coefficient': 0.01,
    'random_state': 42,
}
est = SymbolicRegressor(**gp_params)

# Evolve the population on the training split (progress is logged because
# verbose=1).
est.fit(x_train, y_train)

# Score-ready predictions for the held-out validation split.
y_pred = est.predict(x_valid)

# Display the expression found by the search.
print("Best formula:", est._program)



    |   Population Average    |             Best Individual              |
---- ------------------------- ------------------------------------------ ----------
 Gen   Length          Fitness   Length          Fitness      OOB Fitness  Time Left
   0    12.57          12.0339       28          3.17364          3.12112      1.29m
   1     6.54          4.36951       21          3.02403          3.03547      1.20m
   2     6.79      6.85289e+06       11          2.67816          2.45827      1.09m
   3     8.22          4.70177       23          2.15173          2.31636     44.94s
   4    10.64          4.09322       18          1.63629          1.64615     52.66s
   5    16.10          3.55388       27          1.01976          1.03818     57.20s
   6    23.21          16924.9       33         0.571169         0.541955     56.20s
   7    29.68          2.57494       59         0.548706         0.526379      1.00m
   8    31.25          2.28648       82         0.517565         0.530979  

In [4]:
# Measure prediction error on the held-out data.
# NOTE(review): the printed label says "Test Set", but y_valid / y_pred are
# the names used here — confirm which split is intended to be reported.
mse = mean_squared_error(y_true=y_valid, y_pred=y_pred)
mse_report = f"Mean Squared Error on Test Set: {mse}"
print(mse_report)

Mean Squared Error on Test Set: 0.12296306427653768
