# Cycle
This cycle uses mixture experimentalist, BMS theorist, and equation sampler as a source for the ground truth. 

In [50]:
import copy
from dataclasses import dataclass, field
from typing import List

from sklearn.base import BaseEstimator
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline
from sklearn.metrics import mean_absolute_error
import numpy as np
import math
import pandas as pd
from autora.variable import VariableCollection, Variable
from autora.state.bundled import StandardState
from autora.state.delta import on_state
from autora.state.wrapper import state_fn_from_estimator
from autora.theorist.bms import BMSRegressor
from equation_tree import sample 
from equation_tree.tree import instantiate_constants
from equation_tree.prior import DEFAULT_PRIOR_FUNCTIONS, DEFAULT_PRIOR_OPERATORS, \
    structure_prior_from_max_depth
import pprint
from autora.experiment_runner.synthetic.abstract.equation import equation_experiment
from autora.experimentalist.mixture import sample as mixture_sample
from autora.experimentalist.grid_ import grid_pool
from autora.experimentalist.random_ import random_sample
from autora.state.delta import Delta
from autora.experimentalist.falsification import falsification_score_sample
from autora.experimentalist.model_disagreement import model_disagreement_score_sample
from autora.experimentalist.novelty import novelty_score_sample

In [53]:
N_CONDITIONS = 50000
TEMPERATURE = 1.
WEIGHTS = {'falsification':[.1], 'novelty':[.5]}
NUM_SAMPLES = 100

## Ground truth
Sampling the ground truth for this simulation.

In [54]:
structure_prior = structure_prior_from_max_depth(10)
pprint.pprint(structure_prior)
pprint.pprint(DEFAULT_PRIOR_FUNCTIONS)
pprint.pprint(DEFAULT_PRIOR_OPERATORS)
feature_prior = {'constants': .3, 'variables': .7}
prior = {'functions': DEFAULT_PRIOR_FUNCTIONS, 'operators': DEFAULT_PRIOR_OPERATORS, 'structures': structure_prior, 'features': feature_prior}
pprint.pprint(prior)

{'[0, 1, 1, 2, 2, 3, 3, 4, 4, 5]': 0.0007288629737609329,
 '[0, 1, 1, 2, 2, 3, 3, 4, 4]': 0.0007288629737609329,
 '[0, 1, 1, 2, 2, 3, 3, 4, 5, 4]': 0.0007288629737609329,
 '[0, 1, 1, 2, 2, 3, 3, 4, 5, 5]': 0.0007288629737609329,
 '[0, 1, 1, 2, 2, 3, 3, 4, 5, 6]': 0.0007288629737609329,
 '[0, 1, 1, 2, 2, 3, 3, 4, 5]': 0.0007288629737609329,
 '[0, 1, 1, 2, 2, 3, 3, 4]': 0.0007288629737609329,
 '[0, 1, 1, 2, 2, 3, 3]': 0.0007288629737609329,
 '[0, 1, 1, 2, 2, 3, 4, 3, 4, 4]': 0.0007288629737609329,
 '[0, 1, 1, 2, 2, 3, 4, 3, 4, 5]': 0.0007288629737609329,
 '[0, 1, 1, 2, 2, 3, 4, 3, 4]': 0.0007288629737609329,
 '[0, 1, 1, 2, 2, 3, 4, 3]': 0.0007288629737609329,
 '[0, 1, 1, 2, 2, 3, 4, 4, 3, 4]': 0.0007288629737609329,
 '[0, 1, 1, 2, 2, 3, 4, 4, 3]': 0.0007288629737609329,
 '[0, 1, 1, 2, 2, 3, 4, 4, 5, 3]': 0.0007288629737609329,
 '[0, 1, 1, 2, 2, 3, 4, 4, 5, 5]': 0.0007288629737609329,
 '[0, 1, 1, 2, 2, 3, 4, 4, 5, 6]': 0.0007288629737609329,
 '[0, 1, 1, 2, 2, 3, 4, 4, 5]': 0.0007288629737

In [55]:
equation_raw = sample(n=1, prior=prior, max_num_variables=4)
equation_raw[0].sympy_expr

Processing: 100%|██████████| 1/1 [00:00<00:00, 24.41iteration/s]


sin(c_1 + x_2)/c_2

In [56]:
equation = instantiate_constants(equation_raw[0], lambda: np.random.rand()*100)
equation.sympy_expr


0.01550742633323318*sin(x_2 + 18.155000926266617)

In [66]:
features = {'x_1': np.linspace(-10, 10, 100), 'x_2': np.linspace(1, 11, 100), 'x_3': np.linspace(1, 11, 100)}

In [67]:
equation_raw[0]._evaluate(features)
equation_raw[0].has_valid_value

KeyError: 'c_1'

Defining the metadata based on the sampled ground truth.

In [73]:
independent_variables = [
    #Variable("x_1", allowed_values=np.linspace(-10, 10, 15)),
    Variable("x_2", allowed_values=np.linspace(1, 11, 50000)),
    #Variable("x_3", allowed_values=np.linspace(1, 11, 15)),
]
# for v in range(equation.n_variables_unique):
#     # taking a floor depending on n of variables so that each experimental space has roughly the same coarseness
#     independent_variables.append(Variable(f"x_{v+1}",allowed_values=np.linspace(-10, 10, math.floor(N_CONDITIONS**(1/equation.n_variables_unique)))))



variables=VariableCollection(
        independent_variables=independent_variables,
        dependent_variables=[Variable("y")]
    )

Defining experiment runner from the equation and the variable collection

In [104]:
experiment = equation_experiment(equation.sympy_expr, variables.independent_variables, variables.dependent_variables[0], rename_output_columns=False)

### Defining the state
We can define an initial state for our discovery problem based on the variable specification above. Wrapping experiment runner into the state.

In [105]:
@dataclass(frozen=True)
class ExtendedState(StandardState):
    models_bms: List[BaseEstimator] = field(
        default_factory=list,
        metadata={"delta": "extend"},
    )
    models_linear: List[BaseEstimator] = field(
        default_factory=list,
        metadata={"delta": "extend"},
    )
    models_polynom: List[BaseEstimator] = field(
        default_factory=list,
        metadata={"delta": "extend"},
    )

state = ExtendedState(
    variables=variables
)
runner_on_state = on_state(experiment.experiment_runner, output=["experiment_data"])

### Pooler

In [106]:
experimentalist_pooler = on_state(grid_pool, output=["conditions"])

In [107]:
state1 = experimentalist_pooler(state)
print(type(state1.conditions))
runner_on_state(state1)

<class 'pandas.core.frame.DataFrame'>


ExtendedState(variables=VariableCollection(independent_variables=[Variable(name='x_2', value_range=None, allowed_values=array([ 1.        ,  1.0002    ,  1.00040001, ..., 10.99959999,
       10.9998    , 11.        ]), units='', type=<ValueType.REAL: 'real'>, variable_label='', rescale=1, is_covariate=False)], dependent_variables=[Variable(name='y', value_range=None, allowed_values=None, units='', type=<ValueType.REAL: 'real'>, variable_label='', rescale=1, is_covariate=False)], covariates=[]), conditions=           x_2
0       1.0000
1       1.0002
2       1.0004
3       1.0006
4       1.0008
...        ...
49995  10.9992
49996  10.9994
49997  10.9996
49998  10.9998
49999  11.0000

[50000 rows x 1 columns], experiment_data=           x_2         y
0       1.0000  0.004968
1       1.0002  0.003626
2       1.0004  0.005420
3       1.0006  0.005613
4       1.0008  0.002724
...        ...       ...
49995  10.9992 -0.011667
49996  10.9994 -0.010129
49997  10.9996 -0.011556
49998  10.9998 -

## Mixture experimentalist
Defining the mixture experimentalist and wrapping it into the state

In [108]:
@on_state()
def experimentalist_sample(conditions, models, experiment_data, variables, temperature, weights, num_samples):
    if not models or not experiment_data:
        conditions_ = random_sample(conditions, num_samples)
    else:
        experiment_conditions = experiment_data[[v.name for v in variables.independent_variables]]
        experiment_observations = experiment_data[[v.name for v in variables.dependent_variables]]
        params_ = {} #copy.deepcopy(params)
        params_["falsification"] = {"reference_conditions": experiment_conditions, "reference_observations": experiment_observations, "model": models[-1]}
        params_["novelty"] = {"reference_conditions": experiment_conditions}

        samplers = [
            [novelty_score_sample, "novelty", weights["novelty"]],
            [falsification_score_sample, "falsification", weights["falsification"]]
        ],

        conditions_ = mixture_sample(conditions, temperature, samplers, params_, num_samples)
        conditions_ = conditions_.drop("score", axis = 1)
    
    d = Delta(conditions = conditions_)
    return d

## BMS theorist
Defining the BMS theorist and wrapping it into the state

In [109]:
@on_state()
def bms_theorist(experiment_data: pd.DataFrame, variables: VariableCollection, **kwargs):
    ivs = [v.name for v in variables.independent_variables]
    dvs = [v.name for v in variables.dependent_variables]
    X, y = experiment_data[ivs], experiment_data[dvs]
    new_model = BMSRegressor(epochs=1000).set_params(**kwargs).fit(X, y)
    return Delta(models_bms=new_model)

@on_state()
def linear_theorist(experiment_data: pd.DataFrame, variables: VariableCollection, **kwargs):
    ivs = [v.name for v in variables.independent_variables]
    dvs = [v.name for v in variables.dependent_variables]
    X, y = experiment_data[ivs], experiment_data[dvs]
    new_model = LinearRegression().set_params(**kwargs).fit(X, y)
    return Delta(models_linear=new_model)


def PolynomialRegression(degree=3, **kwargs):
    return make_pipeline(PolynomialFeatures(degree), LinearRegression(**kwargs))


@on_state()
def polynomial_theorist(experiment_data: pd.DataFrame, variables: VariableCollection, **kwargs):
    ivs = [v.name for v in variables.independent_variables]
    dvs = [v.name for v in variables.dependent_variables]
    X, y = experiment_data[ivs], experiment_data[dvs]
    new_model = PolynomialRegression()
    new_model.fit(X, y)
    return Delta(models_polynom=new_model)


In [110]:
@on_state()
def best_model(models_bms, models_linear, models_polynom, experiment_data, variables):
    ivs = [v.name for v in variables.independent_variables]
    dvs = [v.name for v in variables.dependent_variables]
    X, y = experiment_data[ivs], experiment_data[dvs]
    prediction_bms = models_bms[-1].predict(X)
    prediction_linear = models_linear[-1].predict(X)
    prediction_polynomial = models_polynom[-1].predict(X)
    mad_bms = mean_absolute_error(y, prediction_bms)
    mad_linear = mean_absolute_error(y, prediction_linear)
    mad_poly = mean_absolute_error(y, prediction_polynomial)
    if mad_bms <= mad_linear and mad_bms <= mad_poly:
        new_model = models_bms[-1]
    elif mad_linear <= mad_bms and mad_linear <= mad_poly:
        new_model = models_linear[-1]
    elif mad_poly <= mad_linear and mad_poly <= mad_bms:
        new_model = models_polynom[-1]

    return Delta(model=new_model)


In [111]:
def cycle(s):
    s_pool = experimentalist_pooler(s)
    s_conditions = experimentalist_sample(s_pool, temperature=TEMPERATURE, weights=WEIGHTS, num_samples=NUM_SAMPLES)
    print(type(s_conditions.conditions))
    s_run = runner_on_state(s_conditions)
    print(s_run)
    # s_theory = bms_theorist(s_run)
    # s_theory = linear_theorist(s_theory)
    # s_theory = polynomial_theorist(s_theory)
    # s_best = best_model(s_theory)
    #return s_best


In [112]:
state_1 = cycle(state)


<class 'pandas.core.frame.DataFrame'>
ExtendedState(variables=VariableCollection(independent_variables=[Variable(name='x_2', value_range=None, allowed_values=array([ 1.        ,  1.0002    ,  1.00040001, ..., 10.99959999,
       10.9998    , 11.        ]), units='', type=<ValueType.REAL: 'real'>, variable_label='', rescale=1, is_covariate=False)], dependent_variables=[Variable(name='y', value_range=None, allowed_values=None, units='', type=<ValueType.REAL: 'real'>, variable_label='', rescale=1, is_covariate=False)], covariates=[]), conditions=             x_2
7302    2.460429
29701   6.940319
41252   9.250565
27276   6.455309
11462   3.292446
...          ...
16071   4.214264
47298  10.459789
1936    1.387208
10514   3.102842
20629   5.125883

[100 rows x 1 columns], experiment_data=       x_2         y
7302   NaN  0.015983
29701  NaN -0.002424
41252  NaN  0.011601
27276  NaN -0.008829
11462  NaN  0.010886
...    ...       ...
16071  NaN -0.005628
47298  NaN -0.006075
1936   NaN  0.008