# Cycle
This cycle uses mixture experimentalist, BMS theorist, and equation sampler as a source for the ground truth. 

In [161]:
import copy
from dataclasses import dataclass, field
from typing import List

from sklearn.base import BaseEstimator
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline
from sklearn.metrics import mean_absolute_error
import numpy as np
import math
import pandas as pd
from autora.variable import VariableCollection, Variable
from autora.state.standard import StandardState
from autora.state import on_state
from autora.state.wrapper import state_fn_from_estimator
from autora.theorist.bms import BMSRegressor
from equation_tree import sample 
from equation_tree.tree import instantiate_constants
from equation_tree.prior import DEFAULT_PRIOR_FUNCTIONS, DEFAULT_PRIOR_OPERATORS, \
    structure_prior_from_max_depth
import pprint
from autora.experiment_runner.synthetic.abstract.equation import equation_experiment
from autora.experimentalist.mixture import sample as mixture_sample
from autora.experimentalist.grid_ import grid_pool
from autora.experimentalist.random_ import random_sample, random_pool
from autora.state import Delta
from autora.experimentalist.falsification import falsification_score_sample
from autora.experimentalist.model_disagreement import model_disagreement_score_sample
from autora.experimentalist.novelty import novelty_score_sample

In [185]:
# SAMPLING
N_CONDITIONS = 50000
TEMPERATURE = 1.
WEIGHTS = {'falsification':[.1, .1], 'novelty':[.5, .5]}
NUM_SAMPLES = 100
POOL_RANGE = 10

# EQUATION
MAX_TREE_DEPTH = 6
MAX_NUM_VARIABLES = 4
NUM_POOL_SAMPLES = 10_000
CONSTANT_SIZE = 5

## Ground truth
Sampling the ground truth for this simulation.

In [186]:
structure_prior = structure_prior_from_max_depth(MAX_TREE_DEPTH)
pprint.pprint(structure_prior)
pprint.pprint(DEFAULT_PRIOR_FUNCTIONS)
pprint.pprint(DEFAULT_PRIOR_OPERATORS)
feature_prior = {'constants': .3, 'variables': .7}
prior = {'functions': DEFAULT_PRIOR_FUNCTIONS, 'operators': DEFAULT_PRIOR_OPERATORS, 'structures': structure_prior, 'features': feature_prior}

In [187]:
equation_raw = sample(n=1, prior=prior, max_num_variables=MAX_NUM_VARIABLES)
equation_raw[0].sympy_expr

TypeError: sample() got an unexpected keyword argument 'n'

In [188]:
equation = instantiate_constants(equation_raw[0], lambda: np.random.rand()*CONSTANT_SIZE)
equation.sympy_expr


log(x_2)*tan(x_1)

Defining the metadata based on the sampled ground truth.

In [189]:
independent_variables = []
for v in range(equation.n_variables_unique):
    independent_variables.append(Variable(equation.variables_unique[v],value_range=(-POOL_RANGE, POOL_RANGE)))

variables=VariableCollection(
        independent_variables=independent_variables,
        dependent_variables=[Variable("y")]
    )

Defining experiment runner from the equation and the variable collection

In [190]:
experiment = equation_experiment(equation.sympy_expr, variables.independent_variables, variables.dependent_variables[0], rename_output_columns=False)

### Defining the state
We can define an initial state for our discovery problem based on the variable specification above. Wrapping experiment runner into the state.

In [191]:
@dataclass(frozen=True)
class ExtendedState(StandardState):
    models_bms: List[BaseEstimator] = field(
        default_factory=list,
        metadata={"delta": "extend"},
    )
    models_linear: List[BaseEstimator] = field(
        default_factory=list,
        metadata={"delta": "extend"},
    )
    models_polynom: List[BaseEstimator] = field(
        default_factory=list,
        metadata={"delta": "extend"},
    )
    rejections: List[int] = field(
        default_factory=list,
        metadata={"delta": "extend"},
    )

state = ExtendedState(
    variables=variables
)
runner_on_state = on_state(experiment.experiment_runner, output=["experiment_data"])

### Pooler

In [15]:
@on_state()
def experimentalist_pooler(variables, equation):
    conditions_ = pd.DataFrame(columns=[v.name for v in variables.independent_variables])
    i = 0
    while i < 1_000_000 and len(conditions_.index) < NUM_POOL_SAMPLES:
        _sample = random_pool(variables, NUM_POOL_SAMPLES)
        evaluation = equation.evaluate(_sample)
        bad_indices = np.where(np.isnan(evaluation) | np.isinf(evaluation))[0]
        df_cleaned = _sample.drop(bad_indices)
        if np.isnan(evaluation).any() or np.isinf(evaluation).any():
            i+=len(bad_indices)
        conditions_.append(_sample, ignore_index=True)
    if i >= 1_000_000:
        return None
    conditions_ = conditions_.head(NUM_POOL_SAMPLES)
    return Delta(conditions=conditions_, rejections=i)

## Mixture experimentalist
Defining the mixture experimentalist and wrapping it into the state

In [None]:
"""
Mixture Experimentalist Sampler
"""

import numpy as np
from typing import Optional, Union

import pandas as pd


def adjust_distribution(p_, temperature):
    # temperature cannot be 0
    assert temperature != 0, 'Temperature cannot be 0'
    p = np.array(p_)
    # If the temperature is very low (close to 0), then the sampling will become almost deterministic, picking the event with the highest probability.
    # If the temperature is very high, then the sampling will be closer to uniform, with all events having roughly equal probability.

    p = p / np.sum(np.abs(p))  # Normalizing the initial distribution

    p = np.exp(p / temperature)
    final_p = p / np.sum(p)  # Normalizing the final distribution
    return final_p


def sample(conditions: Union[pd.DataFrame, np.ndarray], temperature: float,
                   samplers: list, params: dict,
                   num_samples: Optional[int] = None) -> pd.DataFrame:
    """

    Args:
        conditions: pool of experimental conditions to evaluate: pd.Dataframe
        temperature: how random is selection of conditions (cannot be 0; (0:1) - the choices are more deterministic than the choices made wrt
        samplers: tuple containing sampler functions, their names, and weights
        for sampler functions that return both positive and negative scores, user can provide a list with two weights: the first one will be applied to positive scores, the second one -- to the negative
        params: nested dictionary. keys correspond to the sampler function names (same as provided in samplers),
        values correspond to the dictionaries of function arguments (argument name: its value)
        num_samples: number of experimental conditions to select

    Returns:
        Sampled pool of experimental conditions with the scores attached to them
    """

    condition_pool = pd.DataFrame(conditions)

    rankings = pd.DataFrame()
    mixture_scores = np.zeros(len(condition_pool))
    ## getting rankings and weighted scores from each function
    for (function, name, weight) in samplers:
        try:
            sampler_params = params[name]
            pd_ranking = function(conditions=condition_pool, **sampler_params)
        except:
            pd_ranking = function(conditions=condition_pool)
        # sorting by index
        pd_ranking = pd_ranking.sort_index()
        # if only one weight is provided, use it for both negative and positive dimensions
        if isinstance(weight, float) or isinstance(weight, int):
            pd_ranking["score"] = pd_ranking["score"] * weight
        else:
            if len(pd_ranking["score"] < 0) > 0 and len(pd_ranking["score"] > 0) > 0:  # there are both positive and negative values

                pd_ranking.loc[pd_ranking["score"] > 0]["score"] = pd_ranking.loc[pd_ranking["score"] > 0]["score"] * weight[0]  # positive dimension gets the first weight
                pd_ranking.loc[pd_ranking["score"] < 0]["score"] = pd_ranking.loc[pd_ranking["score"] < 0]["score"] * weight[1]  # negative dimension gets the second weight
            else:
                pd_ranking["score"] = pd_ranking["score"] * weight[0]

        pd_ranking.rename(columns={"score": f"{name}_score"}, inplace=True)
        # sum_scores are arranged based on the original conditions_ indices
        mixture_scores = mixture_scores + pd_ranking[f"{name}_score"]

        rankings = pd.merge(rankings, pd_ranking, left_index=True, right_index=True, how="outer")

    # adjust mixture scores wrt temperature
    weighted_mixture_scores_adjusted = adjust_distribution(mixture_scores, temperature)
    print(weighted_mixture_scores_adjusted)

    if num_samples is None:
        num_samples = condition_pool.shape[0]

    condition_indices = np.random.choice(np.arange(len(condition_pool)), num_samples,
                                         p=weighted_mixture_scores_adjusted, replace=False)
    conditions_ = condition_pool.iloc[condition_indices]
    conditions_["score"] = mixture_scores

    return conditions_


mixture_sample_test = sample

In [None]:
@on_state()
def experimentalist_sample(conditions, models, experiment_data, variables, temperature, weights, num_samples):
    print(models)
    print(experiment_data)
    if models is None or experiment_data is None:
        print('First cycle: Using random sampler')
        conditions_ = random_sample(conditions, num_samples)
    else:
        experiment_conditions = experiment_data[[v.name for v in variables.independent_variables]]
        experiment_observations = experiment_data[[v.name for v in variables.dependent_variables]]
        params_ = {} #copy.deepcopy(params)
        params_["falsification"] = {"reference_conditions": experiment_conditions, "reference_observations": experiment_observations, "model": models[-1]}
        params_["novelty"] = {"reference_conditions": experiment_conditions}


        samplers = [
            [novelty_score_sample, "novelty", weights["novelty"]],
            [falsification_score_sample, "falsification", weights["falsification"]]
        ]
        print(samplers)


        conditions_ = mixture_sample_test(conditions, temperature, samplers, params_, num_samples)
        conditions_ = conditions_.drop("score", axis = 1)
    #d = Delta(conditions=conditions)
    d = Delta(conditions = conditions_)
    return d

## BMS theorist
Defining the BMS theorist and wrapping it into the state

In [None]:
@on_state()
def bms_theorist(experiment_data: pd.DataFrame, variables: VariableCollection, **kwargs):
    ivs = [v.name for v in variables.independent_variables]
    dvs = [v.name for v in variables.dependent_variables]
    X, y = experiment_data[ivs], experiment_data[dvs]
    new_model = BMSRegressor(epochs=10).set_params(**kwargs).fit(X, y)
    return Delta(models_bms=[new_model])

@on_state()
def linear_theorist(experiment_data: pd.DataFrame, variables: VariableCollection, **kwargs):
    ivs = [v.name for v in variables.independent_variables]
    dvs = [v.name for v in variables.dependent_variables]
    X, y = experiment_data[ivs], experiment_data[dvs]
    new_model = LinearRegression().set_params(**kwargs).fit(X, y)
    return Delta(models_linear=[new_model])


def PolynomialRegression(degree=3, **kwargs):
    return make_pipeline(PolynomialFeatures(degree), LinearRegression(**kwargs))


@on_state()
def polynomial_theorist(experiment_data: pd.DataFrame, variables: VariableCollection, **kwargs):
    ivs = [v.name for v in variables.independent_variables]
    dvs = [v.name for v in variables.dependent_variables]
    X, y = experiment_data[ivs], experiment_data[dvs]
    new_model = PolynomialRegression()
    new_model.fit(X, y)
    return Delta(models_polynom=[new_model])


In [181]:
@on_state()
def best_model(models_bms, models_linear, models_polynom, experiment_data, variables):
    ivs = [v.name for v in variables.independent_variables]
    dvs = [v.name for v in variables.dependent_variables]
    X, y = experiment_data[ivs], experiment_data[dvs]
    prediction_bms = models_bms[-1].predict(X)
    prediction_linear = models_linear[-1].predict(X)
    prediction_polynomial = models_polynom[-1].predict(X)
    mad_bms = mean_absolute_error(y, prediction_bms)
    mad_linear = mean_absolute_error(y, prediction_linear)
    mad_poly = mean_absolute_error(y, prediction_polynomial)
    if mad_bms <= mad_linear and mad_bms <= mad_poly:
        new_model = models_bms[-1]
    elif mad_linear <= mad_bms and mad_linear <= mad_poly:
        new_model = models_linear[-1]
    elif mad_poly <= mad_linear and mad_poly <= mad_bms:
        new_model = models_polynom[-1]

    return Delta(model=new_model)


In [182]:
def cycle(s):
    s_pool = experimentalist_pooler(s, equation=equation)
    print(s_pool)
    # s_conditions = experimentalist_sample(s_pool, temperature=TEMPERATURE, weights=WEIGHTS, num_samples=NUM_SAMPLES)
    # s_run = runner_on_state(s_conditions)
    # s_theory = bms_theorist(s_run)
    # s_theory = linear_theorist(s_theory)
    # s_theory = polynomial_theorist(s_theory)
    # s_best = best_model(s_theory)
    #return s_best


In [183]:
state = ExtendedState(variables=variables)
for _ in range(10):
    state = cycle(state)

print(state)


INFO:autora.theorist.bms.regressor:BMS fitting started


First cycle: Using random sampler


  3%|▎         | 104/3000 [00:02<01:06, 43.61it/s]


KeyboardInterrupt: 