# Meta Bayes module

This is a short preview of the Meta Bayes module.
Meta Bayes strives to compute the optimal prior for a set of tasks.

It relies on the penalized regression formulation of the inner PAC-Bayesian algorithm:

$$\hat\theta =\arg\inf_\theta \tilde{S}_i(\theta, \theta_0) := \pi(\theta)[S_i] + \lambda \text{KL}(\pi(\theta), \pi(\theta_0))$$ 

Denoting by $A_i(\theta_0)$ the solution of task $i$ obtained with prior $\theta_0$, the meta score can be written as

$$S^{meta}(\theta_0) = \sum_i S_{i}^{meta}(\theta_0), \qquad S_{i}^{meta}(\theta_0) = \tilde{S}_i(A_i(\theta_0), \theta_0).$$

The meta learning algorithm uses gradient descent to minimize the meta_score, relying on 

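$$\nabla S_i^{meta}(\theta_0) = \lambda \nabla F_i(\theta_0)$$
where $F_i(\theta) = \text{KL}(\pi(A_i(\theta_0)), \pi(\theta))$, the posterior $A_i(\theta_0)$ being held fixed when differentiating with respect to $\theta$.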

In [None]:
from surpbayes.meta_bayes import Task, MetaLearningEnv
from surpbayes.proba import GaussianMap, TensorizedGaussianMap, BlockDiagGaussMap
import numpy as np
import numba as nb

In [None]:
# Choose the dimension of the parameter space and the number of tasks to generate
d = 4
n_tasks = 100

# Generate tasks
def make_score(x):
    """Build the quadratic score function centered at ``x``.

    The returned function maps ``xs`` to the squared Euclidean distance
    between ``x`` and ``xs``, summed over the last axis, so it accepts either
    a single point or a batch of points.
    """

    # A numba signature was tried here and left disabled:
    #     @nb.njit(nb.float64[:](nb.float64[:,:]))
    def score(xs):
        offset = x - xs
        squared = offset**2
        return squared.sum(-1)

    return score


# Common center shared by all tasks
x0 = 0.5 + np.random.normal(0, 0.2, d)
# x0[[1, 3]] = x0[[0, 2]]
# Each task gets its own center, drawn around x0
x_middles = x0 + np.random.normal(0, 0.1, (n_tasks, d))

# One Task per center; its score is the squared distance to that center
list_task = [
    Task(make_score(x_mid), temperature=0.1, vectorized=True) for x_mid in x_middles
]

# Even-indexed tasks are used for meta training, odd-indexed tasks are set aside.
# NOTE(review): task_test is not used in the visible part of this notebook; the
# evaluation further down relies on a separately drawn `test_tasks` list.
task_trains = list_task[::2]
task_test = list_task[1::2]

# Define distribution family
proba_map = GaussianMap(d)
# proba_map = BlockDiagGaussMap([[0,1], [2,3]])
# proba_map = TensorizedGaussianMap(d)

# Define Meta Learning Environment (built on the training tasks only)
mlearn = MetaLearningEnv(
    proba_map,
    list_task=task_trains,
    per_step=50,
    chain_length=2,
    kl_max=100.0,
    silent=True,
    n_max_eval=75
)

In [None]:
# Draw 10 fresh task centers around x0 to build an independent set of test tasks.
# NOTE: this overwrites x_middles, which held the centers of the tasks in list_task.
x_middles = x0 + np.random.normal(0, 0.1, (10, d))

test_tasks = [
    Task(make_score(x_mid), temperature=0.1, vectorized=True) for x_mid in x_middles
]

In [None]:
# Run a single meta-learning epoch with the hyperparameters given at construction
mlearn.meta_learn(epochs=1, eta=20.0, kl_max=1.0)
# Override the per_step / chain_length hyperparameters, then run 20 more epochs
mlearn.hyperparams.update({"per_step":10**5, "chain_length":1})
mlearn.meta_learn(epochs=20, eta=20.0, kl_max=1.0)

In [None]:
import shutil

# Start from a clean save directory. On a first run the directory does not
# exist yet, which must not abort the cell before the save happens.
try:
    shutil.rmtree("my_learning_env")
except FileNotFoundError:
    pass
mlearn.save("my_learning_env")

In [None]:
# The original statement was left unfinished (`plt.plot(mlearn.)`, a syntax error)
# and matplotlib had not been imported yet at this point of the notebook.
import matplotlib.pyplot as plt

# Plot the recorded meta scores (same call as used later in this notebook)
plt.plot(mlearn.hist_meta.meta_scores(20))

In [None]:
from typing import Optional

import numpy as np

from surpbayes.bayes import pacbayes_minimize
from surpbayes.meta_bayes.task import Task
from surpbayes.misc import par_eval, blab
from surpbayes.proba import ProbaMap
from surpbayes.types import ProbaParam, ProbaParams

def test_eval_task(
    meta_param: ProbaParam,
    test_task: Task,
    proba_map: ProbaMap,
    n_test: int,
    hyperparams: dict,
):
    """Score a test task when ``meta_param`` is used as prior.

    The task is solved from scratch: ``pacbayes_minimize`` is started at
    ``meta_param`` (used both as prior and as initial posterior) with no
    previous evaluations (``prev_eval=None``). This prevents potential bias
    due to more accurate solutions being found after some iterations.

    Args:
        meta_param: prior parameter being assessed.
        test_task: task to solve; its ``score``, ``temp``, ``vectorized`` and
            ``parallel`` attributes are read.
        proba_map: family of distributions in which the posterior is searched.
        n_test: number of posterior draws used to estimate the mean score.
        hyperparams: extra keyword arguments forwarded to ``pacbayes_minimize``.

    Returns:
        Mean score over ``n_test`` posterior draws, plus the task temperature
        times the KL divergence between the posterior and ``meta_param``.
    """
    score_fun = test_task.score
    temperature = test_task.temp

    post_param = pacbayes_minimize(
        fun=score_fun,
        proba_map=proba_map,
        prior_param=meta_param,
        post_param=meta_param,
        temperature=temperature,
        prev_eval=None,
        vectorized=test_task.vectorized,
        parallel=test_task.parallel,
        **hyperparams,
    ).opti_param

    # Draw from the fitted posterior, then score the draws
    samples = proba_map(post_param)(n_test)
    if test_task.vectorized:
        scores = score_fun(samples)
    else:
        scores = par_eval(score_fun, samples, parallel=test_task.parallel)

    return np.mean(scores) + temperature * proba_map.kl(post_param, meta_param)

def eval_meta_param(
    meta_param: ProbaParam,
    test_tasks: list[Task],
    proba_map: ProbaMap,
    n_test: int,
    hyperparams,
    silent: bool = False,
):
    """Evaluate a single meta_param on every task of ``test_tasks``.

    Each task is scored with ``test_eval_task``; the per-task result is
    reported through ``blab`` (governed by ``silent``).

    Returns:
        1D array holding one performance value per test task, in order.
    """
    # Arguments shared by every per-task evaluation
    common = {
        "meta_param": meta_param,
        "proba_map": proba_map,
        "n_test": n_test,
        "hyperparams": hyperparams,
    }

    perfs = np.zeros(len(test_tasks))
    for i, task in enumerate(test_tasks):
        perf = test_eval_task(test_task=task, **common)
        blab(silent, f"Task {i}: {perf}")
        perfs[i] = perf
    return perfs

def eval_meta_hist(
    meta_params: ProbaParams,
    test_tasks: list[Task],
    proba_map: ProbaMap,
    n_test: int = 100,
    hyperparams: Optional[dict] = None,
    silent: bool = False,
):
    """Evaluate a succession of meta_params on a list of test tasks.

    Args:
        meta_params: sequence of prior parameters to assess.
        test_tasks: tasks on which each prior parameter is evaluated.
        proba_map: family of distributions used to solve the tasks.
        n_test: number of posterior draws used per task evaluation.
        hyperparams: extra keyword arguments for the inner solver. ``None``
            (the default) means no extra arguments.
        silent: whether progress messages are suppressed.

    Returns:
        Array of shape ``(len(meta_params), len(test_tasks))`` whose row ``j``
        holds the per-task performance of ``meta_params[j]``.
    """
    # A fresh dict per call instead of a shared mutable default argument
    if hyperparams is None:
        hyperparams = {}

    accu = np.zeros((len(meta_params), len(test_tasks)))
    for j, meta_param in enumerate(meta_params):
        blab(silent, f"Starting meta_param {j}")
        accu[j] = eval_meta_param(
            meta_param=meta_param,
            test_tasks=test_tasks,
            proba_map=proba_map,
            n_test=n_test,
            hyperparams=hyperparams,
            silent=silent,
        )
    return accu



In [None]:
from surpbayes.meta_bayes.test_assess import eval_meta_hist

In [None]:
# Display the meta parameters recorded in the meta-learning history
mlearn.hist_meta.meta_params()

In [None]:
# Evaluate every other recorded meta parameter on the independent test tasks
res = eval_meta_hist(
    mlearn.hist_meta.meta_params()[::2],
    test_tasks,
    proba_map=proba_map,
    hyperparams={"per_step": 50, "chain_length": 1, "silent": True},
)

In [None]:
import matplotlib.pyplot as plt
# One curve per test task (columns of res), along the evaluated meta parameters
for task_curve in res.T:
    plt.plot(task_curve)

In [None]:
# Run 20 additional meta-learning epochs
mlearn.meta_learn(epochs=20, eta=20.0, kl_max=1.0)

In [None]:
# Launch training (either through meta_learn or meta_learn_batch; meta_learn_batch is more stable).
# The commented lines are alternative settings kept for experimentation.
# mlearn.hyperparams["chain_length"] = 2
# mlearn.hyperparams["per_step"] = 0
# mlearn.converged = False
mlearn.meta_learn_batch(epochs=10, eta=200.0, kl_max=0.4, silent=False, kl_tol=10**-7)
# mlearn.hyperparams["per_step"] = 10
# mlearn.meta_learn_batch(epochs=40, eta=40.0, kl_max=0.5, silent=False, kl_tol=10**-5)

In [None]:
# Build the distribution for the current prior parameter (shown as cell output)
proba_map(mlearn.prior_param)

In [None]:
# Common center around which the task centers were drawn, for comparison with the prior
x0

In [None]:
# Save the environment to disk; overwrite=True presumably replaces a previous save (confirm)
mlearn.save("my_learning_env", overwrite=True)

In [None]:
from surpbayes import load_accu
from surpbayes.meta_bayes.task import load_task

# Reload the first task from the directory written by mlearn.save
task = load_task("my_learning_env/tasks/task_0/")

In [None]:
# Inspect the reloaded task's accumulator.
# NOTE(review): the meaning of `ts()` is not visible in this notebook — confirm in surpbayes.
task.accu_sample_val.ts()

In [None]:
# KL divergence between each of the 20 requested meta parameters and its predecessor
recent_params = mlearn.hist_meta.meta_params(20)
[
    proba_map.kl(newer, older)
    for older, newer in zip(recent_params, recent_params[1:])
]

In [None]:
# `n_filled` of the first training task's accumulator (presumably the number of stored evaluations; confirm)
mlearn.list_task[0].accu_sample_val.n_filled

In [None]:
# `devs` attribute of the distribution built from the current prior parameter
proba_map(mlearn.prior_param).devs

In [None]:
from surpbayes.proba import FactCovGaussianMap

In [None]:
# Show the built-in documentation of FactCovGaussianMap
help(FactCovGaussianMap)

In [None]:
import numpy as np
from surpbayes.proba import FactCovGaussianMap

In [None]:
## Double check implementation of g function
# log_dens(x) = h(x) + T(x).theta - g(theta), hence log_dens(x) - (T(x).theta - g(theta))
# equals h(x) and must be the same whichever distribution of the family is used.
d = 4
# proba_map = TensorizedGaussianMap(d)
# proba_map = BlockDiagGaussMap([[0,2, 1], [ 3]])
proba_map = FactCovGaussianMap(d)

# Three random parameters, drawn one after the other
param0, param1, param2 = (
    np.random.normal(0, 1, proba_map.proba_param_shape) for _ in range(3)
)
proba0, proba1, proba2 = (proba_map(par) for par in (param0, param1, param2))

# Common evaluation points, sampled from the first distribution
xs = proba0(1000)

# Log densities as computed by the distribution objects
ldens_0, ldens_1, ldens_2 = (
    proba.log_dens(xs) for proba in (proba0, proba1, proba2)
)

# Same quantities rebuilt from the T parametrisation, up to h(x)
tpar0, tpar1, tpar2 = (
    proba_map.param_to_T(par) for par in (param0, param1, param2)
)
ts = proba_map.T(xs)
ldens_t_0, ldens_t_1, ldens_t_2 = (
    (ts * tpar).sum(-1) - proba_map.g(tpar) for tpar in (tpar0, tpar1, tpar2)
)

# Both printed values should be close to 0
for ldens_k, ldens_t_k in ((ldens_1, ldens_t_1), (ldens_2, ldens_t_2)):
    print(np.mean(np.abs(ldens_0 - ldens_t_0 - ldens_k + ldens_t_k)))

In [None]:
# Check grad_g against a finite difference of g along random directions.
# proba_map = BlockDiagGaussMap([[0,2, 1], [ 3]])
proba_map = FactCovGaussianMap(4)
epsilon = 10 ** (-7)


def _print_vals_spread(t_param):
    """Print the max/min ratio of ``vals`` for the distribution at T parameter ``t_param``."""
    proba = proba_map(proba_map.T_to_param(t_param))
    print(np.max(proba.vals) / np.min(proba.vals))


accu = np.zeros(100)
for i in range(100):
    param0 = np.random.normal(0, 1, proba_map.proba_param_shape)
    tpar0 = proba_map.param_to_T(param0)

    delta = epsilon * np.random.normal(0, 1, proba_map.t_shape)
    #     delta[:4] = 0.0

    g0 = proba_map.g(tpar0)
    g1 = proba_map.g(tpar0 + delta)

    # Observed variation of g over its first-order prediction: close to 1 when grad_g is right
    res = (g1 - g0) / np.sum(delta * proba_map.grad_g(tpar0))

    if res < 0:
        message = f"Sign problem: {res}"
    elif abs(np.log(res)) > 0.1:
        # Two-sided test: a ratio well below 1 is as suspicious as one well above 1
        message = f"Unstable: {np.log(res)}"
    else:
        message = None

    if message is not None:
        print(message)
        # Diagnostics at the base point and at the perturbed point
        _print_vals_spread(tpar0)
        _print_vals_spread(tpar0 + delta)
        print()

    accu[i] = res

# print(np.max(np.abs(np.log(accu))))

In [None]:
# Hard-coded T parameter with 14 entries.
# NOTE(review): presumably a problematic case captured while debugging — confirm origin.
tpar = np.array(
    [
        -1156.27631014,
        -796.41509995,
        8591.1302817,
        3995.3597264,
        167.82754323,
        80.06845902,
        9251.03475923,
        2002.14959828,
        115.59336995,
        -1245.32377343,
        -579.39681635,
        -857.58412415,
        -399.04037178,
        4302.99413866,
    ]
)

In [None]:
# Convert the hard-coded T parameter back to a distribution parameter and build the distribution
proba_map(proba_map.T_to_param(tpar))

In [None]:
# Use the map the environment was built with: the global `proba_map` has been
# rebound to a FactCovGaussianMap by the g-function checks earlier in the notebook,
# so it no longer matches the parameters stored in mlearn.
plt.plot([mlearn.proba_map(par).devs[1] for par in mlearn.hist_meta.meta_params()])
plt.yscale("log")

In [None]:
# Empirical covariance of the task centers.
# NOTE: when the notebook is run top to bottom, x_middles holds the 10 test-task centers here.
np.cov(x_middles.T)

In [None]:
import matplotlib.pyplot as plt

# Plot the recorded meta scores
plt.plot(mlearn.hist_meta.meta_scores(20))

In [None]:
# Covariance of the learned prior. The environment's own map is used because the
# global `proba_map` has been rebound to a FactCovGaussianMap earlier in the notebook.
mlearn.proba_map(mlearn.prior_param).cov

In [None]:
# Means of the distribution for the first entry of meta_params(1). The environment's own map
# is used because the global `proba_map` has been rebound to a FactCovGaussianMap earlier.
mlearn.proba_map(mlearn.hist_meta.meta_params(1)[0]).means

In [None]:
# Empirical covariance of the task centers, to compare with the prior covariance.
# NOTE: when the notebook is run top to bottom, x_middles holds the 10 test-task centers here.
np.cov(x_middles.T)

In [None]:
# Empirical mean of the task centers, to compare with the prior mean
x_middles.mean(0)

In [None]:
import matplotlib.pyplot as plt

# Relative gap between the first row of the prior parameter and the empirical mean of the centers
print(
    f"Center recovery error: {(mlearn.prior_param[0] - x_middles.mean(0))/ x_middles.mean(0)}"
)

# Trajectories of the first two coordinates of row 0 of the meta parameter
for coord, coord_label in enumerate((r"$\theta_0$", r"$\theta_1$")):
    plt.plot(mlearn.hist_meta.meta_params(1000)[:, 0, coord], label=coord_label)
plt.legend()
plt.title("Evolution of prior mean")
plt.show()

## Covariance case

In [None]:
# Choose the ambient dimension, the dimension of the subspace along which the
# task centers vary, and the number of tasks
d = 4
true_dim = 1
n_tasks = 20

# Generate tasks
def make_score(x):
    """Build the quadratic score function centered at ``x``.

    The returned function maps ``xs`` to the squared Euclidean distance
    between ``x`` and ``xs``, summed over the last axis.
    """

    def score(xs):
        offset = x - xs
        squared = offset**2
        return squared.sum(-1)

    return score


# Task centers lie close to a true_dim-dimensional linear subspace of R^d:
# random coordinates mapped through `matrix`, plus small isotropic noise.
matrix = np.random.normal(0, 1, (true_dim, d))
x_middles = np.random.normal(0, 1.0, (n_tasks, true_dim)) @ matrix + np.random.normal(
    0, 0.01, (n_tasks, d)
)

# One Task per center; its score is the squared distance to that center
list_task = [
    Task(make_score(x_mid), temperature=0.1, vectorized=True) for x_mid in x_middles
]

# Define distribution family
proba_map = GaussianMap(d)

# Define Meta Learning Environment (all tasks are used for training here)
mlearn = MetaLearningEnv(
    proba_map,
    list_task=list_task,
    per_step=25,
    chain_length=1,
    n_estim_weights=50,
    kl_max=100.0,
    silent=True,
)

In [None]:
# Launch training (either through meta_learn or meta_learn_batch. meta_learn_batch is more stable)
# Five successive rounds of 20 batch epochs, each with a smaller eta than the previous one
for eta in (2.0, 1.0, 0.5, 0.25, 0.1):
    mlearn.meta_learn_batch(epochs=20, eta=eta, kl_max=1.0, silent=True, kl_tol=10**-5)

In [None]:
# Plot the recorded meta scores
plt.plot(mlearn.hist_meta.meta_scores(100))

In [None]:
# Distribution built from the learned prior parameter (shown as cell output)
mlearn.proba_map(mlearn.prior_param)

In [None]:
# Empirical mean of the task centers, to compare with the learned prior
x_middles.mean(0)

In [None]:
# Element-wise ratio between the empirical covariance of the task centers
# and the covariance of the learned prior
np.cov(x_middles.T) / mlearn.proba_map(mlearn.prior_param).cov

## Future improvements

### Sample size for the inner task

In the current implementation, the inner algorithm evaluates a fixed number of parameters generated from the current posterior. This might slow down the algorithm significantly: once the space has been thoroughly explored, it is not necessary to evaluate many new points (at least not as many as during the early stages). The number of new points evaluated should instead be chosen depending on how well the current sample explores the posterior.

Along the same lines, the positions of the evaluated samples could be optimized.

### Step size adaptation