## Bayes module

### Variational inference

Given a score $S$, a parametric family of distributions $(\nu_{\theta})_{\theta \in \Theta}$ and a prior distribution $\pi$, we consider the variational problem

$$\hat{\theta} = \arg\inf_{\theta \in \Theta} \, \nu_{\theta}[S] + \lambda \, \mathrm{KL}(\nu_{\theta}, \pi).$$

The function variational_inference is designed to tackle such problems in the setting where $\pi =  \nu_{\theta_0}$. This is in order to benefit from potential closed form expressions when computing the Kullback--Leibler divergence and its derivative.

# VarBUQ algorithm

The VarBUQ algorithm relies on the surpbayes.proba submodule. The current form of the algorithm requires the family of distributions of interest to be an exponential family (classes "surpbayes.proba.PreExpFamily" and "surpbayes.proba.ExponentialFamily"). A slightly modified version is used in the case of Gaussian distributions (inherited from "surpbayes.proba.PreExpFamily").

To demonstrate VarBUQ, a one dimensional problem is considered. However, VarBUQ can be used for problems of any dimension. First, we define a tailor-made error function. For maximum efficiency, this function is vectorized.

In [None]:
import numpy as np
import matplotlib.pyplot as plt

from surpbayes.bayes import iter_prior, variational_inference, iter_prior_vi

from surpbayes.proba import GaussianMap, TensorizedGaussianMap, BlockDiagGaussMap

xs = np.linspace(-10, 10, 10**3)


def fun(x):
    """Return ``(tan(sqrt(1 + arctan(x - 1.57)**2) - 1) - 3)**2``.

    Only numpy ufuncs are used, so the function works elementwise on arrays.
    """
    angle = np.arctan(x - 1.57)
    radius = np.sqrt(1 + angle**2) - 1.0
    return (np.tan(radius) - 3) ** 2


def gun(x):
    """Return ``tanh(x)`` rescaled from (-1, 1) to (0, 1), elementwise."""
    return 0.5 * (1 + np.tanh(x))


def S(x):
    """Combine ``fun`` and ``gun`` on the rescaled input ``0.7 * (x - 0.4)``, then add 3."""
    u = 0.7 * (x - 0.4)
    return (2 - fun(u)) * gun(-0.2 - u) + 3


def score(x):
    """Vectorized score: one value per row of ``x``.

    Only the first column of ``x`` is read; the result is
    ``(S(x[:, 0] + 3) + 0.25) / 3.25``.
    """
    first_coord = x[:, 0]
    return (S(first_coord + 3) + 0.25) / 3.25


# Plot the score over the grid `xs`. `score` indexes `x[:, 0]`, hence the
# trailing axis added to the 1-D grid.
plt.plot(xs, score(xs[:, np.newaxis]), label="Score fun.")
plt.xlabel("param")
plt.ylabel("Score")
plt.legend()
plt.show()

In [None]:
ys = score(xs[:, np.newaxis])

np.min(ys)

In [None]:
from surpbayes.bayes import variational_inference, OptimResultVI
from surpbayes.proba import GaussianMap
from time import time

# One-dimensional Gaussian family used as the variational family.
gmap = GaussianMap(1)

tic = time()
# 40 "score_approx" runs with identical settings; every result is kept so the
# run-to-run spread of the objective can be plotted later.
opt_res_news = [
    variational_inference(
    score,
    gmap,
    optimizer="score_approx",
    temperature=0.015,  # the lambda term in the variational inference problem
    per_step=4,
    parallel=False,
    vectorized=True,
    print_rec=2,
    chain_length=101,
    n_estim_weights=4 * 10**4,
    kl_max=0.25,
    m_max=15,
    xtol=10**-12,
    kltol=10**-12,
    dampen=0.0,
    silent=True)
 for _ in range(40)]
tac = time()
# Wall-clock time of the whole batch of runs.
print("Time elapsed:", tac - tic)

In [None]:
# Value passed as both `per_step` and `k` to the "corr_weights" runs below.
grad_per_step = 4

# 50 "corr_weights" runs with identical settings, using the reference
# parameter of `gmap` as prior.
opt_res_grads = [
    variational_inference(
        score,
        gmap,
        prior_param=gmap.ref_param,
        temperature=0.015,  # the lambda term in the variational inference problem
        per_step=grad_per_step,
        optimizer="corr_weights",
        gen_decay=100,
        k=grad_per_step,
        parallel=False,
        vectorized=True,
        print_rec=100,
        chain_length=101,
        refuse_conf=1.1,
        momentum=0.9,
        eta=2.0,
        silent=True,
    ) 
for _ in range(50)]

In [None]:
def clean_mscore(fun, prob, n=10**6):
    """Monte Carlo mean of ``fun`` over the ``n`` draws returned by ``prob(n)``.

    ``prob`` is called as ``prob(n)`` to obtain the draws; ``fun`` is applied
    once to the whole batch and the mean of its output is returned.
    """
    draws = prob(n)
    values = fun(draws)
    return values.mean()


In [None]:
def prep_plot(fun, pmap, opt_res: OptimResultVI, per_step: int, temp: float, p_param=None, n=10**4):
    """Re-evaluate the variational objective along an optimisation history.

    For every parameter in ``opt_res.hist_param`` the objective is
    ``clean_mscore(fun, pmap(param), n) + temp * pmap.kl(param, p_param)``.
    ``p_param`` defaults to ``pmap.ref_param``.

    Returns a pair ``(x, objective)`` where ``x`` is the step index multiplied
    by ``per_step``.
    """
    reference = pmap.ref_param if p_param is None else p_param
    history = opt_res.hist_param
    n_steps = len(history)

    objective = np.zeros(n_steps)
    for step, param in enumerate(history):
        mean_score = clean_mscore(fun, pmap(param), n)
        penalty = temp * pmap.kl(param, reference)
        objective[step] = mean_score + penalty

    return per_step * np.arange(n_steps), objective


In [None]:
# Objective curve (x = step index * per_step, y = objective) of every
# "corr_weights" run.
data_grad = [
    prep_plot(score, gmap, run, per_step=grad_per_step, temp=0.015)
    for run in opt_res_grads
]

# The x grid of the first run is used for all runs when plotting.
xs_grad = data_grad[0][0]
ys_grad = np.array([curve[1] for curve in data_grad])

# Pointwise 20% / 80% quantiles across runs.
min_grad, max_grad = np.quantile(ys_grad, [0.2, 0.8], axis=0)

In [None]:
# Objective curve (x = step index * per_step, y = objective) of every
# "score_approx" run.
data_new = [
    prep_plot(score, gmap, run, per_step=4, temp=0.015)
    for run in opt_res_news
]

# The x grid of the first run is used for all runs when plotting.
xs_new = data_new[0][0]
ys_new = np.array([curve[1] for curve in data_new])

# Pointwise 20% / 80% quantiles across runs.
min_new, max_new = np.quantile(ys_new, [0.2, 0.8], axis=0)

In [None]:
# Recompute the band of the "corr_weights" runs with 10% / 90% quantiles.
min_grad, max_grad = np.quantile(ys_grad, [0.1, 0.9], axis=0)

In [None]:
# One thin line per run: "corr_weights" runs in blue, "score_approx" runs in
# orange. The quantile bands are computed above but their plotting is disabled.
for ys in ys_grad:
    plt.plot(xs_grad, ys, linewidth=0.2, color="tab:blue")
# plt.fill_between(xs_grad, min_grad, max_grad)
for ys in ys_new:
# plt.fill_between(xs_new, min_new, max_new)
    plt.plot(xs_new, ys, linewidth=0.2, color="tab:orange")

plt.xlabel("Number of risk evaluations")
plt.ylabel("Optimisation objective")

In [None]:
# Objective curves recomputed run by run. Colours follow the preceding figure:
# "score_approx" runs in orange, "corr_weights" runs in blue.
# NOTE: the loop variables `opt_res_new` / `opt_res_grad` stay bound to the
# last run of each batch after this cell and are read by later cells.
for opt_res_new in opt_res_news:
    plt.plot(
        *prep_plot(score, gmap, opt_res_new, per_step=4, temp=0.015),
        color="tab:orange",
        linewidth=0.5,
    )
for opt_res_grad in opt_res_grads:
    # The gradient runs were produced with `per_step=grad_per_step`, so the
    # same variable scales their x axis instead of a hard-coded copy.
    plt.plot(
        *prep_plot(score, gmap, opt_res_grad, per_step=grad_per_step, temp=0.015),
        color="tab:blue",
        linewidth=0.5,
    )

In [None]:
# Score history recorded by the optimiser for one run of each batch.
# `opt_res_new` / `opt_res_grad` are whatever these names are currently bound
# to (the loop variables of the preceding cell when run top-to-bottom).
n_steps = len(opt_res_new.hist_score)
per_step = 4

plt.plot(per_step * np.arange(n_steps), opt_res_new.hist_score)

n_steps = len(opt_res_grad.hist_score)
# The "corr_weights" batch uses `grad_per_step` score evaluations per step;
# the x axis is scaled accordingly (a hard-coded 32 was used previously).
plt.plot(grad_per_step * np.arange(n_steps), opt_res_grad.hist_score)

In [None]:
from surpbayes.bayes import variational_inference
from surpbayes.proba import GaussianMap

from time import time

# One-dimensional Gaussian family used as the variational family.
gmap = GaussianMap(1)

tic = time()
# Single "score_approx" run with 160 score evaluations per step; `silent` is
# not passed, so the library default applies.
opt_res_new = variational_inference(
    score,
    gmap,
    optimizer="score_approx",
    temperature=0.015,  # the lambda term in the variational inference problem
    per_step=160,
    parallel=False,
    vectorized=True,
    print_rec=2,
    chain_length=51,
    n_estim_weights=10**5,
    kl_max=0.1,
    m_max=20,
    xtol=10**-6,
    kltol=10**-6,
    dampen=0.7,
)
tac = time()
print("Time elapsed:", tac - tic)

In [None]:
# Single verbose "corr_weights" run for comparison. `gen_decay`, `k` and
# `momentum` are commented out, so the library defaults apply for them.
opt_res_grad = variational_inference(
    score,
    gmap,
    temperature=0.015,  
    per_step=8,
    optimizer="corr_weights",
    # gen_decay=np.log(1.2),
    # k=8 * 20,
    parallel=False,
    vectorized=True,
    print_rec=2,
    chain_length=101,
    refuse_conf=1.0,
    # momentum=0.9,
    eta=1.0,
    silent=False,
)

The algorithm converged in less than 20 steps.
The evolution of the approximation of the posterior can be easily represented in this 1D setting by plotting the densities.

In [None]:
xs = np.linspace(-3.5, 3.9, 2000)


def repr_gauss(param):
    """Return the module-level grid ``xs`` and the density of ``gmap(param)`` on it.

    The grid is passed to ``dens`` as a column (one point per row).
    """
    column = xs[:, np.newaxis]
    return xs, gmap(param).dens(column)


# First entry of the parameter history, labelled as the prior.
plt.plot(*repr_gauss(opt_res_new.hist_param[0]), linewidth=1.0, label="prior")
for i, param in enumerate(opt_res_new.hist_param[:]):
    # `i % 1 == 0` is always true, so every step is drawn; raise the modulus
    # to thin out the curves.
    if i % 1 == 0:

        # Rebinds the module-level names `xs` (same grid) and `ys`.
        xs, ys = repr_gauss(param)
        plt.plot(xs, ys, color="black", linewidth=0.2)

# Last entry of the parameter history, labelled as the posterior.
plt.plot(*repr_gauss(opt_res_new.hist_param[-1]), linewidth=1.0, label="posterior")
plt.title("Evolution of the posterior estimation")
plt.legend()
plt.show()

Let us see what happens in detail during this training phase.

In [None]:
from surpbayes.bayes.score_approx.weighing import get_weights_mc
from surpbayes.bayes.score_approx.score_approx_solver import exp_approximation
from scipy.stats import norm

# Fix numpy's global seed before drawing the sample below.
np.random.seed(0)

# Settings of this hand-made optimisation step; `temp` and `dampen` are only
# read in a later cell of the walkthrough.
temp = 0.02
dampen = 0.975
n_sample = 40

xs = np.linspace(-3.5, 3.5, 1000)

# The prior is the distribution at the reference parameter of `gmap`.
prior_param = gmap.ref_param
prior = gmap(prior_param)
# scipy counterpart of the prior, used only to draw its density.
prior_norm = norm(loc=prior.means[0], scale=np.sqrt(prior.cov[0, 0]))

# Draw `n_sample` points from the prior and evaluate the score on them.
sample = prior(n_sample)
score_sample = score(sample)

# Plotting
# Two y axes share the x axis: ax1 for the score, ax2 for the density.
fig, ax1 = plt.subplots()
ax2 = ax1.twinx()

ax2.set_ylabel("Density")
ax1.set_ylabel("Score")

ax2.fill_between(xs, prior_norm.pdf(xs), color="0.7", alpha=0.2, label="prior")
ax1.plot(
    xs, score(xs[:, np.newaxis]), "--", linewidth=1.0, color="black", label="Score"
)

ax1.plot(sample, score_sample, "o", markersize=6.0, label="Score eval.")
ax1.set_ylim(-0.3, 3.2)
ax1.legend()
ax2.legend()

fig.tight_layout()
plt.show()

Parameters are drawn from the current posterior approximation. The scores of these parameters are evaluated and added to the stack of parameters evaluated so far.

In [None]:
# Infer estimation of score: one weight per evaluated sample, then the
# weighted sum of the evaluated scores.
weights = get_weights_mc(prior, sample, n_sample_estim=10**5)
m_score = np.sum(score_sample * weights)

# For plotting
fig, ax1 = plt.subplots()
ax2 = ax1.twinx()

ax2.set_ylabel("Density")
# ax1.set_ylabel("Score")


ax2.fill_between(xs, prior_norm.pdf(xs), color="0.7", alpha=0.2, label="prior")


# Weights are recomputed on a copy with a larger `n_sample_estim`, for the
# plot only.
loc_sample = sample.copy()
loc_weights = get_weights_mc(prior, loc_sample, n_sample_estim=10**6)

# Sort samples (and their weights) by position.
sorter = np.argsort(loc_sample.flatten())
loc_sample = loc_sample[sorter]
loc_weights = loc_weights[sorter]

# Midpoints between consecutive sorted samples: the cell boundaries.
cut = (loc_sample[1:] + loc_sample[:-1]) / 2

# The two extreme samples have only one neighbouring midpoint, hence no cell
# width to divide by; their plotted value is set to 0.
loc_weights_renorm = loc_weights.copy()
loc_weights_renorm[0] = 0.0
loc_weights_renorm[-1] = 0.0

# Weight divided by cell width, so it can be drawn on the density axis.
loc_weights_renorm[1:-1] = loc_weights_renorm[1:-1] / (cut[1:, 0] - cut[:-1, 0])

# Duplicate every boundary / value to draw a piecewise-constant curve.
cut_plot = [xs[0]] + list(np.array([[a, a] for a in cut]).flatten()) + [xs[-1]]
weights_for_plot = list(np.array([[a, a] for a in loc_weights_renorm]).flatten())
ax2.plot(cut_plot, weights_for_plot, color="black", label="Weights", linewidth=1)

ax2.plot(loc_sample, loc_weights_renorm, "o", color="tab:blue", markersize=4)
ax1.set_yticks([])
fig.tight_layout()

ax2.legend()
plt.show()

Weights are computed for each parameter in the stack of evaluated parameters.

In [None]:
# `gmap.T` applied to the evaluated sample — presumably the sufficient
# statistic of the family; confirm against surpbayes.proba.
T_s = gmap.T(sample)
# Coefficients of the score approximation fitted on the weighted sample.
T_approx = exp_approximation(Ts=T_s, scores=score_sample, weights=weights)

# Approximation evaluated on the plotting grid: linear combination of the
# components of `T_xs` with coefficients `T_approx`.
T_xs = gmap.T(xs[:, np.newaxis])
score_approx = (T_xs * T_approx).sum(-1)

fig, ax1 = plt.subplots()
ax2 = ax1.twinx()

ax2.set_ylabel("Density")
ax1.set_ylabel("Score")

ax2.fill_between(xs, prior_norm.pdf(xs), color="0.7", alpha=0.2, label="prior")
ax1.plot(
    xs, score(xs[:, np.newaxis]), "--", linewidth=1.0, color="black", label="Score"
)

ax1.plot(sample, score_sample, "o", markersize=9.0, label="Score eval.")
ax1.set_ylim(-0.3, 3.2)

score_sample_approx = (T_s * T_approx).sum(-1)

# Weighted sum of the residuals on the sample; added as a constant offset to
# the plotted approximation.
delta = np.sum(weights * (score_sample - score_sample_approx))

ax1.plot(xs, delta + score_approx, color="peru", linewidth=2.5, label="Score approx.")
fig.tight_layout()
ax1.legend()
ax2.legend()
plt.show()

The weighing process is used to compute the best L2 approximation of the score within a given class of functions (here, for Gaussians, quadratic forms).

In [None]:
# Constant offset of the approximation (weighted sum of the residuals).
delta = np.sum(weights * (score_sample - (T_s * T_approx).sum(-1)))

# Update performed in the `T` parametrisation of the family: the direction is
# -T_approx / temp, and only a fraction (1 - dampen) of it is applied.
T_prior = gmap.param_to_T(prior_param)
T_updt_dir = -(temp**-1) * T_approx
T_new = T_prior + (1 - dampen) * T_updt_dir

# Back to the standard parametrisation of `gmap`.
post_param = gmap.T_to_param(T_new)

post = gmap(post_param)
# scipy counterpart of the updated distribution, used only for plotting.
post_norm = norm(loc=post.means[0], scale=np.sqrt(post.cov[0, 0]))

fig, ax1 = plt.subplots()
ax2 = ax1.twinx()

ax2.set_ylabel("Density")
ax1.set_ylabel("Score")

ax2.fill_between(xs, prior_norm.pdf(xs), color="0.7", alpha=0.2, label="prior")

ax1.set_ylim(-0.3, 3.2)

fig.tight_layout()

ax1.plot(xs, delta + score_approx, color="peru", linewidth=2.5, label="Score approx.")
fig.tight_layout()

ax2.fill_between(
    xs, post_norm.pdf(xs), color="skyblue", alpha=0.2, label="post. approx."
)
# The axis labels below repeat the ones already set above.
ax2.set_ylabel("Density")

ax1.set_ylabel("Score")

fig.tight_layout()
ax1.legend()
ax2.legend()
plt.show()

The approximated score is used to compute the posterior update.
This learning cycle is then repeated until either convergence is achieved or the maximal number of optimisation steps is reached.

The algorithm is now showcased on a more complex function

In [None]:
import numpy as np
import numba as nb
import matplotlib.pyplot as plt

from surpbayes.bayes import iter_prior, variational_inference, iter_prior_vi
from surpbayes.proba import GaussianMap, TensorizedGaussianMap

# For plotting purposes
from math import pi

# Unit circle sampled at 1000 angles, stored with shape (2, 1000). The range
# slightly exceeds a full turn (2.001 * pi), presumably so the drawn curve closes.
angles = np.linspace(0, 2.001 * pi, 1000)
circle = np.array([np.cos(angles), np.sin(angles)])


def half_cov(cov):
    """Return the symmetric square root of ``cov`` via its ``eigh`` decomposition.

    With ``cov = V diag(w) V^T`` the result is ``V diag(sqrt(w)) V^T``.
    """
    eigvals, eigvecs = np.linalg.eigh(cov)
    scaled = eigvecs * np.sqrt(eigvals)
    return scaled @ eigvecs.T


def repr_gauss(mean, cov, rad=1.0):
    """Return the points ``mean + rad * half_cov(cov) @ u`` for ``u`` on the unit circle.

    The module-level ``circle`` array provides the unit-circle points; the
    result has one point per row.
    """
    offsets = half_cov(cov) @ circle
    return mean + rad * offsets.T


# Directions and offset used by `score` below.
arr_1 = np.array([0.0, 1.0])
arr_2 = np.array([1, -1])
_shift = np.array([0.0, 0.5])


def score(x):
    """Two-dimensional test score, vectorized over the leading axes of ``x``.

    With ``z = x - _shift`` the value is
    ``4 * arctan(0.5 * ((z[1] - 1)**2 + 100 * (z[0] - z[1])**2))``.
    It is zero exactly when ``z = (1, 1)``, i.e. ``x = (1, 1.5)``.
    """
    z = x - _shift
    second = z @ arr_1
    gap = z @ arr_2
    return 4 * np.arctan(0.5 * ((second - 1.0) ** 2 + 100.0 * gap**2))

In [None]:
from surpbayes.proba import FactCovGaussianMap, FixedCovGaussianMap
from time import time

# Alternative Gaussian families with a given covariance; they are built here
# but this cell optimises over the full `GaussianMap` (see the commented
# `facgm` argument below).
cov = np.array([[0.5, 0.4], [0.4, 0.5]])

facgm = FactCovGaussianMap(2, cov=cov)
ficgm = FixedCovGaussianMap(2, cov=cov)

gmap = GaussianMap(2)

tic = time()
opt_res_new = variational_inference(
    score,
    gmap,
    #     facgm,
    temperature=0.1,  # the lambda term in the variational inference problem
    optimizer="score_approx",
    per_step=96,
    parallel=False,
    vectorized=True,
    print_rec=10,
    chain_length=141,
    n_estim_weights=10**5,
    kl_max=0.04,
    m_max=20,
    xtol=10**-7,
    kltol=10**-7,
    silent=False,
    dampen=0.1,
)
tac = time()
print("Time elapsed:", tac - tic)

# Draw the curve returned by `repr_gauss` and the mean of every fifth
# distribution in the history.
for i, param in enumerate(opt_res_new.hist_param[:]):
    if i % 5 == 0:
        proba = gmap(param)
        proba_repr = repr_gauss(proba.means, proba.cov)
        plt.plot(proba_repr[:, 0], proba_repr[:, 1], color="black", linewidth=0.2)
        plt.plot(proba.means[0], proba.means[1], "x")

plt.title("Evolution of the posterior estimation")
plt.show()

In [None]:
# seaborn is first imported in a later cell of this notebook; import it here
# so that this cell also runs in top-to-bottom order.
import seaborn as sns

# One curve (from `repr_gauss`) per distribution in the history. Note that the
# module-level names `xs` and `ys` are rebound to the curve coordinates.
for param in opt_res_new.hist_param:
    proba = gmap(param)
    proba_repr = repr_gauss(proba.means, proba.cov)
    #         xs, ys = shift(proba_repr[:, 0], proba_repr[:, 1])
    xs, ys = proba_repr[:, 0], proba_repr[:, 1]
    sns.lineplot(x=xs, y=ys, color="black", sort=False, linewidth=0.2)
#         plt.plot(proba.means[0], proba.means[1], "x")

In [None]:
import seaborn as sns

# Bounds and resolution of the heatmap grid (read by `shift`).
x_min = -1.1
x_max = 1.8
n_x = 801

y_min = -1.1
y_max = 1.8
n_y = 801

x_axis_labels = np.linspace(
    x_min, x_max, n_x
)  # Avoid renormalisation issue at the angles
y_axis_labels = np.linspace(y_min, y_max, n_y)


def shift(xs, ys):
    """Map data coordinates to heatmap cell coordinates.

    ``x_min`` is sent to 0 and ``x_max`` to ``n_x`` (likewise for y), using
    the module-level bounds at call time.
    """
    x_span = x_max - x_min
    y_span = y_max - y_min
    return n_x * (xs - x_min) / x_span, n_y * (ys - y_min) / y_span


# Grid of 2-D points, shape (n_y, n_x, 2); the first coordinate of each point
# comes from `y_axis_labels` and the second from `x_axis_labels` (both axes
# span the same range here).
values = np.array(np.meshgrid(y_axis_labels, x_axis_labels)).T

# NOTE: this rebinds the module-level name `z` to the score on the grid.
z = score(values)
# `alpha` is not used in the rest of this cell.
alpha = np.linspace(0, 8, 100)

sns.heatmap(
    z.T,
    xticklabels=y_axis_labels,
    yticklabels=x_axis_labels,
    cmap=sns.color_palette("Blues", as_cmap=True),
)
# sns.heatmap(z)
# Overlay every second distribution of the history, converted to heatmap
# coordinates with `shift`.
for i, param in enumerate(opt_res_new.hist_param):
    if i % 2 == 0:
        proba = gmap(param)
        proba_repr = repr_gauss(proba.means, proba.cov)
        #         proba_repr  = repr_gauss(np.array([1.38,-1.1]), 0.01 * np.eye(2))

        xs, ys = shift(proba_repr[:, 0], proba_repr[:, 1])
        sns.lineplot(x=xs, y=ys, sort=False, color="yellow", linewidth=1.0)
#         plt.plot(proba.means[0], proba.means[1], "x")


# Final distribution (`opti_param`) in red.
proba = gmap(opt_res_new.opti_param)
proba_repr = repr_gauss(proba.means, proba.cov)
#         proba_repr  = repr_gauss(np.array([1.38,-1.1]), 0.01 * np.eye(2))

xs, ys = shift(proba_repr[:, 0], proba_repr[:, 1])
sns.lineplot(x=xs, y=ys, sort=False, color="red", linewidth=1.0)

# x,y = shift(0.0, 0.0)
# sns.pointplot(x=x, y=y)

# plt.title("A Copula")
# Remove the tick labels and save the figure to disk.
plt.xticks([])
plt.yticks([])
plt.savefig("Rosenbrock0_01.png", dpi=900, transparent=True)

In [None]:
%%timeit
np.asfortranarray(c)

In [None]:
c = np.random.normal(0, 1, 10**5).reshape((10, 100, 100))

In [None]:
# `plot_score_evol` is imported in a later cell of this notebook; import it
# here so that this cell also runs in top-to-bottom order.
from surpbayes.bayes.plot.optim_result import plot_score_evol

# help(plot_score_evol)
# Plot returned by `plot_score_evol`, with `opt_res_new.log_vi.means()` drawn
# on top, then saved to disk.
plot = plot_score_evol(opt_res_new, cmap=sns.color_palette("Blues", as_cmap=True))
plot.plot(opt_res_new.log_vi.means())
plot.savefig("score_density_evolution.pdf", transparent=True)

In [None]:
from surpbayes.bayes.plot.optim_result import plot_score_evol

In [None]:
plot_score_evol

In [None]:
# Overrides the module-level bounds `x_min` / `x_max` that `shift` reads.
x_min = 0
x_max = n_y

# NOTE(review): no module-level `x` is assigned anywhere in the visible
# notebook, so this line raises NameError when the notebook is run
# top-to-bottom — confirm what was intended.
z = x - x_min

In [None]:
def shift(xs, ys):
    """Map data coordinates to heatmap coordinates with the second axis reversed.

    The first coordinate is measured from ``x_min``; the second is measured
    downwards from ``y_max``. Module-level bounds are read at call time.
    """
    x_span = x_max - x_min
    y_span = y_max - y_min
    return n_x * (xs - x_min) / x_span, n_y * (y_max - ys) / y_span

In [None]:
# First entry of the parameter history, labelled as the prior.
proba = gmap(opt_res_new.hist_param[0])
proba_repr = repr_gauss(proba.means, proba.cov)
plt.plot(proba_repr[:, 0], proba_repr[:, 1], linewidth=1.0, label="prior")

# Every distribution of the history except the last five.
for i, param in enumerate(opt_res_new.hist_param[:-5]):
    if i % 1 == 0:
        proba = gmap(param)
        proba_repr = repr_gauss(proba.means, proba.cov)
        plt.plot(proba_repr[:, 0], proba_repr[:, 1], color="black", linewidth=0.2)
#         plt.plot(proba.means[0], proba.means[1], "x")

# Final distribution (`opti_param`), labelled as the posterior.
proba = gmap(opt_res_new.opti_param)
proba_repr = repr_gauss(proba.means, proba.cov)
plt.plot(
    proba_repr[:, 0],
    proba_repr[:, 1],
    linewidth=1.2,
    color="crimson",
    label="posterior",
)
# The two-dimensional `score` vanishes where x - _shift = (1, 1), so its global
# minimiser is (1, 1) + _shift = (1, 1.5), not (1, 1).
glob_min = _shift + 1.0
plt.plot(
    [glob_min[0]], [glob_min[1]], "x", c="k", markersize=10, label="Glob. Min."
)

plt.title("Evolution of the posterior estimation (score approx)")
plt.legend()
plt.xticks([-1.0, 0.0, 1.0])
plt.yticks([-1.0, 0.0, 1.0])
# plt.savefig("score_approx_training_low_temp.png")
plt.show()

In [None]:
from surpbayes.proba import FactCovGaussianMap, FixedCovGaussianMap
from time import time

# Gaussian families built from the same covariance; only `facgm` is used as
# the variational family in this cell.
cov = np.array([[0.5, 0.4], [0.4, 0.5]])

facgm = FactCovGaussianMap(2, cov=cov)
ficgm = FixedCovGaussianMap(2, cov=cov)

tic = time()
opt_res_new = variational_inference(
    score,
    facgm,
    temperature=0.01,  # the lambda term in the variational inference problem
    optimizer="score_approx",
    per_step=320,
    parallel=False,
    vectorized=True,
    print_rec=10,
    chain_length=61,
    n_estim_weights=10**5,
    kl_max=0.05,
    m_max=20,
    xtol=10**-9,
    #     alpha_filter=0.99,
    silent=False,
    dampen=0.5,
)
tac = time()
# Elapsed wall-clock time in seconds.
print(tac - tic)
# Curve and mean of every distribution in the history (`i % 1 == 0` is always
# true).
for i, param in enumerate(opt_res_new.hist_param):
    if i % 1 == 0:
        proba = facgm(param)
        proba_repr = repr_gauss(proba.means, proba.cov)
        plt.plot(proba_repr[:, 0], proba_repr[:, 1], color="black", linewidth=0.2)
        plt.plot(proba.means[0], proba.means[1], "x")

plt.title("Evolution of the posterior estimation")
plt.show()

In [None]:
from surpbayes.bayes.plot.score_approx import plot_weights_per_gen

plot_weights_per_gen(opt_res_new, n_sample_estim_weight=10**6)

In [None]:
from surpbayes.bayes.plot.optim_result import plot_score_evol

plot = plot_score_evol(opt_res_new)
plot.show()

In [None]:
from surpbayes.bayes.plot import plot_hist_vi, plot_scores

plot = plot_scores(sample_val=opt_res_new.sample_val, marker="x", s=0.03)
plot.show()

In [None]:
from surpbayes.proba import FactCovGaussianMap, FixedCovGaussianMap

# Same families as before; this run uses a higher temperature, a smaller
# `kl_max` and `kltol=0.0`.
cov = np.array([[0.5, 0.4], [0.4, 0.5]])

facgm = FactCovGaussianMap(2, cov=cov)
ficgm = FixedCovGaussianMap(2, cov=cov)

opt_res_new = variational_inference(
    score,
    facgm,
    temperature=0.1,  # the lambda term in the variational inference problem
    optimizer="score_approx",
    per_step=160,
    parallel=False,
    vectorized=True,
    print_rec=10,
    chain_length=41,
    n_estim_weights=10**5,
    kl_max=10**-2,
    kltol=0.0,
    m_max=20,
    xtol=10**-6,
    silent=False,
    dampen=0.1,
)

In [None]:
from surpbayes.bayes.plot import plot_score_push_begin_end, plot_score_evol

plot = plot_score_evol(opt_res_new)
plot.show()
plot.clf()
plot = plot_score_push_begin_end(opt_res_new)
plot.show()
plot.clf()

In [None]:
import seaborn as sns

sample_val = opt_res_new.sample_val
sns.kdeplot(sample_val.vals()[sample_val.gen_tracker() == 0])
sns.kdeplot(
    sample_val.vals()[sample_val.gen_tracker() == np.max(sample_val.gen_tracker())]
)

We need to define the space of probability distributions on which we wish to optimize. Here we consider a score defined on a two dimensional space, and therefore use gaussian distributions on $\mathbb{R}^2$. The prior will be the standard distribution

## Gradient based algorithm

To use the gradient-based routine, the parameter 'optimizer' must be either "corr_weights" or "knn". It is advised to use "corr_weights".
It is normal behavior that the optimisation procedure raises some ProbaBadGrad warnings. These indicate that a problematic gradient estimation was rejected as it damaged significantly the score. No need to worry about those.

In [None]:
# Two-dimensional Gaussian family used as the variational family.
gauss_map = GaussianMap(2)

# We define the prior as the reference gaussian distribution, i.e. N(0,Id)
prior_param = gauss_map.ref_param

# To solve the variational inference problem, we use the variational_inference function.
# The gradient-based routine is selected through optimizer="corr_weights".
opt_res_grad = variational_inference(
    score,
    gauss_map,
    prior_param=prior_param,
    temperature=0.1,  # the lambda term in the variational inference problem
    per_step=160,
    optimizer="corr_weights",
    gen_decay=np.log(1.2),
    k=160 * 20,
    parallel=False,
    vectorized=True,
    print_rec=50,
    chain_length=501,
    refuse_conf=0.9,
    momentum=0.9,
    eta=0.05,
    silent=False,
)

# It is normal behavior that the optimisation procedure raises some ProbaBadGrad warnings.
# These indicate that a problematic gradient estimation was rejected as it damaged significantly
# the score. No need to worry about those.

# We can access the parameter describing the posterior through the opti_param attribute
post_param = opt_res_grad.opti_param

In [None]:
# First entry of the parameter history, labelled as the prior.
proba = gmap(opt_res_grad.hist_param[0])
proba_repr = repr_gauss(proba.means, proba.cov)
plt.plot(proba_repr[:, 0], proba_repr[:, 1], linewidth=1.0, label="prior")

# Every third distribution of the history.
for i, param in enumerate(opt_res_grad.hist_param[:]):
    if i % 3 == 0:
        proba = gmap(param)
        proba_repr = repr_gauss(proba.means, proba.cov)
        plt.plot(proba_repr[:, 0], proba_repr[:, 1], color="black", linewidth=0.2)
#         plt.plot(proba.means[0], proba.means[1], "x")

# Final distribution (`opti_param`), labelled as the posterior.
proba = gmap(opt_res_grad.opti_param)
proba_repr = repr_gauss(proba.means, proba.cov)
plt.plot(
    proba_repr[:, 0],
    proba_repr[:, 1],
    linewidth=1.2,
    color="crimson",
    label="posterior",
)
# The two-dimensional `score` vanishes where x - _shift = (1, 1), so its global
# minimiser is (1, 1) + _shift = (1, 1.5), not (1, 1).
glob_min = _shift + 1.0
plt.plot(
    [glob_min[0]], [glob_min[1]], "x", c="0.0", markersize=10, label="Glob. Min."
)

plt.title("Evolution of the posterior estimation (corr weights)")
plt.legend()
plt.xticks([-1.0, 0.0, 1.0])
plt.yticks([-1.0, 0.0, 1.0])
plt.savefig("corr_weights_training.png")
plt.show()


# for i, param in enumerate(opt_res_grad.hist_param[:-450]):
#     if i % 2 == 0:
#         proba = gauss_map(param)
#         proba_repr = repr_gauss(proba.means, proba.cov)
#         plt.plot(proba_repr[:, 0], proba_repr[:, 1], color="black", linewidth=0.2)

In [None]:
# The optimisation start by modification of the covariance

for i, param in enumerate(opt_res_grad.hist_param[:25:2]):
    if i % 1 == 0:
        proba = gauss_map(param)
        proba_repr = repr_gauss(proba.means, proba.cov)
        plt.plot(proba_repr[:, 0], proba_repr[:, 1], color="black", linewidth=0.2)

In [None]:
# The distribution then shifts towards the correct mean value
for i, param in enumerate(opt_res_grad.hist_param[25:500:30]):
    if i % 1 == 0:
        proba = gauss_map(param)
        proba_repr = repr_gauss(proba.means, proba.cov)
        plt.plot(proba_repr[:, 0], proba_repr[:, 1], color="black", linewidth=0.2)

In [None]:
# The evolution of the VI score can also be tracked:
plt.plot(opt_res_grad.hist_score)
plt.yscale("log")

In [None]:
plot_score_push_begin_end(opt_res_grad)

## Score approximation

Score approximation is now the default way to optimise Catoni's bound for Gaussian, BlockDiagonalGaussian and ExponentialFamily distribution maps. It yields much more stable results, but requires more processing time between steps. The number of calls to the model is greatly reduced, and the results are more accurate, especially when the temperature is low.

In [None]:
from surpbayes.proba import GaussianMap

# Same family and score as the gradient-based run, solved with the
# "score_approx" routine; `prior_param` is not passed, so the library default
# prior applies.
gauss_map = GaussianMap(2)
opt_res_sa = variational_inference(
    score,
    gauss_map,
    # prior_param=prior_param,
    temperature=0.05,  # the lambda term in the variational inference problem
    optimizer="score_approx",
    per_step=160,
    parallel=False,
    vectorized=True,
    print_rec=1,
    chain_length=51,
    n_estim_weights=10**4,
    kl_max=0.1,
    m_max=20,
    xtol=10**-6,
    alpha_filter=0.9,
    silent=False,
)

In [None]:
# Final distribution of the "score_approx" run.
proba = gauss_map(opt_res_sa.opti_param)
proba_repr = repr_gauss(proba.means, proba.cov)
plt.plot(
    proba_repr[:, 0],
    proba_repr[:, 1],
    color="blue",
    linewidth=0.2,
    label="Approx. Score",
)

# Final distribution of the "corr_weights" run.
# NOTE(review): the two runs use different temperatures (0.05 vs 0.1), so the
# two curves do not solve the same variational problem.
proba = gauss_map(opt_res_grad.opti_param)
proba_repr = repr_gauss(proba.means, proba.cov)
plt.plot(
    proba_repr[:, 0],
    proba_repr[:, 1],
    color="black",
    linewidth=0.2,
    label="Gradient Descent",
)
plt.legend()
plt.title("Result of optimisation routines")

The differences are significant. Both distributions exhibit high correlations. The score approximation routine migrated towards the minimiser of the score, $(1, 1.5)$, while the gradient descent algorithm ended its migration too early.

We plot the successive distributions to exhibit the improved stability of the training algorithm.

In [None]:
for i, param in enumerate(opt_res_sa.hist_param):
    proba = gauss_map(param)
    proba_repr = repr_gauss(proba.means, proba.cov)
    plt.plot(proba_repr[:, 0], proba_repr[:, 1], color="black", linewidth=0.2)
plt.title("Evolution of the posterior estimation")
plt.show()

The score now decreases in a much more regular fashion.

In [None]:
plt.plot(opt_res_sa.hist_score)

### Checks for other distribution maps

In [None]:
from surpbayes.proba import FactCovGaussianMap, FixedCovGaussianMap

# Check of the "score_approx" routine on `FactCovGaussianMap` at a very low
# temperature; `ficgm` is built but not used in this cell.
cov = np.array([[1, 0.5], [0.5, 1]])

facgm = FactCovGaussianMap(2, cov=cov)
ficgm = FixedCovGaussianMap(2, cov=cov)

opt_res = variational_inference(
    score,
    facgm,
    temperature=0.001,  # the lambda term in the variational inference problem
    optimizer="score_approx",
    per_step=320,
    parallel=False,
    vectorized=True,
    print_rec=10,
    chain_length=51,
    n_estim_weights=10**4,
    kl_max=0.1,
    m_max=20,
    xtol=10**-6,
    alpha_filter=0.9,
    silent=False,
)

The setting above was a special case, where the true score was quadratic. As the score_approx algorithm for Gaussians relies on a quadratic approximation, this heavily favors this approach. We now investigate a case where the approximations looked for (quadratic with a diagonal matrix) do not fit the true score (quadratic with a non-diagonal matrix).

In [None]:
for i, param in enumerate(opt_res.hist_param[:5]):
    if i % 1 == 0:
        proba = facgm(param)
        proba_repr = repr_gauss(proba.means, proba.cov)
        plt.plot(proba_repr[:, 0], proba_repr[:, 1], color="black", linewidth=0.2)
        plt.plot(proba.means[0], proba.means[1], "x")

plt.title("Evolution of the posterior estimation")
plt.show()

In [None]:
# Set corr between -1 and 1. The higher the absolute value of corr, the harder the problem is.
corr = 0.7


def make_mat(vars, corr=0.0):
    """Build the symmetric 2x2 matrix with diagonal ``vars[0], vars[1]``.

    Both off-diagonal entries equal ``corr * sqrt(vars[0] * vars[1])``.
    ``corr`` must satisfy ``|corr| < 1`` (checked with an assertion).
    """
    assert np.abs(corr) < 1.0
    var_0, var_1 = vars[0], vars[1]
    off_diag = np.sqrt(var_0 * var_1) * corr
    return np.array([[var_0, off_diag], [off_diag, var_1]])


center = np.array([-1.0, 1.0])
mat = make_mat([4.0, 1.0], corr)


def score(xs):
    """Quadratic form ``(xs - center)^T mat (xs - center)``, one value per row of ``xs``.

    ``center`` and ``mat`` are read from module level.
    """
    centered = xs - center
    return ((centered @ mat) * centered).sum(-1)


from surpbayes.proba import BlockDiagGaussMap

# Block-diagonal Gaussian family with one block per coordinate ([1] and [0]).
bgmap = BlockDiagGaussMap([[1], [0]])

# `optimizer` is not passed, so the library default routine is used.
opt_res = variational_inference(
    score,
    bgmap,
    temperature=0.2,
    per_step=320,
    kl_max=0.5,
    chain_length=30,
    dampen=0.0,
    alpha_filter=0.95,
    n_estim_weights=3 * 10**5,
    vectorized=True,
)

### Amount of reuse of the previous evaluations
The whole point of the score_approx technique is to make most use of all previous evaluations of the score. We can track the impact these previous evaluations have by checking the weight given to each sample at the posterior distribution.

In [None]:
# NOTE(review): an earlier cell imports `plot_weights_per_gen` (plural) from
# `surpbayes.bayes.plot.score_approx`; confirm that `plot_weight_per_gen`
# (singular) is also exported by `surpbayes.bayes.plot`.
from surpbayes.bayes.plot import plot_weight_per_gen

plot_weight_per_gen(opt_res)

In [None]:
from surpbayes.bayes.score_approx.weighing.monte_carlo import get_weights_mc_gauss

# Weight of every evaluated sample under the final distribution.
weights = get_weights_mc_gauss(
    bgmap(opt_res.opti_param), opt_res.sample_val.sample(), n_sample_estim=10**6
)
plt.plot(weights)
plt.show()
# Sum the weights per block of 320 samples. The shape (30, 320) matches
# `chain_length=30` and `per_step=320` of the run above; the reshape fails if
# the number of stored samples differs from 30 * 320.
weights_per_gen = weights.reshape((30, 320)).sum(1)
plt.plot(weights_per_gen, label="Tot weight per generation")
plt.legend()
plt.show()

While the first 5 generations have smaller impact, all generations after generation 8 have non negligible weight.

In [None]:
[bgmap.kl(par1, par0) for par1, par0 in zip(opt_res.hist_param[1:], opt_res.hist_param)]

In [None]:
# The distribution then shifts towards the correct mean value
for i, param in enumerate(opt_res.hist_param):
    proba = bgmap(param)
    proba_repr = repr_gauss(proba.means, proba.cov)
    plt.plot(proba_repr[:, 0], proba_repr[:, 1], color="black", linewidth=0.2)
plt.show()
plt.clf()
plt.plot(opt_res.hist_score)

Under the hood, variational_inference can redirect to two gradient-based routines (optimizer argument): either "corr_weights" or "knn" (referred to as "KNN" below). The name refers to the method used in order to make the most of the evaluations of the score function.

The 'variational_inference' function was designed for situations where evaluating the 'score' is rather expensive. It is still, however, an accelerated gradient descent algorithm. The change is that the gradient's expression involves an expectation with respect to the current distribution. The naïve approach, which consists in drawing i.i.d. samples from the current distribution to obtain an unbiased estimate of the expectation, is improved upon by recycling previous samples. These are generated from distributions similar to the current one, provided small optimization steps are taken (the 'eta' parameter is small).

As it is not possible to use these samples directly, two procedures are proposed. "corr_weights" consists in giving each sample a weight to adjust for the difference of probability for it being drawn between the current and previous distributions. "KNN" consists in constructing a surrogate score using a K-Nearest neighbor algorithm, then using this surrogate on a large number of samples to compute the derivative.

The number of samples used all in all when evaluating the derivative is controlled by the argument 'k'. By default it is None, amounting to all samples being used.

For "corr_weights", it is possible and advisable to set the 'gen_decay' parameter higher than 0 (default value). The 'gen_decay' parameter gives a decreasing weights to older generations when computing the derivative. While generations just before tend to be close to the current one, older ones would no longer be representative, and could have a negative impact when computing the derivative. The higher 'gen_decay', the lower will be the influence of older generation (exponentially decreasing weights of $\exp(-t \times gen\_decay)$ are used).

For "KNN", the number of neighbors used by the K-nearest neighbors algorithm is NOT controlled by the argument 'k', but by "n_neighbors". As stated above, "k" controls the number of samples used. By default, "n_neighbors" is 5.


The 'corr_weights' method has the edge in most cases. For instance, 'KNN' by design does not like situations where the Hessian near the minima has eigenvalues of different magnitudes, which is the case for the Rosenbrock function tested here. This could be improved upon by learning the distance used in 'KNN', or by training different surrogates.

In [None]:
# For comparison, variational_inference with KNN method

# Gradient-based run with the "knn" routine; `k=None` is passed explicitly.
opt_res = variational_inference(
    score,
    gauss_map,
    prior_param=prior_param,
    temperature=0.1,
    per_step=1600,
    optimizer="knn",
    k=None,
    parallel=False,
    print_rec=20,
    chain_length=600,
    vectorized=True,
    momentum=0.99,
    eta=0.1,
    silent=True,
)

# Distribution at the final parameter.
end_proba = gauss_map(opt_res.opti_param)

# Mean score under the final distribution, computed by `integrate` with
# n_sample = 1000.
print(
    f"Mean score of estimated posterior: {end_proba.integrate(score, n_sample = 1000)}"
)

# The evolution of the VI score can also be tracked:
plt.plot(opt_res.hist_score)

### Iter prior procedure

The iterated prior procedure is not a Bayesian technique at all. It is actually an optimisation routine, using a Bayesian flavored technique.

The goal is to minimize $S(x)$, a score function. In order to do that, parameters are drawn from a distribution. The distribution for the next generation is then obtained by centering around the best parameter found so far, and by using the top parameters found so far to construct the covariance. Each dimension of the parameter is drawn independently from a Gaussian distribution, so that the covariance is diagonal and can be defined by using the empirical standard deviations of the top parameters.

In [None]:
# The initial prior_param is a parameter for the TensorizedGaussianMap.
# Row 0 is left at zero and row 1 is set to one — presumably the means and the
# standard deviations of the tensorized Gaussian; confirm against
# TensorizedGaussianMap.
ini_prior = np.zeros((2, 2))
ini_prior[1] = 1.0

opt_res = iter_prior(
    score,
    ini_prior_param=ini_prior,
    gen_per_step=800,
    chain_length=50,
    keep=100,
    frac_sparse=0.0,
    parallel=False,
)

# The opti_param attribute of opt_res gives a distribution and NOT a parameter
opti_proba_param = opt_res.opti_param

# The optimal parameter can still be found:
# (first entry of `full_sample` — presumably sorted by score; confirm)
opti_param = opt_res.full_sample[0]
print(opti_param)

The technique used in iter prior can still be useful in the context of variational inference, in order to construct quickly a good initial distribution. The function iter_prior_vi is designed precisely for that purpose.

In [None]:
# Quick construction of a starting distribution with `iter_prior_vi`.
opt_res = iter_prior_vi(
    score,
    prior_param=ini_prior,
    temperature=0.1,
    gen_per_step=800,
    chain_length=50,
    keep=100,
    frac_sparse=0.0,
    parallel=False,
    vectorized=True,
)

# The opti_param attribute of opt_res gives a distribution and NOT a parameter
opti_proba_param = opt_res.opti_param

# Convert the (2, 2) result into a (3, 2) array: row 0 is copied as is, and
# rows 1-2 hold the diagonal matrix built from row 1 of the result.
start_post = np.zeros((3, 2))

start_post[0] = opti_proba_param[0]
start_post[1:] = np.diag(opti_proba_param[1])

# Gradient-based run started from `start_post` instead of the prior.
# Every other call in this notebook selects the routine through `optimizer=`;
# the former `VI_method=` keyword is presumably a stale name — confirm against
# the installed surpbayes version.
opt_res = variational_inference(
    score,
    gauss_map,
    prior_param=prior_param,
    post_param=start_post,
    temperature=0.1,
    per_step=160,
    optimizer="corr_weights",
    gen_decay=np.log(1.2),
    k=160 * 20,
    parallel=False,
    vectorized=True,
    print_rec=2,
    chain_length=50,
    refuse_conf=0.95,
    momentum=0.95,
    eta=0.1,
    silent=False,
)

In [None]:
plt.plot(opt_res.hist_score)

## Uniform priors - Gaussian computations

The proba module offers a class of distributions on the hypercube which admit a Gaussian-like interpretation when the distributions are sufficiently concentrated, and for which the KL divergence can be computed exactly.

In [None]:
from surpbayes.proba.gauss import GaussHypercubeMap

dim = 2

# Toy score function
def score(x):
    """Evaluate the toy score at one point or at a batch of points.

    The score is the arctangent of a weighted sum of two squared linear
    forms of ``x``: the first coordinate shifted by 0.6, and the first
    coordinate minus twice the second one.

    Args:
        x: array whose last axis has length 2.

    Returns:
        The score, with the last axis of ``x`` contracted away.
    """
    first_coord = x @ np.array([1.0, 0.0], dtype=np.float64)
    contrast = x @ np.array([1.0, -2.0], dtype=np.float64)
    return np.arctan(0.2 * (first_coord - 0.6) ** 2 + 20 * contrast**2)


pmap = GaussHypercubeMap(2)

In [None]:
opt_res = variational_inference(
    score,
    pmap,
    temperature=0.01,  # the lambda term in the variational inference problem
    per_step=80,
    dampen=0.1,
    kl_max=0.2,
    parallel=False,
    vectorized=True,
    print_rec=10,
    chain_length=51,
    silent=False,
)

The posterior can adapt to scores with strong identifiability issues, such as the Rosenbrock function, since the probabilities can exhibit a strong correlation structure.

In [None]:
import seaborn as sns

# Distribution obtained by applying pmap to the optimised parameter.
proba = pmap(opt_res.opti_param)
# The density is evaluated below through dens.
# The log density of the function can be accessed through log_dens
# Regular grid of 121 points per axis, kept 1e-4 away from 0 and 1.
x_axis_labels = np.linspace(10**-4, 1 - 10**-4, 121)
y_axis_labels = np.linspace(10**-4, 1 - 10**-4, 121)

# Stack the meshgrid into a (121, 121, 2) array whose last axis holds the
# two coordinates of each grid point, then evaluate the density on it.
values = np.array(np.meshgrid(y_axis_labels, x_axis_labels)).T
z = proba.dens(values)

sns.heatmap(z, xticklabels=x_axis_labels, yticklabels=y_axis_labels)
plt.title("Posterior distribution")
# Remove the ticks on both axes (121 labels per axis would be unreadable).
plt.xticks([])
plt.yticks([])

In [None]:
plt.plot(opt_res.hist_score, label="Evolution of the VI score")
plt.yscale("log")
plt.legend()
plt.show()

In [None]:
## Tests for problems of larger dimensions

# d: size of `center` and of the square matrix `mat` built below.
# k: length of the index blocks.
d = 80
k = 5

# Partition range(d) into consecutive index blocks of length k (the last
# block is shorter whenever k does not divide d).
blocks = [list(range(i * k, min(d, (i + 1) * k))) for i in range(int(np.ceil(d / k)))]

# Random Gaussian matrix of shape (4 * d, d); broadcasting rescales each
# column by a factor drawn uniformly between 0.5 and 2.
# The draws are not seeded, so mat and center change from run to run.
mat = (np.random.uniform(0.5, 2, d)) * np.random.normal(0, 1, (4 * d, d))
# Gram matrix divided by the number of rows: symmetric positive
# semi-definite, of shape (d, d).
mat = mat.T @ mat / (4 * d)

# Eigenvalues in ascending order, printed to inspect the spectrum of mat.
print(np.linalg.eigvalsh(mat))

center = np.random.normal(0, 1, d)

In [None]:
def score(xs):
    """Evaluate a quadratic form of the arctan-squashed offsets to ``center``.

    With ``delta = arctan(xs - center)``, the value returned for each point
    is ``sum_j delta_j * (delta @ mat)_j``, taken over the last axis.

    Args:
        xs: array broadcastable against the module-level ``center``.

    Returns:
        The score, with the last axis summed out.
    """
    squashed = np.arctan(xs - center)
    weighted = squashed @ mat
    return np.sum(squashed * weighted, axis=-1)


bgmap = BlockDiagGaussMap(blocks)

In [None]:
opt_res = variational_inference(
    score,
    bgmap,
    per_step=800,
    kl_max=3.0,
    temperature=0.1,
    chain_length=20,
    vectorized=True,
)

In [None]:
[bgmap.kl(par1, par0) for par1, par0 in zip(opt_res.hist_param[1:], opt_res.hist_param)]

In [None]:
plt.plot(opt_res.hist_score)

# Checking stability

In [None]:
def score(x):
    """Evaluate the negated sum of two Gaussian-shaped bumps.

    The last axis of ``x`` (of length 1) is contracted with ``[1]``. The
    result is fed to a narrow bump of height 1.2 centred at 4 and to a wider
    bump of height 1.0 centred at 0; the score is minus their sum.

    Args:
        x: array whose last axis has length 1.

    Returns:
        The score, with the last axis of ``x`` contracted away.
    """
    u = x @ [1]
    narrow_bump = 1.2 * np.exp(-4 * (u - 4) ** 2)
    wide_bump = 1.0 * np.exp(-1.0 * u**2)
    return -(narrow_bump + wide_bump)


xs = np.linspace(-4, 8, 4000).reshape((4000, 1))

plt.plot(xs, score(xs))

In [None]:
from surpbayes.accu_xy import AccuSampleVal

xs = np.linspace(3.5, 4.5, 1000).reshape((1000, 1))
accu = AccuSampleVal((1,), 1000)
accu.add(xs, score(xs))

In [None]:
# Two-sided tail probability P(|Z| > 4) of a standard normal variable Z.
# norm is imported here so that the cell also runs when the notebook is
# executed from top to bottom (the next cell is where it is imported).
from scipy.stats import norm

2 * (1 - norm.cdf(4))

In [None]:
from scipy.stats import norm

# NOTE(review): presumably the family of Gaussian distributions in
# dimension 1, matching the one-dimensional score defined above -- confirm.
gmap = GaussianMap(1)

# Time the whole variational inference run (wall-clock seconds).
tic = time()
opt_res_stable = variational_inference(
    score,
    gmap,
    #     facgm,
    #     prev_eval=accu, # Check if this improves matter or not
    temperature=0.002,  # the lambda term in the variational inference problem
    VI_method="score_approx",
    # NOTE(review): prior and starting posterior are (2, 1) arrays; the first
    # row presumably encodes the mean (0.0 for the prior, 4.0 for the
    # starting posterior) -- confirm against GaussianMap.
    prior_param=np.array([[0.0], [2.0]]),
    post_param=np.array([[4.0], [1.0]]),
    per_step=100,
    parallel=True,
    vectorized=False,
    print_rec=10,
    chain_length=101,
    n_estim_weights=10**6,
    kl_max=0.1,
    m_max=20,
    xtol=10**-8,
    kltol=10**-8,
    alpha_filter=1.0,
    silent=False,
    dampen=0.01,
)
tac = time()
print("Time elapsed:", tac - tic)

# Abscissas on which the successive densities are drawn.
xs = np.linspace(-4, 6, 1000)

# Draw the density encoded by every second entry of hist_param.
for i, param in enumerate(opt_res_stable.hist_param[:]):
    if i % 2 == 0:
        proba = gmap(param)
        # scipy's norm takes (loc, scale), hence the square root applied to
        # the covariance entry.
        plt.plot(
            xs,
            norm(proba.means[0], np.sqrt(proba.cov[0])).pdf(xs),
            color="black",
            linewidth=0.4,
        )
#         proba_repr = repr_gauss(proba.means, proba.cov)
#         plt.plot(proba_repr[:, 0], proba_repr[:, 1], color="black", linewidth=0.2)
#         plt.plot(proba.means[0], proba.means[1], "x")

plt.title("Evolution of the posterior estimation")
plt.show()

In [None]:
opt_res_stable.opti_param

In [None]:
# Zoom on entries 10 to 29 of hist_param: draw the Gaussian density encoded
# by each of these parameters. Every entry of the slice is plotted.
for param in opt_res_stable.hist_param[10:30]:
    proba = gmap(param)
    # scipy's norm takes (loc, scale), hence the square root applied to the
    # covariance entry.
    plt.plot(
        xs,
        norm(proba.means[0], np.sqrt(proba.cov[0])).pdf(xs),
        color="black",
        linewidth=0.4,
    )

plt.title("Evolution of the posterior estimation")
plt.show()