# Natural gradients

This shows some basic usage of the natural gradient optimizer, both on its own and in combination with other optimizers using the Actions class.

In [1]:
from matplotlib import pyplot as plt
%matplotlib inline
import numpy as np
import gpflow

### Natural gradients turn VGP into GPR in a single step, if the likelihood is Gaussian

In [2]:
# Toy regression data: N inputs drawn uniformly in D dimensions.
N = 100
D = 2
np.random.seed(0)  # fix the seed so the random inputs are repeatable
X = np.random.uniform(size=(N, D))
Y = np.sin(10 * X)


def make_kern():
    """Return a newly constructed Matern52 kernel built with argument D."""
    return gpflow.kernels.Matern52(D)


# Two models on the same data: variational (VGP) and exact (GPR).
m_vgp = gpflow.models.VGP(X, Y, make_kern(), gpflow.likelihoods.Gaussian())
m_gpr = gpflow.models.GPR(X, Y, make_kern())

# Give both models the same noise variance so their likelihoods are comparable.
m_vgp.likelihood.variance = 0.1
m_gpr.likelihood.variance = 0.1

gpr_log_lik = m_gpr.compute_log_likelihood()
print('exact GP likelihood: {:.4f}'.format(gpr_log_lik))

vgp_log_lik_before = m_vgp.compute_log_likelihood()
print('VGP likelihood is before nat grad step: {:.4f}'.format(vgp_log_lik_before))

# A single natural gradient step with gamma=1 on the variational parameters.
natgrad_optimizer = gpflow.training.NatGradOptimizer(gamma=1.0)
vgp_variational_params = [[m_vgp.q_mu, m_vgp.q_sqrt]]
natgrad_optimizer.minimize(m_vgp, maxiter=1, var_list=vgp_variational_params)

# should be the same as the GP likelihood (up to discrepancies caused by jitter etc.)
vgp_log_lik_after = m_vgp.compute_log_likelihood()
print('VGP likelihood after a single nat grad step: {:.4f}'.format(vgp_log_lik_after))



exact GP likelihood: -254.9348
VGP likelihood is before nat grad step: -1404.0805
VGP likelihood after a single nat grad step: -254.9348


### Interleaving an ordinary gradient step with a nat grad step
In this case (Gaussian likelihood) it achieves optimization of hyperparameters as if the model were GPR

In [3]:
# Settings reused by every Adam run in this notebook.
lr = 0.01
iterations = 100

# Baseline: run Adam on the GPR model.
gpr_adam = gpflow.training.AdamOptimizer(lr)
gpr_adam.minimize(m_gpr, maxiter=iterations)

# use Adam + nat grads on VGP. The hyperparameters at the end should match the GPR model
def run_nat_grads_with_adam(model, lr, gamma, iterations, var_list=None):
    """Interleave Adam steps with natural gradient steps on ``model``.

    An Adam optimization tensor and a natural gradient optimization tensor
    are built for the model, wrapped as actions, grouped (Adam first) and
    run in a loop configured with ``stop=iterations``.

    Args:
        model: a model exposing ``q_mu`` and ``q_sqrt`` parameters plus
            ``anchor`` and ``enquire_session`` (the VGP/SVGP models in this
            notebook).
        lr: value passed to ``AdamOptimizer``.
        gamma: value passed to ``NatGradOptimizer``.
        iterations: value used as the loop's ``stop`` setting.
        var_list: passed to the natural gradient optimizer. When ``None``
            it defaults to ``[[model.q_mu, model.q_sqrt]]``; callers can
            supply their own list, e.g. one that includes a XiTransform.

    Returns:
        None. ``model.q_mu`` and ``model.q_sqrt`` are left non-trainable,
        and the model is anchored to its session before returning.
    """
    natgrad_vars = [[model.q_mu, model.q_sqrt]] if var_list is None else var_list

    # Adam must not optimize the variational parameters; they are handled
    # by the natural gradient optimizer instead.
    for variational_param in (model.q_mu, model.q_sqrt):
        variational_param.set_trainable(False)

    adam = gpflow.training.AdamOptimizer(lr)
    natgrad = gpflow.training.NatGradOptimizer(gamma)

    # Build the TensorFlow operations only after the trainable flags are set.
    adam_tensor = adam.make_optimize_tensor(model)
    natgrad_tensor = natgrad.make_optimize_tensor(model, var_list=natgrad_vars)

    def as_action(optimize_tensor):
        """Wrap an optimization tensor in a gpflow Optimization action."""
        action = gpflow.actions.Optimization()
        return action.with_optimizer_tensor(optimize_tensor).with_run_kwargs()

    # One loop iteration is the Adam action followed by the nat grad action.
    combined_step = gpflow.actions.Group(
        as_action(adam_tensor),
        as_action(natgrad_tensor),
    )
    gpflow.actions.Loop(combined_step).with_settings(stop=iterations)()

    session = model.enquire_session()
    model.anchor(session)

# Train the VGP model with Adam + nat grads (gamma=1), then compare the
# learned kernel lengthscale with the one from the Adam-trained GPR model.
run_nat_grads_with_adam(m_vgp, lr, 1.0, iterations)

gpr_lengthscale = m_gpr.kern.lengthscales.read_value()
print('GPR lengthscale: {:.4f}'.format(gpr_lengthscale))

vgp_lengthscale = m_vgp.kern.lengthscales.read_value()
print('VGP lengthscale: {:.4f}'.format(vgp_lengthscale))

GPR lengthscale: 0.4827
VGP lengthscale: 0.4827


### This also works for the sparse model
Nat grads turn SVGP into SGPR in the Gaussian likelihood case. 


In [4]:
# M inducing inputs drawn uniformly in the same D-dimensional space as X.
M = 10
Z = np.random.uniform(size=(M, D))

# Sparse variational model (SVGP) and its sparse regression counterpart
# (SGPR), sharing data and inducing inputs.
m_svgp = gpflow.models.SVGP(X, Y, make_kern(), gpflow.likelihoods.Gaussian(), Z=Z)
m_sgpr = gpflow.models.SGPR(X, Y, make_kern(), Z=Z)

# Same noise variance for both so the likelihood values are comparable.
m_svgp.likelihood.variance = 0.1
m_sgpr.likelihood.variance = 0.1

sgpr_log_lik = m_sgpr.compute_log_likelihood()
print('analytically optimal sparse model likelihood: {:.4f}'.format(sgpr_log_lik))

svgp_log_lik_before = m_svgp.compute_log_likelihood()
print('SVGP likelihood before nat grad step: {:.4f}'.format(svgp_log_lik_before))

# One natural gradient step on the SVGP variational parameters.
svgp_variational_params = [[m_svgp.q_mu, m_svgp.q_sqrt]]
natgrad_optimizer.minimize(m_svgp, maxiter=1, var_list=svgp_variational_params)

# should be the same as the SGPR likelihood (up to discrepancies caused by jitter etc.)
svgp_log_lik_after = m_svgp.compute_log_likelihood()
print('SVGP likelihood after a single nat grad step: {:.4f}'.format(svgp_log_lik_after))



analytically optimal sparse model likelihood: -281.5543
SVGP likelihood before nat grad step: -1404.0805
SVGP likelihood after a single nat grad step: -281.5543


### Minibatches
A crucial property of the natural gradient method is that it still works with minibatches. We need to use a smaller gamma (step size), though.

In [5]:
# SVGP model that evaluates its objective on minibatches of 50 points.
m_svgp_minibatch = gpflow.models.SVGP(
    X, Y, make_kern(), gpflow.likelihoods.Gaussian(),
    Z=Z, minibatch_size=50,
)
m_svgp_minibatch.likelihood.variance = 0.1

# Smaller gamma than in the full-batch case, run for 100 iterations.
natgrad_optimizer_minibatch = gpflow.training.NatGradOptimizer(gamma=0.1)
minibatch_variational_params = [[m_svgp_minibatch.q_mu, m_svgp_minibatch.q_sqrt]]
natgrad_optimizer_minibatch.minimize(
    m_svgp_minibatch,
    maxiter=100,
    var_list=minibatch_variational_params,
)

# Average 1000 evaluations of the minibatch objective before reporting it.
minibatch_log_liks = [m_svgp_minibatch.compute_log_likelihood() for _ in range(1000)]
L = np.average(minibatch_log_liks)
print('minibatch SVGP likelihood after nat grad optimization: {:.4f}'.format(L))



minibatch SVGP likelihood after nat grad optimization: -281.8539


### Comparison with ordinary gradients in the conjugate case

#### (Take home message: natural gradients are always better)
Compared with SVGP trained with ordinary gradients on minibatches, nat grads are much faster in the Gaussian case.

Here we'll do hyperparameter learning together with optimization of the variational parameters, comparing the interleaved nat grad approach with using ordinary gradients for the hyperparameters and variational parameters jointly.

In [None]:
# Two identically configured minibatch SVGP models, one per training scheme.
m_svgp_minibatch_ordinary = gpflow.models.SVGP(
    X, Y, make_kern(), gpflow.likelihoods.Gaussian(),
    Z=Z, minibatch_size=50,
)
m_svgp_minibatch_nat = gpflow.models.SVGP(
    X, Y, make_kern(), gpflow.likelihoods.Gaussian(),
    Z=Z, minibatch_size=50,
)

# Scheme 1: ordinary gradients, with Adam run on the model as a whole.
ordinary_adam = gpflow.training.AdamOptimizer(lr)
ordinary_adam.minimize(m_svgp_minibatch_ordinary, maxiter=iterations)

# Scheme 2: Adam interleaved with natural gradient steps (gamma=0.1).
run_nat_grads_with_adam(m_svgp_minibatch_nat, lr, 0.1, iterations)

# Report each model's objective averaged over 1000 minibatch evaluations.
ordinary_log_liks = [m_svgp_minibatch_ordinary.compute_log_likelihood() for _ in range(1000)]
L = np.average(ordinary_log_liks)
print('ordinary grads SVGP likelihood: {:.4f}'.format(L))

nat_log_liks = [m_svgp_minibatch_nat.compute_log_likelihood() for _ in range(1000)]
L = np.average(nat_log_liks)
print('nat grads + Adam SVGP likelihood: {:.4f}'.format(L))


### Comparison with ordinary gradients in the non-conjugate case

#### (Natural gradients are usually better)

We can use nat grads even when the likelihood isn't Gaussian. 

In [None]:

# Random binary labels (+1 / -1) with the same shape as X.
Y_binary = np.random.choice([1., -1], size=X.shape)

# Two identically configured VGP models with a Bernoulli likelihood, one per
# training scheme.
m_vgp_bernoulli = gpflow.models.VGP(X, Y_binary, make_kern(), gpflow.likelihoods.Bernoulli())
m_vgp_bernoulli_natgrads = gpflow.models.VGP(X, Y_binary, make_kern(), gpflow.likelihoods.Bernoulli())

# ordinary gradients and Adam
gpflow.training.AdamOptimizer(lr).minimize(m_vgp_bernoulli, maxiter=iterations)

# nat grads with Adam
run_nat_grads_with_adam(m_vgp_bernoulli_natgrads, lr, 0.1, iterations)

# Both models are VGP (not SVGP), so the labels name them accordingly.
print('ordinary grads VGP likelihood: {:.4f}'.format(m_vgp_bernoulli.compute_log_likelihood()))

print('nat grads + Adam VGP likelihood: {:.4f}'.format(m_vgp_bernoulli_natgrads.compute_log_likelihood()))


We can also choose to run natural gradients in another parameterization. The 
sensible choice might be the model parameters (q_mu, q_sqrt), which is already available in gpflow as `XiSqrtMeanVar`.

In [None]:

# A further Bernoulli VGP model, trained with natural gradients taken in the
# parameterization given by the XiSqrtMeanVar transform.
m_vgp_bernoulli_natgrads_xi = gpflow.models.VGP(
    X, Y_binary, make_kern(), gpflow.likelihoods.Bernoulli()
)

# Each var_list entry here is [q_mu, q_sqrt, transform].
xi_entry = [
    m_vgp_bernoulli_natgrads_xi.q_mu,
    m_vgp_bernoulli_natgrads_xi.q_sqrt,
    gpflow.training.XiSqrtMeanVar(),
]
var_list = [xi_entry]
run_nat_grads_with_adam(m_vgp_bernoulli_natgrads_xi, lr, 0.01, iterations, var_list=var_list)

xi_log_lik = m_vgp_bernoulli_natgrads_xi.compute_log_likelihood()
print('nat grads + Adam with XiSqrtMeanVar: {:.4f}'.format(xi_log_lik))


With sufficiently small steps, it shouldn't make a difference which transform is used, but for large 
steps this can make a difference in practice.
