In [None]:
%matplotlib inline
import matplotlib
import numpy as np
import matplotlib.pyplot as plt
import sklearn.datasets
import theano
import theano.tensor as T
import lasagne
import math


# Variational Autoencoders (VAE)

In this exercise we'll implement a variational autoencoder. Very briefly, an autoencoder encodes some input into a new and usually more compact representation which can be used to reconstruct the input data again. A variational autoencoder makes the further assumption that the compact representation follows a probabilistic distribution (usually a Gaussian), which makes it possible to sample new data from a trained variational autoencoder. The "variational" part of the name comes from the fact that these models are trained using variational inference.

The mathematical details of the training can be a bit challenging. However, we believe that probabilistic deep learning will be an important part of future deep learning developments, which is why we find it important to introduce the concepts.

As background material we recommend reading [Tutorial on Variational Autoencoder](http://arxiv.org/abs/1606.05908). For the implementation of the model you must read the article "Auto-Encoding Variational Bayes", Kingma & Welling, ICLR 2014: http://arxiv.org/pdf/1312.6114v10.pdf and "Stochastic Backpropagation and Approximate Inference in Deep Generative Models", Rezende et al, ICML 2014:
http://arxiv.org/pdf/1401.4082v3.pdf



## VAE crash course

VAEs consist of two parts:

 * Encoder (also known as recognition, inference or Q-model): Maps the input data into a probabilistic latent space by calculating the mean and variance parameters of a Gaussian distribution as a function of the input data x:  $q(z|x) = \mathcal{N}(z|\mu_\phi(x), \sigma^2_\phi(x)I)$
 * Decoder (also known as generative or P-model): Reconstructs the input image using a sample from the latent space defined by the encoder model: $p(x|z)$
<img src="VAE.png" alt="Drawing" style="width: 300px;"/>


In more mathematical details we have (this can be a bit challenging)

$p(x) = \int_z p(x|z)p(z)dz$

$p(x) = \int_z p(x|z)p(z)\frac{q(z|x)}{q(z|x)}dz$


$p(x) = \int_z q(z|x) \frac{p(x|z)p(z)}{q(z|x)}dz$


$\log p(x) = \log \int_z q(z|x) \frac{p(x|z)p(z)}{q(z|x)}dz$

$\log p(x) \geq  \int_z q(z|x)\log \frac{p(x|z)p(z)}{q(z|x)}dz$

This is known as the variational lower bound; the inequality follows from Jensen's inequality, since the logarithm is concave. We continue with a bit of rewriting

$\log p(x) \geq E_{q(z|x)} \left[\log \frac{p(x|z)p(z)}{q(z|x)}\right]$

$\log p(x) \geq E_{q(z|x)} \left[\log p(x|z)\right] - KL(q(z|x) \| p(z))$

Here the first term on the right hand side is the data reconstruction term and the second term is the Kullback-Leibler divergence between the approximate posterior $q(z|x)$ and the prior $p(z)$, which acts as a probabilistic regularizer.

### Training a VAE 
The VAE is similar to a deterministic autoencoder except that we assume that the hidden units follow some distribution. Usually we just assume that the units are independent and standard Gaussian distributed.

Above we defined a lower bound on the log likelihood of the data. We can train the model by pushing up the lower bound, i.e. we do gradient ascent on the lower bound. By using the _reparameterization trick_ we can directly backprop through the model and optimize the lower bound. If you are interested in the technical details you can look at the references given above.

### Setting up the network

We set up the network like an autoencoder except that the bottleneck layer is the __SimpleSampleLayer__, which samples the hidden units. 

The lower bound is calculated in the `LogLikelihood` function. 

In [None]:
#To speed up training we'll only work on a subset of the data
#We discretize the data to 0 and 1 in order to use it with a bernoulli observation model p(x|z) = Ber(mu(z))

def bernoullisample(x):
    """Binarize `x` by drawing one Bernoulli sample per entry.

    Every entry of `x` is used as the success probability of a single
    binomial trial, so the result has the same shape as `x` and contains
    only zeros and ones, cast to Theano's configured float type.
    """
    draws = np.random.binomial(n=1, p=x, size=x.shape)
    return draws.astype(theano.config.floatX)


# Fix the NumPy seed so the stochastic binarization below (and later
# np.random draws such as the latent samples used for plotting) give the
# same result after every kernel restart.
np.random.seed(1234)

data = np.load('mnist.npz')
num_classes = 10  # MNIST has ten digit classes (0-9)

# Training split: first 50000 examples, pixels binarized to 0/1.
x_train = bernoullisample(data['X_train'][:50000]).astype('float32')
targets_train = data['y_train'][:50000].astype('int32')

# Validation split: only the first 500 examples, to keep evaluation fast.
x_valid = bernoullisample(data['X_valid'][:500]).astype('float32')
targets_valid = data['y_valid'][:500].astype('int32')

# Test split: only the first 500 examples.
x_test = bernoullisample(data['X_test'][:500]).astype('float32')
targets_test = data['y_test'][:500].astype('int32')

In [None]:
#plot a few MNIST examples

def plot_samples(x,title=''):
    """Show the first 100 entries of `x` as a 10x10 grid of 28x28 images.

    Each x[k] is reshaped to (28, 28) and placed row by row on a single
    canvas, which is then drawn in grayscale with `title` above it.
    """
    side, per_row = 28, 10
    canvas = np.zeros((side*per_row, per_row*side))
    for k in range(per_row*per_row):
        # k-th image goes to grid cell (row, col), filled in row-major order
        row, col = divmod(k, per_row)
        tile = x[k].reshape((side, side))
        canvas[row*side:(row+1)*side, col*side:(col+1)*side] = tile
    plt.figure(figsize=(7, 7))
    plt.imshow(canvas, cmap='gray')
    plt.title(title)
    plt.show()

# Show the first 100 binarized training digits as a quick sanity check of the data.
plot_samples(x_train[:100],title='MNIST handwritten digits')

In [None]:
# Define a couple of helper functions.
# Constant -0.5*log(2*pi). It is not referenced by the helpers visible in this
# notebook (presumably the normalisation term of a Gaussian log-density -- TODO confirm).
c = - 0.5 * math.log(2*math.pi)
def log_bernoulli(x, p, eps=0.0):
    """Bernoulli log-likelihood of `x` under success probabilities `p`.

    `p` is first clipped to the interval [eps, 1 - eps]; the result is the
    negated binary cross-entropy between the clipped probabilities and `x`.
    """
    clipped = T.clip(p, eps, 1.0 - eps)
    cross_entropy = T.nnet.binary_crossentropy(clipped, x)
    return -cross_entropy

def kl_normal2_stdnormal(mean, log_var):
    """KL divergence from N(mean, exp(log_var)) to the standard normal N(0, 1).

    Computed entry by entry as -0.5 * (1 + log_var - mean**2 - exp(log_var));
    no reduction over any axis is performed here.
    """
    variance = T.exp(log_var)
    bracket = 1 + log_var - mean**2 - variance
    return -0.5 * bracket


Construct the lasagne layers.

In [None]:
from lasagne.layers import InputLayer,DenseLayer,get_output, get_all_params
from lasagne.nonlinearities import elu, identity, sigmoid
from samplelayer import SimpleSampleLayer

# Size of one input vector (last axis of x_train) and of the latent code z.
num_features = x_train.shape[-1]
num_latent_z = 64

# MODEL SPECIFICATION

# ENCODER: two hidden layers of 256 ELU units, followed by two parallel
# linear heads that output the parameters of q(z|x).
l_in_x = InputLayer(shape=(None, num_features))
l_enc = DenseLayer(l_in_x, num_units=256, nonlinearity=elu)
l_enc = DenseLayer(l_enc, num_units=256, nonlinearity=elu) 
l_muq = DenseLayer(l_enc, num_units=num_latent_z, nonlinearity=identity)     # mu(x)
l_logvarq = DenseLayer(l_enc, num_units=num_latent_z, nonlinearity=lambda x: T.clip(x,-10,10)) # logvar(x), clipped to [-10, 10]

# Sample a latent representation z ~ q(z|x) = N(mu(x), exp(logvar(x))).
# NOTE(review): the sampling itself is implemented in samplelayer.py, which is not shown here.
l_z = SimpleSampleLayer(mean=l_muq, log_var=l_logvarq)

# We split the model in two parts (separate input layers) to allow sampling from the decoder model separately.

# DECODER: mirrors the encoder, taking a latent code z as its own input.
l_in_z = InputLayer(shape=(None, num_latent_z))
l_dec = DenseLayer(l_in_z, num_units=256, nonlinearity=elu) 
l_dec = DenseLayer(l_dec, num_units=256, nonlinearity=elu) 
l_mux = DenseLayer(l_dec, num_units=num_features, nonlinearity=sigmoid)  # reconstruction of the input; sigmoid output since mux \in [0,1]

In [None]:

# Symbolic inputs: a matrix of observations x and a matrix of latent codes z.
sym_x = T.matrix('x')
sym_z = T.matrix('z')

# Training graph: encode sym_x, then feed the sampled z into the decoder's input layer.
z_train, muq_train, logvarq_train = get_output([l_z,l_muq,l_logvarq],{l_in_x:sym_x},deterministic=False)
mux_train = get_output(l_mux,{l_in_z:z_train},deterministic=False)

# Same encoder -> decoder path, requested with deterministic=True.
z_eval, muq_eval, logvarq_eval = get_output([l_z,l_muq,l_logvarq],{l_in_x:sym_x},deterministic=True)
mux_eval = get_output(l_mux,{l_in_z:z_eval},deterministic=True)

# Decoder-only graph: maps an externally supplied latent code (sym_z) to mux.
mux_sample = get_output(l_mux,{l_in_z:sym_z},deterministic=True)



# Define the cost function

def LogLikelihood(mux,x,muq,logvarq):
    """Variational lower bound and its two terms, averaged over the batch.

    Returns a tuple (LL, log_px_given_z, KL_qp):
      * log_px_given_z: Bernoulli log-likelihood of `x` under `mux`
        (probabilities clipped with eps=1e-6), summed over axis 1 and
        averaged over the remaining axis.
      * KL_qp: KL(q(z|x) || N(0, 1)) computed from `muq` and `logvarq`,
        summed over axis 1 and averaged the same way.
      * LL: log_px_given_z - KL_qp.
    """
    # reduce each term over axis 1 first, then average over the samples
    per_sample_logpx = log_bernoulli(x, mux, eps=1e-6).sum(axis=1)
    per_sample_kl = kl_normal2_stdnormal(muq, logvarq).sum(axis=1)
    log_px_given_z = per_sample_logpx.mean()
    KL_qp = per_sample_kl.mean()
    return log_px_given_z - KL_qp, log_px_given_z, KL_qp


# Lower bound and its reconstruction / KL terms for the training and the evaluation graph.
LL_train, logpx_train, KL_train = LogLikelihood(mux_train, sym_x, muq_train, logvarq_train)
LL_eval, logpx_eval, KL_eval = LogLikelihood(mux_eval, sym_x, muq_eval, logvarq_eval)


# Trainable parameters of the layers feeding l_z (encoder) and l_mux (decoder).
all_params = get_all_params([l_z,l_mux],trainable=True)

# Let Theano do its magic and get all the gradients we need for training.
# We differentiate -LL so that minimizing it pushes the lower bound up.
all_grads = T.grad(-LL_train, all_params)


# Set the update function for the parameters.
# You might want to experiment with other update schemes such as rmsprop or adadelta.
updates = lasagne.updates.adam(all_grads, all_params, learning_rate=1e-3)


# One training step: returns [LL, logpx, KL] for the batch and applies the Adam updates.
f_train = theano.function(inputs=[sym_x],
                          outputs=[LL_train, logpx_train, KL_train],
                          updates=updates)

# Evaluation without parameter updates: returns [LL, logpx, KL] computed on the
# deterministic=True graph (LL_eval etc.), not on the training graph.
f_eval = theano.function(inputs=[sym_x],
                         outputs=[LL_eval, logpx_eval, KL_eval])

# Decode externally supplied latent codes z into mux.
f_sample= theano.function(inputs=[sym_z],
                         outputs=[mux_sample])

# Reconstruct inputs: encode x, then decode the resulting z.
f_recon= theano.function(inputs=[sym_x],
                         outputs=[mux_eval])



In [None]:
# Test the forward pass.
# f_eval is compiled without parameter updates, so this check does not take a
# training step on the validation data (f_train would).
print(f_eval(x_valid))

In [None]:
# Draw 100 latent codes from a standard normal and decode them with the
# still-untrained decoder to see what its samples look like.
z = np.random.normal(loc=0, scale=1, size=(100, num_latent_z)).astype('float32')
[mux_sample] = f_sample(z)  # f_sample returns a one-element list

plot_samples(mux_sample, title='MNIST handwritten samples, untrained model')

Train the model.

In [None]:
num_epochs = 10
batch_size = 64
# Integer division: examples that do not fill a whole batch are skipped.
num_batch_train = x_train.shape[0] // batch_size

# Per-epoch histories. NOTE: LL_train, KL_train and logpx_train rebind the
# symbolic Theano variables of the same name from the cell that compiles
# f_train; the compiled functions keep working, but re-run that cell before
# rebuilding any graph from those names.
LL_train, KL_train, logpx_train = [],[],[]
LL_valid, KL_valid, logpx_valid = [],[],[]

for e in range(num_epochs):
    _LL_train, _KL_train, _logpx_train = [],[],[]
    for i in range(num_batch_train):
        batch = x_train[batch_size*i:(i+1)*batch_size]
        # f_train returns [LL, logpx, KL_qp] and applies one parameter update
        batch_LL, batch_logpx, batch_KL = f_train(batch)
        _LL_train.append(batch_LL)
        _logpx_train.append(batch_logpx)
        _KL_train.append(batch_KL)

    # Training statistics are the mean over all batches of the epoch.
    LL_train.append(np.mean(_LL_train))
    KL_train.append(np.mean(_KL_train))
    logpx_train.append(np.mean(_logpx_train))

    # Validation statistics on the whole validation subset, without updates.
    valid_LL, valid_logpx, valid_KL = f_eval(x_valid)
    LL_valid.append(valid_LL)
    logpx_valid.append(valid_logpx)
    KL_valid.append(valid_KL)

    # Single parenthesized argument: valid as a Python 2 print statement and a Python 3 call.
    print("Epoch %i\t"%(e) +
          "Train: LL: %0.1f\tKL %0.1f\tlogpx: %0.1f\t"%(LL_train[-1],KL_train[-1],logpx_train[-1]) +
          "Valid: LL: %0.1f\tKL %0.1f\tlogpx: %0.1f"%(LL_valid[-1],KL_valid[-1],logpx_valid[-1]))


# Learning curves: one point per epoch for training (red) and validation (blue).
epoch = np.arange(len(LL_train))
plt.figure()
plt.plot(epoch,LL_train,'r',epoch,LL_valid,'b')
plt.legend(['Train LL','Val LL'],loc='best')
plt.xlabel('Epochs')  # the x axis counts epochs, not individual parameter updates
plt.ylabel('LL')
plt.show()



In [None]:
# Plot samples from the trained model by decoding the latent draws z created earlier.
# Raw strings keep the LaTeX backslash (\sim) without relying on an invalid escape sequence.
mux_sample = f_sample(z)[0]
plot_samples(mux_sample,title=r'MNIST handwritten samples, $z\sim p(z)$')

# Plot reconstructions of the first 100 test digits.
mux_recon = f_recon(x_test[:100])[0]
plot_samples(mux_recon,title=r'MNIST handwritten reconstructions, $z\sim q(z|x)$')


## Assignments
Remember that the model defines the probability distribution $p(x,z) = p(x|z)p(z)$. We additionally have the inference network $q(z|x)$ which allows us to infer the latent variables, $z$, for specific input data values $x$.



1. Explain how you could sample from the model. Which function does this in the code? 
2. Explain how you could get reconstructions from the model. Remember that you have the inference network $q(z|x)$
3. Use the original paper http://arxiv.org/pdf/1312.6114v10.pdf or [this blog](http://blog.shakirm.com/2015/10/machine-learning-trick-of-the-day-4-reparameterisation-tricks/) to explain what the reparameterization trick does. 
4. The VAE is a probabilistic model. We could model $p(x,z,y)$ where $y$ is the label information. How could this model handle semi-supervised learning? You can look at the papers https://arxiv.org/pdf/1406.5298.pdf or  https://arxiv.org/pdf/1602.05473v4.pdf. 