# Notice

This file contains the code used by group 9 to replicate and extend 5.3, and to carry out the planar flow extension of 5.2.

This is the code used by candidate 1040250.

The following code is largely based on C. Cremer's GitHub repository for the paper "Inference Suboptimality in Variational Autoencoders" (https://github.com/chriscremer/Inference-Suboptimality). Proper citations are included in the heading of each section of my code. Original code and modifications to C. Cremer's code are pointed out and explained at the appropriate locations.

# Reference List:

Cremer, Chris, Li, Xuechen, and Duvenaud, David. Inference suboptimality in variational autoencoders. In Dy, Jennifer G. and Krause, Andreas (eds.), Proceedings of the 35th International Conference on Machine Learning, ICML 2018, Stockholmsmässan, Stockholm, Sweden, July 10-15, 2018, volume 80 of Proceedings of Machine Learning Research, pp. 1086–1094. PMLR, 2018. URL http://proceedings.mlr.press/v80/cremer18a.html.

# Different installations and Connection to Google Drive

(Original command lines written by the group)


In [0]:
from google.colab import drive
drive.mount('/content/drive')

In [0]:
%cd /content/drive/My\ Drive/ATiML

In [0]:
!python --version

In [0]:
!pip3 install tqdm

In [0]:
!pip3 install http://download.pytorch.org/whl/cu80/torch-0.2.0.post3-cp36-cp36m-manylinux1_x86_64.whl

In [0]:
!pip3 install torchvision==0.2

In [0]:
!pip3 install visdom

#Import

In [0]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function


import time
import sys
import os
import math
import argparse
from tqdm import tqdm
import numpy as np


import torch
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable
from torchvision import datasets, transforms
import visdom
import torch.utils.data
import torch.nn as nn

import pickle
import os
import gzip

# Some functions needed for VAE model

Modified from Chris Cremer's util.py file (Cremer et al. (2018)) in his GitHub code for the paper. Some code from that file has been removed below.

In [0]:

def lognormal2(x, mean, logvar):
    '''
    Log-density of a diagonal Gaussian, with one (mean, logvar) pair per
    batch element shared across all P samples.

    x: [P,B,Z]
    mean,logvar: [B,Z]
    output: [P,B]
    '''

    assert len(x.size()) == 3
    assert len(mean.size()) == 2
    assert len(logvar.size()) == 2
    assert x.size()[1] == mean.size()[0]

    D = x.size()[2]

    # D*log(2*pi) as a plain Python float. A scalar broadcasts against
    # tensors on any device, so the result follows the device of the inputs
    # instead of whatever torch.cuda.is_available() reports.
    term1 = D * math.log(2. * math.pi)

    # logvar.sum(1) is [B] and broadcasts over the leading P dimension.
    return -.5 * (term1 + logvar.sum(1) + ((x - mean).pow(2) / torch.exp(logvar)).sum(2))


def lognormal333(x, mean, logvar):
    '''
    Log-density of a diagonal Gaussian where every sample has its own
    mean and log-variance.

    x: [P,B,Z]
    mean,logvar: [P,B,Z]
    output: [P,B]
    '''

    assert len(x.size()) == 3
    assert len(mean.size()) == 3
    assert len(logvar.size()) == 3
    assert x.size()[0] == mean.size()[0]
    assert x.size()[1] == mean.size()[1]

    D = x.size()[2]

    # D*log(2*pi) as a plain Python float so the expression works for inputs
    # on any device (no tensor is created on a device chosen by
    # torch.cuda.is_available()).
    term1 = D * math.log(2. * math.pi)

    return -.5 * (term1 + logvar.sum(2) + ((x - mean).pow(2) / torch.exp(logvar)).sum(2))



    
def log_bernoulli(pred_no_sig, target):
    '''
    Bernoulli log-likelihood of `target` given pre-sigmoid outputs (logits),
    written in the max(x,0) - x*t + log(1 + exp(-|x|)) form so that the
    exponential never receives a positive argument.

    pred_no_sig is [P, B, X]
    target is [B, X]
    output is [P, B]
    '''

    assert len(pred_no_sig.size()) == 3
    assert len(target.size()) == 2
    assert pred_no_sig.size()[1] == target.size()[0]

    logits = pred_no_sig
    positive_part = torch.clamp(logits, min=0)
    log_tail = torch.log(1. + torch.exp(-torch.abs(logits)))

    # Per-pixel negative log-likelihood; target broadcasts over P.
    neg_loglik = positive_part - logits * target + log_tail

    # Sum over the X dimension, then flip the sign.
    return -neg_loglik.sum(2)








def lognormal3(x, mean, logvar):
    '''
    Diagonal-Gaussian log-density summed over the first dimension.
    The log(2*pi) normalising constant is not included.

    x: [P]
    mean,logvar: [P]
    output: [1]
    '''

    scaled_sq_err = (x - mean).pow(2) / torch.exp(logvar)
    return -.5 * (logvar.sum(0) + scaled_sq_err.sum(0))




def lognormal4(x, mean, logvar):
    '''
    Log-density of a diagonal Gaussian whose single (mean, logvar) pair is
    shared by every row of x.

    x: [B,X]
    mean,logvar: [X]
    output: [B]
    '''
    D = x.size()[1]

    # D*log(2*pi) as a plain Python float: broadcasts against tensors on any
    # device, whereas a CPU FloatTensor would fail when the inputs are on GPU.
    term1 = D * math.log(2. * math.pi)

    return -.5 * (term1 + logvar.sum(0) + ((x - mean).pow(2) / torch.exp(logvar)).sum(1))



# Generator of the VAE

Modified from Chris Cremer's generator.py file (Cremer et al. (2018)) in his GitHub code for the paper. Some code from that file has been removed below.

In [0]:

class Generator(nn.Module):
    '''
    Decoder network: a stack of linear layers described by
    hyper_config['decoder_arch'] (a sequence of [in_features, out_features]
    pairs). The activation hyper_config['act_func'] is applied after every
    layer except the last, so the output is un-squashed.
    '''

    def __init__(self, hyper_config):
        super(Generator, self).__init__()

        self.dtype = torch.cuda.FloatTensor if hyper_config['cuda'] else torch.FloatTensor

        self.z_size = hyper_config['z_size']
        self.x_size = hyper_config['x_size']
        self.act_func = hyper_config['act_func']

        # Decoder layers are kept in a plain list and registered one by one
        # under the names "1", "2", ... so their parameters are tracked.
        self.decoder_weights = []
        self.layer_norms = []
        for layer_spec in hyper_config['decoder_arch']:
            self.decoder_weights.append(nn.Linear(layer_spec[0], layer_spec[1]))

        for position, layer in enumerate(self.decoder_weights, 1):
            self.add_module(str(position), layer)

    def decode(self, z):
        '''
        z: [P,B,Z]
        output: [P,B,X] (no final non-linearity applied)
        '''
        n_samples, batch_size = z.size()[0], z.size()[1]

        # Fold samples and batch together for the linear layers.
        hidden = z.view(-1, self.z_size)
        for layer in self.decoder_weights[:-1]:
            hidden = self.act_func(layer(hidden))
        hidden = self.decoder_weights[-1](hidden)

        return hidden.view(n_samples, batch_size, self.x_size)





# Approximate Posterior Distributions used by the paper

Modified from Chris Cremer's distributions.py file (Cremer et al. (2018)) in his GitHub code for the paper. Some code in that file (e.g. HNF(nn.Module)) has been removed.

Note that "Flow" refers to Auxiliary Flow distributions, and "Flow1" refers to Flow distributions.

In [0]:


class Gaussian(nn.Module):
    '''
    Fully-factorised Gaussian approximate posterior. It has no parameters of
    its own; mean and logvar are passed in on every call.
    '''

    def __init__(self, hyper_config):
        #mean,logvar: [B,Z]
        super(Gaussian, self).__init__()

        use_cuda = torch.cuda.is_available()
        self.dtype = torch.cuda.FloatTensor if use_cuda else torch.FloatTensor

        self.z_size = hyper_config['z_size']
        self.x_size = hyper_config['x_size']

    def sample(self, mean, logvar, k):
        '''
        Reparameterised draw of k samples per batch element.

        mean,logvar: [B,Z]
        returns: z [P,B,Z], logqz [P,B]   (P = k)
        '''
        self.B = mean.size()[0]

        noise = torch.FloatTensor(k, self.B, self.z_size).normal_().type(self.dtype)
        eps = Variable(noise)  #[P,B,Z]

        std = torch.exp(.5*logvar)
        z = eps.mul(std) + mean  #[P,B,Z]

        return z, lognormal2(z, mean, logvar)

    def logprob(self, z, mean, logvar):
        '''
        z: [P,B,Z]; mean,logvar: [B,Z]
        returns: log q(z) [P,B]
        '''
        return lognormal2(z, mean, logvar)







class Flow(nn.Module):
    '''
    Auxiliary-variable flow approximate posterior.

    A Gaussian sample z0 and a standard-normal auxiliary variable v0 are
    pushed through n_flows coupling steps (see norm_flow). A small MLP
    (rv_weights) then produces the mean/logvar of a Gaussian r(v_T | z_T)
    that is used to score the final auxiliary variable.

    All layers are registered under the names "1", "2", ... in the order
    rv_weights first, then the flow layers.
    '''

    def __init__(self, hyper_config):
        #mean,logvar: [B,Z]
        super(Flow, self).__init__()

        if torch.cuda.is_available():
            self.dtype = torch.cuda.FloatTensor
        else:
            self.dtype = torch.FloatTensor

        self.hyper_config = hyper_config
        self.z_size = hyper_config['z_size']
        self.x_size = hyper_config['x_size']

        self.act_func = hyper_config['act_func']

        count = 1

        # r(vT|zT): maps z to the mean and logvar of v (hence 2*z_size outputs)
        rv_arch = [[self.z_size, 50], [50, 50], [50, self.z_size*2]]
        self.rv_weights = []
        for n_in, n_out in rv_arch:
            layer = nn.Linear(n_in, n_out)
            self.rv_weights.append(layer)
            self.add_module(str(count), layer)
            count += 1

        n_flows = 2
        self.n_flows = n_flows
        h_s = 50

        self.flow_params = []
        for i in range(n_flows):
            #first is for v, second is for z
            self.flow_params.append([
                [nn.Linear(self.z_size, h_s), nn.Linear(h_s, self.z_size), nn.Linear(h_s, self.z_size)],
                [nn.Linear(self.z_size, h_s), nn.Linear(h_s, self.z_size), nn.Linear(h_s, self.z_size)]
            ])

        # Registration order (and therefore the parameter names) is:
        # for each flow, for each layer position, the v-net then the z-net.
        for i in range(n_flows):
            for layer_idx in range(3):
                for net_idx in range(2):
                    self.add_module(str(count), self.flow_params[i][net_idx][layer_idx])
                    count += 1

    def norm_flow(self, params, z, v):
        '''
        One coupling step: v is rescaled/shifted conditioned on z, then z is
        rescaled/shifted conditioned on the new v.

        params: [v-net, z-net], each a list of three nn.Linear layers
        z,v: [PB,Z]
        returns: z [PB,Z], v [PB,Z], logdet [PB]
        '''
        h = torch.tanh(params[0][0](z))
        mew_ = params[0][1](h)
        sig_ = torch.sigmoid(params[0][2](h)) #[PB,Z]

        v = v*sig_ + mew_
        logdet = torch.sum(torch.log(sig_), 1)

        h = torch.tanh(params[1][0](v))
        mew_ = params[1][1](h)
        sig_ = torch.sigmoid(params[1][2](h)) #[PB,Z]
        z = z*sig_ + mew_
        logdet2 = torch.sum(torch.log(sig_), 1)

        #[PB]
        logdet = logdet + logdet2
        #[PB,Z], [PB]
        return z, v, logdet

    def sample(self, mean, logvar, k):
        '''
        mean,logvar: [B,Z]
        k: number of samples per batch element
        returns: z [P,B,Z], log-density term [P,B]
                 (logqz0 + logqv0 - logdetsum - logr(vT|zT))
        '''
        self.B = mean.size()[0]
        gaus = Gaussian(self.hyper_config)

        # q(z0)
        z, logqz0 = gaus.sample(mean, logvar, k)

        # q(v0): standard normal. The zeros follow self.dtype (as the rest of
        # the file does) rather than an unconditional .cuda(), which failed
        # on machines without a GPU.
        zeros = Variable(torch.zeros(self.B, self.z_size).type(self.dtype))
        v, logqv0 = gaus.sample(zeros, zeros, k)

        #[PB,Z]
        z = z.view(-1, self.z_size)
        v = v.view(-1, self.z_size)

        #Transform
        logdetsum = 0.
        for i in range(self.n_flows):
            z, v, logdet = self.norm_flow(self.flow_params[i], z, v)
            logdetsum += logdet

        logdetsum = logdetsum.view(k, self.B)

        # r(vT|zT): run the transformed z through the rv network
        out = z #[PB,Z]
        for i in range(len(self.rv_weights)-1):
            out = self.act_func(self.rv_weights[i](out))
        out = self.rv_weights[-1](out)
        mean = out[:, :self.z_size]
        logvar = out[:, self.z_size:]

        v = v.view(k, self.B, self.z_size)
        z = z.view(k, self.B, self.z_size)

        # Column slices are not contiguous, so copy before reshaping.
        mean = mean.contiguous().view(k, self.B, self.z_size)
        logvar = logvar.contiguous().view(k, self.B, self.z_size)

        logrvT = lognormal333(v, mean, logvar)

        logqz = logqz0 + logqv0 - logdetsum - logrvT

        return z, logqz






#NO AUX VAF
class Flow1(nn.Module):
    '''
    Flow approximate posterior without auxiliary variables.

    The latent vector is split into two halves; each of the n_flows (= 2)
    steps rescales/shifts one half conditioned on the other and then vice
    versa (see norm_flow). Layers are registered under "1", "2", ...
    '''

    def __init__(self, hyper_config):
        #mean,logvar: [B,Z]
        super(Flow1, self).__init__()

        self.dtype = torch.cuda.FloatTensor if torch.cuda.is_available() else torch.FloatTensor

        self.hyper_config = hyper_config
        self.z_size = hyper_config['z_size']
        self.x_size = hyper_config['x_size']
        self.act_func = hyper_config['act_func']

        self.n_flows = 2
        h_s = 50
        self.z_half_size = int(self.z_size / 2)
        half = self.z_half_size

        # Each flow holds two conditioner nets of three linear layers:
        # [hidden, shift, scale-logit].
        self.flow_params = []
        for _ in range(self.n_flows):
            self.flow_params.append([
                [nn.Linear(half, h_s), nn.Linear(h_s, half), nn.Linear(h_s, half)],
                [nn.Linear(half, h_s), nn.Linear(h_s, half), nn.Linear(h_s, half)]
            ])

        # Names are assigned flow by flow, layer position by layer position,
        # first net before second net.
        count = 1
        for flow in self.flow_params:
            for layer_idx in range(3):
                for net in flow:
                    self.add_module(str(count), net[layer_idx])
                    count += 1

    def norm_flow(self, params, z1, z2):
        '''
        params: [net for z2 given z1, net for z1 given z2]
        z1,z2: [PB,Z/2]
        returns: z1, z2 [PB,Z/2], logdet [PB]
        '''
        # Update z2 conditioned on z1
        hidden = F.tanh(params[0][0](z1))
        shift = params[0][1](hidden)
        scale = F.sigmoid(params[0][2](hidden)) #[PB,Z/2]
        z2 = z2*scale + shift
        logdet_a = torch.sum(torch.log(scale), 1)

        # Update z1 conditioned on the new z2
        hidden = F.tanh(params[1][0](z2))
        shift = params[1][1](hidden)
        scale = F.sigmoid(params[1][2](hidden)) #[PB,Z/2]
        z1 = z1*scale + shift
        logdet_b = torch.sum(torch.log(scale), 1)

        #[PB,Z/2], [PB,Z/2], [PB]
        return z1, z2, logdet_a + logdet_b

    def sample(self, mean, logvar, k):
        '''
        mean,logvar: [B,Z]
        returns: z [P,B,Z], logqz0 - sum of log-determinants [P,B]
        '''
        self.B = mean.size()[0]
        base = Gaussian(self.hyper_config)

        # q(z0)
        z0, logqz0 = base.sample(mean, logvar, k)

        #[PB,Z]
        flat = z0.view(-1, self.z_size)

        #Split z  [PB,Z/2]
        half = self.z_half_size
        z1 = flat.narrow(1, 0, half)
        z2 = flat.narrow(1, half, half)

        #Transform
        logdetsum = 0.
        for i in range(self.n_flows):
            z1, z2, logdet = self.norm_flow(self.flow_params[i], z1, z2)
            logdetsum += logdet

        logdetsum = logdetsum.view(k, self.B)

        #Put z back together  [PB,Z] -> [P,B,Z]
        z = torch.cat([z1, z2], 1).view(k, self.B, self.z_size)

        logpz = logqz0 - logdetsum

        return z, logpz

# Approximate Posterior: 4-Flow

This code is exactly the same as the code for Flow1(nn.Module), except that self.n_flows is set to 4 so that four flow transformations are performed.

In [0]:

class Four_Flow1(nn.Module):
    '''
    Flow approximate posterior (no auxiliary variables) built from four
    coupling steps. The latent vector is split into two halves that are
    alternately rescaled/shifted conditioned on each other.
    Layers are registered under the names "1", "2", ...
    '''

    def __init__(self, hyper_config):
        #mean,logvar: [B,Z]
        super(Four_Flow1, self).__init__()

        if torch.cuda.is_available():
            self.dtype = torch.cuda.FloatTensor
        else:
            self.dtype = torch.FloatTensor

        self.hyper_config = hyper_config
        self.z_size = hyper_config['z_size']
        self.x_size = hyper_config['x_size']
        self.act_func = hyper_config['act_func']

        self.n_flows = 4
        hidden_size = 50
        self.z_half_size = int(self.z_size / 2)

        def conditioner():
            # [hidden layer, shift head, scale-logit head]
            return [nn.Linear(self.z_half_size, hidden_size),
                    nn.Linear(hidden_size, self.z_half_size),
                    nn.Linear(hidden_size, self.z_half_size)]

        self.flow_params = []
        for _ in range(self.n_flows):
            self.flow_params.append([conditioner(), conditioner()])

        # Register every layer; within a flow the order is
        # net0[0], net1[0], net0[1], net1[1], net0[2], net1[2].
        ordered_layers = []
        for first_net, second_net in self.flow_params:
            for layer_a, layer_b in zip(first_net, second_net):
                ordered_layers.append(layer_a)
                ordered_layers.append(layer_b)
        for name, layer in enumerate(ordered_layers, 1):
            self.add_module(str(name), layer)

    def norm_flow(self, params, z1, z2):
        '''
        One coupling step.

        params: [net for z2 given z1, net for z1 given z2]
        z1,z2: [PB,Z/2]
        returns: z1, z2 [PB,Z/2], logdet [PB]
        '''
        net_for_z2, net_for_z1 = params[0], params[1]

        feat = F.tanh(net_for_z2[0](z1))
        offset = net_for_z2[1](feat)
        gate = F.sigmoid(net_for_z2[2](feat)) #[PB,Z/2]
        z2 = z2*gate + offset
        logdet = torch.sum(torch.log(gate), 1)

        feat = F.tanh(net_for_z1[0](z2))
        offset = net_for_z1[1](feat)
        gate = F.sigmoid(net_for_z1[2](feat)) #[PB,Z/2]
        z1 = z1*gate + offset
        logdet2 = torch.sum(torch.log(gate), 1)

        #[PB]
        logdet = logdet + logdet2
        return z1, z2, logdet

    def sample(self, mean, logvar, k):
        '''
        mean,logvar: [B,Z]
        returns: z [P,B,Z], logqz0 - sum of log-determinants [P,B]
        '''
        self.B = mean.size()[0]
        gaus = Gaussian(self.hyper_config)

        # q(z0), flattened to [PB,Z]
        z, logqz0 = gaus.sample(mean, logvar, k)
        z = z.view(-1, self.z_size)

        # Two halves of width Z/2
        z1 = z.narrow(1, 0, self.z_half_size)
        z2 = z.narrow(1, self.z_half_size, self.z_half_size)

        logdetsum = 0.
        for step in range(self.n_flows):
            z1, z2, logdet = self.norm_flow(self.flow_params[step], z1, z2)
            logdetsum += logdet
        logdetsum = logdetsum.view(k, self.B)

        # Re-assemble and restore the [P,B,Z] layout
        z = torch.cat([z1, z2], 1)
        z = z.view(k, self.B, self.z_size)

        logpz = logqz0 - logdetsum
        return z, logpz

# Original Approximate Posterior: Planar_Flow

This is an original piece of code written by a member of the group to implement the planar flow extension.

In [0]:

class Planar_Flow(nn.Module):
    '''
    Approximate posterior made of a Gaussian followed by two planar flow
    transformations  f(z) = z + u_hat * tanh(w.z + b).

    The flow parameters (w, u, b) are not stored in this module; they are
    passed to sample() for every batch element.
    '''

    def __init__(self, hyper_config):#, mean, logvar):
        #mean,logvar: [B,Z]
        super(Planar_Flow, self).__init__()

        if torch.cuda.is_available():
            self.dtype = torch.cuda.FloatTensor
        else:
            self.dtype = torch.FloatTensor

        self.hyper_config = hyper_config
        self.z_size = hyper_config['z_size']
        self.x_size = hyper_config['x_size']

        self.act_func = hyper_config['act_func']

        self.flow_params = []

    @staticmethod
    def _planar_step(z, w, u, b):
        '''
        Apply one planar transformation row by row.

        z,w,u: [PB,Z]
        b: [PB,1]
        returns: transformed z [PB,Z], log|det Jacobian| [PB]
        '''
        # [PB,1]
        wu = torch.sum(w * u, 1).view(-1, 1)

        # m(x) = -1 + log(1 + exp(x)); softplus avoids overflow of exp(x)
        m_wu = F.softplus(wu) - 1.

        # [PB,1]
        sq_norm_w = torch.sum(w * w, 1).view(-1, 1)

        # Re-parameterised u so that w.u_hat = m(w.u) > -1   [PB,Z]
        u_hat = u + ((m_wu - wu) * w) / sq_norm_w

        # [PB,1]
        activation = torch.tanh(torch.sum(w * z, 1).view(-1, 1) + b)

        # [PB,Z]
        z_new = z + activation * u_hat

        # det = 1 + tanh'(w.z + b) * (w.u_hat)   [PB,1]
        w_u_hat = torch.sum(w * u_hat, 1).view(-1, 1)
        logdet = torch.log(torch.abs(1. + (1. - activation.pow(2)) * w_u_hat))

        return z_new, logdet.view(-1)

    def sample(self, mean, logvar, k, w1, u1, b1, w2, u2, b2):
        '''
        mean, logvar, w1, u1, w2, u2: [B,Z]
        b1, b2: [B,1]
        k: number of samples per batch element
        returns: z [P,B,Z], log q(z) [P,B]
        '''
        self.B = mean.size()[0]
        gaus = Gaussian(self.hyper_config)

        # q(z0); z: [P,B,Z]
        z, logqz0 = gaus.sample(mean, logvar, k)

        #[PB,Z]
        z = z.view(-1, self.z_size)

        def tile(param, width):
            # Repeat the per-batch parameters for each of the k samples so
            # that rows line up with the flattened z:  [B,width] -> [PB,width]
            return param.repeat(k, 1, 1).view(-1, width)

        w1 = tile(w1, self.z_size)
        w2 = tile(w2, self.z_size)
        u1 = tile(u1, self.z_size)
        u2 = tile(u2, self.z_size)
        b1 = tile(b1, 1)
        b2 = tile(b2, 1)

        # First transformation uses (w1, u1, b1); the second uses
        # (w2, u2, b2) throughout -- the pre-activation of the second step
        # previously used w1 by mistake.
        z, logdet_first = self._planar_step(z, w1, u1, b1)
        z, logdet_second = self._planar_step(z, w2, u2, b2)

        # Change of variables: log q(zK) = log q(z0) - sum_k log|det J_k|
        logdetsum = (logdet_first + logdet_second).view(k, self.B)

        z = z.view(k, self.B, self.z_size)

        logqz = logqz0 - logdetsum

        return z, logqz

# Encoder of the VAE

Modified from Chris Cremer's inference_net.py file (Cremer et al. (2018)) in his GitHub code for the paper. Some code from that file has been removed. Moreover, the forward function has been modified to support the planar flow approximate posterior.

In [0]:


class standard(nn.Module):
    '''
    Encoder (inference network). A stack of linear layers given by
    hyper_config['encoder_arch'] maps x to the parameters of the approximate
    posterior hyper_config['q'], which is then sampled.
    '''

    def __init__(self, hyper_config):
        super(standard, self).__init__()

        self.dtype = torch.cuda.FloatTensor if torch.cuda.is_available() else torch.FloatTensor

        self.hyper_config = hyper_config

        self.z_size = hyper_config['z_size']
        self.x_size = hyper_config['x_size']
        self.act_func = hyper_config['act_func']

        #Encoder: layers live in a plain list and are registered as "1", "2", ...
        self.encoder_weights = []
        self.layer_norms = []
        for layer_spec in hyper_config['encoder_arch']:
            self.encoder_weights.append(nn.Linear(layer_spec[0], layer_spec[1]))

        for position, layer in enumerate(self.encoder_weights, 1):
            self.add_module(str(position), layer)

        # Approximate posterior object (e.g. Gaussian / Flow / Planar_Flow instance)
        self.q = hyper_config['q']

    def forward(self, k, x, logposterior):
        '''
        k: number of samples
        x: [B,X]
        logposterior(z) -> [P,B]

        returns: z, logqz as produced by self.q.sample
        '''
        self.B = x.size()[0]

        #Encode: activation after every layer but the last
        hidden = x
        for layer in self.encoder_weights[:-1]:
            hidden = self.act_func(layer(hidden))
        out = self.encoder_weights[-1](hidden)

        Z = self.z_size
        mean = out[:, :Z]  #[B,Z]
        logvar = out[:, Z:2*Z]

        planar = self.hyper_config['planar_flow']
        if planar:
            # Columns 2Z..6Z hold w1,u1,w2,u2 (Z each); the last two columns are b1,b2.
            w1, u1, w2, u2 = [out[:, i*Z:(i + 1)*Z] for i in range(2, 6)]
            b1 = out[:, 6*Z:(6*Z + 1)]
            b2 = out[:, (6*Z + 1):(6*Z + 2)]

        if self.hyper_config['hnf']:
            z, logqz = self.q.sample(mean, logvar, k, logposterior)
        elif planar:
            z, logqz = self.q.sample(mean, logvar, k, w1, u1, b1, w2, u2, b2)
        else:
            z, logqz = self.q.sample(mean, logvar, k)

        return z, logqz


# the whole VAE model for training

Modified from Chris Cremer's vae_2.py file (Cremer et al. (2018)) in his GitHub code for the paper. Some code from that file has been removed.

In [0]:


class VAE(nn.Module):
    '''
    Full model: an inference network (hyper_config['q_dist']) plus a
    Generator. The methods compute single-sample and importance-weighted
    bounds from log p(x,z) = log N(z; 0, I) + log Bernoulli(x | decode(z)).
    '''

    def __init__(self, hyper_config, seed=1):
        super(VAE, self).__init__()

        torch.manual_seed(seed)

        self.z_size = hyper_config['z_size']
        self.x_size = hyper_config['x_size']
        self.act_func = hyper_config['act_func']

        # Inference network is built before the generator.
        self.q_dist = hyper_config['q_dist'](hyper_config=hyper_config)
        self.generator = Generator(hyper_config=hyper_config)

        use_cuda = torch.cuda.is_available()
        self.dtype = torch.cuda.FloatTensor if use_cuda else torch.FloatTensor
        if use_cuda:
            self.q_dist.cuda()

    def _begin_batch(self, x):
        # Record the batch size and the zero mean/logvar of the prior for this batch.
        self.B = x.size()[0] #batch size
        self.zeros = Variable(torch.zeros(self.B, self.z_size).type(self.dtype))

    def _log_joint(self, z, x):
        # log p(z) + log p(x|z)  -> [P,B]
        return lognormal2(z, self.zeros, self.zeros) + log_bernoulli(self.generator.decode(z), x)

    @staticmethod
    def _log_mean_exp(values):
        # log(mean(exp(values))) over dim 0, shifted by the max for stability. [P,B] -> [B]
        peak = torch.max(values, 0)[0] #[B]
        return torch.log(torch.mean(torch.exp(values - peak), 0)) + peak

    def forward(self, x, k, warmup=1.):
        '''
        x: [B,X]; k: number of samples; warmup: weight on log q(z|x).
        returns: (bound, mean log p(x,z), mean log q(z|x)); the bound is
        importance-weighted over the k samples when k > 1.
        '''
        self._begin_batch(x)
        self.logposterior = lambda aa: self._log_joint(aa, x)

        z, logqz = self.q_dist.forward(k, x, self.logposterior)
        logpxz = self.logposterior(z)

        #Compute elbo
        elbo = logpxz - (warmup*logqz) #[P,B]
        if k > 1:
            elbo = self._log_mean_exp(elbo) #[B]

        return torch.mean(elbo), torch.mean(logpxz), torch.mean(logqz)

    def sample_q(self, x, k):
        '''Draw k samples per batch element from the inference network.'''
        self._begin_batch(x)
        self.logposterior = lambda aa: self._log_joint(aa, x)

        z, logqz = self.q_dist.forward(k=k, x=x, logposterior=self.logposterior)

        return z

    def logposterior_func(self, x, z):
        '''log p(x,z) for a raw tensor z, which is wrapped and cast to self.dtype first.'''
        self._begin_batch(x)

        z = Variable(z).type(self.dtype)
        z = z.view(-1, self.B, self.z_size)
        return self._log_joint(z, x)

    def logposterior_func2(self, x, z):
        '''log p(x,z) for a z that is used as given (only reshaped to [P,B,Z]).'''
        self._begin_batch(x)

        z = z.view(-1, self.B, self.z_size)
        return self._log_joint(z, x)

    def forward2(self, x, k):
        '''
        Same as forward() with warmup=1 but never importance-weighted:
        the bound is the plain mean of log p(x,z) - log q(z|x).
        '''
        self._begin_batch(x)
        self.logposterior = lambda aa: self._log_joint(aa, x)

        z, logqz = self.q_dist.forward(k, x, self.logposterior)
        logpxz = self.logposterior(z)

        #Compute elbo
        elbo = logpxz - logqz #[P,B]

        return torch.mean(elbo), torch.mean(logpxz), torch.mean(logqz)

    def forward3_prior(self, x, k):
        '''
        Bound obtained by sampling z from the standard-normal prior and
        averaging the likelihood log p(x|z) (log-mean-exp when k > 1).
        '''
        self._begin_batch(x)

        # Likelihood only: the samples come from the prior itself.
        self.logposterior = lambda aa: log_bernoulli(self.generator.decode(aa), x)

        z = Variable(torch.FloatTensor(k, self.B, self.z_size).normal_().type(self.dtype)) #[P,B,Z]

        elbo = self.logposterior(z) #[P,B]
        if k > 1:
            elbo = self._log_mean_exp(elbo) #[B]

        return torch.mean(elbo) #[1]






# A function needed for computing L(q*)

Modified from Chris Cremer's optimize_local_q.py file (Cremer et al. (2018)) in his GitHub code for the paper. Some code from that file has been removed. Modifications have been made so that this code can be used to compute L(q*) for planar flow.

In [0]:

def optimize_local_q_dist(logposterior, hyper_config, x, q):
    '''
    Fit the parameters of q directly for the batch x (no encoder) by
    maximising the ELBO with Adam, then evaluate the fitted q.

    logposterior: callable z [P,B,Z] -> [P,B]
    hyper_config: dict with 'z_size', 'x_size' and 'planar_flow'
    x: batch; only its first dimension (B) is used here
    q: object with sample(mean, logvar, k, ...) and parameters()

    Optimisation uses 100 samples per step and stops once the 100-step
    average loss has failed to improve more than 10 times in a row.

    returns: (vae, iwae) bounds estimated with 5000 samples
    '''
    B = x.size()[0] #batch size
    P = 100

    z_size = hyper_config['z_size']
    x_size = hyper_config['x_size']
    dtype = torch.cuda.FloatTensor if torch.cuda.is_available() else torch.FloatTensor

    planar = hyper_config['planar_flow']

    def leaf(n_cols, gaussian_init=False):
        # Trainable [B,n_cols] tensor, zero- or standard-normal-initialised.
        values = torch.zeros(B, n_cols)
        if gaussian_init:
            values = values.normal_()
        return Variable(values.type(dtype), requires_grad=True)

    if planar:
        # w's start random, u's and b's start at zero.
        w1 = leaf(z_size, gaussian_init=True)
        w2 = leaf(z_size, gaussian_init=True)
        u1 = leaf(z_size)
        u2 = leaf(z_size)
        b1 = leaf(1)
        b2 = leaf(1)
        mean = leaf(z_size)
        logvar = leaf(z_size)

        params = [mean, logvar, w1, u1, w2, u2, b1, b2]
        flow_args = (w1, u1, b1, w2, u2, b2)
    else:
        mean = leaf(z_size)
        logvar = leaf(z_size)

        params = [mean, logvar]
        flow_args = ()

    # Any parameters owned by q itself are optimised as well.
    params.extend(q.parameters())

    optimizer = optim.Adam(params, lr=.001)

    window = []
    best_window_avg = -1   # -1 marks "no average recorded yet"
    stalls = 0
    for epoch in range(1, 999999):

        z, logqz = q.sample(mean, logvar, P, *flow_args)
        logpx = logposterior(z)

        optimizer.zero_grad()

        loss = -(torch.mean(logpx-logqz))
        loss_np = loss.data.cpu().numpy()

        loss.backward()
        optimizer.step()

        window.append(loss_np)
        if epoch % 100 != 0:
            continue

        # Every 100 steps: compare the window average with the best so far.
        window_avg = np.mean(window)
        if window_avg < best_window_avg or best_window_avg == -1:
            stalls = 0
            best_window_avg = window_avg
        else:
            stalls += 1
            if stalls > 10:
                break

        if epoch % 2000 == 0:
            print (epoch, window_avg, stalls)

        window = []

    # Compute VAE and IWAE bounds
    z, logqz = q.sample(mean, logvar, 5000, *flow_args)
    logpx = logposterior(z)

    elbo = logpx-logqz #[P,B]
    vae = torch.mean(elbo)

    # log-mean-exp over the samples, shifted by the max for stability
    max_ = torch.max(elbo, 0)[0] #[B]
    elbo_ = torch.log(torch.mean(torch.exp(elbo - max_), 0)) + max_ #[B]
    iwae = torch.mean(elbo_)

    return vae, iwae

# Function for computing AIS estimation of log(p(x))

This whole piece of code is copied directly from the ais_4.py file of C. Cremer's GitHub repository for Cremer et al. (2018).

In [0]:


# This one samples the prior distribution

# Use this to time everything


import math
import torch
from torch.autograd import Variable
import numpy as np



import time


def test_ais(model, data_x, batch_size, display, k, n_intermediate_dists):
    """Estimate the average log p(x) over `data_x` with annealed importance sampling.

    Chains start from standard-normal draws of z and are annealed through
    `n_intermediate_dists` evenly spaced temperatures between 0 and 1, using
    one HMC proposal plus a Metropolis-Hastings accept/reject per temperature.

    Args:
        model: object exposing `dtype`, `z_size` and `generator.decode`; its
            attribute `B` is overwritten with `batch_size` as a side effect.
        data_x: numpy array of inputs (converted with `torch.from_numpy`).
            Only `int(len(data_x) / batch_size)` full batches are processed;
            a trailing partial batch is ignored.
        batch_size: number of datapoints per batch.
        display: a progress line is printed every `display` batches.
        k: number of AIS chains per datapoint.
        n_intermediate_dists: number of points in the annealing schedule.

    Returns:
        The numpy mean of the per-batch average log-weights.

    Note:
        `lognormal2` and `log_bernoulli` are helpers defined elsewhere in the
        file. The HMC step size is adapted after every MH step and is carried
        over from one batch to the next (it is not reset per batch).
    """


    def intermediate_dist(t, z, mean, logvar, zeros, batch):
        """Return log_prior(z) + t * log_likelihood(batch | z).

        `mean` and `logvar` are accepted but not used by the current body.
        """
        # Earlier variant annealed from an encoder distribution instead:
        # logp1 = lognormal(z, mean, logvar)  #[P,B]
        # presumably a standard-normal log-density (zero mean, zero log-variance) -- confirm in lognormal2
        log_prior = lognormal2(z, zeros, zeros)  #[P,B]
        log_likelihood = log_bernoulli(model.generator.decode(z), batch)
        # logpT = log_prior + log_likelihood
        # log_intermediate_2 = (1-float(t))*logp1 + float(t)*logpT

        # Only the likelihood term is tempered; the prior term is always fully included.
        log_intermediate_2 = log_prior + float(t)*log_likelihood

        return log_intermediate_2


    def hmc(z, intermediate_dist_func):
        """Propose a new z with leapfrog-style updates driven by grad of `intermediate_dist_func`.

        Reads `step_size`, `n_HMC_steps`, `grad_outputs`, `retain_graph`,
        `volatile_` and `requires_grad` from the enclosing scope.

        Returns:
            (z0, v0, z, v): the starting position and momentum, followed by
            the proposed position and momentum.
        """

        # Fresh momentum drawn from a standard normal, same size as z.
        if torch.cuda.is_available():
            v = Variable(torch.FloatTensor(z.size()).normal_(), volatile=volatile_, requires_grad=requires_grad).cuda()
        else:
            v = Variable(torch.FloatTensor(z.size()).normal_()) 

        v0 = v
        z0 = z

        gradients = torch.autograd.grad(outputs=intermediate_dist_func(z), inputs=z,
                          grad_outputs=grad_outputs,
                          create_graph=True, retain_graph=retain_graph, only_inputs=True)[0]

        # Detach so the updates below do not extend the autograd graph.
        gradients = gradients.detach()

        # Initial half step on the momentum, then a full step on the position.
        v = v + .5 *step_size*gradients
        z = z + step_size*v

        for LF_step in range(n_HMC_steps):

            # Full momentum step followed by a full position step.
            gradients = torch.autograd.grad(outputs=intermediate_dist_func(z), inputs=z,
                              grad_outputs=grad_outputs,
                              create_graph=True, retain_graph=retain_graph, only_inputs=True)[0]
            gradients = gradients.detach()
            v = v + step_size*gradients
            z = z + step_size*v

        # Closing half step on the momentum.
        gradients = torch.autograd.grad(outputs=intermediate_dist_func(z), inputs=z,
                          grad_outputs=grad_outputs,
                          create_graph=True, retain_graph=retain_graph, only_inputs=True)[0]
        gradients = gradients.detach()
        v = v + .5 *step_size*gradients

        return z0, v0, z, v


    def mh_step(z0, v0, z, v, step_size, intermediate_dist_func):
        """Accept or reject each proposed chain and adapt the step size.

        Reads `zeros`, `k` and `model.B` from the enclosing scope.

        Returns:
            (z, step_size): per-chain mix of proposed and starting positions,
            and the adapted step size.
        """

        logpv0 = lognormal2(v0, zeros, zeros) #[P,B]
        hamil_0 =  intermediate_dist_func(z0) + logpv0
        
        logpvT = lognormal2(v, zeros, zeros) #[P,B]
        hamil_T = intermediate_dist_func(z) + logpvT

        # Ratio of (target * momentum) densities at the proposal vs. the start.
        accept_prob = torch.exp(hamil_T - hamil_0)

        if torch.cuda.is_available():
            rand_uni = Variable(torch.FloatTensor(accept_prob.size()).uniform_(), volatile=volatile_, requires_grad=requires_grad).cuda()
        else:
            rand_uni = Variable(torch.FloatTensor(accept_prob.size()).uniform_())

        accept = accept_prob > rand_uni

        # Convert the comparison result to a float 0/1 mask.
        if torch.cuda.is_available():
            accept = accept.type(torch.FloatTensor).cuda()
        else:
            accept = accept.type(torch.FloatTensor)
        
        # Trailing singleton axis lets the mask broadcast over the latent dimension.
        accept = accept.view(k, int(model.B), 1)

        z = (accept * z) + ((1-accept) * z0)

        # Adapt step size: grow by 2% when the mean acceptance exceeds 0.65, else shrink by 2%.
        avg_acceptance_rate = torch.mean(accept)

        if avg_acceptance_rate.cpu().data.numpy() > .65:
            step_size = 1.02 * step_size
        else:
            step_size = .98 * step_size

        # Keep the step size within [0.0001, 0.5].
        if step_size < 0.0001:
            step_size = 0.0001
        if step_size > 0.5:
            step_size = 0.5

        return z, step_size




    # HMC settings shared by the nested helpers above (read through the closure).
    n_HMC_steps = 10
    step_size = .1

    retain_graph = False
    volatile_ = False
    requires_grad = False

    time_ = time.time()

    logws = []
    data_index= 0
    for i in range(int(len(data_x)/ batch_size)):

        #AIS

        # Evenly spaced temperatures from 0 to 1 inclusive.
        schedule = np.linspace(0.,1.,n_intermediate_dists)
        model.B = batch_size

        batch = data_x[data_index:data_index+batch_size]
        data_index += batch_size

        B = int(model.B)

        if torch.cuda.is_available():
            batch = Variable(torch.from_numpy(batch).type(model.dtype), volatile=volatile_, requires_grad=requires_grad).cuda()
            zeros = Variable(torch.zeros(B, int(model.z_size)).type(model.dtype), volatile=volatile_, requires_grad=requires_grad).cuda() # [B,Z]
            # NOTE(review): volatile=True is hard-coded here, unlike the volatile_ flag used for the other tensors -- confirm intent.
            logw = Variable(torch.zeros(k, B).type(model.dtype), volatile=True, requires_grad=requires_grad).cuda()
            grad_outputs = torch.ones(k, B).cuda()
        else:
            batch = Variable(torch.from_numpy(batch))
            zeros = Variable(torch.zeros(model.B, model.z_size)) # [B,Z]
            logw = Variable(torch.zeros(k, model.B))
            grad_outputs = torch.ones(k, model.B)


        # Earlier variants initialised z from the encoder / q distribution:
        # mean, logvar = model.encode(batch) #[B,Z]
        # z, logpz, logqz = model.sample(mean, logvar, k=k)  #[P,B,Z], [P,B], [P,B]
        # z, logqz = model.q_dist.forward(k=k, x=batch, logposterior=model.logposterior)

        # Chains start from standard-normal draws of shape (k, B, z_size).
        z = Variable(torch.FloatTensor(k, B, model.z_size).normal_().type(model.dtype))

        time_2 = time.time()
        for (t0, t1) in zip(schedule[:-1], schedule[1:]):


            # Importance-weight update: add log f_{t1}(z) - log f_{t0}(z) at the current z.
            log_intermediate_1 = intermediate_dist(t=t0, z=z, mean=zeros, logvar=zeros, zeros=zeros, batch=batch)
            log_intermediate_2 = intermediate_dist(t=t1, z=z, mean=zeros, logvar=zeros, zeros=zeros, batch=batch)


            logw += log_intermediate_2 - log_intermediate_1

            # Re-wrap the raw tensor so z is a fresh leaf that autograd.grad can differentiate with respect to.
            z = z.data

            z = Variable(z, requires_grad=True)


            #HMC dynamics targeting the distribution at temperature t1
            intermediate_dist_func = lambda aaa: intermediate_dist(t1, aaa, zeros, zeros, zeros, batch)

            time_1 = time.time()
            z0, v0, z, v = hmc(z, intermediate_dist_func)


            #MH step
            z, step_size = mh_step(z0, v0, z, v, step_size, intermediate_dist_func)

            z = z.detach()








        # Log-mean-exp over the k chains, with the per-column max subtracted for numerical stability.
        max_ = torch.max(logw,0)[0] #[B]
        logw = torch.log(torch.mean(torch.exp(logw - max_), 0)) + max_ #[B]

        logws.append(torch.mean(logw.cpu()).data.numpy())


        # Progress: batch index, batch count, running mean, current step size, elapsed seconds.
        if i%display==0:
            print (i,len(data_x)/ batch_size, np.mean(logws),step_size, time.time()-time_)

    mean_ = np.mean(logws)
    print(mean_, 'T:', time.time()-time_)
    return mean_
     



# Loading MNIST (non-binarized)

In [0]:
import pickle
import numpy as np

# The pickle holds three splits (train, valid, test). Only element 0 of each
# split is kept, converted to a numpy array.
with open('datasets/mnist_non_binarised.pkl', 'rb') as f:
    train_x, valid_x, test_x = (
        np.asarray(split[0]) for split in pickle.load(f, encoding='latin1')
    )

for label, split in (('Train', train_x), ('Valid', valid_x), ('Test', test_x)):
    print(label, split.shape)

# Loading MNIST (binarized)

In [0]:
import pickle


def _load_pickle(path):
    """Unpickle and return the object stored at `path` (latin1 encoding)."""
    with open(path, 'rb') as f:
        return pickle.load(f, encoding='latin1')


# The binarized splits are stored in one pickle file each.
train_x = _load_pickle('datasets/binMNIST_train.pkl')
valid_x = _load_pickle('datasets/binMNIST_valid.pkl')
test_x = _load_pickle('datasets/binMNIST_test.pkl')

print('Train', train_x.shape)
print('Valid', valid_x.shape)
print('Test', test_x.shape)

# Last expression of the cell, so the notebook displays the container type.
type(train_x)

In [0]:
# Report whether PyTorch can see a CUDA device; later cells branch on this same check.
print(torch.cuda.is_available())

# Function for training a complete VAE.

Modified from Chris Cremer's train_mnist.py file (Cremer et al. (2018)) in the flow_effect_on_amort_exp folder of his GitHub code for the paper. The modifications mainly add support for resuming training from a saved checkpoint. An extension of 5.2 using planar flow has also been added (original work).

In [0]:




def train_encoder_and_decoder(model, train_x, test_x, k, batch_size,
                    start_at, save_freq, display_epoch, 
                    path_to_save_variables, ckpt_number):
    """Train encoder and decoder jointly on a staged schedule, resuming at `ckpt_number`.

    Stage i (i = 0..7) runs 3**i epochs with learning rate
    .001 * 10**(-i/7) and a freshly created Adam optimizer, for 3280 epochs
    in total. `ckpt_number` is the number of epochs already completed:
    stages that lie entirely before it are skipped and the stage containing
    it is entered part-way through.

    This function does not load any weights. When resuming, the caller must
    load the matching state dicts into `model` first. Optimizer state is not
    checkpointed, so Adam restarts from scratch on resume.

    Args:
        model: object exposing `dtype`, `q_dist`, `generator` and
            `forward(batch, k=..., warmup=...)` returning
            `(elbo, logpxz, logqz)`.
        train_x: numpy array of training inputs.
        test_x: unused; kept for interface compatibility.
        k: forwarded unchanged to `model.forward`.
        batch_size: minibatch size for the shuffled data loader.
        start_at: first epoch count at which a periodic checkpoint is saved.
        save_freq: interval, in epochs, between periodic checkpoints.
        display_epoch: a progress line is printed whenever the epoch count
            is a multiple of this value.
        path_to_save_variables: prefix for checkpoint files; the suffixes
            `_encoder_<epoch>.pt` and `_generator_<epoch>.pt` are appended.
        ckpt_number: number of epochs already trained (0 for a fresh run).
    """
    # The data loader needs targets; zeros are used as placeholders and never read.
    placeholder_targets = torch.from_numpy(np.zeros(len(train_x)))
    train_x = torch.from_numpy(train_x).float().type(model.dtype)
    train_loader = torch.utils.data.DataLoader(
        torch.utils.data.TensorDataset(train_x, placeholder_targets),
        batch_size=batch_size, shuffle=True)

    #IWAE paper training strategy
    time_ = time.time()
    total_epochs = ckpt_number
    i_max = 7
    warmup_over_epochs = 100.

    all_params = list(model.q_dist.parameters()) + list(model.generator.parameters())

    print (model.q_dist)
    print (model.generator)

    def save_checkpoint():
        # Write encoder and generator weights, tagged with the current epoch count.
        for tag, module in (('_encoder_', model.q_dist),
                            ('_generator_', model.generator)):
            save_file = path_to_save_variables + tag + str(total_epochs) + '.pt'
            torch.save(module.state_dict(), save_file)
            print ('saved variables ' + save_file)

    previous_slots_number = 0  # epochs scheduled before the current stage
    for i in range(0, i_max + 1):
        epochs = 3**i
        number_of_training = previous_slots_number + epochs  # cumulative epochs through stage i
        stage_offset = previous_slots_number
        previous_slots_number = number_of_training

        if ckpt_number >= number_of_training:
            # The checkpoint already covers this whole stage.
            continue

        # Start at epoch 1 for an untouched stage, or just after the
        # checkpointed epoch when the checkpoint falls inside this stage.
        first_epoch = 1 + max(0, ckpt_number - stage_offset)

        lr = .001 * 10**(-i/float(i_max))
        print (i, 'LR:', lr)

        optimizer = optim.Adam(all_params, lr=lr)

        for epoch in range(first_epoch, epochs + 1):

            for batch_idx, (data, target) in enumerate(train_loader):

                batch = Variable(data)

                optimizer.zero_grad()

                # Warm-up coefficient ramps linearly to 1 over the first 100 epochs.
                warmup = min(total_epochs/warmup_over_epochs, 1.)

                elbo, logpxz, logqz = model.forward(batch, k=k, warmup=warmup)

                loss = -(elbo)
                loss.backward()
                optimizer.step()

            total_epochs += 1

            if total_epochs%display_epoch==0:
                # Values shown come from the last minibatch of the epoch.
                print ('Train Epoch: {}/{}'.format(epoch, epochs),
                    'total_epochs {}'.format(total_epochs),
                    'LL:{:.3f}'.format(-loss.data[0]),
                    'logpxz:{:.3f}'.format(logpxz.data[0]),
                    'logqz:{:.3f}'.format(logqz.data[0]),
                    'warmup:{:.3f}'.format(warmup),
                    'T:{:.2f}'.format(time.time()-time_),
                    )
                time_ = time.time()

            if total_epochs >= start_at and (total_epochs-start_at)%save_freq==0:
                save_checkpoint()

    # Always save the final parameters.
    save_checkpoint()

    print ('done training')



Training the initial encoder of expt 5.3

In [0]:



# Input and latent dimensionality; minibatch size; k is forwarded to model.forward.
x_size, z_size = 784, 50
batch_size, k = 20, 1

# Checkpoint schedule: first periodic save at epoch `start_at`, then every
# `save_freq` epochs; a progress line is printed every `display_epoch` epochs.
start_at, save_freq, display_epoch = 100, 100, 3

hyper_config = {
    'x_size': x_size,
    'z_size': z_size,
    'act_func': F.tanh,  # alternative: F.relu
    'encoder_arch': [[x_size, 200], [200, 200], [200, z_size * 2]],
    'decoder_arch': [[z_size, 200], [200, 200], [200, x_size]],
    'q_dist': standard,  # alternatives: FFG_LN, hnf, aux_nf, flow1
    'cuda': 1,
    'hnf': 0,
    'planar_flow': 0,
}
hyper_config['q'] = Gaussian(hyper_config)

print('Init model')
model = VAE(hyper_config)
if torch.cuda.is_available():
    model.cuda()

print('\nModel:', hyper_config, '\n')

# Prefix under which encoder/generator checkpoints are written.
path_to_save_variables = 'Experiment_3/train_mnist_ckpts/'

# To resume from a checkpoint, uncomment the lines below and pass the matching
# epoch count as ckpt_number in the training call.
# path_to_load_variables_decoder = 'Experiment_3/train_mnist_ckpts/_generator_2200.pt'
# path_to_load_variables_encoder = 'Experiment_3/train_mnist_ckpts/_encoder_2200.pt'
# model.generator.load_state_dict(torch.load(path_to_load_variables_decoder , map_location=lambda storage, loc: storage))
# model.q_dist.load_state_dict(torch.load(path_to_load_variables_encoder , map_location=lambda storage, loc: storage))
# print('loaded previous checkpoints (decoder): ' + path_to_load_variables_decoder)
# print('loaded previous checkpoints (encoder): ' + path_to_load_variables_encoder)

print('\nTraining')

train_encoder_and_decoder(
    model=model,
    train_x=train_x,
    test_x=test_x,
    k=k,
    batch_size=batch_size,
    start_at=start_at,
    save_freq=save_freq,
    display_epoch=display_epoch,
    path_to_save_variables=path_to_save_variables,
    ckpt_number=0,
)

print('Done.')





Extension to 5.2: training VAE with Planar Flow with a different encoder architecture: 784 - 500 - 302.

In [0]:


# Input and latent dimensionality; minibatch size; k is forwarded to model.forward.
x_size, z_size = 784, 50
batch_size, k = 100, 1

# Checkpoint schedule: first periodic save at epoch `start_at`, then every
# `save_freq` epochs; a progress line is printed every `display_epoch` epochs.
start_at, save_freq, display_epoch = 100, 100, 3

hyper_config = {
    'x_size': x_size,
    'z_size': z_size,
    'act_func': F.elu,  # alternative: F.relu
    # Single hidden layer of 500 units; the output width is 6*z_size + 2 (= 302).
    'encoder_arch': [[x_size, 500], [500, ((z_size * 6) + 2)]],
    'decoder_arch': [[z_size, 200], [200, 200], [200, x_size]],
    'q_dist': standard,  # alternatives: FFG_LN, hnf, aux_nf, flow1
    'cuda': 1,
    'hnf': 0,
    'planar_flow': 1,
}

# hyper_config['q'] = Flow1(hyper_config)
hyper_config['q'] = Planar_Flow(hyper_config)

print('Init model')
model = VAE(hyper_config)
if torch.cuda.is_available():
    model.cuda()

print('\nModel:', hyper_config, '\n')

# Prefix under which encoder/generator checkpoints are written.
path_to_save_variables = 'Extension_Planar_Flow/train_mnist_ckpts/'

# To resume from a checkpoint, uncomment the lines below and pass the matching
# epoch count as ckpt_number in the training call.
# path_to_load_variables_decoder = 'Extension_Planar_Flow/train_mnist_ckpts/_generator_1000.pt'
# path_to_load_variables_encoder = 'Extension_Planar_Flow/train_mnist_ckpts/_encoder_1000.pt'
# model.generator.load_state_dict(torch.load(path_to_load_variables_decoder , map_location=lambda storage, loc: storage))
# model.q_dist.load_state_dict(torch.load(path_to_load_variables_encoder , map_location=lambda storage, loc: storage))
# print('loaded previous checkpoints (decoder): ' + path_to_load_variables_decoder)
# print('loaded previous checkpoints (encoder): ' + path_to_load_variables_encoder)

print('\nTraining')

train_encoder_and_decoder(
    model=model,
    train_x=train_x,
    test_x=test_x,
    k=k,
    batch_size=batch_size,
    start_at=start_at,
    save_freq=save_freq,
    display_epoch=display_epoch,
    path_to_save_variables=path_to_save_variables,
    ckpt_number=0,
)

print('Done.')




# Re-training encoders using decoder of trained VAE with FFG

Modified from Chris Cremer's train_encoder_only_2.py file (Cremer et al. (2018)) in the flow_effect_on_amort_exp folder of his GitHub code for the paper. Some code from that file has been deleted, and the code has been modified to support resuming training from a saved checkpoint.

In [0]:


def train_encoder_only(model, train_x, test_x, k, batch_size,
                    start_at, save_freq, display_epoch, 
                    path_to_save_variables , ckpt_number):
    """Train only the encoder (`model.q_dist`) on a staged schedule, resuming at `ckpt_number`.

    Only the encoder's parameters are handed to the optimizer, so the
    generator is left untouched. Stage i (i = 0..7) runs 3**i epochs with
    learning rate .001 * 10**(-i/7) and a freshly created Adam optimizer,
    for 3280 epochs in total. `ckpt_number` is the number of epochs already
    completed: stages that lie entirely before it are skipped and the stage
    containing it is entered part-way through.

    This function does not load any weights. The caller must load the
    generator (and, when resuming, the encoder) state dicts first.
    Optimizer state is not checkpointed, so Adam restarts from scratch on
    resume.

    Args:
        model: object exposing `dtype`, `q_dist`, `generator` and
            `forward(batch, k=..., warmup=...)` returning
            `(elbo, logpxz, logqz)`.
        train_x: numpy array of training inputs.
        test_x: unused; kept for interface compatibility.
        k: forwarded unchanged to `model.forward`.
        batch_size: minibatch size for the shuffled data loader.
        start_at: first epoch count at which a periodic checkpoint is saved.
        save_freq: interval, in epochs, between periodic checkpoints.
        display_epoch: a progress line is printed whenever the epoch count
            is a multiple of this value.
        path_to_save_variables: prefix for checkpoint files; the suffix
            `_encoder_<epoch>.pt` is appended.
        ckpt_number: number of epochs already trained (0 for a fresh run).
    """
    # The data loader needs targets; zeros are used as placeholders and never read.
    placeholder_targets = torch.from_numpy(np.zeros(len(train_x)))
    train_x = torch.from_numpy(train_x).float().type(model.dtype)
    train_loader = torch.utils.data.DataLoader(
        torch.utils.data.TensorDataset(train_x, placeholder_targets),
        batch_size=batch_size, shuffle=True)

    #IWAE paper training strategy
    time_ = time.time()
    total_epochs = ckpt_number
    i_max = 7
    warmup_over_epochs = 100.

    # Encoder parameters only: the generator is deliberately excluded.
    all_params = list(model.q_dist.parameters())

    print (model.q_dist)
    print (model.generator)

    def save_encoder():
        # Write the encoder weights, tagged with the current epoch count.
        save_file = path_to_save_variables+'_encoder_'+str(total_epochs)+'.pt'
        torch.save(model.q_dist.state_dict(), save_file)
        print ('saved variables ' + save_file)

    previous_slots_number = 0  # epochs scheduled before the current stage
    for i in range(0, i_max + 1):
        epochs = 3**i
        number_of_training = previous_slots_number + epochs  # cumulative epochs through stage i
        stage_offset = previous_slots_number
        previous_slots_number = number_of_training

        if ckpt_number >= number_of_training:
            # The checkpoint already covers this whole stage.
            continue

        # Start at epoch 1 for an untouched stage, or just after the
        # checkpointed epoch when the checkpoint falls inside this stage.
        first_epoch = 1 + max(0, ckpt_number - stage_offset)

        lr = .001 * 10**(-i/float(i_max))
        print (i, 'LR:', lr)

        optimizer = optim.Adam(all_params, lr=lr)

        for epoch in range(first_epoch, epochs + 1):

            for batch_idx, (data, target) in enumerate(train_loader):

                batch = Variable(data)

                optimizer.zero_grad()

                # Warm-up coefficient ramps linearly to 1 over the first 100 epochs.
                warmup = min(total_epochs/warmup_over_epochs, 1.)

                elbo, logpxz, logqz = model.forward(batch, k=k, warmup=warmup)

                loss = -(elbo)
                loss.backward()
                optimizer.step()

            total_epochs += 1

            if total_epochs%display_epoch==0:
                # Values shown come from the last minibatch of the epoch.
                print ('Train Epoch: {}/{}'.format(epoch, epochs),
                    'total_epochs {}'.format(total_epochs),
                    'LL:{:.3f}'.format(-loss.data[0]),
                    'logpxz:{:.3f}'.format(logpxz.data[0]),
                    'logqz:{:.3f}'.format(logqz.data[0]),
                    'warmup:{:.3f}'.format(warmup),
                    'T:{:.2f}'.format(time.time()-time_),
                    )
                time_ = time.time()

            if total_epochs >= start_at and (total_epochs-start_at)%save_freq==0:
                save_encoder()

    # Always save the final encoder parameters.
    save_encoder()

    print ('done training')





Code to run to re-train encoders on the decoder of the initial VAE. 3 points to note:

1. hyper_config['q'] (approximate posterior) has to be chosen

2. 'encoder_arch' in hyper_config has to be chosen, to change the number of hidden layers

3. path_to_save_variables (folder to save training checkpoints) has to be chosen for different experiments.

In [0]:


# Input and latent dimensionality; minibatch size; k is forwarded to model.forward.
x_size, z_size = 784, 50
batch_size, k = 20, 1

# Checkpoint schedule: first periodic save at epoch `start_at`, then every
# `save_freq` epochs; a progress line is printed every `display_epoch` epochs.
start_at, save_freq, display_epoch = 100, 100, 3

hyper_config = {
    'x_size': x_size,
    'z_size': z_size,
    'act_func': F.tanh,  # alternatives: F.elu, F.relu
    # Choose the encoder depth here (one hidden layer vs. none):
    # 'encoder_arch': [[x_size,200],[200,z_size*2]],
    'encoder_arch': [[x_size, z_size * 2]],
    'decoder_arch': [[z_size, 200], [200, 200], [200, x_size]],
    'q_dist': standard,  # alternatives: FFG_LN, hnf, aux_nf, flow1
    'cuda': 1,
    'hnf': 0,
    'planar_flow': 0,
}

# Choose the approximate posterior here:
# hyper_config['q'] = Flow(hyper_config)
# hyper_config['q'] = Flow1(hyper_config)
# hyper_config['q'] = Four_Flow1(hyper_config)
hyper_config['q'] = Gaussian(hyper_config)

print('Init model')
model = VAE(hyper_config)
if torch.cuda.is_available():
    model.cuda()

print('\nModel:', hyper_config, '\n')

# Choose the checkpoint folder for the experiment being run:
# path_to_load_variables=''
# path_to_save_variables = 'Experiment_3/Encoder_1Hidden_FFG_ckpts/'
# path_to_save_variables = 'Experiment_3/Encoder_1Hidden_Flow1_ckpts/'
path_to_save_variables = 'Experiment_3/Encoder_0Hidden_FFG_ckpts/'
# path_to_save_variables = 'Experiment_3/Encoder_0Hidden_Flow_ckpts/'
# path_to_save_variables = 'Experiment_3/Encoder_0Hidden_Flow1_ckpts/'
# path_to_save_variables = 'Experiment_3/Encoder_0Hidden_4_Flow1_ckpts/'
# path_to_save_variables = 'Chris/encoder_only_Flow1/' #.pt'

# The decoder weights come from the previously trained VAE; map_location keeps
# the loaded tensors on the CPU storage they were deserialized into.
print('Load params for decoder')
path_to_load_variables_generator = 'Experiment_3/train_mnist_ckpts/_generator_3280.pt'
model.generator.load_state_dict(
    torch.load(path_to_load_variables_generator,
               map_location=lambda storage, loc: storage))
print('loaded variables ' + path_to_load_variables_generator)
# print ()

# To resume encoder training, uncomment the lines below and pass the matching
# epoch count as ckpt_number in the training call.
# print ('Load params for encoder')
# path_to_load_variables_encoder = 'Experiment_3/Encoder_1Hidden_FFG_ckpts/_encoder_2000.pt'
# model.q_dist.load_state_dict(torch.load(path_to_load_variables_encoder, map_location=lambda storage, loc: storage))
# print ('loaded variables ' + path_to_load_variables_encoder)
# print ()

print('\nTraining')

train_encoder_only(
    model=model,
    train_x=train_x,
    test_x=test_x,
    k=k,
    batch_size=batch_size,
    start_at=start_at,
    save_freq=save_freq,
    display_epoch=display_epoch,
    path_to_save_variables=path_to_save_variables,
    ckpt_number=0,
)

print('Done.')







# Compute Approximation and Amortization errors

Modified from Chris Cremer's compute_gaps.py file (Cremer et al. (2018)) in the flow_effect_on_amort_exp folder of his GitHub code for the paper. Only a small amount of code from that file has been deleted. The original code in compute_gaps.py for evaluating L(q^star) has been adapted into a function, so that L(q^star) values already computed for earlier datapoints can be reloaded.

`test` is the function for computing the IWAE bound. `test_vae` is the function for computing L(q), the amortized ELBO bound.


In [0]:


def test_vae(model, data_x, batch_size, display, k):
    """Average the bound returned by `model.forward2` over full batches of `data_x`.

    Args:
        model: object exposing `dtype` and `forward2(batch, k=...)`, which
            returns `(elbo, logpxz, logqz)`.
        data_x: numpy array of inputs; a trailing partial batch is ignored.
        batch_size: number of rows per batch.
        display: unused; kept for interface compatibility.
        k: forwarded unchanged to `model.forward2`.

    Returns:
        The numpy mean of the per-batch `elbo` values.
    """
    n_batches = int(len(data_x) / batch_size)
    per_batch = []
    for b in range(n_batches):
        lo = b * batch_size
        chunk = torch.from_numpy(data_x[lo:lo + batch_size])
        chunk = Variable(chunk).type(model.dtype)
        elbo, _logpxz, _logqz = model.forward2(chunk, k=k)
        per_batch.append(elbo.data[0])
    return np.mean(per_batch)





def test(model, data_x, batch_size, display, k):
    """Average the bound returned by calling `model` over full batches of `data_x`.

    Args:
        model: callable object exposing `dtype`; `model(batch, k=...)`
            returns `(elbo, logpxz, logqz)`.
        data_x: numpy array of inputs; a trailing partial batch is ignored.
        batch_size: number of rows per batch.
        display: unused; kept for interface compatibility.
        k: forwarded unchanged to the model call.

    Returns:
        The numpy mean of the per-batch `elbo` values.
    """
    n_batches = int(len(data_x) / batch_size)
    per_batch = []
    for b in range(n_batches):
        lo = b * batch_size
        chunk = torch.from_numpy(data_x[lo:lo + batch_size])
        chunk = Variable(chunk).type(model.dtype)
        elbo, _logpxz, _logqz = model(chunk, k=k)
        per_batch.append(elbo.data[0])
    return np.mean(per_batch)




This function is used for computing L(q*)

In [0]:
def compute_local_optimum(n_data , train_x , model , q_dist , 
                          hyper_config , ckpt_number , directory_to_save_ckpts):
    """Optimize a local q for each datapoint and record the resulting VAE / IWAE bounds.

    Datapoint `i` is `train_x[i + 500]`. For each one a fresh q of the
    requested family is built and passed to `optimize_local_q_dist`
    (defined elsewhere in the file). Results are checkpointed to
    `<directory>/vae_iwae_<i>.npz` whenever `i` is a positive multiple of
    10, and written to `<directory>/vae_iwae_ending_result.npz` at the end.

    Args:
        n_data: exclusive upper bound on the datapoint index `i`.
        train_x: numpy array indexed as `train_x[i + 500]`; each row is
            viewed as a (1, 784) tensor.
        model: object exposing `dtype` and `logposterior_func2(x=, z=)`.
        q_dist: one of 'Gaussian', 'Flow', 'Flow1', 'Four_Flow1',
            'Planar_Flow'.
        hyper_config: configuration passed to the q constructor and to
            `optimize_local_q_dist`.
        ckpt_number: 0 for a fresh run; otherwise the index `i` of the
            checkpoint to reload, after which work resumes at `i + 1`.
        directory_to_save_ckpts: directory for checkpoint files.

    Raises:
        ValueError: if `q_dist` is not a supported name or `ckpt_number`
            is negative.
    """
    # Lazy factories: only the requested class has to exist, and a fresh
    # instance is built for every datapoint.
    q_factories = {
        'Gaussian': lambda: Gaussian(hyper_config),
        'Flow': lambda: Flow(hyper_config).cuda(),
        'Flow1': lambda: Flow1(hyper_config).cuda(),
        'Four_Flow1': lambda: Four_Flow1(hyper_config).cuda(),
        'Planar_Flow': lambda: Planar_Flow(hyper_config).cuda(),
    }
    if q_dist not in q_factories:
        raise ValueError('unknown q_dist %r; expected one of %s'
                         % (q_dist, sorted(q_factories)))
    if ckpt_number < 0:
        raise ValueError('ckpt_number must be >= 0, got %r' % (ckpt_number,))
    make_q = q_factories[q_dist]

    if ckpt_number > 0:
        path_to_load_ckpts = directory_to_save_ckpts + '/vae_iwae_' + str(ckpt_number) + '.npz'

        print('Loading L(q*), IWAE for previous datapoints: ' , path_to_load_ckpts)

        previous_data = np.load(path_to_load_ckpts)

        vaes = list(previous_data['vaes'])
        iwaes = list(previous_data['iwaes'])
        print('Opt vae from checkpoint: ' , np.mean(vaes) , 'Opt iwae from checkpoint: ' , np.mean(iwaes))
        # The checkpoint for index i already contains datapoint i itself.
        first_index = ckpt_number + 1
    else:
        vaes = []
        iwaes = []
        first_index = 0

    print('optimizing local')

    for i in range(first_index, n_data):

        print('Current datapoint: ' , i + 500)

        x = train_x[i+500]
        x = Variable(torch.from_numpy(x)).type(model.dtype)
        x = x.view(1,784)

        # Bind x as a default so the closure is tied to this datapoint.
        logposterior = lambda aa, x=x: model.logposterior_func2(x=x,z=aa)

        q_local = make_q()

        vae, iwae = optimize_local_q_dist(logposterior, hyper_config, x, q_local)
        print (vae.data.cpu().numpy(),iwae.data.cpu().numpy(),'reg')
        vaes.append(vae.data.cpu().numpy())
        iwaes.append(iwae.data.cpu().numpy())
        print ('average of opt vae so far',np.mean(vaes))
        print ('average of opt iwae so far',np.mean(iwaes))
        print(' ')

        if i > ckpt_number and i % 10 == 0:

            path_to_save_ckpts = directory_to_save_ckpts + '/vae_iwae_' + str(i)
            np.savez(path_to_save_ckpts , vaes=vaes , iwaes=iwaes)
            print('checkpoint created. Epoch: ' , i)

    path_to_save_ckpts = directory_to_save_ckpts + '/vae_iwae_ending_result'
    np.savez(path_to_save_ckpts , vaes=vaes , iwaes=iwaes)

    # A stray undefined identifier used to sit here and raised NameError,
    # which made the summary below unreachable; it has been removed.
    print ('Ending opt vae',np.mean(vaes))
    print ('Ending opt iwae',np.mean(iwaes))





Compute AIS, IWAE, amortized L(q) of trained models

In [0]:




# Sizes used to build the encoder/decoder layer shapes below.
x_size, z_size = 784, 50

# Settings that are left commented out in this cell:
# batch_size = 20
# k = 1
# save params:
# start_at = 100
# save_freq = 100
# display_epoch = 3

# Configuration dictionary handed to the q-distribution constructor and to VAE.
hyper_config = dict(
    x_size=x_size,
    z_size=z_size,
    act_func=F.tanh,  # alternative: F.relu
    # Alternative encoder layouts:
    #   [[x_size, z_size*2]]
    #   [[x_size, 500], [500, z_size*2]]
    encoder_arch=[[x_size, 200], [200, z_size * 2]],
    decoder_arch=[[z_size, 200], [200, 200], [200, x_size]],
    # NOTE(review): `standard` is not defined in this cell; confirm it is
    # defined earlier in the notebook. Other names that were noted next to
    # this entry: FFG_LN, hnf, aux_nf, flow1.
    q_dist=standard,
    cuda=1,
    hnf=0,
    planar_flow=0,  # alternative: 1
)

# Pick the q-distribution object; exactly one constructor call is active.
# It is built from the config *before* being stored back into it under 'q'.
q = Gaussian(hyper_config)
# q = Flow(hyper_config)
# q = Flow1(hyper_config)
# q = Four_Flow1(hyper_config)
# q = Planar_Flow(hyper_config)
hyper_config.update(q=q)






def _restore_weights(module, ckpt_path):
    """Load the state dict saved at ``ckpt_path`` into ``module``.

    The ``map_location`` callable returns the storage unchanged, so the
    checkpoint is deserialized without being moved to another device.
    """
    state = torch.load(ckpt_path, map_location=lambda storage, loc: storage)
    module.load_state_dict(state)


# Build the model from the configuration and move it to the GPU if one exists.
print('Init model')
model = VAE(hyper_config)
if torch.cuda.is_available():
    model.cuda()
print('\nModel:', hyper_config, '\n')


# ---- decoder weights -------------------------------------------------------
# Alternative decoder checkpoint:
#   'Extension_Planar_Flow/train_mnist_ckpts/_generator_3280.pt'
print('load params for decoder')
path_to_load_variables_decoder = 'Experiment_3/train_mnist_ckpts/_generator_3280.pt'
_restore_weights(model.generator, path_to_load_variables_decoder)
print('loaded variables ' + path_to_load_variables_decoder)


# ---- encoder weights -------------------------------------------------------
# Alternative encoder checkpoints (swap one into the assignment below):
#   'Extension_Planar_Flow/train_mnist_ckpts/_encoder_3280.pt'
#   'Experiment_3/Encoder_Flow_ckpts/_encoder_3280.pt'
#   'Experiment_3/Encoder_Flow1_ckpts/_encoder_3280.pt'
#   'Experiment_3/Encoder_4_Flow1_ckpts/_encoder_3280.pt'
#   'Experiment_3/Encoder_1Hidden_FFG_ckpts/_encoder_3280.pt'
#   'Experiment_3/Encoder_1Hidden_Flow1_ckpts/_encoder_3280.pt'
print('load params for encoder')
path_to_load_variables_encoder = 'Experiment_3/Encoder_Gaussian_ckpts/_encoder_3280.pt'
_restore_weights(model.q_dist, path_to_load_variables_encoder)
print('loaded variables ' + path_to_load_variables_encoder)




# Number of leading rows of train_x used for every evaluation in this cell.
n_data = 1000

# Keyword arguments shared by the two amortized evaluations (test_vae / test).
_amortized_eval_args = dict(
    model=model,
    data_x=train_x[:n_data],
    batch_size=np.minimum(n_data, 50),
    display=10,
    k=5000,
)

# Both values are computed first and reported afterwards.
VAE_train = test_vae(**_amortized_eval_args)
IW_train = test(**_amortized_eval_args)
print('amortized VAE', VAE_train)
print('amortized IW', IW_train)


# AIS evaluation on the same rows, run as a single batch of n_data points.
AIS_train = test_ais(
    model=model,
    data_x=train_x[:n_data],
    batch_size=n_data,
    display=2,
    k=50,
    n_intermediate_dists=500,
)
print('AIS_train', AIS_train)






Compute L(q*) for a trained decoder

In [0]:
# Name of the q-distribution family that compute_local_optimum should build
# for each datapoint. Other accepted names:
#   'Flow', 'Flow1', 'Four_Flow1', 'Planar_Flow'
q_ = 'Gaussian'

# ckpt_number=0 is passed as the checkpoint index; results and intermediate
# checkpoints are written as .npz files under directory_to_save_ckpts.
# NOTE(review): the directory name mentions '1Hidden_FFG' while q_ is
# 'Gaussian' — confirm this is the intended output location.
compute_local_optimum(
    n_data=1000,
    train_x=train_x,
    model=model,
    q_dist=q_,
    hyper_config=hyper_config,
    ckpt_number=0,
    directory_to_save_ckpts='Experiment_3/Encoder_1Hidden_FFG_ckpts',
)






