# Introduction

This code contains almost all the code to replicate my (1041079) work in the report: experiment 5.5.
PIWAE is in a different file, however, because its code is significantly different.

Large amounts of this code are taken from different files in Chris Cremer's GitHub repository (https://github.com/chriscremer/Inference-Suboptimality) and adapted. I've gone through and commented this code in many places, clarifying what things do for my own understanding. This particular version doesn't contain the adaptations needed to work on Google Colab; it's meant to run locally.

I've added several bits to this code, some small and some large. The small changes mainly involve changing parameters and allowing the code to start training from a later Epoch (to work with Colab constantly disconnecting every 30 minutes). The larger changes involve adding the ability to train with the IWAE lower bound. Where I've changed code, I've pointed this out with a comment.

I've attempted to put all my code in one file for clarity, but in reality I ran everything using several different files. I've done my best to avoid introducing errors when combining the code, but if any errors remain, the merge is a likely cause.

All my comments start with "1041079:"

In [1]:
import numpy as np
import gzip
import time
import pickle
import csv
import math

from os.path import expanduser
home = expanduser("~")

import sys, os
sys.path.insert(0, '../models')
sys.path.insert(0, '../models/utils')


import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt


import torch
from torch.autograd import Variable
import torch.utils.data
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F

# Utilities

Code from the "utils" file, providing utility functions to be used throughout the code

In [2]:
# 1041079: I've renamed this from lognormal2 to lognormal to make the importing work
def lognormal(x, mean, logvar):
    """Log-density of a diagonal Gaussian for every sample in a batch.

    1041079: Takes a batch of x vectors plus one mean vector and one
    log-variance vector per batch element, and returns the log of the normal
    density of each x under the matching distribution. P is the number of
    samples per batch element, B is the batch size and Z is the dimension
    (called Z because it's usually the dimension of the latent space).
    Unlike lognormal333, all P samples of a batch element share the same
    mean and log-variance.

    x: [P,B,Z]
    mean,logvar: [B,Z]
    output: [P,B]
    """
    # 1041079: x needs three axes, mean and logvar two
    assert len(x.size()) == 3
    assert len(mean.size()) == 2
    assert len(logvar.size()) == 2
    # 1041079: the batch axis of x has to match the batch axis of mean
    assert x.size()[1] == mean.size()[0]

    D = x.size()[2]

    # 1041079: constant term of the log of the normal distribution.
    # It is kept as a plain Python float so that it combines with tensors on
    # any device; building it as a CUDA tensor whenever CUDA is merely
    # available breaks calls whose inputs live on the CPU.
    term1 = D * math.log(2. * math.pi)

    return -.5 * (term1 + logvar.sum(1) + ((x - mean).pow(2) / torch.exp(logvar)).sum(2))

def lognormal333(x, mean, logvar):
    """Log-density of a diagonal Gaussian with per-sample parameters.

    1041079: This is the same computation as lognormal, but every one of the
    P samples has its own mean and log-variance.

    x: [P,B,Z]
    mean,logvar: [P,B,Z]
    output: [P,B]
    """
    assert len(x.size()) == 3
    assert len(mean.size()) == 3
    assert len(logvar.size()) == 3
    assert x.size()[0] == mean.size()[0]
    assert x.size()[1] == mean.size()[1]

    D = x.size()[2]

    # Constant term D*log(2*pi) as a plain Python float, so the result follows
    # the device of the inputs instead of depending on whether CUDA happens to
    # be available on the machine.
    term1 = D * math.log(2. * math.pi)

    return -.5 * (term1 + logvar.sum(2) + ((x - mean).pow(2) / torch.exp(logvar)).sum(2))


def log_bernoulli(pred_no_sig, target):
    """Bernoulli log-likelihood of target given pre-sigmoid predictions.

    The per-element expression max(s, 0) - s*t + log(1 + exp(-|s|)) is the
    binary cross-entropy written directly in terms of the logit s, so no
    sigmoid is ever evaluated; its negative is summed over the last axis.

    pred_no_sig is [P, B, X]
    target is [B, X]
    output is [P, B]
    """
    assert len(pred_no_sig.size()) == 3
    assert len(target.size()) == 2
    assert pred_no_sig.size()[1] == target.size()[0]

    positive_part = torch.clamp(pred_no_sig, min=0)
    log_tail = torch.log(1. + torch.exp(-torch.abs(pred_no_sig)))
    cross_entropy = positive_part - pred_no_sig * target + log_tail

    # sum over the X axis, then flip the sign to obtain a log-likelihood
    return -cross_entropy.sum(2)

# Generator

Class for the generator (decoder) Neural Network

In [3]:
''' 1041079: This is the class for the generator (decoder) '''
class Generator(nn.Module):
    """1041079: Generator (decoder): a stack of fully connected layers mapping z to x."""

    def __init__(self, hyper_config):
        """1041079: Build the decoder layers described by hyper_config['decoder_arch'].

        Each entry of 'decoder_arch' is read as [in_features, out_features] of
        one nn.Linear layer. The layers are registered under the names
        "1", "2", ... in order, which are therefore the prefixes of the keys
        in this module's state_dict.
        """
        # 1041079: standard required call for all subclasses of the PyTorch nn.Module
        super(Generator, self).__init__()

        self.dtype = torch.cuda.FloatTensor if hyper_config['cuda'] else torch.FloatTensor

        # 1041079: z_size is the latent dimension size
        self.z_size = hyper_config['z_size']
        self.x_size = hyper_config['x_size']
        self.act_func = hyper_config['act_func']

        # Decoder
        self.layer_norms = []
        self.decoder_weights = [
            nn.Linear(spec[0], spec[1]) for spec in hyper_config['decoder_arch']
        ]
        for position, layer in enumerate(self.decoder_weights, 1):
            self.add_module(str(position), layer)

    def decode(self, z):
        """1041079: Decode a latent tensor z of shape [k, B, z_size] into x of shape [k, B, x_size].

        The activation function is applied after every layer except the last,
        so the returned values are the raw outputs of the final linear layer.
        """
        n_samples = z.size()[0]
        batch = z.size()[1]

        # merge the sample and batch axes so the linear layers see a 2-D input
        hidden = z.view(-1, self.z_size)
        for layer in self.decoder_weights[:-1]:
            hidden = self.act_func(layer(hidden))
        hidden = self.decoder_weights[-1](hidden)

        return hidden.view(n_samples, batch, self.x_size)

# Inference Net

Class for the Inference Neural Network (Encoder)

In [None]:
''' 1041079: This is the class for the Inference network (encoder) '''
class standard(nn.Module):
    """1041079: Inference network (encoder) that parameterises the distribution object q."""

    def __init__(self, hyper_config):
        """Build the encoder layers described by hyper_config['encoder_arch'].

        Each entry of 'encoder_arch' is read as [in_features, out_features] of
        one nn.Linear layer; the layers are registered under the names
        "1", "2", ... in order. hyper_config['q'] must already be an
        instantiated distribution object exposing sample(mean, logvar, k).
        """
        # 1041079: standard required call for all subclasses of the PyTorch nn.Module
        super(standard, self).__init__()

        self.dtype = torch.cuda.FloatTensor if torch.cuda.is_available() else torch.FloatTensor

        self.hyper_config = hyper_config
        self.z_size = hyper_config['z_size']
        self.x_size = hyper_config['x_size']
        self.act_func = hyper_config['act_func']

        # Encoder
        self.layer_norms = []
        self.encoder_weights = [
            nn.Linear(spec[0], spec[1]) for spec in hyper_config['encoder_arch']
        ]
        for position, layer in enumerate(self.encoder_weights, 1):
            self.add_module(str(position), layer)

        # 1041079: removed the call (self.hyper_config) here because the class has
        # already been initialised in my version of the code
        self.q = hyper_config['q']

    def forward(self, k, x, logposterior):
        """1041079: Sample k latent variables z from the distribution that x is encoded to.

        k: number of samples
        x: [B,X]
        logposterior(z) -> [P,B]

        Returns the pair (z, logqz) produced by self.q.sample.
        """
        self.B = x.size()[0]

        # Encode: activation after every layer except the last
        hidden = x
        for layer in self.encoder_weights[:-1]:
            hidden = self.act_func(layer(hidden))
        stats = self.encoder_weights[-1](hidden)

        # the first z_size columns are the mean, the remaining ones the log-variance
        mean = stats[:, :self.z_size]  # [B,Z]
        logvar = stats[:, self.z_size:]

        # 1041079: HNF is not implemented, so the flag is forced off before the
        # original branch below. Note that this writes into the shared
        # hyper_config dict on every call.
        self.hyper_config['hnf'] = False
        if self.hyper_config['hnf']:
            drawn = self.q.sample(mean, logvar, k, logposterior)
        else:
            drawn = self.q.sample(mean, logvar, k)

        z, logqz = drawn
        return z, logqz

# Distributions

Classes for the distributions used throughout the rest of the notebook: Gaussian, which is the FFG distribution, and Flow, which is the Flow distribution.

In [5]:
class Gaussian(nn.Module):
    """Diagonal Gaussian whose mean and log-variance are supplied per call (the FFG distribution).

    The object holds no learnable parameters; it only remembers z_size,
    x_size and the tensor type used for the sampling noise.
    """

    def __init__(self, hyper_config):
        super(Gaussian, self).__init__()

        self.dtype = torch.cuda.FloatTensor if torch.cuda.is_available() else torch.FloatTensor

        self.z_size = hyper_config['z_size']
        self.x_size = hyper_config['x_size']

    def sample(self, mean, logvar, k):
        """1041079: Draw k reparameterised samples per batch element.

        mean, logvar: [B,Z]
        Returns z of shape [P,B,Z] (with P = k) together with the log-density
        of each sample under (mean, logvar), shape [P,B].
        """
        self.B = mean.size()[0]

        # standard-normal noise, drawn as a CPU float tensor and then cast to self.dtype
        noise = torch.FloatTensor(k, self.B, self.z_size).normal_().type(self.dtype)
        eps = Variable(noise)  # [P,B,Z]

        std = torch.exp(.5 * logvar)
        z = eps.mul(std) + mean  # [P,B,Z]

        return z, lognormal(z, mean, logvar)

    def logprob(self, z, mean, logvar):
        """1041079: Return the log-density of the given z values, shape [P,B]."""
        return lognormal(z, mean, logvar)





class Flow(nn.Module):
    """Flow-based approximate posterior with an auxiliary variable v.

    Starting from z0 ~ N(mean, exp(logvar)) and v0 ~ N(0, I), n_flows steps of
    norm_flow alternately rescale-and-shift v given z and z given v. A small
    network r(vT | zT) then scores the final auxiliary variable.

    All sub-layers are registered under the names "1", "2", ... (first the
    three r-network layers, then the flow layers), which are the prefixes of
    the keys in this module's state_dict.
    """

    def __init__(self, hyper_config):
        super(Flow, self).__init__()

        if torch.cuda.is_available():
            self.dtype = torch.cuda.FloatTensor
        else:
            self.dtype = torch.FloatTensor

        self.hyper_config = hyper_config
        self.z_size = hyper_config['z_size']
        self.x_size = hyper_config['x_size']
        self.act_func = hyper_config['act_func']

        count = 1

        # r(vT|zT): outputs z_size means followed by z_size log-variances
        rv_arch = [[self.z_size, 50], [50, 50], [50, self.z_size * 2]]
        self.rv_weights = []
        for n_in, n_out in rv_arch:
            layer = nn.Linear(n_in, n_out)
            self.rv_weights.append(layer)
            self.add_module(str(count), layer)
            count += 1

        n_flows = 2
        self.n_flows = n_flows
        h_s = 50

        # Per flow step: [layers used to update v, layers used to update z];
        # each triple is (hidden layer, shift head, scale head).
        self.flow_params = []
        for _ in range(n_flows):
            self.flow_params.append([
                [nn.Linear(self.z_size, h_s), nn.Linear(h_s, self.z_size), nn.Linear(h_s, self.z_size)],
                [nn.Linear(self.z_size, h_s), nn.Linear(h_s, self.z_size), nn.Linear(h_s, self.z_size)]
            ])

        # Registration order inside a step is v-hidden, z-hidden, v-shift,
        # z-shift, v-scale, z-scale. It determines the state_dict key numbers
        # and must not change if previously saved checkpoints are to load.
        for step in self.flow_params:
            for slot in range(3):
                for half in range(2):
                    self.add_module(str(count), step[half][slot])
                    count += 1

    def norm_flow(self, params, z, v):
        """Apply one flow step to (z, v).

        params: one entry of self.flow_params
        z, v: [PB,Z]
        Returns the updated z and v, both [PB,Z], and the summed log of the
        sigmoid scales applied in this step, shape [PB].
        """
        # torch.tanh / torch.sigmoid replace the deprecated F.tanh / F.sigmoid
        # update v conditioned on z
        h = torch.tanh(params[0][0](z))
        mew_ = params[0][1](h)
        sig_ = torch.sigmoid(params[0][2](h))  # [PB,Z]
        v = v * sig_ + mew_
        logdet = torch.sum(torch.log(sig_), 1)

        # update z conditioned on the new v
        h = torch.tanh(params[1][0](v))
        mew_ = params[1][1](h)
        sig_ = torch.sigmoid(params[1][2](h))  # [PB,Z]
        z = z * sig_ + mew_
        logdet2 = torch.sum(torch.log(sig_), 1)

        # [PB]
        logdet = logdet + logdet2
        # [PB,Z], [PB,Z], [PB]
        return z, v, logdet

    def sample(self, mean, logvar, k):
        """Draw k flow samples per batch element.

        mean, logvar: [B,Z]
        Returns z of shape [k,B,Z] and the [k,B] quantity
        log q(z0) + log q(v0) - sum(logdet) - log r(vT|zT).
        """
        self.B = mean.size()[0]
        gaus = Gaussian(self.hyper_config)

        # q(z0)
        z, logqz0 = gaus.sample(mean, logvar, k)

        # q(v0)
        # 1041079: I added this cuda check (didn't exist in original code)
        if torch.cuda.is_available():
            zeros = Variable(torch.zeros(self.B, self.z_size)).cuda()
        else:
            zeros = Variable(torch.zeros(self.B, self.z_size))
        v, logqv0 = gaus.sample(zeros, zeros, k)

        # [PB,Z]
        z = z.view(-1, self.z_size)
        v = v.view(-1, self.z_size)

        # Transform
        logdetsum = 0.
        for i in range(self.n_flows):
            z, v, logdet = self.norm_flow(self.flow_params[i], z, v)
            logdetsum += logdet

        logdetsum = logdetsum.view(k, self.B)

        # r(vT|zT)
        out = z  # [PB,Z]
        for layer in self.rv_weights[:-1]:
            out = self.act_func(layer(out))
        out = self.rv_weights[-1](out)
        mean = out[:, :self.z_size]
        logvar = out[:, self.z_size:]

        v = v.view(k, self.B, self.z_size)
        z = z.view(k, self.B, self.z_size)

        # the column slices above are not contiguous, hence .contiguous() before .view
        mean = mean.contiguous().view(k, self.B, self.z_size)
        logvar = logvar.contiguous().view(k, self.B, self.z_size)

        logrvT = lognormal333(v, mean, logvar)

        logpz = logqz0 + logqv0 - logdetsum - logrvT

        return z, logpz

# Optimize Local q
Code to optimise the parameters for the approximate distribution for a local datapoint x.

In [None]:
# NOTE(review): `quick` is not referenced anywhere in the visible code.
quick = 0


def optimize_local_q_dist(logposterior, hyper_config, x, q):
    """1041079: Optimise the approximate-distribution parameters locally for the batch x.

    A fresh mean and log-variance (both initialised to zero, shape [B, z_size])
    are trained with Adam together with the parameters of q until the
    100-step average loss has failed to improve more than 10 times in a row.

    Besides its arguments this function reads two module-level names:
    `arch` (selects the IWAE objective when equal to 'IWAE') and `k` (group
    size for the grouped bound in the IWAE case). Both must be defined in the
    module namespace before the function is called.

    Returns (vae, iwae): the second value is the 5000-sample importance
    weighted bound; the first is the plain 5000-sample ELBO, or for
    arch == 'IWAE' the average of k-sample importance weighted bounds.
    """
    def log_mean_exp(log_w):
        # log of the mean over axis 0 of exp(log_w), shifted by the max for stability
        peak = torch.max(log_w, 0)[0]  # [B]
        return torch.log(torch.mean(torch.exp(log_w - peak), 0)) + peak  # [B]

    B = x.size()[0]  # batch size
    # 1041079: number of samples of the posterior taken per datapoint in each
    # optimisation step. They say they used 100 MC samples for this, but the
    # code was set to 50.
    P = 50

    z_size = hyper_config['z_size']
    x_size = hyper_config['x_size']
    dtype = torch.cuda.FloatTensor if torch.cuda.is_available() else torch.FloatTensor

    mean = Variable(torch.zeros(B, z_size).type(dtype), requires_grad=True)
    logvar = Variable(torch.zeros(B, z_size).type(dtype), requires_grad=True)

    params = [mean, logvar] + list(q.parameters())
    optimizer = optim.Adam(params, lr=.001)

    recent_losses = []
    best_window_avg = -1  # -1 marks "no window evaluated yet"
    consecutive_worse = 0
    for epoch in range(1, 999999):

        # 1041079: I added the IWAE case, so the locally optimised parameters
        # are also trained on the IWAE bound if required
        use_iwae = arch == 'IWAE'
        z, logqz = q.sample(mean, logvar, 64 if use_iwae else P)
        logpx = logposterior(z)

        optimizer.zero_grad()

        if use_iwae:
            loss = -torch.mean(log_mean_exp(logpx - logqz))
        else:
            loss = -(torch.mean(logpx - logqz))

        loss_np = loss.data.cpu().numpy()

        loss.backward()
        optimizer.step()

        recent_losses.append(loss_np)
        if epoch % 100 != 0:
            continue

        # every 100 steps: compare this window's average loss with the best so far
        window_avg = np.mean(recent_losses)
        if window_avg < best_window_avg or best_window_avg == -1:
            consecutive_worse = 0
            best_window_avg = window_avg
        else:
            consecutive_worse += 1
            if consecutive_worse > 10:
                break

        if epoch % 2000 == 0:
            print (epoch, window_avg, consecutive_worse)

        recent_losses = []

    # 1041079: Calculate IWAE first
    z, logqz = q.sample(mean, logvar, 5000)
    logpx = logposterior(z)
    iwae = torch.mean(log_mean_exp(logpx - logqz))

    if arch == "IWAE":
        # 1041079: here the VAE bound is replaced by an average of k-sample
        # importance weighted bounds, so both values returned are importance weighted
        k_total = 5056
        z, logqz = q.sample(mean, logvar, k_total)
        logpx = logposterior(z)

        elbosIWAE = []
        for i in range(int(k_total / k)):
            group = slice(i * k, (i + 1) * k)
            weights = logpx[group] - logqz[group]  # [k,B]
            elbosIWAE.append(torch.mean(log_mean_exp(weights)).data[0])

        vae = np.mean(elbosIWAE)
    else:
        z, logqz = q.sample(mean, logvar, 5000)
        logpx = logposterior(z)
        vae = torch.mean(logpx - logqz)

    return vae, iwae

# VAE Class
Class for the overall VAE

In [7]:
class VAE(nn.Module):
    """Variational autoencoder combining an inference network (q_dist) with a Generator.

    Every public method stores the current batch size in self.B and a
    [B, z_size] zero tensor in self.zeros; the zero tensor serves as both the
    mean and the log-variance of the standard-normal prior on z.
    """

    def __init__(self, hyper_config, seed=1):
        super(VAE, self).__init__()

        torch.manual_seed(seed)

        self.z_size = hyper_config['z_size']
        self.x_size = hyper_config['x_size']
        self.act_func = hyper_config['act_func']

        # 1041079: as far as I can tell, hyper_config['q_dist'] is always the
        # class `standard`, so this line initialises the encoder
        self.q_dist = hyper_config['q_dist'](hyper_config=hyper_config)

        self.generator = Generator(hyper_config=hyper_config)

        on_gpu = torch.cuda.is_available()
        self.dtype = torch.cuda.FloatTensor if on_gpu else torch.FloatTensor
        if on_gpu:
            self.q_dist.cuda()
            # 1041079: added generator.cuda here to make this run on the graphics card
            self.generator.cuda()

    def _begin_batch(self, x):
        # Record the batch size and rebuild the zero tensor for this batch.
        self.B = x.size()[0]
        self.zeros = Variable(torch.zeros(self.B, self.z_size).type(self.dtype))

    def _bind_logposterior(self, x):
        # Store in self.logposterior a function z -> log p(z) + log p(x|z) for this batch.
        self._begin_batch(x)
        self.logposterior = lambda aa: lognormal(aa, self.zeros, self.zeros) + log_bernoulli(self.generator.decode(aa), x)

    @staticmethod
    def _log_mean_exp(values):
        # Log of the mean over axis 0 of exp(values), shifted by the max for stability.
        peak = torch.max(values, 0)[0]  # [B]
        return torch.log(torch.mean(torch.exp(values - peak), 0)) + peak  # [B]

    def forward(self, x, k, warmup=1.):
        """Return (bound, mean log p(x,z), mean log q(z)) for the batch x using k samples.

        The log q term is scaled by `warmup`. For k > 1 the k samples are
        combined with a log-mean-exp (importance weighted bound); for k = 1
        the plain ELBO is returned.
        """
        self._bind_logposterior(x)

        z, logqz = self.q_dist.forward(k, x, self.logposterior)
        logpxz = self.logposterior(z)

        elbo = logpxz - (warmup * logqz)  # [P,B]
        if k > 1:
            elbo = self._log_mean_exp(elbo)  # [B]

        return torch.mean(elbo), torch.mean(logpxz), torch.mean(logqz)

    def sample_q(self, x, k):
        """Return k latent samples per element of x drawn from the inference network."""
        self._bind_logposterior(x)
        z, logqz = self.q_dist.forward(k=k, x=x, logposterior=self.logposterior)
        return z

    def logposterior_func(self, x, z):
        """Evaluate log p(z) + log p(x|z) for a raw tensor z, which is wrapped and cast first."""
        self._begin_batch(x)
        latent = Variable(z).type(self.dtype)
        latent = latent.view(-1, self.B, self.z_size)
        return lognormal(latent, self.zeros, self.zeros) + log_bernoulli(self.generator.decode(latent), x)

    def logposterior_func2(self, x, z):
        """Same as logposterior_func, but z is used as given (no wrapping or casting)."""
        self._begin_batch(x)
        latent = z.view(-1, self.B, self.z_size)
        return lognormal(latent, self.zeros, self.zeros) + log_bernoulli(self.generator.decode(latent), x)

    def forward2(self, x, k):
        """Like forward without warmup, but always the plain mean over all k samples (no log-mean-exp)."""
        self._bind_logposterior(x)

        z, logqz = self.q_dist.forward(k, x, self.logposterior)
        logpxz = self.logposterior(z)

        elbo = logpxz - logqz  # [P,B]

        return torch.mean(elbo), torch.mean(logpxz), torch.mean(logqz)

    def forward3_prior(self, x, k):
        """Estimate the bound with z drawn from a standard normal instead of the inference network.

        Only log p(x|z) enters the estimate; no prior or q term is included.
        Returns the single averaged value.
        """
        self._begin_batch(x)
        self.logposterior = lambda aa: log_bernoulli(self.generator.decode(aa), x)

        z = Variable(torch.FloatTensor(k, self.B, self.z_size).normal_().type(self.dtype))  # [P,B,Z]

        elbo = self.logposterior(z)  # [P,B]
        if k > 1:
            elbo = self._log_mean_exp(elbo)  # [B]

        return torch.mean(elbo)


# Train MNIST
Code to train the VAE on Fashion MNIST

In [None]:
# 1041079: I've added these two settings so that a different architecture can be
# selected easily and so that training can resume from a previously saved epoch.
# `architecture` is later passed to get_hparams(); the names handled there are
# 'Standard', 'LargerEncoder', 'LargerDecoder' and 'Flow'.
architecture = 'Standard' # alternatives: 'Flow', 'LargerDecoder', 'LargerEncoder'
# Epoch count of the saved checkpoint to resume from; None starts from scratch.
epoch_number_to_load = None

# 1041079: I added all of this, to automatically choose hyperparams for all the
# different architectures
def get_hparams(arch):
    """Build the hyper-parameter dict for one of the named architectures.

    arch: 'Standard', 'LargerEncoder', 'LargerDecoder' or 'Flow'.

    The layer sizes are derived from the module-level `x_size` and `z_size`,
    which must be defined before this function is called. The returned dict
    always contains an instantiated distribution under 'q', which the
    `standard` encoder reads in its constructor.

    Raises ValueError for an unrecognised architecture name.
    """
    # hidden width of (encoder, decoder) for every supported architecture
    hidden_units = {
        'Standard': (200, 200),
        'LargerEncoder': (500, 200),
        'LargerDecoder': (200, 500),
        'Flow': (200, 200),
    }
    if arch not in hidden_units:
        # Fail with a clear message instead of printing and then crashing on
        # an unbound local at the return statement.
        raise ValueError(
            "Unknown architecture: {!r} (expected one of {})".format(arch, sorted(hidden_units)))

    enc_h, dec_h = hidden_units[arch]
    hyper_config = {
        'x_size': x_size,
        'z_size': z_size,
        'act_func': F.elu,
        'encoder_arch': [[x_size, enc_h], [enc_h, enc_h], [enc_h, z_size * 2]],
        'decoder_arch': [[z_size, dec_h], [dec_h, dec_h], [dec_h, x_size]],
        'q_dist': standard,
        'cuda': 1
    }

    # Every architecture needs a 'q' entry. The 'LargerEncoder' case used to
    # leave it out, which made model construction fail with a KeyError.
    # NOTE(review): 'LargerEncoder' is given the Gaussian distribution like the
    # other non-flow architectures - confirm this matches the experiment.
    if arch == 'Flow':
        hyper_config['q'] = Flow(hyper_config)
    else:
        hyper_config['q'] = Gaussian(hyper_config)

    return hyper_config

#FASHION
def load_mnist(path, kind='train'):
    """Read the gzipped image file '<kind>-images-idx3-ubyte.gz' found in `path`.

    The first 16 bytes of the decompressed stream are skipped and the
    remaining bytes are returned as a uint8 array with 784 columns, i.e. one
    row per image. Labels are not loaded.
    """
    file_name = '%s-images-idx3-ubyte.gz' % kind
    archive = os.path.join(path, file_name)

    with gzip.open(archive, 'rb') as handle:
        raw = handle.read()

    pixels = np.frombuffer(raw, dtype=np.uint8, offset=16)
    return pixels.reshape(-1, 784)


# Directory containing the gzipped Fashion-MNIST image files.
path = '../datasets/fashion'

# 1041079: load fashion mnist, store training data in train_x, testing data in test_x
# (load_mnist defaults to kind='train'; 't10k' selects the test file)
train_x = load_mnist(path=path)
test_x = load_mnist(path=path, kind='t10k')

# 1041079: re-scale all inputs to [0,1]; they are binarised further below
train_x = train_x / 255.
test_x = test_x / 255.

print (train_x.shape)
print (test_x.shape)



# binarize: pixels above 0.5 become 1.0, everything else 0.0
train_x = (train_x > .5).astype(float)
test_x = (test_x > .5).astype(float)



def resume_position(epochs_done, i_max):
    """Locate a completed-epoch count inside the staged training schedule.

    Stage i (for i = 0 .. i_max) consists of 3**i epochs. Returns the pair
    (stage index, 1-based epoch within that stage) at which training has to
    continue after `epochs_done` epochs. When every stage is already complete
    the returned stage index is i_max + 1.
    """
    stage = 0
    left_over = int(epochs_done)
    # peel off every stage that has been completed in full
    while stage <= i_max and left_over >= 3**stage:
        left_over -= 3**stage
        stage += 1
    return stage, left_over + 1


def train_encoder_and_decoder(model, train_x, test_x, k, batch_size,
                    start_at, save_freq, display_epoch,
                    path_to_save_variables, epoch_number_to_load=None):
    """Train model.q_dist and model.generator jointly with the staged IWAE-paper schedule.

    Stage i (i = 0 .. 7) runs 3**i epochs with a fresh Adam optimiser at
    learning rate 0.001 * 10**(-i/7). The weight on log q passed to
    model.forward grows linearly from 0 to 1 over the first 100 epochs.

    model: object exposing dtype, q_dist, generator and forward(batch, k=, warmup=)
    train_x: numpy array of training inputs, one row per example
    test_x: accepted for interface compatibility; not used in this function
    k: number of samples per datapoint passed to model.forward
    start_at, save_freq: a checkpoint is written once total_epochs >= start_at,
        every save_freq epochs counted from start_at
    display_epoch: progress is printed every display_epoch epochs
    path_to_save_variables: prefix of the checkpoint files
        '<prefix>_encoder_<epochs>.pt' and '<prefix>_generator_<epochs>.pt'
    epoch_number_to_load: total epoch count of a saved checkpoint to resume
        from, or None to start from scratch. Only the network weights are
        restored; the optimiser state is not.
    """
    def save_checkpoint(epochs_done):
        # Write the encoder and generator weights for the given epoch count.
        for label, net in (('encoder', model.q_dist), ('generator', model.generator)):
            save_file = path_to_save_variables + '_' + label + '_' + str(epochs_done) + '.pt'
            torch.save(net.state_dict(), save_file)
            print ('saved variables ' + save_file)

    # the dataset wrapper needs targets, so a dummy all-zero vector is supplied
    train_y = torch.from_numpy(np.zeros(len(train_x)))
    train_x = torch.from_numpy(train_x).float().type(model.dtype)

    train_ = torch.utils.data.TensorDataset(train_x, train_y)
    train_loader = torch.utils.data.DataLoader(train_, batch_size=batch_size, shuffle=True)

    # IWAE paper training strategy
    time_ = time.time()
    total_epochs = 0

    i_max = 7

    warmup_over_epochs = 100.

    all_params = list(model.q_dist.parameters()) + list(model.generator.parameters())

    print (model.q_dist)
    print (model.generator)

    # 1041079: I wrote the resume logic
    if epoch_number_to_load is not None:
        # The checkpoint number is the TOTAL number of epochs trained, so it
        # has to be translated into (stage, epoch within stage). Using it
        # directly as the in-stage epoch index skips part of the stage and
        # leaves the resumed run on a different schedule than an
        # uninterrupted one.
        min_i, min_epoch = resume_position(epoch_number_to_load, i_max)
        total_epochs = epoch_number_to_load
        load_file = path_to_save_variables+'_generator_'+str(epoch_number_to_load)+'.pt'
        model.generator.load_state_dict(torch.load(load_file, map_location=lambda storage, loc: storage))
        load_file = path_to_save_variables+'_encoder_'+str(epoch_number_to_load)+'.pt'
        model.q_dist.load_state_dict(torch.load(load_file, map_location=lambda storage, loc: storage))
        print(min_i, min_epoch, total_epochs)
    else:
        min_epoch = 1
        min_i = 0

    for i in range(min_i, i_max+1):

        lr = .001 * 10**(-i/float(i_max))
        print (i, 'LR:', lr)

        optimizer = optim.Adam(all_params, lr=lr)

        epochs = 3**(i)

        for epoch in range(min_epoch, epochs + 1):

            for batch_idx, (data, target) in enumerate(train_loader):

                batch = Variable(data)

                optimizer.zero_grad()

                # linear warm-up of the log q weight, capped at 1
                warmup = min(1., total_epochs/warmup_over_epochs)

                elbo, logpxz, logqz = model.forward(batch, k=k, warmup=warmup)

                loss = -(elbo)
                loss.backward()
                optimizer.step()

            total_epochs += 1

            if total_epochs%display_epoch==0:
                # NOTE(review): `.data[0]` assumes a PyTorch version in which
                # scalar results are 1-element tensors - confirm against the
                # installed version.
                print ('Train Epoch: {}/{}'.format(epoch, epochs),
                    'total_epochs {}'.format(total_epochs),
                    'LL:{:.3f}'.format(-loss.data[0]),
                    'logpxz:{:.3f}'.format(logpxz.data[0]),
                    'logqz:{:.3f}'.format(logqz.data[0]),
                    'warmup:{:.3f}'.format(warmup),
                    'T:{:.2f}'.format(time.time()-time_),
                    )
                time_ = time.time()

            if total_epochs >= start_at and (total_epochs-start_at)%save_freq==0:
                # 1041079: this is where the saving happens.
                save_checkpoint(total_epochs)

        # 1041079: only the resumed stage starts part-way; later stages start at epoch 1
        min_epoch = 1

    # save the final parameters
    save_checkpoint(total_epochs)

    print ('done training')




# Which gpu
# 1041079: I've commented this out, this seems to be used to only use 1 GPU of many
# os.environ['CUDA_VISIBLE_DEVICES'] = '1'


# Input dimension: load_mnist returns rows of 784 values per image.
x_size = 784
z_size = 20 # 1041079: in original code this was 50 but paper says 20
batch_size = 50 # 1041079: in original code this was 20 but paper (section 6.2) says it should be 50
# Number of samples per datapoint handed to model.forward during training;
# with k = 1 the model's forward pass skips the log-mean-exp step.
k = 1
#save params 
# A checkpoint is written once total_epochs >= start_at, then every save_freq epochs.
start_at = 50
save_freq = 50
# Progress is printed every display_epoch epochs.
display_epoch = 3

# 1041079: first run standard
# (the architecture actually used is whatever `architecture` is set to)
print("Running {} architecture".format(architecture))
hyper_config = get_hparams(architecture)

print ('Init model')
model = VAE(hyper_config)
if torch.cuda.is_available():
    model.cuda()

print('\nModel:', hyper_config,'\n')

# Checkpoint prefix; files are named '<prefix>_encoder_<n>.pt' and '<prefix>_generator_<n>.pt'.
# NOTE(review): nothing visible here creates the './<architecture>/' directory - confirm it exists before running.
path_to_save_variables='./{}/fashion'.format(architecture)

print('\nTraining')

train_encoder_and_decoder(model=model, train_x=train_x, test_x=test_x, k=k, batch_size=batch_size,
                    start_at=start_at, save_freq=save_freq, display_epoch=display_epoch, 
                    path_to_save_variables=path_to_save_variables, epoch_number_to_load=epoch_number_to_load)

print ('Done.')


# Calculate Gaps
Requires: Saved state_dicts from different epochs
Produces: text files

This file reads the saved neural networks for each epoch and calculates L[q], L[q*] and log(p(x)) for each one.

In [None]:
# 1041079: I added this, to specify the training k used for the IWAE bound
# (test_vae inside calculate_gaps evaluates the model in groups of training_k samples)
training_k = 64
# These repeat the values set for training, so this cell can be run on its own.
x_size = 784
z_size = 20 # 1041079: changed to 20 as above, originally 50

def calculate_gaps(epoch, todo, arch):
    '''1041079: Evaluate the networks saved at one epoch and append the bounds to a results file.

    Reads the gzip image files under ../datasets/fashion, scales them by 1/255 and binarizes
    them at 0.5, builds VAE(get_hparams(arch)) and loads the generator weights saved for
    `epoch` from ../Exp5/<arch>/. Depending on `todo` it then evaluates the first
    n_data (20) datapoints and appends space-delimited rows of the form
        split epoch bound_name value
    to ../Exp5/<arch>/over_training_exps/results_20_fashion_binarized_2_2.txt.

    Args:
        epoch: epoch number used in the saved state_dict file names; also written in every row.
        todo: which computation to run:
            'amort'     - also load the encoder weights, then write 'L_q' (from test_vae) and
                          'L_q_IWAE' (from test) for both the 'training' and 'validation' split.
            'opt_train' - optimise a fresh Gaussian q for each training datapoint with
                          optimize_local_q_dist and write 'L_q_star' and 'logpx' as 'training'.
            'opt_valid' - the same on the test images, written as 'validation'.
        arch: architecture name; passed to get_hparams and used in the load/save paths.
            When it equals "IWAE", test_vae evaluates in chunks of the global training_k.

    Returns:
        None. Results are printed and appended to the results file.

    Raises:
        ValueError: if `todo` is not one of 'amort', 'opt_train', 'opt_valid'.
    '''
    # Reject unknown modes up front so a typo fails with a readable message
    # before any data or weights are loaded.
    valid_todos = ('amort', 'opt_train', 'opt_valid')
    if todo not in valid_todos:
        raise ValueError("Unknown todo {!r}; expected one of {}".format(todo, valid_todos))

    compute_amort = todo == 'amort'
    compute_local_opt = todo == 'opt_train'
    compute_local_opt_test = todo == 'opt_valid'

    params_file = 'fashion'

    # 1041079: This seems to be the number of datapoints they optimise locally, and the number
    # of datapoints they use to check the amortized VAE and amortised IW.
    n_data = 20
    batch_size = 10
    # Never ask for batches larger than the number of evaluated datapoints
    eval_batch_size = np.minimum(n_data, batch_size)

    #to save results
    file_ = '../Exp5/{}/over_training_exps/results_'.format(arch) +str(n_data)+'_fashion_binarized_2_2.txt'


    def test_vae(model, data_x, batch_size, display, k):
        '''Mean over whole batches of data_x of the first value the model returns.

        For arch == "IWAE" the model is called int(k / training_k) times per batch with
        k=training_k (1041079's addition); otherwise model.forward2 is called once with k.
        `display` is accepted but not used.
        '''
        elbos = []
        for i in range(int(len(data_x)/ batch_size)):
            batch = data_x[i*batch_size:(i+1)*batch_size]
            batch = Variable(torch.from_numpy(batch)).type(model.dtype)

            if arch == "IWAE":
                for j in range(int(k / training_k)):
                    elbo, logpxz, logqz = model(batch, k=training_k)
                    elbos.append(elbo.data[0])
            else:
                elbo, logpxz, logqz = model.forward2(batch, k=k)
                elbos.append(elbo.data[0])

        return np.mean(elbos)


    def test(model, data_x, batch_size, display, k):
        '''Mean over whole batches of data_x of the first value returned by model(batch, k=k).

        `display` is accepted but not used.
        '''
        elbos = []
        for i in range(int(len(data_x)/ batch_size)):
            batch = data_x[i*batch_size:(i+1)*batch_size]
            batch = Variable(torch.from_numpy(batch)).type(model.dtype)

            elbo, logpxz, logqz = model(batch, k=k)
            elbos.append(elbo.data[0])

        return np.mean(elbos)


    #FASHION
    def load_mnist(path, kind='train'):
        '''Read <path>/<kind>-images-idx3-ubyte.gz and return the images as rows of 784 uint8 values.'''
        images_path = os.path.join(path,
                                   '%s-images-idx3-ubyte.gz'
                                   % kind)

        with gzip.open(images_path, 'rb') as imgpath:
            # offset=16 skips the leading 16 bytes of the file before the pixel data
            images = np.frombuffer(imgpath.read(), dtype=np.uint8,
                                   offset=16).reshape(-1, 784)

        return images


    def append_rows(split, named_values):
        '''Append one [split, epoch, bound_name, value] row per pair to the results file.'''
        with open(file_, 'a') as f:
            writer = csv.writer(f, delimiter=' ')
            for bound_name, value in named_values:
                writer.writerow([split, epoch, bound_name, value])


    def optimize_locally(data_x, split):
        '''Optimise a fresh Gaussian q for every row of data_x and record the mean results.'''
        print ('optmizing local')
        vaes = []
        iwaes = []
        for i, x_np in enumerate(data_x):

            print (i)

            x = Variable(torch.from_numpy(x_np)).type(model.dtype)
            x = x.view(1,784)

            # Called only within this iteration, so it always sees the current x
            logposterior = lambda aa: model.logposterior_func2(x=x,z=aa)

            q_local = Gaussian(hyper_config)

            vae, iwae = optimize_local_q_dist(logposterior, hyper_config, x, q_local)
            print (vae.data.cpu().numpy(),iwae.data.cpu().numpy(),'reg')
            vaes.append(vae.data.cpu().numpy())
            iwaes.append(iwae.data.cpu().numpy())

        mean_vae = np.mean(vaes)
        mean_iwae = np.mean(iwaes)

        print()
        print ('opt vae',mean_vae)
        print ('opt iwae',mean_iwae)
        print()

        append_rows(split, [('L_q_star', mean_vae), ('logpx', mean_iwae)])


    def evaluate_amortized(data_x, split):
        '''Evaluate the loaded encoder on data_x with test_vae and test, and record both means.'''
        vae_bound = test_vae(model=model, data_x=data_x, batch_size=eval_batch_size, display=10, k=5000)
        iw_bound = test(model=model, data_x=data_x, batch_size=eval_batch_size, display=10, k=5000)
        print ('amortized VAE',vae_bound)
        print ('amortized IW',iw_bound)

        append_rows(split, [('L_q', vae_bound), ('L_q_IWAE', iw_bound)])


    path = '../datasets/fashion'

    train_x = load_mnist(path=path)
    test_x = load_mnist(path=path, kind='t10k')

    train_x = train_x / 255.
    test_x = test_x / 255.

    print (train_x.shape)
    print (test_x.shape)

    #binarize
    train_x = (train_x > .5).astype(float)
    test_x = (test_x > .5).astype(float)

    # 1041079: this is what I've done:
    hyper_config = get_hparams(arch)

    print ('Init model')
    model = VAE(hyper_config)
    if torch.cuda.is_available():
        model.cuda()
        print ('using cuda')
    else:
        print ('no gpus')

    print('\nModel:', hyper_config,'\n')

    print (model.q_dist)
    print (model.generator)

    print ('Load params for decoder')
    path_to_load_variables='../Exp5/{}/'.format(arch) +str(params_file)+'_generator_'+str(epoch)+'.pt'

    # map_location keeps every tensor on the storage it was deserialised to (CPU),
    # so checkpoints saved on a GPU machine can be loaded without one
    model.generator.load_state_dict(torch.load(path_to_load_variables, map_location=lambda storage, loc: storage))
    print ('loaded variables ' + path_to_load_variables)
    print ()

    # The saved encoder is only needed for the amortized evaluation; the local
    # optimisation modes build their own Gaussian q per datapoint.
    if compute_amort:

        print ('Load params for encoder')
        path_to_load_variables='../Exp5/{}/'.format(arch) +str(params_file)+'_encoder_'+str(epoch)+'.pt'

        model.q_dist.load_state_dict(torch.load(path_to_load_variables, map_location=lambda storage, loc: storage))
        print ('loaded variables ' + path_to_load_variables)

    # TRAINING SET
    if compute_local_opt:
        optimize_locally(train_x[:n_data], 'training')

    if compute_amort:
        evaluate_amortized(train_x[:n_data], 'training')

    # TEST SET
    if compute_local_opt_test:
        print ('TEST SET')
        optimize_locally(test_x[:n_data], 'validation')

    if compute_amort:
        evaluate_amortized(test_x[:n_data], 'validation')

# 1041079: Modify these three sequences to change what the code actually does.
# Every (architecture, epoch, todo) combination triggers one calculate_gaps call.
for arch in ('IWAE',):
    for epoch in (250, 300, 350):
        for todo in ('amort', 'opt_train', 'opt_valid'):
            print("**** Calculating architecture = {}, todo = {}, epoch = {} ****".format(arch, todo, epoch))
            calculate_gaps(epoch=epoch, todo=todo, arch=arch)


# Plotting The Graphs
Read the text files saved in the cell above, plot the specified graphs.

In [None]:
# 1041079: which architecture's results folder to read from (and plot folder to write to)
architecture = 'Standard'

# Epoch labels, kept as the strings found in the file, in order of first appearance
epochs = []
# Bounds that get plotted; any other bound name in the file is still stored in `values`
bounds = ['logpx', 'L_q_star', 'L_q']

# values[dataset][epoch][bound] -> value (string, exactly as read from the file)
values = {}
values['training'] = {}
values['validation'] = {}

#read values
# NOTE(review): calculate_gaps above writes to 'results_<n_data>_fashion_binarized_2_2.txt';
# confirm this name points at the file you actually want to plot.
results_file = 'results_20_fashion_binarized_2'

file_ = '../Exp5/{}/over_training_exps/'.format(architecture) +results_file+'.txt'

# Running extremes over every value read; used later as the shared y-axis limits
max_value = None
min_value = None

with open(file_, 'r') as f:
    reader = csv.reader(f, delimiter=' ')
    for row in reader:
        # Skip blank lines and anything that is not a results row
        if len(row) and row[0] in ['training','validation']:
            dataset = row[0]
            epoch = row[1]
            bound = row[2]
            value = row[3]

            if epoch not in values[dataset]:
                values[dataset][epoch] = {}
                if epoch not in epochs:
                    epochs.append(epoch)
                    print (epoch)

            # A later row for the same (dataset, epoch, bound) overwrites the earlier one
            values[dataset][epoch][bound] = value

            if max_value is None or float(value) > max_value:
                max_value = float(value)
            if min_value is None or float(value) < min_value:
                min_value = float(value)

# print (values)

# Sort epochs numerically (they are strings, so a plain sort would put '1000' before '250').
# The results file is appended to across runs, so epochs can appear out of order, and
# fill_between needs increasing x values to draw clean bands.
epochs.sort(key=float)

#convert to list
# training_plot[bound] / validation_plot[bound] -> list of floats, one per entry of `epochs`
training_plot = {}
for bound in bounds:
    training_plot[bound] = [float(values['training'][epoch][bound]) for epoch in epochs]
print (training_plot)


validation_plot = {}
for bound in bounds:
    validation_plot[bound] = [float(values['validation'][epoch][bound]) for epoch in epochs]
print (validation_plot)


# x-axis positions matching the series above
epochs_float = [float(x) for x in epochs]


# One row of two panels: training on the left, validation on the right
rows = 1
cols = 2

legend=False

fig = plt.figure(figsize=(8+cols,2+rows), facecolor='white')

# Both panels share the y-range spanned by every value read from the results file
ylimits = [min_value, max_value]


def _draw_gap_panel(column, title, series):
    '''Draw one panel in the given grid column and return its axes.

    Fills the band between series['logpx'] and series['L_q_star'], and the band
    between series['L_q_star'] and series['L_q'], against epochs_float.
    '''
    panel = plt.subplot2grid((rows,cols), (0,column), frameon=False)
    panel.set_title(title,family='serif')

    panel.fill_between(epochs_float, series['logpx'], series['L_q_star'])
    panel.fill_between(epochs_float, series['L_q_star'], series['L_q'])

    panel.set_ylim(ylimits)
    panel.set_xlim([0, 2000])
    panel.grid(True, alpha=.1)
    return panel


# Training set
ax = _draw_gap_panel(0, 'Training Set', training_plot)

# Validation set
ax = _draw_gap_panel(1, 'Validation Set', validation_plot)


# Save the same figure as both PNG and PDF, named after the results file
plot_stem = './Plots/{}/'.format(architecture) +results_file
name_file = plot_stem+'.png'
name_file2 = plot_stem+'.pdf'
plt.savefig(name_file)
plt.savefig(name_file2)
print ('Saved fig', name_file)
print ('Saved fig', name_file2)



print ('DONE')