In [1]:
import numpy as np
import scipy.io as sio
# Directory (relative to the working directory) that holds the dataset .npy
# files; every path in this notebook is built by string concatenation with it,
# so the trailing slash is required.
local_data_dir = './Datasets/' 

In [2]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import os
import subprocess
import tempfile, pickle
import matplotlib.pyplot as plt
import tensorflow as tf
import keras


from tensorflow.python.training import moving_averages

from six.moves import urllib
from six.moves import xrange

Using TensorFlow backend.


In [3]:
def cast_and_normalise_images(data_dict):
  """Rescale the 'images' entry of `data_dict` to floats centred on zero.

  The entry is cast to float32, divided by 255 and shifted down by 0.5, so
  inputs in 0..255 (presumably 8-bit pixel values) end up in [-0.5, 0.5].

  Args:
    data_dict: mapping with an 'images' entry; any other entries are left
      untouched.

  Returns:
    The same `data_dict` object, with 'images' replaced in place.
  """
  as_float = tf.cast(data_dict['images'], tf.float32)
  data_dict['images'] = as_float / 255.0 - 0.5
  return data_dict

# Load the train/test dictionaries.  Each file is read with allow_pickle=True
# and unwrapped with .item(), i.e. it presumably stores one pickled dict (with
# at least 'images' and 'labels' keys, as used further down) in a 0-d array.
# NOTE(review): unpickling can execute arbitrary code -- only load trusted files.
temp_train= np.load(local_data_dir+"train_data_dict.npy",allow_pickle=True)
train_data_dict = temp_train.item()
temp_test= np.load(local_data_dir+"test_data_dict.npy",allow_pickle=True)
test_data_dict = temp_test.item()

# Variance of the training images after scaling by 1/255; the graph-building
# cell divides the reconstruction MSE by this value.
data_variance = np.var(train_data_dict['images'] / 255.0)

In [4]:
from keras.layers import Conv2D, Conv2DTranspose, AveragePooling2D, UpSampling2D
class Encoder():
  """Convolutional encoder made of four stride-2, 3x3, ReLU `Conv2D` layers.

  The filter counts are 1x, 2x, 4x and 8x `num_hiddens`.  New layers (and thus
  new weights) are created on every call.
  """

  def __init__(self, num_hiddens, name='encoder'):
    """Store the base filter count; `name` is accepted but not used."""
    self._num_hiddens = num_hiddens

  def __call__(self, x):
    """Apply the four convolutions to `x` and return the last feature map."""
    # (width multiplier, extra Conv2D keyword arguments) for each layer.  Only
    # the last layer overrides the kernel initializer; the others keep the
    # Keras default.
    layer_specs = (
        (1, {}),
        (2, {}),
        (4, {}),
        (8, {'kernel_initializer': 'RandomNormal'}),
    )

    h = x
    for multiplier, extra_kwargs in layer_specs:
      conv = Conv2D(filters=int(self._num_hiddens * multiplier),
                    kernel_size=3,
                    activation='relu',
                    strides=2,
                    padding='same',
                    **extra_kwargs)
      h = conv(h)
    return h

  # Last layer should not have any activation functions or sigmoid.
class Decoder():
  """Transposed-convolution decoder mirroring `Encoder`.

  Three stride-2, 3x3, ReLU `Conv2DTranspose` layers with 8x, 4x and 2x
  `num_hiddens` filters are followed by a 3-filter stride-2 output layer that
  has no activation.  New layers are created on every call.
  """

  def __init__(self, num_hiddens, name='decoder'):
    """Store the base filter count; `name` is accepted but not used."""
    self._num_hiddens = num_hiddens

  def __call__(self, x):
    """Upsample `x` through the four layers and return the reconstruction."""
    # (width multiplier, extra keyword arguments) for the hidden layers.  Only
    # the first one overrides the kernel initializer.
    hidden_specs = (
        (8, {'kernel_initializer': 'RandomNormal'}),
        (4, {}),
        (2, {}),
    )

    h = x
    for multiplier, extra_kwargs in hidden_specs:
      deconv = Conv2DTranspose(filters=int(self._num_hiddens * multiplier),
                               kernel_size=3,
                               activation='relu',
                               strides=2,
                               padding='same',
                               **extra_kwargs)
      h = deconv(h)

    # Output layer: 3 filters and deliberately no activation.
    output_layer = Conv2DTranspose(filters=3,
                                   kernel_size=3,
                                   strides=2,
                                   padding='same')
    x_recon = output_layer(h)
    return x_recon

In [5]:
from keras.layers import Dense, Flatten, Reshape
from keras import backend as K
from keras.initializers import RandomUniform

def bottleneck_flatten(input_signal, latent_dim, num_codewords):
    """Flatten a feature map and project it with two independent Dense heads.

    Args:
      input_signal: tensor to flatten.
      latent_dim: number of units of the 'z_mean' head.
      num_codewords: number of units of the 'z_log_var' head.

    Returns:
      dict with 'z_mean' and 'z_log_var' (the two Dense outputs, no
      activation) and 'shape' (the static shape of `input_signal` before
      flattening, needed later to undo the flatten).
    """
    # Record the shape first: it is no longer recoverable after Flatten.
    original_shape = K.int_shape(input_signal)
    flat = Flatten()(input_signal)

    # Both heads read the same flattened features.
    mean_head = Dense(latent_dim)(flat)
    smooth_head = Dense(num_codewords)(flat)

    bottleneck = {'z_mean': mean_head,
                  'z_log_var': smooth_head,
                  'shape': original_shape}
    return bottleneck

def bottleneck_deflatten(input_signal, shape):
    """Project a flat code back to a feature map of the recorded shape.

    Args:
      input_signal: flat tensor to expand.
      shape: indexable shape whose entries 1, 2 and 3 give the target
        dimensions (entry 0 is ignored).

    Returns:
      The output of a ReLU Dense layer with shape[1]*shape[2]*shape[3] units,
      reshaped to (shape[1], shape[2], shape[3]).
    """
    dim1, dim2, dim3 = shape[1], shape[2], shape[3]
    expanded = Dense(dim1 * dim2 * dim3, activation='relu')(input_signal)
    return Reshape((dim1, dim2, dim3))(expanded)
  
def bottleneck_concatenation(input_signal, embedding_dim, num_codewords):
    """Produce the two bottleneck heads with convolutions, keeping the layout.

    Despite the name, nothing is concatenated: two separate 3x3, stride-1,
    'same'-padded `Conv2D` layers without activation are applied to the same
    input.

    Args:
      input_signal: feature map fed to both convolutions.
      embedding_dim: number of filters of the 'z_mean' head.
      num_codewords: number of filters of the 'z_log_var' head.

    Returns:
      dict with 'z_mean', 'z_log_var' and 'shape' (static shape of
      `input_signal`).
    """
    incoming_shape = K.int_shape(input_signal)

    def linear_conv(n_filters):
        # Linear (no activation) convolution with the default initializer.
        return Conv2D(filters=n_filters,
                      kernel_size=3,
                      strides=1,
                      padding='same')

    mean_map = linear_conv(embedding_dim)(input_signal)
    smooth_map = linear_conv(num_codewords)(input_signal)

    return {'z_mean': mean_map,
            'z_log_var': smooth_map,
            'shape': incoming_shape}

def sampling(z_mean, z_log_var):
    """Draw `z_mean + std * eps` with `eps ~ N(0, 1)` (reparameterisation).

    Args:
      z_mean: tensor of means.
      z_log_var: tensor of log-variances, broadcastable against `z_mean`.

    Returns:
      A tensor shaped like `z_mean` holding one sample per element.
    """
    sampling_dim = tf.shape(z_mean)
    # by default, random_normal has mean=0 and std=1.0
    epsilon = tf.random_normal(sampling_dim)
    # std = sqrt(exp(log_var)) = exp(0.5 * log_var).  The single-exp form stays
    # finite over twice the log-variance range and avoids the undefined sqrt
    # gradient that appears once exp(log_var) underflows to 0.
    std = tf.exp(0.5 * z_log_var)
    return z_mean + std * epsilon
  
def information_dropout(z_mean, sigma=None, sigma0=1.):
    """Multiply `z_mean` by log-normal noise `exp(sigma * sigma0 * e)`, e ~ N(0, 1).

    Args:
      z_mean: tensor to perturb.
      sigma: noise scale (number or tensor broadcastable against `z_mean`).
        Required: the `None` default only exists for signature compatibility.
      sigma0: extra multiplier applied to `sigma`.

    Returns:
      `z_mean` multiplied element-wise by the sampled noise.

    Raises:
      TypeError: if `sigma` is None.
    """
    # Fail before any graph op is created; previously None surfaced as an
    # opaque "NoneType * float" TypeError after the random op had been added.
    if sigma is None:
        raise TypeError("information_dropout() requires `sigma`; got None")

    sampling_dim = tf.shape(z_mean)
    e = tf.random_normal(sampling_dim)
    log_normal = tf.exp(sigma * sigma0 * e)
    return tf.multiply(z_mean, log_normal)
  
def rbf_prob(dist, smooth):
    """Normalised RBF weights: softmax over axis 1 of `-0.5 * smooth * dist`.

    Args:
      dist: tensor of distances with the candidates along axis 1.
      smooth: scale factor(s), broadcastable against `dist`.

    Returns:
      Tensor shaped like `dist` whose entries along axis 1 sum to 1.
    """
    logits = -tf.multiply(dist, 0.5*smooth)
    # Subtract each row's maximum before exponentiating.  The result is
    # mathematically unchanged, but the largest term becomes exp(0) = 1, so a
    # row of large distances no longer underflows to all zeros (0/0 = NaN).
    # The shift is a constant as far as gradients are concerned.
    row_max = tf.stop_gradient(tf.reduce_max(logits, 1, keepdims=True))
    prob = tf.exp(logits - row_max)
    probs = prob/tf.reduce_sum(prob, 1, keepdims=True)
    return probs

def add_noise(input_signal, noise_level):
    """Return `input_signal` plus zero-mean Gaussian noise.

    Args:
      input_signal: tensor to perturb.
      noise_level: standard deviation of the added noise.
    """
    # random_normal defaults to mean 0; only the standard deviation is set.
    gaussian = tf.random_normal(tf.shape(input_signal), stddev=noise_level)
    return input_signal + gaussian

In [6]:
class OhVectorQuantizer():
  """Soft vector quantiser: replaces each input vector by a weighted sum of codewords.

  The codebook is a trainable [embedding_dim, num_embeddings] variable (one
  codeword per column).  Calling the instance returns the quantised tensor,
  wired so that its gradient is passed straight to the input, together with a
  codebook/commitment loss.
  """
  # Shape legend for the comments below --
  # b: batch size; q: number of channels; K: number of codewords; d: embedding_dim.
  def __init__(self, embedding_dim, num_embeddings, commitment_cost, name='vq_layer'):
    """Create the codebook variable.

    Args:
      embedding_dim: length d of each codeword (and of each input vector).
      num_embeddings: number K of codewords.
      commitment_cost: weight of the `e_latent_loss` term in the returned loss.
      name: accepted but not used.
    """
    self._embedding_dim = embedding_dim
    self._num_embeddings = num_embeddings
    self._commitment_cost = commitment_cost

    initializer = tf.initializers.variance_scaling()
    # NOTE(review): the variable name is fixed to 'embedding'; a second instance
    # in the same variable scope presumably collides -- confirm before reusing.
    self._w = tf.get_variable('embedding', [self._embedding_dim, self._num_embeddings], initializer=initializer, trainable=True)
  

  def __call__(self, inputs, is_training):
        """Softly quantise `inputs['z_mean']` against the codebook.

        Args:
          inputs: dict with 'z_mean' (last dimension must equal embedding_dim)
            and 'z_log_var' (reshaped to [-1, num_embeddings]; used as a
            per-codeword smoothing parameter).
          is_training: accepted but not used.

        Returns:
          dict with 'quantize' (tensor shaped like inputs['z_mean']) and
          'loss' (scalar: q_latent_loss + commitment_cost * e_latent_loss).
        """
        #noisy
        #inputs['z_mean'] = add_noise(inputs['z_mean'], 0.01)
        w = self._w.read_value()
      
        # Original note: shape is [batch, num_channel, embedding_dim].
        # NOTE(review): only the last dimension is checked below, so any
        # leading layout is accepted -- confirm against the caller.
        input_shape = tf.shape(inputs['z_mean'])
        # Assert last dimension is same as self._embedding_dim; the reshapes
        # only run once the assertion has been evaluated.
        with tf.control_dependencies([
            tf.Assert(tf.equal(input_shape[-1], self._embedding_dim),[input_shape])]):
            flat_inputs = tf.reshape(inputs['z_mean'], [-1, self._embedding_dim])
            flat_smooth = tf.reshape(inputs['z_log_var'], [-1, self._num_embeddings])

        # Squared Euclidean distance between every input row and every
        # codeword, expanded as |x|^2 - 2 x.w + |w|^2.
        # distances dimension: (b*q)*K
        distances = (tf.reduce_sum(flat_inputs**2, 1, keepdims=True)
                     - 2 * tf.matmul(flat_inputs, w)
                     + tf.reduce_sum(w ** 2, 0, keepdims=True))
        
        # smooth = 1 / exp(flat_smooth)^2 = exp(-2 * flat_smooth).
        # NOTE(review): this treats 'z_log_var' like a log standard deviation
        # rather than a log variance -- confirm the intended parameterisation.
        #after shape: (b*q)*K
        smooth = 1./tf.exp(flat_smooth)**2
        # NOTE(review): rbf_prob already normalises over the K codewords; the
        # division by sqrt(smooth) happens afterwards, so the weights no longer
        # sum to 1 -- confirm this is intended.
        probs = rbf_prob(distances, smooth)/tf.sqrt(smooth)
        #After shape: (q*b,1,K)
        probs = tf.expand_dims(probs, 1)
        #After shape: (1,d,K)
        codebook = tf.expand_dims(w, 0)
        # Weighted sum of the codewords for every input row.
        #expected shape: b*q*d
        quantize_vector = tf.reduce_sum(codebook*probs,2)
        quantized = tf.reshape(quantize_vector, tf.shape(inputs['z_mean']))
    
        # Hard (nearest-codeword) quantisation, kept for reference:
        #encoding_indices = tf.argmax(- distances, 1)
        #values dimension: flat*2
        #[values, encoding_indices] = tf.nn.top_k(-distances, k = 2)
        #encoding_indices = tf.reshape(encoding_indices[:,0], input_shape[:-1])
        #quantized = self.quantize(encoding_indices)

        
        # e_latent_loss holds `quantized` constant; q_latent_loss holds the
        # explicit inputs['z_mean'] term constant and lets the gradient flow
        # through `quantized`.
        e_latent_loss = tf.reduce_mean((tf.stop_gradient(quantized) - inputs['z_mean']) ** 2)
        q_latent_loss = tf.reduce_mean((quantized - tf.stop_gradient(inputs['z_mean'])) ** 2)
        loss = q_latent_loss + self._commitment_cost * e_latent_loss 

        # Straight-through: the forward value equals `quantized`, but because
        # the difference is wrapped in stop_gradient the gradient of the
        # returned tensor goes to inputs['z_mean'] only.
        quantized = inputs['z_mean'] + tf.stop_gradient(quantized - inputs['z_mean'])
        
        return {'quantize': quantized, 'loss': loss}
    
  @property
  def embeddings(self):
        """The codebook variable, shape [embedding_dim, num_embeddings]."""
        return self._w
  
  def quantize(self, encoding_indices):
        """Look up codewords by index (hard quantisation).

        The codebook is transposed to [num_embeddings, embedding_dim] so that
        each index selects one codeword.  Not used by `__call__`, which only
        references it in commented-out code.
        """
        # Read the codebook only after the indices have been computed.
        with tf.control_dependencies([encoding_indices]):
            w = tf.transpose(self.embeddings.read_value(), [1, 0])
        return tf.nn.embedding_lookup(w, encoding_indices, validate_indices=False)

In [7]:
# Start from an empty default graph so re-running this cell does not pile up ops.
tf.reset_default_graph()

batch_size = 64
image_size = 32
# Data Loading.
# Training stream: normalised, shuffled (buffer of 10000), repeated forever and
# served in mini-batches of `batch_size`.
train_dataset_iterator = (
    tf.data.Dataset.from_tensor_slices(train_data_dict)
    .map(cast_and_normalise_images)
    .shuffle(10000)
    .repeat(-1)  # repeat indefinitely
    .batch(batch_size)).make_one_shot_iterator()
# Un-shuffled single pass over the training data in one batch of up to 73257
# examples (presumably the full training-set size -- confirm); used to extract
# encodings.  Must be initialised before use.
classifer_dataset_iterator = (
    tf.data.Dataset.from_tensor_slices(train_data_dict)
    .map(cast_and_normalise_images)
    .repeat(1)  # 1 epoch
    .batch(73257)).make_initializable_iterator()
# Un-shuffled single pass over the test data in one batch of up to 26032
# examples (presumably the full test-set size -- confirm).  Must be
# initialised before use.
test_dataset_iterator = (
    tf.data.Dataset.from_tensor_slices(test_data_dict)
    .map(cast_and_normalise_images)
    .repeat(1)  # 1 epoch
    .batch(26032)).make_initializable_iterator()
# Graph tensors (dicts keyed like the data dicts) that yield the next batch
# each time they are evaluated in a session.
train_dataset_batch = train_dataset_iterator.get_next()
classifer_dataset_batch = classifer_dataset_iterator.get_next()
test_dataset_batch = test_dataset_iterator.get_next()

def get_images(sess, subset='train'):
    """Fetch the next batch of the requested subset by running it in `sess`.

    Args:
      sess: session-like object exposing `run`.
      subset: 'train', 'classifer' or 'test'.

    Returns:
      For 'train' and 'test': only the batch's 'images' entry.
      For 'classifer': the whole batch dict, so callers can also read its
      'labels' entry.

    Raises:
      ValueError: if `subset` is not one of the supported names (previously
        this case silently returned None).
    """
    if subset == 'train':
        return sess.run(train_dataset_batch)['images']
    if subset == 'classifer':
        # Deliberately not indexed: the caller needs images and labels.
        return sess.run(classifer_dataset_batch)
    if subset == 'test':
        return sess.run(test_dataset_batch)['images']
    raise ValueError(
        "unknown subset %r; expected 'train', 'classifer' or 'test'" % (subset,))

In [8]:
# 100k steps should take < 30 minutes on a modern (>= 2017) GPU.
# NOTE(review): the remark above mentions 100k steps, but the value set here is
# 10000 -- confirm which one is intended.
num_training_updates = 10000
# Base filter count handed to Encoder and Decoder as `num_hiddens`.
num_channels = 64

# This value is not that important, usually 64 works. This will not change the capacity in the information-bottleneck.
sub_dim = 64
num_latents = 4
# Codeword length given to the quantiser: sub_dim * num_latents.
embedding_dim = sub_dim*num_latents

# The higher this value, the higher the capacity in the information bottleneck.
num_embeddings = 32

# commitment_cost should be set appropriately. It's often useful to try a couple
# of values. It mostly depends on the scale of the reconstruction cost
# (log p(x|z)). So if the reconstruction cost is 100x higher, the
# commitment_cost should also be multiplied with the same amount.
commitment_cost = 7.5
# Step size for the optimizer.
learning_rate = 3e-4

# Build modules.
encoder = Encoder(num_channels)
decoder = Decoder(num_channels)
# Constructing the quantiser creates its codebook variable in the current graph.
vq_vae = OhVectorQuantizer(
    embedding_dim=embedding_dim,
    num_embeddings=num_embeddings,
    commitment_cost=commitment_cost)

In [9]:
# Input placeholder: a batch of image_size x image_size images with 3 channels.
x = tf.placeholder(tf.float32, shape=(None, image_size, image_size, 3))

# Bottleneck variants.  Exactly one should be active; the others are kept
# commented out for reference.
#AE
#z = bottleneck_flatten(encoder(x), embedding_dim, num_embeddings)
#input_decoder = bottleneck_deflatten(z['z_mean'],z['shape'])
#VAE
#z = bottleneck_concatenation(encoder(x), sub_dim, sub_dim)
#samples = sampling(z['z_mean'], z['z_log_var'])
#input_decoder = samples
#Information dropout
#z = bottleneck_concatenation(encoder(x), sub_dim, sub_dim)
#samples = information_dropout(z['z_mean'], sigma = 0.7*z['z_log_var'])
#input_decoder = samples
#vq-vae,
#z= bottleneck_concatenation(encoder(x), sub_dim, num_embeddings)
#vq_output_train = vq_vae(z, is_training=True)
#input_decoder = vq_output_train["quantize"]
# Active variant: flatten the encoder output, quantise the flat 'z_mean' code,
# then expand the quantised code back to the encoder's output shape.
z = bottleneck_flatten(encoder(x), embedding_dim, num_embeddings)
vq_output_train = vq_vae(z, is_training=True)
input_decoder = bottleneck_deflatten(vq_output_train["quantize"], z["shape"])


# Decoder and reconstruction error, shared by all variants
# (AE, VAE, information dropout, vq-vae).
x_recon = decoder(input_decoder)
# Per-image mean squared error divided by the training-data variance, then
# averaged over the batch.
recon_error = tf.reduce_mean(tf.reduce_mean((x_recon - x)**2,[1,2,3])/data_variance)  # Normalized MSE

# Loss for each variant; only the vq-vae one is active.
#AE
#loss = recon_error
#VAE
#kl_loss = -0.5*tf.reduce_sum(1.0 + z['z_log_var'] - tf.square(z['z_mean']) - tf.exp(z['z_log_var']),axis = 1)
#loss = image_size*image_size*recon_error + 100.0*tf.reduce_mean(kl_loss)
# Information dropout
#dropout_cost = -tf.reduce_mean(tf.log(z['z_log_var']/0.7 + 0.001))
#loss = image_size*image_size*recon_error + 0.5*dropout_cost
#vq-vae
# beta weights the quantiser loss against the reconstruction error.
beta = 2.0
loss = recon_error + beta*vq_output_train["loss"]

# Create optimizer and TF session.
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
train_op = optimizer.minimize(loss)
# Session options: allocate GPU memory on demand.
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
#saver = tf.train.Saver()

# Pass `config` so the GPU options configured above actually take effect; the
# session was previously created with default options and `config` was unused.
sess = tf.train.SingularMonitoredSession(config=config)

#Train.
# Reconstruction error of every training step, in order.
train_res_recon_error = []
#train_ratio = []
for i in xrange(num_training_updates):
    # Draw exactly one batch per step and feed that same batch to the update.
    # Previously this dict was built and then ignored while a second batch was
    # fetched inline, so every step pulled two batches and discarded one.
    feed_dict = {x: get_images(sess)}
    #results = sess.run([train_op, recon_error,ratio], feed_dict={x: get_images(sess),s_flag: s_f})
    results = sess.run([train_op, recon_error], feed_dict=feed_dict)
    train_res_recon_error.append(results[1])
    #train_ratio.append(results[2])

    # Report the mean error over the last 100 steps.
    if (i+1) % 100 == 0:
        print('%d iterations' % (i+1))
        print('recon_error: %.3f' % np.mean(train_res_recon_error[-100:]))
        print()
    
def get_session(sess):
    """Unwrap nested session wrappers until the object of class `Session` is found.

    Follows the private `_sess` attribute of each wrapper; the match is made
    on the class name only.

    Args:
      sess: a `Session` or an object wrapping one through `_sess`.

    Returns:
      The innermost object whose class is named 'Session'.
    """
    inner = sess
    while True:
        if type(inner).__name__ == 'Session':
            return inner
        inner = inner._sess
#saver.save(get_session(sess),local_data_dir+'soft_vqvae.ckpt')

INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
100 iterations
recon_error: 0.724

200 iterations
recon_error: 0.506

300 iterations
recon_error: 0.347

400 iterations
recon_error: 0.323

500 iterations
recon_error: 0.313

600 iterations
recon_error: 0.272

700 iterations
recon_error: 0.256

800 iterations
recon_error: 0.244

900 iterations
recon_error: 0.228

1000 iterations
recon_error: 0.223

1100 iterations
recon_error: 0.213

1200 iterations
recon_error: 0.206

1300 iterations
recon_error: 0.203

1400 iterations
recon_error: 0.200

1500 iterations
recon_error: 0.199

1600 iterations
recon_error: 0.199

1700 iterations
recon_error: 0.195

1800 iterations
recon_error: 0.197

1900 iterations
recon_error: 0.189

2000 iterations
recon_error: 0.184

2100 iterations
recon_error: 0.184

2200 iterations
recon_error: 0.183

2300 iterations
recon_error: 0.177

2400 iterations
recon_error: 0.180

2500 iterations
recon_err

In [10]:
# Number of examples whose latent codes are exported from each split.
train_num = 50000
test_num = 10000

# get all the training latent representations
# The 'classifer' iterator is single-pass, so it is (re)initialised first; it
# yields the un-shuffled training data as one dict of images and labels.
sess.run(classifer_dataset_iterator.initializer)
train_wholebatch = get_images(sess,'classifer')
encodings = sess.run(z["z_mean"], feed_dict = {x: train_wholebatch['images'][0:train_num,:,:,:]})
# One flat row per example.
encodings = np.reshape(encodings,(train_num,-1))
labels = train_wholebatch['labels'][0:train_num]
train_encodings = np.asarray(encodings)
train_labels = np.squeeze(np.asarray(labels), axis = 1)

# get all the test latent representations
sess.run(test_dataset_iterator.initializer)
test_wholebatch = get_images(sess, subset = 'test')
encodings = sess.run(z["z_mean"], feed_dict = {x: test_wholebatch[0:test_num,:,:,:]})
encodings = np.reshape(encodings,(test_num,-1))
# get_images('test') returns images only, so the labels come straight from the
# source dict; this lines up because the test pipeline does not shuffle.
labels = test_data_dict['labels'][0:test_num]
test_encodings = np.asarray(encodings)
test_labels = np.squeeze(np.asarray(labels), axis = 1)

# Persist encodings and labels.  The last path now carries an explicit '.npy'
# like the others; np.save appended it anyway, so the file on disk is the same.
np.save(local_data_dir+'train_encodings.npy', train_encodings)
np.save(local_data_dir+'train_labels.npy',train_labels)
np.save(local_data_dir+'test_encodings.npy',test_encodings)
np.save(local_data_dir+'test_labels.npy',test_labels)