ML Course, Bogotá, Colombia  (&copy; Josh Bloom; June 2019)

In [48]:
# Execute the helper script ../talktools.py inside this kernel (its contents are
# not visible in this notebook).
%run ../talktools.py

<Figure size 432x288 with 0 Axes>

## Autoencoders

A form of non-parametric representation learning with neural nets, where the architecture of the network is used to reduce the dimensionality of the data. We will see forms of parametric dimensionality reduction in Lecture 7. 

As the name suggests, autoencoders use the data itself to learn the best way to represent it in a compact way--it's a form of semantic compression. This is a family of self- (or un-) supervised modeling.

<img src="https://lilianweng.github.io/lil-log/assets/images/autoencoder-architecture.png">
Source: https://lilianweng.github.io/lil-log/2018/08/12/from-autoencoder-to-beta-vae.html

In practice, we take an input X (which may be a 1-d vector, 2-d image, ...) and try to squeeze it down to a smaller number of values in the "bottleneck" layer and then uncompress it back to its original shape and form. The loss function that we construct will be the way in which the network learns on each backprop through the data.

Let's look at an autoencoder which uses convnets to reconstruct the Fashion-MNIST dataset.

In [None]:
import datetime, os
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

import tensorflow.keras
from tensorflow.keras.datasets import mnist
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Dense, Dropout, Flatten, Input
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Reshape, \
                                                            Activation, BatchNormalization, UpSampling2D
from tensorflow.keras import backend as K
import tensorflow as tf

# Print the version string of the Keras API exposed as tensorflow.keras
print(tensorflow.keras.__version__)

In [None]:
from tensorflow.keras.utils import to_categorical

fashion_mnist = tf.keras.datasets.fashion_mnist

nb_classes = 10        # number of label classes used for the one-hot encoding below
batch_size = 128       # mini-batch size reused by the training/prediction cells
bottleneck_size = 64   # width of the autoencoder's "bottleneck" Dense layer

(x_train, y_train), (x_test, y_test) = fashion_mnist.load_data()
x_train, x_test = x_train / 255.0, x_test / 255.0  # scale the images to 0-1

# add an explicit trailing channel axis: (n_images, 28, 28) -> (n_images, 28, 28, 1)
x_train = np.reshape(x_train, (len(x_train), 28, 28, 1))  # adapt this if using `channels_first` image data format
x_test = np.reshape(x_test, (len(x_test), 28, 28, 1))  # adapt this if using `channels_first` image data format

# convert class vectors to binary class matrices
Y_train = to_categorical(y_train, nb_classes)
Y_test = to_categorical(y_test, nb_classes)

# After the reshape above every image already carries its channel axis, so the
# per-image shape is (28, 28, 1).  Appending another `(1,)` would produce the
# wrong 4-d shape (28, 28, 1, 1).
input_shape = x_train[0].shape
input_img = Input(shape=input_shape)

In [None]:
# ---- Encoder: three conv + max-pool stages; only the first one is batch-normalised ----
x = input_img
for n_filters, use_batchnorm in ((16, True), (8, False), (8, False)):
    x = Conv2D(n_filters, (3, 3), activation='relu', padding='same')(x)
    if use_batchnorm:
        x = BatchNormalization()(x)
    x = MaxPooling2D((2, 2), padding='same')(x)
# at this point the representation is (4, 4, 8) i.e. 128-dimensional

# ---- Bottleneck: flatten and project down to `bottleneck_size` values ----
flat = Flatten()(x)
bottleneck = Dense(bottleneck_size, name="bottleneck")(flat)

# ---- Decoder: expand back to (4, 4, 8), then three conv + upsample stages ----
x = Dense(128)(bottleneck)
x = Reshape((4, 4, 8))(x)
# The last stage uses an unpadded ('valid') 3x3 convolution, which trims the
# 16x16 map to 14x14 so that the final upsampling lands on 28x28.
for n_filters, conv_padding in ((8, 'same'), (8, 'same'), (16, 'valid')):
    x = Conv2D(n_filters, (3, 3), activation='relu', padding=conv_padding)(x)
    x = UpSampling2D((2, 2))(x)
decoded = Conv2D(1, (3, 3), activation='sigmoid', padding='same')(x)

# add with tf.device('/gpu:0'): if on GPU
autoencoder = Model(input_img, decoded)

In [None]:
# Print a layer-by-layer summary of the autoencoder built above
autoencoder.summary()

In [None]:
import tensorflow 

# Everything produced by this run (TensorBoard logs + checkpoint) goes under nn_results/
os.makedirs("nn_results", exist_ok=True)

logdir = os.path.join("nn_results", datetime.datetime.now().strftime("%Y%m%d-%H%M%S"))

# UTC timestamp at minute resolution.  Formatted without ':' (which isoformat()
# would insert) so the checkpoint name is a valid file name on every platform,
# and built from an aware datetime instead of the deprecated utcnow().
run_time_string = datetime.datetime.now(datetime.timezone.utc).strftime("%Y-%m-%dT%H%M")

model_path = f'nn_results/colombia_autoencoder_nn_{run_time_string}.h5'
print(f"Training ... {model_path}")


# keep only the weights of the epoch with the lowest validation loss
model_check = tf.keras.callbacks.ModelCheckpoint(model_path,
                                                 monitor='val_loss',
                                                 save_best_only=True,
                                                 mode='min',
                                                 verbose=1)

# `write_grads`, `embeddings_layer_names` and `embeddings_data` are not part of the
# TF2 TensorBoard callback signature; they were only ever passed at their default
# values here, so leaving them out does not change what is logged.
tensorboard_callback = tf.keras.callbacks.TensorBoard(logdir,
                                                      histogram_freq=0,
                                                      write_graph=True,
                                                      write_images=False,
                                                      embeddings_freq=0,
                                                      embeddings_metadata=None)

autoencoder.compile(optimizer='adam', loss='mean_squared_error')

# The target equals the input: the network is trained to reproduce each image.
# NOTE(review): shuffle=False keeps the sample order fixed in every epoch, whereas
# the denoising run later in the notebook uses shuffle=True -- confirm this is intended.
autoencoder_train = autoencoder.fit(x_train, x_train,
                                    batch_size=batch_size, epochs=10,
                                    verbose=1, shuffle=False,
                                    validation_data=(x_test, x_test),
                                    callbacks=[tensorboard_callback, model_check])

In [None]:
decoded_imgs = autoencoder.predict(x_test)

n = 10
plt.figure(figsize=(20, 4))

# Top row: original test images.  Bottom row: the autoencoder's reconstructions.
for row, images in enumerate((x_test, decoded_imgs)):
    for col in range(n):
        ax = plt.subplot(2, n, row * n + col + 1)
        plt.imshow(images[col].reshape(28, 28))
        plt.gray()
        # hide the tick marks and labels on both axes
        for axis in (ax.get_xaxis(), ax.get_yaxis()):
            axis.set_visible(False)
plt.show()

In [None]:
# The encoder shares its layers (and trained weights) with `autoencoder`; it just
# stops at the bottleneck layer.
encoder = Model(input_img, bottleneck)
encoded_imgs = encoder.predict(x_test)

# Lay each code vector out on the most nearly square (rows, cols) grid that holds
# exactly `code_size` values, so the plot also works for bottleneck sizes that are
# not perfect squares (64 -> 8x8 as before, 32 -> 4x8, 16 -> 4x4, ...).
code_size = encoded_imgs.shape[1]
code_rows = max(d for d in range(1, int(np.sqrt(code_size)) + 1) if code_size % d == 0)
code_cols = code_size // code_rows

n = 10
plt.figure(figsize=(20, 8))
for i in range(n):
    ax = plt.subplot(1, n, i+1)
    plt.imshow(encoded_imgs[i].reshape(code_rows, code_cols).T, cmap="viridis")
    ax.get_xaxis().set_visible(False)
    ax.get_yaxis().set_visible(False)
plt.show()

Think of the above as the (encoded) high-level concept of each image. Can you see the similarities across classes?

What can we do with this network?

  - **Remote compression** - if we have the encoder on our phone (or some other device) we can encode images there and send the bottleneck elsewhere (more than a factor of 10 compression with this network). We can later decode the data elsewhere to get back a fair representation of the original data. This is a great bandwidth saver! It also creates an interesting level of anonymity (if the bottleneck data is intercepted, no one without the decoder could figure out what the original images looked like).
   
  - **Classification** - we just built 64 features without using, for example, our knowledge of computer vision. And we avoided hand-coding 64 separate features.
  
  - **Clustering** - this is a non-parametric way of finding a low-dimensional embedding of our data. We'll see more of this in Lecture 7.

# Breakout

a) Using the autoencoder model above, create a random forest model to predict the classes of the images using the 64-parameter bottleneck layer. What accuracy do you get? How does it compare with the accuracy we got on the `convnet` model before?

b) Experiment with trying a different sized layer (e.g., size 4, 16, 32) and repeat step a) above. Do you see any trends with bottleneck size?

## Denoising Autoencoder

Another application of autoencoders is to construct a model which takes low signal-to-noise data and produces high signal-to-noise versions.  For example, https://thenextweb.com/insider/2017/02/08/google-figured-out-a-way-to-zoom-and-enhance-photos-just-like-in-the-movies/

In [None]:
from IPython.display import YouTubeVideo
# Embed the video starting 27 s in.  The offset is passed through the `start`
# player parameter rather than being appended to the video id.
YouTubeVideo('I_8ZH1Ggjk0', start=27)

Here the model is very similar to the one before, except that we feed in a corrupted/low SNR version as the input and compare the output against a higher fidelity version.

In [None]:
noise_factor = 0.3  # scale applied to the unit-variance Gaussian noise (pixels lie in 0-1)

# A dedicated, seeded generator makes the corrupted images identical on every
# run of the notebook (the unseeded global RNG gave different noise each time).
noise_rng = np.random.default_rng(42)

x_train_noisy = x_train + noise_factor * noise_rng.normal(loc=0.0, scale=1.0, size=x_train.shape)
x_test_noisy = x_test + noise_factor * noise_rng.normal(loc=0.0, scale=1.0, size=x_test.shape)

# adding noise can push pixels outside the valid range, so clip back to [0, 1]
x_train_noisy = np.clip(x_train_noisy, 0., 1.)
x_test_noisy = np.clip(x_test_noisy, 0., 1.)

In [None]:
n = 10
plt.figure(figsize=(20, 4))

# Top row: noise-corrupted training images.  Bottom row: the clean (high SNR)
# images they were generated from.
for row, images in enumerate((x_train_noisy, x_train)):
    for col in range(n):
        ax = plt.subplot(2, n, row * n + col + 1)
        plt.imshow(images[col].reshape(28, 28))
        plt.gray()
        # hide the tick marks and labels on both axes
        for axis in (ax.get_xaxis(), ax.get_yaxis()):
            axis.set_visible(False)
plt.show()



In [None]:
# UTC timestamp at minute resolution, formatted without ':' so that the checkpoint
# name is a valid file name on every platform (and without the deprecated utcnow()).
run_time_string = datetime.datetime.now(datetime.timezone.utc).strftime("%Y-%m-%dT%H%M")

os.makedirs("nn_results", exist_ok=True)
model_path = f'nn_results/colombia_denoise_autoencoder_nn_{run_time_string}.h5'
print(f"Training ... {model_path}")


# keep only the weights of the epoch with the lowest validation loss
model_check = tf.keras.callbacks.ModelCheckpoint(model_path,
                                                 monitor='val_loss',
                                                 save_best_only=True,
                                                 mode='min',
                                                 verbose=1)

# Give this run its own TensorBoard directory.  Reusing the callback (and log
# directory) of the earlier clean-image run would write both runs' epochs into
# the same event files and tangle their curves.
denoise_logdir = os.path.join("nn_results",
                              "denoise-" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S"))
denoise_tensorboard_callback = tf.keras.callbacks.TensorBoard(denoise_logdir)

# NOTE: `autoencoder` is the same model object that was fitted on clean images
# earlier in the notebook; compile() does not reset its weights, so this fit
# continues from them.
autoencoder.compile(optimizer='adam', loss='mean_squared_error')

# inputs are the noisy images, targets the clean ones
autoencoder_train = autoencoder.fit(x_train_noisy, x_train,
                                    batch_size=batch_size, epochs=25,
                                    verbose=1, shuffle=True,
                                    validation_data=(x_test_noisy, x_test),
                                    callbacks=[denoise_tensorboard_callback, model_check])

In [None]:
decoded_imgs = autoencoder.predict(x_test_noisy)

n = 10
plt.figure(figsize=(20, 4))

# Top row: noisy test images fed to the network.  Bottom row: its denoised output.
for row, images in enumerate((x_test_noisy, decoded_imgs)):
    for col in range(n):
        ax = plt.subplot(2, n, row * n + col + 1)
        plt.imshow(images[col].reshape(28, 28))
        plt.gray()
        # hide the tick marks and labels on both axes
        for axis in (ax.get_xaxis(), ax.get_yaxis()):
            axis.set_visible(False)
plt.show()

## Variational Autoencoder

> A VAE "is an autoencoder that learns a latent variable model for its input data. So instead of letting your neural network learn an arbitrary function, you are learning the parameters of a probability distribution modeling your data. If you sample points from this distribution, you can generate new input data samples: a VAE is a 'generative model'." -- https://blog.keras.io/building-autoencoders-in-keras.html

<img src="https://lilianweng.github.io/lil-log/assets/images/vae-gaussian.png">

> First, an encoder network turns the input samples x into two parameters in a latent space, which we will note `z_mean` and `z_log_sigma`. Then, we randomly sample similar points z from the latent normal distribution that is assumed to generate the data, via `z = z_mean + exp(z_log_sigma) * epsilon`, where `epsilon` is a random normal tensor. Finally, a decoder network maps these latent space points back to the original input data.

Here, the loss includes the standard loss (e.g., MSE) for the image reconstruction and a measure of the difference between the distribution of the original data mapped to latent space and the random sampling in latent space. This K-L divergence is a form of information gain, a non-symmetric measure of the difference between two probability distributions.

In [None]:
from tensorflow.keras import backend as K
from tensorflow.keras.layers import Lambda

latent_dim = 2  # dimensionality of the VAE latent space
input_img = Input(shape=(28, 28, 1))

# Convolutional feature extractor: three conv + max-pool stages, with batch
# normalisation after the first convolution only.
x = input_img
for n_filters, use_batchnorm in ((16, True), (8, False), (8, False)):
    x = Conv2D(n_filters, (3, 3), activation='relu', padding='same')(x)
    if use_batchnorm:
        x = BatchNormalization()(x)
    x = MaxPooling2D((2, 2), padding='same')(x)
# at this point the representation is (4, 4, 8) i.e. 128-dimensional


# shape info needed to build decoder model
shape = K.int_shape(x)

# generate latent vector Q(z|X): a small dense layer followed by two parallel
# heads giving the mean and the log-variance of the latent Gaussian
x = Flatten()(x)
x = Dense(16, activation='relu')(x)
z_mean = Dense(latent_dim, name='z_mean')(x)
z_log_var = Dense(latent_dim, name='z_log_var')(x)

# reparameterization trick
# rather than drawing z from Q(z|X) directly, draw eps ~ N(0, I)
# and form z = z_mean + sqrt(var) * eps
def sampling(args):
    """Draw a latent vector via the reparameterization trick.

    Noise is sampled from an isotropic unit Gaussian and then shifted and
    scaled by the supplied mean and standard deviation.

    # Arguments
        args (tensor): pair (mean, log of variance) of Q(z|X)
    # Returns
        z (tensor): sampled latent vector
    """
    mean, log_var = args
    # batch size is read at run time, the latent dimension from the static shape
    noise_shape = (K.shape(mean)[0], K.int_shape(mean)[1])
    # random_normal defaults to mean=0 and std=1.0
    epsilon = K.random_normal(shape=noise_shape)
    # exp(0.5 * log(var)) == sqrt(var), i.e. the standard deviation
    std = K.exp(0.5 * log_var)
    return mean + std * epsilon


# use reparameterization trick to push the sampling out as input
# note that "output_shape" isn't necessary with the TensorFlow backend
z = Lambda(sampling, output_shape=(latent_dim,), name='z')([z_mean, z_log_var])

# instantiate encoder model (outputs: mean, log-variance and a sampled z)
encoder = Model(input_img, [z_mean, z_log_var, z], name='encoder')
encoder.summary()

# build decoder model: expand the latent vector back to the encoder's final
# feature-map shape, then run three conv + upsample stages
latent_inputs = Input(shape=(latent_dim,), name='z_sampling')
x = Dense(shape[1] * shape[2] * shape[3], activation='relu')(latent_inputs)
x = Reshape((shape[1], shape[2], shape[3]))(x)

# the last stage uses an unpadded ('valid') convolution
for n_filters, conv_padding in ((8, 'same'), (8, 'same'), (16, 'valid')):
    x = Conv2D(n_filters, (3, 3), activation='relu', padding=conv_padding)(x)
    x = UpSampling2D((2, 2))(x)
decoder_outputs = Conv2D(1, (3, 3), activation='sigmoid', padding='same')(x)

# instantiate decoder model
decoder = Model(latent_inputs, decoder_outputs, name='decoder')
decoder.summary()

# instantiate VAE model: feed the encoder's sampled z (its third output) to the decoder
_, _, z_sampled = encoder(input_img)
outputs = decoder(z_sampled)
vae = Model(input_img, outputs, name='vae')

In [None]:
# Bundle the two VAE sub-models and the labelled test set into tuples.
# NOTE(review): neither `models` nor `data` is referenced in the visible part of
# this notebook -- confirm they are still needed.
models = (encoder, decoder)
data = (x_test, y_test)

In [None]:
from tensorflow.keras.losses import mse, binary_crossentropy
image_size = x_train.shape[1]
n_pixels = image_size * image_size

# VAE loss = mse + kl_loss
# Reconstruction term: mean squared error over the flattened tensors, rescaled
# by the number of pixels in one image.
per_pixel_mse = mse(K.flatten(input_img), K.flatten(outputs))
reconstruction_loss = per_pixel_mse * n_pixels

# this is the K-L divergence loss (information gain) between the encoder's
# Gaussian (z_mean, z_log_var) and a unit Gaussian, summed over latent dimensions
kl_terms = 1 + z_log_var - K.square(z_mean) - K.exp(z_log_var)
kl_loss = K.sum(kl_terms, axis=-1) * -0.5

vae_loss = K.mean(reconstruction_loss + kl_loss)
# the loss is attached to the model itself, so compile() needs no `loss=` argument
vae.add_loss(vae_loss)
vae.compile(optimizer='adam')
vae.summary()

In [None]:
# True  -> train the VAE from scratch and save its weights
# False -> restore weights saved by an earlier run
FIT = False
vae_weights_path = 'nn_results/colombia_vae_cnn_fashion_70.h5'

if FIT:
    # The loss was attached with add_loss(), so no targets are passed.
    vae.fit(x_train,
            epochs=50,
            batch_size=batch_size,
            validation_data=(x_test, None))
    os.makedirs(os.path.dirname(vae_weights_path), exist_ok=True)
    vae.save_weights(vae_weights_path)
else:
    # The file was written with save_weights(), so it holds weights only and must
    # be read back with load_weights(); load_model() expects a fully serialised
    # model.  Loading into the existing `vae` also fills `encoder` and `decoder`,
    # because they are the very layers `vae` is built from -- rebinding `vae` to
    # a freshly loaded model would leave those two untrained.
    vae.load_weights(vae_weights_path)



In [None]:
# List what is currently stored under nn_results/ (shell command run through IPython's `!`)
!ls nn_results/*

In [None]:
# The encoder returns [z_mean, z_log_var, z]; the scatter plot uses the means.
x_test_encoded = encoder.predict(x_test, batch_size=batch_size)
z_mean_test = x_test_encoded[0]

lookup = {0: "T-shirt/top",
          1: "Trouser",
          2: "Pullover",
          3: "Dress",
          4: "Coat",
          5: "Sandal",
          6: "Shirt",
          7: "Sneaker",
          8: "Bag",
          9: "Ankle boot"}

# discrete 10-colour version of 'coolwarm', one colour per class
# (pyplot.get_cmap replaces the deprecated plt.cm.get_cmap)
cmap = plt.get_cmap('coolwarm', 10)

plt.figure(figsize=(8, 6))
# vmin/vmax of -0.5/9.5 give every integer label its own colour band and put
# the colorbar ticks at the centre of each band
plt.scatter(z_mean_test[:, 1], z_mean_test[:, 0], c=y_test, cmap=cmap, s=4,
            vmin=-0.5, vmax=9.5)
plt.xlabel('z_mean[1]')
plt.ylabel('z_mean[0]')
cbar = plt.colorbar(ticks=np.arange(10), label='clothes type')
cbar.ax.set_yticklabels([lookup[i] for i in range(10)])
plt.show()

Each of these colored clusters is a type of clothing. Close clusters are clothes that are structurally similar (i.e. clothes that share information in the latent space).

Since VAEs are generative models, we can also use them to generate new clothes. Here we will scan the latent plane, sampling latent points at regular intervals, and generating the corresponding piece of clothing for each of these points. This gives us a visualization of the latent manifold that "generates" new clothes.

In [None]:
# display a 2D manifold of the clothes
n = 15  # figure with 15x15 clothes
im_size = 28
# we will sample n points within [-5, 5] standard deviations
grid_x = np.linspace(-5, 5, n)
grid_y = np.linspace(-5, 5, n)

epsilon_std = 1.0

# Build all n*n latent samples up front and decode them with a single predict()
# call instead of one call per tile.  As before, tile (row i, column j) is
# decoded from z = [grid_y[j], grid_x[i]].
z_first, z_second = np.meshgrid(grid_y, grid_x)  # z_first[i, j] = grid_y[j]; z_second[i, j] = grid_x[i]
z_samples = np.stack([z_first.ravel(), z_second.ravel()], axis=1) * epsilon_std
x_decoded = decoder.predict(z_samples)

# (n*n, 28, 28, 1) -> (row, col, y, x) -> (row, y, col, x) -> one big mosaic image
figure = (x_decoded.reshape(n, n, im_size, im_size)
                   .transpose(0, 2, 1, 3)
                   .reshape(n * im_size, n * im_size))

fig, ax = plt.subplots(figsize=(15, 15))

ax.imshow(figure, cmap="viridis")
ax.grid(False)
ax.get_xaxis().set_visible(False)
ax.get_yaxis().set_visible(False)
plt.show()