# VAE Image generation

Cassandra Maldonado

In [1]:
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import matplotlib.pyplot as plt
from tensorflow.keras.preprocessing.image import load_img, img_to_array
import os

In [2]:
tf.random.set_seed(42)
np.random.seed(42)

In [3]:
# Parameters
latent_dim = 2
input_shape = (200, 200, 3)

In [4]:
# Defining the encoder
encoder_inputs = keras.Input(shape=input_shape)
x = layers.Conv2D(32, 3, activation="relu", strides=2, padding="same")(encoder_inputs)
x = layers.Conv2D(64, 3, activation="relu", strides=2, padding="same")(x)
x = layers.Conv2D(128, 3, activation="relu", strides=2, padding="same")(x)
x = layers.Conv2D(256, 3, activation="relu", strides=2, padding="same")(x)
x = layers.Flatten()(x)
x = layers.Dense(16, activation="relu")(x)

z_mean = layers.Dense(latent_dim, name="z_mean")(x)
z_log_var = layers.Dense(latent_dim, name="z_log_var")(x)

In [5]:
# Sampling layer, this layer will sample from the learned latent space using the mean and log variance from the encoder.
class Sampling(layers.Layer):
    """Uses (z_mean, z_log_var) to sample z, the vector encoding the image in latent space."""
    def call(self, inputs):
        z_mean, z_log_var = inputs
        batch = tf.shape(z_mean)[0]
        dim = tf.shape(z_mean)[1]
        epsilon = tf.keras.backend.random_normal(shape=(batch, dim))
        return z_mean + tf.exp(0.5 * z_log_var) * epsilon

z = Sampling()([z_mean, z_log_var])

In [6]:
# Building the encoder model, it will take the input image and output the mean and log variance of the latent space and the sampled latent vector.
encoder = keras.Model(encoder_inputs, [z_mean, z_log_var, z], name="encoder")
encoder.summary()

In [8]:
# Defining the decoder, which will take the sampled latent vector and reconstruct the image.
# The decoder will reverse the operations of the encoder and it will take the latent vector and upsample it back to the original image size.
# After 4 Conv2D layers with strides=2, the 200x200 image is reduced to 200/(2^4) = 12.5
# We'll use 13x13 as the starting point for the decoder
latent_inputs = keras.Input(shape=(latent_dim,))
x = layers.Dense(13 * 13 * 256, activation="relu")(latent_inputs)
x = layers.Reshape((13, 13, 256))(x)
x = layers.Conv2DTranspose(128, 3, activation="relu", strides=2, padding="same")(x)
x = layers.Conv2DTranspose(64, 3, activation="relu", strides=2, padding="same")(x)
x = layers.Conv2DTranspose(32, 3, activation="relu", strides=2, padding="same")(x)
decoder_outputs = layers.Conv2DTranspose(3, 3, activation="sigmoid", strides=2, padding="same")(x)

In [9]:
# Building the decoder model, it will take the latent vector and output the reconstructed image.
decoder = keras.Model(latent_inputs, decoder_outputs, name="decoder")
decoder.summary()

In [10]:
# Defining the VAE model
outputs = decoder(encoder(encoder_inputs)[2])
vae = keras.Model(encoder_inputs, outputs, name="vae")

In [17]:
# Defining the loss function, which is a combination of the reconstruction loss and the KL divergence loss.
# The reconstruction loss measures how well the decoder can reconstruct the original image from the latent vector.
# The KL divergence loss measures how well the learned latent space matches a normal distribution.

reconstruction_loss = keras.losses.MeanSquaredError()(
    keras.layers.Flatten()(encoder_inputs), keras.layers.Flatten()(outputs)
)
reconstruction_loss *= 200 * 200 * 3  # Scale by input dimensions

kl_loss = -0.5 * tf.reduce_mean(
    1 + z_log_var - tf.square(z_mean) - tf.exp(z_log_var)
)
vae_loss = reconstruction_loss + kl_loss

vae.add_loss(vae_loss)
vae.compile(optimizer="adam")

ValueError: Cannot broadcast shape, the failure dim has value 120000, which cannot be broadcasted to 129792. Input shapes are: [None, 120000] and [None, 129792].