In [None]:
# === Environment Setup ===
import numpy as np
import matplotlib.pyplot as plt
from IPython.display import display, Markdown, Image
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras.models import Model
from tensorflow.keras.datasets import mnist

# --- Configuration ---
# Plot defaults for the whole notebook: named matplotlib style sheet plus
# larger fonts and a fixed figure size / resolution.
plt.style.use('seaborn-v0_8-whitegrid')
plt.rcParams.update({
    'font.size': 14,
    'figure.figsize': (10, 6),
    'figure.dpi': 150,
})
# NumPy printing: 4 decimals, wide lines, and no scientific notation for small values.
np.set_printoptions(precision=4, linewidth=120, suppress=True)

# --- Utility Functions ---
def note(msg):
    """Display *msg* in the notebook output wrapped in an ``alert alert-info`` div."""
    html = f"<div class='alert alert-info'>📝 {msg}</div>"
    display(Markdown(html))
def sec(title):
    """Print *title* upper-cased between two 80-character ``=`` rules.

    The banner begins with a blank line so it stands apart from earlier output.
    """
    # Build the rule once; this also avoids nesting quotes inside the f-string,
    # which was the source of the original mismatched-quote syntax error.
    rule = "=" * 80
    print(f"\n{rule}\n| {title.upper()} |\n{rule}")

# Emit a visible confirmation that the setup cell ran to completion.
note("Environment initialized for Generative Models.")

# Chapter 7.13: Generative Models

---

### Table of Contents

1.  [**Introduction to Generative Modeling**](#intro)
2.  [**Variational Autoencoders (VAEs)**](#vae)
    - [Architecture and Key Idea](#vae-arch)
    - [The Reparameterization Trick](#reparam)
    - [Code Lab: VAE for MNIST Generation](#code-vae)
3.  [**Generative Adversarial Networks (GANs)**](#gan)
    - [Architecture and Key Idea](#gan-arch)
    - [The Adversarial Training Process](#gan-training)
    - [Code Lab: Simple GAN for MNIST Generation](#code-gan)
4.  [**A Brief History: Deep Belief Networks (DBNs)**](#dbn)
5.  [**Summary**](#summary)

<a id='intro'></a>
## 1. Introduction to Generative Modeling

While **discriminative models** learn the boundary between classes (e.g., classifying an image as a cat or a dog), **generative models** learn the underlying distribution of the data itself. Their goal is to learn a model of $P(X)$ (the probability of the data) rather than $P(y|X)$ (the probability of a label given the data).

This allows them to **generate new, synthetic data** that resembles the original data. This has profound applications, from creating artificial images and music to generating synthetic data for training other models, especially in situations where real-world data is scarce or private.

<a id='vae'></a>
## 2. Variational Autoencoders (VAEs)

<a id='vae-arch'></a>
### 2.1 Architecture and Key Idea

A Variational Autoencoder (VAE) is a generative evolution of the standard autoencoder. Instead of learning a single, fixed encoding for each input, a VAE learns a **probability distribution** for the latent space.

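The encoder doesn't output a single vector; instead, it outputs two vectors: a vector of means ($\mu$) and a vector of standard deviations ($\sigma$). In practice (as in the code lab below), the network predicts the log-variance $\log \sigma^2$ rather than $\sigma$ itself, so the output is unconstrained, and $\sigma$ is recovered as $\exp(\tfrac{1}{2}\log \sigma^2)$. These parameters define a Gaussian distribution in the latent space. To generate a latent vector, we then **sample** from this distribution. This stochasticity is the key to the VAE's generative power.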

In [None]:
# Render the VAE architecture figure inline; the relative path assumes the
# notebook runs from its own directory — TODO confirm the image location.
display(Image(filename='../images/07-Machine-Learning/vae_architecture.png'))

<a id='reparam'></a>
### 2.2 The Reparameterization Trick

A challenge in training VAEs is that backpropagation cannot flow through a random sampling node. To solve this, VAEs use the **reparameterization trick**. Instead of sampling directly from $\mathcal{N}(\mu, \sigma^2)$, we sample a random noise vector $\epsilon$ from a standard normal distribution $\mathcal{N}(0, I)$ and then compute the latent vector $z$ as:
$$ z = \mu + \sigma \odot \epsilon $$
This way, the random part of the process is external to the network, and the gradients can flow back through the $\mu$ and $\sigma$ vectors to train the encoder.

<a id='code-vae'></a>
### 2.3 Code Lab: VAE for MNIST Generation

In [None]:
# Print a banner marking the start of the VAE code lab output.
sec("Building and Training a VAE")

class Sampling(layers.Layer):
    """Uses (z_mean, z_log_var) to sample z, the vector encoding a digit.

    Implements the reparameterization trick: ``z = mean + sigma * epsilon``
    with ``epsilon`` drawn from a standard normal, so the randomness stays
    outside the differentiable path through ``z_mean`` and ``z_log_var``.
    """

    def call(self, inputs):
        """Return a latent sample for ``inputs = (z_mean, z_log_var)``.

        Both tensors must have the same shape; the result has that shape too.
        """
        z_mean, z_log_var = inputs
        # Use the public tf.random API instead of the legacy
        # `tf.keras.backend.random_normal` helper, and take shape/dtype from
        # z_mean so the noise always matches the incoming tensor.
        epsilon = tf.random.normal(shape=tf.shape(z_mean), dtype=z_mean.dtype)
        # exp(0.5 * log(sigma^2)) == sigma
        return z_mean + tf.exp(0.5 * z_log_var) * epsilon

# Number of latent dimensions; used by both encoder heads and the decoder input.
latent_dim = 2

# --- Encoder ---
# Maps a 28x28x1 image to the mean and log-variance of a Gaussian over the
# latent space, plus one sample z drawn from it.
encoder_inputs = tf.keras.Input(shape=(28, 28, 1))
# Two stride-2 "same" convolutions halve the spatial size twice: 28x28 -> 14x14 -> 7x7.
x = layers.Conv2D(32, 3, activation="relu", strides=2, padding="same")(encoder_inputs)
x = layers.Conv2D(64, 3, activation="relu", strides=2, padding="same")(x)
x = layers.Flatten()(x)
x = layers.Dense(16, activation="relu")(x)
# Two linear heads (no activation) so mean and log-variance are unconstrained.
z_mean = layers.Dense(latent_dim, name="z_mean")(x)
z_log_var = layers.Dense(latent_dim, name="z_log_var")(x)
# Draw z with the reparameterized Sampling layer.
z = Sampling()([z_mean, z_log_var])
# All three tensors are exposed: VAE.train_step uses z_mean/z_log_var for the
# KL term and z as the decoder input.
encoder = Model(encoder_inputs, [z_mean, z_log_var, z], name="encoder")

# --- Decoder ---
# Maps a latent vector back to a 28x28x1 image with pixel values in (0, 1).
latent_inputs = tf.keras.Input(shape=(latent_dim,))
# Project to 7*7*64 units and reshape into a 7x7x64 feature map.
x = layers.Dense(7 * 7 * 64, activation="relu")(latent_inputs)
x = layers.Reshape((7, 7, 64))(x)
# Two stride-2 "same" transposed convolutions upsample 7x7 -> 14x14 -> 28x28.
x = layers.Conv2DTranspose(64, 3, activation="relu", strides=2, padding="same")(x)
x = layers.Conv2DTranspose(32, 3, activation="relu", strides=2, padding="same")(x)
# Sigmoid output matches the [0, 1] pixel scaling and the binary cross-entropy loss.
decoder_outputs = layers.Conv2DTranspose(1, 3, activation="sigmoid", padding="same")(x)
decoder = Model(latent_inputs, decoder_outputs, name="decoder")

# --- VAE Model ---
class VAE(tf.keras.Model):
    """Keras model that trains an encoder/decoder pair with the VAE objective.

    The loss is the summed per-image reconstruction cross-entropy plus the KL
    divergence between the encoder's Gaussian and a standard normal prior.
    """

    def __init__(self, encoder, decoder, **kwargs):
        """Store the two sub-models and create running-mean loss trackers."""
        super().__init__(**kwargs)
        self.encoder = encoder
        self.decoder = decoder
        self.total_loss_tracker = tf.keras.metrics.Mean(name="total_loss")
        self.reconstruction_loss_tracker = tf.keras.metrics.Mean(name="reconstruction_loss")
        self.kl_loss_tracker = tf.keras.metrics.Mean(name="kl_loss")

    @property
    def metrics(self):
        """Trackers Keras should reset at the start of every epoch."""
        return [
            self.total_loss_tracker,
            self.reconstruction_loss_tracker,
            self.kl_loss_tracker,
        ]

    def train_step(self, data):
        """Run one optimization step on a batch of images and report the losses.

        Args:
            data: batch of input images; it is both the encoder input and the
                reconstruction target.

        Returns:
            Dict with the running means of the total, reconstruction and KL losses.
        """
        with tf.GradientTape() as tape:
            z_mean, z_log_var, z = self.encoder(data)
            reconstruction = self.decoder(z)

            # Cross-entropy per pixel, summed over height/width, averaged over the batch.
            pixel_bce = tf.keras.losses.binary_crossentropy(data, reconstruction)
            reconstruction_loss = tf.reduce_mean(tf.reduce_sum(pixel_bce, axis=(1, 2)))

            # Closed-form KL(N(mean, var) || N(0, 1)) per latent dimension,
            # summed over dimensions and averaged over the batch.
            kl_per_dim = -0.5 * (1 + z_log_var - tf.square(z_mean) - tf.exp(z_log_var))
            kl_loss = tf.reduce_mean(tf.reduce_sum(kl_per_dim, axis=1))

            total_loss = reconstruction_loss + kl_loss

        weights = self.trainable_weights
        grads = tape.gradient(total_loss, weights)
        self.optimizer.apply_gradients(zip(grads, weights))

        updates = (
            (self.total_loss_tracker, total_loss),
            (self.reconstruction_loss_tracker, reconstruction_loss),
            (self.kl_loss_tracker, kl_loss),
        )
        for tracker, value in updates:
            tracker.update_state(value)

        return {
            "loss": self.total_loss_tracker.result(),
            "reconstruction_loss": self.reconstruction_loss_tracker.result(),
            "kl_loss": self.kl_loss_tracker.result(),
        }

# --- Load Data and Train ---
# Labels are discarded; train and test images are pooled into one array.
(x_train, _), (x_test, _) = mnist.load_data()
mnist_digits = np.concatenate([x_train, x_test], axis=0)
# Add a trailing channel axis and rescale the pixel values by 1/255.
mnist_digits = mnist_digits[..., np.newaxis].astype("float32") / 255

vae = VAE(encoder, decoder)
vae.compile(optimizer=tf.keras.optimizers.Adam())
note("Training VAE for 5 epochs as a demonstration. Full training would require more epochs.")
vae.fit(mnist_digits, epochs=5, batch_size=128)

<a id='gan'></a>
## 3. Generative Adversarial Networks (GANs)

<a id='gan-arch'></a>
### 3.1 Architecture and Key Idea

Generative Adversarial Networks (GANs) are a powerful class of generative models introduced by Ian Goodfellow et al. in 2014. The core idea is an adversarial game between two neural networks:

1.  **The Generator (G)**: This network takes random noise as input and tries to generate fake data that looks like the real data.
2.  **The Discriminator (D)**: This network acts as a classifier. It is shown both real images (from the training set) and fake images (from the generator) and must learn to distinguish between them.

The two networks are trained simultaneously in a zero-sum game.

In [None]:
# Render the GAN architecture figure inline; the relative path assumes the
# notebook runs from its own directory — TODO confirm the image location.
display(Image(filename='../images/07-Machine-Learning/gan_architecture.png'))

<a id='gan-training'></a>
### 3.2 The Adversarial Training Process

- The **Discriminator** is trained to maximize the probability of correctly classifying real and fake images. Its loss is high when it is fooled by the generator.
- The **Generator** is trained to *minimize* the discriminator's ability to tell the difference. Its loss is high when the discriminator correctly identifies its output as fake.

Over time, this adversarial process leads to a dynamic equilibrium where the generator produces increasingly realistic images to fool the ever-improving discriminator.

<a id='code-gan'></a>
### 3.3 Code Lab: Simple GAN for MNIST Generation

In [None]:
sec("Building and Training a Simple GAN")

# --- 1. Load and preprocess data ---
# Only the training images are used; labels and the test split are discarded.
(train_images, _), (_, _) = mnist.load_data()
train_images = train_images.reshape(-1, 28, 28, 1).astype('float32')
# Normalize the images to [-1, 1] to match the generator's tanh output.
train_images = (train_images - 127.5) / 127.5
BUFFER_SIZE = 60000
BATCH_SIZE = 256
train_dataset = (
    tf.data.Dataset.from_tensor_slices(train_images)
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE)
)

# --- 2. Build the Generator ---
def make_generator_model():
    """Build the generator: a 100-d noise vector in, a 28x28x1 tanh image out."""
    # Project the noise and reshape it into a 7x7x256 feature map.
    projection = [
        layers.Dense(7*7*256, use_bias=False, input_shape=(100,)),
        layers.BatchNormalization(),
        layers.LeakyReLU(),
        layers.Reshape((7, 7, 256)),
    ]
    # Transposed convolutions: 7x7x128 (stride 1) -> 14x14x64 -> 28x28x1.
    upsampling = [
        layers.Conv2DTranspose(128, (5, 5), strides=(1, 1), padding='same', use_bias=False),
        layers.BatchNormalization(),
        layers.LeakyReLU(),
        layers.Conv2DTranspose(64, (5, 5), strides=(2, 2), padding='same', use_bias=False),
        layers.BatchNormalization(),
        layers.LeakyReLU(),
        layers.Conv2DTranspose(1, (5, 5), strides=(2, 2), padding='same', use_bias=False, activation='tanh'),
    ]
    return tf.keras.Sequential(projection + upsampling)

# --- 3. Build the Discriminator ---
def make_discriminator_model():
    """Build the discriminator: a 28x28x1 image in, a single real/fake logit out."""
    # Two stride-2 convolution stages, each followed by LeakyReLU and dropout.
    feature_extractor = [
        layers.Conv2D(64, (5, 5), strides=(2, 2), padding='same', input_shape=[28, 28, 1]),
        layers.LeakyReLU(),
        layers.Dropout(0.3),
        layers.Conv2D(128, (5, 5), strides=(2, 2), padding='same'),
        layers.LeakyReLU(),
        layers.Dropout(0.3),
    ]
    # No activation on the final unit: the output is a raw logit.
    classifier_head = [
        layers.Flatten(),
        layers.Dense(1),
    ]
    return tf.keras.Sequential(feature_extractor + classifier_head)

# --- 4. Define Loss and Optimizers ---
# from_logits=True because the discriminator's final Dense(1) has no activation,
# so its outputs are raw logits rather than probabilities.
cross_entropy = tf.keras.losses.BinaryCrossentropy(from_logits=True)

def discriminator_loss(real_output, fake_output):
    """Return the discriminator loss: real images labelled 1, generated images labelled 0.

    Args:
        real_output: discriminator logits for real images.
        fake_output: discriminator logits for generated images.
    """
    loss_on_real = cross_entropy(tf.ones_like(real_output), real_output)
    loss_on_fake = cross_entropy(tf.zeros_like(fake_output), fake_output)
    return loss_on_real + loss_on_fake

def generator_loss(fake_output):
    """Return the generator loss: low when the discriminator scores fakes as real.

    Args:
        fake_output: discriminator logits for generated images.
    """
    # The generator wants every generated image to be classified as real (1).
    target_labels = tf.ones_like(fake_output)
    return cross_entropy(target_labels, fake_output)

# One Adam optimizer (learning rate 1e-4) per network, since train_step applies
# the generator and discriminator gradients separately.
generator_optimizer = tf.keras.optimizers.Adam(1e-4)
discriminator_optimizer = tf.keras.optimizers.Adam(1e-4)

# --- 5. Training Step ---
# Both networks are created once at module level; train_step refers to them as globals.
generator = make_generator_model()
discriminator = make_discriminator_model()

@tf.function
def train_step(images):
    """Run one simultaneous gradient update of the generator and the discriminator.

    Args:
        images: one batch of real training images, scaled like the generator output.
    """
    # Size the noise batch from the real batch rather than the global BATCH_SIZE,
    # so the final, smaller batch of an epoch is paired with an equal number of
    # fakes. The noise width (100) must match the generator's input_shape.
    batch_size = tf.shape(images)[0]
    noise = tf.random.normal([batch_size, 100])

    # Two tapes: each network's gradients are taken w.r.t. its own loss.
    with tf.GradientTape() as gen_tape, tf.GradientTape() as disc_tape:
        generated_images = generator(noise, training=True)

        real_output = discriminator(images, training=True)
        fake_output = discriminator(generated_images, training=True)

        gen_loss = generator_loss(fake_output)
        disc_loss = discriminator_loss(real_output, fake_output)

    gradients_of_generator = gen_tape.gradient(gen_loss, generator.trainable_variables)
    gradients_of_discriminator = disc_tape.gradient(disc_loss, discriminator.trainable_variables)

    generator_optimizer.apply_gradients(zip(gradients_of_generator, generator.trainable_variables))
    discriminator_optimizer.apply_gradients(zip(gradients_of_discriminator, discriminator.trainable_variables))

# --- 6. Train the Model ---
def train(dataset, epochs):
    """Train the GAN by running ``train_step`` on every batch for ``epochs`` passes.

    Args:
        dataset: iterable of real-image batches.
        epochs: number of full passes over ``dataset``.
    """
    # `display` in this notebook is the IPython display *function*, which has no
    # `clear_output` attribute; import the helper explicitly instead.
    from IPython.display import clear_output

    for epoch in range(epochs):
        for image_batch in dataset:
            train_step(image_batch)
        # Replace the previous epoch's status message rather than stacking them.
        clear_output(wait=True)
        note(f"Epoch {epoch + 1} complete.")

# Short demonstration run over the shuffled, batched MNIST training set.
note("Training GAN for 5 epochs as a demonstration. Full training is computationally intensive.")
train(train_dataset, 5)

<a id='dbn'></a>
## 4. A Brief History: Deep Belief Networks (DBNs)

Before the rise of VAEs and GANs, **Deep Belief Networks (DBNs)** were a significant breakthrough in deep learning, pioneered by Geoffrey Hinton. DBNs are generative graphical models composed of multiple layers of latent variables ('hidden units'), where each layer is a **Restricted Boltzmann Machine (RBM)**.

They are trained using a greedy, layer-by-layer unsupervised pre-training algorithm. While they have largely been superseded by VAEs and GANs for most generative tasks due to their computational expense and the difficulty of training, they were historically crucial for demonstrating that deep neural networks could be trained effectively, paving the way for the deep learning revolution.

In [None]:
# Render the DBN architecture figure inline; the relative path assumes the
# notebook runs from its own directory — TODO confirm the image location.
display(Image(filename='../images/07-Machine-Learning/dbn_architecture.jpg'))

<a id='summary'></a>
## 5. Summary

Generative models represent a major frontier in machine learning. VAEs and GANs are two of the most important architectures. VAEs learn a smooth, probabilistic latent space, making them excellent for tasks requiring a well-structured encoding. GANs, through their adversarial training, can often produce sharper, more realistic samples, but can be more unstable to train. Both have driven significant advances in AI's ability to create and understand complex data.