In [2]:
# prerequisites
import os

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable
from torchvision import datasets, transforms
from torchvision.utils import save_image

# Reproducibility: seed PyTorch's global RNG so that weight initialisation,
# DataLoader shuffling and the sampled noise repeat across "Restart & Run All".
SEED = 42
torch.manual_seed(SEED)

# Device configuration: use the GPU when one is available, otherwise the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device



### **MNIST Dataset Setup**
This section prepares the MNIST dataset for training and testing purposes.

1. **`transforms.Compose`**:
   - This is used to chain multiple transformations together, applied to the dataset.
   - Transformations included:
     - **`transforms.ToTensor()`**: Converts images (PIL or NumPy) into PyTorch tensors. The image pixel values are scaled to the range `[0, 1]` (normalized by dividing by 255).
     - **`transforms.Normalize(mean, std)`**: Normalizes the tensor image by subtracting the mean and dividing by the standard deviation for each channel. 
       - Here, `mean=(0.5,)` and `std=(0.5,)` are used for normalization (one value each, because MNIST images have a single grayscale channel). This rescales the pixel values to the range `[-1, 1]` because:
         - Original range after `ToTensor()`: `[0, 1]`.
         - Subtract 0.5 (mean): `[-0.5, 0.5]`.
         - Divide by 0.5 (std): `[-1, 1]`.


In [13]:
bs = 100

# MNIST Dataset
# ToTensor yields values in [0, 1]; Normalize with mean 0.5 / std 0.5 on the
# single channel then maps them to [-1, 1].
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(mean=(0.5,), std=(0.5,)),
])

# Only the training split requests a download; the test split is read from the
# same root directory with download disabled.
train_dataset = datasets.MNIST(
    root='./mnist_data/',
    train=True,
    transform=transform,
    download=True,
)
test_dataset = datasets.MNIST(
    root='./mnist_data/',
    train=False,
    transform=transform,
    download=False,
)

# Data Loader (Input Pipeline): training batches are shuffled, test batches are not.
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=bs, shuffle=True)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=bs, shuffle=False)

## **1. Generator**
The **Generator** is a neural network that creates synthetic data (e.g., fake images) starting from random noise. It learns to produce data that resembles the real dataset, fooling the discriminator.

### **Code Explanation**
- **`__init__` Method**:
  - `g_input_dim`: Dimension of the random noise input (e.g., a vector of size 100).
  - `g_output_dim`: Dimension of the output (e.g., size of the real data, such as 784 for MNIST images of 28x28 pixels).
  - `self.fc1`, `self.fc2`, `self.fc3`, `self.fc4`: Fully connected (linear) layers. The network grows in feature size as it progresses:
    - Input is transformed to 256 features in `fc1`.
    - Doubled in `fc2`, then doubled again in `fc3`.
    - Finally, it outputs the desired dimension (`g_output_dim`) in `fc4`.

- **`forward` Method**:
  - Takes in the random noise input `x`.
  - Passes `x` through the fully connected layers with activation functions:
    - **`F.leaky_relu`**: A variant of ReLU (Rectified Linear Unit) activation function with a small slope for negative values (`0.2` in this case). Helps gradients flow better during training, avoiding dead neurons.
    - **`torch.tanh`**: Applied to the final layer to constrain the output between `-1` and `1`, which is often done for datasets normalized to this range (e.g., image datasets).

### **Flow of Data in Generator**:
1. Random noise `x` → `fc1` → Leaky ReLU.
2. Output → `fc2` → Leaky ReLU.
3. Output → `fc3` → Leaky ReLU.
4. Output → `fc4` → Tanh → Final generated data.

---

## **2. Discriminator**
The **Discriminator** is a neural network that classifies whether the input data is real (from the actual dataset) or fake (generated by the Generator). It outputs a single value (a probability).

### **Code Explanation**
- **`__init__` Method**:
  - `d_input_dim`: Dimension of the input data (e.g., 784 for MNIST images).
  - `self.fc1`, `self.fc2`, `self.fc3`, `self.fc4`: Fully connected layers. The network reduces feature size as it progresses:
    - The input is first mapped to 1024 features in `fc1`.
    - Halved in `fc2`, then halved again in `fc3`.
    - Outputs a single value (probability) in `fc4`.

- **`forward` Method**:
  - Takes input data `x` (either real or fake).
  - Passes `x` through the layers with:
    - **`F.leaky_relu`**: Activates the neurons similarly to the Generator.
    - **`F.dropout`**: Randomly drops some neurons (30% dropout here) to prevent overfitting and improve generalization.
    - **`torch.sigmoid`**: Applied to the final layer to squash the output into a probability between `0` and `1`.

### **Flow of Data in Discriminator**:
1. Input `x` → `fc1` → Leaky ReLU → Dropout.
2. Output → `fc2` → Leaky ReLU → Dropout.
3. Output → `fc3` → Leaky ReLU → Dropout.
4. Output → `fc4` → Sigmoid → Final probability (real or fake).

---

## **How They Work Together**
1. **Generator**:
   - Takes random noise as input.
   - Outputs fake data resembling the real dataset.

2. **Discriminator**:
   - Takes either real data or fake data (from the Generator) as input.
   - Outputs a probability:
     - Close to 1 for real data.
     - Close to 0 for fake data.

3. **Training**:
   - The Generator tries to fool the Discriminator by generating more realistic data.
   - The Discriminator tries to improve its ability to distinguish real data from fake data.
   - This adversarial process helps the Generator improve over time, producing increasingly realistic data.

---

### **Key Points**
- **Generator** expands dimensions and learns to generate realistic data.
- **Discriminator** reduces dimensions and learns to classify real vs. fake data.
- The adversarial nature of GANs drives both networks to improve iteratively.

In [4]:
class Generator(nn.Module):
    """Fully connected generator: noise vector -> flattened image.

    Four linear layers of widths 256 -> 512 -> 1024 -> ``g_output_dim``.
    The three hidden layers use LeakyReLU (slope 0.2); the output layer uses
    tanh, so every generated value lies in [-1, 1].

    Args:
        g_input_dim: Size of the input noise vector.
        g_output_dim: Size of the generated output vector.
    """

    def __init__(self, g_input_dim, g_output_dim):
        super().__init__()
        base_width = 256
        # Each hidden layer doubles the width of the previous one.
        self.fc1 = nn.Linear(g_input_dim, base_width)
        self.fc2 = nn.Linear(base_width, base_width * 2)
        self.fc3 = nn.Linear(base_width * 2, base_width * 4)
        self.fc4 = nn.Linear(base_width * 4, g_output_dim)

    def forward(self, x):
        """Map a batch of noise vectors to a batch of generated samples."""
        for hidden_layer in (self.fc1, self.fc2, self.fc3):
            x = F.leaky_relu(hidden_layer(x), negative_slope=0.2)
        return torch.tanh(self.fc4(x))
    
class Discriminator(nn.Module):
    """Fully connected discriminator: flattened image -> probability of "real".

    Four linear layers of widths 1024 -> 512 -> 256 -> 1. Each hidden layer is
    followed by LeakyReLU (slope 0.2) and dropout (p = 0.3); the output layer
    is passed through a sigmoid, so the result lies in (0, 1).

    Args:
        d_input_dim: Size of the flattened input vector.
    """

    def __init__(self, d_input_dim):
        super(Discriminator, self).__init__()
        self.fc1 = nn.Linear(d_input_dim, 1024)
        self.fc2 = nn.Linear(self.fc1.out_features, self.fc1.out_features//2)
        self.fc3 = nn.Linear(self.fc2.out_features, self.fc2.out_features//2)
        self.fc4 = nn.Linear(self.fc3.out_features, 1)

    # forward method
    def forward(self, x):
        """Return the predicted probability, shape ``(batch, 1)``, that each input is real."""
        # F.dropout defaults to training=True, which would keep dropping units
        # even after D.eval(). Tie it to the module's mode so evaluation is
        # deterministic while training behaviour is unchanged.
        x = F.leaky_relu(self.fc1(x), 0.2)
        x = F.dropout(x, 0.3, training=self.training)
        x = F.leaky_relu(self.fc2(x), 0.2)
        x = F.dropout(x, 0.3, training=self.training)
        x = F.leaky_relu(self.fc3(x), 0.2)
        x = F.dropout(x, 0.3, training=self.training)
        return torch.sigmoid(self.fc4(x))

### **1. Build Network**
This section defines the **Generator (G)** and **Discriminator (D)** networks and sets them up for training.

---

### **Code Breakdown**

#### **1.1 `z_dim`**
```python
z_dim = 100
```
- This defines the **dimension of the random noise vector** $z$ that will be input to the Generator.
- $z$ is a point in the latent space; each vector is sampled randomly (e.g., from a normal or uniform distribution).
- Commonly used values for `z_dim` are 64, 100, or 128.

---

#### **1.2 `mnist_dim`**
```python
mnist_dim = train_dataset.train_data.size(1) * train_dataset.train_data.size(2)
```
- This calculates the **dimensionality of the MNIST dataset images**.
  - `train_dataset.train_data` is a tensor containing the training images (28x28 grayscale images for MNIST). Note that recent torchvision releases expose the same tensor as `train_dataset.data`; `train_data` is a deprecated alias that emits a warning.
  - `size(1)` gives the height of each image (28).
  - `size(2)` gives the width of each image (28).
  - Multiplying them gives the total number of pixels: $28 \times 28 = 784$.
- `mnist_dim` is therefore set to 784, which is the size of both:
  - The **Discriminator's input**, which is the flattened 784-dimensional image data.
  - The **Generator's output**, which is fake data of the same dimension as real MNIST images.

---

#### **1.3 Create the Generator**
```python
G = Generator(g_input_dim=z_dim, g_output_dim=mnist_dim).to(device)
```
- **Generator Initialization**:
  - `g_input_dim=z_dim`: The Generator's input is the random noise vector $z$ of size 100.
  - `g_output_dim=mnist_dim`: The Generator's output is the flattened MNIST image data of size 784.
- **`.to(device)`**:
  - Moves the Generator model to the specified device (`device`), which could be either CPU or GPU.
  - This ensures all computations for the Generator are performed on the correct device.

---

#### **1.4 Create the Discriminator**
```python
D = Discriminator(mnist_dim).to(device)
```
- **Discriminator Initialization**:
  - `d_input_dim=mnist_dim`: The Discriminator's input is the flattened MNIST image data (real or fake) of size 784.
- **`.to(device)`**:
  - Moves the Discriminator model to the specified device (CPU or GPU).

---

### **Summary**
1. The **Generator**:
   - Takes a 100-dimensional random noise vector as input.
   - Outputs a 784-dimensional vector representing a fake MNIST image.

2. The **Discriminator**:
   - Takes a 784-dimensional vector as input (real MNIST image or fake image from the Generator).
   - Outputs a probability indicating whether the input is real or fake.

3. Both networks are transferred to the specified device (`device`) to ensure compatibility with hardware (e.g., GPU acceleration).

---

### **Key Concepts**
- **Why `z_dim`?**
  - The random noise vector $z$ serves as the input to the Generator. It is mapped to the data distribution of real images through the Generator's learned weights.
  
- **Why `mnist_dim`?**
  - The MNIST images are flattened into 1D vectors of size 784 to be compatible with fully connected (linear) layers in both the Generator and Discriminator.

- **Device Management**:
  - Using `.to(device)` ensures that both the models and data tensors are placed on the same hardware (CPU or GPU). This is essential for efficient computation, especially during training.

In [5]:
# build network
z_dim = 100  # length of the noise vector fed to the Generator

# Flattened image size (height * width). `.data` is used instead of
# `.train_data`, which torchvision keeps only as a deprecated alias that
# emits a warning.
mnist_dim = train_dataset.data.size(1) * train_dataset.data.size(2)

# The Generator maps noise -> flattened image; the Discriminator takes a flattened image.
G = Generator(g_input_dim = z_dim, g_output_dim = mnist_dim).to(device)
D = Discriminator(mnist_dim).to(device)



In [6]:
# Display the Generator's layer structure.
G

Generator(
  (fc1): Linear(in_features=100, out_features=256, bias=True)
  (fc2): Linear(in_features=256, out_features=512, bias=True)
  (fc3): Linear(in_features=512, out_features=1024, bias=True)
  (fc4): Linear(in_features=1024, out_features=784, bias=True)
)

In [7]:
# Display the Discriminator's layer structure.
D

Discriminator(
  (fc1): Linear(in_features=784, out_features=1024, bias=True)
  (fc2): Linear(in_features=1024, out_features=512, bias=True)
  (fc3): Linear(in_features=512, out_features=256, bias=True)
  (fc4): Linear(in_features=256, out_features=1, bias=True)
)

### **1. Loss Function**
```python
criterion = nn.BCELoss()
```

- **`nn.BCELoss()`**:
  - **Binary Cross-Entropy Loss** is used to measure the difference between predicted probabilities and actual binary labels (real or fake).
  - In the context of GANs:
    - The Discriminator outputs a probability (via `torch.sigmoid`) for whether input data is real or fake.
    - The Generator’s goal is to maximize the Discriminator's error, i.e., make its fake outputs appear real.
  - Formula for Binary Cross-Entropy Loss:

    $$
    \text{BCE Loss} = - \frac{1}{N} \sum_{i=1}^N \left[ y_i \cdot \log(p_i) + (1 - y_i) \cdot \log(1 - p_i) \right]
    $$

    - $y_i$: True label (1 for real, 0 for fake).
    - $p_i$: Predicted probability.
  - This loss helps:
    - The **Discriminator** learn to classify real vs. fake images.
    - The **Generator** improve so that its fake images are classified as real.



In [8]:
# loss: binary cross-entropy on the Discriminator's sigmoid output
criterion = nn.BCELoss()

# optimizer: one Adam instance per network, both with the same learning rate
lr = 2e-4
G_optimizer, D_optimizer = (
    optim.Adam(network.parameters(), lr=lr) for network in (G, D)
)

Let’s break down the function `D_train(x)` step by step. This function is responsible for **training the Discriminator (D)** in a GAN framework.

---

### **1. Function Purpose**
The goal of `D_train(x)` is to update the Discriminator so it can:
1. Maximize the probability of correctly classifying **real images** as real.
2. Minimize the probability of incorrectly classifying **fake images** (generated by the Generator) as real.

---

### **2. Key Steps**

#### **Step 1: Zero Gradients**
```python
D.zero_grad()
```
- This clears the gradients of the Discriminator's parameters before the backward pass.
- Ensures that accumulated gradients from previous iterations do not interfere with the current one.

---

#### **Step 2: Train Discriminator on Real Data**
```python
x_real, y_real = x.view(-1, mnist_dim), torch.ones(bs, 1)
x_real, y_real = Variable(x_real.to(device)), Variable(y_real.to(device))
```
- **`x_real`**:
  - `x` is the batch of real MNIST images (from the dataset).
  - `x.view(-1, mnist_dim)` flattens each 28x28 image into a 1D vector of size 784 (`mnist_dim`).
- **`y_real`**:
  - A tensor of size `(batch_size, 1)` filled with ones (indicating real images).
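- **Convert to `Variable` and move to `device`**:
  - `.to(device)` places the tensors on the same device as the models (GPU if available). `Variable` is a legacy wrapper: since PyTorch 0.4, plain tensors track gradients themselves, so the wrapping has no effect and is kept only for backward compatibility.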

```python
D_output = D(x_real)
D_real_loss = criterion(D_output, y_real)
D_real_score = D_output
```
- **`D_output`**:
  - The Discriminator predicts a probability for each image being real.
  - Output shape: `(batch_size, 1)`.
- **`D_real_loss`**:
  - Binary Cross-Entropy Loss between the predicted probabilities and the real label (`1`).
  - Encourages the Discriminator to classify real images correctly.
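- **`D_real_score`**:
  - An alias for the Discriminator's predicted probabilities on the real images. It is not used further in the function, but is a convenient name when inspecting how confident the Discriminator is.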

---

#### **Step 3: Train Discriminator on Fake Data**
```python
z = Variable(torch.randn(bs, z_dim).to(device))
x_fake, y_fake = G(z), Variable(torch.zeros(bs, 1).to(device))
```
- **`z`**:
  - A batch of random noise vectors sampled from a standard normal distribution.
  - Shape: `(batch_size, z_dim)`.
- **`x_fake`**:
  - Fake images generated by the Generator from the noise vectors.
  - Shape: `(batch_size, mnist_dim)` (same as real images).
- **`y_fake`**:
  - A tensor of size `(batch_size, 1)` filled with zeros (indicating fake images).

```python
D_output = D(x_fake)
D_fake_loss = criterion(D_output, y_fake)
D_fake_score = D_output
```
- **`D_output`**:
  - The Discriminator predicts a probability for each fake image being real.
  - Output shape: `(batch_size, 1)`.
- **`D_fake_loss`**:
  - Binary Cross-Entropy Loss between the predicted probabilities and the fake label (`0`).
  - Encourages the Discriminator to classify fake images as fake.
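- **`D_fake_score`**:
  - An alias for the Discriminator's predicted probabilities on the fake images. It is not used further in the function, but is a convenient name when inspecting how confident the Discriminator is.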

---

#### **Step 4: Compute Total Discriminator Loss**
```python
D_loss = D_real_loss + D_fake_loss
```
- The total Discriminator loss is the sum of:
  - `D_real_loss`: Loss on real images.
  - `D_fake_loss`: Loss on fake images.
- This ensures the Discriminator learns from both real and fake samples.

---

#### **Step 5: Backward Pass and Optimizer Step**
```python
D_loss.backward()
D_optimizer.step()
```
- **`D_loss.backward()`**:
  - Computes the gradients of the loss with respect to the Discriminator's parameters.
- **`D_optimizer.step()`**:
  - Updates the Discriminator's parameters based on the computed gradients.

---

#### **Step 6: Return Discriminator Loss**
```python
return D_loss.data.item()
```
- Returns the total Discriminator loss as a scalar for logging or monitoring.

---

### **Summary**
1. **Real Data**:
   - The Discriminator learns to classify real images as real $( y = 1 )$
   - Loss: $ \mathcal{L}_{D_{\text{real}}} = -\log(D(x_{\text{real}})) $
2. **Fake Data**:
   - The Discriminator learns to classify fake images as fake $( y = 0 )$
   - Loss: $ \mathcal{L}_{D_{\text{fake}}} = -\log(1 - D(x_{\text{fake}})) $
3. **Total Loss**:
   - $ \mathcal{L}_D = \mathcal{L}_{D_{\text{real}}} + \mathcal{L}_{D_{\text{fake}}} $
4. **Optimization**:
   - The Discriminator's parameters are updated to minimize $ \mathcal{L}_D $

---

### **Flow Overview**
1. Clear previous gradients: `D.zero_grad()`.
2. Compute loss on real data (real images labeled as real).
3. Compute loss on fake data (fake images labeled as fake).
4. Sum up the losses.
5. Backpropagate and update the Discriminator's weights.
6. Return the loss for monitoring.

---

### **Key Concepts**
- The Discriminator's objective is to maximize accuracy for real vs. fake classification.
- **Real Label = 1**, **Fake Label = 0**:
  - Real images: Push $ D(x_{\text{real}}) \to 1 $
  - Fake images: Push $ D(x_{\text{fake}}) \to 0 $
- Only the Discriminator's parameters are updated in this step (only `D_optimizer.step()` is called); the Generator's weights remain untouched.

In [9]:
def D_train(x):
    """Run one optimisation step of the Discriminator.

    The Discriminator is scored on the given real batch (target 1) and on an
    equally sized batch of freshly generated images (target 0); the two BCE
    losses are summed and only D's parameters are updated.

    Args:
        x: Batch of real images; it is flattened to ``(batch, mnist_dim)``.

    Returns:
        float: The summed real + fake discriminator loss for this step.
    """
    #=======================Train the discriminator=======================#
    D.zero_grad()

    # train discriminator on real
    x_real = x.view(-1, mnist_dim).to(device)
    # Size the labels and the noise from the batch actually received rather
    # than the global `bs`, so a shorter (e.g. final) batch does not cause a
    # shape mismatch in the loss.
    batch_size = x_real.size(0)
    y_real = torch.ones(batch_size, 1, device=device)

    D_output = D(x_real)
    D_real_loss = criterion(D_output, y_real)
    D_real_score = D_output  # D's probabilities on the real batch (not used further here)

    # train discriminator on fake
    z = torch.randn(batch_size, z_dim, device=device)
    y_fake = torch.zeros(batch_size, 1, device=device)
    # G is not being trained in this step, so generate without building a
    # graph: the backward pass then stops at D's input and leaves G's
    # gradients untouched.
    with torch.no_grad():
        x_fake = G(z)

    D_output = D(x_fake)
    D_fake_loss = criterion(D_output, y_fake)
    D_fake_score = D_output  # D's probabilities on the fake batch (not used further here)

    # gradient backprop & optimize ONLY D's parameters
    D_loss = D_real_loss + D_fake_loss
    D_loss.backward()
    D_optimizer.step()

    return D_loss.item()

In [10]:
def G_train(x):
    """Run one optimisation step of the Generator.

    A batch of ``bs`` noise vectors is turned into fake images, the
    Discriminator scores them, and the Generator is updated so that those
    scores move towards the "real" target of 1.

    Args:
        x: Unused by this function; the training loop passes the current real
            batch to both training functions.

    Returns:
        float: The generator's BCE loss for this step.
    """
    #=======================Train the generator=======================#
    G.zero_grad()

    # Create the tensors directly on the target device; the legacy `Variable`
    # wrapper is unnecessary because plain tensors already support autograd.
    z = torch.randn(bs, z_dim, device=device)
    y = torch.ones(bs, 1, device=device)  # target "real" so G is rewarded for fooling D

    G_output = G(z)
    D_output = D(G_output)
    G_loss = criterion(D_output, y)

    # gradient backprop & optimize ONLY G's parameters
    G_loss.backward()
    G_optimizer.step()

    return G_loss.item()

In [None]:
n_epoch = 200
for epoch in range(1, n_epoch + 1):
    # Per-batch losses, collected afresh for every epoch.
    D_losses, G_losses = [], []

    for batch_idx, (x, _) in enumerate(train_loader):
        # One Discriminator step, then one Generator step, on the same batch.
        D_loss, G_loss = D_train(x), G_train(x)

        D_losses.append(D_loss)
        G_losses.append(G_loss)

    # Mean of the per-batch losses over the epoch.
    avg_D_loss, avg_G_loss = (
        sum(losses) / len(losses) for losses in (D_losses, G_losses)
    )

    # One summary line per epoch.
    print(f'[{epoch}/{n_epoch}]: loss_d: {avg_D_loss:.3f}, loss_g: {avg_G_loss:.3f}')

[1/200]: loss_d: 0.999, loss_g: 2.759
[2/200]: loss_d: 1.076, loss_g: 1.555
[3/200]: loss_d: 0.892, loss_g: 1.973
[4/200]: loss_d: 0.585, loss_g: 2.597
[5/200]: loss_d: 0.583, loss_g: 2.475
[6/200]: loss_d: 0.638, loss_g: 2.468
[7/200]: loss_d: 0.595, loss_g: 2.396
[8/200]: loss_d: 0.621, loss_g: 2.401
[9/200]: loss_d: 0.582, loss_g: 2.597
[10/200]: loss_d: 0.646, loss_g: 2.321
[11/200]: loss_d: 0.709, loss_g: 2.203
[12/200]: loss_d: 0.750, loss_g: 2.110
[13/200]: loss_d: 0.742, loss_g: 2.058
[14/200]: loss_d: 0.745, loss_g: 2.039
[15/200]: loss_d: 0.762, loss_g: 1.979
[16/200]: loss_d: 0.836, loss_g: 1.832
[17/200]: loss_d: 0.798, loss_g: 1.896
[18/200]: loss_d: 0.772, loss_g: 1.932
[19/200]: loss_d: 0.837, loss_g: 1.768
[20/200]: loss_d: 0.884, loss_g: 1.671
[21/200]: loss_d: 0.884, loss_g: 1.673
[22/200]: loss_d: 0.916, loss_g: 1.581
[23/200]: loss_d: 0.907, loss_g: 1.576
[24/200]: loss_d: 0.934, loss_g: 1.569
[25/200]: loss_d: 0.908, loss_g: 1.610
[26/200]: loss_d: 0.958, loss_g: 1

In [11]:
# save_image does not create missing folders, so make sure the target exists.
os.makedirs('./samples', exist_ok=True)

# Sampling only: no gradients are needed.
with torch.no_grad():
    test_z = torch.randn(bs, z_dim, device=device)
    generated = G(test_z)

    # G ends in tanh, so pixel values lie in [-1, 1]. save_image (with its
    # default normalize=False) clamps to [0, 1], which would turn every
    # negative value black; map the range back to [0, 1] before saving.
    images = (generated.view(generated.size(0), 1, 28, 28) + 1) / 2
    save_image(images, './samples/sample_' + '.png')

In [None]:
# Show the device selected at the top of the notebook.
device