# Libraries

In [1]:
import torch
from torch import nn
from tqdm.auto import tqdm
from torchvision import transforms
from torchvision.datasets import MNIST
from torchvision.utils import make_grid
from torch.utils.data import DataLoader
import matplotlib.pyplot as plt
import numpy as np
from torch.nn import functional as F
import torchvision
torch.manual_seed(0) # Set for testing purposes, please do not change!

<torch._C.Generator at 0x1e0bb0d7230>

# Google Colab / Your Device
**Because the directory layout on your own device (laptop) differs from the one on Google Drive, choose your environment in this section first.**

In [2]:
def directory(mode):
    """Return the dataset and checkpoint directories for the chosen environment.

    Parameters:
        mode: 'colab' for the Google Drive layout, 'pc' for the local Windows layout.

    Returns:
        (root_ds, root_models): the dataset root and the model-checkpoint root, as strings.

    Raises:
        ValueError: if mode is neither 'colab' nor 'pc'.
    """
    if mode == 'colab':
        root_ds = '/content/drive/MyDrive/gan-lab/Dataset'
        root_models = '/content/drive/MyDrive/gan-lab/Models'

    elif mode == 'pc':
        # Raw strings: the backslashes are literal path separators, not escape sequences.
        root_ds = r'F:\GitHub\gan-lab\Dataset'
        root_models = r'F:\GitHub\gan-lab\Models'

    else:
        # Fail with a clear message instead of an UnboundLocalError at the return below.
        raise ValueError(f"mode must be 'colab' or 'pc', got {mode!r}")

    return root_ds, root_models

In [3]:
# Choose the environment ('colab' or 'pc') and resolve its dataset / checkpoint folders.
dir_mode = 'pc'
root_ds, root_models = directory(mode=dir_mode)

# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(device)

cuda


**NOTE:**
Unlike the basic GAN, where you worked with nodes (fully connected units), in a DCGAN you work with channels (convolutional feature maps).

# Information

1. We have NO pooling layers.
2. We have 2D batchnorm layer in both G and D.
3. We have NO fully connected hidden layer (nn.Linear).
4. ReLU in hidden layers - Tanh in final layer (Generators)
5. LeakyReLU in hidden layers - NO activation in final layer (Discriminator)
6. You will build a generator using 4 layers (3 hidden layers + 1 output layer)
7. You will use 3 layers in your discriminator's neural network

# Generator

In [4]:
class Generator(nn.Module):
    """Transposed-convolution generator: noise of shape (N, C_noise, 1, 1) -> images.

    Four stages are stacked. With the kernel/stride choices below and no padding,
    a 1x1 input grows 1 -> 3 -> 6 -> 13 -> 28 pixels per side.

    Parameters:
        C_noise: number of channels of the input noise
        C_hidden: base width of the hidden stages (used as x4, x2, x1)
        C_image: number of channels of the generated image
    """

    def __init__(self, C_noise, C_hidden, C_image):
        super().__init__()

        # (C_in, C_out, kernel, stride, is_output_stage), listed in forward order
        stage_specs = [
            (C_noise,      C_hidden * 4, 3, 2, False),
            (C_hidden * 4, C_hidden * 2, 4, 1, False),
            (C_hidden * 2, C_hidden * 1, 3, 2, False),
            (C_hidden * 1, C_image,      4, 2, True),
        ]
        self.gen = nn.Sequential(*[
            self.gen_block(c_in, c_out, K=kernel, S=stride, final_layer=is_last)
            for c_in, c_out, kernel, stride, is_last in stage_specs
        ])

    def gen_block(self, C_in, C_out, K, S, final_layer=False):
        """Build one stage: ConvTranspose2d followed by BatchNorm+ReLU, or by Tanh if final."""
        upsample = nn.ConvTranspose2d(C_in, C_out, kernel_size=K, stride=S)
        if final_layer:
            # Output stage: no normalisation, squash to [-1, 1]
            tail = [nn.Tanh()]
        else:
            tail = [nn.BatchNorm2d(C_out), nn.ReLU(inplace=True)]
        return nn.Sequential(upsample, *tail)

    def forward(self, noise):
        """Run the noise batch through all stages and return the generated images."""
        images = self.gen(noise)
        return images

# Discriminator

In [5]:
class Discriminator(nn.Module):
    """Convolutional critic: images of shape (N, C_image, H, W) -> one score row per image.

    Three strided convolutions (kernel 4, stride 2, no padding) are stacked; a
    28x28 input shrinks 28 -> 13 -> 5 -> 1 pixels per side. The last stage has
    no normalisation and no activation, so the scores are unbounded.

    NOTE(review): the hidden stages use BatchNorm2d; the WGAN-GP paper advises
    against batch normalisation in the critic -- confirm this is intended.

    Parameters:
        C_image: number of channels of the input image
        C_hidden: base width of the hidden stages (used as x1, x2)
    """

    def __init__(self, C_image, C_hidden):
        super().__init__()

        # (C_in, C_out, is_output_stage), listed in forward order
        stage_specs = [
            (C_image,      C_hidden * 1, False),
            (C_hidden * 1, C_hidden * 2, False),
            (C_hidden * 2, 1,            True),
        ]
        self.dis = nn.Sequential(*[
            self.dis_block(c_in, c_out, K=4, S=2, final_layer=is_last)
            for c_in, c_out, is_last in stage_specs
        ])

    def dis_block(self, C_in, C_out, K, S, final_layer=False):
        """Build one stage: Conv2d alone if final, else Conv2d + BatchNorm + LeakyReLU(0.2)."""
        layers = [nn.Conv2d(C_in, C_out, kernel_size=K, stride=S)]
        if not final_layer:
            layers.append(nn.BatchNorm2d(C_out))
            layers.append(nn.LeakyReLU(negative_slope=0.2, inplace=True))
        return nn.Sequential(*layers)

    def forward(self, x):
        """Score a batch of images; the result is flattened to (batch, -1)."""
        scores = self.dis(x)
        return scores.view(x.shape[0], -1)

# Noise/Epsilon

In [6]:
def get_noise(N_noise, C_noise, device='cpu'):
    """Draw standard-normal noise shaped (N_noise, C_noise, 1, 1).

    Parameters:
        N_noise: number of noise samples
        C_noise: number of channels per sample
        device: device on which the tensor is created
    """
    flat_noise = torch.randn(N_noise, C_noise, device=device)
    # Append two singleton spatial axes so the noise can feed a ConvTranspose2d stack
    return flat_noise.view(-1, C_noise, 1, 1)

# Losses (NEW)

## 1. Discriminator Loss

for $i=1, \ldots, m$ do
Sample real data $\boldsymbol{x} \sim \mathbb{P}_{r}$, latent variable $\boldsymbol{z} \sim p(\boldsymbol{z})$, a random number $\epsilon \sim U[0,1] .$
<br>
$\tilde{\boldsymbol{x}} \leftarrow G_{\theta}(\boldsymbol{z})$
$\hat{\boldsymbol{x}} \leftarrow \epsilon \boldsymbol{x}+(1-\epsilon) \tilde{\boldsymbol{x}}$
$L^{(i)} \leftarrow D_{w}(\tilde{\boldsymbol{x}})-D_{w}(\boldsymbol{x})+\lambda\left(\left\|\nabla_{\hat{\boldsymbol{x}}} D_{w}(\hat{\boldsymbol{x}})\right\|_{2}-1\right)^{2}$

### A. epsilon

In [7]:
def get_epsilon(N_epsilon, device='cpu'):
    """Draw one uniform [0, 1) mixing coefficient per sample, shaped (N_epsilon, 1, 1, 1).

    The singleton trailing axes let the coefficients broadcast over image tensors.
    The returned tensor is a leaf with requires_grad=True.
    """
    eps_shape = (N_epsilon, 1, 1, 1)
    epsilon = torch.rand(eps_shape, device=device)
    return epsilon.requires_grad_(True)

### B. Calculate Gradient of Discriminator with respect to Data
**$ \nabla_{\hat{\boldsymbol{x}}} D_{w}(\hat{\boldsymbol{x}}) $**

In [8]:
def get_gradient(dis, data):
    """
    Differentiate the critic's scores with respect to its input batch.

    Parameters:
        dis: the critic model (any callable mapping a data batch to scores)
        data: a batch of data; it is switched to requires_grad=True in place
    Returns:
        gradient: tensor with the same shape as data, holding d(scores)/d(data)
    """
    # Autograd must track the input itself, not only the model parameters
    data.requires_grad_(True)

    scores = dis(data)

    # grad_outputs of ones weights every score equally. create_graph=True keeps
    # the result differentiable, so a penalty built from it can be backpropagated.
    (gradient,) = torch.autograd.grad(
        outputs=scores,
        inputs=data,
        grad_outputs=torch.ones_like(scores),
        create_graph=True,
        retain_graph=True,
    )
    return gradient

In [9]:
# test

### C. Calculate Gradient Penalty given a gradient

$ \left(\left\|\nabla_{\hat{\boldsymbol{x}}} D_{w}(\hat{\boldsymbol{x}})\right\|_{2}-1\right)^{2} $

In [10]:
def gradient_penalty(gradient):
    '''
    Compute the gradient penalty from a batch of per-sample gradients.

    Each sample's gradient is flattened, its L2 norm is taken, and the squared
    distance of that norm from 1 is averaged over the batch.

    Parameters:
        gradient: the gradient of the discriminator's scores with respect to the data
        e.g. shape : (128, 1, 28, 28)

    Returns:
        penalty: the gradient penalty
        e.g. shape : () (a scalar tensor)
    '''
    batch = gradient.shape[0]

    # One row per sample, e.g. (128, 1, 28, 28) ==> (128, 784)
    per_sample = gradient.view(batch, -1)

    # L2 norm of every row, e.g. (128, 784) ==> (128,)
    norms = per_sample.norm(2, dim=1)

    # Mean squared distance of the norms from 1, e.g. (128,) ==> scalar
    squared_gap = (norms - 1.0) ** 2
    return squared_gap.mean(dim=0)

In [11]:
# test

### D. Calculate Discriminator Loss

for $i=1, \ldots, m$ do
Sample real data $\boldsymbol{x} \sim \mathbb{P}_{r}$, latent variable $\boldsymbol{z} \sim p(\boldsymbol{z})$, a random number $\epsilon \sim U[0,1] .$
<br>
$\tilde{\boldsymbol{x}} \leftarrow G_{\theta}(\boldsymbol{z})$
$\hat{\boldsymbol{x}} \leftarrow \epsilon \boldsymbol{x}+(1-\epsilon) \tilde{\boldsymbol{x}}$
$L^{(i)} \leftarrow D_{w}(\tilde{\boldsymbol{x}})-D_{w}(\boldsymbol{x})+\lambda\left(\left\|\nabla_{\hat{\boldsymbol{x}}} D_{w}(\hat{\boldsymbol{x}})\right\|_{2}-1\right)^{2}$

In [12]:
def get_loss_dis(gen, dis,
                real,
                N_noise, C_noise,
                c_lambda,
                device):
    '''
    WGAN-GP loss of the discriminator (critic):
        D(fake) - D(real) + c_lambda * gradient_penalty(mixed)

    Parameters:
        gen: the generator model
        dis: the discriminator (critic) model
        real: a batch of real images
        N_noise: number of fake samples to generate; it must equal the number
                 of real images so that real and fake can be mixed element-wise
        C_noise: number of channels of the noise fed to the generator
        c_lambda: weight of the gradient penalty term
        device: device on which noise and epsilon are created

    Returns:
        dis_loss: a one-element tensor holding the discriminator's loss
    '''
    # The critic step never updates the generator, so the fakes are produced
    # without an autograd graph (saves memory and backward time).
    with torch.no_grad():
        fake = gen(get_noise(N_noise, C_noise, device))

    # Random per-sample interpolation between real and fake images
    epsilon = get_epsilon(N_epsilon = N_noise, device=device)
    mixed_images = real * epsilon + fake * (1 - epsilon)

    # The penalty is evaluated on the mixed images only
    gp = gradient_penalty(get_gradient(dis, mixed_images))

    # The Wasserstein term compares scores on FAKE images (not the mixed ones)
    # against scores on real images.
    return ( dis(fake) - dis(real) + c_lambda * gp ).mean(dim=0)

In [13]:
# test

## 2. Generator Loss

$ -D_{w}\left(G_{\theta}(\boldsymbol{z})\right) $

In [14]:
def get_loss_gen(gen, dis,
                N_noise, C_noise,
                device):
    '''
    Generator loss: the negated mean critic score of freshly generated images.

    Parameters:
        gen: the generator model
        dis: the discriminator (critic) model
        N_noise: number of fake samples to generate
        C_noise: number of channels of the noise fed to the generator
        device: device on which the noise is created

    Returns:
       a scalar loss value for the current batch of the generator
    '''
    noise = get_noise(N_noise, C_noise, device)
    fake_scores = dis(gen(noise))
    # The generator wants high critic scores, so it minimises their negation
    return (-fake_scores).mean()

In [15]:
# test

# Helper Functions

In [16]:
def save_model(gen, dis, epoch, root, mode = None):
    """Save the generator and discriminator weights of one epoch to a checkpoint file.

    The file is written to <root>/model_epoch_<epoch>.pt and holds a dict with the
    keys 'epoch', 'model_dis_state_dict' and 'model_gen_state_dict'.

    Parameters:
        gen: the generator model
        dis: the discriminator model
        epoch: epoch number, stored in the checkpoint and used in the file name
        root: directory in which the checkpoint is written (created if missing)
        mode: kept for backward compatibility; the path separator is now chosen
              by os.path.join for the running platform, so this value is unused
    """
    # Local import: the notebook's import cell does not bring in os.
    import os

    # Avoid losing an epoch of training because the target folder does not exist yet
    os.makedirs(root, exist_ok=True)

    # os.path.join picks the right separator on both Colab (Linux) and Windows,
    # replacing the hand-built '\model_...' string (an invalid escape sequence).
    filename = os.path.join(root, f'model_epoch_{epoch}.pt')

    torch.save({'epoch' : epoch,
              'model_dis_state_dict' : dis.state_dict(),
              'model_gen_state_dict' : gen.state_dict()},
              filename)

In [17]:
# when you want to continue training GAN from last epoch.
def load_model(gen, dis, root, map_location='cpu'):
    """Restore generator and discriminator weights from a saved checkpoint.

    Parameters:
        gen: generator instance whose weights are overwritten
        dis: discriminator instance whose weights are overwritten
        root: path of the checkpoint file holding the last saved generator and
              discriminator parameters
        map_location: passed to torch.load. The default 'cpu' lets a checkpoint
              saved on a GPU be opened on a CPU-only machine; load_state_dict
              then copies the values into the models' existing parameters.

    Returns:
        the epoch number stored in the checkpoint
    """
    # NOTE: torch.load unpickles the file -- only open checkpoints you trust.
    models = torch.load(root, map_location=map_location)
    gen.load_state_dict(models['model_gen_state_dict'])
    dis.load_state_dict(models['model_dis_state_dict'])
    return models['epoch']

In [18]:
def show_tensor_images(image_tensor, num_images=25, size=(1, 28, 28)):
    """Plot the first num_images images of a batch as a grid with 5 images per row.

    Parameters:
        image_tensor: batch of images with values in [-1, 1] (the range produced
                      by the generator's Tanh and by Normalize((0.5,), (0.5,)))
        num_images: how many images of the batch to show
        size: (channels, height, width) of a single image
    """
    image_unflat = image_tensor.detach().cpu().view(-1, *size)

    # Map [-1, 1] to [0, 1]: imshow clips float RGB data outside [0, 1], which
    # would otherwise render every negative pixel as black.
    image_unflat = ((image_unflat + 1) / 2).clamp(0, 1)

    image_grid = make_grid(image_unflat[:num_images], nrow=5)
    # make_grid returns (C, H, W); imshow expects (H, W, C)
    plt.imshow(image_grid.permute(1, 2, 0).squeeze())
    plt.show()

In [19]:
def weights_init(submodules):
    """Re-initialise one module in place; meant to be passed to nn.Module.apply.

    Conv2d / ConvTranspose2d: weight ~ N(0, 0.02). BatchNorm2d: weight ~ N(0, 0.02)
    and bias = 0. Any other module type is left untouched.

    NOTE(review): the BatchNorm scale is drawn around 0.0 here, whereas the
    PyTorch DCGAN tutorial draws it around 1.0 -- confirm which is intended.
    """
    conv_types = (nn.Conv2d, nn.ConvTranspose2d)

    if isinstance(submodules, conv_types):
        nn.init.normal_(submodules.weight, mean=0.0, std=0.02)

    if isinstance(submodules, nn.BatchNorm2d):
        nn.init.normal_(submodules.weight, mean=0.0, std=0.02)
        nn.init.constant_(submodules.bias, 0)

# Hyperparameters

**WARNING**
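batch_size and N_noise are not always equal: the last batch of an epoch can be smaller than batch_size, so N_noise has to be set to the actual batch size in every iteration.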

In [20]:
# Number of noise samples and of real images per batch.
# N_noise is only the initial value: the training loop reassigns it to the size
# of the current real batch on every iteration.
N_noise = 128
batch_size =128

# Channels of noise and image
# (the noise is shaped (N, C_noise, 1, 1); C_image = 1 means single-channel images)
C_noise = 64
C_image = 1

# Base channel widths of the hidden layers
# (the generator uses x4, x2, x1 of its width; the discriminator uses x1, x2)
C_hidden_gen = 64
C_hidden_dis = 16

# Adam settings shared by both optimizers: learning rate and betas=(beta_1, beta_2)
lr = 0.0002
beta_1 = 0.5
beta_2 = 0.999

# Number of passes over the dataset.
# disp_freq is defined here but is not referenced by the training loop in this notebook.
epochs= 100
disp_freq=100

# NEW (WGAN-GP specific)
# n_critic: discriminator updates per generator update
# c_lambda: weight of the gradient penalty in the discriminator loss
n_critic = 5
c_lambda = 10

# Real Image

In [21]:
# ToTensor yields values in [0, 1]; Normalize with mean 0.5 / std 0.5 maps them
# to [-1, 1], the same range as the generator's Tanh output.
transform = transforms.Compose(
    [transforms.ToTensor(), transforms.Normalize(mean=(0.5,), std=(0.5,))]
)

# MNIST is downloaded into root_ds when it is not there yet; batches are reshuffled every epoch.
dataloader = DataLoader(
    dataset=MNIST(root=root_ds, transform=transform, download=True),
    shuffle=True,
    batch_size=batch_size,
)

# Create Models / Criterion / Optimizers

In [22]:
# Build both networks and move them to the selected device
gen = Generator(C_noise, C_hidden_gen, C_image).to(device)
dis = Discriminator(C_image, C_hidden_dis).to(device)

# Re-initialise the weights; apply() visits every submodule, modifies it in
# place and returns the model itself.
gen = gen.apply(fn=weights_init)
dis = dis.apply(fn=weights_init)

# One Adam optimizer per network, both with the same learning rate and betas
optim_dis = torch.optim.Adam(params=dis.parameters(), betas=(beta_1, beta_2), lr=lr)
optim_gen = torch.optim.Adam(params=gen.parameters(), betas=(beta_1, beta_2), lr=lr)

# Train

In [24]:
# Initialised to +infinity; this loop does not update it.
# np.inf is used because the alias np.Inf was removed in NumPy 2.0.
loss_gen_min = np.inf

for epoch in range(1,epochs+1):
    print(60 * "#")
    print(6 * "#" + " Epoch " + str(epoch) + " " + 45 * "#")
    print(60 * "#")

    # Set mode on "train mode" (gen is switched to eval mode at the end of every epoch)
    gen.train()
    dis.train()

    for real_image, _ in tqdm(dataloader):
        # GPU (model and data)
        real_image=real_image.to(device)

        # The last batch of an epoch can be smaller than batch_size, so the number
        # of fakes must follow the actual batch size. Otherwise:
        # RuntimeError: The size of tensor a (128) must match the size of tensor b (96) at non-singleton dimension 0
        N_noise = real_image.shape[0]

        # Discriminator Learning: n_critic updates for every generator update
        for _ in range(n_critic):
            optim_dis.zero_grad()
            loss_dis = get_loss_dis(gen, dis, real_image, N_noise, C_noise, c_lambda, device)
            loss_dis.backward()
            optim_dis.step()

        # Generator Learning
        optim_gen.zero_grad()
        loss_gen = get_loss_gen(gen, dis, N_noise, C_noise, device)
        loss_gen.backward()
        optim_gen.step()

    # Save parameters of discriminator and generator
    save_model(gen, dis, epoch, root_models, mode=dir_mode)
    # Losses of the last batch of the epoch
    print("Loss Dis: {:.2f}\tLoss Gen: {:.2f}".format(loss_dis.item(),loss_gen.item()))

    # Preview samples: no gradients are needed, so skip building the autograd graph
    gen.eval()
    with torch.no_grad():
        fake_images = gen(get_noise(25, C_noise, device=device))
    show_tensor_images(fake_images)

############################################################
###### Epoch 1 #############################################
############################################################


  0%|          | 0/469 [00:00<?, ?it/s]

RuntimeError: CUDA out of memory. Tried to allocate 2.00 MiB (GPU 0; 2.00 GiB total capacity; 21.70 MiB already allocated; 464.34 MiB free; 22.00 MiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF