In [36]:
# ! wget https://raw.githubusercontent.com/callummcdougall/arena-v1/main/w4d1/utils.py
# ! gdown --id 0B7EVK8r0v71pZjFTYXZWM3FlRnM

import torch as t
from typing import Union
from torch import nn
import torch.nn.functional as F
import plotly.express as px
import plotly.graph_objects as go
from einops import rearrange, reduce, repeat
from einops.layers.torch import Rearrange
from fancy_einsum import einsum
import os
from tqdm.auto import tqdm
from torchvision import transforms, datasets
from torchvision.datasets import ImageFolder
from torch.utils.data import Dataset, DataLoader, TensorDataset
import wandb
import utils
import gan_modules
from typing import Optional

# Notebook-wide hyperparameters shared by the model, data and training cells.
image_size = 64                # side length (pixels) that images are resized to
batch_size = 3                 # samples per DataLoader batch
latent_dim_size = 100          # length of the random vector fed to the generator
img_channels = 3               # colour channels per image (RGB)
generator_num_features = 512   # passed to Generator: channels after the first projection and reshape
n_layers = 4                   # passed to Generator: number of CONV_n layers


In [30]:
# @t.no_grad() not necessary since nn.init already uses nograd mode
def initialize_weights(model: nn.Module) -> None:
    """
    Initialise the parameters of `model` in place, following the DCGAN recipe.

    The DCGAN paper states (end of page 3) that all weights are drawn from a
    N(0, 0.02) distribution, i.e. mean 0 and standard deviation 0.02.
    BatchNorm layers are the exception: their weights are drawn from N(1, 0.02)
    (1 being their default value) and their biases are set to zero.

    A layer is treated as BatchNorm when it is an instance of
    nn.BatchNorm1d/2d/3d *or* when "batchnorm" appears in the parameter's
    qualified name. The type check matters for layers stored inside an
    nn.Sequential, whose names are just indices such as "latent_sequential.2".

    Args:
        model: module whose parameters (including those of all submodules)
            are re-initialised.

    Returns:
        None. Parameters are modified in place.
    """
    batchnorm_types = (nn.BatchNorm1d, nn.BatchNorm2d, nn.BatchNorm3d)

    for module_name, module in model.named_modules():
        is_batchnorm_module = isinstance(module, batchnorm_types)

        # recurse=False: each parameter is visited once, via the module that owns it.
        for param_name, parameter in module.named_parameters(recurse=False):
            qualified_name = f"{module_name}.{param_name}" if module_name else param_name

            if is_batchnorm_module or "batchnorm" in qualified_name:
                if "bias" in param_name:
                    nn.init.constant_(parameter, 0.0)
                elif "weight" in param_name:
                    # nn.init.normal_ is parameterised by (mean, std), not (a, b).
                    nn.init.normal_(parameter, mean=1.0, std=0.02)
            else:
                nn.init.normal_(parameter, mean=0.0, std=0.02)

In [39]:
class Generator(nn.Module):
    """
    DCGAN-style generator mapping a latent vector to an image.

    Architecture:
      1. Linear projection of the latent vector, reshaped to
         (generator_num_features, initial_width, initial_width), then
         BatchNorm + ReLU.
      2. `n_layers - 1` ConvTransposeBlocks; each halves the channel count and
         (with kernel 4, stride 2, padding 1) doubles the spatial size.
      3. A final transposed convolution to `img_channels` followed by Tanh, so
         outputs lie in [-1, 1], the same range the notebook's
         Normalize((0.5,...), (0.5,...)) transform gives the real images.

    Args:
        latent_dim_size: size of the random vector used for generating outputs.
        img_size: side length of the generated (square) images.
        img_channels: number of image channels (3 for RGB).
        generator_num_features: number of channels after the first projection
            and reshaping.
        n_layers: number of CONV_n (transposed convolution) layers, counting
            the final one that produces the image.

    Raises:
        ValueError: if `n_layers < 1`, if `img_size` is not divisible by
            `2 ** n_layers`, or if `generator_num_features` is not divisible
            by `2 ** (n_layers - 1)`.
    """

    def __init__(
        self,
        latent_dim_size: int = 100,           # size of the random vector we use for generating outputs
        img_size: int = 64,                   # size of the images we're generating
        img_channels: int = 3,                # indicates RGB images
        generator_num_features: int = 1024,   # number of channels after first projection and reshaping
        n_layers: int = 4,                    # number of CONV_n layers
    ):
        super().__init__()
        if n_layers < 1:
            raise ValueError(f"n_layers must be >= 1, got {n_layers}")
        if img_size % (2 ** n_layers) != 0:
            raise ValueError(
                f"img_size ({img_size}) must be divisible by 2**n_layers ({2 ** n_layers})"
            )
        if generator_num_features % (2 ** (n_layers - 1)) != 0:
            raise ValueError(
                f"generator_num_features ({generator_num_features}) must be divisible by "
                f"2**(n_layers - 1) ({2 ** (n_layers - 1)})"
            )

        self.latent_dim_size = latent_dim_size
        self.img_size = img_size
        self.img_channels = img_channels
        self.generator_num_features = generator_num_features
        # Every layer doubles the spatial size, so the starting width is fixed
        # by the target size (64 / 2**4 = 4 for the default configuration).
        self.initial_width = img_size // (2 ** n_layers)
        self.n_layers = n_layers

        self._build()

    def _build(self):
        """Create the projection stage and the stack of upsampling layers."""
        features = self.generator_num_features
        width = self.initial_width

        self.latent_sequential = nn.Sequential(
            nn.Linear(self.latent_dim_size, features * width * width),
            Rearrange("a (b c d) -> a b c d", b=features, c=width, d=width),
            # BatchNorm2d is parameterised by the number of channels.
            nn.BatchNorm2d(features),
            nn.ReLU(),
        )

        # (in_channels, out_channels, output_width) of every hidden upsampling block.
        hidden_structure = [
            (features // 2 ** i, features // 2 ** (i + 1), width * 2 ** (i + 1))
            for i in range(self.n_layers - 1)
        ]
        final_in_channels = features // 2 ** (self.n_layers - 1)
        self.layer_structure = hidden_structure + [
            (final_in_channels, self.img_channels, self.img_size)
        ]

        block_list = [ConvTransposeBlock(*structure) for structure in hidden_structure]
        # The output layer has no BatchNorm/ReLU: Tanh bounds pixels to [-1, 1].
        block_list.append(
            gan_modules.ConvTranspose2d(
                in_channels=final_in_channels,
                out_channels=self.img_channels,
                kernel_size=4,
                stride=2,
                padding=1,
            )
        )
        block_list.append(nn.Tanh())

        self.upsample_sequential = nn.Sequential(*block_list)

    def forward(self, x: t.Tensor):
        """Map latent vectors of shape (batch, latent_dim_size) to images."""
        x = self.latent_sequential(x)
        return self.upsample_sequential(x)

class ConvTransposeBlock(nn.Module):
    """
    One upsampling stage: transposed convolution -> BatchNorm -> ReLU.

    The transposed convolution is built with kernel_size=4, stride=2, padding=1.

    Args:
        in_channels: number of channels of the input feature map.
        out_channels: number of channels of the output feature map.
        width: spatial width associated with this block. It is only stored as
            an attribute; no layer is sized from it.
    """

    def __init__(self, in_channels: int, out_channels: int, width: int):
        super().__init__()
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.width = width
        self.convtranspose = gan_modules.ConvTranspose2d(in_channels=in_channels,
                                            out_channels=out_channels,
                                            kernel_size=4,
                                            stride=2,
                                            padding=1)
        # BatchNorm2d normalises per channel, so it is sized by the number of
        # channels leaving the convolution, not by the spatial dimensions.
        self.batchnorm = nn.BatchNorm2d(out_channels)
        self.relu = nn.ReLU()

    def forward(self, x):
        """Apply the transposed convolution, batch normalisation and ReLU in order."""
        x = self.convtranspose(x)
        x = self.batchnorm(x)
        return self.relu(x)

# Build a generator from the notebook-level hyperparameters defined in the first cell.
generator = Generator(
    latent_dim_size=latent_dim_size,
    img_size=image_size,
    img_channels=img_channels,
    generator_num_features=generator_num_features,
    n_layers=n_layers
)



In [None]:
class Discriminator(nn.Module):
    """
    DCGAN-style discriminator mapping an image to the probability it is real.

    Architecture:
      1. `n_layers` ConvBlocks. Each halves the spatial size (kernel 4,
         stride 2, padding 1). Channel counts double from
         `generator_num_features / 2**(n_layers - 1)` up to
         `generator_num_features`. The first block has no BatchNorm.
      2. Flatten -> Linear(…, 1) -> Sigmoid.

    Args:
        img_size: side length of the (square) input images.
        img_channels: number of input image channels.
        generator_num_features: number of channels produced by the last
            convolutional block (mirrors the generator's first feature map).
        n_layers: number of convolutional blocks.

    Raises:
        ValueError: if `n_layers < 1`, if `img_size` is not divisible by
            `2 ** n_layers`, or if `generator_num_features` is not divisible
            by `2 ** (n_layers - 1)`.
    """

    def __init__(
        self,
        img_size: int = 64,
        img_channels: int = 3,
        generator_num_features: int = 1024,
        n_layers: int = 4,
    ):
        super().__init__()
        if n_layers < 1:
            raise ValueError(f"n_layers must be >= 1, got {n_layers}")
        if img_size % (2 ** n_layers) != 0:
            raise ValueError(
                f"img_size ({img_size}) must be divisible by 2**n_layers ({2 ** n_layers})"
            )
        if generator_num_features % (2 ** (n_layers - 1)) != 0:
            raise ValueError(
                f"generator_num_features ({generator_num_features}) must be divisible by "
                f"2**(n_layers - 1) ({2 ** (n_layers - 1)})"
            )

        self.img_size = img_size
        self.img_channels = img_channels
        self.generator_num_features = generator_num_features
        self.n_layers = n_layers

        # Channel count entering the first block and leaving each block.
        channels = [img_channels] + [
            generator_num_features // 2 ** (n_layers - 1 - i) for i in range(n_layers)
        ]
        blocks = [
            # No BatchNorm directly on the input layer.
            self.ConvBlock(c_in, c_out, use_batchnorm=(index > 0))
            for index, (c_in, c_out) in enumerate(zip(channels[:-1], channels[1:]))
        ]
        self.downsample_sequential = nn.Sequential(*blocks)

        final_width = img_size // 2 ** n_layers
        self.classifier = nn.Sequential(
            nn.Flatten(),
            nn.Linear(generator_num_features * final_width * final_width, 1, bias=False),
            nn.Sigmoid(),
        )

    def forward(self, x: t.Tensor):
        """Return a (batch, 1) tensor of probabilities that each image is real."""
        x = self.downsample_sequential(x)
        return self.classifier(x)


    class ConvBlock(nn.Module):
        """
        One downsampling stage: strided convolution -> optional BatchNorm -> LeakyReLU(0.2).

        Args:
            in_channels: number of channels of the input feature map.
            out_channels: number of channels of the output feature map.
            use_batchnorm: whether to batch-normalise the convolution output.
        """

        def __init__(self, in_channels: int = 3, out_channels: int = 64, use_batchnorm: bool = True):
            super().__init__()
            self.conv = nn.Conv2d(
                in_channels=in_channels,
                out_channels=out_channels,
                kernel_size=4,
                stride=2,
                padding=1,
                bias=False,
            )
            self.batchnorm = nn.BatchNorm2d(out_channels) if use_batchnorm else nn.Identity()
            self.activation = nn.LeakyReLU(0.2)

        def forward(self, x):
            """Apply convolution, (optional) batch normalisation and LeakyReLU in order."""
            return self.activation(self.batchnorm(self.conv(x)))

In [32]:
from torchvision import transforms, datasets

from torch.utils.data import DataLoader


# Preprocessing applied to every image: resize to image_size x image_size,
# convert to a tensor, then normalise each of the three channels with
# mean 0.5 and std 0.5.
transform = transforms.Compose([
    transforms.Resize((image_size,image_size)),
    transforms.ToTensor(),
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
])

# Images are read from the relative directory "data"; ImageFolder expects
# one sub-directory per class inside it.
trainset = ImageFolder(
    root="data",
    transform=transform
)

# Presumably displays a 3 x 5 grid of samples -- helper lives in utils.py; TODO confirm.
utils.show_images(trainset, rows=3, cols=5)

# Shuffled mini-batches of `batch_size` samples for the training loop.
trainloader = DataLoader(trainset, batch_size=batch_size, shuffle=True)

In [33]:
# Sanity check: shape of the first image tensor (index [0][0] = first sample, image part).
trainset[0][0].shape

torch.Size([3, 64, 64])

In [34]:
def train_generator_discriminator(
    netG: nn.Module,
    netD: nn.Module,
    optG,
    optD,
    trainloader,
    epochs: int,
    max_epoch_duration: Optional[Union[int, float]] = None,           # Each epoch terminates after this many seconds
    print_netG_output_interval: Optional[Union[int, float]] = None,   # Generator output is printed at this frequency
    use_wandb: bool = False
):
    """
    Train a GAN by alternating discriminator and generator updates.

    For every batch of real images:
      1. Discriminator step: minimise BCE so that real images are classified
         as 1 and generated images as 0. The generated images are detached,
         so this step does not touch the generator.
      2. Generator step: minimise BCE between the discriminator's prediction
         on the generated images and the label 1 (i.e. try to fool it).

    Args:
        netG: generator; called with noise of shape (batch, latent_dim).
            The latent size is read from `netG.latent_dim_size` when present,
            otherwise from the notebook-level `latent_dim_size`.
        netD: discriminator; must output one probability in [0, 1] per image.
        optG: optimizer over the generator's parameters.
        optD: optimizer over the discriminator's parameters.
        trainloader: iterable yielding either image batches or
            (images, labels) pairs, as ImageFolder-backed loaders do.
        epochs: number of passes over `trainloader`.
        max_epoch_duration: if given, an epoch stops after this many seconds.
        print_netG_output_interval: accepted for interface compatibility;
            currently not acted upon.
        use_wandb: if True, log both losses with `wandb.log` after each step
            (the caller is responsible for having started a wandb run).

    Returns:
        None. The networks and optimizers are updated in place.
    """
    import time  # local import: not provided by the notebook's import cell

    criterion = t.nn.BCELoss()

    noise_dim = getattr(netG, "latent_dim_size", None)
    if noise_dim is None:
        noise_dim = latent_dim_size  # notebook-level default

    # Run everything on the device the generator lives on.
    first_param = next(netG.parameters(), None)
    device = first_param.device if first_param is not None else t.device("cpu")

    for epoch in range(epochs):
        epoch_start = time.time()

        for batch in trainloader:
            # ImageFolder-style loaders yield (images, labels); labels are unused.
            real_images = batch[0] if isinstance(batch, (list, tuple)) else batch
            real_images = real_images.to(device)
            # Use the actual batch size: the last batch may be smaller.
            n_images = real_images.shape[0]

            # generate images from random noise
            z = t.randn(n_images, noise_dim, device=device)
            generated_images = netG(z)

            # --- discriminator update ---
            optD.zero_grad()
            pred_real = netD(real_images).flatten()
            pred_fake = netD(generated_images.detach()).flatten()
            # BCELoss needs float targets with the same shape as the predictions.
            lossD = (
                criterion(pred_real, t.ones_like(pred_real))
                + criterion(pred_fake, t.zeros_like(pred_fake))
            )
            lossD.backward()
            optD.step()

            # --- generator update ---
            optG.zero_grad()
            pred_fake = netD(generated_images).flatten()
            lossG = criterion(pred_fake, t.ones_like(pred_fake))
            lossG.backward()
            optG.step()

            if use_wandb:
                wandb.log({"lossD": lossD.item(), "lossG": lossG.item(), "epoch": epoch})

            if max_epoch_duration is not None and time.time() - epoch_start > max_epoch_duration:
                break



# Build both networks from the notebook-level hyperparameters.
netG = Generator(
    latent_dim_size=latent_dim_size,
    img_size=image_size,
    img_channels=img_channels,
    generator_num_features=generator_num_features,
    n_layers=n_layers,
)
netD = Discriminator(
    img_size=image_size,
    img_channels=img_channels,
    generator_num_features=generator_num_features,
    n_layers=n_layers,
)

# Adam with lr=0.0002 and beta1=0.5 are the settings recommended in the DCGAN paper.
optG = t.optim.Adam(netG.parameters(), lr=0.0002, betas=(0.5, 0.999))
optD = t.optim.Adam(netD.parameters(), lr=0.0002, betas=(0.5, 0.999))

train_generator_discriminator(
    netG=netG,
    netD=netD,
    optG=optG,
    optD=optD,
    trainloader=trainloader,
    epochs=1,
)

### If stuck
Compare the reference solutions with your own version.

In [9]:
# ! wget https://raw.githubusercontent.com/callummcdougall/arena-v1/main/w4d1/solutions.py
# ! wget https://raw.githubusercontent.com/callummcdougall/arena-v1/main/w4d1/w0d2_solutions.py
# ! wget https://raw.githubusercontent.com/callummcdougall/arena-v1/main/w4d1/w0d3_solutions.py


from solutions import netG_celeb_mini
# Compare parameter counts of our generator against the reference solution's.
# NOTE(review): `Generator` here is the class, not an instance such as `generator` --
# confirm that utils.print_param_count accepts a class.
utils.print_param_count(Generator, netG_celeb_mini)

Layer (type:depth-idx)                   Output Shape              Param #
Generator                                [3, 3, 64, 64]            --
├─Sequential: 1-1                        [3, 512, 4, 4]            --
│    └─Linear: 2-1                       [3, 8192]                 819,200
│    └─Rearrange: 2-2                    [3, 512, 4, 4]            --
│    └─BatchNorm2d: 2-3                  [3, 512, 4, 4]            1,024
│    └─ReLU: 2-4                         [3, 512, 4, 4]            --
├─Sequential: 1-2                        [3, 3, 64, 64]            --
│    └─Sequential: 2-5                   [3, 256, 8, 8]            --
│    │    └─ConvTranspose2d: 3-1         [3, 256, 8, 8]            2,097,152
│    │    └─BatchNorm2d: 3-2             [3, 256, 8, 8]            512
│    │    └─ReLU: 3-3                    [3, 256, 8, 8]            --
│    └─Sequential: 2-6                   [3, 128, 16, 16]          --
│    │    └─ConvTranspose2d: 3-4         [3, 128, 16, 16]          52

epoch=0, steps=0, lossD=1.2538, lossG=3.0848:   0%|          | 1/25325 [00:02<16:48:52,  2.39s/it]


KeyboardInterrupt: 