In [1]:
import numpy as np
import torch
from torch import nn
from torch.utils import data
import torch.nn.functional as F

from torch.optim import Adam

import torchvision

In [5]:
from math import floor, log2

Generate each part as a sequence of strokes. 

Generation of each part is conditioned on:
    (1) image features of a canvas containing the previous parts
    (2) sentence features
   
Image features: extracted with a CNN-based architecture (ResNet, DoodlerGAN encoder).

In [2]:
class ClassifierBlock(nn.Module):
    """Residual convolutional block with an optional stride-2 downsampling stage.

    The main path is two 3x3 convolutions, each followed by a LeakyReLU with
    negative slope 0.2. Its output is summed with a 1x1 convolution of the
    block input (the skip path), and the sum is optionally passed through a
    stride-2 3x3 convolution.

    Args:
        input_channels: Number of channels of the incoming feature map.
        filters: Number of channels produced by the block.
        downsample: When True (default), a stride-2 convolution is applied
            after the residual sum. When False, the sum is returned as is.
    """

    def __init__(self, input_channels, filters, downsample=True):
        super().__init__()
        # 1x1 projection so the skip path has the same channel count as the main path.
        self.conv_res = nn.Conv2d(input_channels, filters, 1)

        # The activations are created directly from nn.LeakyReLU instead of going
        # through a notebook-level helper, so the class can be instantiated no
        # matter which cells have already been executed.
        self.net = nn.Sequential(
            nn.Conv2d(input_channels, filters, 3, padding=1),
            nn.LeakyReLU(0.2, inplace=True),
            nn.Conv2d(filters, filters, 3, padding=1),
            nn.LeakyReLU(0.2, inplace=True)
        )

        self.downsample = nn.Conv2d(filters, filters, 3, padding = 1, stride = 2) if downsample else None

    def forward(self, x):
        """Apply the residual block to ``x`` and return the resulting feature map.

        Convolutions are 2-D, so ``x`` must be laid out as nn.Conv2d expects,
        with ``input_channels`` channels.
        """
        # Compute the skip projection from the untouched input before the main path.
        res = self.conv_res(x)
        x = self.net(x)
        x = x + res
        if self.downsample is not None:
            x = self.downsample(x)
        return x

In [13]:
# Encoder hyper-parameters read by the cells below.
# image_size: the number of encoder blocks is derived from log2 of this value.
image_size = 64
# network_capacity: output channel count of the first block; doubled for each later block.
network_capacity = 16
# n_part: used as the input channel count of the first block
# (presumably one channel per drawing part -- TODO confirm).
n_part = 7

In [14]:
# Layer count is one less than log2 of the input resolution.
num_layers = int(log2(image_size) - 1)
# The first block takes n_part input channels.
num_init_filters = n_part

# Container that the block-building cell appends to.
blocks = []

# Channel schedule: the input channel count, followed by network_capacity
# doubled at every step for num_layers + 1 steps.
filters = [num_init_filters]
filters += [network_capacity * (2 ** i) for i in range(num_layers + 1)]

# Consecutive (in_channels, out_channels) pairs, one per block.
chan_in_out = list(zip(filters[:-1], filters[1:]))

In [15]:
def leaky_relu(p=0.2):
    """Create an in-place ``nn.LeakyReLU`` module with negative slope ``p``."""
    activation = nn.LeakyReLU(negative_slope=p, inplace=True)
    return activation

In [16]:
# Build one ClassifierBlock per (in_channels, out_channels) pair. Every block
# except the last one downsamples.
#
# The list is created from scratch and `blocks` is bound exactly once, so
# re-running this cell always yields len(chan_in_out) blocks instead of
# appending another copy to an already-populated ModuleList.
last_index = len(chan_in_out) - 1
blocks = nn.ModuleList([
    ClassifierBlock(in_chan, out_chan, downsample=(ind < last_index))
    for ind, (in_chan, out_chan) in enumerate(chan_in_out)
])

In [17]:
# Display the module tree to inspect each block's channel widths and strides.
blocks

ModuleList(
  (0): ClassifierBlock(
    (conv_res): Conv2d(7, 16, kernel_size=(1, 1), stride=(1, 1))
    (net): Sequential(
      (0): Conv2d(7, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (1): LeakyReLU(negative_slope=0.2, inplace=True)
      (2): Conv2d(16, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (3): LeakyReLU(negative_slope=0.2, inplace=True)
    )
    (downsample): Conv2d(16, 16, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
  )
  (1): ClassifierBlock(
    (conv_res): Conv2d(16, 32, kernel_size=(1, 1), stride=(1, 1))
    (net): Sequential(
      (0): Conv2d(16, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (1): LeakyReLU(negative_slope=0.2, inplace=True)
      (2): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (3): LeakyReLU(negative_slope=0.2, inplace=True)
    )
    (downsample): Conv2d(32, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
  )
  (2): ClassifierBlock(
    (conv_res): Conv2d(3