In [None]:
from __future__ import absolute_import, division, print_function, unicode_literals
import tensorflow as tf

from tensorflow.keras import layers
import os

import string
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import PIL
import random
import time
from pathlib import Path
import imageio
import moviepy.editor as mpy

import re
from IPython import display

In [None]:
# Configure GPU usage. This has to run before TensorFlow initialises the
# devices, otherwise the calls below raise RuntimeError (caught and printed).
gpus = tf.config.experimental.list_physical_devices('GPU')
print(gpus)
if gpus:
    try:
        # Currently, memory growth needs to be the same across GPUs
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
        # Make only the first physical GPU (index 0) visible to TensorFlow
        tf.config.experimental.set_visible_devices(gpus[0], 'GPU')
        logical_gpus = tf.config.experimental.list_logical_devices('GPU')
        print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
    except RuntimeError as e:
        # Memory growth must be set before GPUs have been initialized
        print(e)

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]
1 Physical GPUs, 1 Logical GPUs


2022-01-09 15:21:35.357000: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:939] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-01-09 15:21:35.362658: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:939] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-01-09 15:21:35.362988: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:939] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-01-09 15:21:35.364556: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags

# Load the training set we had preprocessed
* We used skip-thoughts as our text embedding model.
* It encodes a whole sentence instead of a single word, and it is trained to predict the next sentence and the previous sentence. We chose it because of this property.
* Given the conditioning text, each description is encoded into a 4800-dimensional vector that consists of two parts: the first 2400 dimensions are the unidirectional vector and the other 2400 are the bidirectional vector. We used the first 2400 dimensions as our embedding.

In [None]:
# Pre-computed caption embeddings for the training images (see the notes above).
# NOTE(review): presumably shaped (num_images, captions_per_image, 4800) -- TODO confirm.
# allow_pickle=True unpickles arbitrary objects, so only load files you trust.
st = np.load('./dataset/train_captions.npy', allow_pickle=True)

# Hyperparameter setting
* In this part, we used the settings from the lab and the competition template.

In [None]:
# ---- Image geometry -------------------------------------------------------
IMG_H, IMG_W, IMG_C = 64, 64, 3
IMG_SHAPE = (IMG_H, IMG_W, IMG_C)

# Number of training images; used to normalise the per-epoch loss sums.
dataset_size = 7370

# ---- Lab-style constants --------------------------------------------------
# NOTE(review): Z_DIM here (128) differs from hparas['Z_DIM'] (50). The
# training/inference functions read the value from `hparas`.
Z_DIM = 128
text_dim = 128
BATCH_SIZE = 64
BZ = (BATCH_SIZE, 1, 1, 50)

# ---- Adam settings used for both optimizers --------------------------------
W_LR = 2.0e-04
W_beta1 = 0.5
W_beta2 = 0.99
W_EPOCH = 1500

# ---- Paths ----------------------------------------------------------------
data_path = './dataset'
image_dir = ''

# ---- Competition-template hyperparameters -----------------------------------
hparas = dict(
    MAX_SEQ_LENGTH=20,                      # maximum sequence length
    EMBED_DIM=256,                          # word embedding dimension
    RNN_HIDDEN_SIZE=128,                    # number of RNN neurons
    Z_DIM=50,                               # random noise z dimension
    DENSE_DIM=128,                          # number of neurons in dense layer
    IMAGE_SIZE=[64, 64, 3],                 # render image size
    BATCH_SIZE=64,
    LR=1e-4,
    LR_DECAY=0.5,
    BETA_1=0.5,
    N_EPOCH=600,
    CHECKPOINTS_DIR='./checkpoints/final',
    PRINT_FREQ=1,
)

# Image preprocess
* We preprocessed the data by resizing, cropping, padding, and randomly flipping the images left and right.
* We also generated a dataset of images whose descriptions do not match them, which is needed for the conditional-GAN loss.

In [None]:
def image_preprocess(img):
    """Turn raw JPEG bytes into a training image.

    Args:
        img: scalar string tensor holding the encoded JPEG file contents.

    Returns:
        float32 tensor of shape (IMG_H, IMG_W, 3) scaled to [-1, 1].
    """
    decoded = tf.image.decode_jpeg(img, channels=3)

    # Crop to a centred square whose side is the shorter image dimension.
    dims = tf.shape(decoded)
    side = tf.minimum(dims[0], dims[1])
    squared = tf.image.resize_with_crop_or_pad(decoded, side, side)

    # Augmentation: random horizontal flip, then resize to the model input size.
    flipped = tf.image.random_flip_left_right(squared)
    resized = tf.cast(tf.image.resize(flipped, [IMG_H, IMG_W]), tf.float32)

    # Map pixel values from [0, 255] to [-1, 1] (the generator ends in tanh).
    bounded = tf.clip_by_value(resized, 0, 255)
    return bounded/127.5 - 1.0

def training_data_generator(captions, image_path, wrong_image_path):
    """Build one training example for `tf.data.Dataset.map`.

    Args:
        captions: tensor holding all caption embeddings of one image,
            indexed by caption along axis 0.
        image_path: path of the image that matches `captions`.
        wrong_image_path: path of an image used as the mismatched example.

    Returns:
        (img, wrong_img, caption): the two preprocessed images and the first
        2400 dimensions of one randomly chosen caption embedding.
    """
    img = image_preprocess(tf.io.read_file(image_dir+image_path))
    wrong_img = image_preprocess(tf.io.read_file(image_dir+wrong_image_path))

    # `Dataset.map` traces this function once into a graph, so Python's
    # `random.choice` would pick a single index at trace time and reuse it for
    # every element. A TensorFlow random op is re-evaluated per element.
    num_captions = tf.shape(captions)[0]
    pick = tf.random.uniform([], minval=0, maxval=num_captions, dtype=tf.int32)
    caption = captions[pick][:2400]
    return img, wrong_img, caption

def dataset_generator(filenames, batch_size, data_generator):
    """Create the batched training dataset.

    Args:
        filenames: path of the pickled DataFrame; its 'ImagePath' column is read.
        batch_size: number of examples per batch (incomplete batches are dropped).
        data_generator: per-element mapping function applied via `Dataset.map`.

    Returns:
        A `tf.data.Dataset` of mapped, shuffled, batched and prefetched examples.
        Caption embeddings come from the module-level `st` array.
    """
    frame = pd.read_pickle(filenames)
    matched_paths = frame['ImagePath'].values
    # Mismatched images are simply the same paths in shuffled order; the
    # shuffle happens once here, so the pairing is fixed for the whole run.
    mismatched_paths = tf.random.shuffle(matched_paths)

    autotune = tf.data.experimental.AUTOTUNE
    return (
        tf.data.Dataset.from_tensor_slices((st, matched_paths, mismatched_paths))
        .repeat(5)
        .map(data_generator, num_parallel_calls=autotune)
        .shuffle(5000)
        .batch(batch_size, drop_remainder=True)
        .prefetch(buffer_size=autotune)
    )

In [None]:
# Training pipeline: yields (img, wrong_img, caption) batches of BATCH_SIZE examples.
dataset_train = dataset_generator(data_path + '/text2ImgData.pkl', BATCH_SIZE, training_data_generator)

2022-01-09 15:21:49.237114: W tensorflow/core/framework/cpu_allocator_impl.cc:82] Allocation of 707520000 exceeds 10% of free system memory.
2022-01-09 15:21:49.484114: W tensorflow/core/framework/cpu_allocator_impl.cc:82] Allocation of 707520000 exceeds 10% of free system memory.


In [None]:
class EmbeddingCompressor(tf.keras.Model):
    """Project a text embedding down to 128 dimensions (Dense + leaky ReLU)."""

    def __init__(self):
        super().__init__()
        self.dense = tf.keras.layers.Dense(units=128)

    def call(self, E):
        """Return leaky_relu(Dense(E)); the last axis of the result has size 128."""
        projected = self.dense(E)
        activated = tf.nn.leaky_relu(projected)
        return activated

# Model Architecture
* We referred to several papers and GitHub repositories when building our model.

In [None]:
class Generator(tf.keras.Model):
    """
    Generate a fake image from a text embedding and a noise vector.

    call(text, noise_z):
        text:    rank-2 tensor (batch, embedding_dim); compressed to 128
                 dimensions by `self.d1` before it is used.
        noise_z: rank-2 tensor (batch, z_dim).
        returns: tanh-activated image tensor. The latent starts at 1x1 and is
                 doubled by six stride-2 transposed convolutions, giving
                 (batch, 64, 64, 3) with values in [-1, 1].

    NOTE(review): `flatten`, `d2` and `reshape` are created in __init__ but are
    never used in call(); `reshape` is also assigned twice. `hparas` is stored
    but not read by this class.
    """
    def __init__(self, hparas):
        super(Generator, self).__init__()
        self.hparas = hparas
        self.flatten = tf.keras.layers.Flatten()
        # Text branch: compresses the sentence embedding to 128 dimensions.
        self.d1 = EmbeddingCompressor() # DENSE_DIM = 128
        self.d2 = tf.keras.layers.Dense(units = 128*4*4*4, kernel_initializer = tf.random_normal_initializer(stddev = 0.02))
        self.reshape = tf.keras.layers.Reshape(target_shape = (4, 4, 128*4), input_shape = (128*4*4*4, ))
        # batchnorm / batchnorm0 follow the first two transposed convolutions;
        # batchnorm1..8 are used pairwise inside the four residual blocks.
        self.batchnorm = tf.keras.layers.BatchNormalization()
        self.batchnorm0 = tf.keras.layers.BatchNormalization()
        self.batchnorm1 = tf.keras.layers.BatchNormalization()
        self.batchnorm2 = tf.keras.layers.BatchNormalization()
        self.batchnorm3 = tf.keras.layers.BatchNormalization()
        self.batchnorm4 = tf.keras.layers.BatchNormalization()
        self.batchnorm5 = tf.keras.layers.BatchNormalization()
        self.batchnorm7 = tf.keras.layers.BatchNormalization()
        self.batchnorm8 = tf.keras.layers.BatchNormalization()
        self.batchnorm6 = tf.keras.layers.BatchNormalization()
        # 4x4 "same" convolutions: the main path of each residual block.
        self.conv_same1 = tf.keras.layers.Conv2D(filters = 256, kernel_size = 4, strides = 1, padding = "same", kernel_initializer = tf.keras.initializers.TruncatedNormal())
        self.conv_same2 = tf.keras.layers.Conv2D(filters = 256, kernel_size = 4, strides = 1, padding = "same", kernel_initializer = tf.keras.initializers.TruncatedNormal())
        self.conv_same3 = tf.keras.layers.Conv2D(filters = 256, kernel_size = 4, strides = 1, padding = "same", kernel_initializer = tf.keras.initializers.TruncatedNormal())
        self.conv_same4 = tf.keras.layers.Conv2D(filters = 256, kernel_size = 4, strides = 1, padding = "same", kernel_initializer = tf.keras.initializers.TruncatedNormal())
        self.conv_same5 = tf.keras.layers.Conv2D(filters = 64, kernel_size = 4, strides = 1, padding = "same", kernel_initializer = tf.keras.initializers.TruncatedNormal())
        self.conv_same6 = tf.keras.layers.Conv2D(filters = 128, kernel_size = 4, strides = 1, padding = "same", kernel_initializer = tf.keras.initializers.TruncatedNormal())
        self.conv_same7 = tf.keras.layers.Conv2D(filters = 128, kernel_size = 4, strides = 1, padding = "same", kernel_initializer = tf.keras.initializers.TruncatedNormal())
        self.conv_same8 = tf.keras.layers.Conv2D(filters = 64, kernel_size = 4, strides = 1, padding = "same", kernel_initializer = tf.keras.initializers.TruncatedNormal())
        # NOTE(review): duplicate of the `self.reshape` assignment above.
        self.reshape = tf.keras.layers.Reshape(target_shape = (4, 4, 128*4), input_shape = (128*4*4*4, ))
        # Stride-2 transposed convolutions: each one doubles height and width.
        self.deconv = tf.keras.layers.Conv2DTranspose(filters = 256, kernel_size = 4, strides = (2, 2), padding = "same", kernel_initializer = tf.keras.initializers.TruncatedNormal())
        self.deconv0 = tf.keras.layers.Conv2DTranspose(filters = 256, kernel_size = 4, strides = (2, 2), padding = "same", kernel_initializer = tf.keras.initializers.TruncatedNormal())
        self.deconv1 = tf.keras.layers.Conv2DTranspose(filters = 256, kernel_size = 4, strides = (2, 2), padding = "same", kernel_initializer = tf.keras.initializers.TruncatedNormal())
        self.deconv2 = tf.keras.layers.Conv2DTranspose(filters = 128, kernel_size = 4, strides = (2, 2), padding = "same", kernel_initializer = tf.keras.initializers.TruncatedNormal())
        self.deconv3 = tf.keras.layers.Conv2DTranspose(filters = 128, kernel_size = 4, strides = (2, 2), padding = "same", kernel_initializer = tf.keras.initializers.TruncatedNormal())
        self.deconv4 = tf.keras.layers.Conv2DTranspose(filters = 3, kernel_size = 4, strides = (2, 2), padding = "same", kernel_initializer = tf.keras.initializers.TruncatedNormal())
        # 1x1 convolutions: project the skip path to the channel count of the
        # main path so the two can be added.
        self.conv1 = tf.keras.layers.Conv2D(filters = 256, kernel_size =1 , strides = 1, padding = "same", kernel_initializer = tf.keras.initializers.TruncatedNormal())
        self.conv2 = tf.keras.layers.Conv2D(filters = 256, kernel_size = 1, strides = 1, padding = "same", kernel_initializer = tf.keras.initializers.TruncatedNormal())
        self.conv3 = tf.keras.layers.Conv2D(filters = 128, kernel_size = 1, strides = 1, padding = "same", kernel_initializer = tf.keras.initializers.TruncatedNormal())
        self.conv4 = tf.keras.layers.Conv2D(filters = 64, kernel_size = 1, strides = 1, padding = "same", kernel_initializer = tf.keras.initializers.TruncatedNormal())
    def call(self, text, noise_z):
        # (batch, embedding_dim) -> (batch, 128)
        text = self.d1(text)
        
        # Insert two singleton spatial axes: (batch, 128) -> (batch, 1, 1, 128)
        text = tf.expand_dims(text, axis=1)
        text = tf.expand_dims(text, axis=1)
        
        # Same for the noise: (batch, z_dim) -> (batch, 1, 1, z_dim)
        noise_z = tf.expand_dims(noise_z, axis=1)
        noise_z = tf.expand_dims(noise_z, axis=1)
        
        # Latent "image" of spatial size 1x1 with 128 + z_dim channels.
        img = tf.concat([text, noise_z], axis=3)
        # 1x1 -> 2x2 -> 4x4, 256 channels
        X = self.deconv(img)
        X = self.batchnorm(X)
        X = tf.nn.leaky_relu(X)
        X = self.deconv0(X)
        X = self.batchnorm0(X)
        X = tf.nn.leaky_relu(X)
        

        # ResBlock
        # The skip path here is the 1x1 latent projected to 256 channels; it
        # is broadcast over the 4x4 feature map when added below.
        X = self.conv_same1(X)
        X = self.batchnorm1(X)
        img = self.conv1(img)
        X = tf.nn.leaky_relu(X)
        X = self.conv_same2(X)
        X = self.batchnorm2(X) + img
        X = tf.nn.leaky_relu(X)

        # Up-sampling
        # 4x4 -> 8x8, 256 channels
        Res_X = self.deconv1(X)
        
        
        # ResBlock
        X = self.conv_same3(Res_X)
        X = self.batchnorm3(X)
        Res_X = self.conv2(Res_X)
        X = tf.nn.leaky_relu(X)
        X = self.conv_same4(X)
        X = self.batchnorm4(X) + Res_X
        X = tf.nn.leaky_relu(X)

        # Up-sampling
        # 8x8 -> 16x16, 128 channels
        Res_X = self.deconv2(X)
        
        
        # ResBlock
        # Main path goes 128 -> 64 -> 128 channels; skip path is a 1x1 conv to 128.
        X = self.conv_same5(Res_X)
        X = self.batchnorm5(X)
        Res_X = self.conv3(Res_X)
        X = tf.nn.leaky_relu(X)
        X = self.conv_same6(X)
        X = self.batchnorm6(X) + Res_X
        X = tf.nn.leaky_relu(X)

        # Up-sampling
        # 16x16 -> 32x32, 128 channels
        Res_X = self.deconv3(X)
        
        
        # ResBlock
        # Main path goes 128 -> 128 -> 64 channels; skip path is a 1x1 conv to 64.
        X = self.conv_same7(Res_X)
        X = self.batchnorm7(X)
        Res_X = self.conv4(Res_X)
        X = tf.nn.leaky_relu(X)
        X = self.conv_same8(X)
        X = self.batchnorm8(X) + Res_X
        X = tf.nn.leaky_relu(X)

        # Up-sampling
        # 32x32 -> 64x64, 3 channels; tanh squashes the result into [-1, 1].
        logits = self.deconv4(X)
        output = tf.nn.tanh(logits)
        
        return output

### check if Generator works

In [None]:
# Smoke test: push a random batch through a throw-away Generator and check
# that the output has the expected image shape.
noise = tf.random.uniform(shape=[64,100], minval=0.7, maxval=1.)
text = tf.random.uniform(shape=[64,100], minval=0.7, maxval=1.)
g = Generator(hparas)
g(text, noise)

2022-01-09 15:21:52.654034: I tensorflow/stream_executor/cuda/cuda_blas.cc:1774] TensorFloat-32 will be used for the matrix multiplication. This will only be logged once.
2022-01-09 15:21:53.028090: I tensorflow/stream_executor/cuda/cuda_dnn.cc:366] Loaded cuDNN version 8202
2022-01-09 15:21:53.636254: I tensorflow/core/platform/default/subprocess.cc:304] Start cannot spawn child process: No such file or directory


<tf.Tensor: shape=(64, 64, 64, 3), dtype=float32, numpy=
array([[[[-0.5204494 , -0.94369143, -0.97567415],
         [-0.9999601 , -0.9901111 ,  0.9994086 ],
         [ 0.9954048 ,  0.9092694 ,  0.8924996 ],
         ...,
         [-0.54295564, -0.8364265 , -0.17171238],
         [ 0.24456996, -0.34253263, -0.9645204 ],
         [ 0.06219353,  0.6510829 ,  0.88143307]],

        [[-0.9061176 , -0.8572843 , -0.9143423 ],
         [ 0.9999089 ,  0.9999567 ,  0.3664996 ],
         [-1.        , -0.99976164, -0.99915373],
         ...,
         [ 0.641505  , -0.08585563, -0.57915723],
         [ 0.8277271 , -0.71646   , -0.95273536],
         [ 0.9850715 , -0.10212433, -0.94293416]],

        [[-0.9998918 , -0.9990797 , -1.        ],
         [ 1.        , -1.        ,  0.99997646],
         [ 0.6211959 , -0.99999946, -0.92935586],
         ...,
         [ 0.8714288 , -0.7560331 , -0.29461834],
         [ 0.9734517 ,  0.51597357, -0.9732179 ],
         [ 0.6578226 ,  0.87391704,  0.91657877

In [None]:
class Discriminator(tf.keras.Model):
    """
    Score an (image, text) pair.

    call(text, img, train=None):
        text: rank-2 tensor (batch, embedding_dim).
        img:  image batch; the text map below is fixed at 16x16, so the images
              must be 64x64 (two 2x2 average poolings bring them to 16x16).
        returns: raw logits of shape (batch, 1). No sigmoid is applied; the
              losses in this notebook use `from_logits=True`.

    NOTE(review): several layers created in __init__ (batch norm, conv4, ap_3,
    ap_4, conv_same7/8, d_img, reshape, concat) are not used in call(). They
    are kept so the attribute layout stays compatible with saved checkpoints.
    """
    def __init__(self, hparas):
        super(Discriminator, self).__init__()
        self.hparas = hparas
        self.flatten = tf.keras.layers.Flatten()
        self.batch_norm1 = tf.keras.layers.BatchNormalization(axis = -1, momentum = 0.99)
        self.batch_norm2 = tf.keras.layers.BatchNormalization(axis = -1, momentum = 0.99)
        self.batch_norm3 = tf.keras.layers.BatchNormalization(axis = -1, momentum = 0.99)
        self.batch_norm4 = tf.keras.layers.BatchNormalization(axis = -1, momentum = 0.99)
        self.batch_norm5 = tf.keras.layers.BatchNormalization(axis = -1, momentum = 0.99)
        self.batch_norm6 = tf.keras.layers.BatchNormalization(axis = -1, momentum = 0.99)
        # 1x1 convolutions: skip-path projections (conv1..3) and the two
        # convolutions applied after the text map is concatenated (conv5, conv6).
        self.conv1 = tf.keras.layers.Conv2D(filters = 16, kernel_size =1 , strides = 1, padding = "same", kernel_initializer = tf.keras.initializers.TruncatedNormal())
        self.ap_1 =  tf.keras.layers.AveragePooling2D(pool_size=(2, 2), strides=(2, 2), padding='same')
        self.conv2 = tf.keras.layers.Conv2D(filters = 32, kernel_size = 1, strides = 1, padding = "same", kernel_initializer = tf.keras.initializers.TruncatedNormal())
        self.ap_2 =  tf.keras.layers.AveragePooling2D(pool_size=(2, 2), strides=(2, 2), padding='same')
        self.conv3 = tf.keras.layers.Conv2D(filters = 64, kernel_size = 1, strides = 1, padding = "same", kernel_initializer = tf.keras.initializers.TruncatedNormal())
        self.ap_3 =  tf.keras.layers.AveragePooling2D(pool_size=(2, 2), strides=(2, 2), padding='same')
        self.conv4 = tf.keras.layers.Conv2D(filters = 128, kernel_size = 1, strides = 1, padding = "same", kernel_initializer = tf.keras.initializers.TruncatedNormal())
        self.ap_4 =  tf.keras.layers.AveragePooling2D(pool_size=(2, 2), strides=(2, 2), padding='same')
        self.conv5 = tf.keras.layers.Conv2D(filters = 128, kernel_size = 1, strides = 1, padding = "same", kernel_initializer = tf.keras.initializers.TruncatedNormal())
        self.conv6 = tf.keras.layers.Conv2D(filters = 128, kernel_size = 1, strides = 1, padding = "same", kernel_initializer = tf.keras.initializers.TruncatedNormal())
        
        
        # 4x4 "same" convolutions: main path of the residual blocks.
        self.conv_same1 = tf.keras.layers.Conv2D(filters = 16, kernel_size = 4, strides = 1, padding = "same", kernel_initializer = tf.keras.initializers.TruncatedNormal())
        self.conv_same2 = tf.keras.layers.Conv2D(filters = 16, kernel_size = 4, strides = 1, padding = "same", kernel_initializer = tf.keras.initializers.TruncatedNormal())
        self.conv_same3 = tf.keras.layers.Conv2D(filters = 32, kernel_size = 4, strides = 1, padding = "same", kernel_initializer = tf.keras.initializers.TruncatedNormal())
        self.conv_same4 = tf.keras.layers.Conv2D(filters = 32, kernel_size = 4, strides = 1, padding = "same", kernel_initializer = tf.keras.initializers.TruncatedNormal())
        self.conv_same5 = tf.keras.layers.Conv2D(filters = 64, kernel_size = 4, strides = 1, padding = "same", kernel_initializer = tf.keras.initializers.TruncatedNormal())
        self.conv_same6 = tf.keras.layers.Conv2D(filters = 64, kernel_size = 4, strides = 1, padding = "same", kernel_initializer = tf.keras.initializers.TruncatedNormal())
        self.conv_same7 = tf.keras.layers.Conv2D(filters = 128, kernel_size = 4, strides = 1, padding = "same", kernel_initializer = tf.keras.initializers.TruncatedNormal())
        self.conv_same8 = tf.keras.layers.Conv2D(filters = 128, kernel_size = 4, strides = 1, padding = "same", kernel_initializer = tf.keras.initializers.TruncatedNormal())
        
        self.d_img = tf.keras.layers.Dense(self.hparas['DENSE_DIM'])

        # 256 = 16 * 16: the text embedding becomes one 16x16 feature map.
        self.embed = tf.keras.layers.Dense(256) # for text
        self.reshape = tf.keras.layers.Reshape(target_shape = (1, 1, 128))
        self.concat = tf.keras.layers.Concatenate()
        self.last_d1 = tf.keras.layers.Dense(128)
        self.last_d2 = tf.keras.layers.Dense(1)
        
    def call(self, text, img, train=None):
        # Res_Block 1 at full resolution, 16 channels; skip path is a 1x1 conv.
        X = self.conv_same1(img)
        X = self.batch_norm1(X)
        img = self.conv1(img)
        X = tf.nn.leaky_relu(X)
        X = self.conv_same2(X)
        X = self.batch_norm2(X) + img
        X = tf.nn.leaky_relu(X)
        Res_X = self.ap_1(X)            # halve height and width

        # Res_Block 2, 32 channels
        X = self.conv_same3(Res_X)
        X = self.batch_norm3(X)
        Res_X = self.conv2(Res_X)
        X = tf.nn.leaky_relu(X)
        X = self.conv_same4(X)
        X = self.batch_norm4(X) + Res_X
        X = tf.nn.leaky_relu(X)
        Res_X = self.ap_2(X)            # halve height and width again

        # Res_Block 3, 64 channels (no pooling afterwards)
        X = self.conv_same5(Res_X)
        X = self.batch_norm5(X)
        Res_X = self.conv3(Res_X)
        X = tf.nn.leaky_relu(X)
        X = self.conv_same6(X)
        X = self.batch_norm6(X) + Res_X
        X = tf.nn.leaky_relu(X)

        # Turn the text embedding into one extra 16x16 channel. Using -1 for
        # the batch axis (instead of a hard-coded 64) lets the model accept
        # any batch size, e.g. a final partial batch.
        T = self.embed(text)
        T = tf.reshape(T, [-1, 16, 16, 1])
        X = tf.concat([X, T], 3)

        merged_input = self.conv5(X, training=train)
        merged_input = self.conv6(merged_input, training=train)

        merged_input = self.flatten(merged_input)

        Y = self.last_d1(merged_input)
        logits = self.last_d2(Y)
        return logits

### check if Discriminator works

In [None]:
# Smoke test: a throw-away Discriminator should return one logit per example.
# NOTE(review): `g` is reused here for a Discriminator after holding a Generator above.
img = tf.random.uniform(shape=[64,64,64,3], minval=0.7, maxval=1.)
text = tf.random.uniform(shape=[64,100], minval=0.7, maxval=1.)
g = Discriminator(hparas)
g(text, img)

<tf.Tensor: shape=(64, 1), dtype=float32, numpy=
array([[-0.0437666 ],
       [-0.05221966],
       [-0.05009417],
       [-0.05511851],
       [-0.05305054],
       [-0.04867206],
       [-0.04897459],
       [-0.05139726],
       [-0.05004559],
       [-0.04404794],
       [-0.05481909],
       [-0.05086534],
       [-0.05123526],
       [-0.04773366],
       [-0.0520039 ],
       [-0.04918405],
       [-0.04656748],
       [-0.05110708],
       [-0.04592313],
       [-0.04780387],
       [-0.05596517],
       [-0.05304431],
       [-0.04863382],
       [-0.04825912],
       [-0.04492773],
       [-0.05068965],
       [-0.05201664],
       [-0.05120134],
       [-0.04668717],
       [-0.04745018],
       [-0.0508514 ],
       [-0.04915348],
       [-0.05349309],
       [-0.04925626],
       [-0.04406099],
       [-0.04861335],
       [-0.04834228],
       [-0.05456967],
       [-0.04869805],
       [-0.04834382],
       [-0.04847126],
       [-0.04867705],
       [-0.05287009],
     

In [None]:
# The model instances that are actually trained, checkpointed and used for inference.
generator = Generator(hparas)
discriminator = Discriminator(hparas)

In [None]:
# One Adam optimizer per network, both configured from the W_* constants
# (the 'LR' / 'BETA_1' entries of `hparas` are not used here).
optimizer_g = tf.keras.optimizers.Adam(W_LR, beta_1=W_beta1, beta_2=W_beta2)
optimizer_d = tf.keras.optimizers.Adam(W_LR, beta_1=W_beta1, beta_2=W_beta2)

# Loss
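* In this part, the training functions keep the `W_` prefix from the lab's W-GAN example, but the loss they use is the binary cross-entropy (standard GAN) loss defined below.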

In [None]:
# Shared loss object; the discriminator returns raw logits, hence from_logits.
cross_entropy = tf.keras.losses.BinaryCrossentropy(from_logits=True)

def DC_G_Loss(z0):
    """Generator loss: cross-entropy of the fake-image logits `z0` against all-ones labels."""
    wanted_labels = tf.ones_like(z0)
    return cross_entropy(wanted_labels, z0)

def DC_D_Loss(z0, z_caption, z1):
    """Discriminator loss terms.

    Args:
        z0: logits labelled 0 (used for generated images).
        z_caption: logits labelled 0 (used for mismatched image/text pairs).
        z1: logits labelled 1 (used for real, matching pairs).

    Returns:
        (l0, l_caption, l1): the three cross-entropy terms, in that order.
    """
    labelled_logits = (
        (tf.zeros_like(z0), z0),
        (tf.zeros_like(z_caption), z_caption),
        (tf.ones_like(z1), z1),
    )
    l0, l_caption, l1 = [cross_entropy(label, logit) for label, logit in labelled_logits]
    return l0, l_caption, l1

@tf.function
def W_GTrain(real_img, wrong_img, text):
    """Run one generator update.

    Args:
        real_img: batch of real images that match `text`.
        wrong_img: batch of real images that do not match `text`.
        text: batch of caption embeddings.

    Returns:
        (lg, (l0, l_caption, l1)): the generator loss and the three
        discriminator loss terms (reported only; the discriminator is not
        updated here).
    """
    z = tf.random.normal(shape=[hparas['BATCH_SIZE'], hparas['Z_DIM']], mean=0.0, stddev=1.0)
    with tf.GradientTape() as tpg:
        c0 = generator(text, z, training = True)
        z1 = discriminator(text, real_img, training = True)
        z0 = discriminator(text, c0, training = True)
        z_caption = discriminator(text, wrong_img, training = True)
        lg = DC_G_Loss(z0)
        # Pass by keyword: DC_D_Loss is declared as (z0, z_caption, z1), and
        # the previous positional call swapped the real and mismatched logits.
        l0, l_caption, l1 = DC_D_Loss(z0=z0, z_caption=z_caption, z1=z1)
    gradient_g = tpg.gradient(lg, generator.trainable_variables)
    optimizer_g.apply_gradients(zip(gradient_g, generator.trainable_variables))
    
    return lg, (l0, l_caption, l1)

@tf.function
def W_DTrain(real_img, wrong_img, text):
    """Run one discriminator update.

    Args:
        real_img: batch of real images that match `text` (target label 1).
        wrong_img: batch of real images that do not match `text` (target label 0).
        text: batch of caption embeddings.

    Returns:
        (lg, (l0, l_caption, l1)): the generator loss (reported only) and the
        three discriminator loss terms.
    """
    z = tf.random.normal(shape=[hparas['BATCH_SIZE'], hparas['Z_DIM']], mean=0.0, stddev=1.0)
    with tf.GradientTape() as tpd:
        c0 = generator(text, z, training = True)

        z1 = discriminator(text, real_img, training = True)
        z0 = discriminator(text, c0, training = True)
        z_caption = discriminator(text, wrong_img, training = True)

        lg = DC_G_Loss(z0)

        # Pass by keyword: DC_D_Loss is declared as (z0, z_caption, z1). The
        # previous positional call DC_D_Loss(z0, z1, z_caption) swapped the
        # last two, which trained the discriminator to call real matching
        # pairs fake and mismatched pairs real.
        l0, l_caption, l1 = DC_D_Loss(z0=z0, z_caption=z_caption, z1=z1)
        # NOTE(review): the training log describes this as
        # (l0 + l_caption)/2 + l1, but only l0 is halved here -- confirm
        # which weighting is intended.
        ld = l0/2.0+ l_caption + l1

    gradient_d = tpd.gradient(ld, discriminator.trainable_variables)

    optimizer_d.apply_gradients(zip(gradient_d, discriminator.trainable_variables))
    
    return lg, (l0, l_caption, l1)

In [None]:
# Update schedule cycled through by the training loop:
# three discriminator steps followed by one generator step.
WTrain = (W_DTrain,) * 3 + (W_GTrain,)

# Length of one schedule cycle.
WCritic = len(WTrain)

In [None]:
@tf.function
def test_step(caption):
    """Generate images for a batch of caption embeddings using fresh Gaussian noise.

    The noise has shape (hparas['BATCH_SIZE'], hparas['Z_DIM']) and the
    generator is run with training=False.
    """
    noise_shape = [hparas['BATCH_SIZE'], hparas['Z_DIM']]
    noise = tf.random.normal(shape=noise_shape, mean=0.0, stddev=1.0)
    return generator(caption, noise, training=False)

In [None]:
def merge(images, size):
    """Tile a batch of images into one grid image.

    Args:
        images: array-like of shape (n, h, w, 3).
        size: (rows, cols) of the grid; images are placed row by row.

    Returns:
        float array of shape (h * rows, w * cols, 3); unused cells stay zero.
    """
    rows, cols = size[0], size[1]
    tile_h, tile_w = images.shape[1], images.shape[2]
    canvas = np.zeros((tile_h * rows, tile_w * cols, 3))
    for position, tile in enumerate(images):
        grid_row, grid_col = divmod(position, cols)
        top, left = grid_row * tile_h, grid_col * tile_w
        canvas[top:top + tile_h, left:left + tile_w, :] = tile
    return canvas

def imsave(images, size, path):
    """Tile `images` into a `size` grid and write the result to `path`."""
    grid = merge(images, size)
    # Shift pixel values from [-1, 1] to [0, 1] before saving.
    rescaled = grid*0.5 + 0.5
    return plt.imsave(path, rescaled)

def save_images(images, size, image_path):
    """Thin wrapper around `imsave`: save `images` as a `size` grid at `image_path`."""
    result = imsave(images, size, image_path)
    return result

In [None]:
def sample_generator(captions_test):
    """Wrap a sequence of caption embeddings in a dataset batched by BATCH_SIZE."""
    caption_array = np.asarray(captions_test)
    return tf.data.Dataset.from_tensor_slices(caption_array).batch(BATCH_SIZE)

In [None]:
# Caption embeddings used to render the fixed preview grid during training.
# NOTE(review): this cell last failed with FileNotFoundError; the file is looked
# up relative to the working directory -- confirm where sample_captions.npy lives.
st_check = np.load('sample_captions.npy')

FileNotFoundError: [Errno 2] No such file or directory: 'sample_captions.npy'

In [None]:
# Side length of the square preview grid.
ni = int(np.ceil(np.sqrt(BATCH_SIZE)))
sample_size = BATCH_SIZE
sample_seed = np.random.normal(loc=0.0, scale=1.0, size=(sample_size, 1, 1, Z_DIM)).astype(np.float32)

# Each selected caption fills int(sample_size/ni) consecutive slots of the batch.
# NOTE(review): index 7 appears twice in the original list -- confirm whether
# the last entry was meant to be a different caption.
repeats_per_caption = int(sample_size/ni)
sample_sentence = []
for caption_idx in (1, 2, 3, 4, 5, 6, 7, 7):
    sample_sentence += [st_check[caption_idx][0][:2400]] * repeats_per_caption

sample_sentence = sample_generator(sample_sentence)

In [None]:
# Track both networks and both optimizers so a run can be resumed.
ckpt = tf.train.Checkpoint(optimizer_g=optimizer_g,
                           optimizer_d=optimizer_d,
                           generator=generator,
                           discriminator=discriminator)

# Same directory as hparas['CHECKPOINTS_DIR']; max_to_keep equals W_EPOCH's value.
manager = tf.train.CheckpointManager(ckpt, './checkpoints/final', max_to_keep=1500)

In [None]:
# Per-epoch records, pre-sized so they can be indexed by epoch number.
wlg = [None] * W_EPOCH #record loss of g for each epoch
wld = [None] * W_EPOCH #record loss of d for each epoch
wsp = [None] * W_EPOCH #record sample images for each epoch

In [None]:
# Utility function
def utPuzzle(imgs, row, col, path=None):
    """Assemble images into a `row` x `col` uint8 mosaic, filled row by row.

    Args:
        imgs: sequence of (h, w, c) images, all the same shape as imgs[0].
        row, col: grid dimensions.
        path: if given, the mosaic is also written there with imageio.

    Returns:
        uint8 array of shape (h * row, w * col, c).
    """
    tile_h, tile_w, channels = imgs[0].shape
    mosaic = np.zeros((tile_h * row, tile_w * col, channels), np.uint8)
    for position, tile in enumerate(imgs):
        grid_row = position // col
        grid_col = position % col
        top = grid_row * tile_h
        left = grid_col * tile_w
        mosaic[top:top + tile_h, left:left + tile_w, :] = tile
    if path is not None:
        imageio.imwrite(path, mosaic)
    return mosaic
  
def utMakeGif(imgs, fname, duration):
    """Write `imgs` as a GIF at `fname` that lasts `duration` seconds."""
    # Frame rate that spreads all images evenly over the requested duration.
    frames_per_second = float(len(imgs)) / duration

    def frame_at(t):
        # Map a timestamp in seconds to the corresponding image.
        return imgs[int(frames_per_second * t)]

    clip = mpy.VideoClip(frame_at, duration = duration)
    clip.write_gif(fname, fps = frames_per_second)

In [None]:
# ckpt.restore('/home/haowei/CS565600_Deep_Learning/DL_comp3/checkpoints/chiu_80_ver6mergeOu' + '/ckpt-732')

# Training

In [None]:
# Turns a sum of per-batch losses into an approximate per-batch mean: one pass
# over `dataset_train` covers dataset_size * 5 samples in batches of BATCH_SIZE.
rsTrain = float(BATCH_SIZE) / (float(dataset_size) * 5)
ctr = 0

# Per-epoch histories used only for the loss plot. The generator history gets
# its own list: the pre-sized `wlg` record is indexed by epoch below, and
# rebinding it to [] made `wlg[ep] = ...` raise IndexError on the first epoch.
g_1 = []
g_2 = []
g_3 = []
g_loss_hist = []

# plt.imsave does not create missing directories.
sample_dir = './samples/demo_80_ver6mergeOu'
os.makedirs(sample_dir, exist_ok=True)

for ep in range(1, W_EPOCH):
    print('start epoch {}'.format(ep))
    start = time.time()

    lgt = 0.0
    l0, l_caption, l1 = 0.0, 0.0, 0.0
    for img, wrong_img, caption in dataset_train:
        # Cycle through the update schedule (discriminator steps, then generator).
        lg, ld = WTrain[ctr](img, wrong_img, caption)
        ctr = (ctr + 1) % WCritic
        l0 += ld[0].numpy()
        l_caption += ld[1].numpy()
        l1 += ld[2].numpy()
        lgt += lg.numpy()

    # Total discriminator loss, combined the same way as `ld` in the train
    # steps. It was previously never accumulated, so wld[ep] was always 0.
    ldt = l0 / 2.0 + l_caption + l1

    wlg[ep] = lgt * rsTrain
    wld[ep] = ldt * rsTrain
    g_1.append(l0 * rsTrain)
    g_2.append(l_caption * rsTrain)
    g_3.append(l1 * rsTrain)
    g_loss_hist.append(lgt * rsTrain)

    print('\rEnd epoch {}, lg = {:.4f}, ld = ((l0 = {:.4f} + l_caption = {:.4f})/2 + l1 = {:.4f})'.format(ep, lgt * rsTrain, l0 * rsTrain,l_caption * rsTrain,l1 * rsTrain))
    print('Time for epoch {} is {:.4f} sec'.format(ep, time.time()-start))

    # Every second epoch: plot the loss curves and save a checkpoint.
    if (ep + 1) % 2 == 0:
        epochs_axis = range(0, len(g_1))
        plt.plot(epochs_axis, g_1, 'g', label='l0')
        plt.plot(epochs_axis, g_2, 'b', label='l_caption')
        plt.plot(epochs_axis, g_3, 'r', label='l1')
        plt.plot(epochs_axis, g_loss_hist, 'y', label='G_loss')
        plt.title('Training and Validation loss')
        plt.xlabel('Epochs')
        plt.ylabel('Loss')
        plt.legend()
        plt.show()
        save_path = manager.save()
        print("Saved checkpoint for epoch {}: {}".format(ep, save_path))

    # visualization (every epoch); only the last batch of `sample_sentence` is kept
    for caption in sample_sentence:
        fake_image = test_step(caption)
    save_images(fake_image, [ni, ni], sample_dir + '/train_{:04d}.jpg'.format(ep))
    fake_image = utPuzzle(
        ((fake_image + 1)*127.5).numpy().astype(np.uint8),
        8,
        8
    )
    wsp[ep] = fake_image

In [None]:
def testing_data_generator(captions, index):
    """Select the first caption of an example and keep its first 2400 dimensions."""
    first_caption = captions[0]
    return first_caption[:2400], index

def testing_dataset_generator(batch_size, data_generator):
    """Build the endlessly repeating, batched test dataset.

    Args:
        batch_size: number of examples per batch.
        data_generator: per-element mapping function applied via `Dataset.map`.

    Returns:
        A `tf.data.Dataset` pairing test caption embeddings with the 'ID'
        column of the test pickle.
    """
    test_frame = pd.read_pickle('./dataset/testData.pkl')
    caption_embeddings = np.load('./test_captions.npy')
    ids = np.asarray(test_frame['ID'].values)

    pairs = tf.data.Dataset.from_tensor_slices((caption_embeddings, ids))
    mapped = pairs.map(data_generator, num_parallel_calls=tf.data.experimental.AUTOTUNE)
    # repeat() makes the dataset infinite, so every batch is full; consumers
    # must stop iterating on their own.
    return mapped.repeat().batch(batch_size)

# Infinite dataset of (caption, id) batches consumed by inference().
testing_dataset = testing_dataset_generator(BATCH_SIZE, testing_data_generator)

In [None]:
data = pd.read_pickle('./dataset/testData.pkl')
captions = data['Captions'].values

NUM_TEST = len(captions)
# Number of full batches in the test set; inference() uses it as its stop condition.
EPOCH_TEST = int(NUM_TEST / hparas['BATCH_SIZE'])

In [None]:
@tf.function
def test_step(caption, noise=None):
    """Generate images for a batch of caption embeddings.

    This definition replaces the earlier one-argument `test_step`. `noise` is
    optional so that existing `test_step(caption)` calls (e.g. in the training
    loop) still work if cells are re-run after this one.

    Args:
        caption: batch of caption embeddings.
        noise: optional (batch, hparas['Z_DIM']) noise tensor; drawn from a
            standard normal when omitted.

    Returns:
        The generator output for the batch, computed in inference mode.
    """
    if noise is None:
        noise = tf.random.normal(shape=[tf.shape(caption)[0], hparas['Z_DIM']], mean=0.0, stddev=1.0)
    text_embed = caption
    # training=False is stated explicitly so batch normalisation uses its
    # moving statistics.
    fake_image = generator(text_embed, noise, training=False)
    return fake_image

In [None]:
def inference(dataset):
    """Generate one image per test caption and save each as a JPEG.

    Args:
        dataset: iterable of (captions, idx) batches, where `idx` holds the
            ids used in the output file names. It is expected to repeat
            endlessly; iteration stops after EPOCH_TEST + 1 batches so the
            final partial batch of the test set is covered.
    """
    output_dir = './inference/chiu_80_ver6mergeOu_checker_new_propotion'
    # plt.imsave does not create missing directories.
    os.makedirs(output_dir, exist_ok=True)

    sample_size = hparas['BATCH_SIZE']
    # The same noise batch is reused for every batch of captions.
    sample_seed = np.random.normal(loc=0.0, scale=1.0, size=(sample_size, hparas['Z_DIM'])).astype(np.float32)

    step = 0
    start = time.time()
    for captions, idx in dataset:
        if step > EPOCH_TEST:
            break

        fake_image = test_step(captions, sample_seed)
        step += 1
        for i in range(hparas['BATCH_SIZE']):
            # Map pixel values from [-1, 1] to [0, 1] before saving.
            plt.imsave(output_dir + '/inference_{:04d}.jpg'.format(idx[i]), fake_image[i].numpy()*0.5 + 0.5)

    print('Time for inference is {:.4f} sec'.format(time.time()-start))

In [None]:
# Generate and save an image for every test caption.
inference(testing_dataset)

In [None]:
def visualize(idx):
    """Print the captions and show the generated images for the given test ids.

    Reads the module-level `index`, `captions` and `id2word_dict`, and the
    JPEG files written by `inference`. At most 49 images fit the 7x7 grid.
    """
    fig = plt.figure(figsize=(14, 14))

    for slot, image_id in enumerate(idx, start=1):
        # Row of this id in the test set.
        row = np.where(image_id==index)[0][0]

        # Decode the token ids back to words, skipping padding.
        words = (id2word_dict[token] for token in captions[row])
        text = ''.join(word + ' ' for word in words if word != '<PAD>')
        print(text)

        generated = plt.imread('./inference/chiu_80_ver6mergeOu_checker_new_propotion/inference_{:04d}.jpg'.format(image_id))

        plt.subplot(7, 7, slot)
        plt.imshow(generated)
        plt.axis('off')

In [None]:
# Vocabulary and the word <-> id lookup tables used to decode test captions.
dictionary_path = './dictionary'
vocab = np.load(dictionary_path + '/vocab.npy')
print('there are {} vocabularies in total'.format(len(vocab)))

# Each .npy file is converted to a dict of key/value pairs.
# NOTE(review): the example below looks ids up as strings ('1') -- presumably
# both keys and values are strings; confirm against the files.
word2Id_dict = dict(np.load(dictionary_path + '/word2Id.npy'))
id2word_dict = dict(np.load(dictionary_path + '/id2Word.npy'))
print('Word to id mapping, for example: %s -> %s' % ('flower', word2Id_dict['flower']))
print('Id to word mapping, for example: %s -> %s' % ('1', id2word_dict['1']))
print('Tokens: <PAD>: %s; <RARE>: %s' % (word2Id_dict['<PAD>'], word2Id_dict['<RARE>']))

# Inference the result

In [None]:
# Globals read by visualize(): token-id captions and the id of every test example.
data = pd.read_pickle('./dataset/testData.pkl')
captions = data['Captions'].values
index = data['ID'].values
# Hand-picked test ids to display (42 entries; the grid in visualize() is 7x7).
random_idx = [23, 216, 224, 413, 713, 859, 876, 974, 1177, 1179, 1241, 2169, 2196, 2237, 
              2356, 2611, 2621, 2786, 2951, 2962, 3145, 3255, 3327, 3639, 3654, 3927, 4262, 
              4321, 4517, 5067, 5147, 5955, 6167, 6216, 6410, 6413, 6579, 6584, 6804, 6988, 
              7049, 7160]

print(len(random_idx))
visualize(random_idx)

# Report

## Models you tried during competition

### Text encoder:
We tried three kinds of models for text encoding: GRU, BERT, and Skip-thoughts. We got 0.64, 0.63, and 0.61 on the public score respectively.
Finally, we chose Skip-thoughts as our text encoder. We downloaded the pretrained model and weights from GitHub (https://github.com/ryankiros/skip-thoughts), fed the conditioning sentences through it to get the embeddings, and saved the result as a .npy file.

### GAN:
We used the original conditional GAN as our baseline and followed the paper to build our model. However, we noticed that "Condition Augmentation", which adds some noise to the given embedding, led the generator to produce images that did not match the sentence, so we removed it; we guess this was due to our small dataset. After we passed the baseline60, we wanted to try StackGAN, but we noticed that training stage 2 would take a lot of time, so we gave up. Maybe next time we should start our project as soon as possible. :-(



## List the experiment you did

### Data augmentation
We first use resize_with_crop_or_pad and random_flip_left_right to augment the original image, and then resize it to 64*64.

### Hyper-parameters tuning
For the hyper-parameter tuning, we just used the original settings from the lab and the task.

### Architecture tuning
For the "Generator", we tried the original conditional GAN as our baseline and followed the paper to build our model. However, we noticed that "Condition Augmentation", which adds some noise to the given embedding, led the generator to produce images that did not match the sentence, so we removed it. After that, we found that some papers used residual blocks to help training, so we added them to our model. As for the "Discriminator", we also added more layers when the generated images did not match the sentence, because we thought this was the result of a poor discriminator.

### Optimizer tuning
For the training optimizer, we only used Adam because most of the papers did so. In each training cycle, we trained the discriminator three times and the generator once.

# Anything worth mentioning

### Skip-thoughts V.S. Bert
We found some differences between these two encoders when we applied them to our task. BERT is pretrained on large general text corpora (not on ImageNet, as we first wrote); because it has seen so many words, it may embed our sentences very similarly to each other. This made the generated images mismatch the sentences, because the generator could not recognize the small differences between the embeddings. Skip-thoughts, on the other hand, encodes the whole sentence instead of a single word, and it is trained to predict the next and the previous sentence, which made the generated images match the given sentence better. However, the images generated from the BERT embeddings looked more like real flowers. So for the competition, we used one result from BERT, for which we trained the discriminator more times and added more layers to it. For the other versions we used Skip-thoughts as the text encoder.

###  Training step and loss
We used the training step in the template first; however, we noticed that it is unreasonable because it updates the generator and the discriminator at the same time, so we modified the training step to be similar to the lab sample: we train and update the discriminator 3 times and then the generator 1 time. We also used the wgan loss to train it and got a better result on the public score. For the loss, besides (real image, caption) and (fake image, caption), we followed the paper and added a new loss pair (real image, mismatched caption) when counting the generator loss.