# Cup03 Reverse Image Caption
110065508 李丞恩

In [81]:
IMAGE_HEIGHT = 64
IMAGE_WIDTH = 64
IMAGE_CHANNEL = 3
INPUT_SHAPE = (IMAGE_HEIGHT, IMAGE_WIDTH, IMAGE_CHANNEL)
dictionary_path = 'dictionary'
data_path = 'dataset'
BATCH_SIZE = 64
hparas = {
    'MAX_SEQ_LENGTH': 20,                     # maximum sequence length
    'EMBED_DIM': 256,                         # word embedding dimension
    'RNN_HIDDEN_SIZE': 128,                   # number of RNN neurons
    'Z_DIM': 512,                             # random noise z dimension
    'DENSE_DIM': 128,                         # number of neurons in dense layer
    'IMAGE_SIZE': [64, 64, 3],                # render image size
    'BATCH_SIZE': 64,
    'LR': 2e-5,
    'LR_DECAY': 0.5,
    'BETA_1': 0.5,
    'N_EPOCH': 600,
    'CHECKPOINTS_DIR': './checkpoints/demo',  # checkpoint path
    'PRINT_FREQ': 1                           # printing frequency of loss
}

In [2]:
from __future__ import absolute_import, division, print_function, unicode_literals

import os
import re
import PIL
import time
import string
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf

from tensorflow.keras import layers
from pathlib import Path
from IPython import display
from tqdm.notebook import tqdm
from keras.applications.resnet import ResNet101, ResNet50
from keras.layers import Activation, Dropout, Flatten, Dense, GlobalMaxPooling2D, \
    BatchNormalization, Input, Conv2D, MaxPool2D

2021-12-28 15:01:53.443718: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0


In [3]:
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' # disable warnings, info and errors 

In [4]:
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    try:
        # Restrict TensorFlow to only use the first GPU
        tf.config.experimental.set_visible_devices(gpus[0], 'GPU')

        # Currently, memory growth needs to be the same across GPUs
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
        logical_gpus = tf.config.experimental.list_logical_devices('GPU')
        print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
    except RuntimeError as e:
        # Memory growth must be set before GPUs have been initialized
        print(e)

1 Physical GPUs, 1 Logical GPUs


2021-12-28 15:01:54.778800: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcuda.so.1
2021-12-28 15:01:54.849856: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2021-12-28 15:01:54.850563: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1733] Found device 0 with properties: 
pciBusID: 0000:01:00.0 name: NVIDIA GeForce RTX 3060 computeCapability: 8.6
coreClock: 1.852GHz coreCount: 28 deviceMemorySize: 11.77GiB deviceMemoryBandwidth: 335.32GiB/s
2021-12-28 15:01:54.850631: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0
2021-12-28 15:01:54.858607: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcublas.so.11
2021-12-28 15:01:54.858680: I tensorflow/stream_executor/pl

In [5]:
if not os.path.exists('samples/demo'): os.makedirs('samples/demo')
if not os.path.exists('checkpoints/demo'): os.makedirs('checkpoints/demo')
if not os.path.exists('inference/demo'): os.makedirs('inference/demo')

## 一. 資料前處理
### 1. 文字前處理
感謝助教已經幫我們處理好了以下的部分：

1. Delete text over MAX_SEQ_LENGTH (20).
2. Delete all puntuation in the texts.
3. Encode each vocabulary in dictionary/vocab.npy.
4. Represent texts by a sequence of integer IDs.
5. Replace rare words by $<$RARE$>$ token to reduce vocabulary size for more efficient training.
6. Add padding as $<$PAD$>$ to each text to make sure all of them have equal length to MAX_SEQ_LENGTH (20).
    
There is no necessary to append $<$ST$>$ and $<$ED$>$ to each text because we don't need to generate any sequence in this task.
    
We can decode sequence vocabulary IDs by looking up the vocabulary dictionary:

1. dictionary/word2Id.npy is a numpy array mapping word to id.
2. dictionary/id2Word.npy is a numpy array mapping id back to word.

In [6]:
vocab = np.load(dictionary_path + '/vocab.npy')
print('there are {} vocabularies in total'.format(len(vocab)))

word2Id_dict = dict(np.load(dictionary_path + '/word2Id.npy'))
id2word_dict = dict(np.load(dictionary_path + '/id2Word.npy'))
print('Word to id mapping, for example: %s -> %s' % ('flower', word2Id_dict['flower']))
print('Id to word mapping, for example: %s -> %s' % ('1', id2word_dict['1']))
print('Tokens: <PAD>: %s; <RARE>: %s' % (word2Id_dict['<PAD>'], word2Id_dict['<RARE>']))

there are 5427 vocabularies in total
Word to id mapping, for example: flower -> 1
Id to word mapping, for example: 1 -> flower
Tokens: <PAD>: 5427; <RARE>: 5428


In [7]:
def sent2IdList(line, MAX_SEQ_LENGTH=20):
    MAX_SEQ_LIMIT = MAX_SEQ_LENGTH
    padding = 0
    
    # data preprocessing, remove all puntuation in the texts
    prep_line = re.sub('[%s]' % re.escape(string.punctuation), ' ', line.rstrip())
    prep_line = prep_line.replace('-', ' ')
    prep_line = prep_line.replace('-', ' ')
    prep_line = prep_line.replace('  ', ' ')
    prep_line = prep_line.replace('.', '')
    tokens = prep_line.split(' ')
    tokens = [
        tokens[i] for i in range(len(tokens))
        if tokens[i] != ' ' and tokens[i] != ''
    ]
    l = len(tokens)
    padding = MAX_SEQ_LIMIT - l
    
    # make sure length of each text is equal to MAX_SEQ_LENGTH, and replace the less common word with <RARE> token
    for i in range(padding):
        tokens.append('<PAD>')
    line = [
        word2Id_dict[tokens[k]]
        if tokens[k] in word2Id_dict else word2Id_dict['<RARE>']
        for k in range(len(tokens))
    ]

    return line

In [8]:
text = "the flower shown has yellow anther red pistil and bright red petals."
print(text)
print(sent2IdList(text))

the flower shown has yellow anther red pistil and bright red petals.
['9', '1', '82', '5', '11', '70', '20', '31', '3', '29', '20', '2', '5427', '5427', '5427', '5427', '5427', '5427', '5427', '5427']


### 2. 將圖片與文字對應

In [9]:
df = pd.read_pickle(data_path + '/text2img_cls_embedding.pkl')
num_training_sample = len(df)
n_images_train = num_training_sample
print('There are %d image in training data' % (n_images_train))

There are 7370 image in training data


In [10]:
df.head(5)

Unnamed: 0_level_0,Captions,ImagePath,texts,embeddings
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
6734,"[[9, 2, 17, 9, 1, 6, 14, 13, 18, 3, 41, 8, 11,...",./102flowers/image_06734.jpg,[the petals of the flower are pink in color an...,"[[0.18257322907447815, 0.7088410258293152, 0.4..."
6736,"[[4, 1, 5, 12, 2, 3, 11, 31, 28, 68, 106, 132,...",./102flowers/image_06736.jpg,[this flower has white petals and yellow pisti...,"[[0.1587948352098465, 0.7034167051315308, 0.48..."
6737,"[[9, 2, 27, 4, 1, 6, 14, 7, 12, 19, 5427, 5427...",./102flowers/image_06737.jpg,[the petals on this flower are pink with white...,"[[0.18289019167423248, 0.7226691246032715, 0.4..."
6738,"[[9, 1, 5, 8, 54, 16, 38, 7, 12, 116, 325, 3, ...",./102flowers/image_06738.jpg,[the flower has a smooth purple petal with whi...,"[[0.17855443060398102, 0.7165486812591553, 0.4..."
6739,"[[4, 12, 1, 5, 29, 11, 19, 7, 26, 70, 5427, 54...",./102flowers/image_06739.jpg,[this white flower has bright yellow stamen wi...,"[[0.17934608459472656, 0.7307049632072449, 0.4..."


### 3. 生成dataset
in this competition, you have to generate image in size 64x64x3

In [11]:
def training_data_generator(caption, image_path):
    # load in the image according to image path
    img = tf.io.read_file(image_path)
    img = tf.image.decode_image(img, channels=3)
    img = tf.image.convert_image_dtype(img, tf.float32)
    img.set_shape([None, None, 3])
    img = tf.image.resize(img, size=[IMAGE_HEIGHT, IMAGE_WIDTH])
    img.set_shape([IMAGE_HEIGHT, IMAGE_WIDTH, IMAGE_CHANNEL])
    caption = tf.cast(caption, tf.int32)

    return img, caption

def dataset_generator(filenames, batch_size, data_generator):
    # load the training data into two NumPy arrays
    df = pd.read_pickle(filenames)
    captions = df['Captions'].values
    caption = []
    # each image has 1 to 10 corresponding captions
    # we choose one of them randomly for training
    for i in range(len(captions)):
        caption.append(random.choice(captions[i]))
    caption = np.asarray(caption)
    caption = caption.astype(np.int)
    image_path = df['ImagePath'].values
    
    # assume that each row of `features` corresponds to the same row as `labels`.
    assert caption.shape[0] == image_path.shape[0]
    
    dataset = tf.data.Dataset.from_tensor_slices((caption, image_path))
    dataset = dataset.map(data_generator, num_parallel_calls=tf.data.experimental.AUTOTUNE)
    dataset = dataset.shuffle(len(caption)).batch(batch_size, drop_remainder=True)
    dataset = dataset.prefetch(buffer_size=tf.data.experimental.AUTOTUNE)

    return dataset

In [12]:
dataset = dataset_generator(data_path + '/text2ImgData.pkl', BATCH_SIZE, training_data_generator)

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  caption = caption.astype(np.int)


## 二. Conditional GAN Model
### 1. Text Encoder
A RNN encoder that captures the meaning of input text.
1. Input: text, which is a list of ids.
2. Output: embedding, or hidden representation of input text.

In [13]:
class TextEncoder(tf.keras.Model):
    """
    Encode text (a caption) into hidden representation
    input: text, which is a list of ids
    output: embedding, or hidden representation of input text in dimension of RNN_HIDDEN_SIZE
    """
    def __init__(self, hparas):
        super(TextEncoder, self).__init__()
        self.hparas = hparas
        self.batch_size = self.hparas['BATCH_SIZE']
        
        # embedding with tensorflow API
        self.embedding = layers.Embedding(self.hparas['VOCAB_SIZE'], self.hparas['EMBED_DIM'])
        # RNN, here we use GRU cell, another common RNN cell similar to LSTM
        self.gru = layers.GRU(self.hparas['RNN_HIDDEN_SIZE'],
                              return_sequences=True,
                              return_state=True,
                              recurrent_initializer='glorot_uniform')
    
    def call(self, text, hidden):
        text = self.embedding(text)
        output, state = self.gru(text, initial_state = hidden)
        return output[:, -1, :], state
    
    def initialize_hidden_state(self):
        return tf.zeros((self.hparas['BATCH_SIZE'], self.hparas['RNN_HIDDEN_SIZE']))

### 2. Generator
A image generator which generates the target image illustrating the input text.

1. Input: hidden representation of input text and random noise z with random seed.
2. Output: target image, which is conditioned on the given text, in size 64x64x3.

In [14]:
class Generator(tf.keras.Model):
    """
    Generate fake image based on given text(hidden representation) and noise z
    input: text and noise
    output: fake image with size 64*64*3
    """
    def __init__(self, hparas):
        super(Generator, self).__init__()
        self.hparas = hparas
        self.flatten1 = tf.keras.layers.Flatten()
        self.d1 = tf.keras.layers.Dense(self.hparas['DENSE_DIM'])
        self.d2 = tf.keras.layers.Dense(64*3)
        self.resnet101 = ResNet101(include_top=False, 
                                   weights='imagenet', 
                                   input_tensor=None, 
                                   input_shape=INPUT_SHAPE)
        self.flatten2 = tf.keras.layers.Flatten()
        self.d3 = tf.keras.layers.Dense(64*64*3)
        
    def call(self, text, noise_z):
        text = self.flatten1(text)
        text = self.d1(text)
        text = tf.nn.leaky_relu(text)
        
        # concatenate input text and random noise
        text_concat = tf.concat([noise_z, text], axis=1)
        text_concat = self.d2(text_concat)
        text_concat = tf.reshape(text_concat, [-1, 64, 64, 3])
        text_concat = self.resnet101(text_concat)
        text_concat = self.flatten1(text_concat)
        text_concat = self.d3(text_concat)
        logits = tf.reshape(text_concat, [-1, 64, 64, 3])
        output = tf.nn.tanh(logits)
        
        return logits, output

### 3. Discriminator
A binary classifier which can discriminate the real and fake image:

1. Real image

    Input: real image and the paired text
    
    Output: a floating number representing the result, which is expected to be 1.
    
2. Fake Image

    Input: generated image and paired text
    
    Output: a floating number representing the result, which is expected to be 0.

In [15]:
class Discriminator(tf.keras.Model):
    """
    Differentiate the real and fake image
    input: image and corresponding text
    output: labels, the real image should be 1, while the fake should be 0
    """
    def __init__(self, hparas):
        super(Discriminator, self).__init__()
        self.hparas = hparas
        
        # text
        self.flatten_text = tf.keras.layers.Flatten()
        self.d_text = tf.keras.layers.Dense(self.hparas['DENSE_DIM'])
        
        # image
        self.resnet50 = ResNet50(include_top=False, 
                                   weights='imagenet', 
                                   input_tensor=None, 
                                   input_shape=INPUT_SHAPE)
        self.flatten_img = tf.keras.layers.Flatten()
        self.d_img = tf.keras.layers.Dense(self.hparas['DENSE_DIM'])
        
        # concat
        self.d = tf.keras.layers.Dense(1)
    
    def call(self, img, text):
        text = self.flatten_text(text)
        text = self.d_text(text)
        text = tf.nn.leaky_relu(text)
        
        img = self.resnet50(img)
        img = self.flatten_img(img)
        img = self.d_img(img)
        img = tf.nn.leaky_relu(img)
        
        # concatenate image with paired text
        img_text = tf.concat([text, img], axis=0)
        
        logits = self.d(img_text)
        output = tf.nn.sigmoid(logits)
        
        return logits, output

### 4. 組裝Conditional GAN

In [16]:
hparas['N_SAMPLE'] = num_training_sample # size of training data
hparas['VOCAB_SIZE'] = len(word2Id_dict) # size of dictionary of captions

In [17]:
text_encoder = TextEncoder(hparas)
generator = Generator(hparas)
discriminator = Discriminator(hparas)

## 三. Conditional GAN的訓練設定
### 1. Loss Function
This method returns a helper function to compute cross entropy loss

In [18]:
cross_entropy = tf.keras.losses.BinaryCrossentropy(from_logits=True)

In [19]:
def discriminator_loss(real_logits, fake_logits):
    # output value of real image should be 1
    real_loss = cross_entropy(tf.ones_like(real_logits), real_logits)
    # output value of fake image should be 0
    fake_loss = cross_entropy(tf.zeros_like(fake_logits), fake_logits)
    total_loss = real_loss + fake_loss
    return total_loss

def generator_loss(fake_output):
    # output value of fake image should be 0
    return cross_entropy(tf.ones_like(fake_output), fake_output)

### 2. Optimization
we use seperated optimizers for training generator and discriminator

In [20]:
generator_optimizer = tf.keras.optimizers.Adam(hparas['LR'])
discriminator_optimizer = tf.keras.optimizers.Adam(hparas['LR'])

### 3. checkpoint
one benefit of tf.train.Checkpoint() API is we can save everything seperately

In [21]:
checkpoint_dir = hparas['CHECKPOINTS_DIR']
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")
checkpoint = tf.train.Checkpoint(generator_optimizer=generator_optimizer,
                                 discriminator_optimizer=discriminator_optimizer,
                                 text_encoder=text_encoder,
                                 generator=generator,
                                 discriminator=discriminator)

### 4. 定義訓練函式

In [22]:
@tf.function
def train_step(real_image, caption, hidden):
    # random noise for generator
    noise = tf.random.normal(shape=[hparas['BATCH_SIZE'], hparas['Z_DIM']], mean=0.0, stddev=1.0)
    
    with tf.GradientTape() as gen_tape, tf.GradientTape() as disc_tape:
        text_embed, hidden = text_encoder(caption, hidden)
        _, fake_image = generator(text_embed, noise)
        real_logits, real_output = discriminator(real_image, text_embed)
        fake_logits, fake_output = discriminator(fake_image, text_embed)

        g_loss = generator_loss(fake_logits)
        d_loss = discriminator_loss(real_logits, fake_logits)

    grad_g = gen_tape.gradient(g_loss, generator.trainable_variables)
    grad_d = disc_tape.gradient(d_loss, discriminator.trainable_variables)

    generator_optimizer.apply_gradients(zip(grad_g, generator.trainable_variables))
    discriminator_optimizer.apply_gradients(zip(grad_d, discriminator.trainable_variables))
    
    return g_loss, d_loss

In [23]:
@tf.function
def test_step(caption, noise, hidden):
    text_embed, hidden = text_encoder(caption, hidden)
    _, fake_image = generator(text_embed, noise)
    return fake_image

## 五. 訓練Conditional GAN
### 1. 視覺化訓練過程

In [24]:
def merge(images, size):
    h, w = images.shape[1], images.shape[2]
    img = np.zeros((h * size[0], w * size[1], 3))
    for idx, image in enumerate(images):
        i = idx % size[1]
        j = idx // size[1]
        img[j*h:j*h+h, i*w:i*w+w, :] = image
    return img

def imsave(images, size, path):
    # getting the pixel values between [0, 1] to save it
    return plt.imsave(path, merge(images, size)*0.5 + 0.5)

def save_images(images, size, image_path):
    return imsave(images, size, image_path)

In [25]:
def sample_generator(caption, batch_size):
    caption = np.asarray(caption)
    caption = caption.astype(np.int)
    dataset = tf.data.Dataset.from_tensor_slices(caption)
    dataset = dataset.batch(batch_size)
    return dataset

### 2. Random seed設定
We always use same random seed and same senteces during training, which is more convenient for us to evaluate the quality of generated image.

In [26]:
ni = int(np.ceil(np.sqrt(hparas['BATCH_SIZE'])))
sample_size = hparas['BATCH_SIZE']
sample_seed = np.random.normal(loc=0.0, scale=1.0, size=(sample_size, hparas['Z_DIM'])).astype(np.float32)
sample_sentence = ["the flower shown has yellow anther red pistil and bright red petals."] * int(sample_size/ni) + \
                  ["this flower has petals that are yellow, white and purple and has dark lines"] * int(sample_size/ni) + \
                  ["the petals on this flower are white with a yellow center"] * int(sample_size/ni) + \
                  ["this flower has a lot of small round pink petals."] * int(sample_size/ni) + \
                  ["this flower is orange in color, and has petals that are ruffled and rounded."] * int(sample_size/ni) + \
                  ["the flower has yellow petals and the center of it is brown."] * int(sample_size/ni) + \
                  ["this flower has petals that are blue and white."] * int(sample_size/ni) +\
                  ["these white flowers have petals that start off white in color and end in a white towards the tips."] * int(sample_size/ni)

for i, sent in enumerate(sample_sentence):
    sample_sentence[i] = sent2IdList(sent)
sample_sentence = sample_generator(sample_sentence, hparas['BATCH_SIZE'])

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  caption = caption.astype(np.int)


### 3. 開始訓練

In [27]:
def train(dataset, epochs):
    hidden = text_encoder.initialize_hidden_state() # hidden state of RNN
    steps_per_epoch = int(hparas['N_SAMPLE']/hparas['BATCH_SIZE'])
    
    for epoch in tqdm(range(hparas['N_EPOCH'])):
        g_total_loss = 0
        d_total_loss = 0
        start = time.time()
        
        for image, caption in dataset:
            g_loss, d_loss = train_step(image, caption, hidden)
            g_total_loss += g_loss
            d_total_loss += d_loss
            
        time_tuple = time.localtime()
        time_string = time.strftime("%m/%d/%Y, %H:%M:%S", time_tuple)
            
        print("Epoch {}, gen_loss: {:.4f}, disc_loss: {:.4f}".format(epoch+1,
                                                                     g_total_loss/steps_per_epoch,
                                                                     d_total_loss/steps_per_epoch))
        print('Time for epoch {} is {:.4f} sec'.format(epoch+1, time.time()-start))
        
        # save the model
        if (epoch + 1) % 50 == 0:
            checkpoint.save(file_prefix = checkpoint_prefix)
        
        # visualization
        if (epoch + 1) % hparas['PRINT_FREQ'] == 0:
            for caption in sample_sentence:
                fake_image = test_step(caption, sample_seed, hidden)
            save_images(fake_image, [ni, ni], 'samples/demo/train_{:02d}.jpg'.format(epoch))

In [28]:
train(dataset, hparas['N_EPOCH'])

  0%|          | 0/600 [00:00<?, ?it/s]

2021-12-28 15:02:00.926975: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:176] None of the MLIR Optimization Passes are enabled (registered 2)
2021-12-28 15:02:00.948462: I tensorflow/core/platform/profile_utils/cpu_utils.cc:114] CPU Frequency: 2592000000 Hz
2021-12-28 15:02:35.707661: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudnn.so.8
2021-12-28 15:02:36.299309: I tensorflow/stream_executor/cuda/cuda_dnn.cc:359] Loaded cuDNN version 8100
2021-12-28 15:02:36.930362: E tensorflow/core/platform/default/subprocess.cc:304] Start cannot spawn child process: No such file or directory
2021-12-28 15:02:36.930388: W tensorflow/stream_executor/gpu/asm_compiler.cc:56] Couldn't invoke ptxas --version
2021-12-28 15:02:36.930836: E tensorflow/core/platform/default/subprocess.cc:304] Start cannot spawn child process: No such file or directory
2021-12-28 15:02:36.930869: W tensorflow/stream_executor/gpu/redzone_allocator.cc:31

Epoch 1, gen_loss: 0.6710, disc_loss: 1.1220
Time for epoch 1 is 52.6010 sec
Epoch 2, gen_loss: 0.7054, disc_loss: 1.0854
Time for epoch 2 is 18.7803 sec
Epoch 3, gen_loss: 0.7440, disc_loss: 1.0745
Time for epoch 3 is 19.0732 sec
Epoch 4, gen_loss: 0.7836, disc_loss: 1.0524
Time for epoch 4 is 19.1270 sec
Epoch 5, gen_loss: 0.8196, disc_loss: 1.0479
Time for epoch 5 is 18.7572 sec
Epoch 6, gen_loss: 0.8529, disc_loss: 1.0427
Time for epoch 6 is 18.7547 sec
Epoch 7, gen_loss: 0.8847, disc_loss: 1.0371
Time for epoch 7 is 18.8012 sec
Epoch 8, gen_loss: 0.9145, disc_loss: 1.0278
Time for epoch 8 is 18.7411 sec
Epoch 9, gen_loss: 0.9424, disc_loss: 1.0214
Time for epoch 9 is 18.7653 sec
Epoch 10, gen_loss: 0.9651, disc_loss: 1.0260
Time for epoch 10 is 18.8766 sec
Epoch 11, gen_loss: 0.9853, disc_loss: 1.0249
Time for epoch 11 is 18.8212 sec
Epoch 12, gen_loss: 1.0025, disc_loss: 1.0153
Time for epoch 12 is 18.9284 sec
Epoch 13, gen_loss: 1.0176, disc_loss: 1.0202
Time for epoch 13 is 19.

Epoch 105, gen_loss: 1.0717, disc_loss: 1.0177
Time for epoch 105 is 18.6665 sec
Epoch 106, gen_loss: 1.0718, disc_loss: 1.0177
Time for epoch 106 is 18.6620 sec
Epoch 107, gen_loss: 1.0718, disc_loss: 1.0177
Time for epoch 107 is 18.6861 sec
Epoch 108, gen_loss: 1.0718, disc_loss: 1.0177
Time for epoch 108 is 18.6634 sec
Epoch 109, gen_loss: 1.0718, disc_loss: 1.0177
Time for epoch 109 is 18.6577 sec
Epoch 110, gen_loss: 1.0719, disc_loss: 1.0177
Time for epoch 110 is 18.6651 sec
Epoch 111, gen_loss: 1.0719, disc_loss: 1.0177
Time for epoch 111 is 18.6892 sec
Epoch 112, gen_loss: 1.0719, disc_loss: 1.0177
Time for epoch 112 is 18.6674 sec
Epoch 113, gen_loss: 1.0719, disc_loss: 1.0176
Time for epoch 113 is 18.6615 sec
Epoch 114, gen_loss: 1.0719, disc_loss: 1.0176
Time for epoch 114 is 18.6510 sec
Epoch 115, gen_loss: 1.0719, disc_loss: 1.0178
Time for epoch 115 is 18.6606 sec
Epoch 116, gen_loss: 1.0719, disc_loss: 1.0178
Time for epoch 116 is 18.6503 sec
Epoch 117, gen_loss: 1.0720,

Epoch 207, gen_loss: 1.1898, disc_loss: 0.9485
Time for epoch 207 is 18.6741 sec
Epoch 208, gen_loss: 1.2089, disc_loss: 0.9485
Time for epoch 208 is 18.6773 sec
Epoch 209, gen_loss: 1.2200, disc_loss: 0.9485
Time for epoch 209 is 18.6612 sec
Epoch 210, gen_loss: 1.2296, disc_loss: 0.9485
Time for epoch 210 is 18.6769 sec
Epoch 211, gen_loss: 1.2362, disc_loss: 0.9485
Time for epoch 211 is 18.6774 sec
Epoch 212, gen_loss: 1.2428, disc_loss: 0.9485
Time for epoch 212 is 18.6920 sec
Epoch 213, gen_loss: 1.2475, disc_loss: 0.9485
Time for epoch 213 is 18.6887 sec
Epoch 214, gen_loss: 1.2531, disc_loss: 0.9485
Time for epoch 214 is 18.6639 sec
Epoch 215, gen_loss: 1.2588, disc_loss: 0.9485
Time for epoch 215 is 18.6477 sec
Epoch 216, gen_loss: 1.2601, disc_loss: 0.9485
Time for epoch 216 is 18.6773 sec
Epoch 217, gen_loss: 1.2654, disc_loss: 0.9485
Time for epoch 217 is 18.6707 sec
Epoch 218, gen_loss: 1.2666, disc_loss: 0.9485
Time for epoch 218 is 18.6773 sec
Epoch 219, gen_loss: 1.2719,

Epoch 309, gen_loss: 1.3495, disc_loss: 0.9485
Time for epoch 309 is 18.6760 sec
Epoch 310, gen_loss: 1.3491, disc_loss: 0.9485
Time for epoch 310 is 18.6637 sec
Epoch 311, gen_loss: 1.3508, disc_loss: 0.9485
Time for epoch 311 is 18.6761 sec
Epoch 312, gen_loss: 1.3512, disc_loss: 0.9485
Time for epoch 312 is 18.6581 sec
Epoch 313, gen_loss: 1.3512, disc_loss: 0.9485
Time for epoch 313 is 18.6548 sec
Epoch 314, gen_loss: 1.3520, disc_loss: 0.9485
Time for epoch 314 is 18.6558 sec
Epoch 315, gen_loss: 1.3573, disc_loss: 0.9485
Time for epoch 315 is 18.6591 sec
Epoch 316, gen_loss: 1.3518, disc_loss: 0.9485
Time for epoch 316 is 18.6586 sec
Epoch 317, gen_loss: 1.3508, disc_loss: 0.9485
Time for epoch 317 is 18.6463 sec
Epoch 318, gen_loss: 1.3542, disc_loss: 0.9485
Time for epoch 318 is 18.6523 sec
Epoch 319, gen_loss: 1.3542, disc_loss: 0.9485
Time for epoch 319 is 18.6554 sec
Epoch 320, gen_loss: 1.3575, disc_loss: 0.9485
Time for epoch 320 is 18.6542 sec
Epoch 321, gen_loss: 1.3534,

Epoch 411, gen_loss: 1.3218, disc_loss: 0.9485
Time for epoch 411 is 18.6599 sec
Epoch 412, gen_loss: 1.3227, disc_loss: 0.9485
Time for epoch 412 is 18.6713 sec
Epoch 413, gen_loss: 1.3236, disc_loss: 0.9485
Time for epoch 413 is 18.6664 sec
Epoch 414, gen_loss: 1.3244, disc_loss: 0.9485
Time for epoch 414 is 18.6539 sec
Epoch 415, gen_loss: 1.3251, disc_loss: 0.9485
Time for epoch 415 is 18.6573 sec
Epoch 416, gen_loss: 1.3260, disc_loss: 0.9485
Time for epoch 416 is 18.6596 sec
Epoch 417, gen_loss: 1.3264, disc_loss: 0.9485
Time for epoch 417 is 18.6544 sec
Epoch 418, gen_loss: 1.3274, disc_loss: 0.9485
Time for epoch 418 is 18.6438 sec
Epoch 419, gen_loss: 1.3283, disc_loss: 0.9485
Time for epoch 419 is 18.6624 sec
Epoch 420, gen_loss: 1.3286, disc_loss: 0.9485
Time for epoch 420 is 18.6603 sec
Epoch 421, gen_loss: 1.3293, disc_loss: 0.9485
Time for epoch 421 is 18.6752 sec
Epoch 422, gen_loss: 1.3302, disc_loss: 0.9485
Time for epoch 422 is 18.6648 sec
Epoch 423, gen_loss: 1.3306,

Epoch 513, gen_loss: 1.3740, disc_loss: 0.9485
Time for epoch 513 is 18.6439 sec
Epoch 514, gen_loss: 1.3723, disc_loss: 0.9485
Time for epoch 514 is 18.6638 sec
Epoch 515, gen_loss: 1.3755, disc_loss: 0.9485
Time for epoch 515 is 18.9003 sec
Epoch 516, gen_loss: 1.3738, disc_loss: 0.9485
Time for epoch 516 is 18.9765 sec
Epoch 517, gen_loss: 1.3762, disc_loss: 0.9485
Time for epoch 517 is 19.0320 sec
Epoch 518, gen_loss: 1.3755, disc_loss: 0.9485
Time for epoch 518 is 18.6926 sec
Epoch 519, gen_loss: 1.3745, disc_loss: 0.9485
Time for epoch 519 is 18.9789 sec
Epoch 520, gen_loss: 1.3769, disc_loss: 0.9485
Time for epoch 520 is 18.8431 sec
Epoch 521, gen_loss: 1.3755, disc_loss: 0.9485
Time for epoch 521 is 19.4866 sec
Epoch 522, gen_loss: 1.3791, disc_loss: 0.9485
Time for epoch 522 is 18.6836 sec
Epoch 523, gen_loss: 1.3767, disc_loss: 0.9485
Time for epoch 523 is 18.8783 sec
Epoch 524, gen_loss: 1.3782, disc_loss: 0.9485
Time for epoch 524 is 20.2625 sec
Epoch 525, gen_loss: 1.3775,

## 六. Evaluation
dataset/testData.pkl is a pandas dataframe containing testing text with attributes 'ID' and 'Captions'.
1. 'ID': text ID used to name generated image.
2. 'Captions': text used as condition to generate image.
For each captions, you need to generate inference_ID.png to evaluate quality of generated image. You must name the generated image in this format, otherwise we cannot evaluate your images.

### 1. Testing Dataset
If you change anything during preprocessing of training dataset, you must make sure same operations have be done in testing dataset.

In [86]:
def testing_data_generator(caption, index):
    caption = tf.cast(caption, tf.float32)
    return caption, index

def testing_dataset_generator(batch_size, data_generator):
    data = pd.read_pickle('./dataset/testData_cls_embedding.pkl')
    captions = data['Captions'].values
    caption = []
    for i in range(len(captions)):
        caption.append(captions[i])
    caption = np.asarray(caption)
    caption = caption.astype(np.int)
    index = data['ID'].values
    index = np.asarray(index)
    
    dataset = tf.data.Dataset.from_tensor_slices((caption, index))
    dataset = dataset.map(data_generator, num_parallel_calls=tf.data.experimental.AUTOTUNE)
    dataset = dataset.repeat().batch(batch_size)
    
    return dataset

In [87]:
testing_dataset = testing_dataset_generator(hparas['BATCH_SIZE'], testing_data_generator)

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  caption = caption.astype(np.int)


In [88]:
data = pd.read_pickle('./dataset/testData_cls_embedding.pkl')
captions = data['Captions'].values

NUM_TEST = len(captions)
EPOCH_TEST = int(NUM_TEST / hparas['BATCH_SIZE'])

### 2. Inferece

In [89]:
def inference(dataset):
    hidden = text_encoder.initialize_hidden_state()
    sample_size = hparas['BATCH_SIZE']
    sample_seed = np.random.normal(loc=0.0, scale=1.0, size=(sample_size, hparas['Z_DIM'])).astype(np.float32)
    
    step = 0
    start = time.time()
    for captions, idx in dataset:
        if step > EPOCH_TEST:
            break
        
        fake_image = test_step(captions, sample_seed, hidden)
        step += 1
        for i in range(hparas['BATCH_SIZE']):
            plt.imsave('./inference/demo/inference_{:04d}.jpg'.format(idx[i]), fake_image[i].numpy()*0.5 + 0.5)
            
    print('Time for inference is {:.4f} sec'.format(time.time()-start))

In [90]:
checkpoint.restore(checkpoint_dir + '/ckpt-12')

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x7f8ae8d6b250>

In [91]:
inference(testing_dataset)

ValueError: in user code:

    /tmp/ipykernel_362804/2145740001.py:3 test_step  *
        text_embed, hidden = text_encoder(caption, hidden)
    /tmp/ipykernel_362804/3772598852.py:22 call  *
        output, state = self.gru(text, initial_state = hidden)
    /home/benny/anaconda3/envs/tf/lib/python3.8/site-packages/tensorflow/python/keras/layers/recurrent.py:725 __call__  **
        return super(RNN, self).__call__(inputs, **kwargs)
    /home/benny/anaconda3/envs/tf/lib/python3.8/site-packages/tensorflow/python/keras/engine/base_layer.py:1013 __call__
        input_spec.assert_input_compatibility(self.input_spec, inputs, self.name)
    /home/benny/anaconda3/envs/tf/lib/python3.8/site-packages/tensorflow/python/keras/engine/input_spec.py:215 assert_input_compatibility
        raise ValueError('Input ' + str(input_index) + ' of layer ' +

    ValueError: Input 0 of layer gru is incompatible with the layer: expected ndim=3, found ndim=4. Full shape received: (64, 1, 20, 256)


### 3. 計算Inception Score & Cosine Similarity

In [None]:
! python ./testing/inception_score.py ./inference/demo ./score_team2.csv 39