# Cup03 Reverse Image Caption
110065508 李丞恩

In [1]:
IMAGE_HEIGHT = 64
IMAGE_WIDTH = 64
IMAGE_CHANNEL = 3
dictionary_path = 'dictionary'
data_path = 'dataset'
BATCH_SIZE = 64
hparas = {
    'MAX_SEQ_LENGTH': 20,                     # maximum sequence length
    'EMBED_DIM': 256,                         # word embedding dimension
    'RNN_HIDDEN_SIZE': 128,                   # number of RNN neurons
    'Z_DIM': 512,                             # random noise z dimension
    'DENSE_DIM': 128,                         # number of neurons in dense layer
    'IMAGE_SIZE': [64, 64, 3],                # render image size
    'BATCH_SIZE': 64,
    'LR': 1e-4,
    'LR_DECAY': 0.5,
    'BETA_1': 0.5,
    'N_EPOCH': 600,
    'CHECKPOINTS_DIR': './checkpoints/demo',  # checkpoint path
    'PRINT_FREQ': 1                           # printing frequency of loss
}

In [2]:
from __future__ import absolute_import, division, print_function, unicode_literals

import os
import re
import PIL
import time
import string
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf

from tensorflow.keras import layers
from pathlib import Path
from IPython import display
from tqdm.notebook import tqdm

2021-12-27 02:58:05.235919: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0


In [3]:
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' # disable warnings, info and errors 

In [4]:
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    try:
        # Restrict TensorFlow to only use the first GPU
        tf.config.experimental.set_visible_devices(gpus[0], 'GPU')

        # Currently, memory growth needs to be the same across GPUs
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
        logical_gpus = tf.config.experimental.list_logical_devices('GPU')
        print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
    except RuntimeError as e:
        # Memory growth must be set before GPUs have been initialized
        print(e)

1 Physical GPUs, 1 Logical GPUs


2021-12-27 02:58:05.923348: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcuda.so.1
2021-12-27 02:58:05.989037: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2021-12-27 02:58:05.991477: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1733] Found device 0 with properties: 
pciBusID: 0000:01:00.0 name: NVIDIA GeForce RTX 3060 computeCapability: 8.6
coreClock: 1.852GHz coreCount: 28 deviceMemorySize: 11.77GiB deviceMemoryBandwidth: 335.32GiB/s
2021-12-27 02:58:05.991570: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0
2021-12-27 02:58:05.997499: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcublas.so.11
2021-12-27 02:58:05.997599: I tensorflow/stream_executor/pl

In [5]:
if not os.path.exists('samples/demo'): os.makedirs('samples/demo')
if not os.path.exists('checkpoints/demo'): os.makedirs('checkpoints/demo')
if not os.path.exists('inference/demo'): os.makedirs('inference/demo')

## 一. 資料前處理
### 1. 文字前處理
感謝助教已經幫我們處理好了以下的部分：

1. Delete text over MAX_SEQ_LENGTH (20).
2. Delete all puntuation in the texts.
3. Encode each vocabulary in dictionary/vocab.npy.
4. Represent texts by a sequence of integer IDs.
5. Replace rare words by $<$RARE$>$ token to reduce vocabulary size for more efficient training.
6. Add padding as $<$PAD$>$ to each text to make sure all of them have equal length to MAX_SEQ_LENGTH (20).
    
There is no necessary to append $<$ST$>$ and $<$ED$>$ to each text because we don't need to generate any sequence in this task.
    
We can decode sequence vocabulary IDs by looking up the vocabulary dictionary:

1. dictionary/word2Id.npy is a numpy array mapping word to id.
2. dictionary/id2Word.npy is a numpy array mapping id back to word.

In [6]:
vocab = np.load(dictionary_path + '/vocab.npy')
print('there are {} vocabularies in total'.format(len(vocab)))

word2Id_dict = dict(np.load(dictionary_path + '/word2Id.npy'))
id2word_dict = dict(np.load(dictionary_path + '/id2Word.npy'))
print('Word to id mapping, for example: %s -> %s' % ('flower', word2Id_dict['flower']))
print('Id to word mapping, for example: %s -> %s' % ('1', id2word_dict['1']))
print('Tokens: <PAD>: %s; <RARE>: %s' % (word2Id_dict['<PAD>'], word2Id_dict['<RARE>']))

there are 5427 vocabularies in total
Word to id mapping, for example: flower -> 1
Id to word mapping, for example: 1 -> flower
Tokens: <PAD>: 5427; <RARE>: 5428


In [7]:
def sent2IdList(line, MAX_SEQ_LENGTH=20):
    MAX_SEQ_LIMIT = MAX_SEQ_LENGTH
    padding = 0
    
    # data preprocessing, remove all puntuation in the texts
    prep_line = re.sub('[%s]' % re.escape(string.punctuation), ' ', line.rstrip())
    prep_line = prep_line.replace('-', ' ')
    prep_line = prep_line.replace('-', ' ')
    prep_line = prep_line.replace('  ', ' ')
    prep_line = prep_line.replace('.', '')
    tokens = prep_line.split(' ')
    tokens = [
        tokens[i] for i in range(len(tokens))
        if tokens[i] != ' ' and tokens[i] != ''
    ]
    l = len(tokens)
    padding = MAX_SEQ_LIMIT - l
    
    # make sure length of each text is equal to MAX_SEQ_LENGTH, and replace the less common word with <RARE> token
    for i in range(padding):
        tokens.append('<PAD>')
    line = [
        word2Id_dict[tokens[k]]
        if tokens[k] in word2Id_dict else word2Id_dict['<RARE>']
        for k in range(len(tokens))
    ]

    return line

In [8]:
text = "the flower shown has yellow anther red pistil and bright red petals."
print(text)
print(sent2IdList(text))

the flower shown has yellow anther red pistil and bright red petals.
['9', '1', '82', '5', '11', '70', '20', '31', '3', '29', '20', '2', '5427', '5427', '5427', '5427', '5427', '5427', '5427', '5427']


### 2. 將圖片與文字對應

In [9]:
df = pd.read_pickle(data_path + '/text2ImgData.pkl')
num_training_sample = len(df)
n_images_train = num_training_sample
print('There are %d image in training data' % (n_images_train))

There are 7370 image in training data


In [10]:
df.head(5)

Unnamed: 0_level_0,Captions,ImagePath
ID,Unnamed: 1_level_1,Unnamed: 2_level_1
6734,"[[9, 2, 17, 9, 1, 6, 14, 13, 18, 3, 41, 8, 11,...",./102flowers/image_06734.jpg
6736,"[[4, 1, 5, 12, 2, 3, 11, 31, 28, 68, 106, 132,...",./102flowers/image_06736.jpg
6737,"[[9, 2, 27, 4, 1, 6, 14, 7, 12, 19, 5427, 5427...",./102flowers/image_06737.jpg
6738,"[[9, 1, 5, 8, 54, 16, 38, 7, 12, 116, 325, 3, ...",./102flowers/image_06738.jpg
6739,"[[4, 12, 1, 5, 29, 11, 19, 7, 26, 70, 5427, 54...",./102flowers/image_06739.jpg


### 3. 生成dataset

In [11]:
# in this competition, you have to generate image in size 64x64x3

def training_data_generator(caption, image_path):
    # load in the image according to image path
    img = tf.io.read_file(image_path)
    img = tf.image.decode_image(img, channels=3)
    img = tf.image.convert_image_dtype(img, tf.float32)
    img.set_shape([None, None, 3])
    img = tf.image.resize(img, size=[IMAGE_HEIGHT, IMAGE_WIDTH])
    img.set_shape([IMAGE_HEIGHT, IMAGE_WIDTH, IMAGE_CHANNEL])
    caption = tf.cast(caption, tf.int32)

    return img, caption

def dataset_generator(filenames, batch_size, data_generator):
    # load the training data into two NumPy arrays
    df = pd.read_pickle(filenames)
    captions = df['Captions'].values
    caption = []
    # each image has 1 to 10 corresponding captions
    # we choose one of them randomly for training
    for i in range(len(captions)):
        caption.append(random.choice(captions[i]))
    caption = np.asarray(caption)
    caption = caption.astype(np.int)
    image_path = df['ImagePath'].values
    
    # assume that each row of `features` corresponds to the same row as `labels`.
    assert caption.shape[0] == image_path.shape[0]
    
    dataset = tf.data.Dataset.from_tensor_slices((caption, image_path))
    dataset = dataset.map(data_generator, num_parallel_calls=tf.data.experimental.AUTOTUNE)
    dataset = dataset.shuffle(len(caption)).batch(batch_size, drop_remainder=True)
    dataset = dataset.prefetch(buffer_size=tf.data.experimental.AUTOTUNE)

    return dataset

In [12]:
dataset = dataset_generator(data_path + '/text2ImgData.pkl', BATCH_SIZE, training_data_generator)

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  caption = caption.astype(np.int)


## 二. Conditional GAN Model
### 1. Text Encoder
A RNN encoder that captures the meaning of input text.
1. Input: text, which is a list of ids.
2. Output: embedding, or hidden representation of input text.

In [13]:
class TextEncoder(tf.keras.Model):
    """
    Encode text (a caption) into hidden representation
    input: text, which is a list of ids
    output: embedding, or hidden representation of input text in dimension of RNN_HIDDEN_SIZE
    """
    def __init__(self, hparas):
        super(TextEncoder, self).__init__()
        self.hparas = hparas
        self.batch_size = self.hparas['BATCH_SIZE']
        
        # embedding with tensorflow API
        self.embedding = layers.Embedding(self.hparas['VOCAB_SIZE'], self.hparas['EMBED_DIM'])
        # RNN, here we use GRU cell, another common RNN cell similar to LSTM
        self.gru = layers.GRU(self.hparas['RNN_HIDDEN_SIZE'],
                              return_sequences=True,
                              return_state=True,
                              recurrent_initializer='glorot_uniform')
    
    def call(self, text, hidden):
        text = self.embedding(text)
        output, state = self.gru(text, initial_state = hidden)
        return output[:, -1, :], state
    
    def initialize_hidden_state(self):
        return tf.zeros((self.hparas['BATCH_SIZE'], self.hparas['RNN_HIDDEN_SIZE']))

### 2. Generator
A image generator which generates the target image illustrating the input text.

1. Input: hidden representation of input text and random noise z with random seed.
2. Output: target image, which is conditioned on the given text, in size 64x64x3.

In [14]:
class Generator(tf.keras.Model):
    """
    Generate fake image based on given text(hidden representation) and noise z
    input: text and noise
    output: fake image with size 64*64*3
    """
    def __init__(self, hparas):
        super(Generator, self).__init__()
        self.hparas = hparas
        self.flatten = tf.keras.layers.Flatten()
        self.d1 = tf.keras.layers.Dense(self.hparas['DENSE_DIM'])
        self.d2 = tf.keras.layers.Dense(64*64*3)
        
    def call(self, text, noise_z):
        text = self.flatten(text)
        text = self.d1(text)
        text = tf.nn.leaky_relu(text)
        
        # concatenate input text and random noise
        text_concat = tf.concat([noise_z, text], axis=1)
        text_concat = self.d2(text_concat)
        
        logits = tf.reshape(text_concat, [-1, 64, 64, 3])
        output = tf.nn.tanh(logits)
        
        return logits, output

### 3. Discriminator
A binary classifier which can discriminate the real and fake image:

1. Real image

    Input: real image and the paired text
    
    Output: a floating number representing the result, which is expected to be 1.
    
2. Fake Image

    Input: generated image and paired text
    
    Output: a floating number representing the result, which is expected to be 0.

In [15]:
class Discriminator(tf.keras.Model):
    """
    Differentiate the real and fake image
    input: image and corresponding text
    output: labels, the real image should be 1, while the fake should be 0
    """
    def __init__(self, hparas):
        super(Discriminator, self).__init__()
        self.hparas = hparas
        self.flatten = tf.keras.layers.Flatten()
        self.d_text = tf.keras.layers.Dense(self.hparas['DENSE_DIM'])
        self.d_img = tf.keras.layers.Dense(self.hparas['DENSE_DIM'])
        self.d = tf.keras.layers.Dense(1)
    
    def call(self, img, text):
        text = self.flatten(text)
        text = self.d_text(text)
        text = tf.nn.leaky_relu(text)
        
        img = self.flatten(img)
        img = self.d_img(img)
        img = tf.nn.leaky_relu(img)
        
        # concatenate image with paired text
        img_text = tf.concat([text, img], axis=1)
        
        logits = self.d(img_text)
        output = tf.nn.sigmoid(logits)
        
        return logits, output

### 4. 組裝Conditional GAN

In [16]:
hparas['N_SAMPLE'] = num_training_sample # size of training data
hparas['VOCAB_SIZE'] = len(word2Id_dict) # size of dictionary of captions

In [17]:
text_encoder = TextEncoder(hparas)
generator = Generator(hparas)
discriminator = Discriminator(hparas)

## 三. Conditional GAN的訓練設定
### 1. Loss Function

In [18]:
# This method returns a helper function to compute cross entropy loss
cross_entropy = tf.keras.losses.BinaryCrossentropy(from_logits=True)

In [19]:
def discriminator_loss(real_logits, fake_logits):
    # output value of real image should be 1
    real_loss = cross_entropy(tf.ones_like(real_logits), real_logits)
    # output value of fake image should be 0
    fake_loss = cross_entropy(tf.zeros_like(fake_logits), fake_logits)
    total_loss = real_loss + fake_loss
    return total_loss

def generator_loss(fake_output):
    # output value of fake image should be 0
    return cross_entropy(tf.ones_like(fake_output), fake_output)

### 2. Optimization

In [20]:
# we use seperated optimizers for training generator and discriminator
generator_optimizer = tf.keras.optimizers.Adam(hparas['LR'])
discriminator_optimizer = tf.keras.optimizers.Adam(hparas['LR'])

### 3. checkpoint

In [21]:
# one benefit of tf.train.Checkpoint() API is we can save everything seperately
checkpoint_dir = hparas['CHECKPOINTS_DIR']
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")
checkpoint = tf.train.Checkpoint(generator_optimizer=generator_optimizer,
                                 discriminator_optimizer=discriminator_optimizer,
                                 text_encoder=text_encoder,
                                 generator=generator,
                                 discriminator=discriminator)

### 4. 定義訓練函式

In [22]:
@tf.function
def train_step(real_image, caption, hidden):
    # random noise for generator
    noise = tf.random.normal(shape=[hparas['BATCH_SIZE'], hparas['Z_DIM']], mean=0.0, stddev=1.0)
    
    with tf.GradientTape() as gen_tape, tf.GradientTape() as disc_tape:
        text_embed, hidden = text_encoder(caption, hidden)
        _, fake_image = generator(text_embed, noise)
        real_logits, real_output = discriminator(real_image, text_embed)
        fake_logits, fake_output = discriminator(fake_image, text_embed)

        g_loss = generator_loss(fake_logits)
        d_loss = discriminator_loss(real_logits, fake_logits)

    grad_g = gen_tape.gradient(g_loss, generator.trainable_variables)
    grad_d = disc_tape.gradient(d_loss, discriminator.trainable_variables)

    generator_optimizer.apply_gradients(zip(grad_g, generator.trainable_variables))
    discriminator_optimizer.apply_gradients(zip(grad_d, discriminator.trainable_variables))
    
    return g_loss, d_loss

In [23]:
@tf.function
def test_step(caption, noise, hidden):
    text_embed, hidden = text_encoder(caption, hidden)
    _, fake_image = generator(text_embed, noise)
    return fake_image

## 五. 訓練Conditional GAN
### 1. 視覺化訓練過程

In [24]:
def merge(images, size):
    h, w = images.shape[1], images.shape[2]
    img = np.zeros((h * size[0], w * size[1], 3))
    for idx, image in enumerate(images):
        i = idx % size[1]
        j = idx // size[1]
        img[j*h:j*h+h, i*w:i*w+w, :] = image
    return img

def imsave(images, size, path):
    # getting the pixel values between [0, 1] to save it
    return plt.imsave(path, merge(images, size)*0.5 + 0.5)

def save_images(images, size, image_path):
    return imsave(images, size, image_path)

In [25]:
def sample_generator(caption, batch_size):
    caption = np.asarray(caption)
    caption = caption.astype(np.int)
    dataset = tf.data.Dataset.from_tensor_slices(caption)
    dataset = dataset.batch(batch_size)
    return dataset

### 2. Random seed設定
We always use same random seed and same senteces during training, which is more convenient for us to evaluate the quality of generated image.

In [26]:
ni = int(np.ceil(np.sqrt(hparas['BATCH_SIZE'])))
sample_size = hparas['BATCH_SIZE']
sample_seed = np.random.normal(loc=0.0, scale=1.0, size=(sample_size, hparas['Z_DIM'])).astype(np.float32)
sample_sentence = ["the flower shown has yellow anther red pistil and bright red petals."] * int(sample_size/ni) + \
                  ["this flower has petals that are yellow, white and purple and has dark lines"] * int(sample_size/ni) + \
                  ["the petals on this flower are white with a yellow center"] * int(sample_size/ni) + \
                  ["this flower has a lot of small round pink petals."] * int(sample_size/ni) + \
                  ["this flower is orange in color, and has petals that are ruffled and rounded."] * int(sample_size/ni) + \
                  ["the flower has yellow petals and the center of it is brown."] * int(sample_size/ni) + \
                  ["this flower has petals that are blue and white."] * int(sample_size/ni) +\
                  ["these white flowers have petals that start off white in color and end in a white towards the tips."] * int(sample_size/ni)

for i, sent in enumerate(sample_sentence):
    sample_sentence[i] = sent2IdList(sent)
sample_sentence = sample_generator(sample_sentence, hparas['BATCH_SIZE'])

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  caption = caption.astype(np.int)


### 3. 開始訓練

In [27]:
def train(dataset, epochs):
    # hidden state of RNN
    hidden = text_encoder.initialize_hidden_state()
    steps_per_epoch = int(hparas['N_SAMPLE']/hparas['BATCH_SIZE'])
    
    for epoch in tqdm(range(hparas['N_EPOCH'])):
        g_total_loss = 0
        d_total_loss = 0
        start = time.time()
        
        for image, caption in dataset:
            g_loss, d_loss = train_step(image, caption, hidden)
            g_total_loss += g_loss
            d_total_loss += d_loss
            
        time_tuple = time.localtime()
        time_string = time.strftime("%m/%d/%Y, %H:%M:%S", time_tuple)
            
        print("Epoch {}, gen_loss: {:.4f}, disc_loss: {:.4f}".format(epoch+1,
                                                                     g_total_loss/steps_per_epoch,
                                                                     d_total_loss/steps_per_epoch))
        print('Time for epoch {} is {:.4f} sec'.format(epoch+1, time.time()-start))
        
        # save the model
        if (epoch + 1) % 50 == 0:
            checkpoint.save(file_prefix = checkpoint_prefix)
        
        # visualization
        if (epoch + 1) % hparas['PRINT_FREQ'] == 0:
            for caption in sample_sentence:
                fake_image = test_step(caption, sample_seed, hidden)
            save_images(fake_image, [ni, ni], 'samples/demo/train_{:02d}.jpg'.format(epoch))

In [28]:
train(dataset, hparas['N_EPOCH'])

  0%|          | 0/600 [00:00<?, ?it/s]

2021-12-27 02:58:06.731620: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:176] None of the MLIR Optimization Passes are enabled (registered 2)
2021-12-27 02:58:06.752419: I tensorflow/core/platform/profile_utils/cpu_utils.cc:114] CPU Frequency: 2592000000 Hz
2021-12-27 02:58:12.940912: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcublas.so.11
2021-12-27 02:58:13.337812: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcublasLt.so.11
2021-12-27 02:58:13.337900: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudnn.so.8
2021-12-27 02:58:13.710819: I tensorflow/stream_executor/cuda/cuda_dnn.cc:359] Loaded cuDNN version 8100
2021-12-27 02:58:13.710859: I tensorflow/stream_executor/cuda/cuda_blas.cc:1838] TensorFloat-32 will be used for the matrix multiplication. This will only be logged once.


Epoch 1, gen_loss: 0.4678, disc_loss: 1.1100
Time for epoch 1 is 7.4006 sec
Epoch 2, gen_loss: 0.5060, disc_loss: 1.0469
Time for epoch 2 is 5.0306 sec
Epoch 3, gen_loss: 0.7571, disc_loss: 0.8117
Time for epoch 3 is 5.1630 sec
Epoch 4, gen_loss: 1.1963, disc_loss: 0.4930
Time for epoch 4 is 5.1649 sec
Epoch 5, gen_loss: 2.2521, disc_loss: 0.1578
Time for epoch 5 is 5.1864 sec
Epoch 6, gen_loss: 1.9384, disc_loss: 0.2211
Time for epoch 6 is 5.1333 sec
Epoch 7, gen_loss: 2.5433, disc_loss: 0.1342
Time for epoch 7 is 5.3131 sec
Epoch 8, gen_loss: 2.3577, disc_loss: 0.1552
Time for epoch 8 is 5.4274 sec
Epoch 9, gen_loss: 3.1850, disc_loss: 0.0892
Time for epoch 9 is 5.3154 sec
Epoch 10, gen_loss: 3.1057, disc_loss: 0.1390
Time for epoch 10 is 5.1081 sec
Epoch 11, gen_loss: 3.4770, disc_loss: 0.1336
Time for epoch 11 is 5.1579 sec
Epoch 12, gen_loss: 3.7218, disc_loss: 0.1883
Time for epoch 12 is 5.0438 sec
Epoch 13, gen_loss: 4.0260, disc_loss: 0.2556
Time for epoch 13 is 5.3961 sec
Epoc

Epoch 107, gen_loss: 1.4558, disc_loss: 1.0464
Time for epoch 107 is 4.9673 sec
Epoch 108, gen_loss: 1.4158, disc_loss: 1.0438
Time for epoch 108 is 4.9975 sec
Epoch 109, gen_loss: 1.3926, disc_loss: 1.0301
Time for epoch 109 is 4.9654 sec
Epoch 110, gen_loss: 1.4968, disc_loss: 1.0713
Time for epoch 110 is 4.9287 sec
Epoch 111, gen_loss: 1.3933, disc_loss: 1.2435
Time for epoch 111 is 4.9561 sec
Epoch 112, gen_loss: 1.4527, disc_loss: 1.1467
Time for epoch 112 is 4.9467 sec
Epoch 113, gen_loss: 1.5974, disc_loss: 1.0696
Time for epoch 113 is 5.0123 sec
Epoch 114, gen_loss: 1.8481, disc_loss: 0.8638
Time for epoch 114 is 5.0143 sec
Epoch 115, gen_loss: 1.6346, disc_loss: 0.9511
Time for epoch 115 is 4.9887 sec
Epoch 116, gen_loss: 1.7582, disc_loss: 0.9248
Time for epoch 116 is 4.9512 sec
Epoch 117, gen_loss: 1.4196, disc_loss: 0.9810
Time for epoch 117 is 5.0082 sec
Epoch 118, gen_loss: 1.6109, disc_loss: 0.9084
Time for epoch 118 is 4.9614 sec
Epoch 119, gen_loss: 1.5652, disc_loss: 

Epoch 210, gen_loss: 1.5297, disc_loss: 1.1404
Time for epoch 210 is 4.9344 sec
Epoch 211, gen_loss: 1.5401, disc_loss: 1.1906
Time for epoch 211 is 4.9614 sec
Epoch 212, gen_loss: 1.3186, disc_loss: 1.2923
Time for epoch 212 is 4.9610 sec
Epoch 213, gen_loss: 1.5054, disc_loss: 1.1187
Time for epoch 213 is 5.0137 sec
Epoch 214, gen_loss: 1.7280, disc_loss: 0.9272
Time for epoch 214 is 4.9365 sec
Epoch 215, gen_loss: 1.6472, disc_loss: 0.9820
Time for epoch 215 is 4.9751 sec
Epoch 216, gen_loss: 1.3863, disc_loss: 1.1109
Time for epoch 216 is 4.9819 sec
Epoch 217, gen_loss: 1.5392, disc_loss: 1.0161
Time for epoch 217 is 4.9925 sec
Epoch 218, gen_loss: 1.4691, disc_loss: 1.1096
Time for epoch 218 is 5.0277 sec
Epoch 219, gen_loss: 1.6580, disc_loss: 1.0258
Time for epoch 219 is 4.9376 sec
Epoch 220, gen_loss: 1.2769, disc_loss: 1.2943
Time for epoch 220 is 4.9450 sec
Epoch 221, gen_loss: 1.4762, disc_loss: 1.2341
Time for epoch 221 is 4.9447 sec
Epoch 222, gen_loss: 1.4170, disc_loss: 

Epoch 313, gen_loss: 1.4395, disc_loss: 1.1189
Time for epoch 313 is 4.9361 sec
Epoch 314, gen_loss: 1.3608, disc_loss: 1.1512
Time for epoch 314 is 4.9621 sec
Epoch 315, gen_loss: 1.6078, disc_loss: 1.1044
Time for epoch 315 is 4.9682 sec
Epoch 316, gen_loss: 1.5013, disc_loss: 1.0811
Time for epoch 316 is 4.9617 sec
Epoch 317, gen_loss: 1.3875, disc_loss: 1.1801
Time for epoch 317 is 4.9541 sec
Epoch 318, gen_loss: 1.4613, disc_loss: 1.2004
Time for epoch 318 is 4.9816 sec
Epoch 319, gen_loss: 1.5308, disc_loss: 1.1873
Time for epoch 319 is 4.9866 sec
Epoch 320, gen_loss: 1.4365, disc_loss: 1.2727
Time for epoch 320 is 4.9661 sec
Epoch 321, gen_loss: 1.7174, disc_loss: 0.9988
Time for epoch 321 is 4.9615 sec
Epoch 322, gen_loss: 1.6531, disc_loss: 0.9866
Time for epoch 322 is 5.0130 sec
Epoch 323, gen_loss: 1.5073, disc_loss: 1.1024
Time for epoch 323 is 4.9721 sec
Epoch 324, gen_loss: 1.7713, disc_loss: 0.9322
Time for epoch 324 is 5.0046 sec
Epoch 325, gen_loss: 1.5278, disc_loss: 

Epoch 416, gen_loss: 1.7563, disc_loss: 1.0232
Time for epoch 416 is 5.0385 sec
Epoch 417, gen_loss: 1.6312, disc_loss: 1.1762
Time for epoch 417 is 4.9010 sec
Epoch 418, gen_loss: 1.7363, disc_loss: 0.9886
Time for epoch 418 is 4.9259 sec
Epoch 419, gen_loss: 1.7442, disc_loss: 0.9431
Time for epoch 419 is 4.9625 sec
Epoch 420, gen_loss: 1.4303, disc_loss: 1.1677
Time for epoch 420 is 4.9695 sec
Epoch 421, gen_loss: 1.7740, disc_loss: 0.9579
Time for epoch 421 is 4.9896 sec
Epoch 422, gen_loss: 1.3537, disc_loss: 1.3060
Time for epoch 422 is 4.9346 sec
Epoch 423, gen_loss: 1.5435, disc_loss: 1.1540
Time for epoch 423 is 4.9403 sec
Epoch 424, gen_loss: 1.5055, disc_loss: 1.1194
Time for epoch 424 is 4.9615 sec
Epoch 425, gen_loss: 1.7622, disc_loss: 0.9168
Time for epoch 425 is 4.9292 sec
Epoch 426, gen_loss: 1.7482, disc_loss: 1.0887
Time for epoch 426 is 4.9626 sec
Epoch 427, gen_loss: 1.4484, disc_loss: 1.2432
Time for epoch 427 is 4.9235 sec
Epoch 428, gen_loss: 1.4851, disc_loss: 

Epoch 519, gen_loss: 1.5291, disc_loss: 1.2143
Time for epoch 519 is 4.9993 sec
Epoch 520, gen_loss: 1.3527, disc_loss: 1.4120
Time for epoch 520 is 4.9357 sec
Epoch 521, gen_loss: 1.5583, disc_loss: 1.2439
Time for epoch 521 is 4.9489 sec
Epoch 522, gen_loss: 1.4994, disc_loss: 1.3077
Time for epoch 522 is 4.9849 sec
Epoch 523, gen_loss: 1.8440, disc_loss: 1.0631
Time for epoch 523 is 5.0292 sec
Epoch 524, gen_loss: 1.4133, disc_loss: 1.3971
Time for epoch 524 is 5.0001 sec
Epoch 525, gen_loss: 1.5394, disc_loss: 1.2859
Time for epoch 525 is 4.9663 sec
Epoch 526, gen_loss: 2.0118, disc_loss: 0.9043
Time for epoch 526 is 4.9396 sec
Epoch 527, gen_loss: 1.8918, disc_loss: 1.0981
Time for epoch 527 is 4.9509 sec
Epoch 528, gen_loss: 1.9998, disc_loss: 0.8640
Time for epoch 528 is 4.9836 sec
Epoch 529, gen_loss: 1.7716, disc_loss: 0.9171
Time for epoch 529 is 4.9079 sec
Epoch 530, gen_loss: 2.1589, disc_loss: 0.7477
Time for epoch 530 is 4.9842 sec
Epoch 531, gen_loss: 2.1152, disc_loss: 

## 六. Evaluation
dataset/testData.pkl is a pandas dataframe containing testing text with attributes 'ID' and 'Captions'.
1. 'ID': text ID used to name generated image.
2. 'Captions': text used as condition to generate image.
For each captions, you need to generate inference_ID.png to evaluate quality of generated image. You must name the generated image in this format, otherwise we cannot evaluate your images.

### 1. Testing Dataset
If you change anything during preprocessing of training dataset, you must make sure same operations have be done in testing dataset.

In [29]:
def testing_data_generator(caption, index):
    caption = tf.cast(caption, tf.float32)
    return caption, index

def testing_dataset_generator(batch_size, data_generator):
    data = pd.read_pickle('./dataset/testData.pkl')
    captions = data['Captions'].values
    caption = []
    for i in range(len(captions)):
        caption.append(captions[i])
    caption = np.asarray(caption)
    caption = caption.astype(np.int)
    index = data['ID'].values
    index = np.asarray(index)
    
    dataset = tf.data.Dataset.from_tensor_slices((caption, index))
    dataset = dataset.map(data_generator, num_parallel_calls=tf.data.experimental.AUTOTUNE)
    dataset = dataset.repeat().batch(batch_size)
    
    return dataset

In [30]:
testing_dataset = testing_dataset_generator(hparas['BATCH_SIZE'], testing_data_generator)

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  caption = caption.astype(np.int)


In [31]:
data = pd.read_pickle('./dataset/testData.pkl')
captions = data['Captions'].values

NUM_TEST = len(captions)
EPOCH_TEST = int(NUM_TEST / hparas['BATCH_SIZE'])

### 2. Inferece

In [32]:
def inference(dataset):
    hidden = text_encoder.initialize_hidden_state()
    sample_size = hparas['BATCH_SIZE']
    sample_seed = np.random.normal(loc=0.0, scale=1.0, size=(sample_size, hparas['Z_DIM'])).astype(np.float32)
    
    step = 0
    start = time.time()
    for captions, idx in dataset:
        if step > EPOCH_TEST:
            break
        
        fake_image = test_step(captions, sample_seed, hidden)
        step += 1
        for i in range(hparas['BATCH_SIZE']):
            plt.imsave('./inference/demo/inference_{:04d}.jpg'.format(idx[i]), fake_image[i].numpy()*0.5 + 0.5)
            
    print('Time for inference is {:.4f} sec'.format(time.time()-start))

In [33]:
checkpoint.restore(checkpoint_dir + '/ckpt-1')

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x7faaaaf58d60>

In [34]:
inference(testing_dataset)

Time for inference is 0.6415 sec


### 3. 計算Inception Score & Cosine Similarity

In [39]:
! python ./testing/inception_score.py ./inference/demo ./score_demo.csv 39

1 Physical GPUs, 1 Logical GPUs
--------------Evaluation Success-----------------
