In [None]:
# Mount Google Drive at /content/drive; the %cd cell further down points into this mount.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# We uploaded our Jupyter notebook and data (images, cached features, and captions) onto Google Colaboratory.
# There are a few things that must be taken care of.
# First, we must unzip the folder containing our images.
# In order to reduce the run time, we use the validation set as our training set.
# !unzip '/content/drive/MyDrive/Image_Captioning/val.zip' -d '/content/drive/My Drive/Image_Captioning/data'

# Note that you will have to switch out these 2 ending paths: /MyDrive/Image_Captioning/val.zip and /MyDrive/Image_Captioning/data
# in order to accommodate the location of your files.

In [None]:
import tensorflow as tf
import matplotlib.pyplot as plt
import collections
import random
import numpy as np
import os
import time
import json
from PIL import Image
from tqdm import tqdm
import pandas as pd
import glob

In [None]:
# We change our current directory, so that os.path.abspath('.') generates the correct path to our dataset.
%cd /content/drive/MyDrive/Image_Captioning

In [None]:
# Listing the paths and filenames of interest, keyed by dataset name.
# Both values start with '/' because they are appended directly to os.path.abspath('.').
annotations_path = {'val':'/val_cleaned.json'}
images_path = {'val':'/data/val/'}

# The role of captions_and_images is to generate a 2-tuple, where
# the first entry houses the captions, and
# the second entry houses the associated image paths.

# We shall do this again once we have processed the images according to the requirements of ResNet-50 and
# once we have tokenized and padded our captions.

# As of now, the validation set contains 6,982 images and 32,299 captions.
# We take 80% of those examples for the training phase, and the rest for the validation phase.
# Note that we did train on the entire training dataset (21,220 images and 98,015 captions) for a single epoch.
# This took almost 4.5 hours to complete

# We will uncover shortly that the function below must be slightly altered in order
# to accommodate the nuances featured in our dataset.

def captions_and_images_testing (dataset):
    """Collect every caption of ``dataset`` together with the path of its image.

    The annotation file named by ``annotations_path[dataset]`` is read relative
    to the current working directory. Each caption is wrapped in the
    ``<start>`` / ``<end>`` markers.

    Returns:
        (captions, img_name_vector): two parallel lists. Captions belonging to
        the same image are adjacent, and the image path is repeated once per
        caption so that both lists have the same length.
    """
    project_root = os.path.abspath('.')

    # The annotations are stored as a .json file.
    with open(project_root + annotations_path[dataset], 'r') as annotation_file:
        annotations = json.load(annotation_file)

    # One row per caption. The original keys are not consecutive (images and
    # captions were removed during cleaning), so we replace them with 0..n-1
    # and discard the column that reset_index creates for the old keys.
    frame = pd.DataFrame.from_dict(annotations, orient='index')
    frame = frame.reset_index().drop(['index'], axis = 1)

    # Group the captions by image path.
    grouped = collections.defaultdict(list)
    for row in range(frame.shape[0]):
        path = project_root + images_path[dataset] + frame.file_name.iloc[row]
        grouped[path].append(f'<start> {frame.caption.iloc[row]} <end>')

    # Flatten the grouping into two parallel lists.
    captions = []
    img_name_vector = []
    for path, caption_list in grouped.items():
        captions += caption_list
        img_name_vector += [path] * len(caption_list)
    return captions, img_name_vector

In [None]:
# Before going further we bucket the captions by their length in characters,
# because unusually long captions might point to something fishy.
# sizes_to_indices maps a length to the positions of all captions of that length.
captions_and_images_explore = captions_and_images_testing('val')
sizes_to_indices = collections.defaultdict(list)
for index, caption in enumerate(captions_and_images_explore[0]):
    sizes_to_indices[len(caption)].append(index)

In [None]:
# Sanity check: summing the bucket sizes must give back the 32,299 captions of the validation set.
total = sum(len(indices) for indices in sizes_to_indices.values())
print(total==32299)


In [None]:
# Caption lengths from longest to shortest; the cell displays the ten largest.
descending_sizes = sorted(sizes_to_indices, reverse=True)
descending_sizes[:10]

In [None]:
# We now explore what a caption of length 874 looks like.
# sizes_to_indices[874] lists the positions of all captions with that length; we display the first one.
captions_and_images_explore[0][sizes_to_indices[874][0]]

In [None]:
# For comparison, we print the very first caption together with its length in characters.
zeroth_caption = captions_and_images_explore[0][0]
print(f'This is caption of size {len(zeroth_caption)}: {zeroth_caption}')

In [None]:
# Observe that the caption of length 874 is incredibly long. Meanwhile, a caption of length 68 contains 10 words
# and the start and end token. Let's see what a caption of length 258 looks like.
captions_and_images_explore[0][sizes_to_indices[258][0]]

In [None]:
# The above caption seems reasonable and relevant, so we shall impose a cutoff of length 300.
# Imposing the cutoff should generate a collection of captions, where the maximum possible length of a caption is 294.
# Here, one could impose various cutoffs and study how our model performs. We do not pursue this avenue.

def captions_and_images (dataset, max_caption_length=300, min_captions=3):
    """Collect the usable captions of ``dataset`` together with their image paths.

    Works like ``captions_and_images_testing`` but filters the data twice:
    overly long captions are dropped first, and afterwards every image that is
    left with too few captions is dropped entirely.

    Args:
        dataset: key into ``annotations_path`` / ``images_path`` (e.g. 'val').
        max_caption_length: largest allowed caption length in characters,
            measured on the caption *including* the ``<start>`` / ``<end>``
            markers. Longer captions are ignored.
        min_captions: an image is kept only if at least this many of its
            captions survive the length filter.

    Returns:
        (captions, img_name_vector): two parallel lists. Captions of the same
        image are adjacent, and the image path is repeated once per caption.
    """
    project_root = os.path.abspath('.')

    with open(project_root + annotations_path[dataset], 'r') as f:
        data = json.load(f)

    df = pd.DataFrame.from_dict(data, orient='index')

    image_path_to_caption = collections.defaultdict(list)
    # itertuples walks the rows positionally in a single pass, so the
    # non-consecutive index of the data frame does not need to be reset.
    for row in df.itertuples(index=False):
        image_path = project_root + images_path[dataset] + row.file_name
        caption = f'<start> {row.caption} <end>'
        # Ignore captions that exceed the length cutoff.
        if len(caption) > max_caption_length:
            continue
        image_path_to_caption[image_path].append(caption)

    captions = []
    img_name_vector = []

    for image_path, caption_list in image_path_to_caption.items():
        # Removing long captions may leave an image with too few captions;
        # such images are discarded.
        if len(caption_list) < min_captions:
            continue
        captions.extend(caption_list)
        img_name_vector.extend([image_path]*len(caption_list))
    return captions, img_name_vector

In [None]:
# Rebuild the length -> caption-position lookup on the filtered captions and
# display the ten largest caption lengths that remain after the cutoff.
captions_and_images_validation = captions_and_images('val')
sizes_to_indices_again = collections.defaultdict(list)
for index, caption in enumerate(captions_and_images_validation[0]):
    sizes_to_indices_again[len(caption)].append(index)
sorted(sizes_to_indices_again, reverse=True)[:10]

In [None]:
# Count the captions that survived the filtering by summing the bucket sizes.
total_again = sum(len(indices) for indices in sizes_to_indices_again.values())
print(total_again)

In [None]:
# The function below will output a preprocessed image ready for ResNet-50 and its path.
def load_image (image_path):
    """Read one JPEG file and prepare it for ResNet-50.

    Returns:
        (image, image_path): the preprocessed image tensor and the path that
        was passed in, so the two stay paired when this is mapped over a dataset.
    """
    raw_bytes = tf.io.read_file(image_path)                   # file contents as a string tensor
    decoded = tf.image.decode_jpeg(raw_bytes, channels = 3)   # uint8 tensor with 3 colour channels
    resized = tf.image.resize(decoded, (224,224))             # input size used for ResNet-50
    # Remaining ResNet-50 specific preprocessing.
    return tf.keras.applications.resnet50.preprocess_input(resized), image_path

In [None]:
# The output of load_image is:
# tf.Tensor of shape == (224, 224, 3), whose entries are float32 and the image path.
# The first index accesses the img_name_vector entry,
# the second index accesses the 0th file path, and
# the third index accesses the processed image as a tensor or its image path.
# load_image(captions_and_images_validation[1][0])[0]
# load_image(captions_and_images_validation[1][0])[1]

In [None]:
# # We comment out this cell because we have already cached the features that was generated by ResNet-50.
# # We now create a model that is ResNet-50 minus the last layer.
# # After the images have run its course through the model, we store the output as a vector, which contains all the extracted features.
# # We save these extracted features to disk.

# def cache_extracted_features (dataset):

# # We instantiate the ResNet-50 model without the top layer and load the ImageNet weights.
# # Setting include_top to False results in dropping these 2 layers: avg_pool (GlobalAveragePooling2) and predictions (Dense).
#     image_model = tf.keras.applications.resnet50.ResNet50(include_top=False, weights='imagenet')

#     new_input = image_model.input # We grab the input of the model.
#     hidden_layer = image_model.layers[-1].output # We grab everything but the last layers.

# # We build a model using the functional API, where input and output are defined above.
#     image_features_extract_model = tf.keras.Model(new_input, hidden_layer)

# # We extract only the unique paths and then sort them.
#     encode_data = sorted(set(img_name_vector))

# # We use the data API to generate a sequence of data items.
# # Note that as of now the image_dataset is a sequence of tensors that contain strings reflecting our file paths.
#     image_dataset = tf.data.Dataset.from_tensor_slices(encode_data)

# # We can take a peek at the first element of this dataset by running the following code:
# # for item in image_dataset.take(1):
# #     print(item)
# # The output for the training data set is:
# # tf.Tensor(b'/Users/calvin/projects/mlp_1/data/images/train/VizWiz_train_00000000.jpg', shape=(), dtype=string)

# # We use the map method to apply the load_image function to our dataset.

# # tf.data.AUTOTUNE ensures that the number of batches that are computed in parallel is based on available sources.
# # The batch method sets the number of images that are dealt with in a single batch.
#     image_dataset = image_dataset.map(load_image, num_parallel_calls=tf.data.AUTOTUNE).batch(16)

# # After applying the load_image function to our dataset, we obtain images that have been processed,
# # i.e., they have been read, decoded, resized, and preprocessed according to the requirements of ResNet-50.

# # As before, we can see the output of this newly transformed image_dataset with the following code:
# # for item in image_dataset.take(1):
# #    print(item)
# # The output is a tf.Tensor of shape == (batch_size, 224, 224, 3) with (batch_size) image paths.

# # Now that we have a sequence of data items, specifically, a preprocessed image and its associated file path we perform
# # the following for loop.

# # First, we extract the features of our image. Then, we resize the extracted features.
#     for img, path in tqdm(image_dataset):
#         batch_features = image_features_extract_model(img) 
#         batch_features = tf.reshape(batch_features, (batch_features.shape[0], -1, batch_features.shape[3])) 

# # We perform another for loop, but this time it is not over the 2-tuple, (image, path) it is over the 2-tuple,
# # (extracted features, paths). Lastly, we save the path to a binary file using the file format, (...).npy.
#         for bf, p in zip(batch_features, path):
#             path_of_feature = p.numpy().decode("utf-8")
#             np.save(path_of_feature, bf.numpy())

In [None]:
# We use list comprehension to generate the lengths of all the elements of the input, which we call tensor.
# Then, we return the element with the longest length.
# Note that writing max([...]) is valid too, but one can get away with leaving out the square brackets.
def calc_max_length(tensor):
    """Return the length of the longest element of ``tensor``.

    Raises ValueError when ``tensor`` is empty, because ``max`` has nothing to compare.
    """
    return max(map(len, tensor))

In [None]:
# We tokenize and pad our captions.
def tokenize_pad (captions, top_k=5000):
    """Tokenize ``captions`` and pad the resulting integer sequences to a common length.

    Args:
        captions: list of caption strings; the tokenizer vocabulary is fitted on them.
        top_k: limit on the vocabulary size, passed to the tokenizer as
            ``num_words`` (only the ``num_words - 1`` most common words are kept).

    Returns:
        (cap_vector, tokenizer, seqs):
        cap_vector - the captions as integer sequences, padded with 0 at the end;
        tokenizer  - the fitted tokenizer, with index 0 registered as '<pad>';
        seqs       - the unpadded integer sequences.
    """
    # oov_token: words outside the vocabulary are mapped to the special token <unk>.
    # filters: characters stripped from the text. '<' and '>' are absent from the
    # list, so the <start> / <end> markers survive as tokens.
    # The filter is a raw string so that the backslash is taken literally instead
    # of forming the invalid escape sequence '\]'.
    # Lowercasing is not done here; it is the tokenizer's default behaviour.
    tokenizer = tf.keras.preprocessing.text.Tokenizer(
        num_words=top_k,
        oov_token="<unk>",
        filters=r'!"#$%&()*+.,-/:;=?@[\]^_`{|}~')

    # We feed a list of texts, hence fit_on_texts rather than fit_on_sequences.
    tokenizer.fit_on_texts(captions)

    # Reserve index 0 for the padding token in both lookup directions.
    tokenizer.word_index['<pad>'] = 0
    tokenizer.index_word[0] = '<pad>'

    # Turn every caption into a sequence of word indices.
    seqs = tokenizer.texts_to_sequences(captions)

    # Sequences differ in length; shorter ones are filled with 0's at the end
    # up to the length of the longest sequence.
    cap_vector = tf.keras.preprocessing.sequence.pad_sequences(seqs, padding='post')

    return cap_vector, tokenizer, seqs

In [None]:
# We make sure that the above generates a functioning tokenizer.
# Note that we will be using the tokenizer during the training phase, which is why we altered the output.
# We also return seqs in order to appreciate some of the methods and attributes of our tokenizer.
# Despite its name, tokenizer_val is the whole 3-tuple (cap_vector, tokenizer, seqs);
# the tokenizer itself is tokenizer_val[1].
tokenizer_val= tokenize_pad(captions_and_images_validation[0])
# We print the first 8 indices and their associated words.
for n in range(8):
  print(n, tokenizer_val[1].index_word[n])

In [None]:
# zeroth_caption is rebound here: it now comes from the filtered captions rather than the exploratory ones.
zeroth_caption = captions_and_images_validation[0][0] # grabbing the 0th caption
print(zeroth_caption) # generating the 0th caption
# We can recover the indices needed to generate the words from the tokenizer by accessing the 0th entry of seqs.
print([tokenizer_val[1].index_word[n] for n in tokenizer_val[2][0]]) # generating the 0th caption for seqs (tokenized)
# The same lookup on the 0th row of cap_vector also shows the '<pad>' entries appended at the end.
tokenized_padded_test = [tokenizer_val[1].index_word[n] for n in tokenizer_val[0][0]]
print(tokenized_padded_test) # generating the 0th caption from cap_vector (tokenized and padded)

In [None]:
# We now check the maximum length of our tokenized/padded captions.
# tokenizer_val[0] is cap_vector; its rows were padded to a common length, so inspecting the 0th row suffices.
print(len(tokenizer_val[0][0]))

In [None]:
# We list the parameters, which we will be using later on.

# BATCH_SIZE = 64
# BUFFER_SIZE = 1000 or buffer_size=tf.data.AUTOTUNE

# embedding_dim = 256 (used in the encoder and decoder models)
# units = 512 (used in the decoder and attention models)
# vocab_size = top_k + 1 (used in the decoder model)

# num_steps equals the number of captions in our dataset divided (integer or floor) by the batch size.
# The number of captions is 98,015 and 32,299 for the training and validation dataset, respectively.
# These values can be found in the notebook, 3_preprocessing_train_val.ipynb.
# These numbers only hold true if we do not remove very long captions like we did above.

# num_steps = len(pro_img_and_tok_cap(dataset)[0]) // BATCH_SIZE

# Shape of the vector extracted from InceptionV3 is (64, 2048).
# Shape of the vector extracted from ResNet-50 is (49, 2048).

# features_shape = 2048
# attention_features_shape = 49

# How can we recover these numbers?
# We generate a summary of the ResNet-50 model using the .summary() method and 
# see the output shape of the layer named conv5_block3_out. It is (None, 7, 7, 2048).
# Repeating this for InceptionV3, we recover a shape of (None, 8, 8, 2048).

# tf.keras.layers.Layer(tf.keras.applications.resnet50.ResNet50(include_top=True, weights='imagenet')

In [None]:
# We now generate a 2-tuple, where
# the first entry houses the image paths to the preprocessed images and
# the second entry houses the associated tokenized and padded captions.

def pro_img_and_tok_cap (dataset):
    """Pair image paths with tokenized/padded captions and split them 80/20.

    The split is made per image, so all captions of one image land on the same
    side. The image order is shuffled with a fixed seed (1729) so the split is
    reproducible; note that this reseeds Python's global ``random`` generator.

    Returns:
        (img_name_train, cap_train, img_name_val, cap_val): for each side, a
        list of image paths (repeated once per caption) and the parallel list
        of tokenized/padded captions.
    """
    # captions_and_images returns (captions, img_name_vector);
    # tokenize_pad returns (cap_vector, tokenizer, seqs).
    raw = captions_and_images(dataset)
    paths_per_caption = raw[1]
    padded_captions = tokenize_pad(raw[0])[0]

    # image path -> all tokenized/padded captions of that image
    grouped = collections.defaultdict(list)
    for path, padded in zip(paths_per_caption, padded_captions):
        grouped[path].append(padded)

    # Shuffle the unique image paths reproducibly, then cut after the first 80 percent.
    shuffled_paths = list(grouped)
    random.seed(1729)
    random.shuffle(shuffled_paths)
    cut = int(len(shuffled_paths)*0.8)

    def _flatten(paths):
        # Expand the given image paths back into two parallel per-caption lists.
        names, caps = [], []
        for path in paths:
            caps_for_path = grouped[path]
            names.extend([path] * len(caps_for_path))
            caps.extend(caps_for_path)
        return names, caps

    img_name_train, cap_train = _flatten(shuffled_paths[:cut])
    img_name_val, cap_val = _flatten(shuffled_paths[cut:])

    return img_name_train, cap_train, img_name_val, cap_val

In [None]:
# The function below is used to load the numpy files, which house the extracted features generated by ResNet-50.
# Note that the output is the extracted features of the processed image and its associated tokenized/padded caption.
def map_func(img_name, cap):
    """Load the cached feature array stored next to an image.

    ``img_name`` arrives as UTF-8 encoded bytes; the features are read from the
    file named ``<img_name>.npy``. The caption is passed through unchanged.
    """
    feature_file = img_name.decode('utf-8') + '.npy'
    return np.load(feature_file), cap

In [None]:
# We now create a tf.data.Dataset dataset which will be fed into our models.

# We added an option to control the batch size in order to easily generate the dimensionality of the variables in question.
# We set batch_size to a default value of 64.

# Instead of generating solely a dataset, we generate a 2-tuple where the 0th entry is the dataset, but
# the 1st entry is the processed images along with their associated tokenized/padded captions.

# We emphasize that these extra inputs and outputs are to explore the dimensionality of what we are creating.

# Furthermore, since we have performed a training-validation split earlier in the function, pro_img_and_tok_cap, we wish 
# to have the control to work with the training images/captions or the validation images/captions later on.

def create_dataset (data, batch_size=64):
    """Build the batched tf.data pipeline that feeds the models.

    Args:
        data: dataset key forwarded to ``pro_img_and_tok_cap`` (e.g. 'val').
        batch_size: number of (features, caption) pairs per batch.

    Returns:
        (dataset, splits): the tf.data.Dataset built from the *training* side
        of the split (entries 0 and 1 of ``splits``), and the full 4-tuple
        returned by ``pro_img_and_tok_cap`` so callers can also reach the
        validation side.
    """
    splits = pro_img_and_tok_cap(data)
    train_paths, train_caps = splits[0], splits[1]

    def _load_features(path, caption):
        # tf.numpy_function promotes the Python function map_func to a TensorFlow operation.
        return tf.numpy_function(map_func, [path, caption], [tf.float32, tf.int32])

    # from_tensor_slices takes a single positional argument, hence the tuple.
    dataset = tf.data.Dataset.from_tensor_slices((train_paths, train_caps))

    # Load the cached .npy features in parallel.
    dataset = dataset.map(_load_features, num_parallel_calls=tf.data.AUTOTUNE)

    # Shuffle, batch, and prefetch.
    dataset = dataset.shuffle(1000)
    dataset = dataset.batch(batch_size)
    dataset = dataset.prefetch(buffer_size=tf.data.AUTOTUNE)

    return dataset, splits

In [None]:
# We now investigate the shape of the outputs of the function, create_dataset.
# We do this on the 'val' data with a batch size equal to 1 (the second argument) in order to shorten the run time.
# The trailing [0] keeps only the tf.data.Dataset and drops the accompanying split tuple.
dataset_testing = create_dataset('val', 1)[0]
# for item in dataset_testing.take(1):
#      print(item)
# The outputs are as follows:
# NOTE(review): the shapes quoted below say batch_size = 2, whereas the call above passes 1 - confirm which was intended.
# The first entry is the processed image: tf.Tensor of shape: (batch_size = 2, 49, 2048), dtype = float32
# The second entry is the tokenized/padded caption: tf.Tensor of shape: (batch_size = 2 , 61), dtype = int32
# Note that the 61 is the longest tokenized/padded caption in our dataset.

In [None]:
# We also investigate the shape of several variables that will play a crucial role in what follows.
# Only the first batch is printed: count becomes 1 at the end of the first pass,
# so the second pass hits the break immediately.
count = 0
for (batch, (img_tensor, target)) in enumerate(dataset_testing):
  if count == 1:
    break
  print(f'This is the nth batch with n equal to {batch}.')
  print(f'The shape of img_tensor is {img_tensor.shape}.')
  print(f'The shape of target is {target.shape}.')
  print(' ')
  print('Below is the sequence representation of target.')
  print(target.numpy())
  print(' ')
  print('Below is sequence representation of target as a tokenized/padded caption.')
  # tokenizer_val[1] is the fitted tokenizer; index_word maps each index back to its word.
  print([tokenizer_val[1].index_word[n] for n in target[0].numpy()])
  # For the evaluation phase, the target.shape[0]=1 because there is only a single image we must generate a caption for.
  # For the training phase, the target.shape[0]=batch_size because we are training multiple examples simultaneously.
  print(' ')
  # One <start> index per example in the batch, reshaped into a column of shape (target.shape[0], 1).
  initial_decoder_input = tf.expand_dims([tokenizer_val[1].word_index['<start>']]* target.shape[0], 1)
  print(f'This is the initial decoder input: {initial_decoder_input}. The shape of the initial decoder input is {initial_decoder_input.shape}.')
  print(' ')
  empty_list = []
  print('Below we generate the tokenized/padded form of the contents of target.')
  # We look at the first 11 positions of the caption; the trailing [0] picks the 0th example of the batch.
  for i in range(11):
    empty_list.append([tokenizer_val[1].index_word[n] for n in target[:,i].numpy()][0])
  print(empty_list)
  print(' ')
  print(f'The shape of the 7th decoder input is {tf.expand_dims(target[:,7],1).shape}.')
  # target[:, i] is the i-th word of every caption in the batch; expand_dims turns it into a column.
  for i in range(11):
    ith_decoder_input = tf.expand_dims(target[:, i], 1)
    print(ith_decoder_input, [tokenizer_val[1].index_word[n] for n in ith_decoder_input[0].numpy()])
  count += 1
  print(' ')

In [None]:
# We are now ready to tackle our encoder-decoder model.
# Constructing the encoder model.

class CNN_Encoder(tf.keras.Model):
    """Encoder: a single dense layer followed by ReLU, applied to the cached image features.

    We subclass tf.keras.Model, so the layers are created in __init__ and the
    computation lives in call. It can be run on an input x with
    CNN_Encoder(embedding_dim)(x, print_stuff).
    """

    def __init__(self, embedding_dim):
        # Python 3 lets us write super().__init__() instead of super(CNN_Encoder, self).__init__().
        super().__init__()
        self.fc = tf.keras.layers.Dense(embedding_dim)

    def call(self, x, print_stuff):
        """Project the features into the embedding space; print shapes when print_stuff is truthy."""
        if print_stuff:
            print(f'Shape of img_tensor before dense layer and ReLU (CNN): {x.shape}.')

        # A dense layer with (units) neurons only changes the last axis:
        # (..., n) -> (..., units). Here:
        # (batch_size, 49, 2048) -> (batch_size, 49, embedding_dim); ReLU keeps the shape.
        activated = tf.nn.relu(self.fc(x))

        if print_stuff:
            print(f'Shape of img_tensor after dense layer and ReLU (CNN): {activated.shape}.')

        return activated

In [None]:
# Investigating the shape of the img_tensor before and after it passes through the encoder.
# A fresh CNN_Encoder with embedding_dim = 256 is built purely for this shape check;
# passing True as print_stuff turns on its shape printouts, and we stop after the first batch.
for (batch, (img_tensor, target)) in enumerate(dataset_testing):
     x = CNN_Encoder(256)(img_tensor, True)
     break

In [None]:
# Constructing the attention model.

class BahdanauAttention(tf.keras.Model):
  """Attention model: lets the decoder focus on parts of the encoder output at each time step.

  This is a blend of Bahdanau and Luong attention rather than pure Bahdanau:
  - Bahdanau: the scores combine the decoder hidden state with the encoder output,
    and the *previous* hidden state is used to produce them;
  - Luong: the output of the attention model is fed to the decoder as an input.
  """

  def __init__(self, units):
    super().__init__()
    self.W1 = tf.keras.layers.Dense(units)  # applied to the features coming from the CNN encoder
    self.W2 = tf.keras.layers.Dense(units)  # applied to the hidden state of the RNN decoder
    self.V = tf.keras.layers.Dense(1)       # single neuron: one score per image location

  def call(self, features, hidden, print_stuff):
    """Return (context_vector, attention_weights); print every intermediate shape if print_stuff is truthy.

    Shapes, as annotated throughout:
      features: (batch_size, 49, embedding_dim), hidden: (batch_size, units)
      context_vector: (batch_size, embedding_dim), attention_weights: (batch_size, 49, 1)
    """
    if print_stuff:
      print(f'Shape of hidden before tf.expand_dims (Bahdanau): {hidden.shape}.')

    # (batch_size, units) -> (batch_size, 1, units)
    hidden_with_time_axis = tf.expand_dims(hidden, 1)

    if print_stuff:
      print(f'Shape of hidden after tf.expand_dims (Bahdanau): {hidden_with_time_axis.shape}.')
      print(f'Shape of features before W1 (Bahdanau): {features.shape}.')

    # (batch_size, 49, embedding_dim) -> (batch_size, 49, units)
    projected_features = self.W1(features)

    if print_stuff:
      print(f'Shape of features after W1 (Bahdanau): {projected_features.shape}.')
      print(f'Shape of hidden_with_time_axis before W2 (Bahdanau): {hidden_with_time_axis.shape}.')

    # (batch_size, 1, units) -> (batch_size, 1, units)
    projected_hidden = self.W2(hidden_with_time_axis)

    if print_stuff:
      print(f'Shape of hidden_with_time_axis after W2 (Bahdanau): {projected_hidden.shape}.')

    # The sum below adds (batch_size, 49, units) and (batch_size, 1, units).
    # This works through broadcasting: compare the shapes axis by axis starting
    # from the right; two sizes are compatible when they are equal or one of
    # them is 1, and the result takes the larger size. Scalar multiplication of
    # an array is the familiar special case.
    attention_hidden_layer = tf.math.tanh(projected_features + projected_hidden)

    if print_stuff:
      print(f'To obtain the shape of the attention_hidden_layer, we add 2 tensors (Bahdanau).')
      print(f'Their shapes are (Bahdanau): {projected_features.shape} and {projected_hidden.shape}.')
      print(f'Shape of attention_hidden_layer (Bahdanau): {attention_hidden_layer.shape}.')

    # (batch_size, 49, units) -> (batch_size, 49, 1)
    score = self.V(attention_hidden_layer)

    if print_stuff:
      print(f'Shape of score (Bahdanau): {score.shape}.')

    # Softmax over axis 1: axis 0 is the batch, which must stay untouched, and
    # axis 2 has size 1, so axis 1 (the 49 image locations) is the only choice.
    attention_weights = tf.nn.softmax(score, axis=1)

    if print_stuff:
      print(f'The attention weights are obtained from the score tensor (Bahdanau).')
      print(f'Shape of attention_weights (Bahdanau): {attention_weights.shape}.')

    # (batch_size, 49, 1) * (batch_size, 49, embedding_dim), again via broadcasting.
    weighted_features = attention_weights * features

    if print_stuff:
      print(f'To obtain the shape of the context_vector, we multiply 2 tensors (Bahdanau).')
      print(f'Their shapes are (Bahdanau): {attention_weights.shape} and {features.shape}.')
      print(f'Shape of context vector before sum (Bahdanau): {weighted_features.shape}.')

    # Sum over axis 1 as well: axis 0 is the batch and axis 2 is the embedding
    # dimension, which must be kept because the decoder concatenates the context
    # vector with the embedded decoder input. The context vector is a weighted
    # sum of the features, so its last axis equals embedding_dim.
    context_vector = tf.reduce_sum(weighted_features, axis=1)

    if print_stuff:
      print(f'Shape of context vector after sum (Bahdanau): {context_vector.shape}.')
      print(' ')

    # At this point the context vector has no time axis: (batch_size, embedding_dim).
    # The decoder's call function adds one with tf.expand_dims.
    return context_vector, attention_weights

In [None]:
# Constructing the decoder model.
class RNN_Decoder(tf.keras.Model):
  """Attention-based GRU decoder that predicts one caption word per call.

  Each call consumes a single time step: the previous word, the encoded image
  features and the previous hidden state. It returns the scores over the
  vocabulary for the next word, the new hidden state and the attention weights.

  Args:
    embedding_dim: size of the word embedding space.
    units: size of the GRU hidden state (also used by fc1 and the attention model).
    vocab_size: number of entries in the vocabulary, i.e. the size of the output layer.
  """

  def __init__(self, embedding_dim, units, vocab_size):

    super().__init__()

    self.units = units

    # Maps word indices from vocab space to the embedding space.
    self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)

    # return_sequences/return_state give us both the per-step output and the final state.
    self.gru = tf.keras.layers.GRU(self.units,
                                   return_sequences=True,
                                   return_state=True,
                                   recurrent_initializer='glorot_uniform')

    self.fc1 = tf.keras.layers.Dense(self.units)

    # The last layer recovers the dimensionality of the vocab space.
    self.fc2 = tf.keras.layers.Dense(vocab_size)

    self.attention = BahdanauAttention(self.units)

  def call(self, x, features, hidden, print_stuff=False):
    """Runs one decoding step.

    Args:
      x: the current target word(s) as indices, shape (batch_size, 1).
      features: the encoded image features that the attention model attends over.
      hidden: the previous hidden state of the GRU, shape (batch_size, units).
      print_stuff: when True, prints the shape of every intermediate tensor.

    Returns:
      A 3-tuple (predictions, state, attention_weights):
      predictions has shape (batch_size, vocab_size) and holds unnormalized scores,
      state has shape (batch_size, units),
      attention_weights has shape (batch_size, 49, 1).
    """

    # features and hidden are passed through the attention model, which generates
    # a context vector that forms part of the GRU input.
    # The context vector has no time axis yet; we add one below with tf.expand_dims.
    context_vector, attention_weights = self.attention(features, hidden, print_stuff)

    if print_stuff:
      print(f'Shape of context_vector (RNN): {context_vector.shape}.')
      print(f'Shape of attention_weights (RNN): {attention_weights.shape}.')
    # context_vector shape: (batch_size, embedding_dim)
    # attention_weights shape: (batch_size, 49, 1)

    # An embedding layer appends one axis to its input:
    # input shape (..., n) -> output shape (..., n, embedding_dim).

    if print_stuff:
      print(f'Shape of decoder input before embedding layer (RNN): {x.shape}.')
    # decoder input before embedding: (batch_size, 1)

    x = self.embedding(x)

    if print_stuff:
      print(f'Shape of decoder input after embedding layer (RNN): {x.shape}.')
    # decoder input after embedding: (batch_size, 1, embedding_dim)

    # Concatenation along some axis is only possible if the dimensions of all the other axes
    # are equal. The resulting concatenated dimension is the sum of the dimensions involved
    # along that particular axis.
    expanded_context = tf.expand_dims(context_vector, 1)

    if print_stuff:
      print(f'Shape of expanded context vector (RNN): {expanded_context.shape}')
    # Shape of context vector with time: (batch_size, 1, embedding_dim)

    x = tf.concat([expanded_context, x], axis=-1)

    if print_stuff:
      print(f'Shape of concatenation of expanded context vector with time and embedded decoder input (RNN): {x.shape}.')
      print(f'Shape of input to GRU (RNN): {x.shape}.')
    # concatenation shape: (batch_size, 1, embedding_dim) + (batch_size, 1, embedding_dim) = (batch_size, 1, 2*embedding_dim)

    # We seed the GRU with the hidden state handed to us by the caller.
    # Every call feeds exactly one time step, so without initial_state the GRU would
    # restart from zeros for every word and its memory would never carry across the caption.
    output, state = self.gru(x, initial_state=hidden)

    if print_stuff:
      print(f'Shape of output of GRU (RNN): {output.shape}.')
      print(f'Shape of state of GRU (RNN): {state.shape}.')
    # output shape: (batch_size, 1, units)
    # state shape: (batch_size, units)

    # Before the output can form part of a caption it must pass through 2 dense layers.
    x = self.fc1(output)

    if print_stuff:
      print(f'Shape of output after fc1 (RNN): {x.shape}.')
    # output shape after fc1: (batch_size, 1, units)

    # Setting one entry of the target shape to -1 lets TensorFlow infer it from the total
    # number of elements: (batch_size * 1 * units) = (n * units), so n = batch_size.
    x = tf.reshape(x, (-1, x.shape[2]))

    if print_stuff:
      print(f'Shape of output after fc1, reshaping (RNN): {x.shape}.')
    # output shape after fc1, reshaping: (batch_size, units)

    x = self.fc2(x)

    if print_stuff:
      print(f'Shape of output after fc1, reshaping, fc2 (RNN): {x.shape}.')
    # output shape after fc1, reshaping, fc2: (batch_size, vocab_size)

    if print_stuff:
      print(f'Shape of prediction (RNN): {x.shape}.')
      print(f'Shape of hidden state (RNN): {state.shape}.')
      print(f'Shape of attention weights (RNN): {attention_weights.shape}.')
    # prediction shape: (batch_size, vocab_size)
    # hidden state shape: (batch_size, units)
    # attention weights shape: (batch_size, 49, 1)
    if print_stuff:
      print(' ')

    return x, state, attention_weights

  def reset_state(self, batch_size):
    """Returns an all-zero initial hidden state of shape (batch_size, units)."""
    return tf.zeros((batch_size, self.units))

In [None]:
# Calling this function with True prints the shape of every moving part of our model
# for the first batch of dataset_testing. Calling it with False does nothing.

def probing_our_model (print_stuff):
  """Walks one batch through a throwaway encoder/decoder pair and narrates the shapes.

  The models built here are small, untrained and local to this function; they exist
  only so that the print statements inside the encoder, attention and decoder fire.
  """
  if not print_stuff:
    return

  # Deliberately small sizes: we only care about shapes here, not about quality.
  embedding_dim = 250
  units = 100
  top_k = 5000
  vocab_size = top_k + 1

  encoder = CNN_Encoder(embedding_dim)
  decoder = RNN_Decoder(embedding_dim, units, vocab_size)

  for batch, (img_tensor, target) in enumerate(dataset_testing):

    print(f'This is the nth batch, where n is {batch}.')
    print('Note that we have set the batch size to 1 to improve readability.')

    batch_size = target.shape[0]
    hidden = decoder.reset_state(batch_size=batch_size)

    print(f'Shape of the initial hidden state is {hidden.shape}.')
    print(f'The initial hidden state is initialized to zero. It is a row vector of {units} zeroes.')
    print('Below is the numerical representation of hidden.')
    print(hidden.numpy())
    print(' ')

    # One <start> token per example, with an extra axis that plays the role of time.
    tokenizer = tokenizer_val[1]
    dec_input = tf.expand_dims([tokenizer.word_index['<start>']] * batch_size, 1)
    start_words = [tokenizer.index_word[n] for n in dec_input[0].numpy()]

    print(f'Shape of the initial decoder input, which consists of start tokens is {dec_input.shape}.')
    print(f'This is the numerical representation of the initial decoder input: {dec_input}.')
    print(f'This is the lexical representation of the initial decoder input: {start_words}')
    print(' ')

    features = encoder(img_tensor, True)

    print(' ')

    # We only take 2 decoding steps to keep the output readable.
    # During training the upper bound is target.shape[1], so that every target word is visited.
    for i in range(1, 3):
      print(f'This is the nth batch, where n is {batch}')
      print(f'Engaging in a for loop. This is the ith step, where i is {i}.')
      print(' ')

      predictions, hidden, _ = decoder(dec_input, features, hidden, True)

      # The first 11 words of the first caption in the batch.
      caption_preview = [
          [tokenizer_val[1].index_word[n] for n in target[:, j].numpy()][0]
          for j in range(11)
      ]
      print('This is part of the caption we are feeding bit by bit to our decoder is right below:')
      print(caption_preview)

      # Teacher forcing: the next decoder input is the ith target word.
      dec_input = tf.expand_dims(target[:, i], 1)
      current_words = [tokenizer_val[1].index_word[n] for n in dec_input[0].numpy()]

      print(f'We are on the ith step, where i is {i}.')
      print(f'Shape of the decoder input is {dec_input.shape}.')
      print(f'This is the current word we are on: {current_words}')
      print(f'This is the nth batch, where n is {batch}. We are at the end of the ith step, where i is {i}.')
      print(' ')

    # A single batch is enough for the walkthrough.
    break

In [None]:
# Run the shape walkthrough once. Passing False instead turns this call into a no-op.
probing_our_model(True)

In [None]:
# Creating an instance of everything we will use for our models and for the training/evaluation phases.

# Below are the parameters of our models.

embedding_dim = 256 # used by encoder, decoder
units = 512 # used by decoder, attention
top_k = 5000 # used by tokenizer, decoder
vocab_size = top_k + 1 # used by decoder
BATCH_SIZE = 64 # only used below to compute num_steps; it is not passed to create_dataset

# Below are the encoder and decoder models.

encoder = CNN_Encoder(embedding_dim)
decoder = RNN_Decoder(embedding_dim, units, vocab_size)

# Below we have the altered ResNet-50 model we used to generate our cached features.
# include_top=False drops the classification head, and we take the output of the last remaining layer.

image_model = tf.keras.applications.resnet50.ResNet50(include_top=False, weights='imagenet')
new_input = image_model.input # the input of the pre-trained model is reused as-is
hidden_layer = image_model.layers[-1].output
image_features_extract_model = tf.keras.Model(new_input, hidden_layer)

# As we mentioned at the very beginning, we will not be using our training dataset
# because it is too large, so we shall instead use our validation dataset.
# We will also need the associated tokenizer.
# We EMPHASIZE that the variables below do not originate from the training dataset even
# though we refer to them as (something)_train. They originate from the validation dataset!

captions_and_images_train = captions_and_images('val')
tokenizer_creation = tokenize_pad(captions_and_images_train[0])
tokenizer_train = tokenizer_creation[1]
sequences_train = tokenizer_creation[2]
# NOTE(review): num_steps below assumes the batch size used by create_dataset equals BATCH_SIZE - confirm.
dataset_creation = create_dataset('val') # the default batch size for create dataset is 64
dataset_train = dataset_creation[0]
pro_img_and_tok_cap_train = dataset_creation[1]
# The above is a 4-tuple: (img_name_train, cap_train, img_name_val, cap_val).
# The first 2 entries were used to create our dataset.
# The last 2 entries are held out and used to evaluate our model.
val_image_name = pro_img_and_tok_cap_train[2]
val_caption = pro_img_and_tok_cap_train[3]

# Number of full batches per epoch; the training loop divides the summed loss by this value.
num_steps = len(pro_img_and_tok_cap_train[0]) // BATCH_SIZE # we use floor division
max_length = calc_max_length(sequences_train)
# Number of image locations the attention model distributes its weights over.
attention_features_shape = 49

In [None]:
# We choose Adam, an adaptive relative of stochastic gradient descent, as our optimizer.
optimizer = tf.keras.optimizers.Adam() # default hyper-parameters

# Our task is ultimately a classification problem, where the labels are the words of our vocabulary.

# We wish to assign a high probability to the most sensible next word and a low probability to the
# other words, because these words form the caption that describes the input image.

# Note that we have not one-hot encoded our words, so we do not use CategoricalCrossentropy.
# Instead, we use SparseCategoricalCrossentropy because the numerical representation of the
# words that form our captions takes values in the integers.

# There are two options inside SparseCategoricalCrossentropy.
# See https://stackoverflow.com/a/59872518.

# 1. from_logits: We set this equal to True in order to let TensorFlow know that the outputs of our
# decoder are raw, unnormalized scores (logits) rather than probabilities. The last layer of the
# decoder has no softmax activation, so the loss applies the softmax internally.

# 2. reduction: Setting reduction to 'none' returns one cross entropy value per example instead of
# a single averaged number. We need the individual values in order to mask out the padding
# inside loss_function.
# See https://stackoverflow.com/questions/47057361/how-do-i-mask-a-loss-function-in-keras-with-the-tensorflow-backend.

loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction='none')

def loss_function(real, pred):
  """Computes the padding-masked sparse categorical cross entropy for one time step.

  Args:
    real: the ground truth word indices; index 0 is the <pad> token.
    pred: the unnormalized scores produced by the decoder.

  Returns:
    The mean over all positions of the masked loss. Padded positions
    contribute zero to the sum but are still counted in the mean.
  """
  # One cross entropy value per position (loss_object uses reduction='none').
  per_position_loss = loss_object(real, pred)

  # The <pad> token has index 0. It was introduced so that all captions share the same
  # length, and it must not contribute to the loss. The filter below is 1 for real words
  # (including <start> and <end>) and 0 for padding; it is cast to the dtype of the loss
  # so that the two can be multiplied.
  is_padding = tf.math.equal(real, 0)
  keep = tf.cast(tf.math.logical_not(is_padding), dtype=per_position_loss.dtype)

  # We apply the padding filter and average the result.
  return tf.reduce_mean(per_position_loss * keep)

In [None]:
# Checkpoints let us save our progress to disk and restore it later.

# The checkpoints live in the folder checkpoints/training below the current working directory.
checkpoint_path = f"{os.path.abspath('.')}/checkpoints/training"

# The checkpoint tracks everything that changes during training:
# the encoder, the decoder, and the optimizer.
ckpt = tf.train.Checkpoint(
    encoder=encoder,
    decoder=decoder,
    optimizer=optimizer,
)

# The checkpoint manager ties the pieces together:
# checkpoint: our tf.train.Checkpoint from above,
# directory: the directory where our checkpoints are saved,
# max_to_keep: the maximum number of checkpoints kept on disk.
ckpt_manager = tf.train.CheckpointManager(
    checkpoint=ckpt,
    directory=checkpoint_path,
    max_to_keep=20,
)

In [None]:
# We initialize our starting epoch to be 0, i.e., by default we train from scratch.
start_epoch = 0

# However, if there are checkpoints saved to disk, then we can recover them with the function below.

# ckpt_manager.latest_checkpoint returns the path to the latest checkpoint if it exists.
# ckpt_manager.latest_checkpoint returns None if there are no checkpoints.

def restore_checkpoint(answer):
  """Restores the latest checkpoint (if requested and available) and updates start_epoch.

  Args:
    answer: truthy to restore the latest checkpoint, falsy to leave everything untouched.

  Returns:
    The value of the module-level start_epoch after the call.
  """
  # Without this declaration the assignment below would create a local variable and the
  # module-level start_epoch, which the training loop reads, would silently stay at 0.
  global start_epoch

  latest = ckpt_manager.latest_checkpoint if answer else None
  if latest:
    # Checkpoint paths end in '-n', where n is the checkpoint number. Splitting on the
    # hyphen and taking the last piece recovers n even if the folder names contain hyphens.
    # This assumes n encodes the number of completed epochs; verify against how the
    # training loop calls ckpt_manager.save.
    start_epoch = int(latest.split('-')[-1])
    ckpt.restore(latest)

  return start_epoch

In [None]:
# restore_checkpoint is only defined above; it has not been called.
# We avoid calling it because we wish to train our model from scratch and, most importantly,
# because we have made important changes to our data.

# Below we report the epoch we are currently at.
if start_epoch > 0:
  print(f'Path to latest checkpoint: {ckpt_manager.latest_checkpoint}.')
print(f'We are living in the nth epoch with n={start_epoch}.')
print('We have been transported to the past!' if start_epoch > 0
      else 'We must start at the beginning of time!')
print(' ')
print(f'This is where we save our checkpoints: {checkpoint_path}')

In [None]:
# We will generate a plot showcasing our loss as a function of epochs, so
# we collect our losses in the list below. The training loop appends one value per epoch.
# Note that re-running this cell clears the history recorded so far.
loss_plot = []

In [None]:
# The training phase

# @tf.function compiles train_step into a TensorFlow graph, which speeds up repeated calls.
# We do not investigate the other features it unlocks in this notebook.
@tf.function
def train_step(img_tensor, target):
  """Runs one optimization step on a single batch using teacher forcing.

  Args:
    img_tensor: the features extracted from a batch of images by the altered ResNet-50 model.
    target: the tokenized/padded captions of the batch, shape (batch_size, caption_length).

  Returns:
    A 2-tuple (loss, total_loss): loss is the sum of the per-step losses and is the
    quantity we differentiate; total_loss is loss divided by the caption length, i.e.,
    an average loss per time step that is only used for reporting.
  """
  batch_size = target.shape[0]
  caption_length = target.shape[1]

  # The hidden state starts at zero because any 2 image captions are independent of each other.
  hidden = decoder.reset_state(batch_size=batch_size)

  # The first decoder input is one <start> token per example. tf.expand_dims adds the
  # axis that plays the role of time.
  start_id = tokenizer_train.word_index['<start>']
  dec_input = tf.expand_dims([start_id] * batch_size, 1)

  loss = 0

  # The tape records the operations below so that we can differentiate the loss afterwards.
  with tf.GradientTape() as tape:

    features = encoder(img_tensor, False)

    # Position 0 of every caption is <start>, which we already fed, so we begin at 1
    # and walk through the remaining positions of the padded caption.
    for t in range(1, caption_length):

      # The whole batch is processed at once: dec_input holds the current word of
      # every caption, features the encoded images, hidden the decoder's state.
      predictions, hidden, _ = decoder(dec_input, features, hidden, False)

      # Cross entropy between the predictions and the intended word at position t.
      loss += loss_function(target[:, t], predictions)

      # Teacher forcing: the next decoder input is the true word at position t,
      # regardless of what the model predicted.
      dec_input = tf.expand_dims(target[:, t], 1)

  # target.shape[1] is turned into an integer in order to execute the division.
  total_loss = loss / int(caption_length)

  # Both models inherit from tf.keras.Model, which exposes trainable_variables.
  # Concatenating the two lists lets us update the encoder and the decoder together.
  variables = encoder.trainable_variables + decoder.trainable_variables

  # We differentiate the summed loss with respect to every trainable variable
  # and let the optimizer take a step.
  gradients = tape.gradient(loss, variables)
  optimizer.apply_gradients(zip(gradients, variables))

  return loss, total_loss

In [None]:
EPOCHS = 20

# We train our model through the entire dataset until EPOCHS epochs have been completed.
# If a checkpoint was restored, start_epoch is greater than 0 and we pick up from there.
for epoch in range(start_epoch, EPOCHS):

    # We start a timer, so we can share how long it takes for our model to train
    # through the entire dataset, i.e., the duration of an epoch.
    start = time.time()

    # We initiate the total loss to 0.
    total_loss = 0

    # This for loop allows us to iterate through the entire dataset.
    for (batch, (img_tensor, target)) in enumerate(dataset_train):

        # We record both the batch loss and the total loss, so we can share the average batch
        # loss and so we can generate a plot of the loss versus the number of epochs.
        batch_loss, t_loss = train_step(img_tensor, target)
        total_loss += t_loss

        # Every 100 batches we generate the following summary.
        if batch % 100 == 0:
            average_batch_loss = batch_loss.numpy()/int(target.shape[1])
            print(f'Epoch {epoch+1} Batch {batch} Loss {average_batch_loss:.4f}')

    # We store our average loss after training through the entire dataset.
    # As we mentioned before, this is done in order to create a plot of the loss
    # versus the number of epochs.
    loss_plot.append(total_loss / num_steps)

    # We save a checkpoint after every other epoch (epoch index 0, 2, 4, ...).
    # The index of the last epoch, EPOCHS - 1, may be odd, so we also save after the
    # final epoch in order to guarantee that the finished model reaches disk.
    # We number the checkpoint with the count of completed epochs. The default numbering
    # (1, 2, 3, ... per save) would not match the epoch count because we skip epochs,
    # and the number in the checkpoint path is what is read back as the epoch to resume from.
    if epoch % 2 == 0 or epoch == EPOCHS - 1:
        ckpt_manager.save(checkpoint_number=epoch + 1)

    # We finish off this iteration of the for loop by providing the following summary.
    print(f'Epoch {epoch+1} Loss {total_loss/num_steps:.6f}')
    print(f'Time taken for 1 epoch {time.time()-start:.2f} sec\n')

In [None]:
# Plotting the average loss of every epoch with the appropriate labels and title.
plt.plot(loss_plot)
plt.gca().set(xlabel='Epochs', ylabel='Loss', title='Loss Plot')
plt.show()

In [None]:
# The evaluation phase

def evaluate(image):
    """Generates a caption for a single image by sampling one word at a time.

    Args:
        image: the path of the image to caption.

    Returns:
        A 2-tuple (result, attention_plot): result is the list of generated words
        (ending in '<end>' unless max_length was reached first), and attention_plot is
        an array with one row of attention weights per generated word, i.e., of shape
        (len(result), attention_features_shape).
    """
    # Row i of the attention plot stores the attention weights used to generate word i.
    # A caption has at most max_length words and every word distributes its attention
    # over attention_features_shape image locations, hence the dimensions below.
    attention_plot = np.zeros((max_length, attention_features_shape))

    # We are evaluating a single image, so our batch consists of a single image!
    hidden = decoder.reset_state(batch_size=1)

    # load_image(image)[0] is the preprocessed image, presumably of shape (224, 224, 3).
    # tf.expand_dims adds the batch axis in front, e.g. (1, 224, 224, 3).
    temp_input = tf.expand_dims(load_image(image)[0], 0)

    # We defined image_features_extract_model earlier as ResNet-50 without its top layer.
    # We run our image through our altered ResNet-50 model.
    img_tensor_val = image_features_extract_model(temp_input)

    # We flatten the 2 spatial axes into one, which yields a tensor of shape
    # (batch, locations, channels), e.g. (1, 49, 2048).
    img_tensor_val = tf.reshape(img_tensor_val, (img_tensor_val.shape[0],
                                                 -1,
                                                 img_tensor_val.shape[3]))

    # We pass the image tensor above through our encoder.
    features = encoder(img_tensor_val, False)

    # We initialize the decoder input to be the start token.
    dec_input = tf.expand_dims([tokenizer_train.word_index['<start>']], 0)

    # We save the generated caption to a list.
    result = []

    for i in range(max_length):

        # Saving the outputs of our decoder model.
        predictions, hidden, attention_weights = decoder(dec_input, features, hidden, False)

        # We flatten the attention weights of this step and store them as row i.
        attention_plot[i] = tf.reshape(attention_weights, (-1, )).numpy()

        # The decoder outputs unnormalized scores (logits), one per word of our vocabulary.
        # tf.random.categorical interprets them as a categorical distribution and draws
        # a random sample from it, so repeated calls may produce different captions.
        # The .numpy() at the end turns the sample into a plain index that we can
        # feed into our tokenizer.
        predicted_id = tf.random.categorical(predictions, 1)[0][0].numpy()
        predicted_word = tokenizer_train.index_word[predicted_id]
        result.append(predicted_word)

        # The end token finishes the caption.
        if predicted_word == '<end>':
            break

        # We update the decoder input to be the word we just generated.
        dec_input = tf.expand_dims([predicted_id], 0)

    # The caption is usually shorter than max_length, so we drop the unused rows of the
    # attention plot. This applies whether we stopped at the end token or ran out of
    # steps; previously the early exit skipped the trimming.
    # We keep the full size of the attention features shape.
    attention_plot = attention_plot[:len(result), :]
    return result, attention_plot

In [None]:
# For every generated word we draw the image with its attention map laid over it.
def plot_attention(image, result, attention_plot):
    """Shows where the model looked while generating each word of the caption.

    Args:
        image: the path of the image that was captioned.
        result: the list of generated words.
        attention_plot: one row of attention weights per generated word.
    """
    # The context manager closes the image file once its pixels have been copied.
    with Image.open(image) as opened_image:
        temp_image = np.array(opened_image)

    fig = plt.figure(figsize=(10, 10))

    len_result = len(result)

    # add_subplot requires integers, and np.ceil returns a float.
    grid_size = int(max(np.ceil(len_result / 2), 2))

    for i in range(len_result):
        # The attention weights form a square grid over the image. We derive the side
        # of the grid from the number of weights (49 -> 7) instead of hard-coding it;
        # a wrong side would make np.resize repeat or cut weights and scramble the map.
        weights = np.asarray(attention_plot[i])
        side = int(round(np.sqrt(weights.size)))
        temp_att = np.resize(weights, (side, side))

        ax = fig.add_subplot(grid_size, grid_size, i + 1)
        ax.set_title(result[i])
        shown_image = ax.imshow(temp_image)
        # The attention map is stretched to the extent of the image and drawn on top of it.
        ax.imshow(temp_att, cmap='gray', alpha=0.6, extent=shown_image.get_extent())

    plt.tight_layout()
    plt.show()

In [None]:
# We caption one randomly chosen held-out image and compare the result with its real caption.
rid = np.random.randint(0, len(val_image_name))
image = val_image_name[rid]

# Index 0 is the <pad> token, so we leave it out when turning the real caption back into words.
real_caption = ' '.join(
    tokenizer_train.index_word[token_id]
    for token_id in val_caption[rid]
    if token_id != 0
)
result, attention_plot = evaluate(image)

print('Real Caption:', real_caption)
print('Prediction Caption:', ' '.join(result))
plot_attention(image, result, attention_plot)