# Deep Learning Assignment - Image Caption Generator Model

## Introduction

### Team members

* Mahavithana S. G - 2020/E/087
* Weerakoon A. B   - 2020/E/169
* Somapala M. S    - 2020/E/193

## Code

#### Imports

In [1]:
import tensorflow as tf

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

import re

import IPython

from google.colab import drive

from tqdm.notebook import tqdm

#### Data loading and pre-processing

In [2]:
# Mount Google Drive at /content/drive/ so the dataset stored there can be read;
# force_remount=True is passed so an already-mounted drive is mounted again.
drive.mount('/content/drive/', force_remount=True)

Mounted at /content/drive/


Setting data directories

In [3]:
# Dataset root on the mounted Drive, plus the image folder and caption file inside it.
data_dir = "/content/drive/My Drive/Image caption generator Dataset"
image_dir = f"{data_dir}/Images"
csv_file = f"{data_dir}/captions.txt"

Loading captions

In [4]:
# Read the comma-separated caption file into a dataframe.
caption_df = pd.read_csv(csv_file, sep=',')

# Summarise what was loaded: shape, column names, a preview and the number of distinct images.
print(
    f'The shape of dataframe: {caption_df.shape}',
    f'The columns in the dataframe: {caption_df.columns}',
    'First 10 rows of the dataframe:',
    caption_df.head(10),
    f'Unique image names: {caption_df["image"].unique().size}',
    sep='\n',
)

The shape of dataframe: (40455, 2)
The columns in the dataframe: Index(['image', 'caption'], dtype='object')
First 10 rows of the dataframe:
                       image  \
0  1000268201_693b08cb0e.jpg   
1  1000268201_693b08cb0e.jpg   
2  1000268201_693b08cb0e.jpg   
3  1000268201_693b08cb0e.jpg   
4  1000268201_693b08cb0e.jpg   
5  1001773457_577c3a7d70.jpg   
6  1001773457_577c3a7d70.jpg   
7  1001773457_577c3a7d70.jpg   
8  1001773457_577c3a7d70.jpg   
9  1001773457_577c3a7d70.jpg   

                                             caption  
0  A child in a pink dress is climbing up a set o...  
1              A girl going into a wooden building .  
2   A little girl climbing into a wooden playhouse .  
3  A little girl climbing the stairs to her playh...  
4  A little girl in a pink dress going into a woo...  
5         A black dog and a spotted dog are fighting  
6  A black dog and a tri-colored dog playing with...  
7  A black dog and a white dog with brown spots a...  
8  Two dogs

Shuffle dataframe

In [5]:
# Shuffle the rows with a fixed seed so the shuffled order, and therefore the
# train/val/test split that is derived from it, is the same after every
# Restart Kernel -> Run All.
caption_df = caption_df.sample(frac=1, random_state=42).reset_index(drop=True)
print(caption_df.head(10))

                       image  \
0  2704362232_7d84503433.jpg   
1  3545427060_c16a8b7dfd.jpg   
2  3424605029_53078d3505.jpg   
3   330849796_c575c3108a.jpg   
4  3530087422_7eb2b2c289.jpg   
5  2968135512_51fbb56e3e.jpg   
6  3249062399_0dafe5e4f5.jpg   
7  3437273677_47d4462974.jpg   
8  2696060728_3043cfc38c.jpg   
9  2881468095_d4ce8c0c52.jpg   

                                             caption  
0   A man and dog are splashing in a swimming pool .  
1             A group of kids play around in water .  
2  A young boy wearing a blue shirt is playing at...  
3  A woman , curled up , sleeps in her seat on a ...  
4  A person in a red and white suit kneels down o...  
5     Young skateboarder doing a trick in the park .  
6       Three women walk in a line on the sidewalk .  
7  A woman eating at a restaurant with other peop...  
8  A man helps a girl get up on a turquoise surfb...  
9      A skateboarder is balancing on a brick wall .  


Clean captions and add start/end tags

In [6]:
def clean_captions(captions):
    """Normalise a single caption string.

    Every character that is neither a word character nor whitespace is dropped,
    the text is lower-cased, runs of whitespace are collapsed to one space and
    leading/trailing whitespace is trimmed.

    Args:
        captions: the caption text to clean (a single string).

    Returns:
        The cleaned caption string.
    """
    # strip punctuation, then lower-case
    lowered = re.sub(r'[^\w\s]', '', captions).lower()

    # squeeze whitespace runs and trim the ends
    return re.sub(r'\s+', ' ', lowered).strip()

# Tags left by an earlier run of this cell are removed before cleaning. clean_captions
# strips '<' and '>', so re-running the cell on already-tagged captions would otherwise
# turn the tags into the ordinary words "start"/"end" and wrap the caption a second time.
untagged_captions = caption_df['caption'].str.replace(r'^<start>\s*|\s*<end>$', '', regex=True)

# clean the text, then add start and end to captions
caption_df['caption'] = '<start> ' + untagged_captions.apply(clean_captions) + ' <end>'

print('First 10 rows of the dataframe cleaned:')
print(caption_df.head(10))

First 10 rows of the dataframe cleaned:
                       image  \
0  2704362232_7d84503433.jpg   
1  3545427060_c16a8b7dfd.jpg   
2  3424605029_53078d3505.jpg   
3   330849796_c575c3108a.jpg   
4  3530087422_7eb2b2c289.jpg   
5  2968135512_51fbb56e3e.jpg   
6  3249062399_0dafe5e4f5.jpg   
7  3437273677_47d4462974.jpg   
8  2696060728_3043cfc38c.jpg   
9  2881468095_d4ce8c0c52.jpg   

                                             caption  
0  <start> a man and dog are splashing in a swimm...  
1  <start> a group of kids play around in water <...  
2  <start> a young boy wearing a blue shirt is pl...  
3  <start> a woman curled up sleeps in her seat o...  
4  <start> a person in a red and white suit kneel...  
5  <start> young skateboarder doing a trick in th...  
6  <start> three women walk in a line on the side...  
7  <start> a woman eating at a restaurant with ot...  
8  <start> a man helps a girl get up on a turquoi...  
9  <start> a skateboarder is balancing on a brick...  


Set absolute image paths

In [7]:
# Prefix every image file name with the image directory. Only the last path component
# is kept before prefixing, so running this cell again does not prepend the directory twice.
caption_df['image'] = image_dir + '/' + caption_df['image'].str.rsplit('/', n=1).str[-1]

Split into training, validation and testing datasets

In [8]:
# Split by image instead of by caption row. The same image file appears on several rows
# (one per caption), so slicing the shuffled rows directly would place captions of one
# image in more than one split and leak validation/test images into training.
unique_images = caption_df['image'].drop_duplicates().to_numpy()  # order of first appearance
n_train_images = int(0.7 * len(unique_images))
n_val_images = int(0.2 * len(unique_images))

train_images = unique_images[:n_train_images]
val_images = unique_images[n_train_images:n_train_images + n_val_images]

in_train = caption_df['image'].isin(train_images)
in_val = caption_df['image'].isin(val_images)

# split datasets (about 70% / 20% / 10% of the images); every remaining image goes to test
train_df = caption_df[in_train]
val_df = caption_df[in_val]
test_df = caption_df[~(in_train | in_val)]

# number of caption rows in the training and validation splits
train_size = len(train_df)
val_size = len(val_df)

Setup tokenizer

In [9]:
# pick 5000 most recurring words
# pick 5000 most recurring words
VOCAB_SIZE = 5000
# '<' and '>' are not in the filter set, so the <start>/<end> tags and the <unk> token
# keep their brackets. The backslash is spelled '\\', which is the same runtime character
# as the earlier '\]' spelling but does not rely on an unrecognised escape sequence.
tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=VOCAB_SIZE, oov_token="<unk>", filters='!"#$%&()*+.,-/:;=?@[\\]^_`{|}~')

# fit tokenizer on captions
# NOTE(review): this fits on the captions of all splits, not only the training split.
tokenizer.fit_on_texts(caption_df['caption'])

# check and verify the tokenizer
# word_index holds every distinct word seen while fitting, so the printed size can exceed VOCAB_SIZE
print(f'Vocabulary size: {len(tokenizer.word_index) + 1}')
dog_idx = tokenizer.word_index['dog']
print(f'Index of dog: {dog_idx}')
print(f'Word with index {dog_idx}: {tokenizer.index_word[dog_idx]}')

Vocabulary size: 8832
Index of dog: 10
Word with index 10: dog


Preprocess and generate caption datasets

In [10]:
def tensorflow_caption_dataset_from_dataframe(dataframe):
  """Build a tf.data dataset of padded token-id sequences from a caption dataframe.

  The 'caption' column is converted to integer sequences with the module-level
  `tokenizer`, and the sequences are padded at the end ('post') to a common length.
  No explicit maximum length is passed, so the padded length is determined per call.

  Args:
    dataframe: dataframe with a 'caption' column of caption strings.

  Returns:
    A tf.data.Dataset with one padded sequence per dataframe row.
  """
  token_ids = tokenizer.texts_to_sequences(dataframe['caption'])
  padded_ids = tf.keras.preprocessing.sequence.pad_sequences(token_ids, padding='post')
  caption_ds = tf.data.Dataset.from_tensor_slices(padded_ids)
  return caption_ds

# Caption datasets for the three splits, built in train / val / test order.
train_cap_ds, val_cap_ds, test_cap_ds = (
    tensorflow_caption_dataset_from_dataframe(split_df)
    for split_df in (train_df, val_df, test_df)
)

Loading images

In [11]:
@tf.function
def load_img(image_path):
    """Read a JPEG file and return it as a 224x224 float32 RGB image.

    Args:
        image_path: path of the JPEG file to read.

    Returns:
        A float32 tensor of shape (224, 224, 3). convert_image_dtype rescales the
        decoded integer pixels to the [0, 1] floating-point range before resizing.
    """
    img = tf.io.read_file(image_path)
    # Force three channels: without this a grayscale JPEG decodes to a single channel,
    # which cannot be batched with RGB images or fed to the ImageNet backbone.
    img = tf.image.decode_jpeg(img, channels=3)
    img = tf.image.convert_image_dtype(img, tf.float32)
    img = tf.image.resize(img, (224, 224))
    return img

Setup image datasets

In [12]:
def tensorflow_image_dataset_from_dataframe(dataframe):
  """Build a tf.data dataset of decoded images from the 'image' column of a dataframe.

  Args:
    dataframe: dataframe whose 'image' column holds image file paths.

  Returns:
    A tf.data.Dataset that yields `load_img(path)` for every row, in row order.
  """
  image_paths = dataframe['image'].values
  # Decode several images concurrently instead of one at a time. Element order is still
  # preserved by default, which matters because this dataset is later zipped with the
  # caption dataset row by row.
  return tf.data.Dataset.from_tensor_slices(image_paths).map(
      load_img, num_parallel_calls=tf.data.experimental.AUTOTUNE)

# Image datasets for the three splits, built in train / val / test order.
train_img_ds, val_img_ds, test_img_ds = (
    tensorflow_image_dataset_from_dataframe(split_df)
    for split_df in (train_df, val_df, test_df)
)

#### Model Definition

The **show and tell** model contains two main components,

<ol>
  <li> Show (Encoder)
    <ul>
      <li>Encompasses image feature extraction</li>
      <li>Uses a pretrained ResNet50 CNN (ImageNet weights, frozen)</li>
      <li>Passes the extracted features to the decoder as the first step of its input sequence</li>
    </ul>
  </li>
  <li> Tell (Decoder)
    <ul>
      <li>Encompasses caption generation</li>
      <li>Accepts the image features followed by the embedded caption tokens, starting from a zero hidden state</li>
      <li>Uses LSTMs (or GRUs) to generate token indices which can be mapped to words from the loaded vocab</li>
    </ul>
  </li>
</ol>

Encoder

In [13]:
class Encoder(tf.keras.Model):
    """Image encoder: frozen ImageNet ResNet50 -> global average pooling -> dense projection.

    Args:
        embedding_dim: number of units of the final dense layer, i.e. the size of the
            feature vector produced per image.
        activation: activation of the final dense layer.
        pixel_scale: factor that brings incoming pixel values to the 0-255 range expected
            by ResNet50's `preprocess_input`. The default of 255.0 matches images in
            [0, 1], which is what `load_img` in this notebook produces; pass 1.0 for
            images that are already in 0-255.
    """

    def __init__(self, embedding_dim, activation='sigmoid', pixel_scale=255.0):
        super(Encoder, self).__init__()
        self.embedding_dim = embedding_dim
        self.activation = activation
        self.pixel_scale = pixel_scale

    def build(self, input_shape):
        # Sub-layers are created when the model is built rather than in __init__.
        self.resnet = tf.keras.applications.ResNet50(include_top=False, weights='imagenet')
        # The pretrained backbone is not fine-tuned; only the dense projection is trained.
        self.resnet.trainable = False
        self.gap = tf.keras.layers.GlobalAveragePooling2D()
        self.fc = tf.keras.layers.Dense(units=self.embedding_dim, activation=self.activation)

    def call(self, x):
        """Return one `embedding_dim`-sized feature vector per input image."""
        # The ImageNet weights were trained on inputs passed through preprocess_input
        # (0-255 pixels, RGB -> BGR, per-channel mean subtraction). Feeding raw [0, 1]
        # floats gives the frozen backbone inputs far from what it was trained on.
        x = tf.keras.applications.resnet50.preprocess_input(x * self.pixel_scale)
        x = self.resnet(x)
        x = self.gap(x)
        x = self.fc(x)
        return x

Decoder

In [14]:
class Decoder(tf.keras.Model):
    """Caption decoder: a stack of GRU/LSTM layers followed by two dense layers.

    Args:
        embedding_dim: output size of the token embedding layer.
        units: number of units of every recurrent layer and of the first dense layer.
        vocab_size: embedding input size and size of the final dense (logit) layer.
        rnn_type: 'GRU' or 'LSTM'.
        num_layers: number of stacked recurrent layers.

    Raises:
        ValueError: if `rnn_type` is neither 'GRU' nor 'LSTM'.
    """

    def __init__(self, embedding_dim, units, vocab_size, rnn_type='GRU', num_layers=4):
        super(Decoder, self).__init__()
        # Fail early: an unrecognised type used to yield a decoder with no recurrent
        # layer at all, which only showed up later as confusing downstream behaviour.
        if rnn_type not in ('GRU', 'LSTM'):
            raise ValueError(f"rnn_type must be 'GRU' or 'LSTM', got {rnn_type!r}")

        self.units = units
        self.embedding_dim = embedding_dim
        self.vocab_size = vocab_size
        self.rnn_type = rnn_type
        self.num_layers = num_layers
        self.embedding = tf.keras.layers.Embedding(input_dim=self.vocab_size, output_dim=self.embedding_dim)

        rnn_cls = tf.keras.layers.GRU if self.rnn_type == 'GRU' else tf.keras.layers.LSTM
        self.rnn_layers = [
            rnn_cls(units=self.units, return_sequences=True, return_state=True)
            for _ in range(self.num_layers)
        ]

        self.fc1 = tf.keras.layers.Dense(self.units)
        self.fc2 = tf.keras.layers.Dense(self.vocab_size)

    def call(self, x, intialize_to_zero=False):
        """Run the recurrent stack and the dense head over an already-embedded sequence.

        Args:
            x: input sequence fed to the first recurrent layer (callers embed tokens
                beforehand via `embed`).
            intialize_to_zero: when True, every recurrent layer is given an explicit
                all-zero initial state; when False, the layers use their own default.
                (The misspelt name is kept so existing keyword callers keep working.)

        Returns:
            (logits, state): the output of the final dense layer for every time step,
            and element 1 of the last recurrent layer's return value. For LSTM layers
            that is only the first of the two returned state tensors.
        """
        initial_state = None
        if intialize_to_zero:
            # Prefer the static batch size; fall back to the dynamic one when it is unknown.
            batch_size = x.shape[0] if x.shape[0] is not None else tf.shape(x)[0]
            initial_state = self.reset_state(batch_size=batch_size)

        output = x
        state = None

        for rnn_layer in self.rnn_layers:
            # initial_state=None is the layer's default argument, i.e. "no explicit state".
            output_tuple = rnn_layer(inputs=output, initial_state=initial_state)
            output = output_tuple[0]
            state = output_tuple[1]

        x = self.fc1(output)
        x = self.fc2(x)

        return x, state

    def embed(self, x):
        """Map token ids to their embedding vectors."""
        return self.embedding(x)

    def reset_state(self, batch_size):
        """Return an all-zero initial state for one recurrent layer.

        A single (batch_size, units) tensor for GRU, a list of two such tensors for LSTM.
        """
        if self.rnn_type == 'GRU':
            return tf.zeros((batch_size, self.units))
        elif self.rnn_type == 'LSTM':
            return [tf.zeros((batch_size, self.units)), tf.zeros((batch_size, self.units))]

Define loss function

In [15]:
def loss_function(real, pred, loss_object):
    """Padding-masked loss averaged over the real (non-padding) tokens.

    Args:
        real: target token ids; id 0 is treated as padding and excluded from the loss.
        pred: predictions aligned with `real`.
        loss_object: loss callable applied as `loss_object(real, pred)`; it is expected
            to return per-token losses (the notebook builds it with reduction='none').

    Returns:
        Scalar tensor: sum of the masked per-token losses divided by the number of
        non-padding tokens (0 when there are none).
    """
    loss = loss_object(real, pred)
    mask = tf.cast(tf.math.not_equal(real, 0), dtype=loss.dtype)
    # Normalise by the number of real tokens instead of taking the mean over every position.
    # A plain mean also counts the zeroed padding positions, so the value shrinks with the
    # amount of padding and is not comparable between datasets padded to different lengths.
    return tf.math.divide_no_nan(tf.reduce_sum(loss * mask), tf.reduce_sum(mask))

Define train function

In [16]:
@tf.function
def train_step(img_tensor, target, optimizer, encoder, decoder, loss_object):
    """Run one optimisation step on a batch and return its loss.

    The encoder output is inserted as an extra leading time step in front of the embedded
    target tokens, and the decoder is run over that sequence. The loss compares the target
    without its first token against the predictions without their first and last step.

    Args:
        img_tensor: batch of images passed to the encoder.
        target: batch of target token-id sequences.
        optimizer: optimizer used to apply the gradients.
        encoder, decoder: the models being trained.
        loss_object: loss callable forwarded to `loss_function`.

    Returns:
        The loss tensor computed for this batch.
    """
    # Forward pass under the tape so gradients can be taken afterwards.
    with tf.GradientTape() as tape:
        image_step = tf.expand_dims(encoder(img_tensor), 1)
        word_embeddings = decoder.embed(target)
        decoder_input = tf.concat([image_step, word_embeddings], axis=1)
        predictions, _ = decoder(decoder_input, True)
        loss = loss_function(target[:, 1:], predictions[:, 1:-1, :], loss_object)

    # Update every trainable variable of both models.
    variables = encoder.trainable_variables + decoder.trainable_variables
    grads = tape.gradient(loss, variables)
    optimizer.apply_gradients(zip(grads, variables))

    return loss

Define validation function

In [17]:
@tf.function
def val_step(img_tensor, target, encoder, decoder, loss_object):
    """Compute the loss of one batch without updating any weights.

    Performs the same forward pass and loss computation as the training step: the encoder
    output is placed in front of the embedded target tokens as an extra leading time step,
    and the target without its first token is compared against the predictions without
    their first and last step.

    Returns:
        The loss tensor computed for this batch.
    """
    image_step = tf.expand_dims(encoder(img_tensor), 1)
    word_embeddings = decoder.embed(target)
    decoder_input = tf.concat([image_step, word_embeddings], axis=1)
    predictions, _ = decoder(decoder_input, True)
    return loss_function(target[:, 1:], predictions[:, 1:-1, :], loss_object)

Train and return model

In [18]:
def train_and_return_model(train_img_ds,
                           train_cap_ds,
                           val_img_ds,
                           val_cap_ds,
                           epochs=15,
                           batch_size=512,
                           embedding_dim=512,
                           decoder_dense_units=256,
                           vocab_size=VOCAB_SIZE,
                           rnn_type='GRU',
                           num_layers=4):
  """Train a fresh encoder/decoder pair and return it with its loss history.

  Args:
    train_img_ds, train_cap_ds: image and caption datasets for training; zipped element-wise.
    val_img_ds, val_cap_ds: image and caption datasets for validation; zipped element-wise.
    epochs: number of passes over the training data.
    batch_size: batch size used for both the training and the validation data.
    embedding_dim: passed to both Encoder and Decoder as `embedding_dim`.
    decoder_dense_units: passed to the Decoder as `units`.
    vocab_size, rnn_type, num_layers: forwarded to the Decoder.

  Returns:
    (encoder, decoder, epoch_wise_loss, epoch_wise_val_loss), where the two lists hold
    the mean batch loss of every epoch for training and validation respectively.
  """
  autotune = tf.data.experimental.AUTOTUNE

  # pair images with captions, then batch and prefetch
  train_ds = tf.data.Dataset.zip((train_img_ds, train_cap_ds)).batch(batch_size).prefetch(autotune)
  val_ds = tf.data.Dataset.zip((val_img_ds, val_cap_ds)).batch(batch_size).prefetch(autotune)

  # models
  encoder = Encoder(embedding_dim=embedding_dim)
  decoder = Decoder(embedding_dim=embedding_dim,
                    units=decoder_dense_units,
                    vocab_size=vocab_size,
                    rnn_type=rnn_type,
                    num_layers=num_layers)

  # optimizer and per-token loss (reduction is left to loss_function)
  optimizer = tf.keras.optimizers.Adam()
  loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction='none')

  epoch_wise_loss, epoch_wise_val_loss = [], []
  for epoch in tqdm(range(epochs), desc="Epochs: "):
    # training pass
    train_losses = []
    for batch, (img_tensor, target) in enumerate(tqdm(train_ds, desc="Training Batches: ", leave=False)):
      step_loss = train_step(img_tensor, target, optimizer, encoder, decoder, loss_object).numpy()
      train_losses.append(step_loss)

      # report every tenth batch
      if batch % 10 == 0:
        print(f'Epoch: {epoch} Batch: {batch} Loss: {step_loss:.3f}')

    epoch_wise_loss.append(np.mean(train_losses))

    # validation pass
    val_losses = [
        val_step(img_tensor, target, encoder, decoder, loss_object).numpy()
        for img_tensor, target in tqdm(val_ds, desc="Validation Batches: ", leave=False)
    ]
    epoch_wise_val_loss.append(np.mean(val_losses))

    print(f'Epoch: {epoch} Total Loss: {epoch_wise_loss[-1]:.3f} Val Loss: {epoch_wise_val_loss[-1]:.3f}')
    print('-' * 40)

  return encoder, decoder, epoch_wise_loss, epoch_wise_val_loss


Test train function

In [19]:
# encoder, decoder, epoch_wise_loss, epoch_wise_val_loss = train_and_return_model(train_img_ds,
#                                                                                 train_cap_ds,
#                                                                                 val_img_ds,
#                                                                                 val_cap_ds)

Install and login to wandb

In [20]:
!pip install wandb -qU

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.9/6.9 MB[0m [31m19.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m207.3/207.3 kB[0m [31m21.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m301.8/301.8 kB[0m [31m30.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.7/62.7 kB[0m [31m8.6 MB/s[0m eta [36m0:00:00[0m
[?25h

In [21]:
import wandb
# Authenticate this session with Weights & Biases (prompts for an API key when none is stored).
wandb.login()

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

Run pre-configured sweep

In [None]:
def tune():
  """Train one model with the hyperparameters of the current W&B run and log its validation loss.

  Reads epochs, batch_size, embedding_dim, decoder_dense_units, rnn_type and num_layers
  from `wandb.config`, trains on the notebook-level training/validation datasets and logs
  the mean of the per-epoch validation losses ("val_loss") together with the full
  per-epoch list ("epoch_wise_val_loss").
  """
  # start the run; its config carries the hyperparameters to use
  wandb.init()
  config = wandb.config

  hyperparameters = dict(
      epochs=config.epochs,
      batch_size=config.batch_size,
      embedding_dim=config.embedding_dim,
      decoder_dense_units=config.decoder_dense_units,
      rnn_type=config.rnn_type,
      num_layers=config.num_layers,
  )

  # only the validation loss history is needed here
  _, _, _, epoch_wise_val_loss = train_and_return_model(
      train_img_ds, train_cap_ds, val_img_ds, val_cap_ds, **hyperparameters)

  wandb.log({"val_loss": np.mean(epoch_wise_val_loss), "epoch_wise_val_loss": epoch_wise_val_loss})

# ID of the sweep to attach to (presumably created beforehand on the W&B side — the
# section heading calls it pre-configured; no project/entity is passed here).
sweep_id = "40z0uu0q"

# Start an agent for that sweep, with tune() as the function it runs.
wandb.agent(sweep_id, function=tune)