# Data preparation

In [None]:
import os
import tensorflow as tf
import numpy as np
import pandas as pd
import random

# Seed every random number generator the notebook relies on, so that the
# train/validation shuffle and the image augmentation can be reproduced.
SEED = 1234
tf.random.set_seed(SEED)
random.seed(SEED)
# Keras' ImageDataGenerator draws its random transforms from NumPy's global RNG,
# which the two calls above do not cover.
np.random.seed(SEED)

# Experiment name; used further down to build the checkpoint and log directories.
test_name = 'co_attention_1'

In [None]:
# Pick the base directory: the current working directory by default,
# the mounted Google Drive folder when the notebook runs on Colab.
base_dir = os.getcwd()
if 'google.colab' in str(get_ipython()):
  from google.colab import drive
  drive.mount('/content/drive')
  base_dir = '/content/drive/My Drive/AN2DL/homework_3'


Mounted at /content/drive


In [None]:
import zipfile

# Folder (relative to the working directory) that holds the extracted dataset.
dataset_dir = 'VQA_Dataset'

# Extract the archive only once. The archive is looked up under base_dir so the
# cell also works outside Colab, and zipfile replaces the shell-only `!unzip`.
if not os.path.exists(dataset_dir):
    with zipfile.ZipFile(os.path.join(base_dir, 'anndl-2020-vqa.zip')) as archive:
        # Extract into the current working directory, like a bare `unzip` would.
        archive.extractall()

In [None]:
from tensorflow.keras.preprocessing.image import ImageDataGenerator

# Augmentation applied to the training images.
# horizontal_flip is disabled on purpose: 'left' and 'right' are answer classes
# (see CustomDataset.LABELS_DICT), so mirroring an image would silently turn a
# correct answer into a wrong label.
# NOTE(review): vertical_flip may distort spatial questions in a similar way;
# left unchanged here - confirm against the question set.
train_img_data_gen = ImageDataGenerator(rotation_range=10,
                                        width_shift_range=10,
                                        height_shift_range=10,
                                        zoom_range=0.3,
                                        horizontal_flip=False,
                                        vertical_flip=True,
                                        fill_mode='nearest',
                                        rescale=1./255)

# Validation images are not augmented.
# NOTE(review): the dataset class only calls get_random_transform/apply_transform;
# `rescale` is normally applied by ImageDataGenerator.standardize(), so it looks
# like the 1/255 scaling is never applied to train, validation or test images.
# Confirm before changing, and change all three consistently.
valid_img_data_gen = ImageDataGenerator(rescale=1./255)

In [None]:
import json
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Read the annotation file; its values are the per-question records
# (each record exposes at least a 'question' field, used below).
dataset_filepath = os.path.join(dataset_dir, 'train_questions_annotations.json')
with open(dataset_filepath, 'r') as f:
  dataset_file = json.load(f)

# 80% of the records go to training, the remainder to validation
training_count = int(0.8 * len(dataset_file))
validation_count = len(dataset_file) - training_count

# Shuffle the records (seeded through `random.seed`) before slicing
items = list(dataset_file.values())
random.shuffle(items)

training_items = items[:training_count]
validation_items = items[training_count:]

# The tokenizer vocabulary is learned from the training questions only
tokenizer = Tokenizer()
train_questions = [item['question'] for item in training_items]
tokenizer.fit_on_texts(train_questions)

# Training questions -> integer sequences, right-padded to the longest one
tokenized_train_questions = tokenizer.texts_to_sequences(train_questions)
max_question_length = max(map(len, tokenized_train_questions))
question_inputs_train = pad_sequences(tokenized_train_questions,
                                      maxlen=max_question_length,
                                      padding='post')

# Validation questions go through the same tokenizer and the same padded length
valid_questions = [item['question'] for item in validation_items]
tokenized_valid_questions = tokenizer.texts_to_sequences(valid_questions)
question_inputs_valid = pad_sequences(tokenized_valid_questions,
                                      maxlen=max_question_length,
                                      padding='post')

# Number of distinct words known to the tokenizer
dictionary_dim = len(tokenizer.word_index)

In [None]:
from PIL import Image

# Define a custom dataset class extending Keras Sequence
class CustomDataset(tf.keras.utils.Sequence):
  """
    Per-sample VQA dataset built on tf.keras.utils.Sequence.

    Indexing returns ONE sample (not a batch):
      ((padded question tokens, image array as float32), one-hot answer)

    3 main methods:
      - __init__: save dataset params
      - __len__: return the total number of samples in the dataset
      - __getitem__: return a sample from the dataset
  """

  # Answer string -> class index. The one-hot target length is len(LABELS_DICT).
  LABELS_DICT = {
    '0': 0,
    '1': 1,
    '2': 2,
    '3': 3,
    '4': 4,
    '5': 5,
    'apple': 6,
    'baseball': 7,
    'bench': 8,
    'bike': 9,
    'bird': 10,
    'black': 11,
    'blanket': 12,
    'blue': 13,
    'bone': 14,
    'book': 15,
    'boy': 16,
    'brown': 17,
    'cat': 18,
    'chair': 19,
    'couch': 20,
    'dog': 21,
    'floor': 22,
    'food': 23,
    'football': 24,
    'girl': 25,
    'grass': 26,
    'gray': 27,
    'green': 28,
    'left': 29,
    'log': 30,
    'man': 31,
    'monkey bars': 32,
    'no': 33,
    'nothing': 34,
    'orange': 35,
    'pie': 36,
    'plant': 37,
    'playing': 38,
    'red': 39,
    'right': 40,
    'rug': 41,
    'sandbox': 42,
    'sitting': 43,
    'sleeping': 44,
    'soccer': 45,
    'squirrel': 46,
    'standing': 47,
    'stool': 48,
    'sunny': 49,
    'table': 50,
    'tree': 51,
    'watermelon': 52,
    'white': 53,
    'wine': 54,
    'woman': 55,
    'yellow': 56,
    'yes': 57
  }


  def __init__(self, items, question_inputs=None, max_question_length=None, img_generator=None, out_shape=None):
    """
      Initialize the object.

      Keyword arguments:
      items -- list of annotation records; each one is read for 'image_id' and 'answer'
      question_inputs -- padded token sequences, indexed with the same index as `items`
      max_question_length -- padded question length; only stored, so callers can read it back
      img_generator -- ImageDataGenerator object to apply to the images or None
      out_shape -- target image size passed as-is to PIL's Image.resize, i.e. a
                   tuple (width, height), or None to keep the original size
    """

    # Set class properties
    self.items = items
    self.question_inputs = question_inputs
    self.max_question_length = max_question_length
    self.img_generator = img_generator
    self.out_shape = out_shape

  def __len__(self):
    """
      Return the number of samples in the dataset.
    """
    return len(self.items)

  def __getitem__(self, index):
    """
      Return one sample from the set.

      Keyword arguments:
      index -- index of the sample to return

      Returns ((question tokens, float32 image array), one-hot answer).
      Raises KeyError if the record's answer is not in LABELS_DICT.
    """

    curr_item = self.items[index]
    img_path = os.path.join(dataset_dir, 'Images', curr_item['image_id'] + '.png')

    # Convert RGBA -> RGB while the file is open; convert() returns a new image,
    # so the underlying file handle can be released right away.
    with Image.open(img_path) as raw_img:
      img = raw_img.convert('RGB')

    # Resize image
    if self.out_shape is not None:
      img = img.resize(self.out_shape)

    img_arr = np.array(img)

    if self.img_generator is not None:
      # Draw a fresh random transformation for every sample.
      # No per-call seed: passing the same seed on each call would reseed the
      # RNG and apply the identical "random" transform to every image.
      img_t = self.img_generator.get_random_transform(img_arr.shape)
      img_arr = self.img_generator.apply_transform(img_arr, img_t)

    # Convert answer to one-hot
    answer = np.zeros(len(self.LABELS_DICT))
    answer[self.LABELS_DICT[curr_item['answer']]] = 1

    return (self.question_inputs[index], np.float32(img_arr)), answer

In [None]:
# Size (in pixels) the images are resized to
img_w, img_h = 700, 400

# Number of answer classes
num_classes = 58

# Batch size used for the training pipeline
bs = 16

# Per-sample generators for the training and validation splits.
# out_shape is given as (width, height) because it is forwarded to PIL's resize.
dataset = CustomDataset(items=training_items,
                        question_inputs=question_inputs_train,
                        max_question_length=max_question_length,
                        img_generator=train_img_data_gen,
                        out_shape=(img_w, img_h))
dataset_valid = CustomDataset(items=validation_items,
                              question_inputs=question_inputs_valid,
                              max_question_length=max_question_length,
                              img_generator=valid_img_data_gen,
                              out_shape=(img_w, img_h))

In [None]:
# Training pipeline: wrap the per-sample generator, group samples into batches
# of `bs`, and repeat indefinitely (the epoch length is set in model.fit).
train_dataset = (
    tf.data.Dataset.from_generator(
        lambda: dataset,
        output_types=((np.int32, np.float32), np.int32),
        output_shapes=(([dataset.max_question_length,], [img_h, img_w, 3]), (num_classes)))
    .batch(bs)
    .repeat()
)

# Validation pipeline: same element structure, one sample per batch.
# The question length is read from `dataset`; both generators were built with
# the same max_question_length.
valid_dataset = (
    tf.data.Dataset.from_generator(
        lambda: dataset_valid,
        output_types=((np.int32, np.float32), np.int32),
        output_shapes=(([dataset.max_question_length,], [img_h, img_w, 3]), (num_classes)))
    .batch(1)
    .repeat()
)

# Model

In [None]:
class MultiHeadAttention(tf.keras.layers.Layer):
    """Multi-head scaled dot-product attention taking separate query, key and value inputs."""

    def __init__(self, embed_dim, num_heads=8):
        """Create the projection layers.

        embed_dim -- width of the query/key/value projections and of the output
        num_heads -- number of heads; must divide embed_dim

        Raises ValueError when embed_dim is not a multiple of num_heads.
        """
        super().__init__()
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        if embed_dim % num_heads:
            raise ValueError(
                f"embedding dimension = {embed_dim} should be divisible by number of heads = {num_heads}"
            )
        # Width handled by each single head
        self.projection_dim = embed_dim // num_heads
        self.query_dense = tf.keras.layers.Dense(embed_dim)
        self.key_dense = tf.keras.layers.Dense(embed_dim)
        self.value_dense = tf.keras.layers.Dense(embed_dim)
        self.combine_heads = tf.keras.layers.Dense(embed_dim)

    def attention(self, query, key, value):
        """Return (attended values, attention weights) for already-split heads."""
        # Scale the dot products by sqrt of the last (per-head) dimension of key
        depth = tf.cast(tf.shape(key)[-1], tf.float32)
        logits = tf.matmul(query, key, transpose_b=True) / tf.math.sqrt(depth)
        weights = tf.nn.softmax(logits, axis=-1)
        return tf.matmul(weights, value), weights

    def separate_heads(self, x, batch_size):
        """Reshape (batch, seq, embed_dim) into (batch, num_heads, seq, projection_dim)."""
        per_head = tf.reshape(x, (batch_size, -1, self.num_heads, self.projection_dim))
        return tf.transpose(per_head, perm=[0, 2, 1, 3])

    def call(self, query, key, value):
        """Project the inputs, attend per head, and merge the heads back together."""
        batch_size = tf.shape(query)[0]

        # Linear projections, each followed by the split into heads:
        # (batch, seq, embed_dim) -> (batch, num_heads, seq, projection_dim)
        q_heads = self.separate_heads(self.query_dense(query), batch_size)
        k_heads = self.separate_heads(self.key_dense(key), batch_size)
        v_heads = self.separate_heads(self.value_dense(value), batch_size)

        # The attention weights are computed but not returned to the caller
        context, _ = self.attention(q_heads, k_heads, v_heads)

        # (batch, num_heads, seq, projection_dim) -> (batch, seq, embed_dim)
        context = tf.transpose(context, perm=[0, 2, 1, 3])
        merged = tf.reshape(context, (batch_size, -1, self.embed_dim))

        return self.combine_heads(merged)

In [None]:
class TransformerEncoder(tf.keras.layers.Layer):
    """Attention block followed by a feed-forward block, each with dropout,
    a residual connection and layer normalization."""

    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1):
        """
        embed_dim -- width of the attention output and of the block output
        num_heads -- number of attention heads
        ff_dim -- hidden width of the feed-forward network
        rate -- dropout rate used after the attention and after the feed-forward network
        """
        super().__init__()
        self.att = MultiHeadAttention(embed_dim, num_heads)
        self.ffn = tf.keras.Sequential([
            tf.keras.layers.Dense(ff_dim, activation="relu"),
            tf.keras.layers.Dense(embed_dim),
        ])
        self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = tf.keras.layers.Dropout(rate)
        self.dropout2 = tf.keras.layers.Dropout(rate)

    def call(self, inputs, training):
        """Run the block.

        inputs -- list of 1, 2 or 3 tensors:
                  [x] -> self-attention on x,
                  [q, kv] -> q attends to kv,
                  [q, k, v] -> explicit query, key and value
        training -- forwarded to the dropout layers

        Raises ValueError for any other list length.
        """
        n_inputs = len(inputs)
        if n_inputs not in (1, 2, 3):
            raise ValueError(
                "encoder's input list length must be 1, 2 or 3"
            )

        # Missing key/value entries fall back to the previous entry in the list
        query = inputs[0]
        key = inputs[1] if n_inputs >= 2 else query
        value = inputs[2] if n_inputs == 3 else key

        # Attention sub-block; the residual is taken from the query input
        attended = self.dropout1(self.att(query, key, value), training=training)
        normed = self.layernorm1(inputs[0] + attended)

        # Feed-forward sub-block
        transformed = self.dropout2(self.ffn(normed), training=training)
        return self.layernorm2(normed + transformed)

In [None]:
class TransformerDecoder(tf.keras.layers.Layer):
    """Self-attention, then guided attention over a second input, then a
    feed-forward block; each step has dropout, a residual and layer normalization."""

    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1):
        """
        embed_dim -- width of the attention outputs and of the block output
        num_heads -- number of attention heads
        ff_dim -- hidden width of the feed-forward network
        rate -- dropout rate shared by the three dropout layers
        """
        super().__init__()
        self.att_s = MultiHeadAttention(embed_dim, num_heads)
        self.att_g = MultiHeadAttention(embed_dim, num_heads)
        self.ffn = tf.keras.Sequential([
            tf.keras.layers.Dense(ff_dim, activation="relu"),
            tf.keras.layers.Dense(embed_dim),
        ])
        self.layernorm_s = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm_g = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm_o = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.dropout_s = tf.keras.layers.Dropout(rate)
        self.dropout_g = tf.keras.layers.Dropout(rate)
        self.dropout_o = tf.keras.layers.Dropout(rate)

    def call(self, inputs, training):
        """Run the block.

        inputs -- [q, kv] or [q, k, v]; q is refined, k/v guide the second attention
        training -- forwarded to the dropout layers

        Raises ValueError when the list has a length other than 2 or 3.
        """
        state = inputs[0]
        n_inputs = len(inputs)
        if n_inputs == 2:
            guide_k = guide_v = inputs[1]
        elif n_inputs == 3:
            guide_k, guide_v = inputs[1], inputs[2]
        else:
            raise ValueError(
                "decoder's input list length must be 2 or 3"
            )

        # Self attention on the query stream
        self_att = self.dropout_s(self.att_s(state, state, state), training=training)
        state = self.layernorm_s(state + self_att)

        # Guided attention: the query stream attends to the key/value stream
        guided_att = self.dropout_g(self.att_g(state, guide_k, guide_v), training=training)
        state = self.layernorm_g(state + guided_att)

        # Feed-forward sub-block
        transformed = self.dropout_o(self.ffn(state), training=training)
        return self.layernorm_o(state + transformed)

In [None]:
# Load the pre-trained GloVe vectors into a dict: word -> float32 vector.
# Each line of the file is a word followed by its space-separated coefficients.
embeddings_index = {}
glove_path = os.path.join(base_dir, 'glove.6B', 'glove.6B.300d.txt')

# The context manager closes the file even if parsing fails; the explicit
# encoding avoids depending on the platform default.
with open(glove_path, encoding='utf-8') as glove_file:
    for line in glove_file:
        values = line.split()
        if not values:
            # Skip blank lines instead of failing on values[0]
            continue
        embeddings_index[values[0]] = np.asarray(values[1:], dtype='float32')

In [None]:
# One row per tokenizer index (plus row 0), 300 columns to match the GloVe file loaded above.
embedding_matrix = np.zeros((dictionary_dim + 1, 300))
for word, i in tokenizer.word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is None:
        # Words without a pre-trained vector keep their all-zeros row
        continue
    embedding_matrix[i] = embedding_vector

In [None]:
class TokenAndPositionEmbedding(tf.keras.layers.Layer):
    """Sum of a frozen, pre-initialized token embedding and a learned position embedding."""

    def __init__(self, maxlen, vocab_size, embed_dim, embedding_matrix):
        """
        maxlen -- number of positions the position embedding can represent
        vocab_size -- number of rows of the token embedding
        embed_dim -- embedding width, shared by both embeddings
        embedding_matrix -- initial weights for the token embedding (kept non-trainable)
        """
        super().__init__()
        self.token_emb = tf.keras.layers.Embedding(
            input_dim=vocab_size,
            output_dim=embed_dim,
            weights=[embedding_matrix],
            trainable=False,
        )
        self.pos_emb = tf.keras.layers.Embedding(input_dim=maxlen, output_dim=embed_dim)

    def call(self, x):
        """Embed the token ids in x and add the embedding of each position."""
        # Positions 0 .. seq_len-1, taken from the last dimension of the input
        seq_len = tf.shape(x)[-1]
        position_vectors = self.pos_emb(tf.range(start=0, limit=seq_len, delta=1))
        token_vectors = self.token_emb(x)
        return token_vectors + position_vectors

In [None]:
def conv2D(x, filters, k_size=3, strirdes=1, relu=0, pool_size=0):
    """Apply a 'same'-padded, L2-regularized Conv2D to x, optionally followed by
    ReLU and max pooling.

    x -- input tensor
    filters -- number of convolution filters
    k_size -- kernel size
    strirdes -- convolution strides (parameter name kept as-is for existing callers)
    relu -- truthy to append a ReLU activation
    pool_size -- truthy to append a MaxPool2D with this pool size
    """
    conv_layer = tf.keras.layers.Conv2D(
        filters=filters,
        kernel_size=k_size,
        strides=strirdes,
        padding='same',
        kernel_regularizer='l2',
        # NOTE(review): input_shape looks unnecessary for a layer applied to an
        # existing tensor; kept to leave behavior untouched - confirm and drop.
        input_shape=[None],
    )
    out = conv_layer(x)

    if relu:
        out = tf.keras.layers.ReLU()(out)
    if pool_size:
        out = tf.keras.layers.MaxPool2D(pool_size=pool_size)(out)

    return out

In [None]:
def vision_embending(inputs, out_length, question_length):
    """Turn an image tensor into a (question_length, out_length) sequence of features.

    The image goes through an ImageNet-initialized DenseNet121 without its top;
    question_length independent 1x1 convolutions (each followed by ReLU and
    global average pooling) then produce one out_length-wide vector apiece.

    inputs -- Keras input tensor holding the image
    out_length -- width of each produced vector
    question_length -- number of vectors to produce
    """
    backbone = tf.keras.applications.DenseNet121(input_tensor=inputs,
                                                 include_top=False,
                                                 weights='imagenet')
    feature_map = backbone.output

    def pooled_projection():
        # Every call creates a new convolution with its own weights
        projected = conv2D(feature_map, out_length, k_size=1, strirdes=1, relu=1)
        return tf.keras.layers.GlobalAveragePooling2D()(projected)

    slots = [pooled_projection() for _ in range(question_length)]

    # Concatenate the vectors side by side, then fold them back into a sequence
    flat = tf.keras.layers.Concatenate()(slots)
    return tf.keras.layers.Reshape((question_length, out_length))(flat)

In [1]:
# Free up RAM in case the model definition cells were run multiple times
tf.keras.backend.clear_session()

# Hyper-parameters of the co-attention model
input_length = max_question_length
vocab_size = dictionary_dim + 1
embed_dim = 300  # Embedding size for each token
num_heads = 8  # Number of attention heads
encode_dim = 512  # Encoding size for each encoder
ff_dim = 512  # Hidden layer size in feed forward network inside transformer

################################### Image Embedding ################################################
# The image is mapped to `input_length` vectors of width 512 (= encode_dim)
image_input = tf.keras.layers.Input(shape=(img_h, img_w, 3))
image_embedding = vision_embending(image_input, 512, input_length)

################################### Question Embedding #############################################
# shape must be a tuple: (input_length,) rather than the bare int (input_length)
question_input = tf.keras.layers.Input(shape=(input_length,), dtype='int32')
#question_embedding = TokenAndPositionEmbedding(input_length, vocab_size, embed_dim, embedding_matrix)(question_input)
question_embedding = tf.keras.layers.Embedding(input_dim=vocab_size,
                                               output_dim=embed_dim,
                                               weights=[embedding_matrix])(question_input)
question_embedding = tf.keras.layers.LSTM(encode_dim, return_sequences=True)(question_embedding)

################################## Encoder #########################################################
# Three stacked self-attention blocks over the question
# 1 SAtt
enc = TransformerEncoder(encode_dim, num_heads, ff_dim)([question_embedding])
# 2 SAtt
enc = TransformerEncoder(encode_dim, num_heads, ff_dim)([enc])
# 3 SAtt
enc = TransformerEncoder(encode_dim, num_heads, ff_dim)([enc])

################################## Decoder #########################################################
# NOTE(review): these blocks are labelled SGAtt (self + guided attention) but are
# built from TransformerEncoder, which performs a single attention step with the
# question as key/value; the TransformerDecoder class defined in this notebook
# is never used. Confirm this is intended.
# 1 SGAtt
dec = TransformerEncoder(encode_dim, num_heads, ff_dim)([image_embedding, enc])
# 2 SGAtt
dec = TransformerEncoder(encode_dim, num_heads, ff_dim)([dec, enc])
# 3 SGAtt
dec = TransformerEncoder(encode_dim, num_heads, ff_dim)([dec, enc])

################################### Feature fusion #################################################
# Attention pooling: score every position, normalize the scores ACROSS positions
# (axis=1) and take the weighted sum. A softmax inside Dense(1) would normalize
# over the single output unit, giving a weight of 1.0 to every position.

# Question
enc_alpha = tf.keras.layers.Dense(encode_dim, activation='relu')(enc)
enc_alpha = tf.keras.layers.Dropout(0.1)(enc_alpha)
enc_alpha = tf.keras.layers.Dense(1)(enc_alpha)
enc_alpha = tf.keras.layers.Softmax(axis=1)(enc_alpha)

question_out = tf.keras.layers.multiply([enc_alpha, enc])
question_out = tf.reduce_sum(question_out, 1)
question_out = tf.keras.layers.Dense(encode_dim, activation='relu')(question_out)

# Image
dec_alpha = tf.keras.layers.Dense(encode_dim, activation='relu')(dec)
dec_alpha = tf.keras.layers.Dropout(0.1)(dec_alpha)
dec_alpha = tf.keras.layers.Dense(1)(dec_alpha)
dec_alpha = tf.keras.layers.Softmax(axis=1)(dec_alpha)

image_out = tf.keras.layers.multiply([dec_alpha, dec])
image_out = tf.reduce_sum(image_out, 1)
image_out = tf.keras.layers.Dense(encode_dim, activation='relu')(image_out)

################################### Merging and output #############################################
# Sum the two pooled vectors, normalize, and classify over the answer classes
merged = tf.keras.layers.LayerNormalization()(question_out + image_out)
output = tf.keras.layers.Dense(units=num_classes, activation='softmax')(merged)
model = tf.keras.models.Model(inputs=[question_input, image_input], outputs=output)

# Visualize created model
model.summary()

NameError: ignored

# Training

In [None]:
# Training hyper-parameters
lr = 1e-3      # learning rate
epochs = 100   # upper bound on the number of epochs

# Metrics reported during training and validation
metrics = ['accuracy']

# Categorical cross-entropy on the one-hot answers, optimized with Adam
loss = tf.keras.losses.CategoricalCrossentropy()
optimizer = tf.keras.optimizers.Adam(learning_rate=lr)

# Compile Model
model.compile(loss=loss, optimizer=optimizer, metrics=metrics)

In [None]:
# Every artifact of this experiment lives under <base_dir>/<test_name>
exp_dir = os.path.join(base_dir, test_name)
if not os.path.exists(exp_dir):
  os.makedirs(exp_dir)

# Checkpoint callback: a single weights-only checkpoint, overwritten only when
# the validation loss improves
ckpt_dir = os.path.join(exp_dir, 'checkpoints')
if not os.path.exists(ckpt_dir):
  os.makedirs(ckpt_dir)
checkpoint = os.path.join(ckpt_dir, 'checkpoint_' + test_name + '.ckpt')
ckpt_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint,
    save_weights_only=True,
    monitor='val_loss',
    mode='min',
    save_best_only=True,
)

# TensorBoard callback: logs with histograms at every epoch, profiling disabled
tb_dir = os.path.join(exp_dir, 'tb_logs')
if not os.path.exists(tb_dir):
  os.makedirs(tb_dir)
tb_callback = tf.keras.callbacks.TensorBoard(
    log_dir=tb_dir,
    profile_batch=0,
    histogram_freq=1,
)

# Early stopping: stop after 5 epochs without improvement of the validation loss
es_callback = tf.keras.callbacks.EarlyStopping(monitor='val_loss', mode='min', patience=5)

# Callbacks handed to model.fit, in the order they were created
callbacks = [ckpt_callback, tb_callback, es_callback]

In [None]:
#%load_ext tensorboard
#%tensorboard --logdir '$base_dir'

In [None]:
# Training driver, controlled by two flags:
#   load  -- restore the weights from an existing checkpoint before going on
#   train -- fit the model, then restore the weights of the saved checkpoint
# With both flags set, training continues from the checkpoint.

train = True
load = False

# Restore previous weights only when asked to and when a checkpoint exists
if os.path.exists(os.path.join(ckpt_dir, 'checkpoint')) and load:
  model.load_weights(checkpoint)


if train:
  model.fit(x=train_dataset,
            epochs=epochs,
            # Keras expects an integer number of steps: use the number of
            # batches needed to cover the training set once (ceiling division)
            # instead of the float produced by len(dataset)/bs.
            steps_per_epoch=(len(dataset) + bs - 1) // bs,
            validation_data=valid_dataset,
            #initial_epoch=7,
            # The validation pipeline uses batches of one sample
            validation_steps=len(dataset_valid),
            callbacks=callbacks)

  # Reload the weights to load the best checkpoint
  model.load_weights(checkpoint)

  # Save model as h5
  #model_file = os.path.join(exp_dir, test_name + '.h5')
  #model.save(model_file, include_optimizer=False)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100

KeyboardInterrupt: ignored

# Prediction

In [None]:
import json

# Test-time image generator (no augmentation) and path of the test questions
test_img_data_gen = ImageDataGenerator(rescale=1./255)
test_filepath = os.path.join(dataset_dir, 'test_questions.json')

# Load the test questions: a dict whose values carry 'image_id' and 'question'
with open(test_filepath, 'r') as f:
  test_items = json.load(f)

# Maps each test key to the predicted class index
submission_dict = {}

for key, item in test_items.items():
  # Open the test image, convert it to RGB and resize it; the converted copy
  # outlives the context manager, which releases the file handle
  img_path = os.path.join(dataset_dir, 'Images', item['image_id'] + '.png')
  with Image.open(img_path) as raw_img:
    img = raw_img.convert('RGB').resize((img_w, img_h))
  img_arr = np.array(img)

  # Run the image through the same generator calls used by the dataset class
  img_t = test_img_data_gen.get_random_transform(img_arr.shape)
  img_arr = np.float32(test_img_data_gen.apply_transform(img_arr, img_t))

  # Tokenize and pad the question exactly like the training questions.
  # A dedicated name is used so the Keras `question_input` tensor that defines
  # the model is not overwritten by a NumPy array.
  tokenized_question = tokenizer.texts_to_sequences([item['question']])
  question_seq = pad_sequences(tokenized_question, maxlen=max_question_length, padding='post')

  # Class probabilities (softmax output) for the single sample in the batch
  class_probs = model.predict([question_seq, np.expand_dims(img_arr, axis=0)])[0]

  # Store a plain Python int; it prints the same as the NumPy scalar
  submission_dict[key] = int(np.argmax(class_probs, axis=-1))

In [None]:
import os

csv_fname = 'submission.csv'

# Dump the predictions as a two-column CSV: a header line, then one
# "<question id>,<class index>" line per entry of the submission dict
with open(os.path.join('/content/drive/MyDrive/AN2DL', csv_fname), 'w') as f:
    f.write('Id,Category\n')
    f.writelines(key + ',' + str(value) + '\n'
                 for key, value in submission_dict.items())