In [1]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
%tensorflow_version 2.1.2

`%tensorflow_version` only switches the major version: 1.x or 2.x.
You set: `2.1.2`. This will be interpreted as: `2.x`.


TensorFlow 2.x selected.


In [2]:
import os
import tensorflow as tf
from tensorflow.keras.callbacks import TensorBoard
from tensorflow.keras.layers import Dense,Input,LSTM,Bidirectional,Activation,Conv1D,GRU, add, Conv2D, Reshape
from tensorflow.keras.callbacks import Callback
from tensorflow.keras.layers import Dropout,Embedding,GlobalMaxPooling1D, MaxPooling1D, Add, Flatten
from tensorflow.keras.layers import GlobalAveragePooling1D, GlobalMaxPooling1D, concatenate, SpatialDropout1D, multiply
from tensorflow.keras import initializers, regularizers, constraints, optimizers, layers, callbacks
from tensorflow.keras.callbacks import LearningRateScheduler
from tensorflow.keras.callbacks import EarlyStopping,ModelCheckpoint
from tensorflow.keras.models import Model
from tensorflow.keras.metrics import CategoricalAccuracy
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.applications import VGG16, VGG19
from tensorflow.keras.applications.vgg19 import preprocess_input
from tensorflow.keras.utils import Sequence
from tensorflow.keras import utils
from tensorflow.keras.utils import plot_model
from tensorflow.keras.preprocessing import image, text, sequence
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.layers import Layer
from tensorflow.keras import backend as K
K.set_image_data_format('channels_first')


import numpy as np

SEED = 1234
np.random.seed(SEED)
tf.random.set_seed(SEED)  



In [3]:
from os import listdir, makedirs
from os.path import isfile, join, basename, splitext
from random import seed, shuffle
import glob
# set the matplotlib backend so figures can be saved in the background
import matplotlib
import matplotlib.pyplot as plt
matplotlib.use("Agg")
# import the necessary packages

from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.utils.class_weight import compute_class_weight
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
from sklearn import preprocessing
from sklearn.metrics import classification_report

from imutils import paths
import math
import numpy as np
import pickle
import operator
from operator import itemgetter
from itertools import zip_longest
from collections import defaultdict
import json
import joblib
from tqdm import tqdm
import pandas as pd
from nltk.tokenize.treebank import TreebankWordTokenizer
import pandas as pd
import seaborn as sns
import datetime

In [139]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [5]:
%%capture
!unzip /content/drive/My\ Drive/anndl-2020-vqa.zip

In [4]:
from os import listdir, makedirs
from os.path import isfile, join, basename, splitext
from random import seed, shuffle
from PIL import Image
import json
import cv2
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Locations of the unzipped dataset on the Colab VM.
imgs_path = '/content/VQA_Dataset/Images'
train_json_path = '/content/VQA_Dataset/train_questions_annotations.json'
test_json_path = '/content/VQA_Dataset/test_questions.json'

# Fraction of the questions kept for training, and the default batch size.
DATASET_SPLIT = 0.8
BATCH_SIZE = 128

# Answer string -> integer class id; the id is simply the position of the
# answer in the tuple below.
classes = {
    answer: class_id
    for class_id, answer in enumerate((
        '0', '1', '2', '3', '4', '5',
        'apple', 'baseball', 'bench', 'bike', 'bird', 'black', 'blanket',
        'blue', 'bone', 'book', 'boy', 'brown', 'cat', 'chair', 'couch',
        'dog', 'floor', 'food', 'football', 'girl', 'grass', 'gray', 'green',
        'left', 'log', 'man', 'monkey bars', 'no', 'nothing', 'orange', 'pie',
        'plant', 'playing', 'red', 'right', 'rug', 'sandbox', 'sitting',
        'sleeping', 'soccer', 'squirrel', 'standing', 'stool', 'sunny',
        'table', 'tree', 'watermelon', 'white', 'wine', 'woman', 'yellow',
        'yes',
    ))
}

# Number of distinct answers the classifier has to choose from.
N_CLASSES = len(classes)

In [5]:
def image_feature_extractor(target_path, image_list, BATCH_SIZE):
    """
    Run every image through VGG19 (ImageNet weights, no classifier head) and
    save one feature array per image as a ``.npy`` file.

    Images are read from the module-level ``imgs_path`` directory, resized to
    224x224 and preprocessed with the VGG19 ``preprocess_input`` helper.
    The network is built for channels-first input ``(3, 224, 224)``; its
    output is flattened over the spatial grid and transposed so that each
    saved array has one row per spatial location.

    Input:
        target_path: directory the ``.npy`` files are written to
                     (created if it does not exist)
        image_list: image file names, relative to ``imgs_path``
        BATCH_SIZE: number of images pushed through the network at once

    Returns:
        None
    """
    # np.save does not create missing directories, so make sure it is there.
    os.makedirs(target_path, exist_ok=True)

    model = VGG19(weights="imagenet", include_top=False, input_tensor=Input(shape=(3, 224, 224)))

    # progress bar with one tick per batch
    n_batches = int(np.ceil(len(image_list) / float(BATCH_SIZE)))
    progbar = utils.Progbar(n_batches)

    # loop over the images in batches
    for batch_no, start in enumerate(range(0, len(image_list), BATCH_SIZE)):
        batch_paths = image_list[start:start + BATCH_SIZE]

        batch_images = []
        batch_ids = []
        for image_path in batch_paths:
            # load the image resized to 224x224 and convert it to an array
            img = image.load_img(os.path.join(imgs_path, image_path), target_size=(224, 224))
            img = image.img_to_array(img)

            # add a leading batch axis, then apply the VGG19 preprocessing
            img = np.expand_dims(img, axis=0)
            img = preprocess_input(img)

            batch_images.append(img)
            # The file id is the last six characters of the name before the
            # first "."; map_func rebuilds the same id when loading.
            # NOTE(review): assumes those six characters are unique per image.
            batch_ids.append(image_path.split('.')[0][-6:])

        batch_images = np.vstack(batch_images)              # (batch, 3, 224, 224)

        # use the network outputs as the image features
        features = model.predict(batch_images)              # (batch, 512, 7, 7)
        features = features.reshape(features.shape[0], features.shape[1], -1)  # (batch, 512, 49)
        features = features.transpose(0, 2, 1)              # (batch, 49, 512)

        # save each image's features under its id
        for image_id, feat in zip(batch_ids, features):
            np.save(os.path.join(target_path, image_id), np.ascontiguousarray(feat))

        # tick only once the batch has actually been written
        progbar.update(batch_no + 1)

In [6]:
image_list = os.listdir("/content/VQA_Dataset/Images")
BATCH_SIZE = 300
target_path = '/content/drive/My Drive/VQA/features'

In [169]:
image_feature_extractor(target_path, image_list, BATCH_SIZE)



In [7]:
from tensorflow.keras.applications.vgg19 import preprocess_input
from tensorflow.keras.preprocessing.text import Tokenizer


def process_sentence(sentence):
    """
    Cleans a given raw sentence.

    Steps, in order: newlines and tabs become spaces and the text is stripped;
    each punctuation mark from ``punct`` is removed (or replaced by a space,
    see below); every "." that is not directly followed by a digit is removed;
    the text is lower-cased and split on whitespace; words found in the
    contraction table are replaced; the words are re-joined with single spaces.

    Input:
        sentence: a raw sentence
    Returns:
        Returns the cleaned version of the sentence
    """
    # Local import: this function is defined before the notebook's `import re`.
    import re

    # Matches a "." that is not immediately followed by a digit, so "3.5"
    # keeps its period while a sentence-final "." is dropped. (The previous
    # pattern carried an extra `(?!<=\d)` lookahead that could never fail at
    # a "." and therefore had no effect.)
    periodStrip  = re.compile(r"\.(?!\d)")
    # matches a "," between digits, eg: 5,6
    commaStrip   = re.compile(r"(\d)(,)(\d)")
    # list of punctuations to remove
    punct        = [';', r"/", '[', ']', '"', '{', '}',
                    '(', ')', '=', '+', '\\', '_', '-',
                    '*', ':', '^', '%', '$', '#', '&',
                    '>', '<', '@', '`', ',', '?', '!']
    # contraction mappings
    # NOTE(review): lookup happens after lower-casing, so keys containing
    # capitals ("Im", "Ive", "Id've", "I'dve") can never match; the entry
    # "somebody'd": "somebodyd" also looks reversed. Left untouched so the
    # vocabulary produced by this function does not change.
    contractions = {"aint": "ain't", "arent": "aren't", "cant": "can't", "couldve": "could've", "couldnt": "couldn't", \
                    "couldn'tve": "couldn't've", "couldnt've": "couldn't've", "didnt": "didn't", "doesnt": "doesn't", "dont": "don't", "hadnt": "hadn't", \
                    "hadnt've": "hadn't've", "hadn'tve": "hadn't've", "hasnt": "hasn't", "havent": "haven't", "hed": "he'd", "hed've": "he'd've", \
                    "he'dve": "he'd've", "hes": "he's", "howd": "how'd", "howll": "how'll", "hows": "how's", "Id've": "I'd've", "I'dve": "I'd've", \
                    "Im": "I'm", "Ive": "I've", "isnt": "isn't", "itd": "it'd", "itd've": "it'd've", "it'dve": "it'd've", "itll": "it'll", "let's": "let's", \
                    "maam": "ma'am", "mightnt": "mightn't", "mightnt've": "mightn't've", "mightn'tve": "mightn't've", "mightve": "might've", \
                    "mustnt": "mustn't", "mustve": "must've", "neednt": "needn't", "notve": "not've", "oclock": "o'clock", "oughtnt": "oughtn't", \
                    "ow's'at": "'ow's'at", "'ows'at": "'ow's'at", "'ow'sat": "'ow's'at", "shant": "shan't", "shed've": "she'd've", "she'dve": "she'd've", \
                    "she's": "she's", "shouldve": "should've", "shouldnt": "shouldn't", "shouldnt've": "shouldn't've", "shouldn'tve": "shouldn't've", \
                    "somebody'd": "somebodyd", "somebodyd've": "somebody'd've", "somebody'dve": "somebody'd've", "somebodyll": "somebody'll", \
                    "somebodys": "somebody's", "someoned": "someone'd", "someoned've": "someone'd've", "someone'dve": "someone'd've", \
                    "someonell": "someone'll", "someones": "someone's", "somethingd": "something'd", "somethingd've": "something'd've", \
                    "something'dve": "something'd've", "somethingll": "something'll", "thats": "that's", "thered": "there'd", "thered've": "there'd've", \
                    "there'dve": "there'd've", "therere": "there're", "theres": "there's", "theyd": "they'd", "theyd've": "they'd've", \
                    "they'dve": "they'd've", "theyll": "they'll", "theyre": "they're", "theyve": "they've", "twas": "'twas", "wasnt": "wasn't", \
                    "wed've": "we'd've", "we'dve": "we'd've", "weve": "we've", "werent": "weren't", "whatll": "what'll", "whatre": "what're", \
                    "whats": "what's", "whatve": "what've", "whens": "when's", "whered": "where'd", "where's": "where is", "whereve": "where've", \
                    "whod": "who'd", "whod've": "who'd've", "who'dve": "who'd've", "wholl": "who'll", "whos": "who's", "whove": "who've", "whyll": "why'll", \
                    "whyre": "why're", "whys": "why's", "wont": "won't", "wouldve": "would've", "wouldnt": "wouldn't", "wouldnt've": "wouldn't've", \
                    "wouldn'tve": "wouldn't've", "yall": "y'all", "yall'll": "y'all'll", "y'allll": "y'all'll", "yall'd've": "y'all'd've", \
                    "y'alld've": "y'all'd've", "y'all'dve": "y'all'd've", "youd": "you'd", "youd've": "you'd've", "you'dve": "you'd've", \
                    "youll": "you'll", "youre": "you're", "youve": "you've"}

    # replace new lines and tabs with a white space, then trim both ends
    inText = sentence.replace('\n', ' ')
    inText = inText.replace('\t', ' ')
    inText = inText.strip()
    outText = inText

    # The digit-comma test does not depend on the punctuation mark, so it is
    # evaluated once instead of once per mark.
    has_digit_comma = commaStrip.search(inText) is not None
    for p in punct:
        # A mark that touches a space (or any mark, when the sentence holds a
        # digit-comma-digit group) is deleted outright; otherwise it becomes a
        # space so the words on either side stay separated.
        if has_digit_comma or (p + ' ' in inText) or (' ' + p in inText):
            outText = outText.replace(p, '')
        else:
            outText = outText.replace(p, ' ')

    # The third positional argument of Pattern.sub is `count`, not flags:
    # passing re.UNICODE there capped the number of removed periods at 32.
    outText = periodStrip.sub("", outText)

    words = outText.lower().split()
    words = [contractions.get(word, word) for word in words]
    return ' '.join(words)





In [8]:
# Read the train JSON file. The `with` block closes the handle on exit, so no
# explicit close() is needed afterwards.
with open(train_json_path, 'r') as f:
    train_data = json.load(f)

# Read the test JSON file.
with open(test_json_path, 'r') as f:
    test_data = json.load(f)


# Sequential (unshuffled) split of the question ids: the first DATASET_SPLIT
# fraction goes to train_indx, the remainder to valid_indx.
TOT_QUESTIONS = len(train_data)
indx = list(train_data.keys())
_split_at = int(TOT_QUESTIONS*DATASET_SPLIT)
train_indx = indx[:_split_at]
valid_indx = indx[_split_at:]





In [9]:
from tensorflow.keras.preprocessing.text import Tokenizer
import pandas as pd
import re

# Gather every question text: all training questions first, then all test
# questions. The dicts are iterated directly (same order as their keys) so this
# cell does not rely on the module-level `indx` name still holding the key list.
questions = [train_data[key]["question"] for key in train_data]
test_indx = list(test_data.keys())
questions.extend(test_data[key]["question"] for key in test_indx)

questions_processed = pd.Series(questions).apply(process_sentence)

MAX_SEQ = 95
# filters='' disables the Tokenizer's own character filtering; the text has
# already been cleaned by process_sentence.
tokenizer = Tokenizer(filters='')
tokenizer.fit_on_texts(questions_processed)
# +1 because index 0 is not assigned to any word by the tokenizer
vocab_size = len(tokenizer.word_index) + 1
print(f'Vocab Size: {vocab_size}')


Vocab Size: 4641


In [167]:
# save to disk
import joblib
with open('/content/drive/My Drive/VQA/text_tokenizer.pkl', 'wb') as f:
   joblib.dump(tokenizer, f)

In [10]:
# Raw training questions, in the iteration order of train_data.
# A comprehension is used so that no loop variable is left behind in the
# notebook namespace: a module-level loop variable called `indx` would replace
# the list of question ids stored under that name.
train_questions = [entry["question"] for entry in train_data.values()]


questions_train_processed  = pd.Series(train_questions).apply(process_sentence)


In [11]:
import matplotlib
import seaborn as sns
import matplotlib.pyplot as plt
question_data_train = tokenizer.texts_to_sequences(questions_train_processed)
question_len = [len(text) for text in question_data_train]
plt.figure(figsize=(7,5))
sns.distplot(question_len, color='red')
plt.title('Distribution of Question length')
plt.xlabel('Length of Question')
plt.ylabel('Question count')
plt.xlim(0, 30)
plt.show()

<Figure size 504x360 with 0 Axes>



<matplotlib.axes._subplots.AxesSubplot at 0x7f18eacf3a20>

Text(0.5, 1.0, 'Distribution of Question length')

Text(0.5, 0, 'Length of Question')

Text(0, 0.5, 'Question count')

(0.0, 30.0)

In [12]:

for i in range(0,11):
    print(10*i,'percentile value is', np.percentile(question_len,10*i))

0 percentile value is 2.0
10 percentile value is 4.0
20 percentile value is 5.0
30 percentile value is 5.0
40 percentile value is 5.0
50 percentile value is 6.0
60 percentile value is 6.0
70 percentile value is 7.0
80 percentile value is 8.0
90 percentile value is 9.0
100 percentile value is 21.0


In [13]:

for i in range(0,11):
    print(90+i,'percentile value is',np.percentile(question_len,90+i))

90 percentile value is 9.0
91 percentile value is 9.0
92 percentile value is 9.0
93 percentile value is 9.0
94 percentile value is 10.0
95 percentile value is 10.0
96 percentile value is 10.0
97 percentile value is 11.0
98 percentile value is 11.0
99 percentile value is 13.0
100 percentile value is 21.0


In [14]:
MAX_LEN = 22

question_data_train=sequence.pad_sequences(question_data_train, maxlen=MAX_LEN, padding='post')


In [15]:
# Training answers, in the iteration order of train_data. Built with a
# comprehension so no module-level loop variable (previously `indx`) is
# created or overwritten.
answer_train = [entry["answer"] for entry in train_data.values()]

# Fit the string -> integer class mapping on the training answers.
labelencoder = preprocessing.LabelEncoder()
labelencoder.fit(answer_train)

print(len(labelencoder.classes_))

LabelEncoder()

58


In [184]:
with open('/content/drive/My Drive/VQA/labelencoder.pkl', 'wb') as f:
  joblib.dump(labelencoder, f)

In [21]:
def get_answers_matrix(answers, encoder):
    '''
    Turn answer strings into a one-hot target matrix.

    Input:
        answers:    list of answer strings
        encoder:    a fitted scikit-learn LabelEncoder object

    Output:
        A numpy array of shape (# of answers, # of classes)
    '''
    # answer strings -> integer class ids
    class_ids = encoder.transform(answers)
    # width of the one-hot rows = number of classes known to the encoder
    n_classes = encoder.classes_.shape[0]
    return utils.to_categorical(class_ids, n_classes)

In [16]:
questions_train = list(map(itemgetter('question'), train_data.values()))
answer_train = list(map(itemgetter('answer'), train_data.values()))
images_train =  list(map(itemgetter('image_id'), train_data.values()))

In [17]:
sss = StratifiedShuffleSplit(n_splits=1, test_size= 0.20,random_state=42)

for train_index, val_index in sss.split(images_train, answer_train):
  TRAIN_INDEX = train_index
  VAL_INDEX = val_index

In [18]:
# image data
image_list_tr, image_list_vl = np.array(images_train)[TRAIN_INDEX.astype(int)], np.array(images_train)[VAL_INDEX.astype(int)]

In [22]:
# answer data
answer_matrix = get_answers_matrix(answer_train, labelencoder)
answer_tr, answer_vl = answer_matrix[TRAIN_INDEX], answer_matrix[VAL_INDEX]

In [23]:
# question data
question_tr, question_vl = question_data_train[TRAIN_INDEX], question_data_train[VAL_INDEX]

In [24]:
BATCH_SIZE = 300
BUFFER_SIZE = 5000
def map_func(img_name, ques, ans, features_dir='features/'):
    """
    Load the pre-extracted feature array that belongs to one image.

    Input:
        img_name:     image file name as bytes (decoded as UTF-8 here)
        ques:         question data, passed through unchanged
        ans:          answer data, passed through unchanged
        features_dir: directory holding the ``<id>.npy`` files; defaults to the
                      relative ``features/`` folder used so far. Point it at
                      the folder the feature extractor wrote to when that is
                      not the current working directory.

    Returns:
        (img_tensor, ques, ans) where img_tensor is the loaded array
    """
    # Same id rule as the feature extractor: the last six characters of the
    # file name before its first ".".
    image_id = img_name.decode('utf-8').split('.')[0][-6:]
    img_tensor = np.load(os.path.join(features_dir, image_id + '.npy'))
    return img_tensor, ques, ans

In [25]:

def _load_train_example(img_name, ques, ans):
    """Call map_func through tf.numpy_function so the .npy file is read
    with NumPy while the pipeline itself stays a tf.data graph."""
    return tf.numpy_function(
        map_func, [img_name, ques, ans], [tf.float32, tf.int32, tf.float32])


# Training pipeline: slice the (image name, question, answer) arrays, load the
# image features in parallel, then shuffle, batch and prefetch.
dataset_tr = (
    tf.data.Dataset.from_tensor_slices((image_list_tr, question_tr, answer_tr))
    .map(_load_train_example, num_parallel_calls=tf.data.experimental.AUTOTUNE)
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE)
    .prefetch(buffer_size=tf.data.experimental.AUTOTUNE)
)

In [26]:
def _load_val_example(img_name, ques, ans):
    """Call map_func through tf.numpy_function so the .npy file is read
    with NumPy while the pipeline itself stays a tf.data graph."""
    return tf.numpy_function(
        map_func, [img_name, ques, ans], [tf.float32, tf.int32, tf.float32])


# Validation pipeline: same stages as the training one (parallel feature
# loading, shuffle, batch, prefetch), fed with the validation arrays.
dataset_vl = (
    tf.data.Dataset.from_tensor_slices((image_list_vl, question_vl, answer_vl))
    .map(_load_val_example, num_parallel_calls=tf.data.experimental.AUTOTUNE)
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE)
    .prefetch(buffer_size=tf.data.experimental.AUTOTUNE)
)

In [27]:
class AttentionMaps(tf.keras.layers.Layer):
  """
  Builds the image and question attention maps from an affinity matrix.

  With image features V of shape (N, d) and question features Q of shape
  (T, d) per example, the layer computes

    C   = tanh(Q Vᵀ)                      affinity matrix, shape (T, N)
    H_v = tanh(Wv V + (Wq Q) combined through C)    shape (N, dim_k)
    H_q = tanh(Wq Q + (Wv V) combined through Cᵀ)   shape (T, dim_k)

  where Wv and Wq are the two Dense projections owned by this layer. No
  separate affinity weight matrix is applied inside this layer.

  Arguments:
    dim_k     : hidden attention dimension (units of both projections)
    reg_value : L2 regularization factor for both projection kernels

  Inputs:
    image_feat, V : shape (batch, N, d)
    ques_feat,  Q : shape (batch, T, d)

  Outputs:
    [H_v, H_q] with shapes (batch, N, dim_k) and (batch, T, dim_k)
  """
  def __init__(self, dim_k, reg_value, **kwargs):
    super(AttentionMaps, self).__init__(**kwargs)

    self.dim_k = dim_k
    self.reg_value = reg_value

    # Projection applied to the image features.
    self.Wv = Dense(
        self.dim_k,
        activation=None,
        kernel_regularizer=tf.keras.regularizers.l2(self.reg_value),
        kernel_initializer=tf.keras.initializers.glorot_uniform(seed=2))
    # Projection applied to the question features.
    self.Wq = Dense(
        self.dim_k,
        activation=None,
        kernel_regularizer=tf.keras.regularizers.l2(self.reg_value),
        kernel_initializer=tf.keras.initializers.glorot_uniform(seed=3))

  def call(self, image_feat, ques_feat):
    """
    Compute the two attention maps for a batch.
    """
    # permutation that swaps the last two axes of a rank-3 tensor
    swap = [0, 2, 1]

    # affinity between every question position and every image region
    affinity = tf.matmul(ques_feat, tf.transpose(image_feat, perm=swap))   # [b, T, N]
    affinity = tf.keras.activations.tanh(affinity)

    img_proj = self.Wv(image_feat)                                         # [b, N, dim_k]
    ques_proj = self.Wq(ques_feat)                                         # [b, T, dim_k]

    # question projection carried over to the image regions
    ques_on_regions = tf.matmul(tf.transpose(ques_proj, perm=swap), affinity)   # [b, k, N]
    ques_on_regions = tf.transpose(ques_on_regions, perm=swap)                  # [b, N, k]

    # image projection carried over to the question positions
    img_on_words = tf.matmul(tf.transpose(img_proj, perm=swap),
                             tf.transpose(affinity, perm=swap))                 # [b, k, T]
    img_on_words = tf.transpose(img_on_words, perm=swap)                        # [b, T, k]

    # image attention map
    H_v = tf.keras.activations.tanh(img_proj + ques_on_regions)            # [b, N, k]
    # question attention map
    H_q = tf.keras.activations.tanh(ques_proj + img_on_words)              # [b, T, k]

    return [H_v, H_q]

  def get_config(self):
    """
    Return the base layer config extended with this layer's constructor
    arguments.
    """
    config = super(AttentionMaps, self).get_config()
    config.update({
        'dim_k': self.dim_k,
        'reg_value': self.reg_value
    })
    return config

In [28]:
class ContextVector(tf.keras.layers.Layer):
  """
  Turns the attention maps into attention weights and uses them to pool the
  image and question features into one vector each.

  Arguments:
    reg_value : L2 regularization factor for the two scoring kernels

  Inputs:
    image_feat V: image features,          (batch, N, d)
    ques_feat  Q: question features,       (batch, T, d)
    H_v: image attention map,              (batch, N, k)
    H_q: question attention map,           (batch, T, k)

  Outputs:
    [v, q]: attention-weighted sums of the image and question features,
    each of shape (batch, d)
  """
  def __init__(self, reg_value, **kwargs):
    super(ContextVector, self).__init__(**kwargs)

    self.reg_value = reg_value

    # The Dense layers only produce one raw score per region / word. The
    # softmax is applied in call() across the region / word axis: a 'softmax'
    # activation on a Dense(1) would normalise over the size-1 last axis and
    # return 1.0 for every position.
    self.w_hv = Dense(1, activation=None,\
                        kernel_regularizer=tf.keras.regularizers.l2(self.reg_value),\
                            kernel_initializer=tf.keras.initializers.glorot_uniform(seed=4))
    self.w_hq = Dense(1, activation=None,\
                        kernel_regularizer=tf.keras.regularizers.l2(self.reg_value),\
                            kernel_initializer=tf.keras.initializers.glorot_uniform(seed=5)) 
    

  def call(self, image_feat, ques_feat, H_v, H_q):
    """
    Compute the attention weights and the two context vectors.
    """  
    # attention probability of each image region: softmax over the N regions
    a_v = tf.nn.softmax(self.w_hv(H_v), axis=1)        # [b, N, 1]

    # attention probability of each word: softmax over the T positions
    # NOTE(review): padded question positions are not masked out here.
    a_q = tf.nn.softmax(self.w_hq(H_q), axis=1)        # [b, T, 1]

    # context vector for image: attention-weighted sum over regions
    v = a_v * image_feat                               # [b, N, d]
    v = tf.reduce_sum(v, 1)                            # [b, d]

    # context vector for question: attention-weighted sum over positions
    q = a_q * ques_feat                                # [b, T, d]
    q = tf.reduce_sum(q, 1)                            # [b, d]


    return [v, q]

  def get_config(self):
    """
    Return the base layer config extended with this layer's constructor
    argument.
    """
    config = {
        'reg_value': self.reg_value
    }
    base_config = super(ContextVector, self).get_config()
    return dict(list(base_config.items()) + list(config.items()))

In [29]:
class PhraseLevelFeatures(tf.keras.layers.Layer):
  """
  Phrase-level question features.

  Three 1-D convolutions with window sizes 1, 2 and 3 (unigram, bigram,
  trigram) are applied to the word-level features. The bigram and trigram
  convolutions use 'same' padding so every convolution returns one vector
  per word position. The three results are then max-pooled element-wise
  across the n-gram sizes at each position.

  Arguments:
    dim_d: number of filters of each convolution (output feature size)

  Inputs:
    word_feat Q : word level features of shape (batch, T, features)

  Outputs:
    Phrase level features of shape (batch, T, dim_d)
  """
  def __init__(self, dim_d, **kwargs):
    super(PhraseLevelFeatures, self).__init__(**kwargs)

    self.dim_d = dim_d

    # one convolution per n-gram size, each with its own fixed initializer seed
    self.conv_unigram = Conv1D(
        self.dim_d, kernel_size=1, strides=1,
        kernel_initializer=tf.keras.initializers.glorot_uniform(seed=6))
    self.conv_bigram = Conv1D(
        self.dim_d, kernel_size=2, strides=1, padding='same',
        kernel_initializer=tf.keras.initializers.glorot_uniform(seed=7))
    self.conv_trigram = Conv1D(
        self.dim_d, kernel_size=3, strides=1, padding='same',
        kernel_initializer=tf.keras.initializers.glorot_uniform(seed=8))

  def call(self, word_feat):
    """
    Compute the n-gram features (n=1,2,3) and max-pool across them.
    """
    # apply the convolutions in unigram, bigram, trigram order
    ngram_convs = (self.conv_unigram, self.conv_bigram, self.conv_trigram)
    ngram_feats = [conv(word_feat) for conv in ngram_convs]   # 3 x [b, T, dim_d]

    # put the three results side by side on a new trailing axis ...
    stacked = tf.stack(ngram_feats, axis=-1)                   # [b, T, dim_d, 3]

    # ... and keep, per position and feature, the largest of the three
    return tf.reduce_max(stacked, -1)                          # [b, T, dim_d]

  def get_config(self):
    """
    Return the base layer config extended with this layer's constructor
    argument.
    """
    config = super(PhraseLevelFeatures, self).get_config()
    config.update({'dim_d': self.dim_d})
    return config

In [30]:
def build_model(max_answers, max_seq_len, vocab_size, dim_d, dim_k, l_rate, d_rate, reg_value):
    """
    Defines the Keras model.

    The question is encoded at three levels (word embedding, phrase-level
    convolutions, sentence-level LSTM). At each level an AttentionMaps +
    ContextVector pair produces an attended image vector and an attended
    question vector; their sum is combined with the previous level's hidden
    vector through a tanh Dense layer. A final softmax Dense layer scores the
    answers.

    Arguments
    ----------
    max_answers : Number of output targets of the model.
    max_seq_len : Length of input sequences
    vocab_size  : Size of the vocabulary, i.e. maximum integer index + 1.
    dim_d       : Hidden dimension
    dim_k       : Hidden attention dimension
    l_rate      : Learning rate for the model (not used inside this function;
                  the model is returned uncompiled)
    d_rate      : Dropout rate
    reg_value   : Regularization value

    Returns
    ----------
    Returns the (uncompiled) Keras model with inputs
    [image features (49, 512), question token ids (max_seq_len,)].
    """
    # inputs
    image_input = Input(shape=(49, 512, ), name='Image_Input')
    # sized from max_seq_len so it always agrees with the Embedding below
    ques_input = Input(shape=(max_seq_len, ), name='Question_Input')

    # image feature projection                                      # [b, 49, dim_d]
    image_feat = Dense(dim_d, activation=None, name='Image_Feat_Dense',\
                            kernel_regularizer=tf.keras.regularizers.l2(reg_value),\
                                kernel_initializer=tf.keras.initializers.glorot_uniform(seed=1))(image_input)
    image_feat = Dropout(d_rate, seed=1)(image_feat)

    # word level
    ques_feat_w = Embedding(input_dim=vocab_size, output_dim=dim_d, input_length=max_seq_len,\
                            mask_zero=True)(ques_input)
    
    Hv_w, Hq_w = AttentionMaps(dim_k, reg_value, name='AttentionMaps_Word')(image_feat, ques_feat_w)
    v_w, q_w = ContextVector(reg_value, name='ContextVector_Word')(image_feat, ques_feat_w, Hv_w, Hq_w)
    feat_w = tf.add(v_w,q_w)
    h_w = Dense(dim_d, activation='tanh', name='h_w_Dense',\
                    kernel_regularizer=tf.keras.regularizers.l2(reg_value),\
                        kernel_initializer=tf.keras.initializers.glorot_uniform(seed=13))(feat_w)

    # phrase level
    ques_feat_p = PhraseLevelFeatures(dim_d, name='PhraseLevelFeatures')(ques_feat_w)

    Hv_p, Hq_p = AttentionMaps(dim_k, reg_value, name='AttentionMaps_Phrase')(image_feat, ques_feat_p)
    v_p, q_p = ContextVector(reg_value, name='ContextVector_Phrase')(image_feat, ques_feat_p, Hv_p, Hq_p)
    feat_p = concatenate([tf.add(v_p,q_p), h_w], -1) 
    h_p = Dense(dim_d, activation='tanh', name='h_p_Dense',\
                    kernel_regularizer=tf.keras.regularizers.l2(reg_value),\
                        kernel_initializer=tf.keras.initializers.glorot_uniform(seed=14))(feat_p)

    # sentence level
    # (the LSTM infers its input shape from ques_feat_p; no input_shape
    # argument is needed for a layer in the middle of a functional model)
    ques_feat_s = LSTM(dim_d, return_sequences=True,\
                        kernel_initializer=tf.keras.initializers.glorot_uniform(seed=16))(ques_feat_p)

    Hv_s, Hq_s = AttentionMaps(dim_k, reg_value, name='AttentionMaps_Sent')(image_feat, ques_feat_s)
    # The sentence-level attention weights must pool the sentence-level
    # features (ques_feat_s), i.e. the same tensor the maps were computed from.
    v_s, q_s = ContextVector(reg_value, name='ContextVector_Sent')(image_feat, ques_feat_s, Hv_s, Hq_s)
    feat_s = concatenate([tf.add(v_s,q_s), h_p], -1) 
    h_s = Dense(2*dim_d, activation='tanh', name='h_s_Dense',\
                    kernel_regularizer=tf.keras.regularizers.l2(reg_value),\
                        kernel_initializer=tf.keras.initializers.glorot_uniform(seed=15))(feat_s)

    z   = Dense(2*dim_d, activation='tanh', name='z_Dense',\
                    kernel_regularizer=tf.keras.regularizers.l2(reg_value),\
                        kernel_initializer=tf.keras.initializers.glorot_uniform(seed=16))(h_s)
    z   = Dropout(d_rate, seed=16)(z)

    # result: one probability per answer class
    result = Dense(max_answers, activation='softmax')(z)

    model = Model(inputs=[image_input, ques_input], outputs=result)

    return model

In [31]:
# params 1

EPOCHS      = 60
max_seq_len = 22
vocab_size  = len(tokenizer.word_index) + 1
dim_d       = 512
dim_k       = 256
l_rate      = 1e-4
d_rate      = 0.5
reg_value   = 0.01

base_path = '/content/drive/My Drive/VQA/'

In [32]:
# create model
model = build_model(N_CLASSES, max_seq_len, vocab_size, dim_d, dim_k, l_rate, d_rate, reg_value)

ValueError: ignored

In [None]:

steps_per_epoch = int(np.ceil(len(image_list_tr)/BATCH_SIZE))
boundaries      = [50*steps_per_epoch]
values          = [l_rate, l_rate/10]

In [None]:
# we reduce the l_rate after 50th epoch (from 1e-4 to 1e-5)
learning_rate_fn = tf.keras.optimizers.schedules.PiecewiseConstantDecay(boundaries, values)
optimizer        = tf.keras.optimizers.Adam(learning_rate=learning_rate_fn)

loss_object      = tf.keras.losses.CategoricalCrossentropy(from_logits=False, reduction='auto')

In [None]:
checkpoint_directory = base_path+"/training_checkpoints/"+str(l_rate)+"_"+str(dim_k)
SAVE_CKPT_FREQ = 5

ckpt = tf.train.Checkpoint(step=tf.Variable(0), optimizer=optimizer, model=model)
manager = tf.train.CheckpointManager(ckpt, checkpoint_directory, max_to_keep=3)

In [None]:
# Running means of the per-batch losses.
train_loss = tf.keras.metrics.Mean('train_loss', dtype=tf.float32)
val_loss = tf.keras.metrics.Mean('val_loss', dtype=tf.float32)

# Score metrics. The previous version referenced `F1Score` and `max_answers`,
# neither of which is defined at notebook level. For single-label targets
# (one-hot answers, arg-max prediction) micro-averaged F1 is the same number
# as accuracy, so the already-imported CategoricalAccuracy is used instead.
# NOTE(review): swap back to an F1 implementation if a per-class/macro score
# is wanted - that needs an extra dependency to be imported explicitly.
train_score = CategoricalAccuracy(name='train_score')
val_score = CategoricalAccuracy(name='val_score')

In [None]:
train_log_dir = base_path+'/logs/'+str(l_rate)+"_"+str(dim_k)+'/train'
val_log_dir   = base_path+'/logs/'+str(l_rate)+"_"+str(dim_k)+'/validation'

train_summary_writer = tf.summary.create_file_writer(train_log_dir)
val_summary_writer = tf.summary.create_file_writer(val_log_dir)

In [None]:

# @tf.function
def train_step(model, img, ques, ans, optimizer):
  """
  Run one optimisation step on a single batch.

  Arguments:
    model     : the Keras model, called as model([img, ques], training=True)
    img       : batch of image features
    ques      : batch of question token ids
    ans       : batch of target answers (compared against the predictions by
                the module-level `loss_object`)
    optimizer : optimizer whose apply_gradients() updates the model

  Side effects:
    updates the module-level `train_loss` and `train_score` metrics.

  Returns:
    list of (gradient, variable) pairs for model.trainable_variables
  """
  with tf.GradientTape() as tape:
    # forward pass
    predictions = model([img, ques], training=True)
    loss = loss_object(ans, predictions)

    # Penalties declared through kernel_regularizer are only collected in
    # model.losses; a hand-written loop has to add them to the objective
    # itself, otherwise the regularizers have no effect.
    total_loss = loss
    if model.losses:
      total_loss = loss + tf.add_n(model.losses)

  # backward pass
  variables = model.trainable_variables
  grads = tape.gradient(total_loss, variables)
  optimizer.apply_gradients(zip(grads, variables))

  # record results (data term only, so it stays comparable with val_loss)
  train_loss(loss)
  train_score(ans, predictions)

  # all gradients, paired with the variables they belong to
  return list(zip(grads, variables))

def test_step(model, img, ques, ans):
  """
  Evaluate the model on a single validation batch.

  Updates the module-level `val_loss` and `val_score` metrics; returns None.
  """
  # training=False is passed explicitly so the evaluation never depends on an
  # implicit default for the training flag.
  predictions = model([img, ques], training=False)
  loss = loss_object(ans, predictions)

  # record results
  val_loss(loss)
  val_score(ans, predictions)

In [None]:
if manager.latest_checkpoint:
    ckpt.restore(manager.latest_checkpoint)
    print("Restored from {}".format(manager.latest_checkpoint))
    START_EPOCH = int(manager.latest_checkpoint.split('-')[-1]) * SAVE_CKPT_FREQ
    print("Resume training from epoch: {}".format(START_EPOCH))
else:
    print("Initializing from scratch")
    START_EPOCH = 0

In [None]:
# `time` is used for the per-epoch timer below but is not imported anywhere
# else in the notebook.
import time

for epoch in range(START_EPOCH, EPOCHS):

  start = time.time()

  # Holds the (gradient, variable) pairs of the most recent batch; stays empty
  # if the training dataset yields nothing.
  grads = []
  for img, ques, ans in (dataset_tr):
    grads = train_step(model, img, ques, ans, optimizer)

  # tensorboard  
  with train_summary_writer.as_default():
    # Create a summary to monitor cost tensor
    tf.summary.scalar('loss', train_loss.result(), step=epoch)
    # Create a summary to monitor accuracy tensor
    tf.summary.scalar('f1_score', train_score.result(), step=epoch)
    # Create summaries to visualize weights
    for var in model.trainable_variables:
        tf.summary.histogram(var.name, var, step=epoch)
    # Summarize the gradients of the last batch of the epoch
    for grad, var in grads:
        # a variable that did not take part in the loss has no gradient
        if grad is not None:
            tf.summary.histogram(var.name + '/gradient', grad, step=epoch)

  for img, ques, ans in (dataset_vl):
    test_step(model, img, ques, ans)
  
  # tensorboard
  with val_summary_writer.as_default():
    # Create a summary to monitor cost tensor
    tf.summary.scalar('loss', val_loss.result(), step=epoch)
    # Create a summary to monitor accuracy tensor
    tf.summary.scalar('f1_score', val_score.result(), step=epoch)
  
  template = 'Epoch {}, loss: {:.4f}, f1_score: {:.4f}, val loss: {:.4f}, val f1_score: {:.4f}, time: {:.0f} sec'
  print (template.format(epoch + 1,
                         train_loss.result(), 
                         train_score.result(),
                         val_loss.result(), 
                         val_score.result(),
                         (time.time() - start)))

  # Reset metrics every epoch
  train_loss.reset_states()
  train_score.reset_states()
  val_loss.reset_states()
  val_score.reset_states()

  # ckpt.step counts finished epochs; save a checkpoint every SAVE_CKPT_FREQ of them
  ckpt.step.assign_add(1)
  if int(ckpt.step) % SAVE_CKPT_FREQ == 0:
      manager.save()
      print('Saved checkpoint.')

In [28]:
  # NOTE(review): neither `vqa_model` nor `test_generator` is defined in any
  # visible cell of this notebook (the model built above is named `model`, and
  # no test-set generator is created). This cell presumably relies on state
  # from another session - confirm, and build the test inputs explicitly.
  pred = vqa_model.predict_generator(test_generator)



In [29]:
import os
from datetime import datetime
from google.colab import files

def create_csv(results, results_dir='./'):
    """
    Write a results mapping to a timestamped CSV file.

    The file is named ``results_<Mon><day>_<HH>-<MM>-<SS>.csv`` (current local
    time), starts with the header ``Id,Category`` and then holds one
    ``key,value`` line per entry of ``results``, in iteration order.

    Arguments:
        results     : mapping of id -> predicted category
        results_dir : directory the file is created in

    Returns:
        the file name (without the directory part)
    """
    timestamp = datetime.now().strftime('%b%d_%H-%M-%S')
    csv_fname = 'results_' + timestamp + '.csv'

    # header first, then one line per prediction
    lines = ['Id,Category\n']
    lines.extend(f'{key!s},{value!s}\n' for key, value in results.items())

    with open(os.path.join(results_dir, csv_fname), 'w') as out_file:
        out_file.writelines(lines)

    return csv_fname

results = {}

for i in range(len(pred)):
    results[test_generator.list_IDs[i]] = np.argmax(pred[i])

name = create_csv(results)

files.download(name)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>