In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Imports

In [2]:
import copy
import csv
import math
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import re
import sys
import torch
import torch.nn as nn
import torch.nn.functional as F
import json

from collections import Counter
from os import listdir
from os.path import isfile, join
from PIL import Image
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from torchvision.models import resnet152

# Images

In [3]:
class Dataset(Dataset):
    def __init__(self, img_dir, transform=None):
        super(Dataset, self).__init__()
        self.img_dir = img_dir
        self.transform = transform
        self.image_names = [image for image in listdir(self.img_dir) if isfile(join(self.img_dir, image))]

    def __len__(self):
        return len(self.image_names)
    
    def __getitem__(self, idx):
        img_path = self.image_names[idx]
        img = Image.open(join(self.img_dir, img_path)).convert('RGB')
        if self.transform is not None:
            img = self.transform(img)
        return (img_path, img)


class ResNet(torch.nn.Module):
    """ ResNet is used to obtain image features. """  
    def __init__(self):
        super(ResNet, self).__init__()
        self.model = resnet152()
        self.model.avgpool = torch.nn.AdaptiveAvgPool2d(output_size=1)

    def forward(self, x):
        return self.model(x)

In [4]:
src_dir_train = '/content/drive/MyDrive/EasyVisualQuestionAnswering/scene_img_abstract_v002_train2015'
dst_dir_train = '/content/drive/MyDrive/EasyVisualQuestionAnswering/scene_img_feat_abstract_v002_train2015'

resize_dim = 448
batch_size = 6
num_workers = 2

transform = transforms.Compose([transforms.Resize((resize_dim, resize_dim)), transforms.ToTensor()])

dataset_train = Dataset(src_dir_train, transform=transform)

loader_train = DataLoader(dataset_train, batch_size=batch_size, num_workers=num_workers, shuffle=False)

if not os.path.exists(dst_dir_train):
    os.makedirs(dst_dir_train)
  
model = ResNet()

for i, (img_paths, images) in enumerate(loader_train):
    output = model(images)
    if i == 0:
        print(output.shape)

    for j in range(len(img_paths)):
        feat_name = img_paths[j].replace('.png', '.npy')
        feat_name = join(dst_dir_train, feat_name)
        print(feat_name)
        np.save(feat_name, output[j].data.numpy())

torch.Size([6, 1000])
/content/drive/MyDrive/EasyVisualQuestionAnswering/scene_img_feat_abstract_v002_train2015/abstract_v002_train2015_000000019980.npy
/content/drive/MyDrive/EasyVisualQuestionAnswering/scene_img_feat_abstract_v002_train2015/abstract_v002_train2015_000000019981.npy
/content/drive/MyDrive/EasyVisualQuestionAnswering/scene_img_feat_abstract_v002_train2015/abstract_v002_train2015_000000019982.npy
/content/drive/MyDrive/EasyVisualQuestionAnswering/scene_img_feat_abstract_v002_train2015/abstract_v002_train2015_000000019983.npy
/content/drive/MyDrive/EasyVisualQuestionAnswering/scene_img_feat_abstract_v002_train2015/abstract_v002_train2015_000000019984.npy
/content/drive/MyDrive/EasyVisualQuestionAnswering/scene_img_feat_abstract_v002_train2015/abstract_v002_train2015_000000019985.npy
/content/drive/MyDrive/EasyVisualQuestionAnswering/scene_img_feat_abstract_v002_train2015/abstract_v002_train2015_000000019986.npy
/content/drive/MyDrive/EasyVisualQuestionAnswering/scene_img_

In [5]:
src_dir_val = '/content/drive/MyDrive/EasyVisualQuestionAnswering/scene_img_abstract_v002_val2015'
dst_dir_val = '/content/drive/MyDrive/EasyVisualQuestionAnswering/scene_img_feat_abstract_v002_val2015'

dataset_val = Dataset(src_dir_val, transform=transform)

loader_val = DataLoader(dataset_val, batch_size=batch_size, num_workers=num_workers, shuffle=False)

if not os.path.exists(dst_dir_val):
    os.makedirs(dst_dir_val)

for i, (img_paths, images) in enumerate(loader_val):
    output = model(images)
    if i == 0:
        print(output.shape)

    for j in range(len(img_paths)):
        feat_name = img_paths[j].replace('.png', '.npy')
        feat_name = join(dst_dir_val, feat_name)
        print(feat_name)
        np.save(feat_name, output[j].data.numpy())

torch.Size([5, 1000])
/content/drive/MyDrive/EasyVisualQuestionAnswering/scene_img_feat_abstract_v002_val2015/abstract_v002_val2015_000000029995.npy
/content/drive/MyDrive/EasyVisualQuestionAnswering/scene_img_feat_abstract_v002_val2015/abstract_v002_val2015_000000029996.npy
/content/drive/MyDrive/EasyVisualQuestionAnswering/scene_img_feat_abstract_v002_val2015/abstract_v002_val2015_000000029998.npy
/content/drive/MyDrive/EasyVisualQuestionAnswering/scene_img_feat_abstract_v002_val2015/abstract_v002_val2015_000000029999.npy
/content/drive/MyDrive/EasyVisualQuestionAnswering/scene_img_feat_abstract_v002_val2015/abstract_v002_val2015_000000029997.npy


# Questions and Annotations

In [6]:
# Interface for accessing the VQA dataset.

# This code is based on the code available at the following link: https://github.com/GT-Vision-Lab/VQA/blob/master/PythonHelperTools/vqaTools/vqa.py.


class VQA:

    def __init__(self, annotation_file=None, question_file=None):
        """ Constructor of VQA class for reading and visualizing questions and answers. """
        # load dataset
        self.dataset = {}
        self.questions = {}
        self.qa = {}
        self.qqa = {}
        self.imgToQA = {}
        if annotation_file is not None and question_file is not None:
            dataset = json.load(open(annotation_file, 'r'))
            questions = json.load(open(question_file, 'r'))
            self.dataset = dataset
            self.questions = questions
            self.createIndex()

    def createIndex(self):

        # create index
        imgToQA = {ann['image_id']: [] for ann in self.dataset['annotations']}
        qa = {ann['question_id']: [] for ann in self.dataset['annotations']}
        qqa = {ann['question_id']: [] for ann in self.dataset['annotations']}
        for ann in self.dataset['annotations']:
            imgToQA[ann['image_id']] += [ann]
            qa[ann['question_id']] = ann
        for ques in self.questions['questions']:
            qqa[ques['question_id']] = ques

        # create class members
        self.qa = qa
        self.qqa = qqa
        self.imgToQA = imgToQA

In [7]:
# This code is based on the code available at the following link: https://github.com/GT-Vision-Lab/VQA/blob/master/PythonEvaluationTools/vqaEvaluation/vqaEval.py


contractions = {"aint": "ain't", "arent": "aren't", "cant": "can't", "couldve": "could've", "couldnt": "couldn't",
                "couldn'tve": "couldn't've", "couldnt've": "couldn't've", "didnt": "didn't", "doesnt": "doesn't", "dont": "don't", "hadnt": "hadn't",
                "hadnt've": "hadn't've", "hadn'tve": "hadn't've", "hasnt": "hasn't", "havent": "haven't", "hed": "he'd", "hed've": "he'd've",
                "he'dve": "he'd've", "hes": "he's", "howd": "how'd", "howll": "how'll", "hows": "how's", "Id've": "I'd've", "I'dve": "I'd've",
                "Im": "I'm", "Ive": "I've", "isnt": "isn't", "itd": "it'd", "itd've": "it'd've", "it'dve": "it'd've", "itll": "it'll", "let's": "let's",
                "maam": "ma'am", "mightnt": "mightn't", "mightnt've": "mightn't've", "mightn'tve": "mightn't've", "mightve": "might've",
                "mustnt": "mustn't", "mustve": "must've", "neednt": "needn't", "notve": "not've", "oclock": "o'clock", "oughtnt": "oughtn't",
                "ow's'at": "'ow's'at", "'ows'at": "'ow's'at", "'ow'sat": "'ow's'at", "shant": "shan't", "shed've": "she'd've", "she'dve": "she'd've",
                "she's": "she's", "shouldve": "should've", "shouldnt": "shouldn't", "shouldnt've": "shouldn't've", "shouldn'tve": "shouldn't've",
                "somebody'd": "somebodyd", "somebodyd've": "somebody'd've", "somebody'dve": "somebody'd've", "somebodyll": "somebody'll",
                "somebodys": "somebody's", "someoned": "someone'd", "someoned've": "someone'd've", "someone'dve": "someone'd've",
                "someonell": "someone'll", "someones": "someone's", "somethingd": "something'd", "somethingd've": "something'd've",
                "something'dve": "something'd've", "somethingll": "something'll", "thats": "that's", "thered": "there'd", "thered've": "there'd've",
                "there'dve": "there'd've", "therere": "there're", "theres": "there's", "theyd": "they'd", "theyd've": "they'd've",
                "they'dve": "they'd've", "theyll": "they'll", "theyre": "they're", "theyve": "they've", "twas": "'twas", "wasnt": "wasn't",
                "wed've": "we'd've", "we'dve": "we'd've", "weve": "we've", "werent": "weren't", "whatll": "what'll", "whatre": "what're",
                "whats": "what's", "whatve": "what've", "whens": "when's", "whered": "where'd", "wheres": "where's", "whereve": "where've",
                "whod": "who'd", "whod've": "who'd've", "who'dve": "who'd've", "wholl": "who'll", "whos": "who's", "whove": "who've", "whyll": "why'll",
                "whyre": "why're", "whys": "why's", "wont": "won't", "wouldve": "would've", "wouldnt": "wouldn't", "wouldnt've": "wouldn't've",
                "wouldn'tve": "wouldn't've", "yall": "y'all", "yall'll": "y'all'll", "y'allll": "y'all'll", "yall'd've": "y'all'd've",
                "y'alld've": "y'all'd've", "y'all'dve": "y'all'd've", "youd": "you'd", "youd've": "you'd've", "you'dve": "you'd've",
                "youll": "you'll", "youre": "you're", "youve": "you've"}

manualMap = {'none': '0', 'zero': '0', 'one': '1', 'two': '2', 'three': '3', 'four': '4', 'five': '5', 'six': '6', 'seven': '7', 'eight': '8', 'nine': '9', 'ten': '10'}

articles = ['a', 'an', 'the']


def processDigitArticle(inText):
	outText = []
	tempText = inText.lower().split()  # Making all characters lowercase.
	for word in tempText:
		word = manualMap.setdefault(word, word)  # Converting number words to digits.
		# The setdefault() method returns the value of the item with the specified key. If the key does not exist, insert the key, with the specified value.
		if word not in articles:
			outText.append(word)  # Removing articles (a, an, the).
		else:
			pass
	for wordId, word in enumerate(outText):
		if word in contractions:
			outText[wordId] = contractions[word]  # Adding apostrophe if a contraction is missing it.
	outText = ' '.join(outText)
	# The join() method takes all items in an iterable and joins them into one string. ' ' is the separator.
	return outText


periodStrip = re.compile("(?!<=\d)(\.)(?!\d)")
commaStrip = re.compile("(\d)(\,)(\d)")
punct = [';', r"/", '[', ']', '"', '{', '}', '(', ')', '=', '+', '\\', '_', '-', '>', '<', '@', '`', ',', '?', '!']  # r means the string will be treated as raw string.


def processPunctuation(inText):
	outText = inText
	for p in punct:  # Replacing all punctuation (except apostrophe and colon) with a space character.
		if (p + ' ' in inText or ' ' + p in inText) or (re.search(commaStrip, inText) is not None):  # In case of comma, no space is inserted if it occurs between digits.
			outText = outText.replace(p, '')
		else:
			outText = outText.replace(p, ' ')
	outText = periodStrip.sub("", outText, re.UNICODE)  # Removing periods except if it occurs as decimal.
	return outText

In [8]:
# This code is inspired by the code available at the following link: https://github.com/Cyanogenoid/pytorch-vqa/blob/master/data.py.


class VqaTrainDataset(Dataset):

    """ Load the VQA dataset using the VQA class. """

    def __init__(self, question_json_file_path, annotation_json_file_path, image_filename_pattern, img_features_dir, vocab_json_filename):
          
        """
        Args:
            question_json_file_path (string): Path to the json file containing the questions
            annotation_json_file_path (string): Path to the json file containing the annotations
            image_filename_pattern (string): Pattern used by the filenames of the images in this dataset (eg "abstract_v002_train2015_{}.png")
            img_features_dir (string): Path to the directory with image features
            vocab_json_filename (string): Path to the vocabulary.
        """

        vqa_db = VQA(annotation_file=annotation_json_file_path, question_file=question_json_file_path)

        self.max_words_in_ques = -1  
        self.dataset = []

        ques_list = []
        ans_list = []

        for q_id, annotation in vqa_db.qa.items():
            entry = {}
            question = vqa_db.qqa[q_id]['question']
            question = processPunctuation(question)
            question = processDigitArticle(question)
            words = question.split(' ')
            if len(words) > self.max_words_in_ques:
                self.max_words_in_ques = len(words)
            ques_list += words
            entry['ques'] = words
            answer_objs = annotation['answers']

            possible_answers = [a['answer'] for a in answer_objs]

            entry['possible_answers'] = []
            for answer in possible_answers:
                mod_ans = processPunctuation(answer)
                mod_ans = processDigitArticle(mod_ans)
                ans_list.append(mod_ans)
                entry['possible_answers'].append(mod_ans)
      
            img_full_idx = "%012d" % annotation['image_id']  # '00000000000image_id'
            img_name = image_filename_pattern.replace('{}', img_full_idx)
            img_feature_loc = os.path.join(img_features_dir, img_name.replace('.png', '.npy'))
            entry['img_feat_loc'] = img_feature_loc

            self.dataset.append(entry)

        q_vocab = self.build_vocab(ques_list)
        a_vocab = self.build_vocab(ans_list)
        vocab = {'q': q_vocab, 'a': a_vocab}

        f = open(vocab_path, "w")
        json.dump(vocab, f)
        f.close()
        
        self.q_vocab = vocab['q']
        self.q_vocab_size = len(self.q_vocab.keys())
            
        self.a_vocab = vocab['a']
        self.a_vocab_size = len(self.a_vocab.keys())

    def build_vocab(self, data):
        counter = Counter(data)
        words = counter.keys()
        tokens = sorted(words, key=lambda x: (counter[x], x), reverse=True)  # reverse=True: sorts in a descending order.
        vocab = {t: i for i, t in enumerate(tokens)}
        return vocab

    def _get_q_encoding(self, questions):
        vec = torch.zeros(self.q_vocab_size)
        for question in questions:
            if question in self.q_vocab:
                vec[self.q_vocab[question]] += 1
        return vec, len(questions)

    def _get_a_encoding(self, answers):
        vec = torch.zeros(self.a_vocab_size)
        for answer in answers:
            if answer in self.a_vocab:
                vec[self.a_vocab[answer]] += 1
        return vec

    def __getitem__(self, idx):
        entry = self.dataset[idx]

        image_encoding = np.load(entry['img_feat_loc'])

        ques = entry['ques']
        ques_encoding, ques_len = self._get_q_encoding(ques)

        possible_answers = entry['possible_answers']
        ans_encoding = self._get_a_encoding(possible_answers)

        return {'image_enc': image_encoding, 'ques_enc': ques_encoding, 'ques_len': ques_len, 'ans_enc': ans_encoding}

    def __len__(self):
        return len(self.dataset)

In [9]:
# This code is inspired by the code available at the following link: https://github.com/Cyanogenoid/pytorch-vqa/blob/master/data.py.


class VqaValDataset(Dataset):

    """
    Load the VQA dataset using the VQA class.
    """

    def __init__(self, question_json_file_path, annotation_json_file_path, image_filename_pattern, img_features_dir, vocab_json_filename):
                 
        """
        Args:
            question_json_file_path (string): Path to the json file containing the questions
            annotation_json_file_path (string): Path to the json file containing the annotations
            image_filename_pattern (string): Pattern used by the filenames of the images in this dataset (eg "abstract_v002_val2015_{}.png")
            img_features_dir (string): Path to the directory with image features
            vocab_json_filename (string): Path to the vocabulary.
        """

        vqa_db = VQA(annotation_file=annotation_json_file_path, question_file=question_json_file_path)

        self.max_words_in_ques = -1
            
        self.dataset = []

        for q_id, annotation in vqa_db.qa.items():
            entry = {}
            question = vqa_db.qqa[q_id]['question']
            question = processPunctuation(question)
            question = processDigitArticle(question)
            words = question.split(' ')
            if len(words) > self.max_words_in_ques:
                self.max_words_in_ques = len(words)
            entry['ques'] = words
            answer_objs = annotation['answers']

            possible_answers = [a['answer'] for a in answer_objs]

            entry['possible_answers'] = []
            for answer in possible_answers:
                mod_ans = processPunctuation(answer)
                mod_ans = processDigitArticle(mod_ans)
                entry['possible_answers'].append(mod_ans)
                
            img_full_idx = "%012d" % annotation['image_id']  # '00000000000image_id'
            img_name = image_filename_pattern.replace('{}', img_full_idx)
            img_feature_loc = os.path.join(img_features_dir, img_name.replace('.png', '.npy'))
            entry['img_feat_loc'] = img_feature_loc
            
            self.dataset.append(entry)

        vocab = json.load(open(vocab_json_filename, "r"))

        self.q_vocab = vocab['q']
        self.q_vocab_size = len(self.q_vocab.keys())
            
        self.a_vocab = vocab['a']
        self.a_vocab_size = len(self.a_vocab.keys())

    def _get_q_encoding(self, questions):
        vec = torch.zeros(self.q_vocab_size)
        for question in questions:
            if question in self.q_vocab:
                vec[self.q_vocab[question]] += 1
        return vec, len(questions)

    def _get_a_encoding(self, answers):
        vec = torch.zeros(self.a_vocab_size)
        for answer in answers:
            if answer in self.a_vocab:
                vec[self.a_vocab[answer]] += 1
        return vec

    def __getitem__(self, idx):
        entry = self.dataset[idx]

        image_encoding = np.load(entry['img_feat_loc'])

        ques = entry['ques']
        ques_encoding, ques_len = self._get_q_encoding(ques)

        possible_answers = entry['possible_answers']
        ans_encoding = self._get_a_encoding(possible_answers)

        return {'image_enc': image_encoding, 'ques_enc': ques_encoding, 'ques_len': ques_len, 'ans_enc': ans_encoding}

    def __len__(self):
        return len(self.dataset)

# Models

## SimpleBaselineNet

In [10]:
class SimpleBaselineNet(nn.Module):

    """ Predicts an answer to a question about an image using the Simple Baseline for Visual Question Answering (Zhou et al, 2017) paper. """

    """ 
      The input question is first converted to a one-hot vector, which is transformed to a word feature via a word embedding layer and then is concatenated with the image feature from CNN.
      The combined feature is sent to the softmax layer to predict the answer class.
      This model consists of only two linear transformations from the one hot vector to the answer response:
        - one is the word embedding;
        - the other is the softmax matrix multiplication. 
    """

    def __init__(self, img_feat_size, q_vocab_size, a_vocab_size):
        super().__init__()

        self.img_feat_size = img_feat_size
        self.q_vocab_size = q_vocab_size
        self.a_vocab_size = a_vocab_size
        self.q_embedding_size = 1024

        self.linear_layer = nn.Linear(self.q_vocab_size, self.q_embedding_size, bias=False)  # Applies a linear transformation to the incoming data: y = xA^T + b.
        self.classifier = nn.Linear(self.img_feat_size + self.q_embedding_size, self.a_vocab_size, bias=False)

    def forward(self, image_encoding, question_encoding):

        image_encoding = image_encoding.view(image_encoding.shape[0], -1)  # image_encoding.size(): torch.Size([24, 1000]).

        question_embedding = self.linear_layer(question_encoding)

        x = torch.cat((image_encoding, question_embedding), dim=-1)  # Concatenates the given sequence of tensors (image_encoding, question_embedding) in the given dimension (dim=-1).

        out = self.classifier(x)

        return out

## CoattentionNet

In [11]:
class QuestionProcessor(nn.Module):

    """
      - At the word level, words are embedded into a vector space through an embedding matrix.

      - At the phrase level, 1-dimensional convolution neural networks (CNN) are used to capture the information contained in unigrams, bigrams and trigrams, and then combine various n-gram responses by pooling 
        them into a single phrase level representation. 

      - At the question level, recurrent neural networks (RNN) encode the entire question, exactly a LSTM encodes the sequence after a max-pooling.  
    """

    def __init__(self, q_vocab_size, img_feat_size):
        super().__init__()
        self.word_embedding = nn.Embedding(q_vocab_size, img_feat_size)

        self.phrase_unigram = nn.Conv1d(img_feat_size, img_feat_size, kernel_size=1, stride=1, padding=0)
        self.phrase_bigram = nn.Conv1d(img_feat_size, img_feat_size, kernel_size=2, stride=1, padding=1, dilation=2)  # dilation controls the spacing between the kernel points.
        self.phrase_trigram = nn.Conv1d(img_feat_size, img_feat_size, kernel_size=3, stride=1, padding=2, dilation=2)
        
        self.max_pool = nn.MaxPool2d(1)
        self.activation = nn.Tanh()
        self.dropout = nn.Dropout(0.5)

        self.lstm = nn.LSTM(input_size=img_feat_size, hidden_size=img_feat_size, num_layers=2, batch_first=True)  
        # batch_first=True: the input and output tensors are provided as (batch, seq, feature) instead of (seq, batch, feature). 

    def forward(self, question_encoding, question_length):
        w_embed = self.word_embedding(question_encoding.long())  # long() converts a torch.FloatTensor to a torch.LongTensor.
        w_embed_permute = w_embed.permute(0, 2, 1)  # permute() returns a view of the original tensor with its dimensions permuted.

        uni_embed = self.phrase_unigram(w_embed_permute)
        bi_embed = self.phrase_bigram(w_embed_permute)
        tri_embed = self.phrase_trigram(w_embed_permute)

        p_embed = self.max_pool(torch.cat((uni_embed, bi_embed, tri_embed), dim=2))
        # torch.cat() concatenates the given sequence of tensors (uni_embed, bi_embed, tri_embed) in the given dimension (dim=2).
        p_embed = self.dropout(self.activation(p_embed.permute(0, 2, 1)))  

        packed = nn.utils.rnn.pack_padded_sequence(p_embed, question_length, batch_first=True, enforce_sorted=False)  # nn.utils.rnn.pack_padded_sequence() packs a Tensor containing padded sequences of variable length.
        # enforce_sorted=False: the input will get sorted unconditionally.
        q_embed, (_, _) = self.lstm(packed)
        q_embed, _ = nn.utils.rnn.pad_packed_sequence(q_embed, batch_first=True)  # nn.utils.rnn.pad_packed_sequence() pads a packed batch of variable length sequences.
        # pad_packed_sequence() is an inverse operation to pack_padded_sequence().

        return (w_embed, p_embed, q_embed)


class ParallelAttention(nn.Module):

    """
      The parallel co-attention model generates image attention and question attention simultaneously. 
      It connects the image and the question by calculating the similarity between image and question features at all pairs of image-location and question-location.
      Given an image feature map V and a question representation Q, the affinity matrix C is calculated by: C = tanh(Q^T*Wb*V) where Wb contains weights.
      Considering this affinity matrix as a feature, the model learns to predict image and question attention maps via following equations: 
          Hv = tanh(Wv*V + C(Wq*Q)), Hq = tanh(Wq*Q + C^T(Wv*V))
          av = softmax((whv)^T*Hv), aq = softmax((whq)^T*Hq)
      where Wv, Wq, whv, whq are weight parameters, while av and aq are attention probabilities of each image region v and word q respectively.
      Image and question attention vectors are calculated as the weighted sum of image and question features.
    """

    def __init__(self, img_feat_size, hidden_size):
        super().__init__()

        self.img_feat_size = img_feat_size    
        self.hidden_size = hidden_size
        
        self.linear_1 = nn.Linear(self.img_feat_size, self.hidden_size, bias=False)
        self.linear_2 = nn.Linear(self.hidden_size, 1, bias=False)

        self.activation = nn.Tanh()
        self.softmax = nn.Softmax(dim=-1)

    def forward(self, ques_embed, img_embed):

        ques_embed_permute = ques_embed.permute(0, 2, 1)  # permute() returns a view of the original tensor with its dimensions permuted.
        img_embed_permute = img_embed.permute(0, 2, 1)

        activation = self.activation(torch.matmul(self.linear_1(ques_embed), img_embed))  # matmul() returns the matrix product of two arrays.
        activation_permute = activation.permute(0, 2, 1)

        img = self.activation(self.linear_1(img_embed_permute) + torch.matmul(activation_permute, self.linear_1(ques_embed)))
        ques = self.activation(self.linear_1(ques_embed) + torch.matmul(activation, self.linear_1(img_embed_permute)))

        attention_img = self.softmax(self.linear_2(img).squeeze())  # squeeze() returns a tensor with all the dimensions of input of size 1 removed.
        attention_ques = self.softmax(self.linear_2(ques).squeeze()).unsqueeze(1)  # unsqueeze() returns a new tensor with a dimension of size one inserted at the specified position.

        attention_img_feat = torch.sum(torch.mul(img_embed, attention_img), dim=-1)  # mul() returns the product of two variables.
        attention_ques_feat = torch.sum(torch.mul(ques_embed_permute, attention_ques), dim=-1)

        return attention_img_feat, attention_ques_feat


class AlternateAttention(nn.Module):
  
    """
      The alternating co-attention model sequentially alternates between generating image and question attentions.
      It summarizes the question into a single vector, then it attends to the image based on the question feature, and, in the end, it attends to the question based on the attended image feature. 
      It defines an attention operation which takes image (or question) features X and an attention guidance g derived from question (or image) as inputs, and outputs the attended image (or question) vector. 
      The operation can be expressed in following steps:
          H = tanh(Wx*X + (Wg*g)1^T)
          ax = softmax((whx)^T*H)
          x = sum(ax*x)
      where 1 is a vector with all elements equal to one. Wx, Wg and whx are parameters, while ax is the attention weight of the feature X.
    """

    def __init__(self, img_feat_size, hidden_size):
        super().__init__()
        
        self.img_feat_size = img_feat_size
        self.hidden_size = hidden_size 
        
        self.linear_1 = nn.Linear(self.img_feat_size, self.hidden_size, bias=False)
        self.linear_2 = nn.Linear(self.hidden_size, 1, bias=False)

        self.activation = nn.Tanh()
        self.softmax = nn.Softmax(dim=-1)

    def forward(self, ques_embed, img_embed):
        
        img_embed_permute = img_embed.permute(0, 2, 1)  # permute() returns a view of the original tensor with its dimensions permuted.

        a = torch.mul(ques_embed, self.softmax(self.linear_2(self.activation(self.linear_1(ques_embed)))))  # mul() returns the product of two variables.
        a = torch.sum(a, dim=1).squeeze()  # squeeze() returns a tensor with all the dimensions of input of size 1 removed.

        a = torch.mul(img_embed_permute, self.softmax(self.linear_2(self.activation(self.linear_1(img_embed_permute) + self.linear_1(a).unsqueeze(1)))))  # unsqueeze() returns a new tensor with a dimension of size one inserted at the specified position.
        attention_img_feat = torch.sum(a, dim=1).squeeze()

        a = torch.mul(ques_embed, self.softmax(self.linear_2(self.activation(self.linear_1(ques_embed) + self.linear_1(attention_img_feat).unsqueeze(1)))))
        attention_ques_feat = torch.sum(a, dim=1).squeeze()

        return attention_img_feat, attention_ques_feat


class CoattentionNet(nn.Module):

    """ Predicts an answer to a question about an image using the Hierarchical Question-Image Co-Attention for Visual Question Answering (Lu et al, 2017) paper. """

    """
      Builds a hierarchical architecture that co-attends to the image and to the question on three levels: word level, phrase level, question level.
      A Multi-Layer Perceptron (MLP) is used to recursively encode attention features. 
      A MLP is a fully connected neural network classifier with 2 hidden layers with tanh non-linearity:
          hw = tanh(Ww(qw + vw))
          hp = tanh(Wp[(qp + vp), hw])
          hs = tanh(Ws[(qs + vs), hp])
      where Ww, Wp and Ws are weight parameters.
    """

    def __init__(self, img_feat_size, q_vocab_size, a_vocab_size):
        super().__init__()

        self.img_feat_size = img_feat_size
        self.q_vocab_size = q_vocab_size
        self.a_vocab_size = a_vocab_size
        self.hidden_size = 1000

        self.q_network = QuestionProcessor(self.q_vocab_size, self.img_feat_size)
        
        # self.attention = ParallelAttention(self.img_feat_size, self.hidden_size)
        self.attention = AlternateAttention(self.img_feat_size, self.hidden_size)

        self.attention_word = nn.Linear(self.img_feat_size, self.img_feat_size, bias=False)
        self.attention_phrase = nn.Linear(2*self.img_feat_size, self.img_feat_size, bias=False)
        self.attention_question = nn.Linear(2*self.img_feat_size, self.img_feat_size, bias=False)

        self.classifier = nn.Linear(self.img_feat_size, self.a_vocab_size, bias=False)

        self.activation = nn.Tanh()

    def forward(self, image_encoding, question_encoding, question_length):
        (w_embed, p_embed, q_embed) = self.q_network(question_encoding, question_length)

        image_encoding_size = image_encoding.size()  # image_encoding.size(): torch.Size([24, 1000]).
        image_embed = image_encoding.view(image_encoding_size[0], image_encoding_size[1], -1)

        img_feat_word, ques_feat_word = self.attention(w_embed, image_embed)
        img_feat_phrase, ques_feat_phrase = self.attention(p_embed, image_embed)
        img_feat_question, ques_feat_question = self.attention(q_embed, image_embed)

        res_word = self.activation(self.attention_word(img_feat_word + ques_feat_word))

        res_phrase = self.activation(self.attention_phrase(torch.cat([img_feat_phrase + ques_feat_phrase, res_word], dim=-1)))  
        # torch.cat() concatenates the given sequence of tensors (img_feat_phrase + ques_feat_phrase, res_word) in the given dimension (dim=-1).

        res_question = self.activation(self.attention_question(torch.cat([img_feat_question + ques_feat_question, res_phrase], dim=-1)))
        # torch.cat() concatenates the given sequence of tensors (img_feat_question + ques_feat_question, res_phrase) in the given dimension (dim=-1).

        out = self.classifier(res_question)
        return out

## Transformer

In [12]:
def attention(query, key, value, scaled_term, dropout):

    """
    Scale Dot-Product Attention operation: 
    First, it computes the dot product of the query Q and the key K and divides it by the square root of the scaled term. 
    Then, it applies a softmax function to get weights on values.
    At the end, it computes the dot product of the output of the softmax function and the value V.
    The attended feature F is obtained as follows:
        W = score(Q, K) = softmax((Q*transpose(K)) / square_root(scaled_term))
        F = attention(Q, K, V) = W*V
    where W is the weight matrix.
    """

    score = torch.matmul(query, key.transpose(-2, -1)) / math.sqrt(scaled_term)  # matmul() returns the matrix product of two arrays.
    score = F.softmax(score, dim=-1)  # softmax() for scaling in range [0, 1].
    if dropout is not None:
        score = dropout(score)
    output = torch.matmul(score, value)  # matmul() returns the matrix product of two arrays.
    return output


class MultiHeadAttention(nn.Module):
    """ The Multi-Head Attention includes num_heads parallel heads, and each head is equivalent to an independent Scale Dot-Product Attention operation. """
    def __init__(self, num_heads, embedding_dim, dropout):
        super().__init__()
        
        self.num_heads = num_heads
        self.embedding_dim = embedding_dim
        self.scaled_term = self.embedding_dim // self.num_heads
        
        self.linear = nn.Linear(embedding_dim, embedding_dim) 
        self.dropout = nn.Dropout(dropout)
    
    def forward(self, query, key, value):      
        query_size = query.size(0)

        key = self.linear(key).view(query_size, -1, self.num_heads, self.scaled_term)  # view() returns a new tensor with the same data but of a different shape.
        query = self.linear(query).view(query_size, -1, self.num_heads, self.scaled_term)
        value = self.linear(value).view(query_size, -1, self.num_heads, self.scaled_term)
        
        key = key.transpose(1, 2)
        query = query.transpose(1, 2)
        value = value.transpose(1, 2)
        
        att = attention(query, key, value, self.scaled_term, self.dropout)
        att = att.transpose(1, 2)

        output = self.linear(att.contiguous().view(query_size, -1, self.embedding_dim))  # contiguous() returns a contiguous in memory tensor containing the same data.
        return output


class PositionwiseFeedForward(nn.Module):
    """ Feed Forward consists of two fully connected layers with ReLU function. Dropout is applied to prevent overfitting. """    
    def __init__(self, embedding_dim, feed_forward_dim, dropout):
        super(PositionwiseFeedForward, self).__init__()

        self.linear_1 = nn.Linear(embedding_dim, feed_forward_dim)
        self.linear_2 = nn.Linear(feed_forward_dim, embedding_dim)
        self.dropout = nn.Dropout(dropout)
        self.relu = nn.ReLU()

    def forward(self, x):
        return self.linear_2(self.dropout(self.relu(self.linear_1(x))))


class LayerNorm(nn.Module):
    def __init__(self, embedding_dim, eps=1e-6):
        super(LayerNorm, self).__init__()
        self.eps = eps

        self.ones = nn.Parameter(torch.ones(embedding_dim))
        self.zeros = nn.Parameter(torch.zeros(embedding_dim))

    def forward(self, x):
        mean = x.mean(-1, keepdim=True)  # keepdim=True: the output tensor has dim retained.
        std = x.std(-1, keepdim=True)
        return self.ones * (x - mean) / (std + self.eps) + self.zeros


def get_clones(module, N):
    """ Produces N identical layers. """
    return nn.ModuleList([copy.deepcopy(module) for i in range(N)])

In [13]:
class FeatureEmbedding(nn.Module):
    """ Projects image features into a space of dimensionality `embedding_dim`. """
    def __init__(self, img_feat_size, embedding_dim):
        super().__init__()
        self.linear = nn.Linear(img_feat_size, embedding_dim)

    def forward(self, x):
        return self.linear(x)


class EncoderLayer(nn.Module):
    """ A single encoder layer consists of one Multi-Head Attention, Feed Forward, Layer Normalization and Dropout. """
    def __init__(self, embedding_dim, feed_forward_dim, num_heads, dropout):
        super().__init__()

        self.norm = LayerNorm(embedding_dim)
        self.dropout = nn.Dropout(dropout)
        self.attn = MultiHeadAttention(num_heads, embedding_dim, dropout)
        self.ff = PositionwiseFeedForward(embedding_dim, feed_forward_dim, dropout)
        
    def forward(self, x):
        x2 = self.norm(x)

        # Encoder self-attention
        x = x + self.dropout(self.attn(x2, x2, x2))
        x2 = self.norm(x)

        x = x + self.dropout(self.ff(x2))
        return x


class Encoder(nn.Module):
    """ Encoder with N-stacked EncoderLayers and a layer normalization. """
    def __init__(self, img_feat_size, embedding_dim, feed_forward_dim, num_layers, num_heads, dropout):
        super().__init__()

        self.num_layers = num_layers
        self.feature_embedding = FeatureEmbedding(img_feat_size, embedding_dim)
        self.layers = get_clones(EncoderLayer(embedding_dim, feed_forward_dim, num_heads, dropout), num_layers)
        self.norm = LayerNorm(embedding_dim)

    def forward(self, x):
        for i in range(self.num_layers):
            x2 = self.layers[i](self.feature_embedding(x))
        return self.norm(x2)

In [14]:
class DecoderLayer(nn.Module):
    """ A single decoder layer consists of two Multi-Head Attentions, Feed Forward, Layer Normalization and Dropout. """
    def __init__(self, embedding_dim, feed_forward_dim, num_heads, dropout):
        super().__init__()

        self.norm = LayerNorm(embedding_dim)
        self.dropout = nn.Dropout(dropout)
        self.attn = MultiHeadAttention(num_heads, embedding_dim, dropout)
        self.ff = PositionwiseFeedForward(embedding_dim, feed_forward_dim, dropout)

    def forward(self, x, encoder_outputs):
        x2 = self.norm(x)

        # Decoder self-attention
        x = x + self.dropout(self.attn(x2, x2, x2))
        x2 = self.norm(x)

        # Encoder Decoder attention
        x = x + self.dropout(self.attn(x2, encoder_outputs, encoder_outputs))
        x2 = self.norm(x)

        x = x + self.dropout(self.ff(x2))
        return x


class PositionalEncoding(nn.Module):
    """ Positional Encoding adds additional positional information to Embeddings. """
    def __init__(self, embedding_dim, dropout, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(dropout)
        
        # Compute the positional encodings once in log space.
        pe = torch.zeros(max_len, embedding_dim)
        position = torch.arange(0, max_len).unsqueeze(1)  # torch.arange() returns a 1-D tensor with values from the interval [0, max_len).
        div_term = torch.exp(torch.arange(0, embedding_dim, 2) * -(math.log(10000.0) / embedding_dim))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        self.pe = pe.unsqueeze(0)  # unsqueeze() returns a new tensor with a dimension of size one inserted at the specified position.
        
    def forward(self, x):
        x = x + self.pe[:, :x.size(1)] 
        return self.dropout(x)


class Embeddings(nn.Module):
    def __init__(self, vocab_size, embedding_dim):
        super(Embeddings, self).__init__()

        self.embedding_dim = embedding_dim
        self.embedding = nn.Embedding(vocab_size, embedding_dim)

    def forward(self, x): 
        x = self.embedding(x.long())  # long() converts a torch.FloatTensor to a torch.LongTensor.
        return x


class Decoder(nn.Module):
    """ Decoder with N-stacked DecoderLayers and a layer normalization. """
    def __init__(self, vocab_size, embedding_dim, feed_forward_dim, num_layers, num_heads, dropout):
        super().__init__()

        self.num_layers = num_layers
        self.embed = Embeddings(vocab_size, embedding_dim)
        self.position = PositionalEncoding(embedding_dim, dropout)
        self.layers = get_clones(DecoderLayer(embedding_dim, feed_forward_dim, num_heads, dropout), num_layers)
        self.norm = LayerNorm(embedding_dim)

    def forward(self, x, encoder_outputs):
        for i in range(self.num_layers):
            x2 = self.layers[i](self.position(self.embed(x)), encoder_outputs)
        return self.norm(x2)

In [15]:
class Transformer(nn.Module):
    """
      Includes an Encoder module, a Decoder module and a fully connected layer.
    """
    def __init__(self, img_feat_size, vocab_size, num_classes, embedding_dim, feed_forward_dim, num_layers, num_heads, dropout):
        super().__init__()
        
        self.encoder = Encoder(img_feat_size, embedding_dim, feed_forward_dim, num_layers, num_heads, dropout)
        self.decoder = Decoder(vocab_size, embedding_dim, feed_forward_dim, num_layers, num_heads, dropout)
        self.classifier = nn.Linear(embedding_dim, num_classes)

    def forward(self, image_encoding, question_encoding):
        encoder_outputs = self.encoder(image_encoding)
        decoder_output = self.decoder(question_encoding, encoder_outputs)

        output = self.classifier(decoder_output).mean(dim=1)
        return output


def get_transformer_model(img_feat_size, vocab_size, num_classes): 

    transformer_config = {
        'img_feat_size':     img_feat_size,
        'vocab_size':        vocab_size, 
        'num_classes':       num_classes,
        "embedding_dim":     512,
        "feed_forward_dim":  2048,
        "num_layers":        6,
        "num_heads":         4,
        "dropout":           0.1,
    }

    return Transformer(**transformer_config)

# Train and Validate 

In [19]:
train_question_path = '/content/drive/MyDrive/EasyVisualQuestionAnswering/OpenEnded_abstract_train2015_questions.txt'
train_annotation_path = '/content/drive/MyDrive/EasyVisualQuestionAnswering/abstract_train2015_annotations.txt'
train_img_feat_path = '/content/drive/MyDrive/EasyVisualQuestionAnswering/scene_img_feat_abstract_v002_train2015'

val_question_path = '/content/drive/MyDrive/EasyVisualQuestionAnswering/OpenEnded_abstract_val2015_questions.txt'
val_annotation_path = '/content/drive/MyDrive/EasyVisualQuestionAnswering/abstract_val2015_annotations.txt'
val_img_feat_path = '/content/drive/MyDrive/EasyVisualQuestionAnswering/scene_img_feat_abstract_v002_val2015'

vocab_path = '/content/drive/MyDrive/EasyVisualQuestionAnswering/vocab.json'

# results = '/content/drive/MyDrive/EasyVisualQuestionAnswering/SimpleBaselineNet.txt'
# results = '/content/drive/MyDrive/EasyVisualQuestionAnswering/ParallelCoattentionNet.txt'
# results = '/content/drive/MyDrive/EasyVisualQuestionAnswering/AlternateCoattentionNet.txt'
results = '/content/drive/MyDrive/EasyVisualQuestionAnswering/EncoderDecoder.txt'

num_epochs = 5

batch_size = 6
num_data_loader_workers = 2

img_feat_size = 1000  

In [17]:
def optimize(criterion, predicted_answer, optimizer, ground_truth_answer):

    majority_ans = torch.argmax(ground_truth_answer, dim=-1)
    loss = criterion(predicted_answer, majority_ans)  # loss = criterion(output_model, target)
    optimizer.zero_grad()  # zero_grad(): zeroes the grad attribute of all the parameters passed to the optimizer.
    loss.backward()  # backward(): performs the gradient of all the parameters for which require_grad = True and stores the gradient in parameter.grad attribute for every parameter.
    optimizer.step()  # step(): updates the value of all the parameters passed to the optimizer (based on parameter.grad).
    return loss


def validate(model, val_dataset_loader):

    correct_answers = 0
    total_answers = 0

    for batch_id, batch_data in enumerate(val_dataset_loader):
        image_encoding = batch_data['image_enc']
        question_encoding = batch_data['ques_enc']
        question_length = batch_data['ques_len']
        ground_truth_answer = batch_data['ans_enc']

        batch_size = ground_truth_answer.shape[0]

        logits = model(image_encoding, question_encoding)  # for SimpleBaselineNet and Transformer
        # logits = model(image_encoding, question_encoding, question_length)  # for CoattentionNet 

        probs = F.softmax(logits, dim=-1)
        predicted_answer = torch.argmax(probs, dim=-1)

        counts = ground_truth_answer[torch.arange(batch_size), predicted_answer]  # torch.arange(): returns a 1-D tensor with values in the range [start, end) with start = 0 and end = batch_size.
        correct_answers = correct_answers + float(torch.sum(torch.min(counts/3, torch.ones(1))))
            
        total_answers = total_answers + batch_size

    return (correct_answers / total_answers) * 100

In [20]:
train_dataset = VqaTrainDataset(question_json_file_path=train_question_path,
                                annotation_json_file_path=train_annotation_path,
                                image_filename_pattern="abstract_v002_train2015_{}.png",
                                img_features_dir=train_img_feat_path,
                                vocab_json_filename=vocab_path)

val_dataset = VqaValDataset(question_json_file_path=val_question_path,
                            annotation_json_file_path=val_annotation_path,
                            image_filename_pattern="abstract_v002_val2015_{}.png",
                            img_features_dir=val_img_feat_path,
                            vocab_json_filename=vocab_path)


train_dataset_loader = DataLoader(train_dataset,
                                  batch_size=batch_size,  # batch_size: represents how many samples per batch to load.
                                  shuffle=True,
                                  num_workers=num_data_loader_workers)  # num_workers: represents how many subprocesses to use for loading data.

val_dataset_loader = DataLoader(val_dataset,
                                batch_size=batch_size,
                                shuffle=False,
                                num_workers=num_data_loader_workers)


q_vocab_size = train_dataset.q_vocab_size
a_vocab_size = train_dataset.a_vocab_size

# model = SimpleBaselineNet(img_feat_size, q_vocab_size, a_vocab_size)
# model = CoattentionNet(img_feat_size, q_vocab_size, a_vocab_size)
model = get_transformer_model(img_feat_size=img_feat_size, vocab_size=q_vocab_size, num_classes=a_vocab_size)

out_filename = open(results, "w")

for epoch in range(num_epochs):
    num_batches = len(train_dataset_loader)

    for batch_id, batch_data in enumerate(train_dataset_loader):
        model.train()  # Set the model to train mode

        image_encoding = batch_data['image_enc']
        question_encoding = batch_data['ques_enc']
        question_length = batch_data['ques_len']
        ground_truth_answer = batch_data['ans_enc']

        predicted_answer = model(image_encoding, question_encoding)  # for SimpleBaselineNet and Transformer
        # predicted_answer = model(image_encoding, question_encoding, question_length)  # only for CoattentionNet 
        
        criterion = torch.nn.CrossEntropyLoss()

        # optimizer = torch.optim.SGD([{'params': model.fc_ques.parameters(), 'lr': 0.8}, {'params': model.classifier.parameters(), 'lr': 0.01}])  # for SimpleBaselineNet 

        # optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-4)  # for CoattentionNet 

        optimizer = torch.optim.AdamW([{'params': model.parameters(), 'lr': 0.0005, 'eps': 1e-6, 'weight_decay': 0.000001}])  # for Transformer

        loss = optimize(criterion, predicted_answer, optimizer, ground_truth_answer)

        if batch_id == (num_batches - 1):
            model.eval()  # Set the model to eval mode
            val_accuracy = validate(model, val_dataset_loader)
            print("Epoch {} has val accuracy: {}".format(epoch, val_accuracy))
            out_filename.write("Epoch {} has val accuracy: {}\n".format(epoch, val_accuracy))


out_filename.close() 

Epoch 0 has val accuracy: 22.22222222222222
Epoch 1 has val accuracy: 0.0
Epoch 2 has val accuracy: 22.22222222222222
Epoch 3 has val accuracy: 22.22222222222222
Epoch 4 has val accuracy: 22.22222222222222


#  Plots and Tables


In [None]:
figure = plt.figure(figsize=(20, 15))

# Simple Baseline Net
plt.subplot(3, 3, 1)
epoch_list_SimpleBaselineNet = []
accuracy_list_SimpleBaselineNet = []

with open('/content/drive/MyDrive/EasyVisualQuestionAnswering/SimpleBaselineNet.txt', 'r') as f:
    data = f.read().split("\n")
for i in range(len(data)-1):
    epoch_list_SimpleBaselineNet.append(int(data[i].split(" ")[1]))
    accuracy_list_SimpleBaselineNet.append(float(data[i].split(" ")[5]))

plt.plot(epoch_list_SimpleBaselineNet, accuracy_list_SimpleBaselineNet, label="Simple Baseline Net", color='blue', linewidth=3.0)
plt.xlabel('Epochs', fontsize=15)
plt.ylabel('Accuracy', fontsize=15)
plt.legend(loc='lower right', prop={'size': 15})
plt.savefig('/content/drive/MyDrive/EasyVisualQuestionAnswering/SimpleBaselineNet.png')

# Parallel Coattention Net
plt.subplot(3, 3, 4)
epoch_list_ParallelCoattentionNet = []
accuracy_list_ParallelCoattentionNet = []

with open('/content/drive/MyDrive/EasyVisualQuestionAnswering/ParallelCoattentionNet.txt', 'r') as f:
    data = f.read().split("\n")
for i in range(len(data)-1):
    epoch_list_ParallelCoattentionNet.append(int(data[i].split(" ")[1]))
    accuracy_list_ParallelCoattentionNet.append(float(data[i].split(" ")[5]))

plt.plot(epoch_list_ParallelCoattentionNet, accuracy_list_ParallelCoattentionNet, label="Parallel Coattention Net", color='green', linewidth=3.0)
plt.xlabel('Epochs', fontsize=15)
plt.ylabel('Accuracy', fontsize=15)
plt.legend(loc='lower right', prop={'size': 15})
plt.savefig('/content/drive/MyDrive/EasyVisualQuestionAnswering/ParallelCoattentionNet.png')

# Alternate Coattention Net
plt.subplot(3, 3, 5)
epoch_list_AlternateCoattentionNet = []
accuracy_list_AlternateCoattentionNet = []

with open('/content/drive/MyDrive/EasyVisualQuestionAnswering/AlternateCoattentionNet.txt', 'r') as f:
    data = f.read().split("\n")
for i in range(len(data)-1):
    epoch_list_AlternateCoattentionNet.append(int(data[i].split(" ")[1]))
    accuracy_list_AlternateCoattentionNet.append(float(data[i].split(" ")[5]))

plt.plot(epoch_list_AlternateCoattentionNet, accuracy_list_AlternateCoattentionNet, label="Alternate Coattention Net", color='green', linewidth=3.0)
plt.xlabel('Epochs', fontsize=15)
plt.ylabel('Accuracy', fontsize=15)
plt.legend(loc='lower right', prop={'size': 15})
plt.savefig('/content/drive/MyDrive/EasyVisualQuestionAnswering/AlternateCoattentionNet.png')

# Encoder Decoder
plt.subplot(3, 3, 7)
epoch_list_Transformer = []
accuracy_list_Transformer = []

with open('/content/drive/MyDrive/EasyVisualQuestionAnswering/Transformer.txt', 'r') as f:
    data = f.read().split("\n")
for i in range(len(data)-1):
    epoch_list_Transformer.append(int(data[i].split(" ")[1]))
    accuracy_list_Transformer.append(float(data[i].split(" ")[5]))

plt.plot(epoch_list_Transformer, accuracy_list_Transformer, label="Transformer", color='red', linewidth=3.0)
plt.xlabel('Epochs', fontsize=15)
plt.ylabel('Accuracy', fontsize=15)
plt.legend(loc='lower right', prop={'size': 15})
plt.savefig('/content/drive/MyDrive/EasyVisualQuestionAnswering/Transformer.png')

In [None]:
df = pd.DataFrame({"Net": ["Simple Baseline Net", "Parallel Coattention Net", "Alternate Coattention Net", "Transformer"],
                   "Epochs": [epoch_list_SimpleBaselineNet[-1]+1, epoch_list_ParallelCoattentionNet[-1]+1, epoch_list_AlternateCoattentionNet[-1]+1, epoch_list_Transformer[-1]+1],
                   "Accuracy": [accuracy_list_SimpleBaselineNet[-1], accuracy_list_ParallelCoattentionNet[-1], accuracy_list_AlternateCoattentionNet[-1], accuracy_list_Transformer[-1]]})

df.to_csv('/content/drive/MyDrive/EasyVisualQuestionAnswering/results.csv', index=False, encoding='utf-8')

print(df.to_string(index=False))