In [24]:
import torch
import torchvision
from torchtext.vocab import Vocab
from torch.nn.utils.rnn import pad_sequence
import torchvision.transforms as T
from copy import deepcopy
import re
from google.colab import drive
import pickle as pkl
from PIL import Image
from torch.utils.data import DataLoader, Dataset

class TextNumericalizer():
	def __init__(self, vocab, tokenizer):
		self.vocab = vocab

	def tokenize(self, sentence):
		return (re.sub(r'[^\w\s]', '', sentence).lower()).split(" ")

	def SentenceToVector(self, sentence):
		return [self.vocab.stoi[token.lower()] for token in self.tokenize(sentence)]

	def VectorToSentence(self, vector):
		return [self.vocab.itos[integer] for integer in vector]
  
	def getVocabularyLength(self):
		return len(self.vocab.stoi.keys())

class ImgCapDataset(Dataset):
    def __init__(self, X, Y, TN, transform=None):
        assert len(X)==len(Y), "Data should be of the same length! [Error: X(" + str(len(X)) + ") != Y(" + str(lenY) + ")]"
        self.X = X
        self.Y = Y
        self.TN = TN
        self.transform = transform

    def __getitem__(self, index):
        temp = deepcopy(self.Y[index])
        for i in range(len(self.Y[index])):
            temp[i] = re.sub(r'[^\w\s]', '', temp[i]).lower()

        return self.transform(self.X[index]), torch.tensor([self.TN.vocab.stoi["<sos>"]]+self.TN.SentenceToVector(temp[0])+[self.TN.vocab.stoi["<eos>"]], dtype=torch.long), [self.TN.SentenceToVector(i) for i in temp]

    def __len__(self):
        return len(self.X)

class CollateFunction():
    def __init__(self, padding):
        self.padding = padding

    def __call__(self, batch):
        return torch.cat([i[0].unsqueeze(0) for i in batch], dim=0), pad_sequence([i[1] for i in batch], batch_first=True, padding_value=self.padding), [i[2] for i in batch]

drive.mount("/content/drive")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
#val_loader = pkl.load(open("/content/drive/MyDrive/Data/Val/val_loader.pkl","rb"))
#test_loader = pkl.load(open("/content/drive/MyDrive/Data/Test/test_loader.pkl","rb"))

# Encoder Network
class ENet(torch.nn.Module):
    def __init__(self):
        super(ENet, self).__init__()
        temp = torchvision.models.vgg19(pretrained=True)

        # No Fine-Tuning Of The Pre-Trained Model
        for p in temp.parameters():
            p.requires_grad = False

        self.encoder = torch.nn.Sequential(*list(temp.features.children())[:-2])

        for c in list(self.encoder.children())[5:]:
            for p in c.parameters():
                p.requires_grad = True

        self.feats = torch.nn.AdaptiveAvgPool2d((14,14))

    def forward(self, x):
        enc = self.encoder(x)
        enc = self.feats(enc)
        enc = enc.permute(0, 2, 3, 1)
        enc = enc.view(-1, 196, 512)

        return enc

# Bahdanau Attention Network
class ANet(torch.nn.Module):
    def __init__(self):
        super(ANet, self).__init__()
        self.v = torch.nn.Linear(512, 1)
        self.W = torch.nn.Linear(512, 512)
        self.U = torch.nn.Linear(512, 512)

    def forward(self, feats, h_state):
        U = self.U(h_state).unsqueeze(dim=1)
        W = self.W(feats)
        A = torch.nn.functional.tanh(W+U)
        e = self.v(A).squeeze(dim=2)
        alpha = torch.nn.functional.softmax(e, dim=1)
        context_vector = (feats*alpha.unsqueeze(dim=2)).sum(dim=1)

        return alpha, context_vector

# Decoder Network With Bahdanau Attention Mechanism
class DNet(torch.nn.Module):
    def __init__(self, vocab_size):
        super(DNet, self).__init__()
        self.v_size = vocab_size
        self.ANet = ANet()
        self.init_h = torch.nn.Linear(512, 512)
        self.init_c = torch.nn.Linear(512, 512)
        self.f_beta = torch.nn.Linear(512, 512)
        self.output_layer = torch.nn.Linear(512, vocab_size)
        self.embedding_layer = torch.nn.Embedding(vocab_size, 512)
        self.LSTM = torch.nn.LSTMCell(1024, 512)

    def forward(self, feats, caps):
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        avg = torch.mean(feats, dim=1)
        h = torch.nn.functional.tanh(self.init_h(avg))
        c = torch.nn.functional.tanh(self.init_c(avg))
        T = len(caps[0])-1
        embedded_caps = None

        if(self.training):
            embedded_caps = self.embedding_layer(caps)
        else:
            embedded_caps = self.embedding_layer(torch.zeros(feats.size(0), 1).long().to(device))

        sentences = torch.zeros((feats.size(0), T, self.v_size)).to(device)
        weights = torch.zeros((feats.size(0), T, 196)).to(device)

        for t in range(T):
            alpha, context_vector = self.ANet(feats, h)
            gated_context_vector = torch.nn.functional.sigmoid(self.f_beta(h))*context_vector
            input = None

            if(self.training):
                input = torch.cat((embedded_caps[:, t], gated_context_vector), dim=1)
            else:
                embedded_caps = embedded_caps.squeeze(1) if embedded_caps.dim()==3 else embedded_caps
                input = torch.cat((embedded_caps, gated_context_vector), dim=1)

            h, c = self.LSTM(input, (h, c))
            output = self.output_layer((torch.nn.Dropout(p=0.3))(h))
            sentences[:, t] = output
            weights[:, t] = alpha

            if(not self.training):
                embedded_caps = self.embedding_layer(output.max(1)[1].reshape(feats.size(0), 1))

        return weights, sentences

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [25]:
!pip install --upgrade nltk

Requirement already up-to-date: nltk in /usr/local/lib/python3.7/dist-packages (3.6.2)


In [26]:
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [27]:
from nltk.translate.bleu_score import corpus_bleu
from nltk.translate.meteor_score import meteor_score

vocab_size = val_loader.dataset.TN.getVocabularyLength()
EnNet = ENet().to(device)
DeNet = DNet(vocab_size).to(device)
EnNet.load_state_dict(torch.load(open("/content/drive/MyDrive/Data/encoder.pth","rb")))
DeNet.load_state_dict(torch.load(open("/content/drive/MyDrive/Data/decoder.pth","rb")))

EnNet.eval()
DeNet.eval()

val_bleu = []
val_meteor = 0

with torch.no_grad():
    hypotheses = []
    references = []

    for img, cap, caps in val_loader:
        img, cap = torch.autograd.Variable(img).to(device), torch.autograd.Variable(cap).to(device)
        feats = EnNet(img)
        weights, sentences = DeNet(feats, cap)

        references.append([val_loader.dataset.TN.VectorToSentence(c) for c in caps[0]])
        hypotheses.append(val_loader.dataset.TN.VectorToSentence([token for token in torch.max(sentences, dim=2)[1][0] if token!=val_loader.dataset.TN.vocab.stoi["<sos>"] and token!=val_loader.dataset.TN.vocab.stoi["<eos>"] and token!=val_loader.dataset.TN.vocab.stoi["<pad>"]]))

        val_meteor += meteor_score([" ".join([ch for ch in r]) for r in references[-1]], " ".join([ch for ch in hypotheses[-1]]))

val_meteor /= len(val_loader.dataset)
val_bleu.append(corpus_bleu(references, hypotheses, weights=(1, 0, 0, 0)))
val_bleu.append(corpus_bleu(references, hypotheses, weights=(0.5, 0.5, 0, 0)))
val_bleu.append(corpus_bleu(references, hypotheses, weights=(0.33, 0.33, 0.33, 0)))
val_bleu.append(corpus_bleu(references, hypotheses, weights=(0.25, 0.25, 0.25, 0.25)))
print("Validation Set: BLEU Scores: 1: {}, 2: {}, 3: {}, 4: {},".format(val_bleu[0], val_bleu[1], val_bleu[2], val_bleu[3]))
print("Validation Set: Meteor Score: {}".format(val_meteor))

'\nEnNet.eval()\nDeNet.eval()\n\nval_bleu = []\nval_meteor = 0\n\nwith torch.no_grad():\n    hypotheses = []\n    references = []\n\n    for img, cap, caps in val_loader:\n        img, cap = torch.autograd.Variable(img).to(device), torch.autograd.Variable(cap).to(device)\n        feats = EnNet(img)\n        weights, sentences = DeNet(feats, cap)\n\n        references.append([val_loader.dataset.TN.VectorToSentence(c) for c in caps[0]])\n        hypotheses.append(val_loader.dataset.TN.VectorToSentence([token for token in torch.max(sentences, dim=2)[1][0] if token!=val_loader.dataset.TN.vocab.stoi["<sos>"] and token!=val_loader.dataset.TN.vocab.stoi["<eos>"] and token!=val_loader.dataset.TN.vocab.stoi["<pad>"]]))\n\n        val_meteor += meteor_score([" ".join([ch for ch in r]) for r in references[-1]], " ".join([ch for ch in hypotheses[-1]]))\n\nval_meteor /= len(val_loader.dataset)\nval_bleu.append(corpus_bleu(references, hypotheses, weights=(1, 0, 0, 0)))\nval_bleu.append(corpus_bleu(r

In [23]:
EnNet.eval()
DeNet.eval()

test_bleu = []
test_meteor = 0

with torch.no_grad():
    hypotheses = []
    references = []

    for img, cap, caps in test_loader:
        img, cap = torch.autograd.Variable(img).to(device), torch.autograd.Variable(cap).to(device)
        feats = EnNet(img)
        weights, sentences = DeNet(feats, cap)

        references.append([test_loader.dataset.TN.VectorToSentence(c) for c in caps[0]])
        hypotheses.append(test_loader.dataset.TN.VectorToSentence([token for token in torch.max(sentences, dim=2)[1][0] if token!=val_loader.dataset.TN.vocab.stoi["<sos>"] and token!=val_loader.dataset.TN.vocab.stoi["<eos>"] and token!=val_loader.dataset.TN.vocab.stoi["<pad>"]]))

        test_meteor += meteor_score([" ".join([ch for ch in r]) for r in references[-1]], " ".join([ch for ch in hypotheses[-1]]))

test_meteor /= len(test_loader.dataset)
test_bleu.append(corpus_bleu(references, hypotheses, weights=(1, 0, 0, 0)))
test_bleu.append(corpus_bleu(references, hypotheses, weights=(0.5, 0.5, 0, 0)))
test_bleu.append(corpus_bleu(references, hypotheses, weights=(0.33, 0.33, 0.33, 0)))
test_bleu.append(corpus_bleu(references, hypotheses, weights=(0.25, 0.25, 0.25, 0.25)))
print("Test Set: BLEU Scores: 1: {}, 2: {}, 3: {}, 4: {},".format(test_bleu[0], test_bleu[1], test_bleu[2], test_bleu[3]))
print("Test Set: Meteor Score: {}".format(test_meteor))



Test Set: BLEU Scores: 1: 0.5434922977585094, 2: 0.31738726576058, 3: 0.1954854887146144, 4: 0.12069149020351676,
Test Set: Meteor Score: 0.31989724270647407


In [28]:
train_loader = pkl.load(open("/content/drive/MyDrive/Data/Train/eval_train_loader.pkl","rb"))

In [29]:
EnNet.eval()
DeNet.eval()

train_bleu = []
train_meteor = 0

with torch.no_grad():
    hypotheses = []
    references = []

    for img, cap, caps in train_loader:
        img, cap = torch.autograd.Variable(img).to(device), torch.autograd.Variable(cap).to(device)
        feats = EnNet(img)
        weights, sentences = DeNet(feats, cap)

        references.append([train_loader.dataset.TN.VectorToSentence(c) for c in caps[0]])
        hypotheses.append(train_loader.dataset.TN.VectorToSentence([token for token in torch.max(sentences, dim=2)[1][0] if token!=val_loader.dataset.TN.vocab.stoi["<sos>"] and token!=val_loader.dataset.TN.vocab.stoi["<eos>"] and token!=val_loader.dataset.TN.vocab.stoi["<pad>"]]))

        train_meteor += meteor_score([" ".join([ch for ch in r]) for r in references[-1]], " ".join([ch for ch in hypotheses[-1]]))

train_meteor /= len(train_loader.dataset)
train_bleu.append(corpus_bleu(references, hypotheses, weights=(1, 0, 0, 0)))
train_bleu.append(corpus_bleu(references, hypotheses, weights=(0.5, 0.5, 0, 0)))
train_bleu.append(corpus_bleu(references, hypotheses, weights=(0.33, 0.33, 0.33, 0)))
train_bleu.append(corpus_bleu(references, hypotheses, weights=(0.25, 0.25, 0.25, 0.25)))
print("Train Set: BLEU Scores: 1: {}, 2: {}, 3: {}, 4: {},".format(train_bleu[0], train_bleu[1], train_bleu[2], train_bleu[3]))
print("Train Set: Meteor Score: {}".format(train_meteor))



Train Set: BLEU Scores: 1: 0.7033579613325665, 2: 0.5697779255252082, 3: 0.4985996175006812, 4: 0.4498096499723183,
Train Set: Meteor Score: 0.5389961187733419
