# COMP5623 Coursework on Image Caption Generation

Starter code.



## Text preparation 

We need to build a vocabulary.

In [0]:
# Mounted Drive if using Colab; otherwise, your local path
root = "drive/My Drive/Colab Notebooks/data/Flickr8k/" # <--- replace this with your root data directory
caption_dir = root + "captions/"                       # <--- replace these too
image_dir = root + "images/"                           # <---


token_file = "Flickr8k.token.txt"

In [0]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


A helper function to read in our ground truth text file.

In [0]:
def read_lines(filepath):
    """ Open the ground truth captions into memory, line by line. """
    file = open(filepath, 'r')
    lines = []

    while True: 
        # Get next line from file until there's no more
        line = file.readline() 
        if not line: 
            break
        lines.append(line.strip())
    file.close() 
    return lines

You can read all the ground truth captions (5 per image), into memory as follows:

In [0]:
lines = read_lines(caption_dir + token_file)

In [0]:
lines[:5]

['1000268201_693b08cb0e.jpg#0\tA child in a pink dress is climbing up a set of stairs in an entry way .',
 '1000268201_693b08cb0e.jpg#1\tA girl going into a wooden building .',
 '1000268201_693b08cb0e.jpg#2\tA little girl climbing into a wooden playhouse .',
 '1000268201_693b08cb0e.jpg#3\tA little girl climbing the stairs to her playhouse .',
 '1000268201_693b08cb0e.jpg#4\tA little girl in a pink dress going into a wooden cabin .']

In [0]:
class Vocabulary(object):
    """Simple vocabulary wrapper which maps every unique word to an integer ID. """
    def __init__(self):
        # Intially, set both the IDs and words to empty dictionaries.
        self.word2idx = {}
        self.idx2word = {}
        self.idx = 0

    def add_word(self, word):
        # If the word does not already exist in the dictionary, add it
        if not word in self.word2idx:
            self.word2idx[word] = self.idx
            self.idx2word[self.idx] = word
            # Increment the ID for the next word
            self.idx += 1

    def __call__(self, word):
        # If we try to access a word in the dictionary which does not exist, return the <unk> id
        if not word in self.word2idx:
            return self.word2idx['<unk>']
        return self.word2idx[word]

    def __len__(self):
        return len(self.word2idx)

Extract all the words from ```lines```, and create a list of them in a variable ```words```, for example:

```words = ["a", "an", "the", "cat"... ]```

No need to worry about duplicates.


In [0]:
import itertools
from collections import Counter
words = []
words_split = []
for i in range(len(lines)):
  lines[i] = lines[i].replace('.', '') # Fullstops
  lines[i] = lines[i].replace(',', '') # Commas
  words.append(lines[i].lower().split())
  words_split.append(lines[i].lower().split())

words = list(itertools.chain.from_iterable(words))
word_count = dict(Counter(words))

del_list = []
for key in word_count:
  if word_count[key] <= 3 and 'jpg' not in key:
    del_list.append(key)

for word in words:
  if word in del_list:
    words.remove(word)

In [0]:
len(words)

469706

In [0]:
image_ids = []
del_ids = []

for i in range(len(words_split)):
  del_ids.append(words_split[i][0])
  img = words_split[i][0][:-5]
  image_ids.append(img)

cleaned_captions = []
captions = words_split
for i in range(len(captions)):
  for del_id in del_ids:
    if del_id in captions[i]:
      captions[i].remove(del_id)
  for del_word in del_list:
    if del_word in captions[i]:
      captions[i].remove(del_word)
  words_join = ' '.join(captions[i])
  cleaned_captions.append(words_join)

In [0]:
len(cleaned_captions)

40455

Build the vocabulary.

In [0]:
# Create a vocab instance
vocab = Vocabulary()

# Add the token words first
vocab.add_word('<pad>')
vocab.add_word('<start>')
vocab.add_word('<end>')
vocab.add_word('<unk>')

Add the rest of the words from the parsed captions:

``` vocab.add_word('new_word')```

Don't add words that appear three times or less.

In [0]:
for word in words:
  if 'jpg' not in word:
    vocab.add_word(word)

In [0]:
print(len(vocab))

3748


## Dataset and loaders for training

Keeping the same order, concatenate all the cleaned words from each caption into a string again, and add them all to a list of strings ```cleaned_captions```. Store all the image ids in a list ```image_ids```.

The dataframe for the image paths and captions.

In [0]:
import pandas as pd

data = {
    'image_id': image_ids,
    'path': [image_dir + image_id + ".jpg" for image_id in image_ids],
    'caption': cleaned_captions
}

data_df = pd.DataFrame(data, columns=['image_id', 'path', 'caption'])

In [0]:
data_df.head(n=5)

Unnamed: 0,image_id,path,caption
0,1000268201_693b08cb0e,drive/My Drive/Colab Notebooks/data/Flickr8k/i...,a child in a pink dress is climbing up a set o...
1,1000268201_693b08cb0e,drive/My Drive/Colab Notebooks/data/Flickr8k/i...,a girl going into a wooden building
2,1000268201_693b08cb0e,drive/My Drive/Colab Notebooks/data/Flickr8k/i...,a little girl climbing into a wooden playhouse
3,1000268201_693b08cb0e,drive/My Drive/Colab Notebooks/data/Flickr8k/i...,a little girl climbing the stairs to her playh...
4,1000268201_693b08cb0e,drive/My Drive/Colab Notebooks/data/Flickr8k/i...,a little girl in a pink dress going into a woo...


This is the Flickr8k class for the dataset.

In [0]:
from PIL import Image
import cv2
from nltk import tokenize
from torch.utils.data import Dataset

class Flickr8k(Dataset):
    """ Flickr8k custom dataset compatible with torch.utils.data.DataLoader. """
    
    def __init__(self, df, vocab, transform=None):
        """ Set the path for images, captions and vocabulary wrapper.
        
        Args:
            df: df containing image paths and captions.
            vocab: vocabulary wrapper.
            transform: image transformer.
        """
        self.df = df
        self.vocab = vocab
        self.transform = transform

    def __getitem__(self, index):
        """ Returns one data pair (image and caption). """

        vocab = self.vocab

        caption = self.df['caption'][index]
        img_id = self.df['image_id'][index]
        path = self.df['path'][index]

        image = Image.open(open(path, 'rb'))

        if self.transform is not None:
            image = self.transform(image)

        # Convert caption (string) to word ids.
        tokens = caption.split()
        caption = []
        # Build the Tensor version of the caption, with token words
        caption.append(vocab('<start>'))
        caption.extend([vocab(token) for token in tokens])
        caption.append(vocab('<end>'))
        target = torch.Tensor(caption)
        return image, target

    def __len__(self):
        return len(self.df)

We need to overwrite the default PyTorch ```collate_fn()``` because our ground truth captions are sequential data of varying lengths. The default ```collate_fn()``` does not support merging the captions with padding.

You can read more about it here: https://pytorch.org/docs/stable/data.html#dataloader-collate-fn. 

In [0]:
def caption_collate_fn(data):
    """ Creates mini-batch tensors from the list of tuples (image, caption).
    Args:
        data: list of tuple (image, caption). 
            - image: torch tensor of shape (3, 256, 256).
            - caption: torch tensor of shape (?); variable length.
    Returns:
        images: torch tensor of shape (batch_size, 3, 256, 256).
        targets: torch tensor of shape (batch_size, padded_length).
        lengths: list; valid length for each padded caption.
    """
    # Sort a data list by caption length from longest to shortest.
    data.sort(key=lambda x: len(x[1]), reverse=True)
    images, captions = zip(*data)

    # Merge images (from tuple of 3D tensor to 4D tensor).
    images = torch.stack(images, 0)

    # Merge captions (from tuple of 1D tensor to 2D tensor).
    lengths = [len(cap) for cap in captions]
    targets = torch.zeros(len(captions), max(lengths)).long()
    for i, cap in enumerate(captions):
        end = lengths[i]
        targets[i, :end] = cap[:end]        
    return images, targets, lengths

Now we define the data transform.

In [0]:
from torchvision import transforms

# Crop size matches the input dimensions expected by the pre-trained ResNet
data_transform = transforms.Compose([ 
    transforms.Resize(224),
    transforms.CenterCrop(224),  # Why do we choose 224 x 224?
    transforms.ToTensor(),
    transforms.Normalize((0.485, 0.456, 0.406),   # Using ImageNet norms
                         (0.229, 0.224, 0.225))])

Initialising the datasets. The only twist is that every image has 5 ground truth captions, so each image appears five times in the dataframe. We don't want an image to appear in more than one set.

In [0]:
unit_size = 5

train_split = 0.95 # Defines the ratio of train/test data.

# We didn't shuffle the dataframe yet so this works
train_size = unit_size * round(len(data_df)*train_split / unit_size)

dataset_train = Flickr8k(
    df=data_df[:train_size].reset_index(drop=True),
    vocab=vocab,
    transform=data_transform,
)

dataset_test = Flickr8k(
    df=data_df[(train_size):].reset_index(drop=True),
    vocab=vocab,
    transform=data_transform,
)

Write the dataloaders ```train_loader``` and ```test_loader``` - explicitly replacing the collate_fn:

```train_loader = torch.utils.data.DataLoader(
  ...,
  collate_fn=caption_collate_fn
)```

Set train batch size to 128 and be sure to set ```shuffle=True```

In [0]:
import torch
import torch.nn as nn
import torchvision.models as models
from torch.nn.utils.rnn import pack_padded_sequence

train_loader = torch.utils.data.DataLoader(
    dataset_train,
    batch_size=128, 
    shuffle=True,
    num_workers=0,
    collate_fn=caption_collate_fn
)

test_loader = torch.utils.data.DataLoader(
    dataset_test,
    batch_size=5, 
    shuffle=False,
    num_workers=0,
    collate_fn=caption_collate_fn
)

## Encoder and decoder models

In [0]:
import torch
import torch.nn as nn
import torchvision.models as models
from torch.nn.utils.rnn import pack_padded_sequence

class EncoderCNN(nn.Module):
    def __init__(self, embed_size):
        """Load the pretrained ResNet-152 and replace top fc layer."""
        super(EncoderCNN, self).__init__()
        resnet = models.resnet152(pretrained=True) # Pre-trained on ImageNet by default
        layers = list(resnet.children())[:-1]      # Keep all layers except the last one
        # Unpack the layers and create a new Sequential
        self.resnet = nn.Sequential(*layers)
        
        # We want a specific output size, which is the size of our embedding, so
        # we feed our extracted features from the last fc layer (dimensions 1 x 1000)
        # into a Linear layer to resize
        self.linear = nn.Linear(resnet.fc.in_features, embed_size)
        
        # Batch normalisation helps to speed up training
        self.bn = nn.BatchNorm1d(embed_size, momentum=0.01)
        
    def forward(self, images):
        """Extract feature vectors from input images."""

        with torch.no_grad():
            features = self.resnet(images)
        features = features.reshape(features.size(0), -1)
        features = self.bn(self.linear(features))
        return features
        
     
        


class DecoderRNN(nn.Module):
    def __init__(self, embed_size, hidden_size, vocab_size, num_layers, max_seq_length=20):
        """Set the hyper-parameters and build the layers."""
        super(DecoderRNN, self).__init__()
        
        # What is an embedding layer?
        self.embed = nn.Embedding(vocab_size, embed_size)

        # Define this layer (one at a time)
        # self.lstm / self.rnn

 
        self.lstm = nn.LSTM(embed_size, hidden_size, num_layers, batch_first=True)

       
        # self.rnn = nn.RNN(embed_size, hidden_size, num_layers, batch_first=True)
        
        self.linear = nn.Linear(hidden_size, vocab_size)
        self.max_seq_length = max_seq_length
        
    def forward(self, features, captions, lengths):
        """Decode image feature vectors and generates captions."""
        embeddings = self.embed(captions)
        embeddings = torch.cat((features.unsqueeze(1), embeddings), 1)
        # What is "packing" a padded sequence?
        packed = pack_padded_sequence(embeddings, lengths, batch_first=True) 
        hiddens, _ = self.lstm(packed) # Replace with self.rnn when using RNN
        outputs = self.linear(hiddens[0])
        return outputs
    
    def sample(self, features, states=None):
        """Generate captions for given image features using greedy search."""
        sampled_ids = []
        inputs = features.unsqueeze(1)
        for i in range(self.max_seq_length):
            hiddens, states = self.lstm(inputs, states)          # hiddens: (batch_size, 1, hidden_size)
            outputs = self.linear(hiddens.squeeze(1))            # outputs:  (batch_size, vocab_size)
            _, predicted = outputs.max(1)                        # predicted: (batch_size)
            sampled_ids.append(predicted)
            inputs = self.embed(predicted)                       # inputs: (batch_size, embed_size)
            inputs = inputs.unsqueeze(1)                         # inputs: (batch_size, 1, embed_size)
        sampled_ids = torch.stack(sampled_ids, 1)                # sampled_ids: (batch_size, max_seq_length)
        return sampled_ids

In [0]:
# Device configuration
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

Set training parameters.

In [0]:
embed_size = 256
hidden_size = 512
num_layers = 1
learning_rate = 0.001
num_epochs = 5
log_step = 10
save_step = 1

Initialize the models and set the learning parameters.

In [0]:
import numpy as np

# Build the models
encoder = EncoderCNN(embed_size).to(device)
decoder = DecoderRNN(embed_size, hidden_size, len(vocab), num_layers).to(device)

# Loss and optimizer
criterion = nn.CrossEntropyLoss()

# Optimisation will be on the parameters of BOTH the enocder and decoder,
# but excluding the ResNet parameters, only the new added layers.
params = list(
    decoder.parameters()) + list(encoder.linear.parameters()) + list(encoder.bn.parameters()
)

optimizer = torch.optim.Adam(params, lr=learning_rate)


## Training the model

The loop to train the model. Feel free to put this in a function if you prefer.

In [0]:
# images, captions, lengths = [x[0] for x in iter (test_loader).next()]

def display_image(display_image):
    tensor_image = display_image + 1
    tensor_image = tensor_image - tensor_image.min()
    picture = tensor_image / (tensor_image.max() - tensor_image.min())

    plt.imshow(picture.permute(1, 2, 0))
    plt.axis("off")


def calculate_bleu(reference, candidate):
    score = sentence_bleu(reference, candidate) 
    return score

def reference_split(reference_unsplit):
    return [sentence.split(" ") for sentence in reference_unsplit]

def candidate_split(candidate_unsplit):
    return candidate_unsplit.split(" ")

def find_captions(caption_tensors):
    reference_captions = []

    for i in range(5):
      reference_captions.append(get_sentence(caption_tensors[i].cpu().numpy()))

    return reference_captions

def get_sentence(caption_tensor): 
    # Convert word_ids to words
    sampled_caption = []
    for word_id in caption_tensor:
        word = vocab.idx2word[word_id]
        sampled_caption.append(word)
        if word == '<end>':
            break
    sentence = ' '.join(sampled_caption[1:-1])
    return sentence

In [0]:
from nltk.translate.bleu_score import sentence_bleu, corpus_bleu, SmoothingFunction
import matplotlib.pyplot as plt
import numpy as np

sample_image = []
sample_caption = []
predicted_caption = []
temp_images = []
temp_captions = []
encoder.eval()
decoder.eval()
for (images, captions, lengths) in test_loader:

  temp_images.append(images)

  temp_captions.append(captions)

sample_image = temp_images[2][2]
sample_caption = temp_captions[2][2]
sample_image = sample_image.to(device)
sample_caption = sample_caption.cpu()

features = encoder(images.to(device))
wordIDs = decoder.sample(features)

caption = decoder.sample(features).cpu().detach().numpy()
for word in caption[0]:
  predicted_caption.append(vocab.idx2word[word])

# display_image(sample_image.cpu())

# newcaption = np.array(wordIDs.cpu()[:1]).squeeze(0)

# candidate_caption = [vocab.idx2word[word] for word in newcaption]
# # print("Candidate")
# print(predicted_caption)

# sample_caption = sample_caption.cpu().squeeze(0).tolist()
# actual_caption = [vocab.idx2word[word] for word in sample_caption]
# # print("Reference")
# # print(actual_caption)

# score = sentence_bleu(actual_caption, sample_caption)
# # print("blue_score:\n" + str(score))

In [0]:
import matplotlib.pyplot as plt
# plt.imshow(images.permute(1, 2, 0)  )
# plt.axis("off")

In [0]:
# encoder.train()
# decoder.train()
# loss1 = []
# # Train the models
# total_step = len(train_loader)
# for epoch in range(num_epochs):
#     for i, (images, captions, lengths) in enumerate(train_loader):

#         # Set mini-batch dataset
#         images = images.to(device)
#         captions = captions.to(device)

#         # Packed as well as we'll compare to the decoder outputs
#         targets = pack_padded_sequence(captions, lengths, batch_first=True)[0]

#         # Forward, backward and optimize
#         features = encoder(images)
#         outputs = decoder(features, captions, lengths)

#         loss = criterion(outputs, targets)
        
#         # Zero gradients for both networks
#         decoder.zero_grad()
#         encoder.zero_grad()
   
#         loss.backward()
#         optimizer.step()

#         # Print log info
#         if i % log_step == 0:
#             print('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}'
#                   .format(epoch, num_epochs, i, total_step, loss.item())) 

#         # If you want to save the model checkpoints - recommended once you have everything working
#         # Make sure to save RNN and LSTM versions separately
#         # if (i+1) % save_step == 0:
#     loss1.append(loss.item())
#     # torch.save(decoder.state_dict(),  'decoder{}.ckpt'.format(epoch+1))
#     # torch.save(encoder.state_dict(),  'encoder{}.ckpt'.format(epoch+1))
#     # Encoder = torch.load('drive/My Drive/Colab Notebooks/RNN/decoder1.ckpt')

In [0]:
# print(loss1)

In [0]:
# epochs = range(len(loss1))
# nb_epochs = len(epochs)
# plt.axis((0, 4, 2.1, 3.1))
# plt.plot(epochs, loss1, label = 'Loss Progression')
# plt.plot(epochs, loss1, 'o', label = 'Training loss at each Epoch')
# plt.legend()
# plt.draw()
# plt.title("RNN Decoder")
# plt.pause(0.01)

### Load Models 



In [0]:
#537559285_29be110134.jpg
import os

def load_image(image_path, transform=None):
    image = Image.open(image_path).convert('RGB')
    image = image.resize([224, 224], Image.LANCZOS)
    
    if transform is not None:
        image = transform(image).unsqueeze(0)
    
    return image


transform = transforms.Compose([
                                transforms.ToTensor(), 
                                transforms.Normalize((0.485, 0.456, 0.406), 
                                                    (0.229, 0.224, 0.225))])

all_models = os.listdir('/content/drive/My Drive/Colab Notebooks/data/RNN/')
#all_models.sort()
# print(all_models)
test_encoder = EncoderCNN(embed_size).to(device)
test_decoder = DecoderRNN(embed_size, hidden_size, len(vocab), num_layers).to(device)

# THERE ARE LSTM 12 FILES AND RNN 12 FILES IN THE GOOGL DRIVE SAVED AFTER TRAINING
checkpoint_decoder = torch.load('/content/drive/My Drive/Colab Notebooks/data/LSTM/decoder3.ckpt') 
checkpoint_encoder = torch.load('/content/drive/My Drive/Colab Notebooks/data/LSTM/encoder3.ckpt') 
file_decoder = '/content/drive/My Drive/Colab Notebooks/data/LSTM/decoder5.ckpt'
file_encoder = '/content/drive/My Drive/Colab Notebooks/data/LSTM/encoder5.ckpt'
# NOW LOAD THE CHECKPOINTS TO THE MODELS
test_encoder.load_state_dict(checkpoint_encoder)
test_decoder.load_state_dict(checkpoint_decoder)

# SETTING THE MODELS TO EVAL MODE
test_encoder.eval()
test_decoder.eval()

test_image = load_image(image_dir + "537559285_29be110134.jpg", transform)
test_image = test_image.to(device)

sample_feature = test_encoder(test_image)
sampled_ids = test_decoder.sample(sample_feature)
sampled_ids = sampled_ids[0].cpu().numpy()

test_image = Image.open(image_dir + "537559285_29be110134.jpg")

# sampled_caption = []
# for word_id in sampled_ids:
#     word = vocab.idx2word[word_id]
#     sampled_caption.append(word)
#     if word == '<end>':
#         break
# sentence = ' '.join(sampled_caption[1:-1])

# plt.title(sentence)
# plt.axis("off")
# plt.imshow(test_image)

In [0]:
# ref = [['the', 'little', 'girl', 'wearing', 'a', 'pink', 'hat', 'is', 'bending', 'down', 'to', 'pick', 'up', 'a', 'soccer', 'ball'], ['a', 'little', 'girl', 'in', 'a', 'pink', 'hat', 'is', 'playing', 'with', 'a', 'soccer', 'ball'], ['a', 'young', 'girl', 'is', 'playing', 'with', 'a', 'soccer', 'ball', 'in', 'the', 'grass'], ['young', 'child', 'playing', 'with', 'a', 'soccer', 'ball', 'in', 'a', 'grassy', 'area'], ['little', 'girl', 'with', 'pink', 'hat', 'playing', 'with', 'a', 'soccer', 'ball']]
# can = ['a', 'child', 'in', 'a', 'red', 'collar', 'is', 'running', 'through', 'a', 'field']
# # print("Reference Caption:", ref)
# print("Generated Caption:", can)
# print("Bleu Score:",sentence_bleu(ref,can, weights=(0.10,0.10,0.10,0.10), smoothing_function=SmoothingFunction().method7), )

In [0]:
# ref = [['a','collie', 'is', 'running', 'through', 'an', 'obstacle', 'course'],['collie', 'jumping', 'over', 'a', 'training', 'hurdle', 'that', 'is', 'on', 'the', 'grass'],['collie', 'making', 'a', 'jump', 'over', 'a', 'yellow', 'hurdle'], ['the', 'dog', 'is', 'leaping', 'over', 'a', 'hurdle']]
# can = ['a', 'dog', 'leaps', 'over', 'a', 'hurdle']
# # print("Reference Caption:", ref)
# print("Generated Caption:", can)
# print("Bleu Score:",sentence_bleu(ref,can, weights=(0.10,0.10,0.10,0.10), smoothing_function=SmoothingFunction().method7), )

In [0]:

# def display(display_image):
#     tensor_image = display_image + 1
#     tensor_image = tensor_image - tensor_image.min()
#     picture = tensor_image / (tensor_image.max() - tensor_image.min())
#     # plt.imshow(np.transpose(picture[1], (1,2,0)))
    

#     plt.imshow(picture)
#     plt.axis("off")

In [0]:
bleu_scores = []
total_references=[]
for i, (images, targets, lengths) in enumerate(test_loader): # Iterating the test data loader
  image = images.to(device)
  #image_features = encoder(image)
  test_encoder.eval()
  test_decoder.eval()
  refs = reference_split(find_captions(targets))
  total_references.append(refs)

#test_image = load_image(image_dir + "1007320043_627395c3d8.jpg", transform)
#test_image = test_image.to(device)
  # print ("Epoch", file_encoder)
  sample_feature = test_encoder(image)
  sampled_ids = test_decoder.sample(sample_feature)
  sampled_ids = sampled_ids[0].cpu().numpy()
  #sampled_ids = decoder.sample(image_features) 
  #sampled_ids = sampled_ids[0].cpu().data.numpy()
  referenced_ids = targets[0].cpu().data.numpy()
  reference_sentence = get_sentence(referenced_ids)
  predicted_sentence = get_sentence(sampled_ids)
  bleu_score = sentence_bleu(refs,predicted_sentence, weights=(0.10,0.10,0.10,0.10), smoothing_function=SmoothingFunction().method7)
  bleu_scores.append(bleu_score)
  

  # #Displaying image
  # img = image.cpu().numpy()
  # # display(img)
  # plt.imshow(np.transpose(img[0], (1,2,0)))
  # plt.axis("off")
  # plt.show()
  
  print("Bleu Score",bleu_score)
  if i in bleu_scores == [4]:
    break
  #print("Actual Sentence",reference_sentence)
  print("Reference:", reference_sentence[0:])
  # print("Predicted:",predicted_sentence)
  # print(refs)
  # if i in img[i] <= [4]:
    # break

Bleu Score 0.6445415768239305
Reference: a child playing on the monkey bars at a playground with an adult
Bleu Score 0.6347773372843615
Reference: collie jumping over a training hurdle that is on the grass
Bleu Score 0.65223873280901
Reference: the little girl wearing a pink hat is bending down to pick up a soccer ball
Bleu Score 0.6492426528865304
Reference: a little girl walking on the green grass in front of a big stone
Bleu Score 0.6394583629945195
Reference: two children holding hands going down a large inflatable slide
Bleu Score 0.6478232686564145
Reference: a woman with a green shirt takes a drink from a water fountain
Bleu Score 0.6453631662275289
Reference: a man is holding three things on fire in front of a child 's playground
Bleu Score 0.6448210563693623
Reference: a man is playing with a fire baton in the day light
Bleu Score 0.6476027287498433
Reference: a girl is doing the splits in the air in front of some trees
Bleu Score 0.6413802326715403
Reference: a man wearing a 

KeyboardInterrupt: ignored

In [0]:
# #BLEU Score for entire test set
# test_loader_iter = iter(test_loader)
# total_references=[]
# total_candidates=[]
# bleu_list=[]
# encoder.eval()
# for batch_id, (img, cap, size) in enumerate(test_loader_iter):
#     refs = reference_split(find_captions(cap))
#     cads = candidate_split(generate_caption(img))
#     bleu_list.append(sentence_bleu(refs, cads, weights=(0.50,0.35,0.10,0.05), smoothing_function=SmoothingFunction().method7))
#     total_references.append(refs)
#     total_candidates.append(cads)
#     print(refs)
#     print(cads)
#     print(sentence_bleu(refs, cads, weights=(0.50,0.35,0.10,0.05), smoothing_function=SmoothingFunction().method7))
# encoder.train()
# c=[len(i) for i in total_candidates]

# print("bleu score rnn test (0.50,0.35,0.10,0.05) is: ", corpus_bleu(total_references, total_candidates, weights=(0.50,0.35,0.10,0.05), smoothing_function=SmoothingFunction().method7) )


In [0]:
# for i, (images, targets, lengths) in enumerate(test_loader): # Iterating the test data loader
#   image = images.to(device)
#   image_features = encoder(image)
#   sampled_ids = decoder.sample(image_features) 
#   sampled_ids = sampled_ids[0].cpu().data.numpy()
#   referenced_ids = targets[0].cpu().data.numpy()
#   reference_sentence = get_sentence(referenced_ids)
#   predicted_sentence = get_sentence(sampled_ids)
#   bleu_score = sentence_bleu(reference_sentence, predicted_sentence, weights=(0.50,0.35,0.10,0.05), smoothing_function=SmoothingFunction().method7)
  

#   #Displaying image
#   # img = image.cpu().numpy()
#   # plt.imshow(np.transpose(img[0], (1,2,0)))
#   # plt.show()


#   # display_image(img)