In [1]:
# Enable IPython's autoreload extension so that edits to imported project
# modules are picked up without restarting the kernel.
%load_ext autoreload
# Mode 2: re-import modules automatically before each cell is executed.
%autoreload 2

In [2]:
import sys
import os

# Project root, taken from the PROJECT_DIRECTORY environment variable
# (raises KeyError if it is not set). Paths in this notebook are built by
# plain string concatenation, so the value must end with a path separator.
BASE_DIR = os.environ['PROJECT_DIRECTORY']

# Make the project's helper and model directories importable.
# Fixed: the first entry used to be misspelled 'sotware_utils/', which does not
# match the `software_utils` package imported in this cell.
sys.path.append(BASE_DIR + 'software_utils/')
sys.path.append(BASE_DIR + 'models/')

from __future__ import print_function
import time
import numpy as np
import matplotlib.pyplot as plt
import nltk
import pickle
import pandas as pd
import argparse

import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
from torch.autograd import Variable

from software_utils.vocabulary import Vocabulary
from software_utils.image_dataloader import get_image_dataloader, ImageDataset
from models.image_captioner import ImageCaptioner

In [3]:
# ---- Locations (all built by concatenation onto BASE_DIR) ----
images_path = BASE_DIR + 'Dataset/coco/images/'
captions_path = BASE_DIR + 'cocoapi/annotations/coco_captions.csv'
models_path = BASE_DIR + 'models/'

# ---- Image feature extractor ----
# Alternative noted by the original author: 'vgg16'.
base_model = 'resnet152'
# Size of one image feature vector fed to the captioner.
# Alternative noted by the original author: 25088 (presumably the 'vgg16'
# feature size -- TODO confirm).
embedding_size = 2048

# ---- Data loading options (forwarded to get_image_dataloader) ----
coco_set = 2014
batch_size = 64
load_features = True
load_captions = True
preload = True

In [4]:
# Vocabulary file handed to get_image_dataloader.
vocab_path = BASE_DIR + 'Data/processed/coco_vocab.pkl'

In [5]:
# Build the validation data loader from the configuration defined above.
# The trailing "\r" lets the "...Done" message overwrite the progress line.
print("Loading validation data...\r", end="")

# Keyword options forwarded unchanged to get_image_dataloader.
val_loader_kwargs = dict(
  embedding_size=embedding_size,
  load_features=load_features,
  load_captions=load_captions,
  model=base_model,
  preload=preload,
)
val_loader = get_image_dataloader(
  'val', coco_set,
  images_path,
  vocab_path, captions_path,
  batch_size,
  **val_loader_kwargs
)

# Explicitly put the underlying dataset into validation mode.
val_loader.dataset.mode = 'val'
print("Loading validation data...Done")

Loading validation data...Done


In [None]:
# Pull the quantities needed to construct the captioner out of the
# validation dataset and its vocabulary.
val_dataset = val_loader.dataset
val_vocab = val_dataset.vocab

vocab_size = val_dataset.get_vocab_size()
# Integer ids of the start/end marker words, looked up in the mapping
# returned by get_idx().
start_id = val_dataset.get_idx()[val_vocab.start_word]
end_id = val_dataset.get_idx()[val_vocab.end_word]
max_caption_length = val_dataset.max_len

print(f"Vocab size {vocab_size}, Caption length {max_caption_length}")

Vocab size 12433, Caption length 30


In [7]:
# Captioner hyper-parameters. The model summary recorded further down shows
# how they are used: Linear(2048 -> 256), Embedding(vocab, 256), LSTM(256, 512).
rnn_type = 'lstm'    # recurrent cell type
hidden_size = 512    # recurrent hidden state width
embed_size = 256     # width of the projected image feature and word embeddings

In [8]:
# Build the captioning model from the configured sizes and vocabulary ids.
# Fixed: use the `rnn_type` configured in the hyper-parameter cell instead of a
# hard-coded 'lstm', so that changing the configuration actually takes effect.
captioner = ImageCaptioner(embedding_size, embed_size,
                           hidden_size, vocab_size,
                           max_caption_length,
                           start_id, end_id,
                           rnn_type=rnn_type)

# Move the model to the GPU when one is available.
if torch.cuda.is_available():
  print("CUDA Available")
  captioner.cuda()

CUDA Available


In [9]:
# Turn `models_path` (the models directory) into the full path of the
# checkpoint to evaluate. The guard makes this cell idempotent: a bare `+=`
# appended the file name again on every re-run, producing a path that does
# not exist.
checkpoint_name = 'image_caption-model11-20-0.1309-5.0.pkl'
if not models_path.endswith(checkpoint_name):
  models_path += checkpoint_name

In [10]:
# Load the trained weights into the captioner.
# map_location='cpu' lets a checkpoint that was saved from a GPU be read on a
# CPU-only machine as well; load_state_dict then copies the values into the
# model's existing parameters on whatever device the model lives on.
# NOTE(review): torch.load unpickles the file, which can execute arbitrary
# code -- only load checkpoints from a trusted source.
checkpoint = torch.load(models_path, map_location='cpu')

captioner.load_state_dict(checkpoint['params'])
# Switch to inference mode (the model contains Dropout and BatchNorm layers);
# as the last expression this also displays the model summary.
captioner.eval()

  checkpoint = torch.load(models_path)


ImageCaptioner(
  (inp): Linear(in_features=2048, out_features=256, bias=True)
  (inp_dropout): Dropout(p=0.2, inplace=False)
  (inp_bn): BatchNorm1d(256, eps=1e-05, momentum=0.01, affine=True, track_running_stats=True)
  (embed): Embedding(12433, 256)
  (rnn): LSTM(256, 512, batch_first=True)
  (out): Linear(in_features=512, out_features=12433, bias=True)
)

In [11]:

# Validation pass with beam_size = 0
# (presumably plain greedy decoding -- confirm against ImageCaptioner.predict).
val_bleu = 0.0
beam_size = 0

val_dataset = val_loader.dataset
# Used both as the progress denominator and to average the per-batch scores;
# hoisted out of the loop because it is re-used unchanged on every step.
num_val_steps = val_dataset.get_seq_len()

# Inference only: disable autograd so no computation graph is built per batch.
with torch.no_grad():
  for val_id, val_batch in enumerate(val_loader):
    # The caption tensor is not used below (references are looked up by
    # index), so it is no longer copied to the GPU.
    idxs, im_embeddings, caption_embeddings = val_batch

    if torch.cuda.is_available():
      im_embeddings = im_embeddings.cuda()

    # Get ground truth captions
    refs = val_dataset.get_references(idxs.numpy())

    preds = captioner.predict(im_embeddings, beam_size=beam_size)

    # Score every decoded prediction against its references and add the
    # batch mean to the running total.
    batch_bleu = 0.0
    for pred_id in range(len(preds)):
      pred = preds[pred_id].cpu().numpy().astype(int)
      pred_embed = val_dataset.vocab.decode(pred, clean=True)
      batch_bleu += val_dataset.vocab.evaluate(refs[pred_id], pred_embed)
    val_bleu += (batch_bleu/len(preds))

    # Progress line for this step
    stats = "Validation step [%d/%d], Bleu: %.4f" \
              % (val_id, num_val_steps, batch_bleu/len(preds))

    print("\r" + stats, end="")
    sys.stdout.flush()

    # Keep a permanent line every 250 steps
    if val_id % 250 == 0:
      print('\r' + stats)

# NOTE(review): this is a mean of per-batch means, so a smaller final batch
# is weighted the same as a full one.
val_bleu /= num_val_steps
print ("\nValidation -- bleu: %.4f" % (val_bleu))

Validation step [0/607], Bleu: 0.1045
Validation step [250/607], Bleu: 0.1423
Validation step [500/607], Bleu: 0.1091
Validation step [606/607], Bleu: 0.1075
Validation -- bleu: 0.1245


In [12]:

# Validation pass with beam_size = 3.
val_bleu = 0.0
beam_size = 3

val_dataset = val_loader.dataset
# Used both as the progress denominator and to average the per-batch scores;
# hoisted out of the loop because it is re-used unchanged on every step.
num_val_steps = val_dataset.get_seq_len()

# Inference only: disable autograd so no computation graph is built per batch.
with torch.no_grad():
  for val_id, val_batch in enumerate(val_loader):
    # The caption tensor is not used below (references are looked up by
    # index), so it is no longer copied to the GPU.
    idxs, im_embeddings, caption_embeddings = val_batch

    if torch.cuda.is_available():
      im_embeddings = im_embeddings.cuda()

    # Get ground truth captions
    refs = val_dataset.get_references(idxs.numpy())

    preds = captioner.predict(im_embeddings, beam_size=beam_size)

    # Score every decoded prediction against its references and add the
    # batch mean to the running total.
    batch_bleu = 0.0
    for pred_id in range(len(preds)):
      pred = preds[pred_id].cpu().numpy().astype(int)
      pred_embed = val_dataset.vocab.decode(pred, clean=True)
      batch_bleu += val_dataset.vocab.evaluate(refs[pred_id], pred_embed)
    val_bleu += (batch_bleu/len(preds))

    # Progress line for this step
    stats = "Validation step [%d/%d], Bleu: %.4f" \
              % (val_id, num_val_steps, batch_bleu/len(preds))

    print("\r" + stats, end="")
    sys.stdout.flush()

    # Keep a permanent line every 250 steps
    if val_id % 250 == 0:
      print('\r' + stats)

# NOTE(review): this is a mean of per-batch means, so a smaller final batch
# is weighted the same as a full one.
val_bleu /= num_val_steps
print ("\nValidation -- bleu: %.4f" % (val_bleu))

Validation step [0/607], Bleu: 0.1409
Validation step [250/607], Bleu: 0.1888
Validation step [500/607], Bleu: 0.1233
Validation step [606/607], Bleu: 0.0951
Validation -- bleu: 0.1310


In [13]:

# Validation pass with beam_size = 5.
val_bleu = 0.0
beam_size = 5

val_dataset = val_loader.dataset
# Used both as the progress denominator and to average the per-batch scores;
# hoisted out of the loop because it is re-used unchanged on every step.
num_val_steps = val_dataset.get_seq_len()

# Inference only: disable autograd so no computation graph is built per batch.
with torch.no_grad():
  for val_id, val_batch in enumerate(val_loader):
    # The caption tensor is not used below (references are looked up by
    # index), so it is no longer copied to the GPU.
    idxs, im_embeddings, caption_embeddings = val_batch

    if torch.cuda.is_available():
      im_embeddings = im_embeddings.cuda()

    # Get ground truth captions
    refs = val_dataset.get_references(idxs.numpy())

    preds = captioner.predict(im_embeddings, beam_size=beam_size)

    # Score every decoded prediction against its references and add the
    # batch mean to the running total.
    batch_bleu = 0.0
    for pred_id in range(len(preds)):
      pred = preds[pred_id].cpu().numpy().astype(int)
      pred_embed = val_dataset.vocab.decode(pred, clean=True)
      batch_bleu += val_dataset.vocab.evaluate(refs[pred_id], pred_embed)
    val_bleu += (batch_bleu/len(preds))

    # Progress line for this step
    stats = "Validation step [%d/%d], Bleu: %.4f" \
              % (val_id, num_val_steps, batch_bleu/len(preds))

    print("\r" + stats, end="")
    sys.stdout.flush()

    # Keep a permanent line every 250 steps
    if val_id % 250 == 0:
      print('\r' + stats)

# NOTE(review): this is a mean of per-batch means, so a smaller final batch
# is weighted the same as a full one.
val_bleu /= num_val_steps
print ("\nValidation -- bleu: %.4f" % (val_bleu))

Validation step [0/607], Bleu: 0.1493
Validation step [250/607], Bleu: 0.1295
Validation step [500/607], Bleu: 0.1221
Validation step [606/607], Bleu: 0.1327
Validation -- bleu: 0.1317
