# Download the Images

In [None]:
import urllib.request 
for json in tqdm_notebook(paragraphs):
    urllib.request.urlretrieve(json['url'], "data/" + str(json['image_id']) + ".jpg")

# Necessary Packages and Imports

In [5]:
import json
import pickle
import random
import math
from collections import Counter
from nltk.corpus import wordnet as wn


import pylab as plt

import nltk
sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torchvision.transforms as transforms
import torchvision.models as models
import torch.utils.data as data
from torch.autograd import Variable
from torch.nn.utils.rnn import pack_padded_sequence

from PIL import Image
from IPython.display import Image as DisplayImage

import spacy
from spacy.vocab import Vocab
from spacy.tokens import Doc
nlp = spacy.load('en')

from tqdm import tqdm_notebook
import progressbar

import os
# GPU Selection
os.environ['CUDA_VISIBLE_DEVICES'] = '0'

from nets.vgg16 import vgg16
from nets.resnet_v1 import resnetv1
from model.test import im_detect
import cv2

In [10]:
net = vgg16()
#net = resnetv1(num_layers=152)
net.create_architecture(21, tag='default', anchor_scales=[8, 16, 32])
net.load_state_dict(torch.load("vgg16_faster_rcnn_iter_1190000.pth", map_location=lambda storage, loc: storage))
net.eval()
if not torch.cuda.is_available():
    net._device = 'cpu'
net.to(net._device)

RuntimeError: Error(s) in loading state_dict for vgg16:
	While copying the parameter named "rpn_cls_score_net.weight", whose dimensions in the model are torch.Size([18, 512, 1, 1]) and whose dimensions in the checkpoint are torch.Size([24, 512, 1, 1]).
	While copying the parameter named "rpn_cls_score_net.bias", whose dimensions in the model are torch.Size([18]) and whose dimensions in the checkpoint are torch.Size([24]).
	While copying the parameter named "rpn_bbox_pred_net.weight", whose dimensions in the model are torch.Size([36, 512, 1, 1]) and whose dimensions in the checkpoint are torch.Size([48, 512, 1, 1]).
	While copying the parameter named "rpn_bbox_pred_net.bias", whose dimensions in the model are torch.Size([36]) and whose dimensions in the checkpoint are torch.Size([48]).
	While copying the parameter named "cls_score_net.weight", whose dimensions in the model are torch.Size([21, 4096]) and whose dimensions in the checkpoint are torch.Size([81, 4096]).
	While copying the parameter named "cls_score_net.bias", whose dimensions in the model are torch.Size([21]) and whose dimensions in the checkpoint are torch.Size([81]).
	While copying the parameter named "bbox_pred_net.weight", whose dimensions in the model are torch.Size([84, 4096]) and whose dimensions in the checkpoint are torch.Size([324, 4096]).
	While copying the parameter named "bbox_pred_net.bias", whose dimensions in the model are torch.Size([84]) and whose dimensions in the checkpoint are torch.Size([324]).

In [None]:
print(net)

In [None]:
with open("../../controllable-captioning/raw/splits/train_split.json", "r") as f:
    train_split = json.load(f)
with open("../../controllable-captioning/raw/splits/dev_split.json", "r") as f:
    dev_split = json.load(f)
with open("../../controllable-captioning/raw/splits/test_split.json", "r") as f:
    test_split = json.load(f)
    
with open("../../controllable-captioning/raw/paragraphs_topics_v1.pickle", "rb") as f:
    paragraph_topics = pickle.load(f)
    
with open("../../controllable-captioning/raw/paragraphs_v1.json", "r") as f:
    paragraphs_json = json.load(f)

In [None]:
def parse_data(paragraphs_json, progress_bar=True):
    train_data = []
    dev_data = []
    test_data = []
    image_ids = {}
    topic_set = set()
    for i, json in enumerate(tqdm_notebook(paragraphs_json) if progress_bar else paragraphs_json):
        topic_to_seq = {}
        for j, sentence in enumerate(sent_detector.tokenize(json['paragraph'])):
            sentence = sentence.strip().lower()
            t = nlp(sentence)
            image_id = json['image_id']
            image_ids[image_id] = i
            
            if 'perfect_match' in paragraph_topics[i][j]:
                topic_list = set([topic[0] for topic in paragraph_topics[i][j]['perfect_match']])
                for topic in topic_list:
                    topic_set.add(topic)
                    if topic not in topic_to_seq:
                        topic_to_seq[topic] = []
                    topic_to_seq[topic].extend(t)
            for topic in topic_to_seq:
                if image_id in train_split:
                    train_data.append((image_id, topic, topic_to_seq[topic]))
                elif image_id in dev_split:
                    dev_data.append((image_id, topic, topic_to_seq[topic]))
                elif image_id in test_split:
                    test_data.append((image_id, topic, topic_to_seq[topic]))
    return train_data, dev_data, test_data, image_ids, topic_set
                        
train_data, dev_data, test_data, image_ids, topic_set = parse_data(paragraphs_json)
print("Length of train split: %d" %len(train_data))
print("Length of dev split: %d" %len(dev_data))
print("Length of test split: %d" %len(test_data))

In [None]:
class Vocabulary(object):
    """A vocabulary wrapper, contains a word_to_index dictionary and a index_to_word list"""
    def __init__(self):
        self.word_to_index = {}
        self.index_to_word = []
        self.index = 0

    def add_word(self, word):
        if not word in self.word_to_index:
            self.word_to_index[word] = self.index
            self.index_to_word.append(word)
            self.index += 1

    def __call__(self, word):
        if type(word) == str:
            if not word in self.word_to_index:
                return self.word_to_index['<UNK>']
            return self.word_to_index[word]
        else:
            return self.index_to_word[word]

    def __len__(self):
        return self.index

def build_vocab(sentences, min_occurrences):
    """Builds a Vocabulary object"""
    counter = Counter()
    for sentence in tqdm_notebook(sentences):
        for word in sentence:
            counter[word.text] += 1

    # a word must appear at least min_occurrence times to be included in the vocabulary
    words = [word for word, count in counter.items() if count >= min_occurrences]

    # Creating a vocabulary object
    vocab = Vocabulary()
    vocab.add_word('<SOS>')
    vocab.add_word('<EOS>')
    vocab.add_word('<UNK>')

    # Adds the words from the captions to the vocabulary
    for word in words:
        vocab.add_word(word)
    return vocab

word_vocab = build_vocab([val[2] for val in train_data], 5)
topic_vocab = Vocabulary()
for topic in topic_set:
    topic_vocab.add_word(topic)
print("Length of word vocab: %d" %len(word_vocab))
print("Number of topics: %d" %len(topic_vocab))
print(topic_set)

In [None]:
class BatchedData(object):
    def __init__(self, batch_size):
        self.batched_data = []
        self.index = 0
        self.batch_size = batch_size

    def add_batch(self, batch):
        if len(batch) == self.batch_size:
            self.batched_data.append(batch)
        else:
            print("not the correct size batch!")

    def __call__(self, index):
        if not index < len(self.batched_data):
            return []
        return self.batched_data[index]

    def __len__(self):
        return len(self.batched_data)
    
def batch_data(data, batch_size, max_size=None, progress_bar=True, randomize=True):
    batched_set = {}
    counter = 0
    for (image_id, topic, sentence) in (tqdm_notebook(data) if progress_bar else data):
        # accounting for SOS and EOS tokens
        sentence = [token.text for token in sentence]
        caption_len = len(sentence) + 2
        if caption_len not in batched_set.keys():
            batched_set[caption_len] = []
        batched_set[caption_len].append((image_id, topic, [token for token in sentence]))

    batched_data = BatchedData(batch_size)
    curr_size = 0

    for i in batched_set.keys():
        if len(batched_set[i]) >= batch_size:
            batch = batched_set[i]
            random.shuffle(batch)
        for j in range(len(batch) // batch_size):
            if max_size is None or curr_size < max_size:
                batched_data.add_batch(batch[batch_size * j : batch_size * (j+1)])
                curr_size += 1
    if randomize:
        random.shuffle(batched_data.batched_data)
    return batched_data

batch_size = 32

batched_train_data = batch_data(train_data, batch_size, progress_bar=False)
batched_dev_data = batch_data(dev_data, batch_size, progress_bar=False)
batched_single_dev_data = batch_data(dev_data, 1, progress_bar=False, randomize=False)
batched_test_data = batch_data(test_data, 1, progress_bar=False, randomize=False)

print("number of train batches: %d" %len(batched_train_data))
print("number of dev batches: %d" %len(batched_dev_data))
print("number of test batches: %d" %len(batched_test_data))

In [None]:
transform = transforms.Compose([
    transforms.Resize(256),
    transforms.RandomCrop(224),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406],
      std=[0.229, 0.224, 0.225])
])

In [None]:
class CustomDataSet(data.Dataset):
    def __init__(self, paragraphs, batched_captions, word_vocab, topic_vocab, transform=None):
        """
        Set the path for images, captions and vocabulary wrapper.
    
        Args:
        
                vocab: vocabulary wrapper.
                transform: image transformer.
        """
        self.paragraphs = paragraphs
        self.batched_captions = batched_captions
        self.word_vocab = word_vocab
        self.topic_vocab = topic_vocab
        self.transform = transform

    def __getitem__(self, index):
        """Returns one data pair (image and caption)."""
        images = []
        topics = []
        captions = []
        img_ids = []
        for (image_id, topic, sentence) in self.batched_captions(index):
            #image = Image.open("../data/" + str(image_id) + ".jpg").convert('RGB')
            image = cv2.imread("../data/" + str(image_id) + ".jpg")

            #if self.transform is not None:
            #    image = self.transform(image)
            images.append(image)
            topics.append(self.topic_vocab(topic))
            img_ids.append(image_id)
            captions.append([self.word_vocab('<SOS>')] + [self.word_vocab(token) for token in sentence] + [self.word_vocab('<EOS>')])

        lengths = [len(caption) for caption in captions]
        #return torch.stack(images, 0), torch.LongTensor(topics), torch.LongTensor(captions), lengths, img_ids
        return images, torch.LongTensor(topics), torch.LongTensor(captions), lengths, img_ids

    def __len__(self):
        return len(self.batched_captions)
def collate_fn(data):
    """
    Creates mini-batch tensors from the list of tuples (image, caption).
  
    We should build custom collate_fn rather than using default collate_fn, 
    because merging captions (including padding) is not supported in default.
    Args:
        data: list of tuple (image, caption). 
            - image: torch tensor of shape (3, 256, 256).
            - caption: torch tensor of shape (?); variable length.
    Returns:
        images: torch tensor of shape (batch_size, 3, 256, 256).
        targets: torch tensor of shape (batch_size, padded_length).
        lengths: list; valid length for each padded caption.
    """
    images, topics, captions, lengths, image_ids = zip(*data)
    return images[0], topics[0], captions[0], lengths[0], image_ids[0]


def get_loader(paragraphs, batched_data, word_vocab, topic_vocab, transform, shuffle, num_workers):
    """Returns torch.utils.data.DataLoader for custom coco dataset."""
    # COCO caption dataset
    data_set = CustomDataSet(paragraphs, batched_data, word_vocab, topic_vocab, transform)
    data_loader = torch.utils.data.DataLoader(dataset=data_set, 
                                            shuffle=shuffle,
                                            num_workers=num_workers,
                                            collate_fn=collate_fn)
    return data_loader

single_val_data_loader = get_loader(dev_data, batched_single_dev_data, word_vocab, topic_vocab, transform, False, 2)
test_data_loader = get_loader(test_data, batched_test_data, word_vocab, topic_vocab, transform, False, 2)

In [None]:
def to_var(x, useCuda=True, volatile=False):
    if torch.cuda.is_available():
        x = x.cuda()
    return Variable(x, volatile=volatile)

In [None]:
for images, topics, captions, lengths, ids in single_val_data_loader:
    #images = to_var(images, volatile=False)
    topics = to_var(topics, volatile=False)
    scores, boxes = im_detect(net, images[0])
    break