In [52]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torchvision
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, utils

import torchvision.models as models
from torch.nn.utils.rnn import pack_padded_sequence

from skimage import io, transform

import matplotlib.pyplot as plt # for plotting
import numpy as np
from collections import Counter
from nltk.tokenize import word_tokenize
import nltk
import itertools

### Image Transforms

In [41]:
class Rescale(object):
    """Rescale the image in a sample to a given size.

    Args:
        output_size (tuple or int): Desired output size. If tuple, output is
            matched to output_size. If int, smaller of image edges is matched
            to output_size keeping aspect ratio the same.
    """

    def __init__(self, output_size):
        assert isinstance(output_size, (int, tuple))
        self.output_size = output_size

    def __call__(self, image):
        h, w = image.shape[:2]
        if isinstance(self.output_size, int):
            if h > w:
                new_h, new_w = self.output_size * h / w, self.output_size
            else:
                new_h, new_w = self.output_size, self.output_size * w / h
        else:
            new_h, new_w = self.output_size

        new_h, new_w = int(new_h), int(new_w)
        img = transform.resize(image, (new_h, new_w))
        return img


class ToTensor(object):
    """Convert ndarrays in sample to Tensors."""

    def __call__(self, image):
        # swap color axis because
        # numpy image: H x W x C
        # torch image: C X H X W
        image = image.transpose((2, 0, 1))
        return image


IMAGE_RESIZE = (256, 256)
# Sequentially compose the transforms
img_transform = transforms.Compose([Rescale(IMAGE_RESIZE), ToTensor()])


### Captions Preprocessing

In [67]:
class CaptionsPreprocessing:
    """Preprocess the captions, generate vocabulary and convert words to tensor tokens

    Args:
        captions_file_path (string): captions tsv file path
    """
    def __init__(self, captions_file_path):
        self.captions_file_path = captions_file_path

        # Read raw captions
        self.raw_captions_dict = self.read_raw_captions()

        # Preprocess captions
        self.captions_dict = self.process_captions()

        # Create vocabulary
        self.vocab = self.generate_vocabulary()

    def read_raw_captions(self):
        """
        Returns:
            Dictionary with raw captions list keyed by image ids (integers)
        """

        captions_dict = {}
        with open(self.captions_file_path, 'r', encoding='utf-8') as f:
            for img_caption_line in f.readlines():
                img_captions = img_caption_line.strip().split('\t')
                captions_dict[int(img_captions[0])] = img_captions[1:]

        return captions_dict

    def process_captions(self):
        """
        Use this function to generate dictionary and other preprocessing on captions
        """

        raw_captions_dict = self.raw_captions_dict
        word_counter = Counter()

        # Do the preprocessing here
        raw_captions_dict = captions_dict
        word_counter = Counter()
        FREQUENCY_THRESHOLD = 10
        #stop_words = set(nltk.corpus.stopwords.words('english'))
        # Do the preprocessing here
        for caption_id in raw_captions_dict:
            #print(caption_id)
            caption_list = raw_captions_dict[caption_id]
            for caption in caption_list:
                #filtered_caption = ''.join(([w for w in caption if not w in stop_words]))
                words = word_tokenize(caption.lower())
                #print(words)
                # update word frequency
                word_counter.update(words)
        words_final = [word for word, wordcount in word_counter.items() if wordcount >= FREQUENCY_THRESHOLD] 
        
        #update dictionary
        """
        captions_dict_updated = {}
        for caption_id in raw_captions_dict:
            #print(caption_id)
            caption_list = raw_captions_dict[caption_id]
            new_words = []
            for caption in caption_list:
                #filtered_caption = ''.join(([w for w in caption if not w in stop_words]))
                words = word_tokenize(caption.lower())
                new_words.append('<START>') 
                new_words.extend([word for word in words if word in words_final])
                new_words.append('<END>')
            #print(new_words)
            captions_dict_updated[int(caption_id)] = new_words
        """
        captions_dict_updated = {}
        for caption_id in raw_captions_dict:
            #print(caption_id)
            caption_list = raw_captions_dict[caption_id]
            new_words_list = []
            for caption in caption_list:
                #filtered_caption = ''.join(([w for w in caption if not w in stop_words]))
                words = word_tokenize(caption.lower())
                new_words = []
                new_words.append('<START>') 
                new_words.extend([word for word in words if word in words_final])
                new_words.append('<END>')
                new_words_list.append(new_words)
            #print(new_words_list)
        captions_dict_updated[int(caption_id)] = new_words_list
        
        #create vocabulary
        '''
        self.vocab['<START>'] = 1
        i=1
        for word, wordcount in word_counter.items():
            self.vocab[word] = i
            i += 1
        self.vocab['<END>'] = i
        '''

        return captions_dict_updated

    def generate_vocabulary(self):
        """
        Use this function to generate dictionary and other preprocessing on captions
        """
        #create vocabulary
        raw_captions_dict = self.raw_captions_dict
        word_counter = Counter()
        vocab = {}
        # Do the preprocessing here
        raw_captions_dict = captions_dict
        word_counter = Counter()
        FREQUENCY_THRESHOLD = 20
        #stop_words = set(nltk.corpus.stopwords.words('english'))
        # Do the preprocessing here
        for caption_id in raw_captions_dict:
            #print(caption_id)
            caption_list = raw_captions_dict[caption_id]
            for caption in caption_list:
                #filtered_caption = ''.join(([w for w in caption if not w in stop_words]))
                words = word_tokenize(caption.lower())
                #print(words)
                # update word frequency
                word_counter.update(words)
        words_final = [word for word, wordcount in word_counter.items() if wordcount >= FREQUENCY_THRESHOLD]
        
        vocab['<START>'] = 1
        i=1
        for word, wordcount in word_counter.items():
            vocab[word] = i
            i += 1
        vocab['<END>'] = i

        return vocab

    def captions_transform(self, img_caption_list):
        """
        Use this function to generate tensor tokens for the text captions
        Args:
            img_caption_list: List of captions for a particular image
        """
        caption_tensor = torch.zeros(len(img_caption_list), 100)
        vocab = self.vocab
        for idx1, caption in enumerate(img_caption_list):
            for idx2, word in enumerate(caption):
                  caption_tensor[idx1][idx2] =  vocab[word]        
        # Generate tensors

        return caption_tensor

# Set the captions tsv file path
CAPTIONS_FILE_PATH = 'data/public_test_captions.tsv'
captions_preprocessing_obj = CaptionsPreprocessing(CAPTIONS_FILE_PATH)

In [68]:
print(captions_preprocessing_obj.captions_transform)

<bound method CaptionsPreprocessing.captions_transform of <__main__.CaptionsPreprocessing object at 0x1c3b6e7d90>>


### Dataset Class

In [69]:
class ImageCaptionsDataset(Dataset):

    def __init__(self, img_dir, captions_dict, img_transform=None, captions_transform=None):
        """
        Args:
            img_dir (string): Directory with all the images.
            captions_dict: Dictionary with captions list keyed by image ids (integers)
            img_transform (callable, optional): Optional transform to be applied
                on the image sample.

            captions_transform: (callable, optional): Optional transform to be applied
                on the caption sample (list).
        """
        self.img_dir = img_dir
        self.captions_dict = captions_dict
        self.img_transform = img_transform
        self.captions_transform = captions_transform

        self.image_ids = list(captions_dict.keys())

    def __len__(self):
        return len(self.image_ids)

    def __getitem__(self, idx):
        img_name = os.path.join(self.img_dir, 'image_{}.jpg'.format(self.image_ids[idx]))
        image = io.imread(img_name)
        captions = self.captions_dict[self.image_ids[idx]]

        if self.img_transform:
            image = self.img_transform(image)

        if self.captions_transform:
            captions = self.captions_transform(captions)

        sample = {'image': image, 'captions': captions}

        return sample

### Model Architecture

In [54]:
class ImageCaptionsNet(nn.Module):
    def __init__(self, embed_size, hidden_size, vocab_size, num_layers, max_seq_length=20):
        super(ImageCaptionsNet, self).__init__()
        resnet = models.resnet152(pretrained=True)
        modules = list(resnet.children())[:-1]      # delete the last fc layer.
        self.resnet = nn.Sequential(*modules)
        self.linear = nn.Linear(resnet.fc.in_features, embed_size)
        self.bn = nn.BatchNorm1d(embed_size, momentum=0.01)
        self.embed = nn.Embedding(vocab_size, embed_size)
        self.lstm = nn.LSTM(embed_size, hidden_size, num_layers, batch_first=True)
        self.linear = nn.Linear(hidden_size, vocab_size)
        self.max_seg_length = max_seq_length
        # Define your architecture here

    def forward(self, x):
        x = image_batch, captions_batch

        # Forward Propogation
        with torch.no_grad():
            features = self.resnet(image_batch)
        features = features.reshape(features.size(0), -1)
        features = self.bn(self.linear(features))
        embeddings = self.embed(captions_batch)
        embeddings = torch.cat((features.unsqueeze(1), embeddings), 1)
        packed = pack_padded_sequence(embeddings, lengths, batch_first=True) 
        hiddens, _ = self.lstm(packed)
        outputs_captions_batch = self.linear(hiddens[0])
        return captions_batch

net = ImageCaptionsNet(256, 512, len(captions_preprocessing_obj.vocab), 1)

# If GPU training is required
# net = net.cuda()

Downloading: "https://download.pytorch.org/models/resnet152-b121ed2d.pth" to /Users/anjali/.cache/torch/hub/checkpoints/resnet152-b121ed2d.pth


HBox(children=(IntProgress(value=0, max=241530880), HTML(value='')))




### Training Loop

In [70]:
IMAGE_DIR = 'data/public_test_images/'

# Creating the Dataset
train_dataset = ImageCaptionsDataset(
    IMAGE_DIR, captions_preprocessing_obj.captions_dict, img_transform=img_transform,
    captions_transform=captions_preprocessing_obj.captions_transform
)

# Define your hyperparameters
NUMBER_OF_EPOCHS = 3
LEARNING_RATE = 1e-1
BATCH_SIZE = 32
NUM_WORKERS = 0 # Parallel threads for dataloading
loss_function = nn.CrossEntropyLoss()
optimizer = optim.SGD(net.parameters(), lr=LEARNING_RATE)

# Creating the DataLoader for batching purposes
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=NUM_WORKERS)
import os
for epoch in range(NUMBER_OF_EPOCHS):
    for batch_idx, sample in enumerate(train_loader):
        net.zero_grad()

        image_batch, captions_batch = sample['image'], sample['captions']

        # If GPU training required
        # image_batch, captions_batch = image_batch.cuda(), captions_batch.cuda()

        output_captions = net((image_batch, captions_batch))
        loss = loss_function(output_captions, captions_batch)
        loss.backward()
        optimizer.step()
    print("Iteration: " + str(epoch + 1))

RuntimeError: Expected object of scalar type Double but got scalar type Float for argument #3 'mat1' in call to _th_addmm_

In [66]:
a=torch.zeros(len([2,3,4,5]), 30)
print(a[0][30])

IndexError: index 30 is out of bounds for dimension 0 with size 30

In [46]:
IMAGE_DIR = 'data/public_test_images/'

# Creating the Dataset
train_dataset = ImageCaptionsDataset(
    IMAGE_DIR, captions_preprocessing_obj.captions_dict, img_transform=img_transform,
    captions_transform=captions_preprocessing_obj.captions_transform
)



In [None]:
train_dataset

In [22]:

def read_raw_captions(captions_file_path):
        """
        Returns:
            Dictionary with raw captions list keyed by image ids (integers)
        """

        captions_dict = {}
        with open(captions_file_path, 'r', encoding='utf-8') as f:
            for img_caption_line in f.readlines():
                img_captions = img_caption_line.strip().split('\t')
                #print(img_captions[1])
                #print(img_captions[0],img_captions[1:])
                captions_dict[int(img_captions[0])] = img_captions[1:]

        return captions_dict

In [23]:
captions_file_path='data/public_test_captions.tsv'
captions_dict = read_raw_captions(captions_file_path)

In [27]:
raw_captions_dict = captions_dict
#captions_dict = {}
word_counter = Counter()
FREQUENCY_THRESHOLD = 0
#stop_words = set(nltk.corpus.stopwords.words('english'))
# Do the preprocessing here
for caption_id in raw_captions_dict:
    #print(caption_id)
    caption_list = raw_captions_dict[caption_id]
    for caption in caption_list:
        #filtered_caption = ''.join(([w for w in caption if not w in stop_words]))
        words = word_tokenize(caption.lower())
        #print(words)
        # update word frequency
        word_counter.update(words)
words_final = [word for word, wordcount in word_counter.items() if wordcount >= FREQUENCY_THRESHOLD] 

In [28]:
vocab = {}
i=0
for word, wordcount in word_counter.items():
    vocab[word] = i
    i += 1
#for word in vocab:
    #print(word,vocab[word])

In [29]:
len(word_counter)

4649

In [30]:
print(sorted(word_counter.items(), key = 
             lambda kv:(kv[1], kv[0]))) 

[('#', 1), ('...', 1), ('10', 1), ('11', 1), ('49ers', 1), ('6', 1), ('7', 1), ('9', 1), ('abbandonato', 1), ('abbia', 1), ('abbracciare', 1), ('abbracciata', 1), ('abbraccio', 1), ('accelerare', 1), ('accendere', 1), ('accendono', 1), ('accentati', 1), ('accenti', 1), ('accesa', 1), ('accese', 1), ('accesi', 1), ('accettare', 1), ('acciaio', 1), ('accompagnati', 1), ('accoppiati', 1), ('accovacciando', 1), ('accumulata', 1), ('acini', 1), ('acquatica', 1), ('acque', 1), ('acquirenti', 1), ('acrobat', 1), ('acrobazie', 1), ('acustiche', 1), ('addestramento', 1), ('addestrando', 1), ('addormentano', 1), ('addormentata', 1), ('addormentati', 1), ('adesivi', 1), ('adombrelli', 1), ('adoravano', 1), ('adulte', 1), ('advice', 1), ('aeree', 1), ('aerei', 1), ('affascinati', 1), ('afferra', 1), ('afferrando', 1), ('affettato', 1), ('affettuosamente', 1), ('affilata', 1), ('affilati', 1), ('affollano', 1), ('affollate', 1), ('affonda', 1), ('affronta', 1), ('africa', 1), ('afroamericana', 1), 

In [21]:
i=0
for word, wordcount in word_counter.items():
    if(i<500):
        print(wordcount);
    i+=1

19833
2325
49685
9358
3629
120
2722
8435
1092
11709
5295
69004
589
144576
1727
401
1577
4776
16639
804
110
8281
530
791
145518
279
34042
7038
10284
365
47
56
15306
140
1538
813
5077
186
7100
50
22
41
61822
6
220
97
21192
510
109
44226
816
96807
1460
1086
390
73
2041
951
7194
368
277
5330
4039
12
7370
3401
2642
18801
83
205
595
11552
20
2283
80
213
1546
389
20506
2552
926
397
15133
374
2053
5372
2582
831
8140
41743
9693
5096
1393
420
824
911
652
29248
99
250
558
1998
5069
204
22126
6649
932
3102
5494
5827
70
73
6
1837
907
10
109
49
1211
156
164
13124
2633
2616
390
1887
473
5719
79
4021
3417
893
123
89
490
2864
159
388
2
864
1930
748
3245
1130
1096
219
44
3504
4620
319
22
6
9
88
2253
984
435
146
4214
1171
1314
821
298
42
430
22
2939
8964
2996
59
4626
25
32
940
1292
1386
8737
33
21509
26
734
447
1083
1048
691
914
303
559
90
301
568
736
3188
2214
5127
1
79
666
1561
116
1625
4
667
239
44
1073
86
623
238
804
99
2126
142
870
3665
833
206
24
170
244
93
280
376
2161
70
7
268
2
1956
13
311
459
2

In [32]:
captions_dict_updated = {}
for caption_id in raw_captions_dict:
    #print(caption_id)
    caption_list = raw_captions_dict[caption_id]
    new_words_list = []
    for caption in caption_list:
        #filtered_caption = ''.join(([w for w in caption if not w in stop_words]))
        words = word_tokenize(caption.lower())
        new_words = []
        new_words.append('<START>') 
        new_words.extend([word for word in words if word in words_final])
        new_words.append('<END>')
        new_words_list.append(new_words)
    print(new_words_list)
    captions_dict_updated[int(caption_id)] = new_words_list
    


[['<START>', 'un', 'gruppo', 'di', 'persone', 'in', 'piedi', 'sul', 'retro', 'di', 'un', 'camion', 'pieno', 'di', 'cotone', '.', '<END>'], ['<START>', 'gli', 'uomini', 'sono', 'in', 'piedi', 'su', 'un', 'camion', 'che', 'trasporta', 'una', 'sostanza', 'bianca', '.', '<END>'], ['<START>', 'un', 'gruppo', 'di', 'persone', 'è', 'in', 'piedi', 'su', 'un', 'mucchio', 'di', 'lana', 'in', 'un', 'camion', '.', '<END>'], ['<START>', 'un', 'gruppo', 'di', 'uomini', 'sta', 'caricando', 'cotone', 'su', 'un', 'camion', '.', '<END>'], ['<START>', 'i', 'lavoratori', 'caricano', 'lana', 'ricamata', 'su', 'un', 'camion', '.', '<END>']]
[['<START>', 'un', 'adulto', 'con', 'una', 'maglietta', 'grigia', 'con', 'maniche', 'rosse', 'che', 'dormiva', 'sul', 'divano', '.', '<END>'], ['<START>', 'una', 'donna', 'in', 'pantaloni', 'neri', 'appoggiata', 'sul', 'divano', '.', '<END>'], ['<START>', 'un', 'uomo', 'che', 'dormiva', 'in', 'una', 'stanza', 'verde', 'su', 'un', 'divano', '.', '<END>'], ['<START>', 'la'

[['<START>', 'una', 'giovane', 'donna', 'che', 'indossa', 'un', 'bustier', 'color', 'turchese', 'sopra', 'una', 'camicia', 'nera', ',', 'una', 'gon', 'na', 'nera', ',', 'stivali', 'a', 'rete', ',', 'e', 'guanti', 'adesivi', 'è', 'seduta', 'accanto', 'ad', 'altre', 'cinque', 'giovani', 'donne', ',', 'la', 'maggior', 'parte', 'delle', 'quali', 'è', 'anche', 'vestita', 'in', 'vestiti', 'a', 'colori', 'neri', '.', '<END>'], ['<START>', 'un', 'gruppo', 'di', 'ragazze', 'vestite', 'perlopiù', 'di', 'nero', 'è', 'seduto', '.', '<END>'], ['<START>', 'un', 'gruppo', 'di', 'donne', 'ben', 'vestite', 'si', 'siedono', 'insieme', '.', '<END>'], ['<START>', 'un', 'gruppo', 'di', 'giovani', 'donne', 'è', 'seduto', 'fuori', '.', '<END>'], ['<START>', 'le', 'ragazze', 'sedute', 'con', 'le', 'mani', 'sulle', 'gambe', '.', '<END>']]
[['<START>', 'una', 'giovane', 'donna', 'a', 'piedi', 'nudi', 'su', 'una', 'strada', 'affollata', 'guarda', 'verso', 'il', 'basso', 'mentre', 'molte', 'persone', 'stanno', 'c

In [None]:
if(caption_id in captions_dict):
        captions_dict_updated[int(caption_id)] = str(captions_dict[int(caption_id)]).join(word for word in new_words)
    else:
        captions_dict_updated[int(caption_id)] = ''.join(word for word in new_words)

In [None]:
for caption in captions_dict_updated:
    caption_list = captions_dict_updated[caption_id]
    for caption in caption_list:
        print(caption)

In [None]:
def word_idx_map(raw_captions, threshold):
    caps = []
    for im in raw_captions:
        for s in raw_captions[im]:
            caps.append(s.split())

    word_freq = nltk.FreqDist(itertools.chain(*caps))
    idx_to_word = ['<pad>'] + [word for word, cnt in word_freq.items() if cnt >= threshold] + ['<unk>']
    word_to_idx = {word:idx for idx, word in enumerate(idx_to_word)}

    return idx_to_word, word_to_idx

In [None]:
idx_to_word, word_to_idx = word_idx_map(captions_dict, 5)

In [None]:
def tokenize(captions, word_to_idx, maxlen):
    '''
    Inputs:
    - captions: dictionary with image_id as key, captions as value
    - word_to_idx: mapping from word to index
    - maxlen: max length of each sequence of tokens
    Returns:
    - tokens: array of shape (data_size, maxlen)
    - image_ids: list of length data_size, mapping token to corresponding image_id
    '''
    tokens, image_ids = [], []
    for im_id in captions:
        for cap in captions[im_id]:
            token = [(lambda x: word_to_idx[x] if x in word_to_idx else word_to_idx['<unk>'])(w) \
                     for w in cap.split()]
            if len(token) > maxlen:
                token = token[:maxlen]
            else:
                token += [0] * (maxlen-len(token))
            tokens.append(token)
            image_ids.append(im_id)
    return np.array(tokens).astype('int32'), np.array(image_ids)

In [None]:
word_to_idx

In [None]:
def process_captions(raw_captions_dict):
        """
        Use this function to generate dictionary and other preprocessing on captions
        """

        raw_captions_dict = raw_captions_dict
        captions_dict = {}
        word_counter = Counter()
        FREQUENCY_THRESHOLD = 5
        # Do the preprocessing here
        for caption_id in raw_captions_dict:
            caption_list = raw_captions_dict[caption_id]
            for caption in caption_list:
                words = word_tokenize(caption.lower())
                # update word frequency
                word_counter.update(words)
        words_final = [word for word, wordcount in word_counter.items() if wordcount >= FREQUENCY_THRESHOLD] 
        
        #reset dictionary
        for caption_id in raw_captions_dict:
            caption_list = raw_captions_dict[caption_id]
            for caption in caption_list:
                words = word_tokenize(caption.lower())
                # remove less frequent word
                #new_words = [word for word in words if word in words_final]
                new_words = ['<START>'] + [word for word in words if word in words_final] + ['<END>']
                #print(new_words)
                if(caption_id in captions_dict):
                    captions_dict[int(caption_id)] = str(captions_dict[int(caption_id)]).join(word for word in new_words)
                else:
                    captions_dict[int(caption_id)] = ''.join(word for word in new_words)
        
        for caption_id in captions_dict:
            caption_list = captions_dict[caption_id]
            print(caption_id, caption_list)
        
        return captions_dict

In [None]:
process_captions(captions_dict)

In [None]:
import nltk
import pickle
import argparse
from collections import Counter
from pycocotools.coco import COCO


In [None]:
coco = COCO(json)
counter = Counter()
ids = coco.anns.keys()
for i, id in enumerate(ids):
    caption = str(coco.anns[id]['caption'])
    tokens = nltk.tokenize.word_tokenize(caption.lower())