# 1. Load Data

In [16]:
from pycocotools.coco import COCO
from torch.utils.data import Dataset, DataLoader
from PIL import Image
import numpy as np
import random
import math
import torch
import torchvision.transforms as transforms
import torch.nn as nn
from torchtext.vocab import GloVe
from nltk.tokenize import word_tokenize
from model import CaptionEncoder, ImageEncoder, CaptionEvaluator, train
import sys
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
import dill

In [2]:
BASE_PATH = '/scratch/lt2316-h18-resources/coco/'

In [3]:
coco_captions = COCO(BASE_PATH + 'annotations/captions_train2017.json')
# coco_instances = COCO(BASE_PATH + 'annotations/instances_train2017.json')

loading annotations into memory...
Done (t=1.18s)
creating index...
index created!


In [4]:
# I added the <PAD> and <UNK> token to the glove vectors initialized with zeros
glove_vectors = GloVe(name='6B', dim=300)

In [5]:
print(glove_vectors.vectors.shape)

torch.Size([400002, 300])


In [6]:
hyperparameters = {
    'number_of_images': 10000,
    'batch_size': 32,
    # embedding dim -1 will initialize with glove vectors
    'embedding_dim': -1,
    'lstm_out_dim': 512,
    'lstm_layers': 1,
    'hidden_size': 2000,
    'epochs': 25,
    'learning_rate': 0.0002
}

PADDING_TOKEN = '<PAD>'
UNKNOWN_TOKEN = '<UNK>'

device = torch.device('cuda:2')

In [7]:
class Sampler():
    def __init__(self, coco_captions, number_of_samples=100, negative_sample_split=0.95, train_split=0.8, val_split=0.05, test_split=0.15) -> None:
        samples = []
        transform = transforms.ToTensor()
        
        random_images = random.sample(list(coco_captions.imgs.values()), number_of_samples)
        CAPTIONS_PER_IMAGE = len(coco_captions.imgToAnns[random_images[0]['id']])

        for image_info in random_images:
            image = Image.open(BASE_PATH + 'train2017/' + image_info['file_name']).resize((100,100)).convert('RGB')
            samples.extend([{
                'image': transform(image),
                'caption': annotation['caption'].lower(),
                'class': 1         
            } for annotation in coco_captions.imgToAnns[image_info['id']]])

        self.max_length_context = max([len(word_tokenize(sample['caption'])) for sample in samples])

        samples.extend([{
            'image': sample['image'],
            'caption': samples[index - CAPTIONS_PER_IMAGE]['caption'],
            'class': 0
        } for index, sample in enumerate(samples[CAPTIONS_PER_IMAGE:int(CAPTIONS_PER_IMAGE * negative_sample_split * number_of_samples)])])

        random.shuffle(samples)

        train_border = int(train_split * len(samples))
        val_border = int((train_split + val_split) * len(samples))

        self.train_samples = samples[:train_border]
        self.val_samples = samples[train_border:val_border]
        self.test_samples = samples[val_border:]

In [8]:
class COCO_Dataset(Dataset):
    def __init__(self, samples, max_length_context, dataset=None) -> None:
        super().__init__()
        
        self.max_length_context = max_length_context

        if dataset is None:
            vocab = {PADDING_TOKEN, UNKNOWN_TOKEN}
            for sample in samples:
                split_caption = word_tokenize(sample['caption'])
                vocab.update(split_caption)

            self.vocab = {word: index for index, word in enumerate(list(vocab))}
        else:
            self.vocab = dataset.vocab

        self.samples = []
        for sample in samples:
            split_caption = word_tokenize(sample['caption'])
            padded_context = [self.get_encoded_word(word) for word in split_caption]
            padded_context.extend([self.get_encoded_word(PADDING_TOKEN)] * (self.max_length_context - len(split_caption)))

            glove_context = [glove_vectors.stoi[word] if word in glove_vectors.itos else glove_vectors.stoi[UNKNOWN_TOKEN] for word in split_caption]
            glove_context.extend([glove_vectors.stoi[PADDING_TOKEN]] * (self.max_length_context - len(split_caption)))

            self.samples.append({
                'image': sample['image'],
                'caption': sample['caption'],
                'encoded_caption': torch.tensor(padded_context),
                'glove_encoded_caption': torch.tensor(glove_context),
                'class': torch.tensor(sample['class'], dtype=torch.float)
            })
        
    def __getitem__(self, index):
        return self.samples[index]
    
    def __len__(self):
        return len(self.samples)

    def get_encoded_word(self, word):
        if word in self.vocab:
            return self.vocab[word]
        else:
            return self.vocab[UNKNOWN_TOKEN]

    def get_vocab_size(self):
        return len(self.vocab)

In [9]:
sampler = Sampler(coco_captions,hyperparameters['number_of_images'])

In [10]:
train_dataset = COCO_Dataset(sampler.train_samples, sampler.max_length_context)
val_dataset = COCO_Dataset(sampler.val_samples, sampler.max_length_context, train_dataset)
test_dataset = COCO_Dataset(sampler.test_samples, sampler.max_length_context, train_dataset)

train_dataloader = DataLoader(train_dataset,
                              batch_size=hyperparameters['batch_size'],
                              shuffle=True)
val_dataloader = DataLoader(val_dataset,
                            batch_size=hyperparameters['batch_size'],
                            shuffle=True)
test_dataloader = DataLoader(test_dataset,
                             batch_size=hyperparameters['batch_size'],
                             shuffle=True)


In [11]:
positives = 0
negatives = 0
for sample in train_dataloader.dataset:
    if sample['class'] == 1:
        positives += 1
    else:
        negatives += 1
print(f'negatives: {negatives}')
print(f'positives: {positives}')

negatives: 37943
positives: 40078


# 2. Training

In [12]:
caption_encoder = CaptionEncoder(train_dataloader.dataset.get_vocab_size(),
                                 hyperparameters['embedding_dim'],
                                 hyperparameters['lstm_out_dim'],
                                 hyperparameters['lstm_layers'],
                                 train_dataloader.dataset.get_encoded_word(PADDING_TOKEN),
                                 glove_vectors)

image_encoder = ImageEncoder()
caption_evaluator = CaptionEvaluator(caption_encoder,
                                     image_encoder, 
                                     hyperparameters['hidden_size'])

caption_evaluator.to(device)

train(caption_evaluator, train_dataloader, hyperparameters, device)

25 EPOCHS - 2438 BATCHES PER EPOCH
epoch 0, batch 2438: 0.6978
epoch 1, batch 2438: 0.6544
epoch 2, batch 2438: 0.5843
epoch 3, batch 2438: 0.5037
epoch 4, batch 2438: 0.4139
epoch 5, batch 2438: 0.3187
epoch 6, batch 2438: 0.2413
epoch 7, batch 2438: 0.1869
epoch 8, batch 2438: 0.2719
epoch 9, batch 2438: 0.1734
epoch 10, batch 2438: 0.1328
epoch 11, batch 2438: 0.1098
epoch 12, batch 2438: 0.0995
epoch 13, batch 2438: 0.0796
epoch 14, batch 2438: 0.0701
epoch 15, batch 2438: 0.0587
epoch 16, batch 2438: 0.0571
epoch 17, batch 2438: 0.0507
epoch 18, batch 2438: 0.0455
epoch 19, batch 2438: 0.0439
epoch 20, batch 2438: 0.0383
epoch 21, batch 2438: 0.0363
epoch 22, batch 2438: 0.0354
epoch 23, batch 2438: 0.0329
epoch 24, batch 2438: 0.0307


# 3. Testing

In [13]:
caption_evaluator.eval()

predictions = []
gold_classes = []
for batch in test_dataloader:
    with torch.no_grad():
        predictions.extend(caption_evaluator(batch['image'].to(device), batch['glove_encoded_caption'].to(device)).view(len(batch['image'])))
    
    gold_classes.extend(batch['class'])

In [14]:
CLASS_THRESHOLD = []
for threshold in range(1, 10):
    predicted_classes = []
    predicted_classes.extend([1 if sample > threshold / 10 else 0 for sample in predictions])
    print(f'--- THRESHOLD: {threshold / 10} ---')
    print(f'Accuracy: {accuracy_score(predicted_classes, gold_classes)}')
    print(f'Precision: {precision_score(predicted_classes, gold_classes)}')
    print(f'Recall: {recall_score(predicted_classes, gold_classes)}')
    print(f'F1-Score: {f1_score(predicted_classes, gold_classes)}')

--- THRESHOLD: 0.1 ---
Accuracy: 0.9477101845522898
Precision: 0.9799918886034878
Recall: 0.9215611492499365
F1-Score: 0.9498787918495709
--- THRESHOLD: 0.2 ---
Accuracy: 0.9535201640464799
Precision: 0.9726916317425983
Recall: 0.937703636126678
F1-Score: 0.9548772395487723
--- THRESHOLD: 0.3 ---
Accuracy: 0.9559125085440875
Precision: 0.9676896038934704
Recall: 0.9463246959280803
F1-Score: 0.9568879085622619
--- THRESHOLD: 0.4 ---
Accuracy: 0.956390977443609
Precision: 0.962146816276869
Recall: 0.9521070234113712
F1-Score: 0.9571005917159764
--- THRESHOLD: 0.5 ---
Accuracy: 0.9565276828434723
Precision: 0.957144788427741
Recall: 0.9568860656845519
F1-Score: 0.9570154095701541
--- THRESHOLD: 0.6 ---
Accuracy: 0.9566643882433357
Precision: 0.9521427605786129
Recall: 0.9617643042468933
F1-Score: 0.9569293478260869
--- THRESHOLD: 0.7 ---
Accuracy: 0.9553656869446343
Precision: 0.9449776936595917
Recall: 0.9660033167495854
F1-Score: 0.9553748376956195
--- THRESHOLD: 0.8 ---
Accuracy: 0.953

In [17]:
with open('dataloaders.dill', 'wb') as f:
    dill.dump((train_dataloader, val_dataloader, test_dataloader), f)


torch.save(caption_evaluator.state_dict(), 'caption_evaluator.pt')

OrderedDict([('caption_encoder.embeddings.weight',
              tensor([[ 1.3523e-02,  1.8526e-01,  7.2809e-03,  ...,  1.9830e-02,
                       -2.3915e-01,  5.6054e-02],
                      [-3.1423e-01, -2.7642e-01,  1.3203e-01,  ..., -2.0023e-01,
                       -1.7323e-01,  3.9787e-01],
                      [-1.3877e-01,  5.6329e-02,  9.6273e-02,  ..., -3.6058e-01,
                       -2.7464e-02,  1.1599e-01],
                      ...,
                      [ 4.2919e-01, -2.9690e-01,  1.5011e-01,  ...,  2.8975e-01,
                        3.2618e-01, -5.9053e-02],
                      [ 1.2275e-05, -1.5291e-02, -2.7391e-03,  ..., -8.3573e-03,
                        1.4206e-02, -1.2602e-02],
                      [ 3.6148e-02, -3.5777e-03,  1.0486e-02,  ..., -5.0656e-02,
                        2.7047e-02, -1.5744e-02]], device='cuda:2')),
             ('caption_encoder.rnn.weight_ih_l0',
              tensor([[ 0.0219, -0.0235, -0.0567,  ...,  0.0527,  