# 1. Load Data

In [41]:
from pycocotools.coco import COCO
from torch.utils.data import Dataset, DataLoader
from PIL import Image
import numpy as np
import random
import math
import torch
import torchvision.transforms as transforms
import torch.nn as nn
from torchtext.vocab import GloVe
from nltk.tokenize import word_tokenize
from model import CaptionEncoder, ImageEncoder, CaptionEvaluator, train
import sys
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
import dill
from pprint import pprint

In [2]:
BASE_PATH = '/scratch/lt2316-h18-resources/coco/'

In [3]:
coco_captions = COCO(BASE_PATH + 'annotations/captions_train2017.json')
# coco_instances = COCO(BASE_PATH + 'annotations/instances_train2017.json')

loading annotations into memory...
Done (t=1.16s)
creating index...
index created!


In [4]:
# I added the <PAD> and <UNK> token to the glove vectors initialized with zeros
glove_vectors = GloVe(name='6B', dim=300)

In [5]:
print(glove_vectors.vectors.shape)

torch.Size([400002, 300])


In [6]:
hyperparameters = {
    'number_of_images': 10000,
    'batch_size': 32,
    # embedding dim -1 will initialize with glove vectors
    'embedding_dim': -1,
    'lstm_out_dim': 512,
    'lstm_layers': 1,
    'hidden_size': 2000,
    'epochs': 25,
    'learning_rate': 0.0002
}

PADDING_TOKEN = '<PAD>'
UNKNOWN_TOKEN = '<UNK>'

device = torch.device('cuda:2')

In [7]:
class Sampler():
    def __init__(self, coco_captions, number_of_samples=100, negative_sample_split=0.95, train_split=0.8, val_split=0.05, test_split=0.15) -> None:
        samples = []
        transform = transforms.ToTensor()
        
        random_images = random.sample(list(coco_captions.imgs.values()), number_of_samples)
        CAPTIONS_PER_IMAGE = len(coco_captions.imgToAnns[random_images[0]['id']])

        for image_info in random_images:
            image = Image.open(BASE_PATH + 'train2017/' + image_info['file_name']).resize((100,100)).convert('RGB')
            samples.extend([{
                'id': image_info['id'],
                'image': transform(image),
                'caption': annotation['caption'].lower(),
                'class': 1         
            } for annotation in coco_captions.imgToAnns[image_info['id']]])

        self.max_length_context = max([len(word_tokenize(sample['caption'])) for sample in samples])

        samples.extend([{
            'image': sample['image'],
            'caption': samples[index - CAPTIONS_PER_IMAGE]['caption'],
            'class': 0
        } for index, sample in enumerate(samples[CAPTIONS_PER_IMAGE:int(CAPTIONS_PER_IMAGE * negative_sample_split * number_of_samples)])])

        random.shuffle(samples)

        train_border = int(train_split * len(samples))
        val_border = int((train_split + val_split) * len(samples))

        self.train_samples = samples[:train_border]
        self.val_samples = samples[train_border:val_border]
        self.test_samples = samples[val_border:]

In [8]:
class COCO_Dataset(Dataset):
    def __init__(self, samples, max_length_context, dataset=None) -> None:
        super().__init__()
        
        self.max_length_context = max_length_context

        if dataset is None:
            vocab = {PADDING_TOKEN, UNKNOWN_TOKEN}
            for sample in samples:
                split_caption = word_tokenize(sample['caption'])
                vocab.update(split_caption)

            self.vocab = {word: index for index, word in enumerate(list(vocab))}
        else:
            self.vocab = dataset.vocab

        self.samples = []
        for sample in samples:
            split_caption = word_tokenize(sample['caption'])
            padded_context = [self.get_encoded_word(word) for word in split_caption]
            padded_context.extend([self.get_encoded_word(PADDING_TOKEN)] * (self.max_length_context - len(split_caption)))

            glove_context = [glove_vectors.stoi[word] if word in glove_vectors.itos else glove_vectors.stoi[UNKNOWN_TOKEN] for word in split_caption]
            glove_context.extend([glove_vectors.stoi[PADDING_TOKEN]] * (self.max_length_context - len(split_caption)))

            self.samples.append({
                'id': sample['id'],
                'image': sample['image'],
                'caption': sample['caption'],
                'encoded_caption': torch.tensor(padded_context),
                'glove_encoded_caption': torch.tensor(glove_context),
                'class': torch.tensor(sample['class'], dtype=torch.float)
            })
        
    def __getitem__(self, index):
        return self.samples[index]
    
    def __len__(self):
        return len(self.samples)

    def get_encoded_word(self, word):
        if word in self.vocab:
            return self.vocab[word]
        else:
            return self.vocab[UNKNOWN_TOKEN]

    def get_vocab_size(self):
        return len(self.vocab)

In [10]:
sampler = Sampler(coco_captions,hyperparameters['number_of_images'])

In [11]:
train_dataset = COCO_Dataset(sampler.train_samples, sampler.max_length_context)
val_dataset = COCO_Dataset(sampler.val_samples, sampler.max_length_context, train_dataset)
test_dataset = COCO_Dataset(sampler.test_samples, sampler.max_length_context, train_dataset)

train_dataloader = DataLoader(train_dataset,
                              batch_size=hyperparameters['batch_size'],
                              shuffle=True)
val_dataloader = DataLoader(val_dataset,
                            batch_size=hyperparameters['batch_size'],
                            shuffle=False)
test_dataloader = DataLoader(test_dataset,
                             batch_size=hyperparameters['batch_size'],
                             shuffle=False)


In [12]:
positives = 0
negatives = 0
for sample in train_dataloader.dataset:
    if sample['class'] == 1:
        positives += 1
    else:
        negatives += 1
print(f'negatives: {negatives}')
print(f'positives: {positives}')

negatives: 38026
positives: 39990


# 2. Training

In [18]:
caption_encoder = CaptionEncoder(train_dataloader.dataset.get_vocab_size(),
                                 hyperparameters['embedding_dim'],
                                 hyperparameters['lstm_out_dim'],
                                 hyperparameters['lstm_layers'],
                                 train_dataloader.dataset.get_encoded_word(PADDING_TOKEN),
                                 glove_vectors)

image_encoder = ImageEncoder()
caption_evaluator = CaptionEvaluator(caption_encoder,
                                     image_encoder, 
                                     hyperparameters['hidden_size'])

caption_evaluator.to(device)

CaptionEvaluator(
  (caption_encoder): CaptionEncoder(
    (embeddings): Embedding(400002, 300)
    (rnn): LSTM(300, 512, batch_first=True, bidirectional=True)
    (dropout): Dropout(p=0.15, inplace=False)
  )
  (image_encoder): ImageEncoder(
    (image_encoder): Sequential(
      (0): Conv2d(3, 3, kernel_size=(3, 3), stride=(1, 1))
      (1): BatchNorm2d(3, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (2): MaxPool2d(kernel_size=4, stride=4, padding=0, dilation=1, ceil_mode=False)
      (3): Tanh()
    )
  )
  (classifier): Sequential(
    (0): Linear(in_features=2752, out_features=2000, bias=True)
    (1): Dropout(p=0.1, inplace=False)
    (2): Tanh()
    (3): Linear(in_features=2000, out_features=1000, bias=True)
    (4): Tanh()
    (5): Linear(in_features=1000, out_features=1, bias=True)
    (6): Sigmoid()
  )
)

In [19]:
train(caption_evaluator, train_dataloader, hyperparameters, device)

25 EPOCHS - 2438 BATCHES PER EPOCH
epoch 0, batch 2437: 0.6983
epoch 1, batch 2437: 0.6954
epoch 2, batch 2437: 0.6966
epoch 3, batch 2437: 0.6968
epoch 4, batch 2437: 0.6972
epoch 5, batch 2437: 0.6971
epoch 6, batch 2437: 0.6968
epoch 7, batch 2437: 0.6974
epoch 8, batch 2437: 0.6586
epoch 9, batch 2437: 0.6032
epoch 10, batch 2437: 0.5348
epoch 11, batch 2437: 0.4451
epoch 12, batch 2437: 0.3562
epoch 13, batch 2437: 0.2807
epoch 14, batch 2437: 0.2248
epoch 15, batch 2437: 0.1818
epoch 16, batch 2437: 0.1507
epoch 17, batch 2437: 0.1276
epoch 18, batch 2437: 0.1129
epoch 19, batch 2437: 0.0979
epoch 20, batch 2437: 0.0851
epoch 21, batch 2437: 0.0753
epoch 22, batch 2437: 0.0706
epoch 23, batch 2437: 0.0649
epoch 24, batch 2437: 0.0595


# 3. Testing

In [14]:
# load model from file
caption_evaluator.load_state_dict(torch.load('caption_evaluator.pt'))

<All keys matched successfully>

In [36]:
caption_evaluator.eval()

predictions = []
gold_classes = []
for i, batch in enumerate(test_dataloader):
    with torch.no_grad():
        predictions.extend(caption_evaluator(batch['image'].to(device), batch['glove_encoded_caption'].to(device)).view(len(batch['image'])))
    
    gold_classes.extend(batch['class'])

In [37]:
best_threshold = 0
best_f1 = 0
best_predicted_classes = []
for threshold in range(1, 10):
    predicted_classes = []
    predicted_classes.extend([1 if sample > threshold / 10 else 0 for sample in predictions])
    print(f'--- THRESHOLD: {threshold / 10} ---')
    print(f'Accuracy: {accuracy_score(predicted_classes, gold_classes)}')
    print(f'Precision: {precision_score(predicted_classes, gold_classes)}')
    print(f'Recall: {recall_score(predicted_classes, gold_classes)}')
    f1 = f1_score(predicted_classes, gold_classes)
    if f1 > best_f1:
        best_threshold = threshold / 10
        best_f1 = f1
        best_predicted_classes = predicted_classes
    print(f'F1-Score: {f1}')

--- THRESHOLD: 0.1 ---
Accuracy: 0.9267158873393492
Precision: 0.981000802782981
Recall: 0.8874364560639071
F1-Score: 0.9318759532282663
--- THRESHOLD: 0.2 ---
Accuracy: 0.9381323489198797
Precision: 0.9745785389349746
Recall: 0.910613826728341
F1-Score: 0.9415110191947262
--- THRESHOLD: 0.3 ---
Accuracy: 0.9421657095980311
Precision: 0.9680224779234681
Recall: 0.9225962764600867
F1-Score: 0.9447636458605381
--- THRESHOLD: 0.4 ---
Accuracy: 0.9454470877768663
Precision: 0.9636071715279636
Recall: 0.931935817805383
F1-Score: 0.9475069069859229
--- THRESHOLD: 0.5 ---
Accuracy: 0.9480448455017774
Precision: 0.957987690660958
Recall: 0.941362082566395
F1-Score: 0.9496021220159152
--- THRESHOLD: 0.6 ---
Accuracy: 0.9492070002734482
Precision: 0.9522344126304523
Recall: 0.9485539117686259
F1-Score: 0.9503905989183414
--- THRESHOLD: 0.7 ---
Accuracy: 0.9479081214109926
Precision: 0.9431362055124431
Recall: 0.954373138369889
F1-Score: 0.9487213997308209
--- THRESHOLD: 0.8 ---
Accuracy: 0.94551

In [48]:
falses = []
for index, predicted_class in enumerate(best_predicted_classes):
    if predicted_class != gold_classes[index]:
        falses.append({
            'caption': test_dataloader.dataset[index]['caption'],
            'predicted': int(predicted_class),
            'gold': int(gold_classes[index])
        })

false_positives = list(filter(lambda x: x['gold'] == 0, falses))
false_negatives = list(filter(lambda x: x['gold'] == 1, falses))
print(len(false_positives))
print(len(false_negatives))

386
357


In [51]:
pprint(false_positives[10:])

[{'caption': 'two elephants with their trunks tied together with a third '
             'elephant in the background.',
  'gold': 0,
  'predicted': 1},
 {'caption': 'there is a crowd of people at a sporting event',
  'gold': 0,
  'predicted': 1},
 {'caption': 'a large wooden kitchen with two white chairs and an island '
             'counter.',
  'gold': 0,
  'predicted': 1},
 {'caption': 'a family plays baseball at a beach with sailboats in the '
             'background.',
  'gold': 0,
  'predicted': 1},
 {'caption': 'a child rides a toy in the pool.', 'gold': 0, 'predicted': 1},
 {'caption': 'fans sitting and watching a batter at a baseball game.  ',
  'gold': 0,
  'predicted': 1},
 {'caption': 'black and white motorcycle parked on a sidewalk.',
  'gold': 0,
  'predicted': 1},
 {'caption': 'guy with white jacket against the white snow on his snow board',
  'gold': 0,
  'predicted': 1},
 {'caption': 'a woman holding a frisbee next to a frisbee golf basket.',
  'gold': 0,
  'predicted'

In [17]:
torch.save(caption_evaluator.state_dict(), 'caption_evaluator.pt')

OrderedDict([('caption_encoder.embeddings.weight',
              tensor([[ 1.3523e-02,  1.8526e-01,  7.2809e-03,  ...,  1.9830e-02,
                       -2.3915e-01,  5.6054e-02],
                      [-3.1423e-01, -2.7642e-01,  1.3203e-01,  ..., -2.0023e-01,
                       -1.7323e-01,  3.9787e-01],
                      [-1.3877e-01,  5.6329e-02,  9.6273e-02,  ..., -3.6058e-01,
                       -2.7464e-02,  1.1599e-01],
                      ...,
                      [ 4.2919e-01, -2.9690e-01,  1.5011e-01,  ...,  2.8975e-01,
                        3.2618e-01, -5.9053e-02],
                      [ 1.2275e-05, -1.5291e-02, -2.7391e-03,  ..., -8.3573e-03,
                        1.4206e-02, -1.2602e-02],
                      [ 3.6148e-02, -3.5777e-03,  1.0486e-02,  ..., -5.0656e-02,
                        2.7047e-02, -1.5744e-02]], device='cuda:2')),
             ('caption_encoder.rnn.weight_ih_l0',
              tensor([[ 0.0219, -0.0235, -0.0567,  ...,  0.0527,  