In [83]:
# Imports. The autoreload magics at the end of this cell re-load edited modules such as model.py automatically.
import math
from model import EncoderCNN, DecoderRNN
from data_loader import get_loader
from data_loader_val import get_loader as val_get_loader
from pycocotools.coco import COCO
from torchvision import transforms
from tqdm.notebook import tqdm
import torch.nn as nn
import torch
import torch.utils.data as data
from collections import defaultdict
import json
import os
import sys
import numpy as np
from nlp_utils import clean_sentence, bleu_score

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
# ---- Hyperparameters and run configuration ----

# Data / vocabulary
# number of samples per training batch
batch_size = 128
# minimum word count threshold for a word to enter the vocabulary
vocab_threshold = 5
# if True, load the existing vocab file instead of rebuilding it
vocab_from_file = True

# Model
# dimensionality of image and word embeddings
embed_size = 256
# number of features in the hidden state of the RNN decoder
hidden_size = 512

# Training schedule
# number of training epochs
num_epochs = 3
# save model weights every `save_every` epochs
save_every = 1
# print the training statistics every `print_every` steps
print_every = 20

# Paths
# file that receives the per-step training loss and perplexity
log_file = "training_log.txt"
# location of the cocoapi directory (placeholder: set this for your machine)
cocoapi_dir = r"path/to/cocoapi/dir"


# Pre-processing / augmentation pipeline applied to every training image.
transform_train = transforms.Compose(
    [
        transforms.Resize(256),  # resize so the smaller edge is 256 px
        transforms.RandomCrop(224),  # take a 224x224 crop at a random location
        transforms.RandomHorizontalFlip(p=0.5),  # mirror half of the images
        transforms.ToTensor(),  # PIL Image -> tensor
        # per-channel normalization constants expected by the pre-trained model
        transforms.Normalize(
            mean=(0.485, 0.456, 0.406),
            std=(0.229, 0.224, 0.225),
        ),
    ]
)

In [3]:
# Create the training data loader from the hyperparameters defined above.
# NOTE(review): the exact meaning of each argument is defined in data_loader.get_loader.
data_loader = get_loader(
    mode="train",
    cocoapi_loc=cocoapi_dir,
    transform=transform_train,
    batch_size=batch_size,
    vocab_threshold=vocab_threshold,
    vocab_from_file=vocab_from_file,
)

Vocabulary successfully loaded from vocab.pkl file!
loading annotations into memory...
Done (t=1.08s)
creating index...
index created!
Obtaining caption lengths...


100%|█████████████████████████████████████████████████████████████████████████| 414113/414113 [01:21<00:00, 5075.39it/s]


In [4]:
# Size of the vocabulary held by the training dataset.
vocab_size = len(data_loader.dataset.vocab)

# Initialize the encoder and decoder.
encoder = EncoderCNN(embed_size)
decoder = DecoderRNN(embed_size, hidden_size, vocab_size)

# Select the device once and reuse it for every module below.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
encoder.to(device)
decoder.to(device)

# Define the loss function on the same device as the models
# (reuses `device` instead of repeating the CUDA availability check).
criterion = nn.CrossEntropyLoss().to(device)

# Specify the learnable parameters of the model: every decoder parameter plus
# the encoder's `embed` sub-module; only these are handed to the optimizer.
params = list(decoder.parameters()) + list(encoder.embed.parameters())

# Define the optimizer.
optimizer = torch.optim.Adam(params, lr=0.001)

# Total number of training steps per epoch (dataset size / batch size, rounded up).
total_step = math.ceil(len(data_loader.dataset) / data_loader.batch_sampler.batch_size)

In [5]:
# Sanity check: show the number of training steps per epoch computed above.
print(total_step)

3236


In [None]:
# Train the encoder/decoder for `num_epochs` epochs, logging every step to
# `log_file` and saving the weights every `save_every` epochs.

# Make sure the checkpoint directory exists up front, so a missing folder does
# not make torch.save fail after a whole epoch of training.
os.makedirs("./models", exist_ok=True)

# Open the training log file. The context manager closes it even when
# training is interrupted or raises.
with open(log_file, "w") as f:
    for epoch in range(1, num_epochs + 1):
        for i_step in range(1, total_step + 1):

            # Randomly sample a caption length, and sample indices with that length.
            indices = data_loader.dataset.get_train_indices()
            # Create and assign a batch sampler to retrieve a batch with the sampled indices.
            # (The original assigned this to a misspelled name, so the next line
            # raised NameError on a fresh kernel.)
            new_sampler = data.sampler.SubsetRandomSampler(indices=indices)
            data_loader.batch_sampler.sampler = new_sampler

            # Obtain the batch.
            images, captions = next(iter(data_loader))

            # Move the batch of images and captions to the selected device.
            images = images.to(device)
            captions = captions.to(device)

            # Zero the gradients.
            decoder.zero_grad()
            encoder.zero_grad()

            # Pass the inputs through the CNN-RNN model.
            features = encoder(images)
            outputs = decoder(features, captions)

            # Calculate the batch loss. Per the original debug notes, outputs is
            # [bs, cap_len, vocab_size] and captions is [bs, cap_len]; both are
            # flattened over the first two dimensions for the criterion.
            loss = criterion(outputs.view(-1, vocab_size), captions.view(-1))

            # Backward pass.
            loss.backward()

            # Update the parameters registered with the optimizer.
            optimizer.step()

            # Gather the training statistics (read the scalar loss only once).
            loss_value = loss.item()
            stats = (
                f"Epoch [{epoch}/{num_epochs}], Step [{i_step}/{total_step}], "
                f"Loss: {loss_value:.4f}, Perplexity: {np.exp(loss_value):.4f}"
            )

            # Write the training statistics to the log file.
            f.write(stats + "\n")
            f.flush()

            # Print the training statistics (on a different line).
            if i_step % print_every == 0:
                print("\r" + stats)

        # Save the weights.
        if epoch % save_every == 0:
            torch.save(
                decoder.state_dict(), os.path.join("./models", "decoder-%d.pkl" % epoch)
            )
            torch.save(
                encoder.state_dict(), os.path.join("./models", "encoder-%d.pkl" % epoch)
            )

In [16]:
# Persist the current decoder and encoder weights as the "final" checkpoints.
torch.save(
    decoder.state_dict(),
    os.path.join("./models", "decoder-final.pkl"),
)
torch.save(
    encoder.state_dict(),
    os.path.join("./models", "encoder-final.pkl"),
)

In [84]:
# Deterministic pre-processing for evaluation: no random crop or flip.
transform_test = transforms.Compose(
    [
        transforms.Resize(224),  # resize so the smaller edge is 224 px
        transforms.ToTensor(),  # PIL Image -> tensor
        # per-channel normalization constants expected by the pre-trained model
        transforms.Normalize(
            mean=(0.485, 0.456, 0.406),
            std=(0.229, 0.224, 0.225),
        ),
    ]
)


# Build the validation data loader with the evaluation transform.
val_data_loader = val_get_loader(
    mode="valid", cocoapi_loc=cocoapi_dir, transform=transform_test
)

# Checkpoint files (inside ./models) holding the trained weights to evaluate.
encoder_file = "encoder-3.pkl"
decoder_file = "decoder-3.pkl"

# Initialize the encoder and decoder.
encoder = EncoderCNN(embed_size)
decoder = DecoderRNN(embed_size, hidden_size, vocab_size)

# Move the models to GPU if CUDA is available.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
encoder.to(device)
decoder.to(device)

# Load the trained weights. `map_location=device` remaps the stored tensors, so
# checkpoints saved on a GPU can also be loaded on a CPU-only machine.
encoder.load_state_dict(
    torch.load(os.path.join("./models", encoder_file), map_location=device)
)
decoder.load_state_dict(
    torch.load(os.path.join("./models", decoder_file), map_location=device)
)

# Switch both models to evaluation mode for inference.
encoder.eval()
decoder.eval()

Vocabulary successfully loaded from vocab.pkl file!


DecoderRNN(
  (embed): Embedding(9955, 256)
  (lstm): LSTM(256, 512, batch_first=True)
  (linear): Linear(in_features=512, out_features=9955, bias=True)
)

In [85]:
# Generate a caption for every image yielded by the validation loader and
# collect the sentences per image id.
pred_result = defaultdict(list)
with torch.no_grad():  # inference only, so gradient tracking is disabled
    for img_id, img in tqdm(val_data_loader):
        img = img.to(device)
        features = encoder(img).unsqueeze(1)
        output = decoder.sample(features)
        sentence = clean_sentence(output, val_data_loader.dataset.vocab.idx2word)
        # `.item()` needs a single-element tensor, i.e. one image id per step.
        pred_result[img_id.item()].append(sentence)

  0%|          | 0/40504 [00:00<?, ?it/s]

In [90]:
# Read the validation caption annotations from the cocoapi directory.
with open(
    os.path.join(cocoapi_dir, "cocoapi", "annotations/captions_val2014.json"), "r"
) as f:
    caption = json.load(f)

# Group the lower-cased reference captions by image id.
valid_annot = caption["annotations"]
valid_result = defaultdict(list)
for annot in valid_annot:
    valid_result[annot["image_id"]].append(annot["caption"].lower())

In [91]:
# Preview the reference captions of the first three image ids.
list(valid_result.values())[:3]

[['a bicycle replica with a clock as the front wheel.',
  'the bike has a clock as a tire.',
  'a black metal bicycle with a clock inside the front wheel.',
  'a bicycle figurine in which the front wheel is replaced with a clock\n',
  'a clock with the appearance of the wheel of a bicycle '],
 ['a black honda motorcycle parked in front of a garage.',
  'a honda motorcycle parked in a grass driveway',
  'a black honda motorcycle with a dark burgundy seat.',
  'ma motorcycle parked on the gravel in front of a garage',
  'a motorcycle with its brake extended standing outside'],
 ['a room with blue walls and a white sink and door.',
  'blue and white color scheme in a small bathroom.',
  'this is a blue and white bathroom with a wall sink and a lifesaver on the wall.',
  'a blue boat themed bathroom with a life preserver on the wall',
  'a bathroom with walls that are painted baby blue.']]

In [92]:
# Preview the generated captions of the first three image ids.
list(pred_result.values())[:3]

[[' a group of horses standing in a field.'],
 [' a person riding a surf board on a body of water'],
 [' a woman standing in front of a store window.']]

In [93]:
# Score the generated captions against the reference captions with
# nlp_utils.bleu_score (both arguments are dicts of caption lists keyed by image id).
bleu_score(true_sentences=valid_result, predicted_sentences=pred_result)

0.2091174140097583

Not a bad BLEU score for only 3 epochs of training!