In [1]:
import os

import nltk
import numpy as np
import torch
import torch.utils.data as data
from pycocotools.coco import COCO
from torchvision import transforms

from data_loader import get_loader

# Fetch the "punkt" tokenizer data only when it is not already installed, so
# re-running this cell works offline and avoids a needless network request.
try:
    nltk.data.find("tokenizers/punkt")
except LookupError:
    # nltk.download() signals failure through its return value rather than by
    # raising, so surface a clear hint instead of failing silently.
    if not nltk.download("punkt"):
        print(
            'Could not download the NLTK "punkt" data; '
            "check the network connection or install it manually."
        )


# Load IPython's autoreload extension and switch it to mode 2, so that edits to
# imported project modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

[nltk_data] Error loading punkt: <urlopen error [Errno 11001]
[nltk_data]     getaddrinfo failed>


In [2]:
# Pre-processing pipeline applied to every training image.
transform_train = transforms.Compose(
    [
        # Resize so that the smaller edge of the image is 256 pixels.
        transforms.Resize(size=256),
        # Take a 224x224 crop from a random location.
        transforms.RandomCrop(size=224),
        # Mirror the image horizontally with probability 0.5.
        transforms.RandomHorizontalFlip(p=0.5),
        # Convert the PIL Image to a tensor.
        transforms.ToTensor(),
        # Normalize each channel with the statistics used by the pre-trained model.
        transforms.Normalize(
            mean=(0.485, 0.456, 0.406),
            std=(0.229, 0.224, 0.225),
        ),
    ]
)

# Set the minimum word count threshold.
vocab_threshold = 5

# Specify the batch size.
batch_size = 10

# Directory that CONTAINS the `cocoapi` folder (not the `cocoapi` folder itself):
# the loader appends "cocoapi/annotations/captions_train2014.json" to this value,
# as the FileNotFoundError path recorded in this cell's output shows.
# Set the COCOAPI_DIR environment variable to point at your local copy without
# editing the notebook; the literal below is only a placeholder default.
cocoapi_dir = os.environ.get("COCOAPI_DIR", r"path/to/cocoapi/dir")

# Build the training data loader from the settings chosen above.
# vocab_from_file=False presumably makes the loader build the vocabulary from
# the captions instead of reading a saved one - confirm in data_loader.py.
data_loader = get_loader(
    cocoapi_loc=cocoapi_dir,
    mode="train",
    transform=transform_train,
    batch_size=batch_size,
    vocab_threshold=vocab_threshold,
    vocab_from_file=False,
)

loading annotations into memory...


FileNotFoundError: [Errno 2] No such file or directory: 'path/to/cocoapi/dir\\cocoapi/annotations/captions_train2014.json'

In [None]:
# Lower the minimum word count threshold ...
vocab_threshold = 4

# ... and create the training data loader again with the new threshold.
data_loader = get_loader(
    cocoapi_loc=cocoapi_dir,
    mode="train",
    transform=transform_train,
    batch_size=batch_size,
    vocab_threshold=vocab_threshold,
    vocab_from_file=False,
)

In [None]:
# Print the size of the vocabulary as reported by len().
# NOTE(review): presumably this equals the number of keys in the vocabulary's
# word2idx dictionary - confirm against the vocabulary class.
print(f"Total number of tokens in vocabulary: {len(data_loader.dataset.vocab)}")

In [None]:
# Read the special token that the vocabulary uses for unknown words.
unk_word = data_loader.dataset.vocab.unk_word
print(f"Special unknown word: {unk_word}")

# Calling the vocabulary with a word returns the integer it is mapped to.
print(
    f"All unknown words are mapped to this integer: {data_loader.dataset.vocab(unk_word)}"
)

In [None]:
# Look up two made-up words that should not appear in any caption and print
# the integer the vocabulary returns for each of them.
for nonsense_word in ("jfkafejw", "ieowoqjf"):
    print(data_loader.dataset.vocab(nonsense_word))

In [None]:
# Print the integer that the vocabulary returns for the "." token.
print(data_loader.dataset.vocab("."))

In [None]:
# Re-create the data loader, this time with vocab_from_file=True.
# Note that it runs much faster than before!
data_loader = get_loader(
    cocoapi_loc=cocoapi_dir,
    mode="train",
    transform=transform_train,
    batch_size=batch_size,
    vocab_from_file=True,
)

In [None]:
# Show the container type and the number of entries of the caption lengths.
caption_lengths = data_loader.dataset.caption_lengths
type(caption_lengths), len(caption_lengths)

In [None]:
from collections import Counter

# Count how many training captions there are of each length, then list the
# lengths from the most to the least frequent.
counter = Counter(data_loader.dataset.caption_lengths)
lengths = counter.most_common()
for caption_length, num_captions in lengths:
    print("value: %2d --- count: %5d" % (caption_length, num_captions))

In [None]:
print(batch_size)

# Ask the dataset for the indices of one training batch
# (a caption length is sampled at random, then indices with that length).
indices = data_loader.dataset.get_train_indices()
print("sampled indices:", indices)

# Restrict the loader's batch sampler to exactly those indices by swapping in
# a sampler that draws from them only.
new_sampler = data.SubsetRandomSampler(indices=indices)
data_loader.batch_sampler.sampler = new_sampler

# Pull a single batch from the loader.
batch_iterator = iter(data_loader)
images, captions = next(batch_iterator)

print("images.shape:", images.shape)
print("captions.shape:", captions.shape)

# Uncomment the lines of code below to print the pre-processed images and captions.
# print('images:', images)
# print('captions:', captions)

In [None]:
# Import the EncoderCNN and DecoderRNN classes from model.py.
# Because autoreload is enabled in the first cell, changes made to model.py
# are re-loaded automatically.
from model import EncoderCNN, DecoderRNN

In [None]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(device)

In [None]:
# Specify the dimensionality of the image embedding.
image_embed_size = 256

# Initialize the encoder.
encoder = EncoderCNN(image_embed_size)

# Move the encoder to appropriate device.
encoder.to(device)

# Move the most recently sampled batch of images to the same device as the encoder.
images = images.to(device)

# Pass the images through the encoder.
features = encoder(images)

print("type(features):", type(features))
print("features.shape:", features.shape)
print("captions.shape:", captions.shape)

# Check that the encoder satisfies the requirements!
# isinstance() is used rather than an exact type comparison so that Tensor
# subclasses are accepted as well.
assert isinstance(features, torch.Tensor), "Encoder output needs to be a PyTorch Tensor."

# The encoder must return one embedding of size image_embed_size per image.
assert (features.shape[0] == batch_size) and (
    features.shape[1] == image_embed_size
), "The shape of the encoder output is incorrect."

In [None]:
# Sanity check: show the image embedding size currently in use.
print(image_embed_size)

In [None]:
# Specify the number of features in the hidden state of the RNN decoder.
hidden_size = 512

# Use the same size for word embeddings as for image embeddings.
# NOTE(review): presumably required because the decoder consumes image features
# and word embeddings through the same input - confirm in model.py.
word_embed_size = image_embed_size

# Store the size of the vocabulary.
vocab_size = len(data_loader.dataset.vocab)

# Initialize the decoder.
decoder = DecoderRNN(word_embed_size, hidden_size, vocab_size)

# Move the decoder to proper device.
decoder.to(device)


# Move the most recently sampled batch of captions to the same device as the decoder.
captions = captions.to(device)

# Pass the encoder output and captions through the decoder.
# outputs[i,j,k] contains the model's predicted score:
# how likely the j-th token in the i-th caption in the batch is the k-th token in the vocabulary.

outputs = decoder(features, captions)  # (bs, cap_length, vocab_size)


print("type(outputs):", type(outputs))
print("outputs.shape:", outputs.shape)

# Check that the decoder satisfies the requirements!
# isinstance() is used rather than an exact type comparison so that Tensor
# subclasses are accepted as well.
assert isinstance(outputs, torch.Tensor), "Decoder output needs to be a PyTorch Tensor."
assert (
    (outputs.shape[0] == batch_size)
    and (outputs.shape[1] == captions.shape[1])
    and (outputs.shape[2] == vocab_size)
), "The shape of the decoder output is incorrect."