In [1]:
import sys
# sys.path.append('/opt/cocoapi/PythonAPI')
from pycocotools.coco import COCO
from data_loader import get_loader
from torchvision import transforms

# TODO #1: Define a transform to pre-process the testing images.
# Per-channel mean / standard deviation handed to Normalize.
channel_mean = (0.485, 0.456, 0.406)
channel_std = (0.229, 0.224, 0.225)

# The steps are applied in list order: resize, centre-crop, convert to a
# tensor, then normalise each channel with the statistics above.
preprocessing_steps = [
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(channel_mean, channel_std),
]
transform_test = transforms.Compose(preprocessing_steps)

#-#-#-# Do NOT modify the code below this line. #-#-#-#

# Create the data loader.
# get_loader comes from the project's data_loader module; it is given the
# test-time transform defined above and asked for 'test' mode. Later cells read
# the vocabulary from `data_loader.dataset.vocab` and iterate over the loader
# to obtain (original image, transformed image, image id) triples.
data_loader = get_loader(transform=transform_test,    
                         mode='test')

Vocabulary successfully loaded from vocab.pkl file!
loading annotations into memory...
Done (t=0.04s)
creating index...
index created!


Run the code cell below to visualize an example test image, before pre-processing is applied.

<a id='step2'></a>
## Step 2: Load Trained Models

In the next code cell we define a `device` that you will use to move PyTorch tensors to the GPU (if CUDA is available).  Run this code cell before continuing.

In [2]:
import torch

# Use the GPU when CUDA is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [3]:
# Watch for any changes in model.py, and re-load it automatically.
# % load_ext autoreload
# % autoreload 2

import os
import torch
from model import EncoderCNN, DecoderRNN

# TODO #2: Specify the saved models to load.
encoder_file = 'encoder-751.pkl'
decoder_file = 'decoder-751.pkl'

# TODO #3: Select appropriate values for the Python variables below.
# These have to match the sizes the checkpoints were trained with, otherwise
# load_state_dict() will reject the stored weights.
embed_size = 512
hidden_size = 512

# The size of the vocabulary.
vocab_size = len(data_loader.dataset.vocab)

# Initialize the encoder and decoder, and set each to inference mode.
encoder = EncoderCNN(embed_size)
encoder.eval()
decoder = DecoderRNN(embed_size, hidden_size, vocab_size)
decoder.eval()

# Load the trained weights.
# map_location=device remaps the stored tensors onto the device selected
# earlier, so checkpoints that were saved on a GPU still load when the
# notebook has fallen back to the CPU.
# NOTE(review): torch.load unpickles the file -- only load checkpoints that
# come from a trusted source.
encoder.load_state_dict(
    torch.load(os.path.join('./models', encoder_file), map_location=device))
decoder.load_state_dict(
    torch.load(os.path.join('./models', decoder_file), map_location=device))

# Move models to GPU if CUDA is available.
encoder.to(device)
# Kept as the last expression so the cell still displays the decoder summary.
decoder.to(device)

Downloading: "https://download.pytorch.org/models/resnet152-b121ed2d.pth" to /tmp/xdg-cache/torch/checkpoints/resnet152-b121ed2d.pth
100%|██████████| 230M/230M [00:12<00:00, 18.7MB/s] 


DecoderRNN(
  (word_embeddings): Embedding(4640, 512)
  (lstm): LSTM(512, 512, batch_first=True)
  (linear): Linear(in_features=512, out_features=4640, bias=True)
)

In [4]:
# TODO #4: Complete the function.
def clean_sentence(output, idx2word=None):
    """Turn a sequence of predicted vocabulary indices into a caption string.

    Index 0 is skipped wherever it appears and decoding stops at the first
    index 1; every other index is looked up in ``idx2word``.
    NOTE(review): 0 and 1 are presumably the vocabulary's start / end tokens --
    confirm against the vocabulary definition.

    Args:
        output: iterable of vocabulary indices, e.g. the result of
            ``decoder.sample``.
        idx2word: optional mapping from index to word. When omitted, the
            vocabulary of the notebook-level ``data_loader`` is used.

    Returns:
        The words joined by single spaces, with no leading or trailing
        whitespace. An empty string if no word precedes the stop index.

    Raises:
        KeyError: if an index is missing from a dict-like ``idx2word``.
    """
    if idx2word is None:
        # Default to the vocabulary attached to the notebook's data loader.
        idx2word = data_loader.dataset.vocab.idx2word

    words = []
    for idx in output:
        if idx == 0:
            continue
        if idx == 1:
            break
        words.append(idx2word[idx])

    # Joining avoids the trailing space that per-word concatenation left behind.
    return ' '.join(words)

In [5]:
# Caption every test image and collect the captions keyed by image id.
data_iterator = iter(data_loader)
result = {}

# Inference only: with autograd disabled no computation graph is recorded for
# each forward pass, which saves memory and time.
with torch.no_grad():
    # One batch is drawn per dataset item.
    # NOTE(review): assumes the loader yields one image per batch -- confirm
    # against get_loader.
    for _batch_idx in range(len(data_loader.dataset)):
        orig_image, image, image_id = next(data_iterator)
        image = image.to(device)

        # Insert a dimension at position 1 of the encoder output before sampling.
        features = encoder(image).unsqueeze(1)
        output = decoder.sample(features)

        sentence = clean_sentence(output)
        # Key by a plain int rather than the raw object from the loader
        # (presumably a one-element tensor, which would hash by identity), so
        # the mapping really is keyed by image id.
        result[int(image_id)] = sentence


In [17]:
import json

# Flatten the {image id: caption} mapping into one record per image.
data = [
    {'image_id': int(image_key), 'caption': caption}
    for image_key, caption in result.items()
]

# Serialise the list of records as JSON.
with open('captions_val2014_results.json', 'w') as results_file:
    json.dump(data, results_file)