In [16]:
import torch
import torch.nn as nn
from torchvision import transforms
import sys
sys.path.append('./')
from pycocotools.coco import COCO
from data_loader import get_loader
from model import EncoderCNN, DecoderRNN
import math

In [17]:
## TODO #1: Select appropriate values for the Python variables below.
# Training configuration. Every name below is read by later cells, so keep the
# names stable when tuning the values.
batch_size = 128          # batch size
vocab_threshold = 5        # minimum word count threshold
vocab_from_file = False   # if True, load existing vocab file
embed_size = 512           # dimensionality of image and word embeddings
hidden_size = 512          # number of features in hidden state of the RNN decoder
num_epochs = 3             # number of training epochs
save_every = 1             # determines frequency of saving model weights
print_every = 100          # determines window for printing average loss
log_file = 'training_log1.txt'       # name of file with saved training loss and perplexity

# (Optional) TODO #2: Amend the image transform below.
transform_train = transforms.Compose([ 
    transforms.Resize(256),                          # smaller edge of image resized to 256
    transforms.RandomCrop(224),                      # get 224x224 crop from random location
    transforms.RandomHorizontalFlip(),               # horizontally flip image with probability=0.5
    transforms.ToTensor(),                           # convert the PIL Image to a tensor
    transforms.Normalize((0.485, 0.456, 0.406),      # normalize image for pre-trained model
                         (0.229, 0.224, 0.225))])

# Build data loader.
data_loader = get_loader(transform=transform_train,
                         mode='train',
                         batch_size=batch_size,
                         vocab_threshold=vocab_threshold,
                         vocab_from_file=vocab_from_file)

# The size of the vocabulary.
vocab_size = len(data_loader.dataset.vocab)

# Initialize the encoder and decoder. 
encoder = EncoderCNN(embed_size)
decoder = DecoderRNN(embed_size, hidden_size, vocab_size)

# Move models to GPU if CUDA is available. 
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
encoder.to(device)
decoder.to(device)

# Define the loss function.
# Placed on the same `device` as the models instead of repeating the CUDA
# availability check, so device selection lives in exactly one place.
criterion = nn.CrossEntropyLoss().to(device)

# TODO #3: Specify the learnable parameters of the model.
# Only the decoder and the encoder's `embed` sub-module are handed to the
# optimizer; any other encoder parameters are therefore never updated by it.
# NOTE(review): presumably the rest of the encoder is a frozen pre-trained CNN
# -- confirm in model.py.
params = list(decoder.parameters()) + list(encoder.embed.parameters())

# TODO #4: Define the optimizer.
optimizer = torch.optim.Adam(params, lr=0.001)

# Set the total number of training steps per epoch.
# ceil() so that a final partial batch still counts as one step.
total_step = math.ceil(len(data_loader.dataset.caption_lengths) / data_loader.batch_sampler.batch_size)

annotations file:  /datasets/ee285f-public/COCO-Annotations/annotations_trainval2014/captions_train2014.json
loading annotations into memory...
Done (t=0.96s)
creating index...
index created!
[0/414113] Tokenizing captions...
[100000/414113] Tokenizing captions...
[200000/414113] Tokenizing captions...
[300000/414113] Tokenizing captions...
[400000/414113] Tokenizing captions...
loading annotations into memory...
Done (t=0.91s)
creating index...



  0%|          | 0/414113 [00:00<?, ?it/s][A
  0%|          | 830/414113 [00:00<00:49, 8296.98it/s][A

index created!
Obtaining caption lengths...



  0%|          | 1653/414113 [00:00<00:49, 8275.73it/s][A
  1%|          | 2332/414113 [00:00<00:53, 7763.34it/s][A
  1%|          | 3114/414113 [00:00<00:52, 7779.02it/s][A
  1%|          | 3778/414113 [00:00<00:55, 7395.18it/s][A
  1%|          | 4386/414113 [00:00<00:59, 6889.18it/s][A
  1%|          | 5032/414113 [00:00<01:00, 6751.92it/s][A
  1%|▏         | 5667/414113 [00:00<01:01, 6626.04it/s][A
  2%|▏         | 6305/414113 [00:00<01:02, 6549.57it/s][A
  2%|▏         | 7034/414113 [00:01<01:00, 6754.79it/s][A
  2%|▏         | 7692/414113 [00:01<01:05, 6220.32it/s][A
  2%|▏         | 8310/414113 [00:01<01:06, 6146.83it/s][A
  2%|▏         | 8995/414113 [00:01<01:03, 6341.47it/s][A
  2%|▏         | 9810/414113 [00:01<00:59, 6791.70it/s][A
  3%|▎         | 10625/414113 [00:01<00:56, 7149.12it/s][A
  3%|▎         | 11351/414113 [00:01<01:00, 6698.15it/s][A
  3%|▎         | 12035/414113 [00:01<01:00, 6611.51it/s][A
  3%|▎         | 12706/414113 [00:01<01:00, 6587.78i

 27%|██▋       | 110928/414113 [00:14<00:44, 6794.79it/s][A
 27%|██▋       | 111618/414113 [00:14<00:44, 6741.73it/s][A
 27%|██▋       | 112311/414113 [00:14<00:44, 6795.10it/s][A
 27%|██▋       | 113195/414113 [00:14<00:41, 7299.74it/s][A
 28%|██▊       | 114031/414113 [00:14<00:39, 7587.62it/s][A
 28%|██▊       | 114859/414113 [00:15<00:38, 7782.84it/s][A
 28%|██▊       | 115699/414113 [00:15<00:37, 7956.19it/s][A
 28%|██▊       | 116588/414113 [00:15<00:36, 8214.02it/s][A
 28%|██▊       | 117453/414113 [00:15<00:35, 8337.56it/s][A
 29%|██▊       | 118299/414113 [00:15<00:35, 8373.07it/s][A
 29%|██▉       | 119177/414113 [00:15<00:34, 8490.91it/s][A
 29%|██▉       | 120060/414113 [00:15<00:34, 8587.15it/s][A
 29%|██▉       | 120931/414113 [00:15<00:33, 8623.53it/s][A
 29%|██▉       | 121819/414113 [00:15<00:33, 8697.73it/s][A
 30%|██▉       | 122691/414113 [00:15<00:33, 8678.44it/s][A
 30%|██▉       | 123560/414113 [00:16<00:38, 7487.39it/s][A
 30%|███       | 124339/

 52%|█████▏    | 213535/414113 [00:28<00:30, 6571.01it/s][A
 52%|█████▏    | 214203/414113 [00:29<00:31, 6379.50it/s][A
 52%|█████▏    | 214850/414113 [00:29<00:32, 6049.07it/s][A
 52%|█████▏    | 215465/414113 [00:29<00:33, 5978.36it/s][A
 52%|█████▏    | 216100/414113 [00:29<00:32, 6083.66it/s][A
 52%|█████▏    | 216719/414113 [00:29<00:32, 6115.09it/s][A
 52%|█████▏    | 217356/414113 [00:29<00:31, 6187.97it/s][A
 53%|█████▎    | 218016/414113 [00:29<00:31, 6305.72it/s][A
 53%|█████▎    | 218674/414113 [00:29<00:30, 6384.72it/s][A
 53%|█████▎    | 219497/414113 [00:29<00:28, 6843.02it/s][A
 53%|█████▎    | 220243/414113 [00:30<00:27, 7008.74it/s][A
 53%|█████▎    | 220953/414113 [00:30<00:30, 6428.04it/s][A
 54%|█████▎    | 221612/414113 [00:30<00:32, 5974.48it/s][A
 54%|█████▎    | 222276/414113 [00:30<00:31, 6158.48it/s][A
 54%|█████▍    | 222936/414113 [00:30<00:30, 6282.68it/s][A
 54%|█████▍    | 223577/414113 [00:30<00:30, 6318.92it/s][A
 54%|█████▍    | 224241/

 77%|███████▋  | 319948/414113 [00:43<00:10, 8657.97it/s][A
 77%|███████▋  | 320842/414113 [00:43<00:10, 8738.57it/s][A
 78%|███████▊  | 321722/414113 [00:43<00:10, 8756.15it/s][A
 78%|███████▊  | 322600/414113 [00:43<00:10, 8760.72it/s][A
 78%|███████▊  | 323483/414113 [00:43<00:10, 8780.58it/s][A
 78%|███████▊  | 324362/414113 [00:43<00:10, 8711.90it/s][A
 79%|███████▊  | 325236/414113 [00:43<00:10, 8717.75it/s][A
 79%|███████▊  | 326109/414113 [00:43<00:10, 8710.07it/s][A
 79%|███████▉  | 326981/414113 [00:44<00:10, 8529.76it/s][A
 79%|███████▉  | 327835/414113 [00:44<00:10, 8506.43it/s][A
 79%|███████▉  | 328691/414113 [00:44<00:10, 8520.62it/s][A
 80%|███████▉  | 329544/414113 [00:44<00:09, 8497.61it/s][A
 80%|███████▉  | 330395/414113 [00:44<00:09, 8475.29it/s][A
 80%|███████▉  | 331243/414113 [00:44<00:09, 8443.75it/s][A
 80%|████████  | 332105/414113 [00:44<00:09, 8492.97it/s][A
 80%|████████  | 332982/414113 [00:44<00:09, 8572.02it/s][A
 81%|████████  | 333859/

In [18]:
# Display the vocabulary size (length of the training data loader's vocab).
vocab_size

8856

In [19]:
import torch.utils.data as data
import numpy as np
import os
import requests
import time

In [None]:
# Train the CNN-RNN captioning model.
#
# For every step a fresh batch is drawn, the encoder/decoder are run forward,
# the cross-entropy loss is back-propagated and the optimizer is stepped.
# Per-step statistics are echoed to stdout and appended to `log_file`.
#
# Checkpoints written under `checkpoint_dir`:
#   * every `checkpoint_every` steps: 'decoder-<step>.pkl' / 'encoder-<step>.pkl'.
#     The name contains only the step number, so these are rolling checkpoints
#     that are overwritten by the same step of the next epoch.
#   * every `save_every` epochs: 'decoder-<epoch>.pkl' / 'encoder-<epoch>.pkl',
#     so the weights reached at the end of an epoch are always persisted
#     (otherwise the steps after the last multiple of `checkpoint_every`
#     would be lost). Keep `checkpoint_every` larger than `num_epochs` so the
#     two naming schemes cannot collide.
checkpoint_dir = './models'
checkpoint_every = 200     # steps between rolling within-epoch checkpoints

# torch.save does not create missing directories; make sure the target exists.
os.makedirs(checkpoint_dir, exist_ok=True)

# Open the training log file.
# The context manager guarantees the log is closed even when training is
# interrupted (e.g. KeyboardInterrupt or a CUDA error part-way through).
with open(log_file, 'w') as f:

    for epoch in range(1, num_epochs+1):

        for i_step in range(1, total_step+1):

            # Randomly sample a caption length, and sample indices with that length.
            indices = data_loader.dataset.get_train_indices()
            # Create and assign a batch sampler to retrieve a batch with the sampled indices.
            new_sampler = data.sampler.SubsetRandomSampler(indices=indices)
            data_loader.batch_sampler.sampler = new_sampler

            # Obtain the batch.
            images, captions = next(iter(data_loader))

            # Move batch of images and captions to GPU if CUDA is available.
            images = images.to(device)
            captions = captions.to(device)

            # Zero the gradients.
            decoder.zero_grad()
            encoder.zero_grad()

            # Pass the inputs through the CNN-RNN model.
            features = encoder(images)
            outputs = decoder(features, captions)

            # Calculate the batch loss.
            loss = criterion(outputs.view(-1, vocab_size), captions.view(-1))

            # Backward pass.
            loss.backward()

            # Update the parameters in the optimizer.
            optimizer.step()

            # Get training statistics.
            # Read the scalar once; perplexity is exp(cross-entropy loss).
            loss_value = loss.item()
            stats = 'Epoch [%d/%d], Step [%d/%d], Loss: %.4f, Perplexity: %5.4f' % (epoch, num_epochs, i_step, total_step, loss_value, np.exp(loss_value))

            # Print training statistics (on same line).
            print('\r' + stats, end="")
            sys.stdout.flush()

            # Print training statistics to file.
            # Flushed every step so the log is usable while training is running.
            f.write(stats + '\n')
            f.flush()

            # Print training statistics (on different line).
            if i_step % print_every == 0:
                print('\r' + stats)

            # Rolling within-epoch checkpoint.
            if i_step % checkpoint_every == 0:
                torch.save(decoder.state_dict(), os.path.join(checkpoint_dir, 'decoder-%d.pkl' % i_step))
                torch.save(encoder.state_dict(), os.path.join(checkpoint_dir, 'encoder-%d.pkl' % i_step))

        # Save the weights at the end of the epoch.
        if epoch % save_every == 0:
            torch.save(decoder.state_dict(), os.path.join(checkpoint_dir, 'decoder-%d.pkl' % epoch))
            torch.save(encoder.state_dict(), os.path.join(checkpoint_dir, 'encoder-%d.pkl' % epoch))

Epoch [1/3], Step [100/3236], Loss: 3.8572, Perplexity: 47.3334
Epoch [1/3], Step [164/3236], Loss: 3.6228, Perplexity: 37.44104

<a id='step3'></a>
## Step 3: (Optional) Validate your Model

To assess potential overfitting, one approach is to assess performance on a validation set.  If you decide to do this **optional** task, you are required to first complete all of the steps in the next notebook in the sequence (**3_Inference.ipynb**); as part of that notebook, you will write and test code (specifically, the `sample` method in the `DecoderRNN` class) that uses your RNN decoder to generate captions.  That code will prove incredibly useful here. 

If you decide to validate your model, please do not edit the data loader in **data_loader.py**.  Instead, create a new file named **data_loader_val.py** containing the code for obtaining the data loader for the validation data.  You can access:
- the validation images at filepath `'/opt/cocoapi/images/val2014/'`, and
- the validation image caption annotation file at filepath `'/opt/cocoapi/annotations/captions_val2014.json'`.

The suggested approach to validating your model involves creating a json file such as [this one](https://github.com/cocodataset/cocoapi/blob/master/results/captions_val2014_fakecap_results.json) containing your model's predicted captions for the validation images.  Then, you can write your own script or use one that you [find online](https://github.com/tylin/coco-caption) to calculate the BLEU score of your model.  You can read more about the BLEU score, along with other evaluation metrics (such as METEOR and CIDEr), in section 4.1 of [this paper](https://arxiv.org/pdf/1411.4555.pdf).  For more information about how to use the annotation file, check out the [website](http://cocodataset.org/#download) for the COCO dataset.

In [4]:
# (Optional) TODO: Validate your model.