In [1]:
import matplotlib.pyplot as plt
%load_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
import warnings
warnings.simplefilter('ignore')

import torch
import torch.nn as nn
from torchvision import transforms
import sys
from data_loader import get_loader
from model import EncoderCNN, DecoderRNN
import math
import nltk
from keras import utils
nltk.download('punkt')


# ---------------------------------------------------------------------------
# TODO #1: training hyperparameters (read by the cells further down).
# ---------------------------------------------------------------------------
batch_size = 32                  # number of samples per mini-batch
vocab_threshold = 2              # minimum word count threshold handed to get_loader
vocab_from_file = True           # True -> reuse the existing vocab file instead of rebuilding it
embed_size = 2048                # size of the shared image / word embedding
hidden_size = 2048               # width of the RNN decoder's hidden state
num_epochs = 5                   # how many passes over the training set
save_every = 1                   # checkpoint the models every `save_every` epochs
print_every = 100                # keep a printed stats line every `print_every` steps
log_file = 'training_log.txt'    # file that receives one stats line per step

# ---------------------------------------------------------------------------
# (Optional) TODO #2: image pre-processing applied to every training image.
# ---------------------------------------------------------------------------
# Per-channel statistics used to normalize images for the pre-trained model.
norm_mean = (0.485, 0.456, 0.406)
norm_std = (0.229, 0.224, 0.225)

train_steps = [
    transforms.Resize(256),                  # shorter image edge becomes 256 px
    transforms.RandomCrop(224),              # random 224x224 window
    transforms.RandomHorizontalFlip(),       # mirror left/right with probability 0.5
    transforms.ToTensor(),                   # PIL image -> tensor
    transforms.Normalize(norm_mean, norm_std),
]
transform_train = transforms.Compose(train_steps)

# ---------------------------------------------------------------------------
# Training data loader, configured from the settings above.
# ---------------------------------------------------------------------------
loader_kwargs = dict(
    transform=transform_train,
    mode='train',
    batch_size=batch_size,
    vocab_threshold=vocab_threshold,
    vocab_from_file=vocab_from_file,
)
data_loader = get_loader(**loader_kwargs)

Using TensorFlow backend.


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\steve\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
Vocabulary successfully loaded from vocab.pkl file!
loading annotations into memory...
Done (t=1.21s)
creating index...
index created!
Obtaining caption lengths...


100%|████████████████████████████████████████████████████████████████████████| 591753/591753 [00:59<00:00, 9879.47it/s]


In [3]:
# Number of distinct tokens in the training vocabulary.
vocab_size = len(data_loader.dataset.vocab)

# Build the CNN encoder and the RNN decoder.
encoder = EncoderCNN(embed_size)
decoder = DecoderRNN(embed_size, hidden_size, vocab_size)

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
encoder.to(device)
decoder.to(device)

# Loss function, placed on the same device as the models.
# NOTE(review): NLLLoss expects log-probabilities, so this assumes DecoderRNN
# ends in a log-softmax -- confirm in model.py.
criterion = nn.NLLLoss().to(device)

# TODO #3: learnable parameters -- everything in the decoder and the encoder.
params = list(decoder.parameters()) + list(encoder.parameters())

# TODO #4: optimizer.
optimizer = torch.optim.Adam(params, lr=0.001)

# Learning-rate scheduler. It is stepped once per epoch by the training loop,
# and `patience` counts those calls. The previous patience=10 could never
# trigger within num_epochs=5, which made the scheduler a no-op; patience=1
# lowers the LR after two consecutive epochs without improvement.
learning_schedule = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=1)

# Total number of training steps per epoch.
total_step = math.ceil(len(data_loader.dataset.caption_lengths) / data_loader.batch_sampler.batch_size)

In [None]:
import torch.utils.data as data
import numpy as np
import os
import requests
import time

# Directory that receives the per-epoch checkpoints; create it up front so the
# first torch.save cannot fail on a missing folder.
model_dir = './models'
os.makedirs(model_dir, exist_ok=True)

# Per-step training loss over the whole run (plotted in a later cell).
losses = []

# The context manager guarantees the log file is closed even when training is
# interrupted or raises part-way through an epoch.
with open(log_file, 'w') as f:

    for epoch in range(1, num_epochs+1):

        # Losses of this epoch only; their mean drives the LR scheduler.
        epoch_losses = []

        for i_step in range(1, total_step+1):

            # Randomly sample a caption length, and sample indices with that length.
            indices = data_loader.dataset.get_train_indices()

            # Create and assign a batch sampler to retrieve a batch with the sampled indices.
            new_sampler = data.sampler.SubsetRandomSampler(indices=indices)
            data_loader.batch_sampler.sampler = new_sampler

            # Obtain the batch. A fresh iterator is needed because the sampler
            # was just replaced.
            images, captions = next(iter(data_loader))

            # Move the batch to the training device. The decoder is fed every
            # token but the last and is scored against every token but the first.
            images = images.to(device)
            captions_target = captions[:, 1:].to(device)
            captions_train = captions[:, :-1].to(device)

            # Zero the gradients.
            decoder.zero_grad()
            encoder.zero_grad()

            # Pass the inputs through the CNN-RNN model.
            features = encoder(images)
            outputs = decoder(features, captions_train)

            # Calculate the batch loss over the flattened (token, vocab) scores.
            loss = criterion(outputs.view(-1, vocab_size), captions_target.contiguous().view(-1))

            # Backward pass and parameter update.
            loss.backward()
            optimizer.step()

            # Read the scalar once; .item() forces a device synchronisation.
            loss_value = loss.item()
            losses.append(loss_value)
            epoch_losses.append(loss_value)

            # Get training statistics.
            stats = 'Epoch [%d/%d], Step [%d/%d], Loss: %.4f' % (epoch, num_epochs, i_step, total_step, loss_value)

            # Print training statistics (on same line).
            print('\r' + stats, end="")
            sys.stdout.flush()

            # Print training statistics to file.
            f.write(stats + '\n')
            f.flush()

            # Print training statistics (on different line).
            if i_step % print_every == 0:
                print('\r' + stats)

        # Learning rate decay: feed the scheduler a plain float summarising the
        # whole epoch rather than the last batch's (noisy, graph-carrying) tensor.
        if epoch_losses:
            learning_schedule.step(sum(epoch_losses) / len(epoch_losses))

        # Save the weights.
        # NOTE(review): this pickles the whole module objects, so loading needs
        # the same model.py on the import path.
        if epoch % save_every == 0:
            print("\nSaving the model")
            torch.save(decoder, os.path.join(model_dir, 'decoder-%d.pkl' % epoch))
            torch.save(encoder, os.path.join(model_dir, 'encoder-%d.pkl' % epoch))

Epoch [1/5], Step [100/18493], Loss: 3.7848
Epoch [1/5], Step [200/18493], Loss: 3.6031
Epoch [1/5], Step [300/18493], Loss: 4.1952
Epoch [1/5], Step [400/18493], Loss: 3.4738
Epoch [1/5], Step [500/18493], Loss: 3.2434
Epoch [1/5], Step [581/18493], Loss: 3.2562

In [None]:
# Large canvas for the loss curve. This rcParams change is global, so figures
# drawn by later cells use the same size.
plt.rcParams['figure.figsize'] = (20,10)

# One point per optimisation step, in the order the steps were run.
plt.plot(losses)
plt.xlabel('Training step')
plt.ylabel('Loss')
plt.title('Training loss per batch')
plt.show()

### Sampling

In [None]:
# Choose one image out of the last batch produced by the training loop.
sample_index = 23
img = images[sample_index]

In [None]:
# Encode the single image (a batch dimension is added with unsqueeze).
# Use the notebook-wide `device` instead of a hard-coded .cuda() so this also
# runs on CPU-only machines, and skip autograd bookkeeping for inference.
# NOTE(review): the encoder is left in whatever train/eval mode it is already in.
with torch.no_grad():
    features = encoder(img.unsqueeze(0).to(device))

In [None]:
# First token fed to the decoder, created directly on the notebook-wide
# `device` (a hard-coded .cuda() fails when no GPU is present).
# NOTE(review): index 0 is assumed to be the vocabulary's start-of-caption
# token -- confirm against vocab.word2idx.
start_token = torch.tensor([0], device=device)

In [None]:
# Generate a caption for the encoded image. Sampling is inference only, so
# autograd tracking is disabled to avoid building a graph for every step.
with torch.no_grad():
    sample = decoder.sample(start_token, features)

In [None]:
# Flatten the sampled token ids into a 1-D NumPy array on the host.
# reshape(-1) (instead of squeeze) keeps a one-token caption iterable, and
# detach() makes the conversion safe if the tensor still tracks gradients.
s = sample.detach().cpu().numpy().reshape(-1)

In [None]:
import pickle

In [None]:
# Reset all three names first, so a failed load leaves them as None.
vocab = None
word2idx = None
idx2word = None

# NOTE(review): pickle.load can execute arbitrary code -- only open a
# vocab.pkl that this project produced itself.
with open('./vocab.pkl', 'rb') as f:
    vocab = pickle.load(f)
    # Expose the two lookup tables stored on the vocabulary object.
    word2idx = vocab.word2idx
    idx2word = vocab.idx2word

print('Vocabulary successfully loaded from vocab.pkl file!')

In [None]:
# Translate each sampled token id back to its word, one word per line.
for token_id in s:
    word = idx2word[token_id]
    print(word)

In [None]:
class UnNormalize(object):
    """Undo a per-channel ``(x - mean) / std`` normalization.

    Args:
        mean: sequence with one mean per channel.
        std: sequence with one standard deviation per channel.
    """

    def __init__(self, mean, std):
        self.mean = mean
        self.std = std

    def __call__(self, tensor):
        """
        Args:
            tensor (Tensor): Normalized tensor image of size (C, H, W).
        Returns:
            Tensor: A new, un-normalized copy of the image. The input tensor
            is left untouched, so calling this repeatedly on the same input
            (or on a view of a larger batch) always gives the same result.
        """
        # Work on a copy: the in-place ops below would otherwise write through
        # to the caller's tensor (and to any batch it is a view of).
        restored = tensor.clone()
        for channel, m, s in zip(restored, self.mean, self.std):
            # Inverse of the normalize step t.sub_(m).div_(s).
            channel.mul_(s).add_(m)
        return restored

In [None]:
# Inverse of the Normalize step used in the training transform.
unorm = UnNormalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225))

# Hand UnNormalize a private CPU copy of the image. images[23] is a view into
# the training batch, so un-normalizing it in place would corrupt the batch
# and give a different picture on every re-run; moving to the CPU also lets
# matplotlib read the result later.
img = unorm(images[23].detach().cpu().clone())

In [None]:
# Move the first axis to the end (C, H, W -> H, W, C), the layout imshow wants.
# Careful: this rebinds `img`, so running the cell twice permutes twice.
img = img.permute(1, 2, 0)

In [None]:
# matplotlib can only read host memory, so convert to a CPU NumPy array first
# (a CUDA tensor raises a TypeError here). Un-normalizing can push values
# slightly outside [0, 1], so clip to the valid range for float RGB images.
plt.imshow(img.detach().cpu().numpy().clip(0, 1))