**Ideas -**
1. Find/Use OCR for non-comp part<br>
2. Rotate all images thrice to create a larger training dataset.<br>
3. Convert image to 0-1 grayscale.<br>
4. Use an attention-based RNN for caption generation.<br>

In [21]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.nn.utils.rnn as rnn_utils
import torchvision
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, utils

from skimage import io, transform

import matplotlib.pyplot as plt # for plotting
import numpy as np
import os

### Image Transforms

In [10]:
class Rescale(object):
    """Resize an image array to a requested size.

    Args:
        output_size (tuple or int): Requested size. A tuple is taken as the
            exact (height, width) of the result. An int is the length the
            shorter image edge is scaled to; the longer edge is scaled by
            the same factor so the aspect ratio is preserved.
    """

    def __init__(self, output_size):
        assert isinstance(output_size, (int, tuple))
        self.output_size = output_size

    def __call__(self, image):
        """Return ``image`` resized according to ``self.output_size``.

        Only the first two axes of ``image`` (height, width) are inspected.
        Computed edge lengths are truncated to whole pixels with ``int()``.
        """
        target = self.output_size
        height, width = image.shape[:2]

        if not isinstance(target, int):
            # Explicit (height, width) pair: use it as given.
            out_h, out_w = target
        elif height > width:
            # Portrait: width is the shorter edge and is pinned to target.
            out_h, out_w = target * height / width, target
        else:
            # Landscape or square: height is pinned to target.
            out_h, out_w = target, target * width / height

        return transform.resize(image, (int(out_h), int(out_w)))


class ToTensor(object):
    """Convert an image ndarray to a channel-first torch tensor."""

    def __call__(self, image):
        """Return ``image`` as a ``C x H x W`` tensor.

        Args:
            image: numpy array laid out as ``H x W x C``, or a 2-D ``H x W``
                array (single-channel image without a channel axis).

        Returns:
            A new tensor holding a copy of the data, with the channel axis
            first. A 2-D input yields a tensor of shape ``1 x H x W``.
        """
        if image.ndim == 2:
            # Grayscale images come without a channel axis; add one so the
            # axis swap below works instead of raising on a 2-D array.
            image = image[:, :, None]

        # swap color axis because
        # numpy image: H x W x C
        # torch image: C x H x W
        image = image.transpose((2, 0, 1))
        return torch.tensor(image)


# (height, width) every image is resized to
IMAGE_RESIZE = (256, 256)
# Image pipeline, applied in order: resize, then convert the
# H x W x C ndarray to a C x H x W tensor.
img_transform = transforms.Compose(
    [
        Rescale(IMAGE_RESIZE),
        ToTensor(),
    ]
)


### Captions Preprocessing

In [11]:
class CaptionsPreprocessing:
    """Load captions from a TSV file, build a vocabulary and tokenize captions.

    Each non-blank line of the file holds an image id and its caption,
    separated by a tab. On construction the file is read, every caption is
    wrapped in ``[START]`` / ``[END]`` markers and a word -> index vocabulary
    is built.

    Args:
        captions_file_path (string): captions tsv file path
    """
    def __init__(self, captions_file_path):
        self.captions_file_path = captions_file_path

        # Longest caption length in tokens, including [START] and [END];
        # updated while reading the file.
        self.maxLen = 0

        # Raw caption string keyed by image id (string)
        self.raw_captions_dict = self.read_raw_captions()

        # Caption string with [START] / [END] markers, keyed by image id
        self.captions_dict = self.process_captions()

        # Word -> integer index
        self.vocab = self.generate_vocabulary()

    def read_raw_captions(self):
        """Read the captions file.

        Blank lines are skipped. As a side effect ``self.maxLen`` is raised
        to the word count of the longest caption plus two (room for the
        [START] and [END] markers).

        Returns:
            Dictionary with the raw caption string keyed by image id
            (string, first column of the file). If an id occurs on several
            lines, the last caption wins.

        Raises:
            IndexError: if a non-blank line has no tab-separated caption.
        """
        captions_dict = {}
        with open(self.captions_file_path, 'r', encoding='utf-8') as f:
            # Iterate lazily instead of loading the whole file with readlines().
            for line in f:
                line = line.strip()
                if not line:
                    # e.g. a trailing empty line at the end of the file;
                    # indexing its fields would raise IndexError.
                    continue
                fields = line.split('\t')
                image_id, caption = fields[0], fields[1]
                captions_dict[image_id] = caption
                self.maxLen = max(self.maxLen, len(caption.split()) + 2)

        return captions_dict

    def process_captions(self):
        """Wrap every raw caption in [START] and [END] markers.

        Returns:
            Dictionary with the marked-up caption string keyed by image id.
        """
        return {
            image_id: '[START] ' + caption + ' [END]'
            for image_id, caption in self.raw_captions_dict.items()
        }

    def generate_vocabulary(self):
        """Build the word -> index vocabulary from the processed captions.

        Index 0 is reserved for ``[PAD]``. Words get consecutive indices in
        order of first appearance; ``[START]`` and ``[END]`` receive the two
        indices after the last word.

        Returns:
            Dictionary mapping each token (string) to its integer index.
        """
        vocab = {'[PAD]': 0}
        idx = 1
        for caption in self.captions_dict.values():
            for word in caption.split():
                if word in ('[START]', '[END]'):
                    # Markers are appended after all regular words below.
                    continue
                if word not in vocab:
                    vocab[word] = idx
                    idx += 1
        vocab['[START]'] = idx
        vocab['[END]'] = idx + 1

        return vocab

    def captions_transform(self, img_caption):
        """Convert a processed caption into a padded tensor of token indices.

        Args:
            img_caption: caption for a particular image (whitespace-separated
                tokens, all of which must be in the vocabulary)

        Returns:
            1-D tensor of token indices, right-padded with the [PAD] index up
            to ``self.maxLen``. Captions longer than ``self.maxLen`` are not
            truncated.

        Raises:
            KeyError: if the caption contains a word missing from the vocabulary.
        """
        vocab = self.vocab

        tokens = [vocab[word] for word in img_caption.split()]
        # A non-positive pad count yields an empty list, so longer captions
        # pass through unchanged.
        tokens.extend([vocab['[PAD]']] * (self.maxLen - len(tokens)))
        return torch.tensor(tokens)

# Location of the tab-separated captions file
CAPTIONS_FILE_PATH = 'Train_text.tsv'
# Reads the file and builds the vocabulary on construction
captions_preprocessing_obj = CaptionsPreprocessing(
    captions_file_path=CAPTIONS_FILE_PATH,
)

### Dataset Class

In [14]:
class ImageCaptionsDataset(Dataset):
    """Dataset that yields one image together with its caption per index."""

    def __init__(self, img_dir, captions_dict, img_transform=None, captions_transform=None):
        """
        Args:
            img_dir (string): Directory with all the images. It is stored on
                the instance only; images are read using the keys of
                ``captions_dict`` as given, without prepending this directory.
            captions_dict: Dictionary of captions keyed by image path (string).
            img_transform (callable, optional): Transform applied to each
                loaded image.
            captions_transform (callable, optional): Transform applied to each
                caption.
        """
        self.img_dir = img_dir
        self.captions_dict = captions_dict
        self.img_transform = img_transform
        self.captions_transform = captions_transform

        # Fixed ordering of the keys so integer indices map to images.
        self.image_ids = list(captions_dict)

    def __len__(self):
        """Number of (image, caption) pairs."""
        return len(self.image_ids)

    def __getitem__(self, idx):
        """Load the idx-th image and caption, applying the transforms if set.

        Returns:
            Dictionary with keys ``'image'`` and ``'captions'``.
        """
        image_path = self.image_ids[idx]
        image = io.imread(image_path)
        caption = self.captions_dict[image_path]

        image = self.img_transform(image) if self.img_transform else image
        caption = self.captions_transform(caption) if self.captions_transform else caption

        return {'image': image, 'captions': caption}

### Model Architecture

In [22]:
class Encoder(nn.Module):
    """Convolutional image encoder.

    A 7x7 stem convolution with ReLU and a 3x3 / stride-2 max-pool are
    followed by three bottleneck groups (1x1 -> 3x3 -> 1x1 convolutions, each
    followed by ReLU) ending in 256 channels. All layers run strictly in
    sequence inside one ``nn.Sequential``.
    """

    @staticmethod
    def _bottleneck(in_channels, mid_channels, out_channels):
        """Return the six layers of one 1x1 -> 3x3 -> 1x1 bottleneck group."""
        return [
            nn.Conv2d(in_channels, mid_channels, kernel_size=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(mid_channels, mid_channels, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(mid_channels, out_channels, kernel_size=1),
            nn.ReLU(inplace=True),
        ]

    def __init__(self):
        super().__init__()

        # Stem
        layers = [
            nn.Conv2d(3, 64, kernel_size=7, padding=1),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=3, stride=2),
        ]

        # batch 1: three bottleneck groups, 64 inner / 256 output channels.
        # The first group takes the 64-channel stem output, the others the
        # 256-channel output of the previous group.
        for in_channels in (64, 256, 256):
            layers.extend(self._bottleneck(in_channels, 64, 256))

        # Deeper stages were drafted with the same bottleneck pattern but are
        # currently disabled:
        #   batch 2:  4 groups, 128 inner /  512 output channels
        #   batch 3: 23 groups, 256 inner / 1024 output channels
        #   batch 4:  3 groups, 512 inner / 2048 output channels
        # The first group of each stage takes the previous stage's output.

        self.cnn = nn.Sequential(*layers)

    def forward(self, image_batch):
        """Encode a batch of images.

        Args:
            image_batch: tensor of shape (batch, 3, H, W).

        Returns:
            Feature tensor with the channel axis moved last:
            (batch, H', W', 256).
        """
        # Forward Propagation
        return self.cnn(image_batch).permute(0, 2, 3, 1)


In [None]:
class Decoder(nn.Module):
    """LSTM caption decoder that is fed an image feature as its first step.

    Args:
        embed_size: size of the word embeddings; the image feature vector
            must have the same size because both are concatenated into one
            input sequence.
        hidden_size: LSTM hidden state size.
        vocab_size: number of tokens in the vocabulary.
    """
    def __init__(self, embed_size, hidden_size, vocab_size):
        super(Decoder, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.lstm = nn.LSTM(embed_size, hidden_size, batch_first=True)
        self.linear = nn.Linear(hidden_size, vocab_size)
        # Normalise over the vocabulary axis explicitly. Omitting ``dim``
        # relies on a deprecated implicit choice and warns on every call.
        self.softmax = nn.Softmax(dim=-1)
        # Currently not applied in forward().
        self.dropout = nn.Dropout(p=0.5, inplace = True)

    def forward(self, img_features, text_captions, lengths):
        """Predict a token distribution for every non-padded time step.

        Args:
            img_features: tensor of shape (batch, embed_size), used as the
                first time step of every sequence.
            text_captions: integer token ids of shape (batch, T).
            lengths: number of valid time steps per sequence, counting the
                prepended image step (so at most T + 1). Must be sorted in
                decreasing order, as required by ``pack_padded_sequence``
                with its default ``enforce_sorted=True``.

        Returns:
            Tensor of shape (sum(lengths), vocab_size) with softmax
            probabilities, in packed-sequence (time-major) order.

        NOTE(review): ``nn.CrossEntropyLoss`` expects unnormalised scores; if
        this output is passed to it, softmax is effectively applied twice.
        """
        embeddings = self.embedding(text_captions)
        # Prepend the image feature as step 0: (batch, T + 1, embed_size)
        time_steps = torch.cat((img_features.unsqueeze(1), embeddings), dim=1)
        # embeddings = self.dropout(embeddings)
        packed_seq = rnn_utils.pack_padded_sequence(time_steps, lengths, batch_first=True)
        packed_out, _ = self.lstm(packed_seq)
        # ``.data`` holds the hidden states of all valid steps, padding removed.
        outputs = self.softmax(self.linear(packed_out.data))

        return outputs


In [None]:
class ImageCaptionsNet(nn.Module):
    """Skeleton of the full captioning model; no architecture is defined yet."""

    def __init__(self):
        super(ImageCaptionsNet, self).__init__()

        # Define your architecture here

    def forward(self, x):
        """Placeholder forward pass.

        Args:
            x: pair ``(image_batch, captions_batch)``.

        Returns:
            ``captions_batch`` unchanged, until a real architecture is added.
        """
        # Unpack the input pair. The assignment used to be written the other
        # way round, which read undefined (or global) names instead of ``x``.
        image_batch, captions_batch = x

        # Forward Propagation

        return captions_batch

net = ImageCaptionsNet()

# Move the model to the GPU only when one is present; an unconditional
# .cuda() raises on CPU-only machines.
if torch.cuda.is_available():
    net = net.cuda()

### Training Loop

In [23]:
IMAGE_DIR = 'train_data'

# Dataset pairing every image with its tokenised caption
train_dataset = ImageCaptionsDataset(
    img_dir=IMAGE_DIR,
    captions_dict=captions_preprocessing_obj.captions_dict,
    img_transform=img_transform,
    captions_transform=captions_preprocessing_obj.captions_transform,
)

# Hyperparameters
NUMBER_OF_EPOCHS = 3
LEARNING_RATE = 1e-1
BATCH_SIZE = 32
NUM_WORKERS = 0  # worker count handed to the DataLoader
loss_function = nn.CrossEntropyLoss()
# optimizer = optim.SGD(net.parameters(), lr=LEARNING_RATE)

# Shuffled mini-batch loader over the dataset
train_loader = DataLoader(
    train_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
    num_workers=NUM_WORKERS,
)
encoder = Encoder()
# encoder = encoder.cuda()

for epoch in range(NUMBER_OF_EPOCHS):
    for batch_idx, sample in enumerate(train_loader):
        # clear gradients (disabled until the full model is wired in)
        # net.zero_grad()

        captions_batch = sample['captions']
        # Cast to float32 before running the encoder.
        image_batch = sample['image'].float()
        # If GPU training required
        # image_batch, captions_batch = image_batch.cuda(), captions_batch.cuda()

        # Debug output: input dtype, encoder output shape, first caption
        print(image_batch.dtype)
        features = encoder(image_batch)
        print(features.shape)
        print(captions_batch[0])

        # Remaining training step, still disabled:
        # output_captions = net((image_batch, captions_batch))
        # loss = loss_function(output_captions, captions_batch)
        # loss.backward()
        # optimizer.step()
    print(f"Iteration: {epoch + 1}")

torch.float32
torch.Size([32, 125, 125, 256])
tensor([7736,    6,   16,    7,   44,   19,   80, 7737,    0,    0])
torch.float32
torch.Size([32, 125, 125, 256])
tensor([7736,    6,   25,   44,   54,  594,   50,    6, 7737,    0])
torch.float32
torch.Size([32, 125, 125, 256])
tensor([7736,   10,   18,  173,   78,    6, 1012,  839, 7737,    0])
torch.float32
torch.Size([32, 125, 125, 256])
tensor([7736,    6, 1657,   14,  331,   46, 7737,    0,    0,    0])
torch.float32
torch.Size([32, 125, 125, 256])
tensor([7736,    6,  298,  156,   78,   74, 7737,    0,    0,    0])
torch.float32


KeyboardInterrupt: 