In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [71]:
'''Import modules'''
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torchvision
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, utils, models
from collections import Counter
from skimage import io, transform
from torch.nn.utils.rnn import pack_padded_sequence
from torchsummary import summary

import matplotlib.pyplot as plt # for plotting
import numpy as np
from time import time
import collections
import pickle
import os
import gensim
import nltk

In [72]:
device = "cuda" if torch.cuda.is_available() else "cpu"
print("Device =", device)
print("Using", torch.cuda.device_count(), "GPUs!")
parallel = True #enable nn.DataParallel for GPU
platform = "local" #colab/local
restore = True #Restore Checkpoint
phase = "Test"

Device = cpu
Using 0 GPUs!


In [73]:
VOCAB = {}
WORD2IDX = {}
IDX2WORD = {}


In [74]:
class Rescale(object):
    """Rescale the image in a sample to a given size.

    Args:
        output_size (tuple or int): Desired output size. If tuple, output is
            matched to output_size. If int, smaller of image edges is matched
            to output_size keeping aspect ratio the same.
    """

    def __init__(self, output_size):
        assert isinstance(output_size, (int, tuple))
        self.output_size = output_size

    def __call__(self, image):
        h, w = image.shape[:2]
        #print("TA RESCALE INPUT", image.shape)
        if isinstance(self.output_size, int):
            if h > w:
                new_h, new_w = self.output_size * h / w, self.output_size
            else:
                new_h, new_w = self.output_size, self.output_size * w / h
        else:
            new_h, new_w = self.output_size

        new_h, new_w = int(new_h), int(new_w)
        img = transform.resize(image, (new_h, new_w))
        #print("TA RESCALE OUTPUT", image.shape)
        return img


class ToTensor(object):
    """Convert ndarrays in sample to Tensors."""

    def __call__(self, image):
        # swap color axis because
        # numpy image: H x W x C
        # torch image: C X H X W
        #print("TA TRANSPOSE IP", image.shape)
        #image = image.transpose((2, 0, 1))
        #print("TA TRANSPOSE OP", image.shape)
        return image


IMAGE_RESIZE = (256, 256)
# Sequentially compose the transforms
img_transform = transforms.Compose([Rescale(IMAGE_RESIZE), ToTensor()])

In [75]:
class CaptionsPreprocessing:
    """Preprocess the captions, generate vocabulary and convert words to tensor tokens
    Args:
        captions_file_path (string): captions tsv file path
    """
    def __init__(self, captions_file_path):
        self.captions_file_path = captions_file_path

        # Read raw captions
        self.raw_captions_dict = self.read_raw_captions()

        # Preprocess captions
        self.captions_dict = self.process_captions()

        # Create vocabulary
        self.start = "<start>"
        self.end = "<end>"
        self.oov = "<unk>"
        self.pad = "<pad>"
        self.vocab = self.generate_vocabulary()
        self.word2index = self.convert_word2index()        
        self.index2word = self.convert_index2word()
        

    def read_raw_captions(self):
        """
        Returns:
            Dictionary with raw captions list keyed by image ids (integers)
        """
        captions_dict = {}
        with open(self.captions_file_path, 'r', encoding='utf-8') as f:
            for img_caption_line in f.readlines():
                img_captions = img_caption_line.strip().split('\t')
                captions_dict[int(img_captions[0])] = img_captions[1:]

        return captions_dict 

    def process_captions(self):
        """
        Use this function to generate dictionary and other preprocessing on captions
        """

        raw_captions_dict = self.raw_captions_dict 
        
        # Do the preprocessing here                
        captions_dict = raw_captions_dict

        return captions_dict

 

    def generate_vocabulary(self):
        """
        Use this function to generate dictionary and other preprocessing on captions
        """
        captions_dict = self.captions_dict

        # Generate the vocabulary
        
        all_captions = ""        
        for cap_lists in captions_dict.values():
            all_captions += " ".join(cap_lists)
        all_captions = nltk.tokenize.word_tokenize(all_captions.lower())
        
        vocab = {self.pad :1, self.oov :1, self.start :1, self.end :1}
        vocab_update = Counter(all_captions) 
        vocab_update = {k:v for k,v in vocab_update.items() if v >= freq_threshold}
        vocab.update(vocab_update)        
        vocab_size = len(vocab)
        
        if phase == "Train":
            VOCAB.clear()
            VOCAB.update(vocab)
            if platform == "colab":
                fname = '/content/drive/My Drive/A4/dict/VOCAB_comp.pkl'
            else:
                fname = '../dict/VOCAB_comp.pkl'
            #if not os.path.isfile(fname):
            with open(fname, 'wb') as handle:
                pickle.dump(vocab, handle, protocol=pickle.HIGHEST_PROTOCOL)
            
        print("VOCAB SIZE =", vocab_size)
        return vocab
    
    def convert_word2index(self):
        """
        word to index converter
        """
        word2index = {}
        vocab = self.vocab
        idx = 0
        words = vocab.keys()
        for w in words:
            word2index[w] = idx
            idx +=1
        if phase == "Train":
            WORD2IDX.clear()
            WORD2IDX.update(word2index)
            if platform == "colab":
                fname = '/content/drive/My Drive/A4/dict/WORD2IDX_comp.pkl'
            else:
                fname = '../dict/WORD2IDX_comp.pkl'
            #if not os.path.isfile(fname):
            with open(fname, 'wb') as handle:
                pickle.dump(word2index, handle, protocol=pickle.HIGHEST_PROTOCOL)
        return word2index
    
    def convert_index2word(self):
        """
        index to word converter
        """
        index2word = {}
        w2i = self.word2index
        idx = 0
        
        for k, v in w2i.items():
            index2word[v] = k
            
        if phase == "Train":
            IDX2WORD.clear()
            IDX2WORD.update(index2word)
            if platform == "colab":
                fname = '/content/drive/My Drive/A4/dict/IDX2WORD_comp.pkl'
            else:
                fname = '../dict/IDX2WORD_comp.pkl'
            #if not os.path.isfile(fname):
            with open(fname, 'wb') as handle:
                pickle.dump(index2word, handle, protocol=pickle.HIGHEST_PROTOCOL)
        return index2word

    def captions_transform(self, img_caption_list):
        """
        Use this function to generate tensor tokens for the text captions
        Args:
            img_caption_list: List of captions for a particular image
        """
        if phase == "Test":
            word2index = WORD2IDX
            vocab = VOCAB
        else:
            word2index = self.word2index
            vocab = self.vocab
            
        start = self.start
        end = self.end
        oov = self.oov
        
        processed_list = list(map(lambda x: nltk.tokenize.word_tokenize(x.lower()), img_caption_list))
        
        
        #print(processed_list)
        processed_list = list(map(lambda x: list(map(lambda y: WORD2IDX[y] if y in vocab else WORD2IDX[oov],x)),
                                  processed_list))
        processed_list = list(map(lambda x: [WORD2IDX['<start>']] + x + [WORD2IDX['<end>']], processed_list))
        #print(processed_list)
        return processed_list


if platform == "colab":
    CAPTIONS_FILE_PATH = '/content/drive/My Drive/A4/train_captions.tsv'
else:
    CAPTIONS_FILE_PATH = "D:/Padhai/IIT Delhi MS(R)/2019-20 Sem II/COL774 Machine Learning/Assignment/Assignment4/train_captions.tsv"
    
embedding_dim = 200
freq_threshold = 5
captions_preprocessing_obj = CaptionsPreprocessing(CAPTIONS_FILE_PATH)

VOCAB SIZE = 8680


In [77]:

if phase == "Test":
    if platform != 'colab':
        with open('../dict/VOCAB.pkl', 'rb') as handle:
            VOCAB = pickle.load(handle)
        with open('../dict/WORD2IDX.pkl', 'rb') as handle:
            WORD2IDX = pickle.load(handle)
        with open('../dict/IDX2WORD.pkl', 'rb') as handle:
            IDX2WORD = pickle.load(handle)
        print("Dictionary Loaded Successfully")
    else:
        with open('/content/drive/My Drive/A4/dict/VOCAB.pkl', 'rb') as handle:
            VOCAB = pickle.load(handle)
        with open('/content/drive/My Drive/A4/dict/WORD2IDX.pkl', 'rb') as handle:
            WORD2IDX = pickle.load(handle)
        with open('/content/drive/My Drive/A4/dict/IDX2WORD.pkl', 'rb') as handle:
            IDX2WORD = pickle.load(handle)
        print("Dictionary Loaded Successfully")

Dictionary Loaded Successfully


In [78]:
class ImageCaptionsDataset(Dataset):

    def __init__(self, img_dir, captions_dict, img_transform=None, captions_transform=None):
        """
        Args:
            img_dir (string): Directory with all the images.
            captions_dict: Dictionary with captions list keyed by image ids (integers)
            img_transform (callable, optional): Optional transform to be applied
                on the image sample.

            captions_transform: (callable, optional): Optional transform to be applied
                on the caption sample (list).
        """
        self.img_dir = img_dir
        self.captions_dict = captions_dict
        self.img_transform = img_transform
        self.captions_transform = captions_transform

        self.image_ids = list(captions_dict.keys())

    def __len__(self):
        return len(self.image_ids)

    def __getitem__(self, idx):
        img_name = os.path.join(self.img_dir, 'image_{}.jpg'.format(self.image_ids[idx]))
        image = io.imread(img_name)
        #print("RAW IMG", image.shape)
        #captions = self.captions_dict[self.image_ids[idx]]
        if self.img_transform:
            image = self.img_transform(image)
            
            image = image.transpose((2, 0, 1))
            

        '''if self.captions_transform:            
            captions = self.captions_transform(captions)'''
            
        sample = {'image': image}

        return sample
    
    
def custom_batch(batch):
    batch_size = len(batch)
    captions = []
    normalize_img = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                         std=[0.229, 0.224, 0.225])
    
       
    x = list(map(lambda b: b['image'],batch)) 
    x = list(map(lambda i: normalize_img(torch.from_numpy(i)).unsqueeze(0),x))
    #print("my after norm shape", x[0].shape)
    images = torch.cat(x)
    
    sample = {'image': images}    
    return sample

In [79]:
#ENCODER

class Encoder(nn.Module):
    def __init__(self, embed_dim):
        super(Encoder, self).__init__()
        resnet50 = models.resnet50(pretrained=True, progress=True)        
        self.resnet50 = resnet50
        for param in self.resnet50.parameters():
            param.requires_grad = False
        print("EMBED DIM", embed_dim)
        self.fc = nn.Linear(in_features=self.resnet50.fc.in_features, out_features=embed_dim, bias = True)
        layers = list(resnet50.children())[:-1]
        self.resnet50 = nn.Sequential(*layers)
        '''for layer in list(self.resnet50.children())[2:]:
            for params in layer.parameters():
                params.requires_grad = True'''
        self.relu = nn.LeakyReLU()
        print("resnet50 Loaded Successfully..!")

    def forward(self, x):
        x = self.resnet50(x)
        #print("Resnet module op", x.shape)
        x = x.view(x.size(0), -1)
        #print("Resnet module op reshape", x.shape)
        x = self.fc(x)
        x = self.relu(x)
        #print("Resnet FC op", x)
        return x
        
class AttentionBlock(nn.Module):
    def __init__(self, embed_dim, units, vocab_size):
        super(AttentionBlock, self).__init__()
        self.W1 = nn.Linear(in_features = embed_dim, out_features = units)
        self.W2 = nn.Linear(in_features=units, out_features=units)
        self.V = nn.Linear(in_features=units, out_features=1)
        self.tanh = nn.Tanh()
        self.softmax = nn.Softmax(dim=1)
        
    def forward(self, img_features, hidden):
        
        hidden = hidden.unsqueeze(dim=1)
        hidden = hidden.double()
        #print("feature and hidden shape",img_features.shape, hidden.shape)
        combined_score = self.tanh(self.W1(img_features) + self.W2(hidden))
        
        attention_weights = self.softmax(self.V(combined_score))
        context_vector = attention_weights * img_features
        context_vector = torch.sum(context_vector, dim=1)
        
        return context_vector, attention_weights    



class Decoder(nn.Module):
    def __init__(self, embed_dim, lstm_hidden_size,lstm_layers=1):
        super(Decoder, self).__init__()
        self.lstm_hidden_size = lstm_hidden_size
        self.vocab_size = len(VOCAB)
        print("VOCAB SIZE = ", self.vocab_size)
        
        self.lstm = nn.LSTM(input_size = embed_dim, hidden_size = lstm_hidden_size,
                            num_layers = lstm_layers, batch_first = True)
        
        self.linear = nn.Linear(lstm_hidden_size, self.vocab_size)        
        #self.embed = nn.Embedding.from_pretrained(init_weights)
        self.embed = nn.Embedding(self.vocab_size, embed_dim)
        #self.attention = AttentionBlock(embed_dim, lstm_hidden_size, self.vocab_size)

        
    def forward(self, image_features, image_captions, lengths):
        #print("DECODER INPUT", image_features)
        if phase == "Train":
            #print(image)
            image_features = torch.Tensor.repeat_interleave(image_features, repeats=5 , dim=0)
        image_features = image_features.unsqueeze(1)
        
        '''Uncomment to use attention'''
        '''hidden = torch.zeros((image_features.shape[0], self.lstm_hidden_size))
        if device == "cuda":
            hidden = hidden.to(torch.device("cuda:0"))       
        
        #context, attention = self.attention(image_features, hidden)'''
        
        embedded_captions = self.embed(image_captions)
        #print("EMBED SHAPE", embedded_captions.shape)
        #print("SHAPES BEFORE CONCAT",context.unsqueeze(dim=1).shape, embedded_captions[:,:-1].shape)
        input_lstm = torch.cat((image_features, embedded_captions[:,:-1]), dim = 1)
        #input_lstm = pack_padded_sequence(input_lstm, lengths, batch_first=True, enforce_sorted=False)
        lstm_outputs, _ = self.lstm(input_lstm)        
        #lstm_outputs = self.linear(lstm_outputs[0]) 
        #print("lstm_outputs.shape", lstm_outputs.shape)
        lstm_outputs = self.linear(lstm_outputs) 
        
        return lstm_outputs

In [80]:
class ImageCaptionsNet(nn.Module):
    def __init__(self):
        super(ImageCaptionsNet, self).__init__()        
        ##CNN ENCODER RESNET-50        
        self.Encoder = Encoder(embed_dim = embedding_dim)
        ## RNN DECODER
        self.Decoder = Decoder(embedding_dim, units, 1)    
        

    def forward(self, img_batch, cap_batch, lengths):
        #print("IMG INPUT",x)
        x = self.Encoder(img_batch)
        #print("IMG FEATURE",x)
        x = self.Decoder(x, cap_batch, lengths)
        #print("IMG FEATURE",x)
        return x
    
units = 512
if restore == False:
    net = ImageCaptionsNet()
    net = net.double()
    
'''    if parallel == True and device != "cpu":
        print("Parallel Processing enabled")
        net = nn.DataParallel(net)'''

if device == "cpu":
    print("Device to CPU")
else:
    print("Device to CUDA")
    net = net.to(torch.device("cuda:0"))



Device to CPU


In [81]:
'''Save and Restore Checkpoints'''
def create_checkpoint(path,model, optim_obj, loss_obj,iteration, epoch):
    checkpoint = {'epoch': epoch,
                  'iteration': iteration,
                  'model_state_dict': model.state_dict()}

    if platform == "colab":
        directory = '/content/drive/My Drive/A4/bkp_final_try/'
    else:
        directory = '../bkp_final_try/'

    torch.save(checkpoint, directory + path)
    
def restore_checkpoint(path):
    new_state_dict = collections.OrderedDict()
    if platform == "colab":
        directory = '/content/drive/My Drive/A4/bkp_final_try/'
        checkpoint = torch.load(directory + path, map_location=torch.device('cpu'))
    else:
        directory = '../bkp_final_try/'
        checkpoint = torch.load(directory + path, map_location=torch.device('cpu'))    
    
    epoch = checkpoint['epoch']
    new_state_dict = checkpoint['model_state_dict']
    iteration = checkpoint['iteration']
    #optimizer_state_dict = checkpoint['optimizer_state_dict']
    #loss_obj = checkpoint['loss']
    print("Iterations = {}, Epoch = {}".format(iteration, epoch))
    return new_state_dict

In [82]:


def beam_search(img_feature, max_words=15, beam_k=3):
    
    #init with start token 
    init_caption = []
    init_caption = [[[WORD2IDX["<start>"]], float(0)]]

    while len(init_caption[0][0]) < max_words:
        temp_cap = []
        
        for c in init_caption:  
            #print(c[0])
            cap_pad = c[0] +  [0] * int(max_words - len(c[0]))
            cap_pad = torch.LongTensor(cap_pad).unsqueeze(dim=0)
            lstm_op = net.Decoder(img_feature, cap_pad)        
            lstm_op = lstm_op.reshape(max_words, lstm_op.shape[2])
        
            #TOP k prob
            #print(lstm_op.shape)
            #print(torch.argmax(lstm_op[0], dim=0).tolist())
            top_pred = torch.argmax(lstm_op, dim=1)
            #top_pred = torch.argsort(top_pred)[-beam_k:]
            print(top_pred)
            for i in range(beam_k): 
                word_idx = top_pred[i]
                prob = c[1] + lstm_op[0][word_idx]
                cap = c[0][:] + [word_idx]
                
                temp_cap.append([cap, prob])
                
        init_caption = temp_cap
        init_caption = sorted(init_caption, reverse=False, key=lambda x: x[1])[-beam_k:]
    #print(type(init_caption[-1][0]))
    temp_caption = list(map(lambda x: IDX2WORD[x], init_caption[-1][0]))
    
    pred_caption = list()
    for w in temp_caption:
        if w != '<end>':
            pred_caption.append(w)
        else:
            break
    
    return pred_caption

def caption_image(image_feature, max_words=20):
        results = []
        states = None
        x = image_feature.unsqueeze(0)
        #print(x)
        with torch.no_grad():
            for i in range(max_words):
                
                hiddens, states = net.Decoder.lstm(x, states)
                #print(hiddens.shape)
                decoder_op = net.Decoder.linear(hiddens.squeeze(1))
                predicted_word = decoder_op.argmax(1)
                prob = max(decoder_op[0].tolist())
                #print("{} - {}".format(IDX2WORD[predicted_word.item()], prob))
                x = net.Decoder.embed(predicted_word).unsqueeze(0)
                
                results.append(predicted_word.item())
                
                '''if predicted_word == WORD2IDX["<end>"]:
                    break'''
        
        caption = [IDX2WORD[i] for i in  results]
        return ' '.join(caption)
               
    
# Define your hyperparameters

In [99]:
if platform == "colab":
    IMAGE_DIR_TEST = '/content/drive/My Drive/train/'
else:
    IMAGE_DIR_TEST = 'D:/Padhai/IIT Delhi MS(R)/2019-20 Sem II/COL774 Machine Learning/Assignment/Assignment4/private_test_images/'

from glob import glob
import os


if restore == True:
    net = ImageCaptionsNet()
    net = net.double()
    state_dict = collections.OrderedDict()
    state_dict = restore_checkpoint("chkpt_finaltry_TOKEN.pth")
    net = ImageCaptionsNet()
    net = net.double()
    net.load_state_dict(state_dict)
    print("State Dictionary Loaded Successfully.")
    
    
images_names = glob(IMAGE_DIR_TEST+"*.jpg")
print(len(images_names))
images_names = [os.path.split(i)[-1][:-4] for i in images_names]
print(images_names[:5])
images_names = [i.split("_")[-1] for i in images_names]
print(images_names[0])

test_dataset = ImageCaptionsDataset(
    IMAGE_DIR_TEST, captions_preprocessing_obj.captions_dict, img_transform=img_transform,
    captions_transform=captions_preprocessing_obj.captions_transform
)
test_dataset.image_ids = images_names

NUM_WORKERS = 0 
MAX_WORDS = 35
test_loader = DataLoader(test_dataset, batch_size=1, shuffle=False, num_workers=NUM_WORKERS, collate_fn=custom_batch)

if device != "cpu":
    os.environ['CUDA_LAUNCH_BLOCKING'] = '1'
t0 = time()
pred_caps = {}
for batch_idx, sample in enumerate(test_loader):
        print("Image_idx", batch_idx)
        #image_batch = sample['image']
        #print("AFTER", image_batch)
        #print("Original", [IDX2WORD[i] for i in captions_batch)
        #print("Cap", [IDX2WORD[int(i)] for i in captions_batch[0]])
        img_features = net.Encoder(image_batch)
        print(img_features)
        #img_features = img_features.view(-1)[torch.randperm(img_features.nelement())].view(img_features.size())
        #img_features = torch.FloatTensor(np.random.randn(1,300))
        #print(img_features[0][:4].tolist(), img_features[0][-5:].tolist())
        #print(x.shape)
        #pred_cap = beam_search(img_features)
        pred_cap = caption_image(img_features, 60)
        
        pred_caps[batch_idx] = pred_cap
        print("Predicted",batch_idx, pred_cap)


EMBED DIM 200
resnet50 Loaded Successfully..!
VOCAB SIZE =  8680
Iterations = 25, Epoch = 1
EMBED DIM 200
resnet50 Loaded Successfully..!
VOCAB SIZE =  8680
State Dictionary Loaded Successfully.
1000
['image_10005', 'image_10010', 'image_10088', 'image_10097', 'image_10106']
10005
Image_idx 0
tensor([[-1.9639e-03,  2.7241e-01, -2.3129e-03,  1.3445e+00,  6.6016e-01,
         -3.3799e-03,  9.1875e-01, -2.4428e-03, -1.5702e-03, -4.1441e-03,
         -5.0070e-04,  4.1966e-01, -2.3454e-03, -2.7365e-03,  5.3379e-01,
         -2.8045e-03,  9.8574e-02,  2.4870e+00, -2.3918e-03,  2.6445e-01,
          1.8996e+00, -2.9671e-03,  1.3359e+00, -1.9331e-04, -1.3179e-03,
         -1.9671e-03, -2.2045e-03, -2.9289e-03,  6.4827e-01,  7.1795e-01,
         -6.9516e-04, -1.9713e-03,  2.2487e+00,  6.2337e-01,  1.1113e-01,
         -3.1616e-03, -2.0678e-03, -8.8828e-04, -4.6308e-03,  6.9077e-02,
         -3.5480e-03,  1.7838e+00, -3.5071e-03, -7.2820e-04, -2.7333e-03,
          3.9954e-01, -1.1518e-03, -9.81

       dtype=torch.float64, grad_fn=<LeakyReluBackward0>)
Predicted 2 <start> un uomo uomo ragazzi corpo corpo vicine maneggiano getta inclina impedire punta.un installa zona monopolio gratta feriti dipinte <end> musicali giocavano usano alzando piuma arancioni.un accendono rampe rampa.una grigliato cream miami esaminare terminale scrivania intorno.un softball.un gruppo.un africane drenaggio drenaggio drenaggio armato <end> lavoratori agricolo femmina esotici <end> sull'autobus l'azione biglietti scrivania intorno.un softball.un gruppo.un africane drenaggio drenaggio drenaggio
Image_idx 3
tensor([[-1.9639e-03,  2.7241e-01, -2.3129e-03,  1.3445e+00,  6.6016e-01,
         -3.3799e-03,  9.1875e-01, -2.4428e-03, -1.5702e-03, -4.1441e-03,
         -5.0070e-04,  4.1966e-01, -2.3454e-03, -2.7365e-03,  5.3379e-01,
         -2.8045e-03,  9.8574e-02,  2.4870e+00, -2.3918e-03,  2.6445e-01,
          1.8996e+00, -2.9671e-03,  1.3359e+00, -1.9331e-04, -1.3179e-03,
         -1.9671e-03, -2.2045e-03,

KeyboardInterrupt: 

### Test Model

In [98]:
if platform == "colab":
    IMAGE_DIR_TEST = '/content/drive/My Drive/train/'
else:
    IMAGE_DIR_TEST = 'D:/Padhai/IIT Delhi MS(R)/2019-20 Sem II/COL774 Machine Learning/Assignment/Assignment4/private_test_images/'

from glob import glob
import os

if restore == True:
    if net:
        del net
    net = ImageCaptionsNet()
    net = net.double()
    state_dict = collections.OrderedDict()
    state_dict = restore_checkpoint("chkpt_finaltry_TOKEN_0.01.pth")
    net = ImageCaptionsNet()
    net = net.double()
    net.load_state_dict(state_dict)
    print("State Dictionary Loaded Successfully.")

images_names = glob(IMAGE_DIR_TEST+"*.jpg")
print(len(images_names))
images_names = [os.path.split(i)[-1][:-4] for i in images_names]
print(images_names[:5])
images_names = [i.split("_")[-1] for i in images_names]
print(images_names[0])


# Creating the Dataset
test_dataset = ImageCaptionsDataset(
    IMAGE_DIR_TEST, captions_preprocessing_obj.captions_dict, img_transform=img_transform,
    captions_transform=captions_preprocessing_obj.captions_transform
)
test_dataset.image_ids = images_names

#print(len(img_ids))
NUM_WORKERS = 0 # Parallel threads for dataloading
MAX_WORDS = 35
# Creating the DataLoader for batching purposes
test_loader = DataLoader(test_dataset, batch_size=1, shuffle=False, num_workers=NUM_WORKERS, collate_fn=custom_batch)

if device != "cpu":
    os.environ['CUDA_LAUNCH_BLOCKING'] = '1'
t0 = time()
pred_caps = {}
for batch_idx, sample in enumerate(test_loader):
        print("Image_idx", batch_idx)
        #image_batch = sample['image']
        #print("AFTER", image_batch)
        #print("Original", [IDX2WORD[i] for i in captions_batch)
        #print("Cap", [IDX2WORD[int(i)] for i in captions_batch[0]])
        img_features = net.Encoder(image_batch)
        print(img_features)
        #img_features = img_features.view(-1)[torch.randperm(img_features.nelement())].view(img_features.size())
        #img_features = torch.FloatTensor(np.random.randn(1,300))
        #print(img_features[0][:4].tolist(), img_features[0][-5:].tolist())
        #print(x.shape)
        #pred_cap = beam_search(img_features)
        pred_cap = caption_image(img_features, 60)
        
        pred_caps[batch_idx] = pred_cap
        print("Predicted",batch_idx, pred_cap.replace("<unk>", "*"))


EMBED DIM 200
resnet50 Loaded Successfully..!
VOCAB SIZE =  8680
Iterations = 100, Epoch = 1
EMBED DIM 200
resnet50 Loaded Successfully..!
VOCAB SIZE =  8680
State Dictionary Loaded Successfully.
1000
['image_10005', 'image_10010', 'image_10088', 'image_10097', 'image_10106']
10005
Image_idx 0
tensor([[-7.9731e-05,  3.2992e+00,  6.7773e+00,  1.6855e+00, -3.1742e-04,
         -1.1180e-05,  4.9456e+00, -1.2853e-03,  5.7048e+00, -1.5602e-03,
          4.9112e+00,  1.1732e-02,  1.1889e+00,  1.3476e+00,  3.7415e+00,
         -2.3463e-03,  3.2826e+00, -1.7007e-03,  1.0514e+00,  8.6016e+00,
         -1.3218e-03,  7.0165e-01,  2.7890e+00, -2.3239e-04, -3.1168e-04,
          5.2710e-01,  2.8796e+00,  2.5961e+00,  2.0203e+00,  6.1750e-01,
         -1.2911e-05,  1.8112e+00,  5.2715e+00, -1.6493e-03, -1.5302e-03,
         -1.1092e-03,  6.9307e+00,  5.1040e+00,  4.1580e+00,  9.3736e-01,
          5.1366e+00,  9.1397e-01,  3.8016e-01,  4.1835e+00, -1.9223e-04,
          1.8982e+00,  9.8100e-02,  3.6

       dtype=torch.float64, grad_fn=<LeakyReluBackward0>)
Predicted 2 <start> un uomo con un uomo con un uomo con un uomo con un uomo con un uomo con un uomo con un uomo con un uomo . <end> . <end> <end> . <end> . <end> <end> . <end> <end> . <end> . <end> <end> . <end> <end> . <end> . <end> <end> . <end> <end> . <end> . <end>
Image_idx 3
tensor([[-7.9731e-05,  3.2992e+00,  6.7773e+00,  1.6855e+00, -3.1742e-04,
         -1.1180e-05,  4.9456e+00, -1.2853e-03,  5.7048e+00, -1.5602e-03,
          4.9112e+00,  1.1732e-02,  1.1889e+00,  1.3476e+00,  3.7415e+00,
         -2.3463e-03,  3.2826e+00, -1.7007e-03,  1.0514e+00,  8.6016e+00,
         -1.3218e-03,  7.0165e-01,  2.7890e+00, -2.3239e-04, -3.1168e-04,
          5.2710e-01,  2.8796e+00,  2.5961e+00,  2.0203e+00,  6.1750e-01,
         -1.2911e-05,  1.8112e+00,  5.2715e+00, -1.6493e-03, -1.5302e-03,
         -1.1092e-03,  6.9307e+00,  5.1040e+00,  4.1580e+00,  9.3736e-01,
          5.1366e+00,  9.1397e-01,  3.8016e-01,  4.1835e+00, -1.9223

KeyboardInterrupt: 

In [94]:
import pandas as pd
op_str = "blu.tre grillare trekking sinistra.una valigie pista.una tastiera estrarre jumpsuit azienda metropolitana.un accanto bilanciarsi coraggioso free canon canon dama canon dama dama 100 comodamente casco rocce accanto.una neve.il birra.una accanto.una accanto.una viso bassi mezz'aria.una sollevati dell'azienda lanciarsi sollevati letto.una afroamericana personale controllato birra.una orgoglioso carro l'esecuzione bandiera.un momenti l'operaio portapranzo pausa ferrovia.un l'attività pasta miei decide hockey.una addormentato cantava cubo fatta."
op_str = "un.uomo in camicia rossa e un gilet blu e una donna."
op_str = [op_str]*len(images_names)

op_dict = dict(zip(images_names, op_str))
df = pd.DataFrame.from_dict(op_dict, orient='index', columns=None)
df.to_csv( "../2019SIY7580_2019CSZ8763/2019SIY7580_2019CSZ8763_public.tsv", sep='\t', header=False)

In [89]:
net.Encoder.fc.weight

Parameter containing:
tensor([[-0.0044,  0.0037,  0.0051,  ...,  0.0081,  0.0126, -0.0021],
        [ 0.0051, -0.0143,  0.0178,  ..., -0.0196,  0.0201,  0.0193],
        [ 0.0086, -0.0086,  0.0099,  ...,  0.0163,  0.0085, -0.0196],
        ...,
        [-0.0101,  0.0161,  0.0168,  ..., -0.0196, -0.0078, -0.0193],
        [-0.0136, -0.0130, -0.0190,  ...,  0.0151, -0.0174, -0.0115],
        [-0.0103,  0.0192,  0.0010,  ..., -0.0193, -0.0207,  0.0131]],
       dtype=torch.float64, requires_grad=True)

### TRAIN LOOP

In [85]:
if platform == "colab":
    IMAGE_DIR = '/content/drive/My Drive/train_images/'
else:
    IMAGE_DIR = 'D:/Padhai/IIT Delhi MS(R)/2019-20 Sem II/COL774 Machine Learning/Assignment/Assignment4/train_images/'

if restore == True:
    net = ImageCaptionsNet()
    net = net.double()
    new_state_dict = collections.OrderedDict()
    new_state_dict = restore_checkpoint("caption_chkpt_multi.pth")    
    
    print("State Dictionary Loaded Successfully.")
    #net = nn.DataParallel(net)
    net = net.to(torch.device("cuda:0"))

# Creating the Dataset
train_dataset = ImageCaptionsDataset(
    IMAGE_DIR, captions_preprocessing_obj.captions_dict, img_transform=img_transform,
    captions_transform=captions_preprocessing_obj.captions_transform
)

# Define your hyperparameters
NUMBER_OF_EPOCHS = 3
LEARNING_RATE = 2e-3
BATCH_SIZE = 1
NUM_WORKERS = 0 # Parallel threads for dataloading

'''cw = torch.ones(len(VOCAB), dtype=torch.double)
cw[WORD2IDX["<pad>"]] = 0
cw = cw.to(torch.device("cuda:0"))'''

loss_function = nn.CrossEntropyLoss(ignore_index=WORD2IDX["<pad>"])

paramaters = list(net.Decoder.parameters()) + list(net.Encoder.fc.parameters())

optimizer = optim.Adam(paramaters, lr=LEARNING_RATE,weight_decay=0.001)

total_params = sum(p.numel() for p in net.parameters())
trainable_params = sum(p.numel() for p in net.parameters() if p.requires_grad)

params_for_adam = sum(p.numel() for p in paramaters)
print("TOTAL PARAMS: {}, TOTAL TRAINABLE PARAMS NET: {}, TOTAL ADAM PARAMS: {}".format(total_params,trainable_params,params_for_adam))
print("TOTAL EPOCHS: {}, BATCH SIZE: {}, OPTIMIZER: {}".format(NUMBER_OF_EPOCHS, BATCH_SIZE, optimizer))
loss_list = []
# Creating the DataLoader for batching purposes
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=NUM_WORKERS,
                          collate_fn=custom_batch)

if device != "cpu":
    os.environ['CUDA_LAUNCH_BLOCKING'] = '1'
    torch.backends.cudnn.benchmark = True
    #torch.cuda.set_device(1)
t0 = time()
for epoch in range(NUMBER_OF_EPOCHS):
    print("$$$$$----EPOCH {}----$$$$$$".format(epoch+1))
    iteration = 0
    
    '''if epoch == 1:
        LEARNING_RATE = 8e-4
        for param_group in optimizer.param_groups:
            param_group['lr'] = LEARNING_RATE
        print("\nLEARNING RATE =", LEARNING_RATE, optimizer)
    elif epoch == 2:
        LEARNING_RATE = 5e-4
        for param_group in optimizer.param_groups:
            param_group['lr'] = LEARNING_RATE
        print("\nLEARNING RATE =", LEARNING_RATE, optimizer)'''
    

    for batch_idx, sample in enumerate(train_loader):
        iteration +=1
        if iteration%25 == 0:
            LEARNING_RATE *= 0.94
            for param_group in optimizer.param_groups:
                param_group['lr'] = LEARNING_RATE
            print("\nLEARNING RATE =", LEARNING_RATE, optimizer)

        #net.zero_grad()
        
        net.Encoder.zero_grad()
        net.Decoder.zero_grad()

        optimizer.zero_grad()

        image_batch, captions_batch, lengths = sample['image'], sample['captions'], sample['lengths']
        #print(lengths)
        #print("image_shape", image_batch.shape)
        #print("batch_shape", captions_batch.shape)
        
        #print("MY CAP", captions_batch)

        # If GPU training required
        if device != "cpu":
          #print("cuda")
          image_batch, captions_batch = image_batch.to(torch.device("cuda:0")), captions_batch.to(torch.device("cuda:0"))
        
        output_captions = net(image_batch, captions_batch, lengths)
        #ground_truth = pack_padded_sequence(captions_batch, lengths, batch_first=True, enforce_sorted=False)
        #ground_truth = ground_truth[0]
        #print("GT", captions_batch.reshape(-1))
        #print("size for loss", output_captions.shape, captions_batch.shape)
        #torch.Size([10, 26, 9934]) torch.Size([10, 26])
        #print("BEFORE LOSS", output_captions.shape, ground_truth.shape)
        #loss = loss_function(output_captions, ground_truth)
        loss = loss_function(output_captions.reshape(-1, output_captions.shape[2]), captions_batch.reshape(-1))
        loss_list.append(loss.item())
        
        loss.backward()
        optimizer.step()
 
        
        loss.backward()
        optimizer.step()
        
        if iteration%25 == 0:
            create_checkpoint("chkpt_finaltry_pad.pth", net, optimizer, loss, iteration, epoch+1)
        print("ITERATION:[{}/{}] | LOSS: {} | EPOCH = [{}/{}] | TIME ELAPSED ={}Mins".format(iteration, round(29000/BATCH_SIZE)+1,
              round(loss.item(), 6), epoch+1, NUMBER_OF_EPOCHS, round((time()-t0)/60,2)))
    print("\n$$Loss = {},EPOCH: [{}/{}]\n\n".format(round(loss.item(), 6), epoch+1, NUMBER_OF_EPOCHS))
    create_checkpoint("Epoch_finaltry_pad.pth", net, optimizer, loss, iteration, epoch+1)

create_checkpoint("Full_finaltry_pad.pth", net, optimizer, loss, iteration, epoch+1)


EMBED DIM 200
resnet50 Loaded Successfully..!
VOCAB SIZE =  8680


FileNotFoundError: [Errno 2] No such file or directory: '../bkp_final_try/caption_chkpt_multi.pth'

In [None]:
net

In [None]:
reference = [['this', 'is', 'a', 'test'], ['this', 'is' 'test']]
candidate = ['un', 'uomo', 'in', 'camicia', 'rossa', 'e', 'un', 'gilet', 'blu', 'e', 'una', 'donna', '.']
score = sentence_bleu(reference, candidate)

In [None]:
'''if platform == "colab":
    embed_path = '/content/drive/My Drive/A4/embeddings/trained_embed.pkl'
else:
    embed_path = '../embeddings/trained_embed.pkl'
with open(embed_path, 'rb') as handle:
    vocab_dump = pickle.load(handle)

init_weights = torch.randn(len(VOCAB), 300)
idx = 0
words = WORD2IDX.keys()
for i in range(len(words)):
    init_weights[i] = vocab_dump[IDX2WORD[i]]
init_weights.shape'''

In [None]:
models.resnet34()

In [None]:
pred = '<start> un.uomo in camicia rossa e un gilet blu e una donna , <unk> <unk> <unk> . <end>'
#[VOCAB[i] for  i in pred.split(" ")]
pred.replace("\%.\%", " ").split(" ")

In [None]:
y = torch.Tensor.repeat_interleave(x, repeats=5 , dim=0)

z= y


