In [None]:
#mount Google Drive so the notebook can reach its data files (Google Colaboratory only)
from google.colab import drive
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

In [None]:
#import libraries and modules
SEED = 1234
! pip install pyprind
import torchtext
import torch
import nltk
nltk.download('punkt')
import json
import pyprind
import pandas as pd
import numpy as np
import seaborn as sns
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torchtext.legacy import data
import matplotlib.pyplot as plt
%matplotlib inline
torch.backends.cudnn.deterministic = True
from sklearn.metrics import precision_score, recall_score, f1_score
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
torch.cuda.init()

In [None]:
#select the GPU when CUDA is available, otherwise fall back to the CPU
is_cuda = torch.cuda.is_available()
print("Cuda Status on system is {}".format(is_cuda))
if is_cuda:
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')

In [None]:
#unzip the archive of pt files containing image information (features and classes)
import tarfile
#the context manager closes the archive even when the extraction fails
#NOTE(review): later cells read from /content/drive/coco_object_features/features while this
#cell extracts under /content/drive/embedding - confirm that the archive layout matches
with tarfile.open('/content/drive/coco_object_features.tar.gz', 'r') as t:
    t.extractall("/content/drive/embedding")

In [None]:
#count the files found directly inside the features folder
import os
features_folder = "/content/drive/coco_object_features/features"
#the first item produced by os.walk describes the top-level folder itself
path, dirs, files = next(os.walk(features_folder))
file_count = len(files)
print(file_count)

In [None]:
#load training dataset and validation dataset from google drive
train_set = pd.read_csv("/content/drive/train.csv")
validation_set = pd.read_csv("/content/drive/validation.csv")
#shuffle the records; random_state=SEED gives the same order on every run
train_set = train_set.sample(frac=1, random_state=SEED)
validation_set = validation_set.sample(frac=1, random_state=SEED)

In [None]:
#load pre-trained Glove embeddings
vocab = torchtext.vocab.Vectors("/content/drive/glove.6B.300d.txt")
#append an all-zero vector used for padding at the end of the vocabulary (position 400000);
#its width is taken from the loaded vectors instead of being hard-coded
vector_400000 = torch.zeros(vocab.vectors.shape[1])
vocab.vectors = torch.cat((vocab.vectors, vector_400000.view(1, -1)), dim=0)

In [None]:
#create the required dataset class for the dataloader
class Dataset(torch.utils.data.Dataset):
    '''
    Map-style dataset pairing each caption with the stored data of its image.

    Every item is a tuple of four tensors:
      - the vocabulary indexes of the first `max_words` caption tokens,
      - the first `max_objects` rows of the image features,
      - the vocabulary indexes of the first `max_objects` image class names,
      - the label of the row.
    Sequences that are too short are right-padded (indexes with `pad_index`,
    features with zero rows) so that every item has the same shapes.

    The base class is referenced through its module path: this class keeps the
    name `Dataset`, so inheriting from the bare name would make it inherit from
    its own previous definition when the cell is executed again.

    Relies on the global `vocab` (its `stoi` mapping) defined earlier in the notebook.
    '''

    def __init__(self, dataframe, max_words=13, max_objects=5, pad_index=400000,
                 feature_dim=2048,
                 features_dir="/content/drive/coco_object_features/features/"):
        '''
        dataframe: table with the columns `caption`, `image_id` and `Label`
        max_words: number of caption token indexes returned per item
        max_objects: number of feature rows / class indexes returned per item
        pad_index: vocabulary index used for padding and for unknown words
        feature_dim: width of the zero rows used to pad the image features
        features_dir: folder with one pt file per image, named by the
                      image id left-padded with zeros to 12 characters
        '''
        self.samples = dataframe
        self.max_words = max_words
        self.max_objects = max_objects
        self.pad_index = pad_index
        self.feature_dim = feature_dim
        self.features_dir = features_dir

    #the size of the dataset (we can reduce it when needed Ex: return 28 for 28 lines)
    def __len__(self):
        return len(self.samples)

    def _lookup(self, token):
        '''Return the index of `token` in `vocab`, or `pad_index` when it is unknown.'''
        return vocab.stoi.get(token, self.pad_index)

    @staticmethod
    def _to_text(tag):
        '''Return `tag` as str, decoding it from UTF-8 when it is stored as bytes.'''
        return tag.decode("utf-8") if isinstance(tag, bytes) else str(tag)

    @staticmethod
    def _fit_length(indexes, length, pad_index):
        '''Truncate or right-pad the list `indexes` to exactly `length` entries.'''
        kept = list(indexes[:length])
        return kept + [pad_index] * (length - len(kept))

    @staticmethod
    def _pad_features(features, length, feature_dim):
        '''Keep the first `length` feature rows and append zero rows up to `length`.'''
        #as_tensor avoids the copy (and the warning) torch.tensor() causes on a tensor
        features = torch.as_tensor(features).detach()[:length]
        missing = length - len(features)
        if missing > 0:
            #all missing rows are appended at once, with the dtype of the features
            padding = features.new_zeros((missing, feature_dim))
            features = torch.cat((features, padding), dim=0)
        return features

    def __getitem__(self, idx):
        '''
        return current elements of the dataset(at index idx)
        '''
        import os  #local import: the class does not depend on another cell importing it

        #caption -> indexes of its words in the embedding ('vocab')
        caption = self.samples.caption.iloc[idx].lower()
        tokens = nltk.word_tokenize(caption)
        word_idx = self._fit_length([self._lookup(token) for token in tokens],
                                    self.max_words, self.pad_index)

        #image id -> pt file of that image (the id is zero-padded to 12 characters)
        image_id = self.samples.image_id.iloc[idx]
        file_image = os.path.join(self.features_dir, str(image_id).zfill(12) + '.pt')
        #torch.load unpickles the file: only feature files from a trusted source should be used
        images = torch.load(file_image)

        #class names -> indexes in the embedding ('vocab')
        class_names = [self._to_text(tag).lower() for tag in images['classes']]
        class_idx = self._fit_length([self._lookup(name) for name in class_names],
                                     self.max_objects, self.pad_index)

        data_features = self._pad_features(images['features'], self.max_objects,
                                           self.feature_dim)

        #get the label of the current line
        label = self.samples.Label.iloc[idx]

        #return: word indexes, image features, class indexes and label of the single sample
        return torch.tensor(word_idx), data_features, torch.tensor(class_idx), torch.tensor(label)

In [None]:
#wrap the training and validation dataframes into torch datasets
dataset_train, dataset_validation = Dataset(train_set), Dataset(validation_set)
#batch both datasets with the same loader settings
loader_options = dict(batch_size=32, num_workers=2)
dataloader_train = DataLoader(dataset_train, **loader_options)
dataloader_validation = DataLoader(dataset_validation, **loader_options)

In [None]:
#two recurrent neural networks (RNN), one per modality, followed by a linear classifier
class CRM(nn.Module): #cross retrieval match
    '''
    Scores how well a caption matches an image.

    One RNN reads the caption word embeddings; a second RNN reads, for each
    image entry, its feature vector concatenated with the embedding of its
    class name. The last hidden states of both RNNs are concatenated, passed
    through dropout and fed to a linear classifier.
    '''

    def __init__(self, vocab_dim, embedding_language_dim, embedding_classes_dim,
                 embedding_features_dim, hidden_dim, output_dim, pretrained_vectors=None):
        '''
        vocab_dim: number of rows of both embedding tables
        embedding_language_dim: width of the caption word embeddings
        embedding_classes_dim: width of the class name embeddings
        embedding_features_dim: width of one image feature vector
        hidden_dim: hidden size of both RNNs
        output_dim: number of outputs of the linear classifier
        pretrained_vectors: tensor copied into both embedding tables;
                            defaults to the global `vocab.vectors`
        '''
        super().__init__()
        if pretrained_vectors is None:
            pretrained_vectors = vocab.vectors

        #RNN for caption processing
        self.embeddings_language = nn.Embedding(vocab_dim, embedding_language_dim)
        self.embeddings_language.load_state_dict({'weight': pretrained_vectors})
        self.rnn_language = nn.RNN(embedding_language_dim, hidden_dim, batch_first=True)

        #RNN for images processing: each step sees features and class embedding side by side
        embedding_img_dim = embedding_classes_dim + embedding_features_dim
        self.embeddings_visual = nn.Embedding(vocab_dim, embedding_classes_dim)
        self.embeddings_visual.load_state_dict({'weight': pretrained_vectors})
        self.rnn_visual = nn.RNN(embedding_img_dim, hidden_dim, batch_first=True)

        #turn off some neurons to avoid overfitting
        self.dropout = nn.Dropout(p=0.2)
        #linear classifier
        self.fc = nn.Linear(hidden_dim + hidden_dim, output_dim)

    #forward gets in input the dataloader output: captions indexes, features, classes indexes
    def forward(self, caption_idx, embedding_features, classes_idx):
        '''
        Forwarding method: summarize the current caption and image, then classify the pair.

        caption_idx: indexes of the caption words, batch first
        embedding_features: image feature vectors, batch first
        classes_idx: indexes of the image class names, batch first
        returns: classifier output of shape (1, batch, output_dim); the leading
                 dimension is the one of the last hidden state returned by nn.RNN
        '''
        #load words embeddings for captions
        embedding_text = self.embeddings_language(caption_idx)
        #keep only the last hidden state, which summarizes the caption
        _, hidden_language = self.rnn_language(embedding_text)

        #load words embeddings for image classes
        embedding_classes = self.embeddings_visual(classes_idx)
        #concatenate image features and classes
        embedding_image = torch.cat((embedding_features, embedding_classes), dim=-1)
        #keep only the last hidden state, which summarizes the image
        _, hidden_visual = self.rnn_visual(embedding_image)

        #tensor that summarizes both image and caption
        hidden = torch.cat((hidden_language, hidden_visual), dim=-1)

        #dropout regularizes the summary fed to the classifier; applied after the
        #classifier it would randomly zero and rescale the predictions themselves
        hidden = self.dropout(hidden)
        out = self.fc(hidden)

        return out