In [1]:
import matplotlib.pyplot as plt
from matplotlib import cm
import numpy as np
import pandas as pd
import string
import re
import spacy
from collections import Counter
import pickle
from PIL import Image
import torch
from torchvision import transforms
import torchtext
from torch.utils.data import Dataset, DataLoader

from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
# Setting device on GPU if available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Using device:', device)
print()

if device.type == 'cuda':
    print(torch.cuda.get_device_name(0))
    print('Memory Usage:')
    print('Allocated:', round(torch.cuda.memory_allocated(0)/1024**3,1), 'GB')
    print('Cached:   ', round(torch.cuda.memory_cached(0)/1024**3,1), 'GB')

torch.backends.cudnn.deterministic = True

Using device: cpu



#### Loading Images and Captions

In [None]:
with open('/content/drive/My Drive/Courses/DeepLearning/HW03/Q03/Datas/Flickr8k.token.txt') as f:
    lines = f.readlines()
caption_dict = {}
image_dict = {}
for i, line in enumerate(lines):
  try:
    image_path = line.split('#')[0]
    caption = line.split('#')[1]
    image = plt.imread("/content/drive/My Drive/Courses/DeepLearning/HW03/Q03/Datas/Flicker8k_Dataset/"+image_path)
    image_dict[image_path] = image
    caption_dict[image_path] =  caption
  except:
    pass

In [None]:
with open('/content/drive/My Drive/Courses/DeepLearning/HW03/Q03/Datas/caption_dict.pkl', 'wb') as f:
    pickle.dump(caption_dict, f)  # save it

with open('/content/drive/My Drive/Courses/DeepLearning/HW03/Q03/Datas/image_dict.pkl', 'wb') as f:
    pickle.dump(image_dict, f)  # save it

In [None]:
with open('/content/drive/My Drive/Courses/DeepLearning/HW03/Q03/Datas/caption_dict.pkl', 'rb') as f:
    caption_dict = pickle.load(f) # load it

with open('/content/drive/My Drive/Courses/DeepLearning/HW03/Q03/Datas/image_dict.pkl', 'rb') as f:
    image_dict = pickle.load(f) # load it

In [None]:
# Train Datas
with open('/content/drive/My Drive/Courses/DeepLearning/HW03/Q03/Datas/Flickr_8k.trainImages.txt') as f:
  lines = f.readlines()
datas = pd.DataFrame(columns=["caption","image"])

for line in lines:
  line = line.replace('\n', '')
  image = image_dict[line]
  caption = caption_dict[line]
  datas = datas.append({"caption":caption, "image":image}, ignore_index=True)

# Test Datas
with open('/content/drive/My Drive/Courses/DeepLearning/HW03/Q03/Datas/Flickr_8k.testImages.txt') as f:
  lines = f.readlines()

for line in lines:
  line = line.replace('\n', '')
  image = image_dict[line]
  caption = caption_dict[line]
  datas = datas.append({"caption":caption, "image":image}, ignore_index=True)

del caption_dict
del image_dict
datas.head(4)

Unnamed: 0,caption,image
0,4\tTwo dogs running through a low lying body o...,"[[[38, 31, 25], [64, 50, 49], [78, 73, 67], [2..."
1,4\tThe little boy is playing with a croquet ha...,"[[[254, 254, 254], [254, 254, 254], [254, 254,..."
2,4\tA dog with something pink in its mouth is l...,"[[[146, 143, 154], [149, 143, 155], [153, 142,..."
3,4\tThe large brown dog is running on the beach...,"[[[121, 158, 202], [117, 154, 198], [118, 155,..."


#### Preprocessing the datas

In [None]:
tok = spacy.load('en')
def tokenize (text):
    text = re.sub(r"[^\x00-\x7F]+", " ", text)
    regex = re.compile('[' + re.escape(string.punctuation) + '0-9\\r\\t\\n]')
    nopunct = regex.sub(" ", text.lower())
    return [token.text for token in tok.tokenizer(nopunct)]

In [None]:
# remove punctuation and numbers and unfrequent words
datas_preprocessed = datas.copy()
counts = Counter()
for index, row in datas_preprocessed.iterrows():
    counts.update(tokenize(row['caption']))

print("num_words before:",len(counts.keys()))
for word in list(counts):
    if counts[word] < 10:
      datas_preprocessed['caption'] = datas_preprocessed['caption'].str.replace("\b{0}\b".format(word), '')
      del counts[word]

punctuation = ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '\t',"\n",'#','@','.',',','?','!',')', '(']
for char in punctuation:
    datas_preprocessed['caption'] = datas_preprocessed['caption'].str.replace(char, '')

print("num_words after:",len(counts.keys()))
del datas
del counts
# adding <START> and <END> to the captions
datas_preprocessed['caption'] = "<START>" + datas_preprocessed['caption'].astype(str) + "<END>"
datas_preprocessed.head(4)

num_words before: 4079
num_words after: 664


Unnamed: 0,caption,image
0,<START>Two dogs running through a low lying bo...,"[[[38, 31, 25], [64, 50, 49], [78, 73, 67], [2..."
1,<START>The little boy is playing with a croque...,"[[[254, 254, 254], [254, 254, 254], [254, 254,..."
2,<START>A dog with something pink in its mouth ...,"[[[146, 143, 154], [149, 143, 155], [153, 142,..."
3,<START>The large brown dog is running on the b...,"[[[121, 158, 202], [117, 154, 198], [118, 155,..."


In [None]:
with open('/content/drive/My Drive/Courses/DeepLearning/HW03/Q03/Datas/datas_preprocessed.pkl', 'wb') as f:
    pickle.dump(datas_preprocessed, f)  # save it

In [None]:
with open('/content/drive/My Drive/Courses/DeepLearning/HW03/Q03/Datas/datas_preprocessed.pkl', 'rb') as f:
    datas_preprocessed = pickle.load(f) # load it

#### Vectorizing the Images using pretrained Squeezenet (on ImageNet)

In [None]:
squeezeNet = torch.hub.load('pytorch/vision:v0.10.0', 'squeezenet1_0', pretrained=True)
squeezeNet.to(device)

Using cache found in /root/.cache/torch/hub/pytorch_vision_v0.10.0


SqueezeNet(
  (features): Sequential(
    (0): Conv2d(3, 96, kernel_size=(7, 7), stride=(2, 2))
    (1): ReLU(inplace=True)
    (2): MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=True)
    (3): Fire(
      (squeeze): Conv2d(96, 16, kernel_size=(1, 1), stride=(1, 1))
      (squeeze_activation): ReLU(inplace=True)
      (expand1x1): Conv2d(16, 64, kernel_size=(1, 1), stride=(1, 1))
      (expand1x1_activation): ReLU(inplace=True)
      (expand3x3): Conv2d(16, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (expand3x3_activation): ReLU(inplace=True)
    )
    (4): Fire(
      (squeeze): Conv2d(128, 16, kernel_size=(1, 1), stride=(1, 1))
      (squeeze_activation): ReLU(inplace=True)
      (expand1x1): Conv2d(16, 64, kernel_size=(1, 1), stride=(1, 1))
      (expand1x1_activation): ReLU(inplace=True)
      (expand3x3): Conv2d(16, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (expand3x3_activation): ReLU(inplace=True)
    )
    (5): Fire(
   

In [None]:
preprocess = transforms.Compose([
    transforms.Resize([224,224]),
])

def squeezeOut(image):
  input_image = image/255
  input_image = torch.from_numpy(input_image).permute(2, 0, 1).float() 

  input_tensor = preprocess(input_image)
  input_batch = input_tensor.unsqueeze(0)

  input_batch = input_batch.to(device)
  with torch.no_grad():
    output = squeezeNet(input_batch)
  probabilities = torch.nn.functional.softmax(output[0], dim=0)
  return probabilities

def encode(datas):
  datas['vectorized'] = datas.apply(lambda row: squeezeOut(row['image']), axis=1)
  return datas


datas_preprocessed = encode(datas_preprocessed)

In [None]:
datas_preprocessed.head(4)

Unnamed: 0,caption,image,vectorized
0,<START>Two dogs running through a low lying bo...,"[[[38, 31, 25], [64, 50, 49], [78, 73, 67], [2...","[tensor(1.6228e-06), tensor(1.6867e-06), tenso..."
1,<START>The little boy is playing with a croque...,"[[[254, 254, 254], [254, 254, 254], [254, 254,...","[tensor(0.0008), tensor(0.0010), tensor(0.0002..."
2,<START>A dog with something pink in its mouth ...,"[[[146, 143, 154], [149, 143, 155], [153, 142,...","[tensor(8.1646e-06), tensor(3.3554e-05), tenso..."
3,<START>The large brown dog is running on the b...,"[[[121, 158, 202], [117, 154, 198], [118, 155,...","[tensor(1.2193e-08), tensor(1.2519e-08), tenso..."


In [None]:
with open('/content/drive/My Drive/Courses/DeepLearning/HW03/Q03/Datas/datas_preprocessed.pkl', 'wb') as f:
    pickle.dump(datas_preprocessed, f)  # save it

In [3]:
with open('/content/drive/My Drive/Courses/DeepLearning/HW03/Q03/Datas/datas_preprocessed.pkl', 'rb') as f:
    datas_preprocessed = pickle.load(f) # load it

#### Building Vocabulary

In [4]:
tok = spacy.load('en')
def tokenize (text):
    text = re.sub(r"[^\x00-\x7F]+", " ", text)
    regex = re.compile('[' + re.escape(string.punctuation) + '0-9\\r\\t\\n]') # remove punctuation and numbers
    nopunct = regex.sub(" ", text.lower())
    return [token.text for token in tok.tokenizer(nopunct)]

#count number of occurences of each word
counts = Counter()
for index, row in datas_preprocessed.iterrows():
    counts.update(tokenize(row['caption']))

#creating vocabulary
vocab2index = {"":0, "UNK":1}
insex2vocab = {0:"", 1:"UNK"}

words = ["", "UNK"]
for word in counts:
    vocab2index[word] = len(words)
    insex2vocab[len(words)] = word
    words.append(word)

def encode_sentence(text, vocab2index, N=70):
    tokenized = tokenize(text)
    encoded = np.zeros(N, dtype=int)
    enc1 = np.array([vocab2index.get(word, vocab2index["UNK"]) for word in tokenized])
    length = min(N, len(enc1))
    encoded[:length] = enc1[:length]
    return encoded, length

datas_preprocessed['encoded'] = datas_preprocessed['caption'].apply(lambda x: np.array(encode_sentence(x,vocab2index )))



In [5]:
datas_preprocessed.head(4)

Unnamed: 0,caption,image,vectorized,encoded
0,<START>Two dogs running through a low lying bo...,"[[[38, 31, 25], [64, 50, 49], [78, 73, 67], [2...","[tensor(1.6228e-06), tensor(1.6867e-06), tenso...","[[2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 2, 1..."
1,<START>The little boy is playing with a croque...,"[[[254, 254, 254], [254, 254, 254], [254, 254,...","[tensor(0.0008), tensor(0.0010), tensor(0.0002...","[[2, 3, 15, 16, 17, 18, 19, 20, 8, 21, 22, 23,..."
2,<START>A dog with something pink in its mouth ...,"[[[146, 143, 154], [149, 143, 155], [153, 142,...","[tensor(8.1646e-06), tensor(3.3554e-05), tenso...","[[2, 3, 8, 27, 20, 28, 29, 30, 31, 32, 18, 33,..."
3,<START>The large brown dog is running on the b...,"[[[121, 158, 202], [117, 154, 198], [118, 155,...","[tensor(1.2193e-08), tensor(1.2519e-08), tenso...","[[2, 3, 15, 35, 36, 27, 18, 6, 37, 15, 38, 39,..."


#### Making Torch Dataloader

In [6]:
class Vector_Caption_Dataset(Dataset):

    def __init__(self, dataframe):
        self.dataframe = dataframe # load it

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):

        vectorized = self.dataframe['vectorized'][idx]
        encoded = self.dataframe['encoded'][idx][0]
        sample = {'vectorized': vectorized, 'encoded':encoded}
        return sample

vector_dataset = Vector_Caption_Dataset(datas_preprocessed)

BATCH_SIZE = 64
dataloader = DataLoader(vector_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=0)

#### implementing RNN

In [7]:
import torch 
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

In [27]:
class DecoderRNN(nn.Module):
    def __init__(self, embed_size, hidden_size, vocab_size):
        super().__init__()
    
        self.hidden_size = hidden_size
        self.vocab_size = vocab_size
        self.word_embeddings = nn.Embedding(vocab_size, embed_size)

        self.lstm = nn.LSTM(input_size=embed_size, \
                            hidden_size=hidden_size, # LSTM hidden units 
                            num_layers=1, # number of LSTM layer
                            bias=True, # use bias weights b_ih and b_hh
                            batch_first=True,  # input & output will have batch size as 1st dimension
                            dropout=0, # Not applying dropout 
                            bidirectional=False, # unidirectional LSTM
                           )
        

        self.linear = nn.Linear(hidden_size, vocab_size)                     


        
    def init_hidden(self, batch_size):

        return (torch.zeros((1, batch_size, self.hidden_size), device=device), \
                torch.zeros((1, batch_size, self.hidden_size), device=device))

    def forward(self, features, captions):
        
        captions = captions[:, :-1]     
        
        self.batch_size = features.shape[0]
        self.hidden = self.init_hidden(self.batch_size) 
                
        embeddings = self.word_embeddings(captions)
        
        embeddings = torch.cat((features.unsqueeze(1), embeddings), dim=1) # embeddings new shape : (batch_size, caption length, embed_size)
        

        lstm_out, self.hidden = self.lstm(embeddings, self.hidden) # lstm_out shape : (batch_size, caption length, hidden_size)

        outputs = self.linear(lstm_out) # outputs shape : (batch_size, caption length, vocab_size)

        return outputs

    def sample(self, inputs):
        " accepts pre-processed image tensor (inputs) and returns predicted sentence (list of tensor ids of length max_len) "
        
        
        output = []
        batch_size = inputs.shape[0] # batch_size is 1 at inference, inputs shape : (1, 1, embed_size)
        hidden = self.init_hidden(batch_size) # Get initial hidden state of the LSTM
    
        while True:
            lstm_out, hidden = self.lstm(inputs, hidden) # lstm_out shape : (1, 1, hidden_size)
            outputs = self.linear(lstm_out)  # outputs shape : (1, 1, vocab_size)
            outputs = outputs.squeeze(1) # outputs shape : (1, vocab_size)
            _, max_indice = torch.max(outputs, dim=1) # predict the most likely next word, max_indice shape : (1)
            print(max_indice)
            print(max_indice.cpu().numpy()[0].item())
            output.append(max_indice.cpu().numpy()[0].item()) # storing the word predicted

            if len(output) == 10:
              break
            
            inputs = self.word_embeddings(max_indice) # inputs shape : (1, embed_size)
            inputs = inputs.unsqueeze(1) # inputs shape : (1, 1, embed_size)
            
        return output




    ## Beam search implementation (Attempt)
    def beam_search_sample(self, inputs, beam=3):
        output = []
        self.batch_size = inputs.shape[0] # batch_size is 1 at inference, inputs shape : (1, 1, embed_size)
        hidden = self.init_hidden(self.batch_size) # Get initial hidden state of the LSTM
        
        sequences = [[[torch.Tensor([0])], 1.0, hidden]]
        max_len = 20

        ## Step 1
        # Predict the first word <start>
        outputs, hidden = DecoderRNN.get_outputs(self, inputs, hidden)
        _, max_indice = torch.max(outputs, dim=1) # predict the most likely next word, max_indice shape : (1)
        output.append(max_indice.cpu().numpy()[0].item()) # storing the word predicted 
        # inputs = DecoderRNN.get_next_word_input(self, max_indice)
        
        
        l = 0
        while len(sequences[0][0]) < max_len: 
            print("l:", l)
            l+= 1
            temp = []
            for seq in sequences:
#                 print("seq[0]: ", seq[0])
                inputs = seq[0][-1] # last word index in seq
                inputs = inputs.type(torch.cuda.LongTensor)
                print("inputs : ", inputs)
                # Embed the input word
                inputs = self.word_embeddings(inputs) # inputs shape : (1, embed_size)
                inputs = inputs.unsqueeze(1) # inputs shape : (1, 1, embed_size) 
                
                # retrieve the hidden state
                hidden = seq[2]
                
                preds, hidden = DecoderRNN.get_outputs(self, inputs, hidden)

                # Getting the top <beam_index>(n) predictions
                softmax_score = F.log_softmax(outputs, dim=1) # Define a function to sort the cumulative score
                sorted_score, indices = torch.sort(-softmax_score, dim=1)
                word_preds = indices[0][:beam]
                best_scores = sorted_score[0][:beam]

                # Creating a new list so as to put them via the model again
                for i, w in enumerate(word_preds):
#                     print("seq[0]: ", seq[0][0][:].cpu().numpy().item())
                    next_cap, prob = seq[0][0].cpu().numpy().tolist(), seq[1]
                    
                    next_cap.append(w)
                    print("next_cap : ", next_cap)
                    prob *best_scores[i].cpu().item()
                    temp.append([next_cap, prob])

            sequences = temp
            # Order according to proba
            ordered = sorted(sequences, key=lambda tup: tup[1])

            # Getting the top words
            sequences = ordered[:beam]
            print("sequences: ", sequences)

    def get_outputs(self, inputs, hidden):
        lstm_out, hidden = self.lstm(inputs, hidden) # lstm_out shape : (1, 1, hidden_size)
        outputs = self.linear(lstm_out)  # outputs shape : (1, 1, vocab_size)
        outputs = outputs.squeeze(1) # outputs shape : (1, vocab_size)

        return outputs, hidden

    def get_next_word_input(self, max_indice):
        ## Prepare to embed the last predicted word to be the new input of the lstm
        inputs = self.word_embeddings(max_indice) # inputs shape : (1, embed_size)
        inputs = inputs.unsqueeze(1) # inputs shape : (1, 1, embed_size)

        return inputs

In [28]:
embed_size = 1000           # dimensionality of image and word embeddings
hidden_size = 512          # number of features in hidden state of the RNN decoder
vocab_size = len(vocab2index)

decoder = DecoderRNN(embed_size, hidden_size, vocab_size)
decoder.to(device)

criterion = nn.CrossEntropyLoss()
criterion.to(device)

optimizer = torch.optim.Adam(decoder.parameters(), lr=0.001, betas=(0.9, 0.999), eps=1e-08)

#### Training

In [32]:
num_epochs = 5
print_every = 1
for epoch in range(1, num_epochs+1):
    
    for i_step, data in enumerate(dataloader):
        # Pass the inputs through the CNN-RNN model.
        features = data['vectorized'].to(device)
        captions = data['encoded'].to(device)
        outputs = decoder(features, captions)
        
        loss = criterion(outputs.contiguous().view(-1, vocab_size), captions.view(-1))
        loss.backward()
        optimizer.step()
            
        stats = 'Epoch [%d/%d], Step [%d], Loss: %.4f, Perplexity: %5.4f' % (epoch, num_epochs, i_step, loss.item(), np.exp(loss.item()))
        
        
        
        # Print training statistics (on different line).
        if i_step % print_every == 0:
            print('\r' + stats)

Epoch [1/5], Step [0], Loss: 4.0732, Perplexity: 58.7474
Epoch [1/5], Step [1], Loss: 3.3342, Perplexity: 28.0549
Epoch [1/5], Step [2], Loss: 2.6200, Perplexity: 13.7361
Epoch [1/5], Step [3], Loss: 2.3717, Perplexity: 10.7158
Epoch [1/5], Step [4], Loss: 2.0421, Perplexity: 7.7068
Epoch [1/5], Step [5], Loss: 1.8355, Perplexity: 6.2681


KeyboardInterrupt: ignored

In [24]:
def clean_sentence(output):
    list_string = []
    
    for idx in output:
        list_string.append(insex2vocab[idx])
    
    list_string = list_string[1:-1] # Discard <start> and <end> words
    sentence = ' '.join(list_string) # Convert list of string to full string
    sentence = sentence.capitalize()  # Capitalize the first letter of the first word
    return sentence

In [25]:
def get_prediction():
    data = next(iter(dataloader))
    features = data['vectorized'].to(device).unsqueeze(1)
    captions = data['encoded'].to(device)
    output = decoder.sample(features)
    print(output)  
    sentence = clean_sentence(output)
    print(sentence)

In [33]:
get_prediction()

tensor([ 8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,
         8,  8,  8,  8,  8,  8,  8, 70,  8,  8,  8,  8,  8,  8, 70, 70,  8,  8,
         8,  8, 70, 70,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,
         8,  8,  8,  8,  8,  8,  8,  8,  8,  8])
8
tensor([   2,    2,    2,    2,    2,    2,    2,    2,    2,    2,    2,    2,
           2,    2,    2,    2,    2,    2,    2,    2,    2,    2,    2,    2,
           2, 3846,    2,    2,    2,    2,    2,    2, 3846, 3846,    2,    2,
           2,    2, 3846, 3846,    2,    2,    2,    2,    2,    2,    2,    2,
           2,    2,    2,    2,    2,    2,    2,    2,    2,    2,    2,    2,
           2,    2,    2,    2])
2
tensor([ 14,  14,  14,  14,  14,  14,  14,  14,  14,  14,  14,  14,  14,  14,
         14,  14,  14,  14,  14,  14,  14,  14,  14,  14,  14, 714,  14,  14,
         14,  14,  14,  14, 714, 714,  14,  14,  14,  14, 714, 714,  14,  14,
         14,  14,  14,  14,  14,  14,  1