In [1]:
import os
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import models, transforms

import nltk
import PIL.Image as Image

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [2]:
class Embeddings():
    def __init__(self, vocab, emd_dims):
        self.vocab = vocab
        self.embeds = nn.Embedding(len(vocab), emd_dims)
    
    def get_embedding(self, word):
        if(word not in vocab.keys()):
            word = '<unk>'
        lookup_tensor = torch.tensor(self.vocab[word], dtype = torch.long)
        embeds = self.embeds(lookup_tensor)
        embeds = embeds.unsqueeze(0)
        embeds = embeds.unsqueeze(0)
#         print("Embedding shape ", embeds.shape)
        return embeds.to(device)
    
    def vocab_size(self):
        return len(self.vocab.keys())

In [3]:
test_df = pd.read_csv('./data/coco/coco_test_all.csv')

In [4]:
data_folder = ['coco','bing','flickr']
dataframes = []
for f in data_folder:
    files = os.listdir('./data/'+f)   
    for path in files:
        if('.csv' in path):
            csv_path = './data/'+f+'/'+path
            print(csv_path)
            df = pd.read_csv(csv_path)
            dataframes.append(df)
len(dataframes)

./data/coco/coco_test_all.csv
./data/coco/coco_train_all.csv
./data/coco/coco_val_all.csv
./data/bing/bing_train_all.csv
./data/bing/bing_test_all.csv
./data/bing/bing_val_all.csv
./data/flickr/flickr_val_all.csv
./data/flickr/flickr_train_all.csv
./data/flickr/flickr_test_all.csv


9

In [5]:
df = pd.concat(dataframes, axis = 0)
len(df)

14815

In [6]:
questions = list(df['questions'])
freq = {}
for q in questions:
    for question in q.split('---'):
        wordlist = nltk.word_tokenize(question)
        for word in wordlist:
            if(word not in freq):
                freq[word] = 1
            else:
                freq[word] += 1
len(freq)

12098

In [7]:
vocab = {}
counter = 0
for key in freq.keys():
    if freq[key]>=3:
        vocab[key] = counter
        counter += 1


vocab['<eoq>'] = counter
vocab['<start>'] = counter+1
vocab['<pad>'] = counter+2
vocab['<unk>'] = counter+3
len(vocab)

5065

In [8]:
class Vqgnet(nn.Module):
    def __init__(self, n_lstm_layers, embedding, max_len):
        super(Vqgnet, self).__init__()
        self.embedding = embedding
        self.model_vgg = models.vgg19(pretrained=True)
        for p in self.model_vgg.parameters():
            p.requires_grad = False
        classifier = nn.Sequential(*list(self.model_vgg.classifier.children())[:-1])
        self.features = self.model_vgg
        self.features.classifier = classifier
        self.transform_layer = nn.Linear(4096, 512)
        self.feature_to_word = nn.Linear(512, self.embedding.vocab_size())
        self.softmax = nn.Softmax(dim=1)
        self.n_lstm_layers = n_lstm_layers
        self.lstm = nn.LSTM(512, 512, 8)
        self.max_len = max_len
    
    def forward(self, image, question):
        # teacher forcing using gt question
        
        # getting image features
        x = self.features(image)
        x = F.relu(x)
        x = F.relu(self.transform_layer(x))
        x = x.unsqueeze(1)
        
        
        cell_state = torch.randn(8, 1, 512).to(device)
        hidden_state = torch.randn(8, 1, 512).to(device)
        cell_state = (hidden_state, cell_state)
        predicted_question = None
        # embedding phase
        for i in range(self.max_len):
            if(i == 0):
                embed = x
            else:
                embed = question[:,i,:]
            output, cell_state = self.lstm(embed, cell_state)
            output = self.feature_to_word(output)
            output = self.softmax(output)
            output = output.squeeze(0)
            if(i == 0):
                predicted_question = output
            else:
                predicted_question = torch.cat((predicted_question, output))
        
        return predicted_question.squeeze(1)
    
    def test(self, image):
        # get image features
        x = self.features(image)
        x = F.relu(x)
        x = F.relu(self.transform_layer(x))
        x = x.unsqueeze(1)
        
        cell_state = torch.randn(8, 1, 512).to(device)
        hidden_state = torch.randn(8, 1, 512).to(device)
        cell_state = (hidden_state, cell_state)
        output = x
        predicted_question = None
        # generate question
        pred = 0
        for i in range(self.max_len):
            output, cell_state = self.lstm(output, cell_state)
            pred = self.softmax(self.feature_to_word(output))
            pred = pred.squeeze(0)
            if(i == 0):
                predicted_question = pred
            else:
                predicted_question = torch.cat((predicted_question, pred))
            pred = torch.argmax(pred)
        
        return predicted_question.squeeze(1)
        

In [9]:
def train_model(model, dataloader, criterion, optimizer, scheduler, device, embedding, vocab, num_epochs=25):
    train_loss = []
    val_loss = []
    
    # looping over number of epochs
    for epoch in range(num_epochs):
        print('Epoch {}/{}'.format(epoch, num_epochs - 1))
        print('-' * 10)
        
        # looping over train validation phase
        for phase in ['train', 'val']:
            if phase == 'train':
                model.train()
            else:
                model.eval()
                
            running_loss = 0.0
        
            # looping over phase data 
            for image, question in dataloader[phase]:

                optimizer.zero_grad()
                
                with torch.set_grad_enabled(phase == 'train'):
                    if(phase == 'train'):
#                         print(question)
                        output = model(image, question)
                    else:
                        output = model.test(image)
                        if len(output) < len(question):
                            for i in range(len(question) - len(output)):
                                output.append(torch.zeros(output[0].shape))

                    # getting one_hot encoding
                    labels = torch.zeros([len(output)]).long().to(device)
                    for i in range(len(question)):
                        if(question[i][0] not in vocab.keys()):
                            labels[i] = vocab['<unk>']
                        else:
                            labels[i] = vocab[question[i][0]]
                        
                    # finding the loss
                    loss = criterion(output, labels)
                    # back propogating the weights
                    if phase == 'train':
                        loss.backward()
                        optimizer.step()
                        
                # adding to loss
                running_loss += loss.item()
            
            if phase == 'train':
                scheduler.step()
            
            # finding and printing epoch loss
            epoch_loss = running_loss / len(dataloader[phase])
            print('{} Loss: {:.4f} '.format(
                phase, epoch_loss))
            
            # appending loss to list 
            if(phase == 'train'):
                train_loss.append(epoch_loss)
            else:
                val_loss.append(epoch_loss)
                    
    return model, train_loss, val_loss

In [10]:
transform = transforms.Compose([
    transforms.Resize(224),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize([0.485, 0.456, 0.406], [0.5, 0.5, 0.5])
])

In [11]:
# Dataloader
def read_data(csv_file, image_folder, embeddings):
    dataloader = []
    df = pd.read_csv(csv_file)
    files = os.listdir(image_folder)
    for index, row in df.iterrows():
        file_name = str(row['image_id']) + '.jpg'
        question = row['questions'].split('---')[0]
        img = Image.open(image_folder+ '/' + file_name).convert('RGB')
        img_tensor = transform(img)
        question_words = nltk.word_tokenize(question)
        question_words.append('<eoq>')
        question_words = ['<start>'] + question_words
        for i in range(len(question_words),26):
            question_words.append('<pad>')
        question_embeddings = torch.empty(0,1, 512).to(device)
        for word in question_words:
            question_embeddings = torch.cat((question_embeddings, embeddings.get_embedding(word)))
        dataloader.append([img_tensor.to(device), question_embeddings])
    return dataloader

In [12]:
embeddings = Embeddings(vocab, 512)
train_set = read_data('./data/coco/coco_train_all.csv', './data/coco/train_images', embeddings)

In [13]:
val_set = read_data('./data/coco/coco_val_all.csv', './data/coco/val_images', embeddings)

In [14]:
TrainImageLoader = torch.utils.data.DataLoader(train_set, batch_size=8, shuffle=True)
ValImageLoader = torch.utils.data.DataLoader(val_set, batch_size=8, shuffle=True)
dataloaders = {'train':TrainImageLoader, 'val':ValImageLoader}

In [15]:
net = Vqgnet(1, embeddings, 26)
net = net.to(device)

criterion = nn.CrossEntropyLoss()

optimizer = optim.Adam(net.parameters(), lr=0.01)

exp_lr_scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=7, gamma=0.10)

In [16]:
net, train_loss, val_loss = train_model(net, dataloaders, criterion, optimizer, exp_lr_scheduler,
                                        device, embeddings, vocab, num_epochs=10)

Epoch 0/9
----------
train Loss: 8.5301 
val Loss: 8.5301 
Epoch 1/9
----------


RuntimeError: Trying to backward through the graph a second time, but the buffers have already been freed. Specify retain_graph=True when calling backward the first time.