In [27]:
import os
import numpy as np
from tqdm.notebook import tqdm
import re
import pandas as pd
import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords
from gensim.models import Word2Vec as w2v

In [28]:
BASE_DIR = r'C:\Users\Silen\Documents\Flickr8k'

In [29]:
#getting just the caption information from the file
with open(os.path.join(BASE_DIR, 'captions.txt'), 'r') as f:
    next(f)
    captions_doc = f.read()

In [30]:
image_dict = {}

for line in tqdm(captions_doc.split('\n')):
    #tokens are formatted as "{image_id},{caption}"
    tokens = line.split(',')
    #tokens are now formatted as "{image_id}", "{caption}"
    image_id, caption = tokens[0], tokens[1:]
    #image_id is formatted as {id}.jpg
    #remove .jpg from the {id}
    image_id = image_id.split('.')[0]
    #caption is currently a list
    #convert caption from list to string
    caption = ' '.join(caption)
    #creates an index in the dict for image if not already there
    if image_id not in image_dict:
        image_dict[image_id] = []
    #connects all the captions to their respective images in one dict
    image_dict[image_id].append(caption)

  0%|          | 0/40456 [00:00<?, ?it/s]

In [31]:
def cleanText(image_dict):
    for key, captions in image_dict.items():
        for i in range(len(captions)):
            #load one caption at a time
            caption=captions[i]
            #lower case the whole string
            caption=caption.lower()
            #remove extra space
            caption=re.sub('\s\s+', ' ', caption)
            #get rid of any special characters or numbers
            caption=re.sub('[^A-Za-z\s]', '', caption)
            #add begining and ending tags and get rid of one character words
            caption='<start>' + ' '.join([word for word in caption.split() if len(word) > 1]) + '<end>'
            #add caption back into list
            captions[i]=caption


In [32]:
#before preprocessing
image_dict['1015118661_980735411b']

['A boy smiles in front of a stony wall in a city .',
 'A little boy is standing on the street while a man in overalls is working on a stone wall .',
 'A young boy runs aross the street .',
 'A young child is walking on a stone paved street with a metal pole and a man behind him .',
 'Smiling boy in white shirt and blue jeans in front of rock wall with man in overalls behind him .']

In [33]:
cleanText(image_dict)
#after prepossing
image_dict['1015118661_980735411b']

['<start>boy smiles in front of stony wall in city<end>',
 '<start>little boy is standing on the street while man in overalls is working on stone wall<end>',
 '<start>young boy runs aross the street<end>',
 '<start>young child is walking on stone paved street with metal pole and man behind him<end>',
 '<start>smiling boy in white shirt and blue jeans in front of rock wall with man in overalls behind him<end>']

In [34]:
# nltk.download('stopwords')
sw = stopwords.words('english')

In [46]:
def tokenize(clean_image_dict):
    # gather all captions in one list
    all_captions = []
    for i in clean_image_dict:
        for caption in clean_image_dict[i]:
            all_captions.append(caption)
    all_captions = [word_tokenize(caption) for caption in all_captions] #list of all captions

    # remove stopwords
    filtered_captions = []
    for caption in all_captions:
        original = caption
        # removes stopword when detected in line
        caption = [w for w in caption if w not in sw]
        # if the whole line was deleted the original line is used instead
        if len(caption) < 1:
            caption = original
        filtered_captions.append(caption)
    
    # creates word2vec keyedvector to store the vocabulary
    # uses skip-gram as the training algorithm
    # with a window of 3
    w = w2v(
        filtered_captions,
        vector_size=256,
        min_count=3,
        sg=1
    )
    return w


tokenized_captions = tokenize(image_dict)
vocab = list(tokenized_captions.wv.index_to_key)

In [42]:
len(tokenized_captions.wv)

3978

In [43]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision
import torchvision.transforms as transforms
from torch.nn.utils.rnn import pack_padded_sequence

In [39]:
# loads the word vectors into a float tensor to pretrain a word embedding layer
weight = torch.FloatTensor(tokenized_captions.wv.vectors)
embedding = nn.Embedding.from_pretrained(weight)

In [None]:
#device configuration
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
#hyper parameters
num_epochs = 4
batch_size = 4
learning_rate = 0.001

In [None]:
transform = transforms.Compose(
    [transforms.ToTensor(),
     transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))]
)

In [None]:
class EncoderCNN(nn.Module):
    def __init__(self, embed_size):
        super(EncoderCNN, self).__init__()
        self.conv1 = nn.Conv2d(3, 6, 5)
        self.pool = nn.MaxPool2d(2, 2)
        self.conv2 = nn.Conv2d(6, 16, 5)
        self.fc1 = nn.Linear(16*5*5, 128)
        self.bn = nn.BatchNorm1d(embed_size, momentum=0.01)
    def forward(self, x):
        with torch.no_grad:
            x = self.pool(F.relu(self.conv1(x)))
            x = self.pool(F.relu(self.conv2(x)))
        x = x.view(-1, 16*5*5)
        x = self.bn(self.fc1(x))
        return x

model = EncoderCNN(256).to(device=device)

In [50]:
class DecoderRNN(nn.Module):
    def __init__(self, embed_size, hidden_size, vocab_size, num_layers, weight, max_seq_length=80):
        super(DecoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.embed = nn.Embedding.from_pretrained(weight)
        self.lstm = nn.LSTM(embed_size, hidden_size, num_layers)
        self.linear = nn.Linear(hidden_size, vocab_size)
        self.max_seq_length = max_seq_length
        self.softmax = nn.LogSoftmax(dim=1)
    def init_hidden(self,):
        return(torch.zeros(1, self.hidden_size))
    def forward(self, features, captions, lengths):
        embeddings = self.embed(captions)
        embeddings = torch.cat((features.unsqueeze(1), embeddings), 1)
        #packed = pack_padded_sequence(embeddings, lengths.cpu())
        outputs, _ = self.lstm(embeddings)
        outputs = self.linear(outputs[0])
    def predict(self, features, states=None):
        sampled_ids = []
        inputs = features.unsqueeze(1)
        for i in range(self.max_seq_length):
            hidden, states = self.lstm(inputs, states)
            outputs = self.linear(hidden.squeeze(1))
            _, predicted = outputs.max(1)
            sampled_ids.append(predicted)
            inputs = self.embed(predicted)
            inputs = inputs.unsqueeze(1)
        sampled_ids = torch.stack(sampled_ids, 1)
        sentences = []
        for sampled_id in sampled_ids:
            sampled_id = sampled_id.cpu().numpy()
            sampled_caption = []
            for word_id in sampled_id:
                word = vocab[word_id]
                sampled_caption.append(word)
                if word == '<end>':
                    break
                print(sampled_caption)
            sentence = ' '.join(sampled_caption)
            sentences.append(sentence)
            return sentences

In [None]:
def train_batch(data, encoder, decoder, optimizer, criterion):
    encoder.train()
    decoder.train()
    # TODO: Need dataloader to load batches at a time
    images, captions