In [20]:
import torch.nn as nn
import torch
import numpy as np
from torch import optim
import torch.nn.functional as F
from tqdm import tqdm_notebook
from torchvision import models

device = torch.device("cuda")
device

device(type='cuda')

In [21]:
class EncoderCNN(nn.Module):
    def __init__(self, output_size, dropout=0.10, trainCNN = False):
        super(EncoderCNN, self).__init__()
        
        # Define the embedding size used in the output
        self.output_size = output_size
        # Resnet model
        # Remove the fc layer and the avg pool layer
        self.cnn = nn.Sequential(*list(models.resnet18(pretrained=True).children())[:-2])
        # Define the output linear transformation, relu, dropout CNN output -> out_size
        self.out = nn.Linear(output_size , output_size)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(dropout)
        # Replace the last linear transf of the CNN with the custom linear transf
        # Lock CNN training parameters if trainCNN = False
        self.lock() if not trainCNN else None
        
    def lock(self):
        for parameter in self.cnn.parameters():
            parameter.requires_grad = False
    
    def forward(self, input_images):
        
        out = self.cnn(input_images)
        # Transform the CNN output to [b, H, W, C] 
        out = out.permute(0, 3, 2, 1)
        out = out.reshape(out.shape[0], out.shape[1] * out.shape[2], out.shape[3])
        
        out = self.dropout(self.relu(self.out(out)))
        
        return out


In [37]:
class BahdanauAttentionDecoder(nn.Module):
    def __init__(self, embed_size, hidden_size, vocab_size, dropout=0.10):
        super(BahdanauAttentionDecoder, self).__init__()
        
        # Define the dimension size for the vocab, hidden and embedding
        self.embed_size = embed_size
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        
        # Define the linear tranf for each score
        self.Qattn = nn.Linear(self.embed_size, self.hidden_size, bias=False)
        self.Kattn = nn.Linear(self.embed_size, self.hidden_size, bias=False)
        self.Vattn = nn.Linear(self.hidden_size, 1, bias=False)
        
        self.out1 = nn.Linear(self.hidden_size, self.hidden_size)
        # dropout, batchnorm
        self.dropout = nn.Dropout(dropout)
        self.batch_norm = nn.BatchNorm1d(num_features=hidden_size)
        # Embedding layers
        self.embedding = nn.Embedding(self.vocab_size, self.embed_size)
        # Sequential lstm
        self.hidden_concat = nn.Linear(self.hidden_size * 2, self.hidden_size)
        self.gru = nn.GRU(self.hidden_size, self.hidden_size, batch_first=True)
        self.input_lin = nn.Linear(self.hidden_size * 2, self.hidden_size)
        # Out linear
        self.out2 = nn.Linear(self.hidden_size, self.vocab_size)
         
    def init_hidden(self, batch_size):
        return torch.zeros(batch_size, self.hidden_size).to(device)
    
    def forward(self, inpt_sequence, image_features, hidden):
    
        # Score shape = [B, sequence_length, 1]
        # Bahdau Fattn = Wv * tanh[ image_features(encoder_outputs kinda) * Wk + hidden * Wq ]
        hidden = hidden.unsqueeze(1)
        score = self.Vattn(torch.tanh(self.Kattn(image_features) + self.Qattn(hidden)))
        
        # Softmax this beautiful score
        attn = nn.functional.softmax(score, dim=1)
        
        # Create the context vector
        # Attention weights -> [b, max_seq_len, value]
        # Image features -> [b, max_seq_len, embed_size]
        # Context vector ->[b, value, embed_size]
        context_vector = attn * image_features

        context_vector = torch.sum(context_vector, 1)
        context_vector = context_vector.view(context_vector.shape[0], -1)
        
        # Embedd the input sequence
        embedding_sequence = self.embedding(inpt_sequence) # [B, 1, embed_size]
        
        x = torch.cat((embedding_sequence.view(embedding_sequence.shape[0], 1, -1), context_vector.view(context_vector.shape[0], 1, -1)), dim=2)
        x = self.dropout(self.input_lin(x))
        
        x, hidden = self.gru(x)
        x = x.view(x.shape[0], -1)
        
        x = self.dropout(self.out1(x))
        x = self.batch_norm(x)
        x = self.out2(x)
        
        # Softmax will be applied with the nn.Croessentropy loss
        # no need to double softmax this boi
        
        return x, hidden
        
        

In [38]:
decoder = BahdanauAttentionDecoder(512, 512 ,1000).to(device)
decoder(torch.zeros(32, 1).type(torch.LongTensor).cuda(), torch.zeros(32, 49, 512).to(device), torch.zeros(32, 512).to(device))

(tensor([[-0.1789, -0.5255,  0.0902,  ...,  0.0377, -0.0546,  0.1611],
         [-0.4326, -1.1425,  0.7373,  ..., -0.4647, -0.2748, -1.4796],
         [ 0.4928,  0.0301, -0.8610,  ..., -1.2885, -0.1576, -0.5311],
         ...,
         [-0.4320,  1.2746,  0.7269,  ..., -0.4191, -0.2051, -0.4628],
         [-0.8408,  0.7471,  0.5631,  ..., -0.0415,  0.7361,  0.3495],
         [-0.0396, -0.7126,  0.3778,  ..., -0.7192,  0.0017, -0.3393]],
        device='cuda:0', grad_fn=<AddmmBackward>),
 tensor([[[ 0.0270,  0.0559,  0.1382,  ..., -0.0162, -0.2553, -0.1315],
          [ 0.0917,  0.0926,  0.0941,  ..., -0.0593, -0.2707, -0.0671],
          [ 0.0237,  0.0540,  0.0219,  ..., -0.0232, -0.2892, -0.1058],
          ...,
          [ 0.0066,  0.0607,  0.0097,  ..., -0.0563, -0.2388, -0.1141],
          [ 0.0731,  0.0709,  0.0740,  ..., -0.0262, -0.2163, -0.0017],
          [ 0.0590,  0.0355,  0.0999,  ..., -0.0205, -0.1912, -0.0389]]],
        device='cuda:0', grad_fn=<CudnnRnnBackward>))

In [39]:
class EncoderToDecoder(nn.Module):
    def __init__(self, embed_size, hidden_size, vocab_size, dropout=0.10):
        super(EncoderToDecoder, self).__init__()
        
        # Define the dimension size for the vocab, hidden and embedding
        self.embed_size = embed_size
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        
        # Define the encoder and the decoder network
        self.encoder = EncoderCNN(embedding_size)
        self.decoder = BahdanauAttentionDecoder(embedding_size, hidden_size , vocab_size)
        
    def init_hidden(self, batch):
        return self.decoder.init_hidden(batch)
        
    def forward(self, inp, images, hidden):
        
        image_features = self.encoder(images)
        
        x, hidden = self.decoder(inp, image_features, hidden)
        
        return x, hidden

In [80]:
from torch.utils.data import DataLoader, Dataset
from PIL import Image  # Load img
import torchvision.transforms as transforms
from torch.nn.utils.rnn import pad_sequence  # pad batch
import spacy
import pandas as pd
import os

spacy_eng = spacy.load("en")

In [81]:
import unicodedata
import string

class Vocabulary:
    def __init__(self, dataset, freq_threshold):
        self.itos = {0: "<PAD>", 1: "<SOS>", 2: "<EOS>", 3: "<UNK>"}
        self.stoi = {"<PAD>": 0, "<SOS>": 1, "<EOS>": 2, "<UNK>": 3}
        self.freq_threshold = freq_threshold
        self.dataset = dataset
        seof.build_vocabulary()
        
        
    def __len__(self):
        return len(self.itos)

    @staticmethod
    def tokenizer_eng(text):
        tokens = [tok.text.lower() for tok in spacy_eng.tokenizer(text)]  
        
        return tokens

    def build_vocabulary(self, sentence_list):
        frequencies = {}
        idx = 4

        for sentence in sentence_list:
            for word in self.tokenizer_eng(sentence):
                if word not in frequencies:
                    frequencies[word] = 1

                else:
                    frequencies[word] += 1

                if frequencies[word] == self.freq_threshold:
                    self.stoi[word] = idx
                    self.itos[idx] = word
                    idx += 1

    def numericalize(self, text):
        tokenized_text = self.tokenizer_eng(text)

        return [
            self.stoi[token] if token in self.stoi else self.stoi["<UNK>"]
            for token in tokenized_text
        ]


In [82]:
class FlickrDataset(Dataset):
    def __init__(self, root_dir, captions_file, transform=None, freq_threshold=1):
        self.root_dir = root_dir
        self.df = pd.read_csv(captions_file, sep="\t", header=None)
        self.transform = transform

        # Get img, caption columns
        self.imgs = self.df[self.df.columns[0]]
        self.captions = self.df[self.df.columns[1]]

        # Initialize vocabulary and build vocab
        self.vocab = Vocabulary(freq_threshold)
        self.vocab.build_vocabulary(self.captions.tolist())

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
        caption = self.captions[index]
        img_id = self.imgs[index]
        img = Image.open(os.path.join(self.root_dir, img_id.split('#')[0])).convert("RGB")
        
        if self.transform is not None:
            img = self.transform(img)

        numericalized_caption = [self.vocab.stoi["<SOS>"]]
        numericalized_caption += self.vocab.numericalize(caption)
        numericalized_caption.append(self.vocab.stoi["<EOS>"])
        
        return img, torch.tensor(numericalized_caption)

In [83]:
class MyCollate:
    def __init__(self, pad_idx):
        self.pad_idx = pad_idx

    def __call__(self, batch):
        imgs = [item[0].unsqueeze(0) for item in batch]
        imgs = torch.cat(imgs, dim=0)
        targets = [item[1] for item in batch]
        targets = pad_sequence(targets, batch_first=True, padding_value=self.pad_idx)
        return imgs, targets

In [84]:
def get_loader(
    root_folder,
    annotation_file,
    transform,
    batch_size=64,
    num_workers=0,
    shuffle=True,
    pin_memory=True,
):
    dataset = FlickrDataset(root_folder, annotation_file, transform=transform)

    pad_idx = dataset.vocab.stoi["<PAD>"]

    loader = DataLoader(
        dataset=dataset,
        batch_size=batch_size,
        num_workers=num_workers,
        shuffle=shuffle,
        drop_last=True,
        pin_memory=pin_memory,
        collate_fn=MyCollate(pad_idx=pad_idx),
    )

    return loader, dataset

In [85]:
transform = transforms.Compose(
    [transforms.Resize((224, 224)), transforms.ToTensor(),]
)

loader, dataset = get_loader(
    "F:\Flickr8k\Flicker8k_Dataset", "F:\Flickr8k\Flickr8k.token.txt", transform=transform
)

In [86]:
embedding_size = 512
hidden_size = 512
vocab_size = len(dataset.vocab)
batch = 64


model = EncoderToDecoder(embedding_size, hidden_size , vocab_size).to(device)

model_optimizer = optim.Adam(model.parameters(), lr=3e-4)

criterion = nn.CrossEntropyLoss(ignore_index=dataset.vocab.stoi["<PAD>"])


In [132]:
def train():
    
    model.train()
    
    for epoch in range(2):
        epoch_loss = 0
        for idx, (imgs, captions) in enumerate(loader):   
            
            imgs = imgs.to(device)
            captions = captions.to(device)
            
            model_optimizer.zero_grad()
            hidden = model.init_hidden(batch)
            result_batch = [1]
            loss = 0

            for i in range(1, captions.shape[1]):
                out, hidden = model(captions[:, i-1], imgs, hidden.view(batch, -1))

                topv, topi = out.topk(1)
                result_batch.append(topi[5].item())
                loss += criterion(out, captions[:, i])

            epoch_loss += loss.item()
            loss.backward() 
            # Clip the gradients for the sequential models
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)

            model_optimizer.step()

            if (idx+1) % 25  == 0 :
                result_string = [dataset.vocab.itos[x] if x in dataset.vocab.itos else dataset.vocab.itos[3] for x in result_batch]
                print(result_string)
                print('Loss after 25 batch iterations is {}'.format(epoch_loss / idx * 25))
        
        print('The loss for epoch {} is {}'.format(epoch, epoch_loss / len(loader.dataset)))
train()

['<SOS>', 'a', 'party', 'party', 'of', 'off', 'bridesmaids', 'party', 'is', 'standing', 'a', 'and', 'a', 'animals', 'gathered', 'outside', '<EOS>', 'a', 'a', 'a', 'a']
Loss after 25 batch iterations is 782.6465944449108
['<SOS>', 'a', 'man', 'wearing', 'a', 'red', 'on', 'a', 'beach', '<EOS>', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a']
Loss after 25 batch iterations is 757.9713344573975
['<SOS>', 'three', 'women', 'posing', 'beers', ' ', 'posing', 'for', 'a', 'picture', '<EOS>', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a']
Loss after 25 batch iterations is 749.9689463022593


KeyboardInterrupt: 

In [133]:
model.eval()

EncoderToDecoder(
  (encoder): EncoderCNN(
    (cnn): Sequential(
      (0): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
      (1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (2): ReLU(inplace=True)
      (3): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
      (4): Sequential(
        (0): BasicBlock(
          (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
          (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (relu): ReLU(inplace=True)
          (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
          (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (1): BasicBlock(
          (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
          (bn1): BatchNorm2d(64, e

In [137]:
def test_sentences():
    with torch.no_grad():
        horse = transform(Image.open(os.path.join(os.path.abspath('F:\\Flickr8k\\test'), 'cal.jpg')).convert("RGB"))
        dog = transform(Image.open(os.path.join(os.path.abspath('F:\\Flickr8k\\test'), 'dog.jpg')).convert("RGB"))
        girl = transform(Image.open(os.path.join(os.path.abspath('F:\\Flickr8k\\test'), 'girl4.png')).convert("RGB"))
        boat = transform(Image.open(os.path.join(os.path.abspath('F:\\Flickr8k\\test'), 'boat.png')).convert("RGB"))
        out = torch.ones(1, 1).type(torch.LongTensor).to(device)
        result = [1]

        hidden = model.init_hidden(1)

        for i in range(50):
            out, hidden = model(out, boat.view(1, horse.shape[0], horse.shape[1], horse.shape[2]).to(device), hidden.view(1, -1))
            topv, topi = out.topk(1)
            
            print(topi)
            
            out = topi.view(-1, 1)
            result.append(out.item())
        
    result_string = [dataset.vocab.itos[x] if x in dataset.vocab.itos else dataset.vocab.itos[3] for x in result]
    print(result_string)

test_sentences()

tensor([[2]], device='cuda:0')
tensor([[1952]], device='cuda:0')
tensor([[199]], device='cuda:0')
tensor([[13]], device='cuda:0')
tensor([[200]], device='cuda:0')
tensor([[2]], device='cuda:0')
tensor([[1952]], device='cuda:0')
tensor([[199]], device='cuda:0')
tensor([[13]], device='cuda:0')
tensor([[200]], device='cuda:0')
tensor([[2]], device='cuda:0')
tensor([[1952]], device='cuda:0')
tensor([[199]], device='cuda:0')
tensor([[13]], device='cuda:0')
tensor([[200]], device='cuda:0')
tensor([[2]], device='cuda:0')
tensor([[1952]], device='cuda:0')
tensor([[199]], device='cuda:0')
tensor([[13]], device='cuda:0')
tensor([[200]], device='cuda:0')
tensor([[2]], device='cuda:0')
tensor([[1952]], device='cuda:0')
tensor([[199]], device='cuda:0')
tensor([[13]], device='cuda:0')
tensor([[200]], device='cuda:0')
tensor([[2]], device='cuda:0')
tensor([[1952]], device='cuda:0')
tensor([[199]], device='cuda:0')
tensor([[13]], device='cuda:0')
tensor([[200]], device='cuda:0')
tensor([[2]], device='

In [136]:
torch.save(model.state_dict(),"C:\\Users\\Cordu\\Desktop\\Projects\\Artificial-Intelligence\\RIST PROJECTS\\NLP BEGINNER\\model.pth")