# Getting the data

In [None]:
# Show the GPU attached to this runtime (model, driver/CUDA versions, memory in use).
!nvidia-smi

Tue Mar 30 11:15:42 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.56       Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   32C    P8     9W /  70W |      0MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
# Install third-party dependencies into the *kernel's* environment (`%pip`, not `!pip`).
# NOTE(review): versions are unpinned; a later cell calls `pl.Trainer(gpus=1)`, which looks
# tied to pytorch-lightning 1.x — consider pinning both packages once the versions are confirmed.
%pip install pytorch-lightning
%pip install gensim

In [None]:
import math

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
sns.set()

from gensim.models import KeyedVectors
from gensim.utils import tokenize

import torch
from torch import nn
from torch.nn import functional as F
from torch.utils.data import DataLoader
from torch.utils.data import random_split
from torchvision import transforms

import torchtext
from torchtext.data import get_tokenizer

import pytorch_lightning as pl

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
import gensim.downloader as api

info = api.info()  # metadata about available models/datasets (not used by any cell shown in this notebook)
wordmodel = api.load("glove-wiki-gigaword-50")  # download the model (if needed) and return it ready for word lookups



# Vocab

In [None]:
# Build the vocabulary: three special tokens followed by every word of the
# embedding model, so ids 0/1/2 are '<start>'/'<end>'/'<unk>' and the k-th
# embedding word gets id k + 3.
# Use the public attribute instead of reaching into `__dict__`: newer gensim
# releases expose the word list as `index_to_key`, older ones as `index2word`.
_embedding_words = getattr(wordmodel, 'index_to_key', None)
if _embedding_words is None:
  _embedding_words = wordmodel.index2word
index2word = ['<start>', '<end>', '<unk>'] + list(_embedding_words)
word2index = {w: i for i, w in enumerate(index2word)}

def indextoword(index):
  """Return the vocabulary word stored at position `index`.

  Args:
    index: integer position in `index2word`.

  Returns:
    The word, or None (after printing a hint) when `index` is out of range
    or is not usable as a list index.
  """
  try:
    return index2word[index]
  except (IndexError, TypeError):
    # Only lookup failures are handled here; anything else is a real bug
    # and should surface instead of being swallowed.
    print('enter a valid index')
    return None


def wordtoindex(word):
  """Return the vocabulary id of `word`, falling back to the id of '<unk>'.

  Args:
    word: token to look up in `word2index`.

  Returns:
    The integer id of `word`, or the id of '<unk>' when the token is not in
    the vocabulary (or cannot be used as a dictionary key).
  """
  try:
    return word2index[word]
  except (KeyError, TypeError):
    # KeyError: out-of-vocabulary token; TypeError: unhashable input.
    return word2index['<unk>']

# Dataset

In [None]:
# Sequence-length and embedding settings shared by the tokenizing helpers and the dataset.
dialgogue_turn_max_lenght = 10  # tokens kept per question/answer (default of myTokenizer)
caption_max_lenght = 15  # caption tokens, counting the '<start>'/'<end>' markers
embedlenght = 50  # width of one word vector; presumably matches the 50-d GloVe model — confirm

# Tokenizer obtained from torchtext under the name "basic_english".
tokenizer = get_tokenizer("basic_english")

def clippad(sequence, max_lenght=10):
  """Clip or right-pad `sequence` so it holds exactly `max_lenght` items.

  Args:
    sequence: list of tokens.
    max_lenght: target length.

  Returns:
    A new list: the first `max_lenght` items of `sequence`, followed by as
    many empty-string padding tokens as needed. The input is never modified
    (the previous version appended the padding to the caller's list).
  """
  clipped = list(sequence[:max_lenght])
  return clipped + [''] * (max_lenght - len(clipped))
  
def myTokenizer(sentence, max_lenght=dialgogue_turn_max_lenght, lowercase=True, caption=False):
  """Tokenize `sentence` and clip/pad the result to exactly `max_lenght` tokens.

  Args:
    sentence: raw text.
    max_lenght: length of the returned token list.
    lowercase: accepted for backward compatibility; it is not used by this
      function.
    caption: when True, wrap the tokens in '<start>' ... '<end>' markers.

  Returns:
    A list of `max_lenght` tokens, padded with '' when the text is short.
  """
  tokensequence = tokenizer(sentence)
  if caption:
    # Clip the words *before* adding the markers; clipping afterwards used to
    # cut off '<end>' for every caption longer than `max_lenght - 2` words.
    room = max(max_lenght - 2, 0)
    tokensequence = ['<start>'] + tokensequence[:room] + ['<end>']
  return clippad(tokensequence, max_lenght=max_lenght)

def myWordModel(s, embedlenght=embedlenght):
  """Return the embedding of token `s` as a list of numbers.

  Args:
    s: token to look up in `wordmodel`.
    embedlenght: length of the zero vector returned for unknown tokens.

  Returns:
    The word vector as a list, or a list of `embedlenght` zeros when the
    token has no embedding (padding '', '<start>', '<end>', rare words).
  """
  try:
    return list(wordmodel[s])
  except KeyError:
    # Only a missing key means "no embedding"; other errors must surface.
    return [0 for _ in range(embedlenght)]

class Dial2DescDataset(torch.utils.data.Dataset):
  """VisDial dialogues paired with their captions, fully embedded in memory.

  Each item is a tuple ``(question, answer, captionI, captionO)``:
    * question / answer: (10, dialgogue_turn_max_lenght, embedlenght) word
      vectors of the dialogue's questions and answers;
    * captionI: word vectors of the caption tokens without the last one
      (decoder input);
    * captionO: vocabulary ids of the caption tokens without the first one
      (decoder target, i.e. the input shifted by one position).

  All tensors are created on the module-level `device`.
  """

  def __init__(self, dev=False):
    """Load and embed one split.

    Args:
      dev: read the validation file when True, the training file otherwise.
    """
    path = 'drive/MyDrive/Datasets/VisDial/visdial_1.0_(Unzipped Files)/visdial_1.0_val.json' if dev else 'drive/MyDrive/Datasets/VisDial/visdial_1.0_(Unzipped Files)/visdial_1.0_train.json'

    # The 'data' column holds the answer pool, the dialogues and the question
    # pool under the row labels used below.
    data = pd.read_json(path)['data']
    answers = data.loc['answers']
    dialogs = data.loc['dialogs']
    questions = data.loc['questions']
    del data

    n = len(dialogs)
    turns_per_dialog = 10
    questionsarray = torch.empty([n, turns_per_dialog, dialgogue_turn_max_lenght, embedlenght]).to(device)
    answersarray = torch.empty([n, turns_per_dialog, dialgogue_turn_max_lenght, embedlenght]).to(device)
    # Fill every row. The previous loop stopped one dialogue early, leaving
    # the last row as uninitialised memory from torch.empty.
    for i, elm in enumerate(dialogs):
      # Each turn stores indices into the shared question/answer pools.
      questionsi = [questions[x['question']] for x in elm['dialog']]
      answersi = [answers[x['answer']] for x in elm['dialog']]
      questionsarray[i] = torch.tensor([[myWordModel(y) for y in myTokenizer(x)] for x in questionsi])
      answersarray[i] = torch.tensor([[myWordModel(y) for y in myTokenizer(x)] for x in answersi])
    del answers
    del questions

    captions = [elm['caption'] for elm in dialogs]
    del dialogs

    # Decoder inputs and outputs: same token sequence, shifted by one step.
    captionsTokens = [myTokenizer(elm, max_lenght=caption_max_lenght, caption=True) for elm in captions]
    captionsI = [[myWordModel(x) for x in y[:-1]] for y in captionsTokens]
    captionsO = [[wordtoindex(x) for x in y[1:]] for y in captionsTokens]

    self.captionsI = torch.tensor(captionsI).to(device)
    self.captionsO = torch.tensor(captionsO).to(device)
    self.questions = questionsarray
    self.answers = answersarray

  def __len__(self):
    """Number of dialogues in the split."""
    return len(self.captionsI)

  def __getitem__(self, idx):
    """Return ``(question, answer, captionI, captionO)`` for `idx` (an int or a slice)."""
    question = self.questions[idx]
    answer = self.answers[idx]
    captionI = self.captionsI[idx]
    captionO = self.captionsO[idx]
    return question, answer, captionI, captionO

In [None]:
# Build both splits; each constructor call reads its JSON file and embeds every dialogue up front.
train = Dial2DescDataset()
dev = Dial2DescDataset(dev=True)
len(train)  # number of training dialogues

123287

# Model

## Position Encoder

In [None]:
class PositionalEncoding(nn.Module):
  """Add fixed sinusoidal position information to a sequence-first tensor.

  Args:
    d_model: size of the last (feature) dimension; may be even or odd.
    dropout: dropout probability applied to the sum.
    max_len: longest sequence (size of dim 0) that can be encoded.

  `forward` returns ``dropout(x + pe)``: the positions are already added to
  the input, so callers must not add the result to `x` again.
  """

  def __init__(self, d_model, dropout=0.1, max_len=5000):
    super(PositionalEncoding, self).__init__()
    self.dropout = nn.Dropout(p=dropout)
    pe = torch.zeros(max_len, d_model)
    position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
    div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
    angles = position * div_term
    pe[:, 0::2] = torch.sin(angles)
    # With an odd d_model there is one cosine column fewer than sine columns;
    # without the slice the assignment fails with a shape mismatch.
    pe[:, 1::2] = torch.cos(angles[:, :d_model // 2])
    # Stored as (max_len, 1, d_model) so it broadcasts over the batch dimension.
    self.register_buffer('pe', pe.unsqueeze(1))

  def forward(self, x):
    """Return ``dropout(x + pe[:len(x)])`` for `x` of shape (seq, batch, d_model)."""
    x = x + self.pe[:x.size(0), :]
    return self.dropout(x)

## Encoder Layer

In [None]:
src_vocab_size = 400003  # NOTE(review): presumably the embedding vocabulary plus the 3 special tokens — confirm against len(index2word)
embed_size = 50  # width of the word vectors fed to the encoder/decoder
turns = 10  # question/answer rounds per dialogue
seq_len = 10  # tokens per utterance
heads = 2  # attention heads
hidden_size = 256  # LSTM hidden width (per direction)
num_layers = 1  # stacked LSTM layers
batch_size = 16  # also fixes the batch dimension of the encoder's initial LSTM states

class EnhancedInteractionDialogueEncoder(nn.Module):
  """Encode a multi-turn question/answer dialogue into a transformer memory.

  Pipeline, with a separate set of weights per turn:
    1. utterance encoding    - bidirectional LSTM over each question/answer;
    2. utterance interaction - multi-head attention of the question over its
       answer and of the answer over its question;
    3. densely connected recurrent layer - bidirectional LSTM over the
       concatenation [h, s, h - s, h * s, word vectors];
    4. memory output - linear projection to `embed_size`, positional
       encoding, one transformer encoder layer.

  Args:
    src_vocab_size: unused by this module; kept for interface compatibility.
    embed_size: width of the input word vectors.
    turns: number of dialogue turns (one set of layers per turn).
    heads: number of attention heads.
    hidden_size: LSTM hidden width per direction.
    num_layers: number of stacked LSTM layers.
    seq_len: tokens per utterance; sizes the positional-encoding table.

  NOTE(review): the initial LSTM states are fixed random tensors sized with
  the module-level `batch_size` and created on the module-level `device`.
  They live in plain Python lists, so they are neither parameters nor
  buffers, and inputs must have exactly `batch_size` dialogues.
  """

  def __init__(self, src_vocab_size, embed_size, turns, heads, hidden_size, num_layers, seq_len=seq_len):
    super(EnhancedInteractionDialogueEncoder, self).__init__()
    self.turns = turns
    self.hidden_size = hidden_size
    self.num_layers = num_layers

    def bilstms(input_size):
      # One bidirectional LSTM per dialogue turn.
      return nn.ModuleList([nn.LSTM(input_size, hidden_size, num_layers, bidirectional=True) for i in range(turns)])

    def initial_states():
      # One fixed random (h0, c0) pair per dialogue turn.
      return [(torch.randn(num_layers*2, batch_size, hidden_size).to(device), torch.randn(num_layers*2, batch_size, hidden_size).to(device)) for i in range(turns)]

    # Utterance encoding layer.
    self.UELSTMSQ = bilstms(embed_size)
    self.UEHiddensQ = initial_states()
    self.UELSTMSA = bilstms(embed_size)
    self.UEHiddensA = initial_states()

    # Utterance interaction layer.
    self.UIAttentionsQ = nn.ModuleList([nn.MultiheadAttention(2*hidden_size, heads) for i in range(turns)])
    self.UIAttentionsA = nn.ModuleList([nn.MultiheadAttention(2*hidden_size, heads) for i in range(turns)])

    # Densely connected recurrent layer: input is [h, s, h-s, h*s] (4 blocks
    # of width 2*hidden_size) plus the original word vectors.
    self.DCLSTMSQ = bilstms(embed_size+hidden_size*4*2)
    self.DCHiddensQ = initial_states()
    self.DCLSTMSA = bilstms(embed_size+hidden_size*4*2)
    self.DCHiddensA = initial_states()

    # Memory output layer.
    self.fc = nn.Linear(hidden_size*2*2, embed_size)
    self.position_encoding = PositionalEncoding(embed_size, max_len=turns*seq_len)
    self.MOTransformers = nn.TransformerEncoderLayer(embed_size, heads, dim_feedforward=embed_size)

  def forward(self, questions, answers):
    """Encode one batch of dialogues.

    Args:
      questions: word vectors of shape (batch, turns, seq_len, embed_size).
      answers: word vectors with the same shape as `questions`.

    Returns:
      Memory tensor of shape (turns * seq_len, batch, embed_size).
    """
    # (batch, turns, seq, embed) -> (turns, seq, batch, embed)
    questions = questions.permute(1, 2, 0, 3)
    answers = answers.permute(1, 2, 0, 3)
    batch = questions.shape[2]
    turn_ids = range(self.turns)

    # Utterance encoding layer.
    ha = torch.stack([self.UELSTMSQ[i](questions[i], self.UEHiddensQ[i])[0] for i in turn_ids])
    hb = torch.stack([self.UELSTMSA[i](answers[i], self.UEHiddensA[i])[0] for i in turn_ids])

    # Utterance interaction layer.
    sa = torch.stack([self.UIAttentionsQ[i](ha[i], hb[i], hb[i])[0] for i in turn_ids])
    sb = torch.stack([self.UIAttentionsA[i](hb[i], ha[i], ha[i])[0] for i in turn_ids])

    # Densely connected recurrent layer.
    sae = torch.cat((ha, sa, ha-sa, ha*sa, questions), -1)
    sbe = torch.cat((hb, sb, hb-sb, hb*sb, answers), -1)
    hap = torch.stack([self.DCLSTMSQ[i](sae[i], self.DCHiddensQ[i])[0] for i in turn_ids])
    hbp = torch.stack([self.DCLSTMSA[i](sbe[i], self.DCHiddensA[i])[0] for i in turn_ids])

    # Memory output layer: flatten (turns, seq) into one memory axis.
    out = torch.cat((hap, hbp), -1).reshape(-1, batch, self.hidden_size*2*2)
    out = self.fc(out)
    # position_encoding already returns `out + pe`; adding that result to
    # `out` again (as before) doubled the content term.
    out = self.position_encoding(out)
    out = self.MOTransformers(out)
    return out

In [None]:
# Smoke test: push one batch through a fresh encoder and decoder and show the output shape.
# NOTE(review): `Decoder`, `question`, `answer` and `captioni` are only defined in later
# cells, so this cell fails on a fresh top-to-bottom run.
encoder = EnhancedInteractionDialogueEncoder(src_vocab_size, embed_size, turns, heads, hidden_size, num_layers).to(device)
m = encoder.forward(question, answer)
decode = Decoder(embed_size, hidden_size, heads).to(device)
decode.forward(captioni, m).shape

## Decoder Layer

In [None]:
caption_len = 14  # decoder sequence length; sizes the positional-encoding table below

class Decoder(nn.Module):
  """Single transformer decoder layer over the (shifted) caption embeddings.

  Args:
    embed_size: width of the caption word vectors and of the memory.
    hidden_size: unused by this module; kept for interface compatibility.
    heads: number of attention heads.
  """

  def __init__(self,embed_size, hidden_size, heads):
    super(Decoder, self).__init__()

    self.positionE = PositionalEncoding(embed_size, max_len=caption_len)
    self.transformersDecoder = nn.TransformerDecoderLayer(embed_size, heads)

  def forward(self, tgt, memory):
    """Decode one batch.

    Args:
      tgt: caption word vectors of shape (batch, steps, embed_size).
      memory: encoder output of shape (memory_len, batch, embed_size).

    Returns:
      Tensor of shape (steps, batch, embed_size).
    """
    tgt = tgt.permute(1, 0, 2)  # -> (steps, batch, embed)
    # positionE already returns `tgt + pe`; adding that result to `tgt` again
    # (as before) doubled the content term.
    out = self.positionE(tgt)
    # Causal mask: position t may only attend to positions <= t. Without it,
    # teacher-forced training can read the very token it is asked to predict.
    steps = out.size(0)
    causal_mask = torch.triu(torch.full((steps, steps), float('-inf'), device=out.device), diagonal=1)
    out = self.transformersDecoder(out, memory, tgt_mask=causal_mask)
    return out

## Full model

In [None]:
# Hyper-parameters
num_epochs = 4  # passed to pl.Trainer(max_epochs=...)
learning_rate = 1e-5  # Adam step size used in Dial2Desc.configure_optimizers


class Dial2Desc(pl.LightningModule):
    """Dialogue-to-description model: dialogue encoder, caption decoder, vocabulary head.

    The constructor arguments are forwarded to the encoder/decoder;
    `src_vocab_size` is also the width of the output layer. The dataloaders
    read the module-level `train` / `dev` datasets and `batch_size`; the
    optimizer reads the module-level `learning_rate`.
    """

    def __init__(self, src_vocab_size, embed_size, turns, heads, hidden_size, num_layers, seq_len=seq_len):
        super(Dial2Desc, self).__init__()
        self.num_layers = num_layers
        self.hidden_size = hidden_size
        self.encoder = EnhancedInteractionDialogueEncoder(src_vocab_size, embed_size, turns, heads, hidden_size, num_layers)
        self.decoder = Decoder(embed_size, hidden_size, heads)
        self.fc = nn.Linear(embed_size, src_vocab_size)
        self.logsoftmax = nn.LogSoftmax(dim=-1)
        self.loss = nn.NLLLoss()

    def forward(self, questions, anwsers, captionsI):
        """Return per-step log-probabilities over the vocabulary, shape (steps, batch, vocab)."""
        m = self.encoder(questions, anwsers)
        out = self.decoder(captionsI, m)
        out = self.fc(out)
        out = self.logsoftmax(out)
        return out

    def _shared_step(self, batch):
        """Compute the caption NLL loss for one batch (shared by training and validation)."""
        questions, anwsers, captionsI, captionsO = batch
        outputs = self(questions, anwsers, captionsI)
        # NLLLoss expects the class dimension second: (batch, vocab, steps).
        outputs = outputs.permute(1, 2, 0)
        return self.loss(outputs, captionsO)

    def training_step(self, batch, batch_idx):
        """Return the training loss for `batch` and log it as 'train_loss'."""
        loss = self._shared_step(batch)
        self.log('train_loss', loss)
        return {"loss": loss}

    def train_dataloader(self):
        """Shuffled loader over `train`; incomplete final batches are dropped."""
        train_loader = torch.utils.data.DataLoader(
            dataset=train, batch_size=batch_size, shuffle=True, drop_last=True
        )
        return train_loader

    def val_dataloader(self):
        """Ordered loader over `dev`; incomplete final batches are dropped."""
        val_loader = torch.utils.data.DataLoader(
            dataset=dev, batch_size=batch_size, shuffle=False, drop_last=True
        )
        return val_loader

    def validation_step(self, batch, batch_idx):
        """Return the validation loss for `batch`."""
        loss = self._shared_step(batch)
        return {"val_loss": loss}

    def validation_epoch_end(self, outputs):
        """Average the per-batch validation losses and log the mean as 'val_loss'.

        Args:
            outputs: list of the dictionaries returned by `validation_step`.

        Returns:
            ``{'val_loss': mean loss}``, kept for backward compatibility.
        """
        avg_loss = torch.stack([x['val_loss'] for x in outputs]).mean()
        # Log through self.log so the metric reaches loggers and callbacks.
        self.log('val_loss', avg_loss)
        return {'val_loss': avg_loss}

    def configure_optimizers(self):
        """Adam over all parameters with the module-level `learning_rate`."""
        return torch.optim.Adam(self.parameters(), lr=learning_rate)

In [None]:
# Instantiate the full model from the configuration constants defined with the encoder.
model = Dial2Desc(src_vocab_size, embed_size, turns, heads, hidden_size, num_layers)

In [None]:
# Train for `num_epochs` epochs on one GPU; the model supplies its own train/val dataloaders.
trainer = pl.Trainer(max_epochs=num_epochs, gpus=1)
trainer.fit(model)

GPU available: True, used: True
TPU available: None, using: 0 TPU cores
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name       | Type                               | Params
------------------------------------------------------------------
0 | encoder    | EnhancedInteractionDialogueEncoder | 130 M 
1 | decoder    | Decoder                            | 227 K 
2 | fc         | Linear                             | 20.4 M
3 | logsoftmax | LogSoftmax                         | 0     
4 | loss       | NLLLoss                            | 0     
------------------------------------------------------------------
150 M     Trainable params
0         Non-trainable params
150 M     Total params
603.298   Total estimated model params size (MB)


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validation sanity check', layout=Layout…





HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Training', layout=Layout(flex='2'), max…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…




1

In [None]:
# Take the first validation batch; slicing the dataset once returns
# (questions, answers, captionsI, captionsO) for those rows.
question, answer, captioni = dev[:batch_size][:3]
# Buffers for step-by-step decoding, sized from the config constants instead
# of repeating the literals 16 / 14 / 50.
start = torch.zeros((batch_size, caption_len, embed_size)).to(device)
predictions = torch.zeros((batch_size, caption_len))
# TODO: greedy decoding is not implemented yet, so `predictions` stays all
# zeros. Unfinished draft, kept for reference only:
# for i in range(caption_len):
#   output = model.forward(question, answer, start)
#   tokens = output.permute(1, 0, 2)[:, i, :].argmax(dim=-1)
#   predictions[:, i] = tokens
#   for j in range(caption_len):
#     start[j][i] = torch.tensor(myWordModel(tokens[i]))

In [None]:
# Print the teacher-forced prediction for the first dialogue of the batch.
# The previous version overwrote the model output with `predictions`, which
# is all zeros while the decoding loop is disabled, so only '<start>' was shown.
model.to(device)  # no-op when the model is already on `device`
model.eval()  # disable dropout for inference
with torch.no_grad():
  out = model(question, answer, captioni).permute(1, 0, 2).argmax(dim=-1)[0].tolist()
for elm in out:
  print(indextoword(elm), end=' ')