In [1]:
"""
Retrieval Model for "Wizard Of Wikipedia" task based on Dinan et al. Wizard Of Wikipedia: Knowledge
Powered Conversational Agents.

Deepak Goyal <deepak.16je002137@ece.iitism.ac.in>

"""

'\nRetrieval Model for "Wizard Of Wikipedia" task based on Dinan et al. Wizard Of Wikipedia: Knowledge\nPowered Conversational Agents.\n\nDeepak Goyal <deepak.16je002137@ece.iitism.ac.in>\n\n'

In [0]:
# Colab only: mount Google Drive at /content/drive. raw_data_loader reads the
# dataset from a path below that mount point when running_device is 'colab'.
from google.colab import drive
drive.mount('/content/drive')

In [0]:
"""
This code takes inspiration from the following sources:

The Annotated Transformer : http://nlp.seas.harvard.edu/2018/04/03/attention.html
Illustrated Transformer   : http://jalammar.github.io/illustrated-transformer/
Samuel Lynn Evans's Blog  : https://towardsdatascience.com/how-to-code-the-transformer-in-pytorch-24db27c8f9ec
Tensorflow's Medium Blog  : https://medium.com/tensorflow/a-transformer-chatbot-tutorial-with-tensorflow-2-0-88bf59e66fe2
Pytorch's Chatbot Tutorial: https://pytorch.org/tutorials/beginner/chatbot_tutorial.html

"""

In [0]:
# Framework for running model on GPU
import torch
import torch.nn as nn
import torch.nn.functional as F

# For Visualising gradient flow across the model
import matplotlib.pyplot as plt
from matplotlib.lines import Line2D

# For Visualising training loss, accuracy Vs epochs
# from visdom import Visdom

# Basic computation library needed for visdom visualising class
import numpy as np

# Other libraries for data loading, processing etc
import json
import os
import re
import math
import copy
import os
import string
import time

In [0]:
# Specify which machine you are running things on.
# raw_data_loader recognises 'local' and 'colab'; any other value makes it
# print "Invalid running device" and return None.
running_device='colab'

In [0]:
def raw_data_loader(running_device):
    """
        Parse the conversation dump ``data.json`` and hand back its content.

        Input:
            running_device(string): 'local' or 'colab'; selects the directory
                                    that holds data.json.

        Output:
            Whatever json.load produces for the file, or None (after printing
            "Invalid running device") when running_device is not recognised.
    """

    # One dataset directory per supported machine
    data_dirs={
        'local':'/home/naive/Documents/rohit/Wizard Of Wikipedia/Dataset',
        'colab':'/content/drive/My Drive/Data/Wizard of Wikipedia/wizard_of_wikipedia',
    }

    if running_device not in data_dirs:
        print("Invalid running device")
        return

    data_path=os.path.join(data_dirs[running_device],'data.json')

    with open(data_path) as handle:
        return json.load(handle)

In [0]:
# Bind the name up front so it exists even if the loader raises
json_data=None

print("Loading Raw Data into a list....")

# Wall-clock timing around the single loader call
t1=time.time()
json_data=raw_data_loader(running_device)
t2=time.time()

print("Loading raw data took {} seconds".format(t2-t1))

In [0]:
class vocabulary:
    """ Word-to-token lookup that grows as sentences of the corpus are added"""

    def __init__(self):
        """
            self.word2Index(Dictionary): Maps words to tokens; starts with the reserved
                                         entries PAD (0), <START> (1) and <END> (2)
            self.num_words(Scalar)     : Number of entries currently in word2Index

        """

        self.word2Index=dict(zip(('PAD',"<START>","<END>"),range(3)))
        self.num_words=len(self.word2Index)

    def addSentence(self,sentence):
        """
            Registers every piece of `sentence` obtained by splitting on single spaces
        """

        for token in sentence.split(" "):
            self.addWord(token)

    def addWord(self,word):
        """
            Assigns the next free token to `word`; known words are left untouched
        """

        if word in self.word2Index:
            return

        self.word2Index[word]=self.num_words
        self.num_words+=1

In [0]:
# Preprocessing functions for the data

def remove_punctuation(sentence):
    """ Deletes every string.punctuation character, lower-cases and trims the sentence """
    # thanks to https://stackoverflow.com/questions/265960/best-way-to-strip-punctuation-from-a-string
    # A translation table mapping a character to None removes that character
    drop_table=str.maketrans(dict.fromkeys(string.punctuation))
    without_punctuation=sentence.translate(drop_table)
    return without_punctuation.lower().strip()

In [0]:
def load_data(data_json):
    """
        Extracts, per conversation, the wizard utterances, the passages retrieved for
        each wizard turn and the dialogue history preceding each wizard turn.

        Input:
            data_json: sequence of conversations. Each conversation is read through the
                       keys 'chosen_topic' and 'dialog'; each dialog turn through
                       'speaker', 'text' and (wizard turns only) 'retrieved_passages',
                       a list of dicts whose values are lists of sentences.
                       The input is NOT modified.

        Output:
            context  : context[c] is a list with one history per wizard turn; a history
                       is the list [topic, utterance_0, ...] of everything said before
                       that turn
            knowledge: knowledge[c] is a list of cleaned sentence lists, one per
                       retrieved passage value, over all wizard turns of conversation c
            wizard   : wizard[c] is the list of cleaned wizard utterances
            dict2cnt : maps the running wizard-turn number (over all conversations) to
                       the number of retrieved passages of that turn

        All text is cleaned with remove_punctuation.
    """

    context, knowledge, wizard=[],[],[]
    dict2cnt={}
    count=0

    for current_conv in data_json:

        current_knowledge,current_wizard=[],[]
        dialog=current_conv['dialog']

        # The history always starts with the chosen topic
        tmp_context=[remove_punctuation(current_conv['chosen_topic'])]

        for turn in dialog:

            # NOTE(review): only the exact label '0_Wizard' is treated as a wizard turn;
            # confirm against the dataset whether other wizard labels exist.
            if turn['speaker']=='0_Wizard':

                passages=turn['retrieved_passages']
                dict2cnt[count]=len(passages)
                count=count+1

                for passage in passages:
                    for sentences in passage.values():
                        # Build a cleaned copy instead of overwriting the caller's list
                        current_knowledge.append([remove_punctuation(s) for s in sentences])

                wizard_dialog=remove_punctuation(turn['text'])
                current_wizard.append(wizard_dialog)
                tmp_context.append(wizard_dialog)

            else:
                tmp_context.append(remove_punctuation(turn['text']))

        knowledge.append(current_knowledge)
        wizard.append(current_wizard)

        # tmp_context[0] is the topic and tmp_context[d+1] is dialog turn d, so the
        # history of the wizard turn at dialog position d is tmp_context[:d+1].
        # With a wizard opening the dialog those positions are 0,2,4,..., otherwise
        # 1,3,5,... (an empty dialog is treated as "wizard not first").
        wizard_first=bool(dialog) and dialog[0]['speaker']=='0_Wizard'
        first_end=0 if wizard_first else 1

        # Never emit more histories than there are wizard utterances
        prefix_ends=list(range(first_end,len(tmp_context),2))[:len(current_wizard)]
        context.append([tmp_context[:end+1] for end in prefix_ends])

    return context, knowledge, wizard, dict2cnt

print("Getting 22311 conversation's context, knowledge, wizard utterances..")
# Time the extraction of the per-conversation lists
t1=time.time()
context, knowledge, wizard, dict2cnt=load_data(json_data)
t2=time.time()
print("This took: {} seconds".format(t2-t1))

In [0]:
def after_load_process(context,knowledge,wizard,passage_counts=None):
    """
        Flattens the per-conversation lists into per-wizard-turn lists.

        Input:
            context       : context[c] is the list of histories of conversation c
            knowledge     : knowledge[c] is the list of passages (each a list of
                            sentences) of conversation c
            wizard        : wizard[c] is the list of wizard utterances of conversation c
            passage_counts: maps wizard-turn number 0..n-1 to the number of passages
                            retrieved for that turn (dict or list). Defaults to the
                            module-level dict2cnt.

        Output:
            input_context : all histories, one per wizard turn
            knowledge_out : knowledge_out[t] is the list of passages of wizard turn t,
                            each passage joined into a single space-separated string
            output_wizard : all wizard utterances

        Raises:
            IndexError when passage_counts asks for more passages than were supplied.
    """
    if passage_counts is None:
        passage_counts=dict2cnt

    input_context=[]
    flat_knowledge=[]
    output_wizard=[]
    for i in range(len(wizard)):
        input_context.extend(context[i])
        flat_knowledge.extend(knowledge[i])
        output_wizard.extend(wizard[i])

    # Regroup the flat passage list turn by turn; the number of turns comes from
    # passage_counts itself rather than from a fixed constant.
    knowledge_out=[]
    index=0
    for turn in range(len(passage_counts)):
        r=passage_counts[turn]
        if index+r>len(flat_knowledge):
            raise IndexError("passage_counts needs more passages than knowledge provides")
        knowledge_out.append(
            [" ".join(passage).strip() for passage in flat_knowledge[index:index+r]]
        )
        index=index+r

    return input_context,knowledge_out,output_wizard

print("Converting the extracted sentences into lists...")
# Time the flattening into per-wizard-turn lists
t1=time.time()
final_context, final_knowledge, final_wizard=after_load_process(context,knowledge,wizard)
t2=time.time()
print("This took: {} seconds".format(t2-t1))
    


In [0]:
##############################Sentence Length Analysis#############################

# For each of the three collections: number of sentences (no_*) and total number
# of whitespace-separated words (avg_*, turned into the mean further down).
no_wizard=len(final_wizard)
avg_wizard=float(sum(len(utterance.split()) for utterance in final_wizard))

no_context=sum(len(history) for history in final_context)
avg_context=float(sum(len(sentence.split()) for history in final_context for sentence in history))

no_knowledge=sum(len(passages) for passages in final_knowledge)
avg_knowledge=float(sum(len(passage.split()) for passages in final_knowledge for passage in passages))

# Mean over all sentences of the three collections together
avg_sentence=(avg_knowledge+avg_wizard+avg_context)/(no_wizard+no_context+no_knowledge)
print("Average Sentence Length: "+str(avg_sentence))

# Per-collection means
avg_wizard=avg_wizard/len(final_wizard)
avg_context=avg_context/no_context
avg_knowledge=avg_knowledge/no_knowledge
print("Average wizard's sentence length: "+str(avg_wizard))
print("Average knowledge's sentence length: "+str(avg_knowledge))
print("Average context's sentence length: "+str(avg_context))



In [0]:
max_length=70

def process(sentence):
    """
        Brings a sentence to exactly max_length tokens: the words are truncated or
        padded with "PAD" to max_length-2 entries and then wrapped in the start
        (<START>) and end (<END>) tokens.

        Input:
            sentence(string): whitespace separated words

        Output:
            string of max_length tokens joined by single spaces
    """

    body_length=max_length-2
    tokens=sentence.split()

    if len(tokens)>body_length:
        tokens=tokens[:body_length]
    else:
        tokens=tokens+["PAD"]*(body_length-len(tokens))

    tokens=["<START>"]+tokens+["<END>"]
    assert len(tokens)==max_length

    return " ".join(tokens)

In [0]:
vocab=vocabulary()

def process_loaded(input_context,input_knowledge,output_wizard):
    """ Processing sentences and forming vocabulary

        Every sentence is passed through process() (padding / truncation plus
        <START>/<END>) and its tokens are added to the module-level `vocab`, in the
        order wizard utterances, knowledge passages, context sentences.

        The inputs are left untouched and new lists are returned, so running the
        cell again starts from the raw sentences instead of re-processing already
        padded ones. The raw lists stay alive as long as the caller keeps them.

        Input:
            input_context  : list of histories, each a list of sentences
            input_knowledge: list of passage lists, each passage a string
            output_wizard  : list of wizard utterances

        Output:
            (context, knowledge, wizard) with the same nesting as the inputs and
            every sentence exactly max_length tokens long
    """

    def _prepare(sentence):
        # Pad/truncate one sentence, register its tokens, verify its length
        processed=process(sentence)
        vocab.addSentence(processed)
        assert len(processed.split())==max_length
        return processed

    wizard_out=[_prepare(utterance) for utterance in output_wizard]
    knowledge_out=[[_prepare(passage) for passage in passages] for passages in input_knowledge]
    context_out=[[_prepare(sentence) for sentence in history] for history in input_context]

    return context_out, knowledge_out, wizard_out

print("Processing fetched sentences...")
# Time padding/truncation and vocabulary construction
t1=time.time()
p_context,p_knowledge,p_wizard=process_loaded(final_context,final_knowledge,final_wizard)
t2=time.time()
print("This process took: {}".format(t2-t1))

In [0]:
##########################SANITY CHECK##########################

# One entry per wizard turn is expected in each of the three lists
assert len(p_wizard)==50246
assert len(p_context)==50246
assert len(p_knowledge)==50246

print("Running Sanity Checks on the data...")

t1=time.time()

# Every processed sentence must consist of exactly max_length tokens
assert all(len(utterance.split())==max_length for utterance in p_wizard)

assert all(len(sentence.split())==max_length for history in p_context for sentence in history)

assert all(len(passage.split())==max_length for passages in p_knowledge for passage in passages)

t2=time.time()
print("Everything seems fine.")
print("Sanity Checks took: "+str(t2-t1)+" seconds")

################################################################

In [0]:
# # For Visualising training loss per epochs etc

# from visdom import Visdom

# class VisdomLinePlotter(object):
#     """Plots to Visdom"""
#     def __init__(self, env_name='main'):
#         self.viz = Visdom()
#         self.env = env_name
#         self.plots = {}
#     def plot(self, var_name, split_name, title_name, x, y):
#         if var_name not in self.plots:
#             self.plots[var_name] = self.viz.line(X=np.array([x,x]), Y=np.array([y,y]), env=self.env, opts=dict(
#                 legend=[split_name],
#                 title=title_name,
#                 xlabel='Epochs',
#                 ylabel=var_name
#             ))
#         else:
#             self.viz.line(X=np.array([x]), Y=np.array([y]), env=self.env, win=self.plots[var_name], name=split_name, update = 'append')

In [0]:
# First CUDA device when one is available, CPU otherwise
device=torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

In [0]:
class Embeddings(nn.Module):
    """ Token embedding lookup followed by positional encoding
    """
    def __init__(self, d_model, vocab_size,max_length):
        """ Builds the two stages that turn token ids into position-aware embeddings.

            Inputs:
                d_model(scalar)   : Size of each embedding vector.
                vocab_size(scalar): Number of rows of the embedding table.
                max_length(scalar): Sentence length handed to the positional encoder.

            Layers:
                self.embed: nn.Embedding table (vocab_size x d_model).
                self.pe   : PositionalEncoder applied to the looked-up embeddings.
        """
        super().__init__()
        self.embed = nn.Embedding(num_embeddings=vocab_size, embedding_dim=d_model)
        self.pe=PositionalEncoder(d_model,max_length)

    def forward(self, x):
        """ Looks up the embedding of every token and passes the result through self.pe.
            Input:
                x: Integer tensor of token ids; x[i,:] holds the tokens of the ith sentence.
                   size: (number_sentences,sentence_length)
            Output:
                Tensor with one d_model-sized vector per token, as returned by self.pe
                   size: (number_sentences,sentence_length,d_model)
        """

        return self.pe(self.embed(x))

In [0]:
class PositionalEncoder(nn.Module):
    """ To encode the position of a particular word in the sentence, Positional Encoder adds
        a fixed sinusoidal vector to each word embedding, depending on the word's position.
    """
    def __init__(self,d_model,max_length=200):
        """
            Input:
                d_model(scalar)        : The word embedding size of the model.
                max_length(scalar)     : The longest sentence length supported.

            Layers:
                self.pe: (max_length,d_model) buffer with
                             pe[pos,2j]   = sin(pos / 10000^(2j/d_model))
                             pe[pos,2j+1] = cos(pos / 10000^(2j/d_model))
                         i.e. the sine and cosine of one pair share a frequency.
                         Registered as a buffer so it follows the module through
                         .to()/.cuda() and never receives gradients.
                         Good Explanation: https://datascience.stackexchange.com/questions/51065/what-is-positional-encoding-in-transformer-model?rq=1
        """
        super().__init__()
        self.d_model=d_model

        # Column vector of positions and one inverse frequency per (sin,cos) pair
        position=torch.arange(max_length,dtype=torch.float).unsqueeze(1)
        even_dims=torch.arange(0,d_model,2,dtype=torch.float)
        inv_freq=torch.exp(even_dims*(-math.log(10000.0)/d_model))

        pe=torch.zeros(max_length,d_model)
        pe[:,0::2]=torch.sin(position*inv_freq)
        # For odd d_model there is one cosine column fewer than sine columns
        pe[:,1::2]=torch.cos(position*inv_freq[:d_model//2])

        self.register_buffer('pe',pe)

    def forward(self,x):
        """
            Input:
                x: Embedded sentences.
                   size: (...,sentence_length,d_model) with sentence_length<=max_length
            Output:
                x: Positional Encoded sentences.
                   size: same as the input
        """

        x=x*math.sqrt(self.d_model) # scale embeddings by sqrt(d_model) before adding positions
        x=x+self.pe[:x.size(-2)]    # only the rows for the positions actually present

        return x

In [0]:
def attention(q, k, v, d_k, mask=None, dropout=None):
    """ Scaled dot-product attention.

        Inputs:
            q, k, v: tensors whose last two dimensions are (length, d_k)
            d_k    : scalar; the scores are divided by sqrt(d_k)
            mask   : optional tensor; a dimension is inserted at position 1 and the
                     scores where the mask equals 0 are set to -1e9 before the softmax
            dropout: optional callable applied to the attention weights

        Output:
            softmax(q k^T / sqrt(d_k)) v
    """
    logits = q.matmul(k.transpose(-2, -1)) / math.sqrt(d_k)

    if mask is not None:
        logits = logits.masked_fill(mask.unsqueeze(1) == 0, -1e9)

    weights = F.softmax(logits, dim=-1)
    if dropout is not None:
        weights = dropout(weights)

    return weights.matmul(v)

In [0]:
class MultiHeadAttention(nn.Module):
    """ Multi-head attention as described in "Attention is all you need"
        For more details checkout: http://jalammar.github.io/illustrated-transformer/
    """
    def __init__(self, heads, d_model, dropout = 0.1):
        """
            Inputs:
                heads(scalar)  : Number of heads; must divide d_model.
                d_model(scalar): Size of the word embeddings used in the model
                dropout(scalar): Probability for the dropout layer

            Layers:
                self.q_linear: Linear projection of the queries
                self.v_linear: Linear projection of the values
                self.k_linear: Linear projection of the keys
                self.dropout : Dropout handed to attention() for the attention weights
                self.out     : Linear projection of the concatenated heads

        """
        super().__init__()

        assert d_model%heads==0

        self.d_model = d_model
        self.h = heads
        self.d_k = d_model// heads

        self.q_linear = nn.Linear(d_model, d_model)
        self.v_linear = nn.Linear(d_model, d_model)
        self.k_linear = nn.Linear(d_model, d_model)
        self.dropout = nn.Dropout(dropout)
        self.out = nn.Linear(d_model, d_model)

    def _split_heads(self, projected, batch_size):
        """ (batch,length,d_model) -> (batch,heads,length,d_k) """
        return projected.view(batch_size,-1,self.h,self.d_k).transpose(1,2)

    def forward(self, q, k, v,mask=None):
        """
            Inputs:
                q   : Query tensor
                      size: (batch,sentence_length,d_model)
                k   : Key tensor
                      size: (batch,sentence_length,d_model)
                v   : Value tensor
                      size: (batch,sentence_length,d_model)
                mask: optional mask forwarded unchanged to attention()

             Outputs:
                 output: Attention result of all heads, concatenated and projected
                         by self.out
                         size: (batch,sentence_length,d_model)
        """

        batch_size = q.size(0)

        keys = self._split_heads(self.k_linear(k), batch_size)
        queries = self._split_heads(self.q_linear(q), batch_size)
        values = self._split_heads(self.v_linear(v), batch_size)

        attended = attention(queries, keys, values, self.d_k,mask,self.dropout)

        # Back to (batch,length,heads,d_k) and merge the two trailing dimensions
        merged = attended.transpose(1,2).contiguous().view(batch_size,-1,self.d_model)

        return self.out(merged)

In [0]:
class FeedForward(nn.Module):
    """ Position-wise feed forward block: Linear -> ReLU -> Dropout -> Linear.
    """
    def __init__(self,d_model,d_ff=2048,dropout=0.1):
        """
            Input:
                d_model(scalar): Input and output size of the block
                d_ff(scalar)   : Output size of the first linear layer
                dropout(scalar): Dropout probability for the dropout layer

            Layers:
                self.linear1: Linear layer d_model -> d_ff
                self.dropout: Dropout layer (A simple way to prevent overfitting by Srivastava et al)
                              http://jmlr.org/papers/volume15/srivastava14a/srivastava14a.pdf
                self.linear2: Linear layer d_ff -> d_model
        """
        super().__init__()

        self.linear1=nn.Linear(in_features=d_model,out_features=d_ff)
        self.dropout=nn.Dropout(p=dropout)
        self.linear2=nn.Linear(in_features=d_ff,out_features=d_model)

    def forward(self,x):
        """
            Input:
                x: Tensor whose last dimension is d_model
                   size: (...,d_model)

            Output:
                linear2(dropout(relu(linear1(x))))
                   size: (...,d_model)
        """
        hidden=F.relu(self.linear1(x))
        hidden=self.dropout(hidden)
        return self.linear2(hidden)

In [0]:
class Encoder(nn.Module):
    """Single encoder layer: self attention and feed forward, each with a residual connection"""
    def __init__(self,d_model,d_ff,dropout,h):
        """
            Inputs:
                d_model(scalar): The size of the word embedding for the model.
                d_ff(scalar)   : Output size of the first linear layer of the feed forward block.
                dropout(scalar): The probability used by every dropout layer
                h(scalar)      : Number of attention heads. Make sure d_model%h==0

            Layers:
                self.mha       : The multihead attention sublayer
                self.ff        : The feed forward sublayer
                self.dropout1  : Dropout on the output of the attention sublayer
                self.dropout2  : Dropout on the output of the feed forward sublayer
        """

        super().__init__()

        self.mha=MultiHeadAttention(h,d_model,dropout)
        self.ff=FeedForward(d_model,d_ff,dropout)
        self.dropout1=nn.Dropout(dropout)
        self.dropout2=nn.Dropout(dropout)

    def forward(self,y,mask):
        """
            Inputs:
                y   : Embedded input sentences
                      size: (batch,max_length,d_model)
                mask: Mask forwarded to the attention sublayer (may be None)

            Output:
                Result of attention + residual followed by feed forward + residual
                      size: (batch,max_length,d_model)
        """
        # Sublayer 1: self attention (queries, keys and values are all y)
        attended=self.mha(y,y,y,mask)
        y=self.dropout1(attended)+y

        # Sublayer 2: position-wise feed forward
        transformed=self.ff(y)
        return self.dropout2(transformed)+y

In [0]:

class RetrievalModel(nn.Module):
    """ Scores candidate responses against a dialogue context enriched with knowledge.

        Context, knowledge and candidates share one embedding and one encoder layer.
    """

    def __init__(self, d_model, d_ff, dropout, h, vocab_size, max_length):
        """
            Inputs:
                d_model(scalar)   : Embedding size.
                d_ff(scalar)      : Hidden size of the encoder's feed forward block.
                dropout(scalar)   : Dropout probability of the encoder.
                h(scalar)         : Number of attention heads.
                vocab_size(scalar): Number of tokens in the vocabulary.
                max_length(scalar): Sentence length.

            Layers:
                self.embed       : Embeddings; the embedding table is excluded from training.
                self.encoderLayer: Encoder applied to every kind of input.
        """
        super().__init__()

        self.embed = Embeddings(d_model, vocab_size, max_length)
        # The embedding table is frozen
        self.embed.embed.weight.requires_grad = False

        self.encoderLayer = Encoder(d_model, d_ff, dropout, h)
        self.d_model = d_model
        self.length = max_length

    def _encode(self, tokens):
        """ Embeds and encodes token ids, then flattens every sentence into one vector """
        encoded = self.encoderLayer(self.embed(tokens), None)
        return encoded.view(tokens.size()[0], -1)

    def forward(self, x, k, r):
        """
            Inputs:
                x: token ids of the context sentences,   size: (n_context,sentence_length)
                k: token ids of the knowledge sentences, size: (n_knowledge,sentence_length)
                r: token ids of the candidate responses, size: (n_candidates,sentence_length)

            Output:
                res: one score per candidate, size: (n_candidates,)
        """
        X = self._encode(x)
        K = self._encode(k)
        R = self._encode(r)

        # Every context sentence attends over the knowledge sentences
        knowledge_weights = F.softmax(X.matmul(K.t()), dim=-1)
        attnd = X + knowledge_weights.matmul(K)

        # One unit-length vector summarising the whole context
        Z = attnd.mean(dim=0)
        Z = Z / torch.norm(Z)

        # NOTE(review): the scores are divided by the norm of the entire R matrix,
        # not by the norm of each candidate row - confirm this is intended.
        res = torch.matmul(R, Z)

        return res / torch.norm(R)


In [0]:
def plot_grad_flow(named_parameters):
    """
        Plotting gradient flow across various layers
        Thanks to: https://discuss.pytorch.org/t/check-gradient-flow-in-network/15063/2

        Draws the mean absolute gradient of every trainable non-bias parameter onto
        the current matplotlib figure. Call it after backward().

        Input:
            named_parameters: iterable of (name, parameter) pairs, e.g.
                              model.named_parameters()
    """
    ave_grads = []
    layers = []
    for n, p in named_parameters:
        # Parameters that took no part in the last backward pass have no gradient
        if p.requires_grad and ("bias" not in n) and p.grad is not None:
            layers.append(n)
            # .item() yields a plain float, so matplotlib never sees a (CUDA) tensor
            ave_grads.append(p.grad.abs().mean().item())
    plt.plot(ave_grads, alpha=0.3, color="b")
    plt.hlines(0, 0, len(ave_grads)+1, linewidth=1, color="k" )
    plt.xticks(range(0,len(ave_grads), 1), layers, rotation="vertical")
    # left/right replace the xmin/xmax keywords dropped by newer Matplotlib releases
    plt.xlim(left=0, right=len(ave_grads))
    plt.xlabel("Layers")
    plt.ylabel("average gradient")
    plt.title("Gradient flow")
    plt.grid(True)


In [0]:
def embed(s):

    """ Converting a sentence into a tensor of tokens

        Input:
            s(string): sentence of exactly max_length whitespace separated words,
                       all of which must be present in vocab.word2Index

        Output:
            1-D integer (torch.long) tensor of length max_length holding the token
            of every word

        Raises:
            KeyError for a word that is missing from the vocabulary
    """
    wordList=s.split()

    assert len(wordList)==max_length

    # Token ids are integers: build the tensor in one call instead of writing
    # the ids one by one into a float tensor
    return torch.tensor([vocab.word2Index[w] for w in wordList],dtype=torch.long)
    

In [0]:
def run_epoch(context,knowledge,wizard,model,nepochs,data_len=5,nbatches=10):
    """
        Trains `model` with Adam and prints accuracy, loss and duration of every epoch.

        Each epoch runs over the first nbatches*data_len samples. Within a batch the
        data_len wizard utterances of the batch are the candidates for every sample;
        the correct candidate of the k-th sample is the k-th utterance, and the
        cross-entropy over the candidate scores is the loss.

        Inputs:
            context  : context[s] is the list of processed context sentences of sample s
            knowledge: knowledge[s] is the list of processed knowledge passages of sample s
            wizard   : wizard[s] is the processed wizard utterance of sample s
            model    : module called as model(context_ids,knowledge_ids,candidate_ids)
                       returning one score per candidate
            nepochs  : number of epochs
            data_len : batch size
            nbatches : number of batches per epoch
    """

    start=time.time()

    avg_acc=0
    epochs=nepochs # Number of epochs

    optimizer=torch.optim.Adam(filter(lambda p: p.requires_grad, model.parameters()),lr=0.1,betas=(0.9,0.98),eps=1e-9,amsgrad=True)
    criterion=nn.CrossEntropyLoss()

    for i in range(epochs):  #epoch

        model.train()
        acc=0
        loss=0

        for j in range(nbatches): #batch
            counter_epoch=j*data_len # index of the first sample of this batch

            # Candidate responses: the wizard utterances of the whole batch
            wizard_em=torch.zeros(data_len,max_length,dtype=torch.long)
            for k in range(data_len):
                wizard_em[k,:]=embed(wizard[counter_epoch+k])

            target=torch.zeros(data_len,dtype=torch.long)
            inp=torch.zeros(data_len,data_len)

            for k in range(data_len):
                sample_context=context[counter_epoch+k]
                sample_knowledge=knowledge[counter_epoch+k]

                context_em=torch.zeros(len(sample_context),max_length,dtype=torch.long)
                for l in range(len(sample_context)):
                    context_em[l,:]=embed(sample_context[l])

                # Loop over this sample's own passages (not those of the first
                # sample of the batch)
                knowledge_em=torch.zeros(len(sample_knowledge),max_length,dtype=torch.long)
                for l in range(len(sample_knowledge)):
                    knowledge_em[l,:]=embed(sample_knowledge[l])

                out=model(context_em,knowledge_em,wizard_em)
                inp[k,:]=out.t()
                target[k]=k

                if torch.argmax(out).item()==k:
                    acc=acc+1

            output_loss=criterion(inp,target)

            optimizer.zero_grad()
            output_loss.backward()

            plot_grad_flow(model.named_parameters())
            optimizer.step()

            loss=loss+output_loss.item()
            torch.cuda.empty_cache()

        loss=loss/nbatches

        print("-"*80)
        print("Epoch: "+str(i)+" "+str((acc*100)/(nbatches*data_len)))
        avg_acc=avg_acc+(acc*100)/(nbatches*data_len)
        print(loss)
        print("-"*80)

        elapsed=time.time()-start
        start=time.time()
        print(elapsed)
        print("-"*80)

    # Without any epoch there is nothing to average
    print("Average Accuracy: "+str(avg_acc/epochs if epochs else 0.0))
  

In [0]:
# plotter=VisdomLinePlotter('main')
# d_model=256, d_ff=1024, dropout=0.1, heads=8
model=RetrievalModel(256,1024,0.1,8,vocab.num_words,max_length)

# Use the device selected earlier instead of assuming that a GPU exists
model.to(device)
if device.type=="cuda":
    # Tensors created inside run_epoch/embed must live on the GPU as well
    torch.set_default_tensor_type('torch.cuda.FloatTensor')

# Xavier initialisation of every weight matrix (parameters with more than one dimension)
for p in model.parameters():
    if p.dim()>1:
        nn.init.xavier_uniform_(p)
run_epoch(p_context,p_knowledge,p_wizard,model,1000)