In [1]:
import json
import pandas as pd
import torch
import torch.nn as nn
import re
import spacy
from collections import Counter,defaultdict
import torchtext
from torchtext.data.utils import get_tokenizer

import numpy as np
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence


In [2]:
with open('D:/Squad/train.json') as f:
    data = json.load(f)

with open('D:/Squad/dev.json') as f:
    test_data=json.load(f) 

In [3]:
tokenizer = get_tokenizer("spacy", language='en_core_web_sm')

In [4]:
df = pd.DataFrame(columns=['id','title','context','question','answer'])
for topic in data["data"]:
  for paragraph in topic["paragraphs"]:
    for context in paragraph["qas"]:
      df = df.append({'id':context["id"], 'title':topic["title"],'context':paragraph["context"],'question':context["question"],'answer':context["answers"][0]['text']}, ignore_index=True)

test_df = pd.DataFrame(columns=['id','title','context','question','answer'])
for topic in test_data["data"]:
  for paragraph in topic["paragraphs"]:
    for context in paragraph["qas"]:
      df = df.append({'id':context["id"], 'title':topic["title"],'context':paragraph["context"],'question':context["question"],'answer':context["answers"][0]['text']}, ignore_index=True)


In [6]:
unique_context_df=pd.DataFrame(df['context'].unique(),columns=['context'])
question_context_df=pd.DataFrame(df['question'].unique(),columns=['question'])


In [7]:
dict_context = []
char_context=[]
ques_context=[]
char_question=[]
for i in range(len(unique_context_df)):
  dict_context.extend(list(tokenizer(unique_context_df['context'][i].lower().strip())))
  for word in list(tokenizer(unique_context_df['context'][i].lower().strip())):
    char_context.extend(list(word))
for i in range(len(question_context_df)):
  ques_context.extend(list(tokenizer(question_context_df['question'][i].lower().strip())))
  for word in list(tokenizer(question_context_df['question'][i].lower().strip())):
    char_question.extend(list(word))


In [8]:
vocab=[]
char=[]
vocab.append('pad')
vocab.append('unk')
char.append('pad')
char.append('unk')
vocab.extend(list(Counter(list(Counter(dict_context).keys())+list(Counter(ques_context).keys())).keys()))
char.extend(list(Counter(list(Counter(char_context).keys())+list(Counter(char_question).keys())).keys()))



In [9]:
word2idx = defaultdict(lambda:1)
i=0;
for z in vocab:
    word2idx[z]=i;
    i=i+1;

char2idx = defaultdict(lambda:1)
i=0;
for z in char:
    char2idx[z]=i;
    i=i+1;

In [10]:
from torchtext.vocab import GloVe
glove = GloVe(name='6B', dim=100)

In [11]:
embedding_dim=100
word_embedding = np.zeros((len(vocab),embedding_dim))
for i, word in enumerate(vocab):
    word_embedding[i] = glove[word]

word_embedding=torch.Tensor(word_embedding)


In [12]:
class Question_Answer(Dataset):
    def __init__(self,data,tokenizer,glove):
        self.data= data;
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        context=self.data['context'][idx]
        line = context.lower().strip()
        tokens = self.tokenizer(line)
        context_data = torch.tensor([word2idx[x] for x in tokens])
        
        question=self.data['question'][idx]
        tks=self.tokenizer(question.lower().strip())
        question_data = torch.tensor([word2idx[x] for x in tks])
        
        answer=self.data['answer'][idx]
        tks=self.tokenizer(answer.lower().strip())
        answer_data = torch.tensor([word2idx[x] for x in tks])
        
        return context_data, question_data, answer_data

In [13]:
def padding_and_masking(batch):
    context, question, answer = zip(*batch)
    context_seq_length=torch.tensor([len(x) for x in context])
    question_seq_length=torch.tensor([len(x) for x in question])
    answer_seq_length=torch.tensor([len(x) for x in answer])
    
    context = pad_sequence(context, batch_first=True, padding_value=0)
    question=pad_sequence(question, batch_first=True, padding_value=0)
    answer=pad_sequence(answer, batch_first=True, padding_value=0)
    

    return context, question, answer ,context_seq_length, question_seq_length, answer_seq_length

In [14]:
train_data = Question_Answer(df, tokenizer, glove)
test_data  = Question_Answer(test_df, tokenizer, glove)

In [25]:
train_loader = DataLoader(train_data, batch_size=16, shuffle=True,collate_fn=padding_and_masking)
test_loader = DataLoader(test_data, batch_size=16, shuffle=True,collate_fn=padding_and_masking)

In [26]:
context,question,answer,context_seq_length,question_seq_length,answer_seq_length = next(iter(train_loader))



In [27]:
class BIDAF(nn.Module):
    def __init__(self, num_embeddings, embedding_dim, hidden_size):
        super(BIDAF,self).__init__()
        self.embedding = None;
        self.lstm = nn.LSTM(embedding_dim,hidden_size,bidirectional=True)
        self.lstm1=nn.LSTM(embedding_dim*4, embedding_dim, bidirectional=True, num_layers=2)
        self.lstm2= nn.LSTM(embedding_dim*2, embedding_dim, bidirectional=True)
        self.output1= nn.Linear(embedding_dim*6,1)
        self.output2=nn.Linear(embedding_dim*6,1)
        
    def forward(self,context, question):
        que_emb = self.embedding(question)
        
        outq,(hq,cq) = self.lstm(que_emb)
        
        con_emb=self.embedding(context)
        
        outc,(hc,cc)=self.lstm(con_emb)
        
        matrix=torch.bmm(outc,outq.transpose(1, 2))
        
        C2Q = nn.functional.softmax(matrix, dim=-1)
        
        Q2C = nn.functional.softmax(torch.max(matrix,dim=-1)[0],dim=-1)
        
        b=torch.bmm(C2Q,que_emb)
        
        J=torch.bmm(torch.unsqueeze(Q2C,1),con_emb)
        
        J=J.repeat(1,con_emb.shape[1],1)
        
        F = torch.cat([con_emb, b, torch.mul(con_emb,b), torch.mul(con_emb, J)],dim=2)
        
        out1,(h1,c1)= self.lstm1(F)
        out2,(h2,c2)=self.lstm2(out1)
        
        p1=self.output1(torch.cat([F,out1],dim=2))
        
        p1=p1.squeeze()
        
        p2=self.output2(torch.cat([F,out2],dim=2))
        
        p2=p2.squeeze()
        
        return p1,p2

In [28]:
context,question,answer,context_seq_length,question_seq_length,answer_seq_length = next(iter(train_loader))
num_embeddings=len(vocab) 
embedding_dim=100
hidden_size=100
bidaf=BIDAF(num_embeddings,embedding_dim,100)
bidaf.embedding=nn.Embedding.from_pretrained(word_embedding)
p1,p2=bidaf.forward(context,question)

start_index=[];
end_index=[];
answer_seq_length=answer_seq_length-1;
            
for ans,index in zip(answer,answer_seq_length):
    end_index.append(int(ans[index]))
    start_index.append(int(ans[0]))
            
start_index=torch.tensor(start_index)
end_index=torch.tensor(end_index)

z=nn.functional.cross_entropy(p1,start_index)
print(z)

tensor(5.0245, grad_fn=<NllLossBackward>)


In [29]:
hidden_layer_neurons = 100
vocabulary_size = len(vocab)
embedding_dim = 100
model=BIDAF(num_embeddings,embedding_dim,hidden_layer_neurons)
model.embedding=nn.Embedding.from_pretrained(word_embedding)
# Cross-Entropy loss
loss_fn = nn.CrossEntropyLoss()

# Learning rate
lr = 0.01
epochs = 10
optimizer = optim.Adam(model.parameters(), lr=lr)

In [45]:
def train_and_evaluate_BIDAF(model, train_loader):

    train_loss = 0.
    model.train()
    for batch in train_loader:
        optimizer.zero_grad()
        context, question, answer ,context_seq_length, question_seq_length, answer_seq_length  = batch
        p1,p2 = model(context, question)
        start_index=[];
        end_index=[];
        answer_seq_length=answer_seq_length-1;
            
        for ans,index in zip(answer,answer_seq_length):
            end_index.append(int(ans[index]))
            start_index.append(int(ans[0]))
            
        start_index=torch.tensor(start_index)
        end_index=torch.tensor(end_index)
        

        
        loss = nn.functional.cross_entropy(p1, start_index) + nn.functional.cross_entropy(p2, end_index)

        loss.backward()
    
    

        optimizer.step()

        train_loss += loss.item()

    return train_loss/len(train_loader)

In [46]:
train_loss = train_and_evaluate_BIDAF(model,train_loader)

In [47]:
train_loss

8.255741834640503

In [49]:
def test_and_evaluate_BIDAF(model, test_loader):

    test_loss = 0.
    with torch.no_grad():
        model.eval()
        for batch in test_loader:
            context, question, answer ,context_seq_length, question_seq_length, answer_seq_length  = batch
            p1,p2 = model(context, question)
            start_index=[];
            end_index=[];
            answer_seq_length=answer_seq_length-1;
            
            for ans,index in zip(answer,answer_seq_length):
                end_index.append(int(ans[index]))
                start_index.append(int(ans[0]))
            
            start_index=torch.tensor(start_index)
            end_index=torch.tensor(end_index)
        

        
            loss = nn.functional.cross_entropy(p1, start_index) + nn.functional.cross_entropy(p2, end_index)

            

            test_loss += loss.item()

    return test_loss/len(test_loader)

In [50]:
test_loss = test_and_evaluate_BIDAF(model,test_loader)

In [51]:
test_loss

7.94553804397583

In [77]:
def prediction(model,context,question,answer):
    p1,p2=model(context,question);
    p1=torch.argmax(p1,dim=-1);
    p2=torch.argmax(p2,dim=-1);
    c=[]
    for i in context[0]:
        if(i!=0):
            c.append(vocab[i])
    print(' '.join(c))
    print(' ')
    q=[]
    for i in question[0]:
        if(i!=0):
            q.append(vocab[i])
    print(' '.join(q))
    print(' ')
    a=[]
    for i in answer[0]:
        if(i!=0):
            a.append(vocab[i])
    print(' '.join(a))
    print(' ')
    
    print([vocab[p1[0]],vocab[p2[0]]])


In [82]:
context,question,answer,context_seq_length,question_seq_length,answer_seq_length = next(iter(train_loader))
prediction(model,context,question,answer)

super bowl 50 was an american football game to determine the champion of the national football league ( nfl ) for the 2015 season . the american football conference ( afc ) champion denver broncos defeated the national football conference ( nfc ) champion carolina panthers 24–10 to earn their third super bowl title . the game was played on february 7 , 2016 , at levi 's stadium in the san francisco bay area at santa clara , california . as this was the 50th super bowl , the league emphasized the " golden anniversary " with various gold - themed initiatives , as well as temporarily suspending the tradition of naming each super bowl game with roman numerals ( under which the game would have been known as " super bowl l " ) , so that the logo could prominently feature the arabic numerals 50 .
 
who won super bowl 50 ?
 
denver broncos
 
['denver', 'broncos']


In [83]:
context,question,answer,context_seq_length,question_seq_length,answer_seq_length = next(iter(train_loader))
prediction(model,context,question,answer)

super bowl 50 was an american football game to determine the champion of the national football league ( nfl ) for the 2015 season . the american football conference ( afc ) champion denver broncos defeated the national football conference ( nfc ) champion carolina panthers 24–10 to earn their third super bowl title . the game was played on february 7 , 2016 , at levi 's stadium in the san francisco bay area at santa clara , california . as this was the 50th super bowl , the league emphasized the " golden anniversary " with various gold - themed initiatives , as well as temporarily suspending the tradition of naming each super bowl game with roman numerals ( under which the game would have been known as " super bowl l " ) , so that the logo could prominently feature the arabic numerals 50 .
 
what team was the nfc champion ?
 
carolina panthers
 
['denver', 'national']


In [96]:
context,question,answer,context_seq_length,question_seq_length,answer_seq_length = next(iter(train_loader))
prediction(model,context,question,answer)

super bowl 50 was an american football game to determine the champion of the national football league ( nfl ) for the 2015 season . the american football conference ( afc ) champion denver broncos defeated the national football conference ( nfc ) champion carolina panthers 24–10 to earn their third super bowl title . the game was played on february 7 , 2016 , at levi 's stadium in the san francisco bay area at santa clara , california . as this was the 50th super bowl , the league emphasized the " golden anniversary " with various gold - themed initiatives , as well as temporarily suspending the tradition of naming each super bowl game with roman numerals ( under which the game would have been known as " super bowl l " ) , so that the logo could prominently feature the arabic numerals 50 .
 
which nfl team won super bowl 50 ?
 
denver broncos
 
['denver', 'national']
