<a href="https://colab.research.google.com/github/AmanPriyanshu/QnA-System/blob/main/QnA_System.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!wget https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v2.0.json

--2022-02-19 10:04:04--  https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v2.0.json
Resolving rajpurkar.github.io (rajpurkar.github.io)... 185.199.110.153, 185.199.111.153, 185.199.109.153, ...
Connecting to rajpurkar.github.io (rajpurkar.github.io)|185.199.110.153|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 42123633 (40M) [application/json]
Saving to: ‘train-v2.0.json’


2022-02-19 10:04:05 (180 MB/s) - ‘train-v2.0.json’ saved [42123633/42123633]



In [2]:
# Sanity check: list the working directory to confirm the dataset file is there.
!ls

sample_data  train-v2.0.json


In [3]:
import json
import re
import numpy as np
import pandas as pd

In [4]:
def reader(raw_data, N=None):
  """Flatten SQuAD-style records into a (topic, question, answer) table.

  Every entry of ``raw_data`` is read for its ``'title'`` and ``'paragraphs'``,
  every paragraph for its ``'qas'``, and every qa for its ``'question'`` and
  the ``'text'`` of each item under ``'answers'``. One row is produced per
  answer, so a question whose answer list is empty contributes no rows.

  Args:
    raw_data: iterable of article dicts with the keys described above.
    N: optional row cap; when not None only ``rows[:N]`` is kept.

  Returns:
    pandas.DataFrame with the columns ``topic``, ``question`` and ``answer``.
  """
  def _triples():
    # Walk article -> paragraph -> qa -> answer, yielding one flat record each.
    for article in raw_data:
      title = article['title']
      for para in article['paragraphs']:
        for qa in para['qas']:
          asked = qa['question']
          texts = [entry['text'] for entry in qa['answers']]
          for text in texts:
            yield title, asked, text

  rows = list(_triples())
  if N is not None:
    rows = rows[:N]
  return pd.DataFrame({
      'topic': [row[0] for row in rows],
      'question': [row[1] for row in rows],
      'answer': [row[2] for row in rows],
  })

In [5]:
def load_sentences(N=None, path='train-v2.0.json'):
  """Load a SQuAD-format JSON file and flatten it with ``reader``.

  Args:
    N: optional row cap forwarded to ``reader``.
    path: location of the JSON file; its top-level ``'data'`` entry is what
      gets passed to ``reader``.

  Returns:
    Whatever ``reader`` returns for the file's ``'data'`` entry.
  """
  # Decode explicitly as UTF-8 instead of the platform's locale encoding, so
  # non-ASCII text in the dataset (e.g. "Beyoncé") loads the same everywhere.
  with open(path, encoding='utf-8') as f:
    data = json.load(f)
  return reader(data['data'], N=N)

In [6]:
from nltk.stem import PorterStemmer
from tqdm.notebook import trange, tqdm

ps = PorterStemmer()
def preprocess_qs(sentence):
  """Normalise a sentence into a space-joined string of Porter stems.

  Steps, in order:
    1. Unicode-decompose (NFKD) and drop combining marks, so letters that are
       a base letter plus a diacritic become the base letter ("é" -> "e").
    2. Lower-case.
    3. Replace every character outside a-z / A-Z with a space.
    4. Split on whitespace and stem each token with the module-level ``ps``.

  Args:
    sentence: the raw text.

  Returns:
    The stems joined by single spaces ("" when no alphabetic token remains).
  """
  import unicodedata  # stdlib; imported here so this cell stays self-contained

  # Without this folding step the ASCII-only regex below would turn accented
  # letters into spaces, truncating or splitting words ("café" -> "caf",
  # "naïve" -> "na ve"). Combining marks must be removed, not left for the
  # regex, otherwise the mark itself would become a space inside the word.
  decomposed = unicodedata.normalize("NFKD", sentence)
  sentence = ''.join(ch for ch in decomposed if not unicodedata.combining(ch))
  sentence = sentence.lower()
  sentence = re.sub("[^a-zA-Z]", " ", sentence)
  # str.split() with no argument never yields empty strings, so no extra filter.
  return ' '.join(ps.stem(word) for word in sentence.split())

In [7]:
def preprocess(data):
  """Replace every entry of ``data``'s ``question`` column with its stemmed form.

  The frame is modified in place and nothing is returned.

  Args:
    data: pandas DataFrame with a ``question`` column of strings.
  """
  # Assign the whole column at once rather than `data.question[i] = ...`:
  # that form is chained assignment (SettingWithCopyWarning, and a silent no-op
  # under pandas Copy-on-Write), and its label lookup `[i]` only works when the
  # index happens to be 0..n-1.
  data['question'] = [
      preprocess_qs(question)
      for question in tqdm(data['question'], desc="preprocessing")
  ]

In [8]:
class TfIdfEmbedder:
  """Nearest-neighbour sentence matcher over TF-IDF bag-of-words vectors.

  Usage: ``fit(sentences, answers)``, then ``set_weights()``, then
  ``transform(query)`` to retrieve the closest training sentence and the
  answer stored alongside it.

  Args:
    vocab_size: optional cap on the number of vocabulary terms.
    min_occurences: optional lower bound (inclusive) on a term's corpus count.
    max_occurences: optional upper bound (inclusive) on a term's corpus count.
  """

  def __init__(self, vocab_size=None, min_occurences=None, max_occurences=None):
    self.vocab_size = vocab_size
    self.min_occurences = min_occurences
    self.max_occurences = max_occurences
    # Everything below is populated by fit() / set_weights().
    self.vocab = None            # list of terms; position == matrix column
    self.count = None            # corpus count of each term in `vocab`
    self.tf_matrix = None        # (n_sentences, n_terms) raw term counts
    self.df_matrix = None        # (n_terms,) number of sentences containing the term
    self.tf_idf_matrix = None    # tf_matrix scaled column-wise by _idf()
    self.len_sentences = None    # number of fitted sentences
    self.train_sentences = None
    self.train_answers = None
    self._word_index = None      # term -> column, for O(1) lookups

  def extract_vocabulary(self, sentences):
    """Build ``vocab`` / ``count`` from whitespace-split ``sentences``.

    Terms are ordered by descending corpus count, then filtered by the
    ``min_occurences`` / ``max_occurences`` bounds and finally by ``vocab_size``.
    """
    text = ' '.join(sentences).split()
    vocab, count = np.unique(text, return_counts=True)
    order = np.argsort(count)[::-1]
    vocab, count = vocab[order], count[order]
    if self.min_occurences is not None:
      keep = count >= self.min_occurences
      vocab, count = vocab[keep], count[keep]
    if self.max_occurences is not None:
      keep = count <= self.max_occurences
      vocab, count = vocab[keep], count[keep]
    if self.vocab_size is not None:
      # NOTE(review): the arrays are sorted by descending count, so slicing
      # from the end keeps the *least* frequent surviving terms. Behaviour is
      # preserved as-is; confirm this is intended.
      vocab = vocab[-self.vocab_size:]
      count = count[-self.vocab_size:]
    self.vocab = list(vocab)
    self.count = count
    self._word_index = {word: column for column, word in enumerate(self.vocab)}

  def fit(self, sentences, answers):
    """Store the training pairs and compute term and document frequencies.

    Args:
      sentences: sequence of already-preprocessed, space-separated strings.
      answers: sequence parallel to ``sentences``; ``answers[i]`` is what
        ``transform`` returns when ``sentences[i]`` is the best match.
    """
    self.train_sentences = sentences
    self.train_answers = answers
    self.extract_vocabulary(sentences)
    self.tf_matrix = np.zeros((len(sentences), len(self.vocab)))
    for row_index, sentence in enumerate(tqdm(sentences, desc="Fitting through Sentences...")):
      for word in sentence.split():
        word_index = self._word_index.get(word)
        if word_index is not None:  # out-of-vocabulary words are ignored
          self.tf_matrix[row_index, word_index] += 1
    # Document frequency = number of sentences in which the term occurs.
    self.df_matrix = np.count_nonzero(self.tf_matrix, axis=0).astype(float)
    self.len_sentences = len(sentences)

  def _idf(self):
    """Smoothed inverse document frequency for every vocabulary term.

    Single source of truth so that the corpus matrix and query vectors are
    always weighted identically.
    """
    return 1 + np.log2((1 + self.len_sentences) / (self.df_matrix + 1))

  def set_weights(self):
    """Compute ``tf_idf_matrix`` from the fitted counts; call after ``fit``."""
    self.tf_idf_matrix = self.tf_matrix * self._idf()

  def transform(self, sentence, method='euclidean'):
    """Find the fitted sentence closest to ``sentence``.

    Args:
      sentence: raw query text; it is passed through ``preprocess_qs`` first.
      method: ``'euclidean'`` picks the row with the smallest squared
        Euclidean distance; any other value picks the row with the largest
        (unnormalised) dot product with the query vector.

    Returns:
      dict with the preprocessed query under ``'question'``, the matched
      training sentence under ``'original question'`` and its answer under
      ``'answer'``.
    """
    sentence = preprocess_qs(sentence)
    query = np.zeros(len(self.vocab))
    for word in sentence.split():
      word_index = self._word_index.get(word)
      if word_index is not None:
        query[word_index] += 1
    # Same weighting as set_weights(). The query used to be scaled with the
    # natural log while the corpus used log2, which put the two in different
    # spaces and could change the nearest neighbour.
    query = query * self._idf()
    if method=='euclidean':
      distance = np.sum((self.tf_idf_matrix - query)**2, axis=1)
      index = np.argmin(distance)
    else:
      distance = np.dot(self.tf_idf_matrix, query)
      index = np.argmax(distance)
    return {'question': sentence, 'original question': self.train_sentences[index], 'answer' :self.train_answers[index]}

In [9]:
# Load the first 5000 (topic, question, answer) rows of the dataset, then stem
# the question column in place.
data = load_sentences(N=5000)
preprocess(data)
# Bare last expression so the notebook renders the resulting frame.
data

preprocessing:   0%|          | 0/5000 [00:00<?, ?it/s]

Unnamed: 0,topic,question,answer
0,Beyoncé,when did beyonc start becom popular,in the late 1990s
1,Beyoncé,what area did beyonc compet in when she wa gro...,singing and dancing
2,Beyoncé,when did beyonc leav destini s child and becom...,2003
3,Beyoncé,in what citi and state did beyonc grow up,"Houston, Texas"
4,Beyoncé,in which decad did beyonc becom famou,late 1990s
...,...,...,...
4995,Buddhism,what is the fourth of the four nobl truth explain,identifies a path to this cessation
4996,Buddhism,what is the second truth,dukkha can be known.
4997,Buddhism,how is the mean of dukkha explain,craving
4998,Buddhism,what is a contribut factor to dukkha,ignorance


In [10]:
# Build the matcher on the stemmed questions; terms occurring fewer than 2 or
# more than 500 times in the corpus are left out of the vocabulary.
embedder = TfIdfEmbedder(max_occurences=500, min_occurences=2)
embedder.fit(list(data.question), list(data.answer))
embedder.set_weights()

# Query the same free-form question with both matching strategies.
sentence = "what is my dukkha and second truth?"
print(sentence, "\n", embedder.transform(sentence, method="euclidean"))
print(sentence, "\n", embedder.transform(sentence, method="similarity"))

Fitting through Sentences...:   0%|          | 0/5000 [00:00<?, ?it/s]

what is my dukkha and second truth? 
 {'question': 'what is my dukkha and second truth', 'original question': 'what is the second truth', 'answer': 'dukkha can be known.'}
what is my dukkha and second truth? 
 {'question': 'what is my dukkha and second truth', 'original question': 'what journalist drew comparison between my beauti dark twist fantasi and other kany album', 'answer': 'Simon Vozick-Levinson'}
