In [None]:
#! pip install wikipedia
# In Case installation is needed

Collecting wikipedia
  Downloading https://files.pythonhosted.org/packages/67/35/25e68fbc99e672127cc6fbb14b8ec1ba3dfef035bf1e4c90f78f24a80b7d/wikipedia-1.4.0.tar.gz
Building wheels for collected packages: wikipedia
  Building wheel for wikipedia (setup.py) ... [?25l[?25hdone
  Created wheel for wikipedia: filename=wikipedia-1.4.0-cp36-none-any.whl size=11686 sha256=f88a8ea8cfd563ba5b3a074e1cff84ab23b15a3f73d687b7a6af34d5d0f1de27
  Stored in directory: /root/.cache/pip/wheels/87/2a/18/4e471fd96d12114d16fe4a446d00c3b38fb9efcb744bd31f4a
Successfully built wikipedia
Installing collected packages: wikipedia
Successfully installed wikipedia-1.4.0


In [None]:
import wikipedia
import re
import spacy
import nltk
nltk.download('punkt')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

# Query Reformulation

In [None]:
# Check whether the query given by the user is missing all four desired question forms (Who, What, Where, When); returns True when it is
def invalid_query(query):
  """Return True when `query` contains none of the supported question words.

  The check is a case-sensitive substring test against the capitalised and
  lower-case spellings of Who/What/Where/When, so a word that merely
  contains one of them (e.g. "somewhere") also counts as a match.
  """
  question_words = ("Who", "What", "Where", "When", "where", "what", "when", "who")
  return not any(word in query for word in question_words)

In [None]:
# If the query is not in WH form, collect the entity labels found in the query so that later stages can work from that information
def partial_information(query):
  """Collect entity labels for queries that contain no question word.

  Returns a `(labels, is_partial)` pair:
    * `([], False)` when `invalid_query(query)` is false, i.e. the query
      contains one of the WH words;
    * `(labels, True)` otherwise, where `labels` holds the `label_` of every
      entity the spaCy 'en' pipeline finds in the query, in order.
  """
  if not invalid_query(query):
    return [], False
  # The pipeline is only loaded for queries that actually need it.
  nlp = spacy.load('en')
  return [entity.label_ for entity in nlp(query).ents], True

In [None]:
# The query given by the user is processed and reformulated by changing the positions of the non-stop words present in the query
def QR(query):
  """Rewrite a user query into weighted search strings.

  Steps:
    1. `partial_information(query)` supplies the entity labels (`tags`) and
       the flag telling whether the query lacked a question word (`part`).
    2. Every '?' is dropped, and the question words (plus "was") are removed
       when they appear as whole words, so words that merely contain one
       (e.g. "whole", "somewhere") are left intact.
    3. The first remaining word is treated as the verb and is inserted at
       every possible position among the other words; each such rewrite gets
       weight 5. The remaining words without the verb form one more rewrite
       with weight 2.

  Returns `(new_queries, tags, part)` where `new_queries` is a list of
  `[text, weight]` pairs; each text ends with a trailing space. When nothing
  is left after stripping (e.g. "Who?" or an empty query) `new_queries` is
  empty, and a rewrite that would be an empty string is never emitted.
  """
  tags,part = partial_information(query)

  # '?' is not a word character, so it is removed separately from the
  # whole-word question-word pattern.
  without_marks = query.replace('?', '')
  stripped = re.sub(r'\b(?:Who|What|Where|When|where|was|what|when|who)\b', ' ', without_marks)
  words = stripped.split()
  if not words:
    # Nothing to search for; callers test the list for emptiness.
    return [], tags, part

  verb, rest = words[0], words[1:]
  new_queries = []
  # Slide the verb through every slot among the remaining words.
  for position in range(len(words)):
    rewrite = rest[:position] + [verb] + rest[position:]
    new_queries.append([''.join(word + ' ' for word in rewrite), 5])
  # Verb-less rewrite, skipped when the query was a single word because an
  # empty search string cannot be looked up.
  if rest:
    new_queries.append([''.join(word + ' ' for word in rest), 2])

  return new_queries,tags,part

# Ngram Mining

In [None]:
# This function takes the reformulated queries and gets the Wikipedia summary for each of them
# Each summary is then pre-processed, i.e. punctuation is removed, so that n-grams can be built from it
def information(reformed_queries):
  """Fetch and clean the Wikipedia summary for each rewritten query.

  `reformed_queries` is a list of `[text, weight]` pairs; only the text is
  used. For every rewrite `wikipedia.summary(text)` is called and the
  characters . ? / ( ) : ! , \\ = are stripped from the result. Newlines are
  left in place.

  Rewrites are speculative word permutations, so a rewrite for which
  Wikipedia reports no page or only a disambiguation page is skipped instead
  of aborting the whole lookup. The returned list can therefore be shorter
  than `reformed_queries` (and empty when every rewrite fails).

  Returns the list of cleaned summary strings, in query order.
  """
  # Single-character deletions, spelled out explicitly to avoid the invalid
  # "\ " escape and the whitespace-split list.
  strip_table = str.maketrans('', '', './?():!,\\=')
  refined_summary = []
  for rewrite in reformed_queries:
    try:
      text = wikipedia.summary(rewrite[0])
    except (wikipedia.exceptions.PageError, wikipedia.exceptions.DisambiguationError):
      # No usable page for this particular rewrite; try the next one.
      continue
    refined_summary.append(text.translate(strip_table))
  return refined_summary

In [None]:
# This function takes the pre-processed summaries and tokenizes them, i.e. gives the list of words of each summary
def tokenize(refined_summary):
  """Split every cleaned summary into word tokens.

  Returns one token list per summary, produced by `nltk.word_tokenize`, in
  the same order as the input.
  """
  return [nltk.word_tokenize(text) for text in refined_summary]

# These 3 functions compute the unique unigrams, bigrams and trigrams from the tokens of each summary
def generate_unigram(ngram_token):
  """Build the unique unigrams of every summary with an initial count of 1.

  `ngram_token` is a list of token lists, one per summary. For each summary
  the tokens are kept in first-seen order with repeats removed.

  Returns `(unigram, c_unigram)`: two parallel lists of lists, where
  `c_unigram[i][j]` is the starting weight (1) of `unigram[i][j]`.
  """
  unigram = []
  c_unigram = []
  for tokens in ngram_token:
    # Uniqueness is tracked per summary; testing against the outer list of
    # lists would never match a token and so would never drop a repeat.
    seen = set()
    ug = []
    for token in tokens:
      if token not in seen:
        seen.add(token)
        ug.append(token)
    unigram.append(ug)
    c_unigram.append([1] * len(ug))
  return unigram,c_unigram

def generate_bigram(ngram_token):
  """Build the unique bigrams of every summary with an initial count of 1.

  `ngram_token` is a list of token lists, one per summary. Each bigram is
  two adjacent tokens joined by a single space; within a summary only the
  first occurrence of a bigram is kept, in order of appearance.

  Returns `(bigram, c_bigram)`: two parallel lists of lists, where
  `c_bigram[i][j]` is the starting weight (1) of `bigram[i][j]`.
  """
  bigram = []
  c_bigram = []
  for tokens in ngram_token:
    # Uniqueness is tracked per summary; the outer list holds lists, so a
    # membership test against it would never find a bigram string.
    seen = set()
    bg = []
    for first, second in zip(tokens, tokens[1:]):
      gram = first + ' ' + second
      if gram not in seen:
        seen.add(gram)
        bg.append(gram)
    bigram.append(bg)
    c_bigram.append([1] * len(bg))
  return bigram,c_bigram

def generate_trigram(ngram_token):
  """Build the unique trigrams of every summary with an initial count of 1.

  `ngram_token` is a list of token lists, one per summary. Each trigram is
  three adjacent tokens joined by single spaces; within a summary only the
  first occurrence of a trigram is kept, in order of appearance. A summary
  with fewer than three tokens yields an empty list.

  Returns `(trigram, c_trigram)`: two parallel lists of lists, where
  `c_trigram[i][j]` is the starting weight (1) of `trigram[i][j]`.
  """
  trigram = []
  c_trigram = []
  for tokens in ngram_token:
    # Uniqueness is tracked per summary; the outer list holds lists, so a
    # membership test against it would never find a trigram string.
    seen = set()
    tg = []
    for first, second, third in zip(tokens, tokens[1:], tokens[2:]):
      gram = first + ' ' + second + ' ' + third
      if gram not in seen:
        seen.add(gram)
        tg.append(gram)
    trigram.append(tg)
    c_trigram.append([1] * len(tg))
  return trigram,c_trigram


In [None]:
# These 3 functions assign weights according to the number of summaries in which an n-gram is present
# An n-gram that appears in more summaries gets a higher weight than an n-gram that appears in fewer summaries
def unigram_weights(ug,cug):
  """Raise each unigram's count by the number of other summaries containing it.

  `ug[i]` is the unigram list of summary `i` and `cug[i]` the parallel list
  of counts. `cug` is updated in place and also returned.
  """
  for here, grams in enumerate(ug):
    # Every summary except the one currently being scored.
    others = [other for there, other in enumerate(ug) if there != here]
    for pos, gram in enumerate(grams):
      for other in others:
        if gram in other:
          cug[here][pos] += 1
  return cug

def bigram_weights(bg,cbg):
  """Raise each bigram's count by the number of other summaries containing it.

  `bg[i]` is the bigram list of summary `i` and `cbg[i]` the parallel list
  of counts. `cbg` is updated in place and also returned.
  """
  for here, grams in enumerate(bg):
    # Every summary except the one currently being scored.
    others = [other for there, other in enumerate(bg) if there != here]
    for pos, gram in enumerate(grams):
      for other in others:
        if gram in other:
          cbg[here][pos] += 1
  return cbg

def trigram_weights(tg,ctg):
  """Raise each trigram's count by the number of other summaries containing it.

  `tg[i]` is the trigram list of summary `i` and `ctg[i]` the parallel list
  of counts. `ctg` is updated in place and also returned.
  """
  for here, grams in enumerate(tg):
    # Every summary except the one currently being scored.
    others = [other for there, other in enumerate(tg) if there != here]
    for pos, gram in enumerate(grams):
      for other in others:
        if gram in other:
          ctg[here][pos] += 1
  return ctg

## Ngram Filtering

In [None]:
# This function analyzes the query and assigns a filter type on the basis of the question word it contains
def query_analysis(query):
  """Map the question word found in `query` to a list of entity labels.

  The spellings are matched as case-sensitive substrings and are tried in
  the order Who, When, Where, What; the first hit decides the result.
  Returns None when the query contains none of them.
  """
  # Order matters: an earlier entry wins when several question words occur.
  filters = (
    (("Who", "who"), ["PERSON","NORP"]),
    (("When", "when"), ["DATE"]),
    (("Where", "where"), ["GPE","LOC"]),
    (("What", "what"), ["EVENT","PRODUCT","WORK_OF_ART","CARDINAL","MISC"]),
  )
  for spellings, labels in filters:
    if any(word in query for word in spellings):
      return labels
  return None

# This function adjusts the weights of the n-grams on the basis of the filter type given by the function above
# For example, a Who question is probably asking for a person, so every n-gram containing an entity labelled PERSON gets its weight incremented
def weight_adjustment_filtered(filter_type,ug,cug,bg,cbg,tg,ctg):
  """Boost n-grams that contain an entity of one of the wanted types.

  `filter_type` is a collection of entity labels. `ug`/`bg`/`tg` hold the
  unigrams/bigrams/trigrams of each summary and `cug`/`cbg`/`ctg` the
  parallel weight lists. Every n-gram is run through the spaCy 'en'
  pipeline on its own text, and its weight is raised by 1 for each entity
  found whose label is in `filter_type`.

  The weight lists are updated in place and returned as `(cug, cbg, ctg)`.
  """
  # Loaded once and shared by all three passes below.
  nlp = spacy.load('en')

  def boost(grams, counts):
    # Add 1 to counts[i][j] for every matching entity found in grams[i][j].
    for i, summary_grams in enumerate(grams):
      for j, gram in enumerate(summary_grams):
        for entity in nlp(gram).ents:
          if entity.label_ in filter_type:
            counts[i][j] += 1

  boost(ug, cug)
  boost(bg, cbg)
  # Trigram weights must be driven by the trigram text itself, not by the
  # bigram that happens to sit at the same index.
  boost(tg, ctg)

  return cug,cbg,ctg

## Ngram Tiling

In [None]:
# This function performs the merging (tiling) of n-grams when they overlap
# First the top-scoring (most highly weighted) n-grams are merged, and then only the second-best ones (at the moment)
# So if the maximum weight is 5, only trigrams with weight 5 and 4 are checked and merged where possible
def ngram_tiling(filter,ug,cug,bg,cbg,tg,ctg):
  """Tile overlapping trigrams of each summary into one candidate answer.

  Only `tg` (trigrams per summary) and `ctg` (their weights) are used; the
  other parameters are accepted for interface compatibility.

  For every summary that has at least one trigram:
    * the first trigram with the highest weight seeds the answer;
    * trigrams carrying the top weight, then those carrying top weight - 1,
      are scanned in order; whenever the last word of the answer so far
      occurs in such a trigram, that trigram's last word is appended.

  Returns `(candidate, weight)`: the tiled answers and their weights, one
  entry per summary that had trigrams (summaries without any are skipped).
  """
  candidate = []
  weight = []
  for grams, scores in zip(tg, ctg):
    if not scores:
      # A summary shorter than three tokens has nothing to seed from.
      continue
    top = max(scores)
    # Fixed once, so the seed is the only trigram excluded in both passes and
    # a missing second-tier weight cannot raise from list.index().
    seed = scores.index(top)
    answer = grams[seed]
    w = top
    for tier in (top, top - 1):
      # NOTE(review): the last trigram of a summary is never considered here;
      # this keeps the existing scan range - confirm whether it is intended.
      for j in range(len(scores) - 1):
        if j != seed and scores[j] == tier:
          tail = nltk.word_tokenize(answer)[-1]
          if tail in grams[j]:
            answer += ' ' + nltk.word_tokenize(grams[j])[-1]
            w = max(scores[j], w)
    candidate.append(answer)
    weight.append(w)

  return candidate,weight

## Main

In [None]:
# Main Function
# Interactive loop: read a query, run the rewrite -> mine -> filter -> tile
# pipeline and print the best candidate, until the user types "exit".
print("This is a QA system by YourName. It will try to answer questions that start with Who, What, When or Where. Enter *exit* to leave the program.")
query = input("Type query: ")
while (query!="exit"):
  try:
    reformed_queries,ft,partial = QR(query)
    if (reformed_queries):
      refined_summary = information(reformed_queries)
      tokens = tokenize(refined_summary)
      unigram,count_uni = generate_unigram(tokens)
      bigram,count_bi = generate_bigram(tokens)
      trigram,count_tri = generate_trigram(tokens)
      count_uni = unigram_weights(unigram,count_uni)
      count_bi = bigram_weights(bigram,count_bi)
      count_tri = trigram_weights(trigram,count_tri)
      # For WH questions the filter comes from the question word; otherwise
      # the entity labels returned by QR are used as the filter.
      if(partial==False):
        ft = query_analysis(query)
      count_uni,count_bi,count_tri = weight_adjustment_filtered(ft,unigram,count_uni,bigram,count_bi,trigram,count_tri)
      cand,w = ngram_tiling(ft,unigram,count_uni,bigram,count_bi,trigram,count_tri)
      if (cand):
        print(cand[w.index(max(w))])
      else:
        # max() on an empty weight list would raise; report it instead.
        print("Sorry, I could not find an answer to that query.")
  except wikipedia.exceptions.WikipediaException as err:
    # A failed lookup (missing page, disambiguation, timeout) should not end
    # the interactive session.
    print("Sorry, the Wikipedia lookup failed:", err)
  query = input("Type query: ")
print("Thankyou for using my services.")

This is a QA system by YourName. It will try to answer questions that start with Who, What, When or Where. Enter *exit* to leave the program.
Type query: Tom Holland
Thomas Stanley Holland born June 1996 is an English actor A graduate of the BRIT School in London he began his acting career on stage in the title role of Billy Elliot
Type query: exit
Thankyou for using my services.
