**Ahmet Zafer SAGLIK - Example QS Model**

** Creating and training a ML Question-Answer(QA) Model which given a Query-question predicts the best answer from the context given. The dataset given consists of N source texts and M questions per source. For each question there is an answer given from the text, as well as a list of alternatives answers within the text too.**

**Steps:**


1.   INSTALL THE NECESSARY LIBRARIES
2.   IMPORT THE NECESSARY LIBRARIES
3.   CREATE DICTIONARY FOR NER DETECTION
4.   TRAIN MODELS(BERT,NERD)
5.   EVALUATION AND TESTING
6.   RESULTS

Just give the Test Data to Read Function with Sample Number

Careful: Bert is slow so Dont give big numbers

# **INSTALL THE NECESSARY LIBRARIES**

In [1]:
!pip install ner-d



In [2]:
!pip install sentence-transformers



# **IMPORT THE NECESSARY LIBRARIES**

In [3]:
import nltk
nltk.download('punkt')
nltk.download('wordnet')



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [4]:
import numpy as np, pandas as pd
import json
import ast 
from textblob import TextBlob
import spacy
from nerd import ner
from scipy.spatial.distance import cosine
from nerd import ner



In [5]:
from sentence_transformers import SentenceTransformer
sbert_model = SentenceTransformer('bert-base-nli-mean-tokens')

# **MAKE DICT FOR NER DETECTION**

In [6]:
questionsArray=["when","where","who","how much","how many","organization","event","language","art","law","what year","place"]
quest_dict = {}
quest_dict["when"] = ["DATE","TIME"]
quest_dict["where"] = ["FAC","GPE","LOC"]
quest_dict["who"]= ["PERSON","NORP","FAC"] 
quest_dict["how much"] = ["MONEY","QUANTITY", "ORDINAL","CARDINAL","PERCENT"]
quest_dict["how many"] = ["QUANTITY", "ORDINAL","CARDINAL","PERCENT"]
quest_dict["organization"]=["ORG"]
quest_dict["event"] = ["EVENT"]
quest_dict["language"]=["LANGUAGE"]
quest_dict["art"]=["WORK_OF_ART"]
quest_dict["law"]=["LAW"]
quest_dict["what year"]=["DATE"]
quest_dict["place"]=["FAC","GPE","LOC"]




# **READ FILE**

In [7]:
def read_json(file_name,sample_num):
  
  with open(f"{file_name}") as data_file:
      data = json.load(data_file)

  column_names=["answer_start","context","question","text","id"]
  df=pd.DataFrame(columns=column_names)

  count=0
  for key, value in  data.items():
    for x in range(len(value["qas"])):
      df=df.append({
                  'answer_start':value["qas"][x]["answers"][0]["answer_start"],
                  'question': value["qas"][x]["question"],
                  'context' : value["context"],
                  'text':value["qas"][x]["answers"][0]["text"],
                  'id':  value["qas"][x]["id"],},
                  ignore_index=True)
      count+=1
    if count==sample_num:
      break
     
  return df

# **DATA PROCESS**

In [8]:
def find_target(x):
    index = -1
    for i in range(len(x["sentences"])):
        if x["text"] in x["sentences"][i]: index = i
    return index

In [9]:
def data_process(df):
  df['sentences'] = df['context'].apply(lambda x: [item.raw for item in TextBlob(x).sentences])
  df["target"] = df.apply(find_target, axis = 1)

  return df

# **MODEL TRAINING**

In [10]:
def bert_sentence_sim(df):
  #tknzr = TweetTokenizer()
  #lemmatizer = WordNetLemmatizer()
  count=0
  sim_list=[None]*len(df["sentences"])
  for i in (df["sentences"]):

    #We dont need to Lemmatize or tokenize the data since Bert dont need it. When we do accuracy is decreasing.
    #i_str=lemmatizer.lemmatize(''.join(i))
    #question_lemma=lemmatizer.lemmatize(df['question'][count])
    

    sentence_embeddings = sbert_model.encode(i)
    query_vec = sbert_model.encode(df['question'][count])

    sent_cont=0
    d = {}
    for sent in i:
      sim=0
      sim = cosine(query_vec, sbert_model.encode([sent]))
      d[sent_cont] = sim
      sent_cont+=1
    sent_index=(min(d, key=d.get))
    sim_list[count]=sent_index
    
    count=count+1
     
  df['sim_sent'] = sim_list
  return df


In [11]:
def ner_detect(df):
  arr_answer_tuple=[None]*len(df["sentences"])
  arr_single_answer=[]
  
  for index in range(len(df["sentences"])):
    flag=False

    answer_sentence=(df['sentences'][index][int(df['sim_sent'][index])])
   
    question=df['question'][index]


    doc = ner.name(answer_sentence)
    answer_label = [(X.text, X.label_) for X in doc] 
  
    for quest_index in range(len(questionsArray)):
      if questionsArray[quest_index] in df['question'].values[index].lower():
        question_equal_tags=quest_dict[questionsArray[quest_index]]
        
        for simple_tag in (question_equal_tags):
          for answer_tuple in (answer_label):  
            if answer_tuple[1]==simple_tag:
              flag=True
              if (arr_answer_tuple[index]==None) :
                arr_answer_tuple[index]=answer_tuple[0]
              else:
                arr_answer_tuple[index]=arr_answer_tuple[index] +" "+answer_tuple[0]
    if flag==False:
       arr_answer_tuple[index]=answer_sentence
    else:
      continue
  df['last_ans'] = arr_answer_tuple    
  
  return df


## **EVALUATION AND TESTING**

In [12]:
def true_in_sentence(df):
  true=0
  limit=0
  for i,a in zip(df['target'],df['sim_sent']):
    if i==a:
      true+=1
    limit+=1
    if limit==len(df["sentences"]):
      break
  return true/len(df["sentences"])

In [13]:
def exactly_true(df):
  true_ner=0
  limit=0
  for i,a in zip(df['text'],df['last_ans']):
    if i in a:
      true_ner+=1
    limit+=1
    if limit==len(df["sentences"]):
      break
  return true_ner/len(df["sentences"])

**RESULTS**

In [16]:
data = read_json("qa_dataset.json",sample_num=200)

data_ready=data_process(data)

data_include_sim=bert_sentence_sim(data_ready)

data_include_ner=ner_detect(data_include_sim)

result_true=true_in_sentence(data_include_ner)
print('Finds the sentence index of answer with accuracy',result_true)

exactly_true_var=exactly_true(data_include_ner)
print('Finds exact result or nearly exact result accuracy',exactly_true_var)

(data_include_ner.head(20))


Finds the sentence index of answer with accuracy 0.535
Finds exact result or nearly exact result accuracy 0.52


Unnamed: 0,answer_start,context,question,text,id,sentences,target,sim_sent,last_ans
0,515,"Architecturally, the school has a Catholic cha...",To whom did the Virgin Mary allegedly appear i...,Saint Bernadette Soubirous,5733be284776f41900661182,"[Architecturally, the school has a Catholic ch...",5,5,Virgin Mary Saint Bernadette Soubirous
1,188,"Architecturally, the school has a Catholic cha...",What is in front of the Notre Dame Main Building?,a copper statue of Christ,5733be284776f4190066117f,"[Architecturally, the school has a Catholic ch...",2,3,Next to the Main Building is the Basilica of t...
2,279,"Architecturally, the school has a Catholic cha...",The Basilica of the Sacred heart at Notre Dame...,the Main Building,5733be284776f41900661180,"[Architecturally, the school has a Catholic ch...",3,3,Next to the Main Building is the Basilica of t...
3,381,"Architecturally, the school has a Catholic cha...",What is the Grotto at Notre Dame?,a Marian place of prayer and reflection,5733be284776f41900661181,"[Architecturally, the school has a Catholic ch...",4,5,"It is a replica of the grotto at Lourdes, Fran..."
4,92,"Architecturally, the school has a Catholic cha...",What sits on top of the Main Building at Notre...,a golden statue of the Virgin Mary,5733be284776f4190066117e,"[Architecturally, the school has a Catholic ch...",1,3,Next to the Main Building is the Basilica of t...
5,248,"As at most other universities, Notre Dame's st...",When did the Scholastic Magazine of Notre dame...,September 1876,5733bf84d058e614000b61be,"[As at most other universities, Notre Dame's s...",2,10,Spring 2008
6,441,"As at most other universities, Notre Dame's st...",How often is Notre Dame's the Juggler published?,twice,5733bf84d058e614000b61bf,"[As at most other universities, Notre Dame's s...",3,4,The Dome yearbook is published annually.
7,598,"As at most other universities, Notre Dame's st...",What is the daily student paper at Notre Dame ...,The Observer,5733bf84d058e614000b61c0,"[As at most other universities, Notre Dame's s...",9,5,The newspapers have varying publication intere...
8,126,"As at most other universities, Notre Dame's st...",How many student news papers are found at Notr...,three,5733bf84d058e614000b61bd,"[As at most other universities, Notre Dame's s...",9,0,"As at most other universities, Notre Dame's st..."
9,908,"As at most other universities, Notre Dame's st...",In what year did the student paper Common Sens...,1987,5733bf84d058e614000b61c1,"[As at most other universities, Notre Dame's s...",7,10,Spring 2008
