# Q&A pipeline

In [1]:
! pip install -U datasets pandas sentence-transformers



### Load the dataset 

In [2]:
from datasets import load_dataset

paragraphs_dataset = load_dataset("GroNLP/ik-nlp-22_slp", 'paragraphs')
questions_dataset = load_dataset('GroNLP/ik-nlp-22_slp', 'questions')

paragraphs_dataset

  from .autonotebook import tqdm as notebook_tqdm
Found cached dataset ik-nlp-22_slp (/Users/andreeaioanatudor/.cache/huggingface/datasets/GroNLP___ik-nlp-22_slp/paragraphs/1.0.0/6c89281b2028a8a126102dda2c3fb94b1a5ccea59943d26857ae138c7aa782f8)
100%|██████████████████████████████████████████████| 1/1 [00:00<00:00, 136.07it/s]
Found cached dataset ik-nlp-22_slp (/Users/andreeaioanatudor/.cache/huggingface/datasets/GroNLP___ik-nlp-22_slp/questions/1.0.0/6c89281b2028a8a126102dda2c3fb94b1a5ccea59943d26857ae138c7aa782f8)
100%|██████████████████████████████████████████████| 1/1 [00:00<00:00, 889.57it/s]


DatasetDict({
    train: Dataset({
        features: ['n_chapter', 'chapter', 'n_section', 'section', 'n_subsection', 'subsection', 'text'],
        num_rows: 1697
    })
})

In [3]:
import pandas as pd

paragraphs = paragraphs_dataset['train'].to_pandas()
questions = questions_dataset['test'].to_pandas()

# only paragraphs text and questions 
paragraphs_list = list(paragraphs['text'])
questions_list = list(questions['question'])

### Set the parameters for Semantic Similarity
`model_name` - pre-trained model used for emebdding

`k` - the top k paragraphs retrieved for each question (the top k highest scores)

`unique_chapter` - if set to `True`, only the paragraphs coming from the same chapter will be used in the QA system

In [4]:
# model_name = "sentence-transformers/all-mpnet-base-v2"
# model_name = "sentence-transformers/masmarco-distilbert-base-tas-b"
model_name = "sentence-transformers/all-MiniLM-L6-v2"

k = 10
# k = 20

unique_chapter = True
# unique_chapter = False

### Information retrieval using Semantic Similarity
Embed the paragraphs and questions. Then retrieve the top 10 paragraphs with the highest scores for each question (semantic similarity using cosine similarity) and store their indexes.

In [5]:
from sentence_transformers import SentenceTransformer, util
import time

# start the timer
start_time = time.time()
model = SentenceTransformer(model_name)

# paragraphs embeddings
embeddings = model.encode(paragraphs_list, convert_to_tensor=True)

# questions embeddings
q_embeddings = model.encode(questions_list, convert_to_tensor=True)

# top k semantic similarity score paragraphs for each question (using cosine similarity)
sem_search_scores = util.semantic_search(query_embeddings=q_embeddings, corpus_embeddings=embeddings, top_k=k)

# stop the timer for information retrieval
retrieval_end_time = time.time()

# elapsed time for embedding + semantic search
semantic_search_elapsed_time = retrieval_end_time - start_time

contexts_idx = []
contexts_pos = []

# store the indexes of the retrieved paragraphs
# and the start pos of each paragraph from a context 
for scores in sem_search_scores:
    answers_idx = []
    answers_pos = []
    start_pos = 0
    for answer in scores:
        idx = answer['corpus_id']
        answers_idx.append(idx)
        answers_pos.append(start_pos)
        start_pos += len(paragraphs_list[idx])
    contexts_idx.append(answers_idx)
    contexts_pos.append(answers_pos)

### Information Retrieval using an Instructable Model
For each question, ask an instructable model which paragraphs are relevant to answer the question. Then, store all the relevant paragraphs (their indexes).

Tried to use multithreading, but it still takes too long - can't run it for the entire dataset.

In [6]:
# import threading
# from tqdm import tqdm

# import torch
# from transformers import T5Tokenizer, T5ForConditionalGeneration

# # Define the function that each thread will execute 
# def get_relevant_context(question):
#     answers_idx = []
#     # Load the Flan-T5-Base model and tokenizer
#     model = T5ForConditionalGeneration.from_pretrained("google/flan-t5-base")
#     tokenizer = T5Tokenizer.from_pretrained("google/flan-t5-base")
    
#     # find the relevant paragraphs for the question
# #     for idx, context in enumerate(paragraphs_list):
#     for idx, context in enumerate(paragraphs_list[0:10]):
#         # Encode the input as a T5 sequence
#         input_str = "question: '{}' context: '{}' Is this context relevant for answering the question? Answer yes or no.".format(question, context)
#         input_ids = tokenizer.encode(input_str, return_tensors="pt")

#         # Generate an answer using the model
#         outputs = model.generate(input_ids=input_ids)
#         answer = tokenizer.decode(outputs[0], skip_special_tokens=True)

#         # if a paragraph is relevant for the question, store its index
#         if 'yes' in answer.lower():
#             answers_idx.append(idx)
            
#     return answers_idx

# # Define the function that each thread will execute
# def worker(questions, contexts_idx, start, end):
#     for i in range(start, end):
#         contexts_idx[i] = get_relevant_context(questions[i])

# # Create a list to store the results (the indexes of the relevant paragraphs)
# contexts_idx1 = [None] * len(questions_list)

# num_threads = 15
# chunk_size = len(questions_list) // num_threads

# threads = []

# # Create and start each thread
# for i in range(num_threads):
#     start = i * chunk_size
#     end = start + chunk_size
#     if i == num_threads - 1:
#         end = len(questions_list)
#     t = threading.Thread(target=worker, args=(questions_list, contexts_idx1, start, end))
#     t.start()
#     threads.append(t)

# # Create a tqdm progress bar
# with tqdm(total=len(questions_list)) as pbar:
#     # Wait for all threads to finish
#     for t in threads:
#         t.join()
#         # Update the progress bar
#         pbar.update(chunk_size)

# # The results list now contains the incremented numbers
# print(contexts_idx1)

In [7]:
# import torch
# from transformers import T5Tokenizer, T5ForConditionalGeneration

# # Load the Flan-T5-Base model and tokenizer
# model = T5ForConditionalGeneration.from_pretrained("google/flan-t5-base")
# tokenizer = T5Tokenizer.from_pretrained("google/flan-t5-base")

# contexts_idx2 = []

# for question in questions_list:
#     answers_idx = []
#     for idx, context in enumerate(paragraphs_list):
#         # Encode the input as a T5 sequence
#         input_str = "question: '{}' context: '{}' Is this context relevant for answering the question? Answer yes or no.".format(question, context)
#         input_ids = tokenizer.encode(input_str, return_tensors="pt")

#         # Generate an answer using the model
#         outputs = model.generate(input_ids=input_ids, max_length=32, num_beams=4)
#         answer = tokenizer.decode(outputs[0], skip_special_tokens=True)
        
#         # if a paragraph is relevant for the question, store its index
#         if 'yes' in answer.lower():
#             answers_idx.append(idx)
        
#     contexts_idx2.append(answers_idx)

### Define the context for each question
From the retrieved paragraphs of each question, keep the ones coming from the same, most common chapter, and create the context to use in the QA system. If all the top 10 (most relevant) paragraphs come from different chapters, only the first one will be used.

For many sections/subsections, the value is 'nan'. So I only used chapter.

`unique_chapter` must be set to `True`, otherwise all retrieved k paragraphs will be used.

In [8]:
if unique_chapter:
    book_part = 'chapter'
    # book_part = 'n_section'
    # book_part = 'n_subsection'
    contexts = []
    contexts_chapters = []

    for answers_idx in contexts_idx:
        # find the most common chapter of the k retrieved paragraphs
        chapters = []
        for idx in answers_idx:
            chapters.append(paragraphs[book_part][idx])

        most_common_chapter = max(chapters, key=chapters.count)
        contexts_chapters.append(most_common_chapter)

        # store the retrieved paragraphs coming from the most common chapter (and their indexes)
        context = []
        context_chap = []
        for idx in answers_idx:
            if paragraphs[book_part][idx] == most_common_chapter:
                context.append(paragraphs['text'][idx])

        contexts.append('. '.join(context))

In [9]:
if not unique_chapter:
    contexts = []
    all_contexts_chapters = []
    
    for answers_idx in contexts_idx:
        # append the chapter of each paragraph and
        # concatenate all paragraphs to create the context
        chapters = []
        context = []
        
        for idx in answers_idx:
            chapters.append(paragraphs[book_part][idx])
            context.append(paragraphs['text'][idx])
            
        all_contexts_chapters.append(chapters)
        contexts.append('. '.join(context))

### QA system
Use the Inference API to access the roberta-base-squad2 model.

In [10]:
import json
import requests

def query(payload):
    data = json.dumps(payload)
    response = requests.request("POST", API_URL, headers=headers, data=data)
    return json.loads(response.content.decode("utf-8"))

In [11]:
model = 'deepset/roberta-base-squad2'
API_TOKEN = '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'
API_URL = "https://api-inference.huggingface.co/models/" + model
headers = {"Authorization": f"Bearer {API_TOKEN}"}

answers = []

for idx, question in enumerate(questions_list):
    answers.append(query(
        {
            "inputs": {
                "question": question,
                "context": contexts[idx]
            }
        }
    ))

# stop the timer
end_time = time.time()

# total elapsed time (embedding + semantic search + defining context + question answering)
total_elapsed_time = end_time - start_time

### Evaluation
Print the answers of the system and the actual answers.
Then, we compute the cosine similarity for each pair.

In [12]:
def get_predicted_answer_location(answer, q_index):
    start_pos = answer['start']
    end_pos = answer['end']
    idx = None
    
    for i in range(len(context)-1):
        if start_pos > contexts_pos[q_index][i] and end_pos < contexts_pos[q_index][i+1]:
            idx = contexts_idx[q_index][i]
            
    if idx is None:
        idx = contexts_idx[q_index][-1]
    
    return {
        'chapter': paragraphs['chapter'][idx],
        'section': paragraphs['section'][idx],
        'subsection': paragraphs['subsection'][idx]
    }


def get_location_str(location):
    loc_str = ''
    loc_keys = ['chapter', 'section', 'subsection']
    
    for key in loc_keys:
        loc_str += f"{key} {location[key]}, "
        
    return loc_str[:-2]


def compare_book_location(ans_loc, q_index):
    for key in ans_loc.keys():
        if questions[key][q_index] != ans_loc[key]:
            return f"different {key}"
    return "same location"

In [13]:
print(get_location_str(questions.iloc[0]))

chapter Regular Expressions, section Regular Expressions, subsection Basic Regular Expressions


In [14]:
dif_location = 0
predicted_answers_list = []
test_answers_list = []
ans_str = 'answer'

for idx, question in enumerate(questions_list):
    answer_location = get_predicted_answer_location(answers[idx], idx)
    pred_loc_str = get_location_str(answer_location)
    real_loc_str = get_location_str(questions.iloc[idx])
    location_comp = compare_book_location(answer_location, idx)
    
    print(question)
    print(f"system answer:\t {answers[idx][ans_str]}")
    print(f"real answer:\t {questions[ans_str][idx]}")
    
    predicted_answers_list.append(answers[idx]['answer'])
    test_answers_list.append(questions['answer'][idx])
    
    if location_comp != "same location":
        dif_location += 1
        print("predicted answer: from ", pred_loc_str)
        print("actual answer:    from ", real_loc_str)
        
    print()


What is the meaning of the Kleene star in Regex?
system answer:	 zero or more occurrences of the immediately previous character or regular expression".
real answer:	 The Kleene star means "zero or more occurrences of the immediately previous character or regular expression"

What is the usage of the Regex lookahead operator "?="?
system answer:	 disjunction operator
real answer:	 The operator (?= pattern) is true if pattern occurs, but is zero-width, i.e. the match pointer doesn’t advance.
predicted answer: from  chapter Regular Expressions, section Regular Expressions, subsection Disjunction, Grouping and Precendence
actual answer:    from  chapter Regular Expressions, section Regular Expressions, subsection Lookahead Assertions

What are the most common steps in a text normalization process?
system answer:	 frequency computation
real answer:	 1. Tokenizing (segmenting) words 2. Normalizing word formats 3. Segmenting sentences
predicted answer: from  chapter Machine Translation and En

In [15]:
from sentence_transformers import SentenceTransformer, util

# Load the pre-trained model to embed the answers
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

# Get sentence embeddings for each answer
embeddings1 = model.encode(predicted_answers_list, convert_to_tensor=True)
embeddings2 = model.encode(test_answers_list, convert_to_tensor=True)

# Calculate the cosine distance between the embeddings
similarity_score = util.cos_sim(embeddings1, embeddings2)

results = []

# compute and print the scores
for i in range(len(predicted_answers_list)):
    result = similarity_score[i][i].item()
    results.append(result)
#     print("sent_1: ", predicted_answers_list[i])
#     print("sent_2: ", test_answers_list[i])
#     print("result: ", result)
#     print()

# average similarity score
avg = sum(results) / len(results)

In [16]:
print(f"Model used for embeddings: {model_name} \ntop k: {k} \nusing retrieved paragraphs from the same chapter: {unique_chapter}")
print("---------------------------------------------------------------------")
print()
print(f"predicted answers from different chapters/sections/subsections than the real ones: {dif_location}")
print("Average similarity score between predicted and actual answers: ", avg)
print()
print(f"information retrieval elapsed time: {round (semantic_search_elapsed_time, 2)} seconds")
print(f"total elapsed time: {round (total_elapsed_time, 2)} seconds")

Model used for embeddings: sentence-transformers/all-MiniLM-L6-v2 
top k: 10 
using retrieved paragraphs from the same chapter: True
---------------------------------------------------------------------

predicted answers from different chapters/sections/subsections than the real ones: 23
Average similarity score between predicted and actual answers:  0.5643643845125275

information retrieval elapsed time: 20.02 seconds
total elapsed time: 53.61 seconds
