This notebook will be used to parse the SQuAD dataset into the format that the semantic search app expects.

The SQuAD dataset consists of 100,000 question and answer pairs based on a set of Wikipedia articles. Half of the questions are unanswerable, which is good for us because we want to prepare for the possibility that there is no good answer to the user's query.

The SQuAD dataset is natively stored as JSON. The goal of this notebook is to convert it into the format our app expects: a set of markdown files containing the Wikipedia excerpts and a JSON file dedicated to the question/answer pairs. Currently the test runner is only configured to handle queries where an answer is available, so we will initially focus on QA pairs where an answer is available. We will handle the cases where no answer is available at a later date.

In [2]:
from pathlib import Path
import json

# Location of the raw SQuAD dump that this notebook converts.
path_to_squad = Path(
    "C:\\Users\\Djhay\\OneDrive\\Desktop\\Projects\\Hackathon\\Hackathon\\RawData\\squad_data.json"
)
# Read explicitly as UTF-8: without an encoding, open() falls back to the
# locale codec (e.g. cp1252 on Windows), which can fail on or corrupt the
# non-ASCII characters in the article text.
with open(path_to_squad, "r", encoding="utf-8") as f:
    raw_squad_data = json.load(f)



In [4]:
import re

# Root of the project checkout; both output locations below live under it.
project_root = Path(
    "C:\\Users\\Djhay\\OneDrive\\Desktop\\Projects\\Hackathon\\Hackathon"
)

# Directory that receives one markdown file per article.
output_dir = project_root / "TestData" / "SQuAD"
output_dir.mkdir(parents=True, exist_ok=True)

# JSON file that receives the question/answer entries for the test runner.
qa_path = project_root / "TestRunner" / "QuestionAnswer" / "SQuAD.json"
# Create the parent folder too, so writing qa_path cannot fail on a fresh
# checkout where only the markdown output directory exists.
qa_path.parent.mkdir(parents=True, exist_ok=True)

# Accumulates one dict per answer: {"query", "answer", "answer_position"}.
final_qas = []

def sanitize_filename(filename):
    """Return *filename* with each of the characters <>:"/\\|?* replaced by an underscore."""
    illegal_chars = r'[<>:"/\\|?*]'
    sanitized = re.sub(illegal_chars, '_', filename)
    return sanitized

# Convert each article into one markdown file and collect a QA entry for every
# listed answer. NOTE: only the first five articles are processed ([0:5]).
for entry in raw_squad_data["data"][0:5]:
    article_title = sanitize_filename(entry["title"])
    md_file_contents = f"# {article_title}\n\n"

    for paragraph in entry["paragraphs"]:
        context = paragraph["context"]
        # Each paragraph is appended as "\n<context>\n\n", so the context text
        # itself starts one character past the current end of the document.
        context_offset = len(md_file_contents) + 1
        md_file_contents += f"\n{context}\n\n"

        for qa in paragraph["qas"]:
            question = qa["question"]
            # We are searching for relevance, not correctness. For now we treat
            # impossible questions the same. .get() keeps this working for
            # records that lack the "is_impossible" key.
            answers_key = "plausible_answers" if qa.get("is_impossible", False) else "answers"
            for answer in qa[answers_key]:
                answer_text = answer["text"]
                # Prefer the dataset's own offset into the paragraph. Searching
                # the whole document would match the first occurrence anywhere
                # (the title or an earlier paragraph), not necessarily the one
                # inside the paragraph this question belongs to.
                local_start = answer.get("answer_start")
                if (
                    local_start is None
                    or context[local_start:local_start + len(answer_text)] != answer_text
                ):
                    # Offset missing or inconsistent with the text: fall back
                    # to the first occurrence within this paragraph only.
                    local_start = context.find(answer_text)
                if local_start < 0:
                    # The answer text does not occur in its paragraph; skip it
                    # rather than record a meaningless position.
                    continue

                # Character (not byte) offsets into the generated markdown text.
                start_char = context_offset + local_start
                end_char = start_char + len(answer_text)
                final_qas.append(
                    {
                        "query": question,
                        "answer": article_title,
                        "answer_position": [start_char, end_char],
                    }
                )

    file_path = output_dir / f"{article_title}.md"
    with open(file_path, "w", encoding="utf-8") as md_file:
        md_file.write(md_file_contents)

# Serialise every collected question/answer entry (4-space indented JSON) and
# write the result to the test runner's input file.
serialized_qas = json.dumps(final_qas, indent=4)
with open(qa_path, "w", encoding="utf-8") as qa_file:
    qa_file.write(serialized_qas)
