In [8]:
import json
import re
import spacy
from typing import List, Tuple

# Load spaCy model
nlp = spacy.load("en_core_web_sm")

def extract_question_word_and_mask(question: str) -> Tuple[str, str]:
    """
    Extract the question word/phrase and create a masked question by replacing it with <qw>.
    
    Args:
        question (str): The original question from SQuAD.
        
    Returns:
        Tuple[str, str]: (masked question with <qw>, extracted question word/phrase)
    """
    #doc = nlp(question)
    #tokens = [token.text for token in doc]
    #tokens_lower = [token.text.lower() for token in doc]
    
    # Common question words/phrases
    question_words = {
        "what", "who", "when", "where", "why", "how", "which",
        "how many", "how much", "in which", "to whom"
    }
    question_lower = question.lower()
    # Check for single-word question words
    for qw in question_words:
        if qw in question_lower:
            pattern = r'\b' + re.escape(qw) + r'\b'
            masked_question = re.sub(pattern, "<qw>", question_lower)
        return masked_question, qw
    
    # Fallback: if no question word is identified, return original question and "Unknown"
    return question, "Unknown"

def process_squad(file_path: str) -> List[Tuple[str, str, str]]:
    """
    Process SQuAD dataset to extract (question_with_masking, answer, question_word) tuples.
    
    Args:
        file_path (str): Path to the SQuAD JSON file (e.g., train-v1.1.json).
        
    Returns:
        List[Tuple[str, str, str]]: List of tuples containing (masked question, answer, question word).
    """
    # Load SQuAD dataset
    with open(file_path, "r", encoding="utf-8") as f:
        squad_data = json.load(f)
    
    qa_pairs = []
    
    # Iterate through SQuAD data
    for article in squad_data["data"]:
        for paragraph in article["paragraphs"]:
            for qa in paragraph["qas"]:
                question = qa["question"].strip()
                # Use the first answer (SQuAD may have multiple answers)
                answer = qa["answers"][0]["text"].strip() if qa["answers"] else ""
                
                # Extract question word and mask the question
                masked_question, question_word = extract_question_word_and_mask(question)
                
                qa_pairs.append((masked_question, answer, question_word))
    
    return qa_pairs

# Example usage
if __name__ == "__main__":
    # Path to SQuAD dataset (update with actual path)
    squad_file = "data/dev-v1.1.json"
    
    # Process the dataset
    qa_pairs = process_squad(squad_file)
    
    # Print first 5 tuples for demonstration
    for i, (masked_question, answer, question_word) in enumerate(qa_pairs[:5]):
        print(f"Tuple {i+1}:")
        print(f"Masked Question: {masked_question}")
        print(f"Answer: {answer}")
        print(f"Question Word: {question_word}\n")

UnboundLocalError: cannot access local variable 'masked_question' where it is not associated with a value

In [6]:
a = [x[0] for x in qa_pairs if "<qw>" not in x[0]]
set(a)

{'Approximately how books did Alexander Dyce bequeathed to the museum?',
 'In how many places is oxygen stored in its cycle?',
 'Colonies were a sign of what amongst European countries?',
 "Imperialism extends a country's power and what?",
 'If the apparant force of two fermions is repulsive, what is the spin function?',
 'Decision problems capable of being solved by a deterministic Turing machine while maintaining adherence to polynomial time belong to what class?',
 'An algorithm for X which reduces to C would us to do what?',
 'In the layered model of the Earth, the outermost layer is what?',
 'The Rhine Gorge is between Koblenz and what other city?',
 'By what name is that first Huguenot church known today?',
 '_____ Helps the biospher from UV.',
 "The population of Newcastle was 189,863 according to what year's census?",
 'About many students attend Kunskapsskolan schools?',
 'In 2005, what did Doctor Who think the condition of his home planet was?',
 "At the time, countries such 