# Read JSON file

In [60]:
import json

# Load the combined NewsQA JSON file into `data`.
# The encoding is given explicitly: JSON files are normally UTF-8, and without
# it open() falls back to the platform's locale encoding, which can fail or
# mis-decode non-ASCII article text.
# Alternative input file previously used here:
#   'data_crawl/combined-newsqa-data-v1-format.json'
with open('data_crawl/combined-newsqa-data-v1.json', 'r', encoding='utf-8') as file:
    data = json.load(file)

# Function to extract text, questions (q), and answers, filtering out {'noAnswer': True} and duplicates
def _collect_answer_texts(text, question):
    """Return the distinct answer strings of one question, in first-seen order.

    Every entry of every ``"sourcerAnswers"`` list under the question's
    ``"answers"`` is inspected. Entries that contain a ``"noAnswer"`` key are
    ignored, as are entries whose ``"s"`` / ``"e"`` offsets are missing or not
    positive enough to form a span.

    NOTE(review): the slice stops one character before ``"e"``
    (``text[s:e - 1]``). Presumably the recorded span carries a trailing
    separator character -- confirm against the dataset.
    """
    # A dict is used as an insertion-ordered set so the resulting list is
    # deterministic (a plain set would yield a run-dependent order).
    seen = {}
    for answer in question.get("answers", []):
        for span in answer.get("sourcerAnswers", []):
            if "noAnswer" in span:
                continue

            start = span.get("s", -1)
            end = span.get("e", -1)
            # Validate the raw offsets *before* adjusting the end bound;
            # otherwise the -1 "missing" sentinel turns into -2 and slips
            # through, producing a bogus text[start:-2] slice.
            if start < 0 or end <= 0:
                continue

            seen[text[start:end - 1]] = None
    return list(seen)


def extract_data(data, limit=None):
    """Extract each story's text with its questions and de-duplicated answers.

    Args:
        data: Mapping with a ``"data"`` list of stories. Each story may have a
            ``"text"`` string and a ``"questions"`` list; each question may
            have a ``"q"`` string and an ``"answers"`` list whose items hold
            ``"sourcerAnswers"`` lists of ``{"s": ..., "e": ...}`` offsets
            into the story text (or ``{"noAnswer": ...}`` markers).
        limit: Maximum number of stories to process, counted from the start
            of the list. ``None`` processes every story.

    Returns:
        A list with one dict per processed story:
        ``{"text": <story text>, "questions": [{"question": <q>,
        "answers": [<answer text>, ...]}, ...]}``. Questions that end up with
        no answers are omitted; stories are always included, even when their
        question list is empty.
    """
    extracted_data = []

    for index, story in enumerate(data.get("data", [])):
        if limit is not None and index >= limit:
            break

        text = story.get("text", "")
        questions_with_answers = []

        for question in story.get("questions", []):
            unique_answers = _collect_answer_texts(text, question)
            if unique_answers:  # Skip questions without any usable answer
                questions_with_answers.append({
                    "question": question.get("q", ""),
                    "answers": unique_answers,
                })

        extracted_data.append({
            "text": text,
            "questions": questions_with_answers,
        })

    return extracted_data

# Build the extracted records. Only the first 100 of the 12400 stories are
# processed, because taking every story makes the output txt files too big.
extracted_data = extract_data(data, limit=100)

# # Print extracted data
# for item in extracted_data:
#     print(f"Text: {item['text'][:50]}")
#     print("----------------")
#     for qa in item['questions']:
#         print(f"Question: {qa['question']}")
#         for answer in qa['answers']:
#             # print(f"Start: {answer['start']}, End: {answer['end']}")
#             # print(f"Answer: '{answer['extracted_text']}'")
#             print(f"Answer: '{answer}'")
#     print("#######################")


In [48]:
# Report how many stories were extracted.
article_count = len(extracted_data)
print(f"Numbers of articles: {article_count}")

Numbers of articles: 100


In [61]:
# Write one record per line; each line is the Python repr of the record dict.
# UTF-8 is set explicitly so non-ASCII article text cannot raise
# UnicodeEncodeError under a non-UTF-8 platform default encoding.
with open("extracted_file.txt", "w", encoding="utf-8") as output:
    output.writelines(f"{item}\n" for item in extracted_data)