In [1]:
import json

In [2]:
# Load the SQuAD v2.0 training file. The encoding is stated explicitly so that
# decoding does not depend on the platform's default locale (JSON text is UTF-8),
# matching the explicit UTF-8 used when the cleaned data is written back out.
with open("train-v2.0.json", "r", encoding="utf-8") as file:
    squad_data = json.load(file)

# Inspect the structure: pretty-print the parsed JSON and show only the first
# 1000 characters, which is enough to see the nesting of the records.
print(json.dumps(squad_data, indent=2)[:1000])

{
  "version": "v2.0",
  "data": [
    {
      "title": "Beyonc\u00e9",
      "paragraphs": [
        {
          "qas": [
            {
              "question": "When did Beyonce start becoming popular?",
              "id": "56be85543aeaaa14008c9063",
              "answers": [
                {
                  "text": "in the late 1990s",
                  "answer_start": 269
                }
              ],
              "is_impossible": false
            },
            {
              "question": "What areas did Beyonce compete in when she was growing up?",
              "id": "56be85543aeaaa14008c9065",
              "answers": [
                {
                  "text": "singing and dancing",
                  "answer_start": 207
                }
              ],
              "is_impossible": false
            },
            {
              "question": "When did Beyonce leave Destiny's Child and become a solo singer?",
              "id": "56be85543aeaaa14008c9066",
   

In [3]:
def extract_data(squad_data):
    """Flatten a SQuAD-style nested dict into a list of flat records.

    Walks ``squad_data["data"]`` -> ``"paragraphs"`` -> ``"qas"`` and emits one
    dict per entry in a question's ``"answers"`` list, with the keys
    ``"context"``, ``"question"`` and ``"answer"`` (the answer's ``"text"``).

    Questions whose ``"is_impossible"`` value is present and truthy are
    skipped entirely, even if they carry answer entries.

    Args:
        squad_data: Parsed JSON dict with a top-level ``"data"`` list.

    Returns:
        A list of ``{"context", "question", "answer"}`` dicts, in document order.
    """
    records = []
    for entry in squad_data["data"]:
        for para in entry["paragraphs"]:
            passage = para["context"]
            for qa_item in para["qas"]:
                prompt = qa_item["question"]
                # A missing flag is treated the same as an answerable question.
                if not qa_item.get("is_impossible", False):
                    records.extend(
                        {
                            "context": passage,
                            "question": prompt,
                            "answer": ans["text"],
                        }
                        for ans in qa_item["answers"]
                    )
    return records

# Flatten the loaded dataset into one record per answer.
train_data = extract_data(squad_data)

# Preview the first three records to confirm the shape of each entry.
print(train_data[0:3])

[{'context': 'Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress. Born and raised in Houston, Texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 1990s as lead singer of R&B girl-group Destiny\'s Child. Managed by her father, Mathew Knowles, the group became one of the world\'s best-selling girl groups of all time. Their hiatus saw the release of Beyoncé\'s debut album, Dangerously in Love (2003), which established her as a solo artist worldwide, earned five Grammy Awards and featured the Billboard Hot 100 number-one singles "Crazy in Love" and "Baby Boy".', 'question': 'When did Beyonce start becoming popular?', 'answer': 'in the late 1990s'}, {'context': 'Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress. Born and raised in Houston, Texas, she

In [4]:
import re

def clean_text(text):
    """Normalise a string for downstream use.

    Lowercases the text, trims leading and trailing whitespace, and collapses
    every run of whitespace characters into a single space.

    Args:
        text: The string to normalise.

    Returns:
        The normalised string.
    """
    lowered_and_trimmed = text.lower().strip()
    # Collapse each whitespace run (spaces, tabs, newlines) to one space.
    return re.sub(r'\s+', ' ', lowered_and_trimmed)

# Normalise every text field of every record in place, in the same field
# order as the record layout (context, question, answer).
for record in train_data:
    for field in ("context", "question", "answer"):
        record[field] = clean_text(record[field])

# Preview the first three cleaned records.
print(train_data[0:3])

[{'context': 'beyoncé giselle knowles-carter (/biːˈjɒnseɪ/ bee-yon-say) (born september 4, 1981) is an american singer, songwriter, record producer and actress. born and raised in houston, texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 1990s as lead singer of r&b girl-group destiny\'s child. managed by her father, mathew knowles, the group became one of the world\'s best-selling girl groups of all time. their hiatus saw the release of beyoncé\'s debut album, dangerously in love (2003), which established her as a solo artist worldwide, earned five grammy awards and featured the billboard hot 100 number-one singles "crazy in love" and "baby boy".', 'question': 'when did beyonce start becoming popular?', 'answer': 'in the late 1990s'}, {'context': 'beyoncé giselle knowles-carter (/biːˈjɒnseɪ/ bee-yon-say) (born september 4, 1981) is an american singer, songwriter, record producer and actress. born and raised in houston, texas, she

In [5]:
import csv

# Persist the cleaned records as pretty-printed JSON. ensure_ascii=False keeps
# non-ASCII characters readable in the file instead of \uXXXX escapes.
with open("cleaned_squad_data.json", "w", encoding='utf-8') as json_out:
    json.dump(train_data, json_out, indent=2, ensure_ascii=False)

# Persist the same records as CSV with a header row. newline='' lets the csv
# module control line endings itself.
with open("cleaned_squad_data.csv", "w", newline='', encoding='utf-8') as csv_out:
    csv_writer = csv.DictWriter(csv_out, fieldnames=["context", "question", "answer"])
    csv_writer.writeheader()
    csv_writer.writerows(train_data)