In [1]:
import json
import pandas as pd
from pathlib import Path

# Location of the raw dump, resolved against the current working directory.
json_path = Path("../raw_data.json").resolve()

# Load JSON file
with open(json_path, "r", encoding="utf-8") as f:
    data = json.load(f)

# Keep only records that carry every required field. The isinstance guard
# skips malformed entries (e.g. null or a bare number) instead of letting
# issubset() raise TypeError on a non-iterable value.
required_keys = {"Question", "Answer", "Category"}
valid_records = [
    entry
    for entry in data
    if isinstance(entry, dict) and required_keys.issubset(entry)
]

# Report how many entries passed the field check
print(f"Loaded {len(valid_records)} valid entries out of {len(data)} total.")

# Convert to DataFrame
df = pd.DataFrame(valid_records)

# Rows with a null Category are dropped
df = df.dropna(subset=["Category"])

# Create a new column combining Question and Answer.
# Nulls are replaced with "" before the string cast: astype(str) on its own
# would turn None/NaN into the literal words "None"/"nan", which would then
# end up inside the combined text.
df["text"] = (
    df["Question"].fillna("").astype(str)
    + " "
    + df["Answer"].fillna("").astype(str)
)

# Show preview
print(df.head())




Loaded 36700 valid entries out of 36700 total.
                                            Question  \
0  South Korea's longest river, the Nakdong, flow...   
1  Developed by Francophone writers and politicia...   
2  In Argonauts of the Western Pacific, Bronislaw...   
3  One of the deficiencies of the Standard Model ...   
4  Appearing in the title of a 1982 book by evolu...   

                               Answer            Category  \
0                      Busan or Pusan           Geography   
1                        Aime Cesaire  Art and Literature   
2  Kula or Kula ring or Kula exchange           Geography   
3                    weak and gravity  Science and Nature   
4                           phenotype  Science and Nature   

                                                text  
0  South Korea's longest river, the Nakdong, flow...  
1  Developed by Francophone writers and politicia...  
2  In Argonauts of the Western Pacific, Bronislaw...  
3  One of the deficiencies of

In [2]:
# standardise and clean the text

import re

def clean_text(text):
    """Return a normalised copy of ``text``.

    The input is lowercased, every character that is neither a word
    character (``\\w``, which includes the underscore) nor whitespace is
    removed, each run of whitespace is collapsed to a single space, and
    leading/trailing whitespace is stripped.

    Any value that is not a ``str`` yields the empty string.
    """
    if isinstance(text, str):
        normalised = text.lower()
        # Applied in order: strip punctuation first, then squeeze whitespace.
        for pattern, replacement in ((r"[^\w\s]", ""), (r"\s+", " ")):
            normalised = re.sub(pattern, replacement, normalised)
        return normalised.strip()
    return ""

# Run clean_text over every value of the "text" column, element by element,
# and store the result in a new "clean_text" column.
df["clean_text"] = df["text"].map(clean_text)


In [3]:
# Write the processed frame to disk twice: once as JSON, once as CSV.

# JSON: a list of row objects, pretty-printed, non-ASCII characters kept as-is.
df.to_json(
    path_or_buf="cleaned_data.json",
    orient="records",
    indent=2,
    force_ascii=False,
)

# CSV: the DataFrame index is not written.
df.to_csv(
    path_or_buf="cleaned_data.csv",
    index=False,
)
