In [1]:
import pandas as pd
import json

def load_squad_to_dataframe(file_path):
    """Flatten a SQuAD-format JSON file into one DataFrame row per question.

    The file is expected to follow the nested SQuAD layout:
    ``data -> paragraphs -> qas``. Each question becomes a row that carries
    its article title and paragraph context alongside it.

    Parameters
    ----------
    file_path : str or path-like
        Location of the SQuAD JSON file (read as UTF-8).

    Returns
    -------
    pandas.DataFrame
        Columns ``id``, ``title``, ``context``, ``question``,
        ``answer_text`` and ``is_impossible``. ``answer_text`` holds the
        text of the first listed answer, or is missing when the question is
        flagged impossible or lists no answers. The columns are present even
        when the file contains no questions.

    Raises
    ------
    FileNotFoundError
        If ``file_path`` does not exist.
    json.JSONDecodeError
        If the file is not valid JSON.
    KeyError
        If a required key (``data``, ``title``, ``paragraphs``, ``context``,
        ``qas``, ``question``, ``id``) is absent.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        squad_data = json.load(f)

    columns = ['id', 'title', 'context', 'question', 'answer_text', 'is_impossible']

    # This list will hold our flattened data
    rows_list = []

    # The JSON is nested, so we need to loop through it
    for topic in squad_data['data']:
        title = topic['title']
        for paragraph in topic['paragraphs']:
            context = paragraph['context']
            for qa in paragraph['qas']:
                # Entries without the flag (e.g. SQuAD v1.1-style files) are
                # treated as answerable instead of raising KeyError.
                is_impossible = bool(qa.get('is_impossible', False))

                # Only answerable questions carry an answer; guard against an
                # empty or absent answers list so one odd entry cannot abort
                # the whole load with IndexError.
                answers = qa.get('answers') or []
                answer_text = None
                if not is_impossible and answers:
                    answer_text = answers[0]['text']

                rows_list.append({
                    'id': qa['id'],
                    'title': title,
                    'context': context,
                    'question': qa['question'],
                    'answer_text': answer_text,
                    'is_impossible': is_impossible,
                })

    # Passing `columns` keeps the schema stable even when there are no rows.
    return pd.DataFrame(rows_list, columns=columns)




In [2]:
# --- Load and Display Development (Dev) Data ---
# Raw string: sequences such as "\L", "\d" and "\S" are invalid escapes in a
# normal literal and trigger a DeprecationWarning/SyntaxWarning on modern Python.
# NOTE(review): this is a machine-specific absolute path; adjust it per machine.
dev_file_path = r'D:\LLM-Hallucination\data\SQuAD\dev-v2.0.json'
dev_df = load_squad_to_dataframe(dev_file_path)

print(f"--- SQuAD 2.0 Development Data ({dev_file_path}) ---")

# Display the first 5 rows
print("\nFirst 5 rows (head):")
dev_df.head()





--- SQuAD 2.0 Development Data (D:\LLM-Hallucination\data\SQuAD\dev-v2.0.json) ---

First 5 rows (head):


Unnamed: 0,id,title,context,question,answer_text,is_impossible
0,56ddde6b9a695914005b9628,Normans,The Normans (Norman: Nourmands; French: Norman...,In what country is Normandy located?,France,False
1,56ddde6b9a695914005b9629,Normans,The Normans (Norman: Nourmands; French: Norman...,When were the Normans in Normandy?,10th and 11th centuries,False
2,56ddde6b9a695914005b962a,Normans,The Normans (Norman: Nourmands; French: Norman...,From which countries did the Norse originate?,"Denmark, Iceland and Norway",False
3,56ddde6b9a695914005b962b,Normans,The Normans (Norman: Nourmands; French: Norman...,Who was the Norse leader?,Rollo,False
4,56ddde6b9a695914005b962c,Normans,The Normans (Norman: Nourmands; French: Norman...,What century did the Normans first gain their ...,10th century,False


In [3]:
# Summarize the flattened dev DataFrame: column names, non-null counts and dtypes.
# DataFrame.info() prints its report directly, so no wrapping print() is needed.
print("\nColumns and Data Types (info):")
dev_df.info()


Columns and Data Types (info):
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11873 entries, 0 to 11872
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   id             11873 non-null  object
 1   title          11873 non-null  object
 2   context        11873 non-null  object
 3   question       11873 non-null  object
 4   answer_text    5928 non-null   object
 5   is_impossible  11873 non-null  bool  
dtypes: bool(1), object(5)
memory usage: 475.5+ KB


In [8]:
# Sample from the dev file only: the goal is to evaluate hallucination,
# not to train a model from scratch.

# Raw string: "\L", "\d", "\p" and "\s" are invalid escapes in a normal literal
# and trigger a DeprecationWarning/SyntaxWarning on modern Python.
output_filepath = r'D:\LLM-Hallucination\data\prompts\squad_prompts.json'

# The number of rows you want to sample and add
num_samples_to_add = 200



def sample_and_append(input_file, output_file, num_samples):
    """Sample answerable SQuAD questions and append them to a JSON prompt file.

    The input file is flattened with ``load_squad_to_dataframe`` and rows
    flagged ``is_impossible`` are dropped. Up to ``num_samples`` of the
    remaining rows are drawn with a fixed seed (``random_state=42``). The
    ``id``, ``context``, ``question`` and ``answer_text`` fields of each
    sampled row are then merged into the JSON list stored at ``output_file``.

    Because the seed is fixed, re-running the function draws the same rows.
    Records whose ``id`` is already present in the output file are skipped,
    so repeated runs do not accumulate duplicates.

    Parameters
    ----------
    input_file : str or path-like
        SQuAD-format JSON file to sample from.
    output_file : str
        JSON file holding a list of prompt records. It is created (along with
        its parent directory) if missing, and reset to an empty list if it
        is unreadable as JSON or does not contain a list.
    num_samples : int
        Number of answerable questions to draw. If fewer are available,
        all of them are used.

    Returns
    -------
    None
        Results are written to ``output_file``; progress is printed.
    """
    import os  # function-scope import keeps this cell self-contained

    print(f"Loading flattened data from '{input_file}' into a DataFrame...")
    df = load_squad_to_dataframe(input_file)

    # Keep only the rows for answerable questions.
    answerable_mask = ~df['is_impossible'].astype(bool)
    ans_df = df[answerable_mask].copy()
    print(f"Found {len(ans_df)} total answerable questions.")

    # Check if we have enough questions to sample from
    if len(ans_df) < num_samples:
        print(f"Warning: Requested {num_samples} samples, but only {len(ans_df)} are available. Sampling all of them.")
        sampled_df = ans_df
    else:
        # random_state makes the draw reproducible across runs.
        print(f"Randomly sampling {num_samples} questions...")
        sampled_df = ans_df.sample(n=num_samples, random_state=42)

    # Select only the columns wanted in the final JSON file
    output_columns = ['id', 'context', 'question', 'answer_text']
    samples_to_append = sampled_df[output_columns].to_dict(orient='records')

    # Load whatever is already stored so new records can be merged into it.
    try:
        with open(output_file, 'r', encoding='utf-8') as f:
            existing_data = json.load(f)
        if not isinstance(existing_data, list):
            print(f"Warning: Existing file '{output_file}' does not contain a JSON list. Overwriting.")
            existing_data = []
    except (FileNotFoundError, json.JSONDecodeError):
        print(f"Output file '{output_file}' not found or is invalid. A new file will be created.")
        existing_data = []

    # The fixed seed means a re-run selects the same rows; skip ids that are
    # already stored so the prompt file never contains duplicates.
    known_ids = {
        entry['id']
        for entry in existing_data
        if isinstance(entry, dict) and isinstance(entry.get('id'), str)
    }
    new_samples = []
    for sample in samples_to_append:
        if sample['id'] in known_ids:
            continue
        known_ids.add(sample['id'])
        new_samples.append(sample)

    skipped = len(samples_to_append) - len(new_samples)
    if skipped:
        print(f"Skipping {skipped} sampled entries whose ids are already present in '{output_file}'.")

    existing_data.extend(new_samples)

    # Make sure the destination directory exists before opening for write.
    output_dir = os.path.dirname(output_file)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    print(f"Appending {len(new_samples)} new entries to '{output_file}'...")
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(existing_data, f, indent=4)

    print("-" * 20)
    print("Process complete!")
    print(f"The file '{output_file}' now contains a total of {len(existing_data)} entries.")



# Draw `num_samples_to_add` answerable dev-set questions and merge them into the
# prompt file. Side effect: the file at `output_filepath` is rewritten on disk.
sample_and_append(dev_file_path, output_filepath, num_samples_to_add)

Loading flattened data from 'D:\LLM-Hallucination\data\SQuAD\dev-v2.0.json' into a DataFrame...
Found 5928 total answerable questions.
Randomly sampling 200 questions...
Output file 'D:\LLM-Hallucination\data\prompts\squad_prompts.json' not found or is invalid. A new file will be created.
Appending 200 new entries to 'D:\LLM-Hallucination\data\prompts\squad_prompts.json'...
--------------------
Process complete!
The file 'D:\LLM-Hallucination\data\prompts\squad_prompts.json' now contains a total of 200 entries.
