In [None]:
import pandas as pd
from datasets import load_dataset
import json

# 1. The path to your input JSON file.
# Raw string: Windows backslashes must not be parsed as escape sequences
# (sequences such as "\L" or "\d" are invalid escapes and warn on newer Pythons).
input_filepath = r'D:\LLM-Hallucination\data\TriviaQA\unfiltered-web-dev.json'

# 2. The path to your output JSON file (raw string for the same reason).
output_filepath = r'D:\LLM-Hallucination\data\prompts\triviaqa_prompts.json'

# 3. The number of rows you want to sample
num_samples_to_add = 200

# 4. How many search results to combine for the context
NUM_SEARCH_RESULTS_FOR_CONTEXT = 5



def create_context_from_search(search_results, num_to_combine):
    """Build a plain-text context string from the top search results.

    Args:
        search_results: Either a sequence of result dicts, or a dict holding
            that sequence under the key ``'results'``. The bare-list form is
            accepted because the caller passes an item's ``'SearchResults'``
            value straight through.
            NOTE(review): TriviaQA's JSON is assumed to store this as a list
            of dicts with capitalised ``'Title'`` / ``'Description'`` keys --
            confirm against the data file.
        num_to_combine: Maximum number of leading results to include.

    Returns:
        One ``"Title: ...\\nDescription: ..."`` section per result, separated
        by blank lines, or ``"No search results found."`` when there is
        nothing to combine.
    """
    # Normalise both accepted shapes down to a plain sequence of results.
    if isinstance(search_results, dict):
        results = search_results.get('results')
    else:
        results = search_results

    if not results:
        return "No search results found."

    context_parts = []
    for result in results[:num_to_combine]:
        # Lower-case keys keep priority (the original contract); the
        # capitalised spellings are a fallback for the other schema.
        title = result.get('title', result.get('Title', 'No Title'))
        description = result.get(
            'description', result.get('Description', 'No Description')
        )
        context_parts.append(f"Title: {title}\nDescription: {description}")

    return "\n\n".join(context_parts)


def sample_and_process(input_file, output_file, num_samples):
    """Sample TriviaQA questions with search results and append them to a JSON file.

    Reads the list stored under the ``'Data'`` key of ``input_file``, keeps the
    items whose ``'SearchResults'`` value is non-empty, converts each into a
    ``{'id', 'question', 'answer_text', 'context'}`` record, randomly samples
    ``num_samples`` of them (fixed seed, so runs are reproducible) and appends
    the sample to the JSON list stored in ``output_file``.

    Questions whose id is already present in ``output_file`` are excluded
    before sampling, so re-running the function adds new questions instead of
    appending the same seeded sample again.

    Args:
        input_file: Path to the TriviaQA JSON file.
        output_file: Path to the JSON file to append to; it is created when
            missing and started afresh when it holds invalid JSON.
        num_samples: Number of records to add. When fewer are available, all
            of them are added.

    Returns:
        None. Progress and errors are reported on stdout; on an unreadable
        input file or a non-list output file nothing is written.
    """
    print(f"Loading local TriviaQA data from '{input_file}'...")
    try:
        with open(input_file, 'r', encoding='utf-8') as f:
            trivia_data = json.load(f)['Data']
    except (FileNotFoundError, KeyError, TypeError, json.JSONDecodeError):
        # Malformed JSON (JSONDecodeError) and a top-level value that is not
        # an object (TypeError) are "unexpected format" as much as a missing
        # 'Data' key is.
        print(f"Error: The file '{input_file}' was not found or has an unexpected format.")
        return

    # Load what is already stored before sampling, so duplicates can be skipped.
    try:
        with open(output_file, 'r', encoding='utf-8') as f:
            existing_data = json.load(f)
    except (FileNotFoundError, json.JSONDecodeError):
        existing_data = []

    if not isinstance(existing_data, list):
        # Appending is only meaningful for a JSON list; bail out instead of
        # crashing on .extend() or clobbering unrelated content.
        print(f"Error: The file '{output_file}' does not contain a JSON list; leaving it untouched.")
        return

    existing_ids = {
        entry['id']
        for entry in existing_data
        if isinstance(entry, dict) and isinstance(entry.get('id'), (str, int))
    }

    processed_questions = []
    skipped_existing = 0
    for item in trivia_data:
        # A missing key and an empty value are both treated as "no context".
        search_results = item.get('SearchResults')
        if not search_results:
            continue
        if item['QuestionId'] in existing_ids:
            skipped_existing += 1
            continue
        processed_questions.append({
            'id': item['QuestionId'],
            'question': item['Question'],
            'answer_text': item['Answer']['Value'],
            'context': create_context_from_search(search_results, NUM_SEARCH_RESULTS_FOR_CONTEXT)
        })

    print(f"Successfully filtered and processed the data. Found {len(processed_questions)} questions with valid context.")
    if skipped_existing:
        print(f"Skipped {skipped_existing} questions already present in '{output_file}'.")

    # Convert the list to a pandas DataFrame for easy sampling
    df = pd.DataFrame(processed_questions)

    if len(df) < num_samples:
        print(f"Warning: Requested {num_samples} samples, but only {len(df)} are available after filtering. Sampling all of them.")
        sampled_df = df
    else:
        print(f"Randomly sampling {num_samples} questions...")
        # Fixed seed keeps the selection reproducible across runs.
        sampled_df = df.sample(n=num_samples, random_state=42)

    samples_to_append = sampled_df.to_dict(orient='records')

    existing_data.extend(samples_to_append)

    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(existing_data, f, indent=4)

    print("-" * 20)
    print("Process complete!")
    print(f"The file '{output_file}' now contains a total of {len(existing_data)} entries.")



# Run the sampling pipeline with the paths and sample count configured above.
sample_and_process(
    input_file=input_filepath,
    output_file=output_filepath,
    num_samples=num_samples_to_add,
)

Loading local TriviaQA data from 'D:\LLM-Hallucination\data\TriviaQA\unfiltered-web-dev.json'...
Successfully filtered and processed the data. Found 11260 questions with valid context.
Randomly sampling 200 questions...
--------------------
Process complete!
The file 'D:\LLM-Hallucination\data\prompts\triviaqa_prompts.json' now contains a total of 200 entries.
