## Overview:

This notebook processes and analyzes model test result JSON files to extract unique model responses, their scores and frequencies, and save the results for different context types. It includes functions to:

- Load and process JSON files for a given context and ID.
- Extract unique responses, their scores, and frequencies.
- Save the processed data for all context types into a single JSON file for a specified ID.

Additionally, it demonstrates saving the results for a specific ID as an example.

### Helper Functions


Load and Process JSON Files:

In [20]:
import os
import json
import pandas as pd
from pathlib import Path
import tqdm
from dotenv import load_dotenv
import os
import time
from groq import Groq



def __get_unique_model_responses(id: int, context_type: str, with_misleading: bool = False) -> list:
    """
    Load and process model test result JSON files for a given context and ID.

    Returns a list of unique model responses with their:
    - score (from first occurrence)
    - frequency (how many times that response appeared)
    
    Sorted by descending score.
    """
    model_name = 'llama-2-7b-80k'
    base_dir = Path('../results/graph')
    
    suffix = '_misleading' if with_misleading else ''
    directory = base_dir / f'{model_name}_id_{id}_{context_type}{suffix}'

    # Collect all responses and scores
    data = [
        json.load(open(directory / file, encoding='utf-8'))
        for file in os.listdir(directory) if file.endswith('.json')
    ]
    
    df = pd.DataFrame(data)[['model_response', 'needle', 'context_length', 'depth_percent']]
    question_df = pd.read_excel("../data_generation/FreshQADataset_with_misleading.xlsx")
    try:
        question = question_df.set_index("id").at[id, "question"]
        df['question'] = question
        
    except KeyError:
        raise ValueError(f"No question found for id: {id}")

    # Compute frequency
    freq = df['model_response'].value_counts().rename('frequency')

    # Drop duplicates but keep first score
    # df = df.drop_duplicates(subset='model_response')

    # Merge with frequency
    df = df.merge(freq, left_on='model_response', right_index=True)

    # Sort by score descending
    # df = df.sort_values(by='score', ascending=False).reset_index(drop=True)

    return df.to_dict(orient='records')


Save Processed Results:

In [21]:
def save_all_contexts_for_id(id: int) -> None:
    """
    Gather results from all 4 context types and save them in a single JSON file for the given id.

    File will be saved as: unique_responses/{id}.json
    """
    save_dir = Path('unique_responses')
    save_dir.mkdir(parents=True, exist_ok=True)

    all_data = {
        'relevant': __get_unique_model_responses(id, 'relevant', with_misleading=False),
        'relevant_misleading': __get_unique_model_responses(id, 'relevant', with_misleading=True),
        'irrelevant': __get_unique_model_responses(id, 'irrelevant', with_misleading=False),
        'irrelevant_misleading': __get_unique_model_responses(id, 'irrelevant', with_misleading=True),
    }

    save_path = save_dir / f'{id}.json'
    with open(save_path, 'w', encoding='utf-8') as f:
        json.dump(all_data, f, ensure_ascii=False, indent=2)

    print(f"Saved all context responses for ID {id} to {save_path}")

### Run Analysis:

In [22]:
id_val = 44

In [23]:
save_all_contexts_for_id(id_val)

FileNotFoundError: [WinError 3] The system cannot find the path specified: '..\\results\\graph\\llama-2-7b-80k_id_44_relevant'

In [17]:
__get_unique_model_responses(id_val, 'relevant', with_misleading=False)

FileNotFoundError: [WinError 3] The system cannot find the path specified: '..\\results\\graph\\llama-2-7b-80k_id_44_relevant'

In [18]:
def build_rewrite_prompt(model_response, real_needle, question):
    return (
        f"""You are evaluating a model-generated response for hallucination analysis in a long-context retrieval setting.

            You are given the following:

            Question: {question}

            The correct answer (real needle): {real_needle}

            A model-generated response: {model_response}

            Your task is to classify the model's response into one of the following categories:

            1. Correct Retrieval: the model answered the question correctly

            2. Excess Incorrect Retrieval: the model answered the question correctly but it also went on to output other text that was either completely irrelevant or invalid with regards to the question.

            3. Irrelevant Retrieval: the model did not answer the question, it went on to output other text

            4. Contradictory Retrieval: the model answered the question incorrectly

            Examples:

            Question: Who is the President of the United States? Needle: The President of the United States is Donald Trump.

            1: The President of the United States is Donald Trump. (Correct Retrieval)

            2: The President of the United States is Donald Trump. The affirmation was made in the Declaration of Independence, which was written predominantly by Thomas Jefferson in 1776. (Excess Incorrect Retrieval)

            3: The President of the United States is the head of state and head of government of the United States. The president directs the executive branch of the federal government and is the commander-in-chief of the United States Armed Forces. (Irrelevant Retrieval)

            4: Joe Biden is the President of the United States. (Contradictory Retrieval)

            Return your output in this exact JSON format: {{"model_response": "<text>", "needle": "<real_needle>", "hallucination_type": "<Correct Retrieval | Excess Incorrect Retrieval | Irrelevant Retrieval | Contradictory Retrieval>"}}"""
    )

In [19]:
# Load .env and API key
load_dotenv()
groq_api_key = os.getenv("snlp_api_key")

# Initialize Groq client
client = Groq(api_key=groq_api_key)

def rewrite_question_and_needle(response, needle, question):
    while True:
        try:
            prompt = build_rewrite_prompt(response, needle, question)
            completion = client.chat.completions.create(
                model="llama-3.3-70b-versatile",
                messages=[{"role": "user", "content": prompt}],
                temperature=0.7,
                max_tokens=512,
                top_p=1.0,
                stream=False,
                response_format={"type": "json_object"},
                stop=None,
            )
            content = completion.choices[0].message.content
            data = json.loads(content)
            return data["hallucination_type"]
        except Exception as e:
            print(f"Error: {e}")
            print("Retrying now...")


GroqError: The api_key client option must be set either by passing api_key to the client or by setting the GROQ_API_KEY environment variable

In [11]:

with open(f"unique_responses/{id_val}.json", "r") as f:
    data = json.load(f)
    
rows = []
for category, entries in data.items():
    for entry in entries:
        entry["category"] = category
        rows.append(entry)

df = pd.DataFrame(rows)
print(df)


                                       model_response  \
0   No, the FIFA World Cup will not be hosted this...   
1   No, the FIFA World Cup will not be hosted this...   
2   No, the FIFA World Cup will not be hosted this...   
3   No, the FIFA World Cup will not be hosted this...   
4   No, the FIFA World Cup will not be hosted this...   
..                                                ...   
95  No, the FIFA World Cup will not be hosted this...   
96  No, the FIFA World Cup will not be hosted this...   
97  No, the FIFA World Cup will not be hosted this...   
98  No, the FIFA World Cup will not be held this y...   
99  No, the FIFA World Cup will not be hosted this...   

                                        needle  context_length  depth_percent  \
0   There won't be a FIFA World Cup this year.               0            0.0   
1   There won't be a FIFA World Cup this year.               0           25.0   
2   There won't be a FIFA World Cup this year.               0          

In [None]:
from tqdm import tqdm
hallucination_types = []

for idx, row in tqdm(df.iterrows(), total=len(df), desc="Refining Q&A pairs"):
    mr, n, q = row.get("model_response", ""), row.get("question_refined", ""), row.get("needle_refined", "")
    # if s > 50:
    #     current_hallucination_type = "No Hallucination"
    
    # else:
    if idx > 0 and idx % 5 == 0:
        print("⏳ Rate limit pause: sleeping for 15 seconds...")
        time.sleep(15)
        
    current_hallucination_type = rewrite_question_and_needle(mr, n, q)
    hallucination_types.append(current_hallucination_type)

# Update the DataFrame
df['hallucination_type'] = hallucination_types

Refining Q&A pairs:   5%|▌         | 5/100 [00:02<00:39,  2.40it/s]

⏳ Rate limit pause: sleeping for 15 seconds...


Refining Q&A pairs:  10%|█         | 10/100 [00:19<02:17,  1.53s/it]

⏳ Rate limit pause: sleeping for 15 seconds...


Refining Q&A pairs:  15%|█▌        | 15/100 [00:36<02:21,  1.66s/it]

⏳ Rate limit pause: sleeping for 15 seconds...


Refining Q&A pairs:  20%|██        | 20/100 [00:53<02:14,  1.68s/it]

⏳ Rate limit pause: sleeping for 15 seconds...


Refining Q&A pairs:  25%|██▌       | 25/100 [01:10<02:07,  1.70s/it]

⏳ Rate limit pause: sleeping for 15 seconds...


Refining Q&A pairs:  30%|███       | 30/100 [01:34<04:16,  3.67s/it]

⏳ Rate limit pause: sleeping for 15 seconds...


Refining Q&A pairs:  35%|███▌      | 35/100 [02:01<04:49,  4.46s/it]

⏳ Rate limit pause: sleeping for 15 seconds...


Refining Q&A pairs:  40%|████      | 40/100 [02:26<04:14,  4.24s/it]

⏳ Rate limit pause: sleeping for 15 seconds...


Refining Q&A pairs:  45%|████▌     | 45/100 [02:51<03:55,  4.28s/it]

⏳ Rate limit pause: sleeping for 15 seconds...


Refining Q&A pairs:  50%|█████     | 50/100 [03:16<03:36,  4.32s/it]

⏳ Rate limit pause: sleeping for 15 seconds...


Refining Q&A pairs:  55%|█████▌    | 55/100 [03:41<03:14,  4.32s/it]

⏳ Rate limit pause: sleeping for 15 seconds...


Refining Q&A pairs:  60%|██████    | 60/100 [04:07<02:52,  4.32s/it]

⏳ Rate limit pause: sleeping for 15 seconds...


Refining Q&A pairs:  65%|██████▌   | 65/100 [04:32<02:30,  4.30s/it]

⏳ Rate limit pause: sleeping for 15 seconds...


Refining Q&A pairs:  70%|███████   | 70/100 [04:56<01:59,  3.97s/it]

⏳ Rate limit pause: sleeping for 15 seconds...


Refining Q&A pairs:  75%|███████▌  | 75/100 [05:22<01:51,  4.47s/it]

⏳ Rate limit pause: sleeping for 15 seconds...


Refining Q&A pairs:  80%|████████  | 80/100 [05:47<01:26,  4.33s/it]

⏳ Rate limit pause: sleeping for 15 seconds...


Refining Q&A pairs:  85%|████████▌ | 85/100 [06:12<01:04,  4.30s/it]

⏳ Rate limit pause: sleeping for 15 seconds...


Refining Q&A pairs:  90%|█████████ | 90/100 [06:37<00:43,  4.30s/it]

⏳ Rate limit pause: sleeping for 15 seconds...


Refining Q&A pairs:  95%|█████████▌| 95/100 [07:02<00:20,  4.19s/it]

⏳ Rate limit pause: sleeping for 15 seconds...


Refining Q&A pairs: 100%|██████████| 100/100 [07:27<00:00,  4.48s/it]


In [13]:
# Group by category and convert each group to a list of dicts
json_output = {
    category: group.drop(columns="category").to_dict(orient="records")
    for category, group in df.groupby("category")
}

output_dir = "updated_unique_responses"
file_id = str(id_val)  # or whatever variable you're using
output_path = os.path.join(output_dir, f"{file_id}.json")

# Create the folder if it doesn't exist
os.makedirs(output_dir, exist_ok=True)

# Write the JSON file into the folder
with open(output_path, "w") as f:
    json.dump(json_output, f, indent=4)
