## Overview:

This notebook processes and analyzes model test-result JSON files: for each context type it collects the model responses together with how often each response occurs, and saves the results. It includes functions to:

- Load and process JSON files for a given context and ID.
- Extract the responses and their frequencies (how many times each response appears).
- Save the processed data for all context types into a single JSON file for a specified ID.

Additionally, it runs this pipeline for every ID found in `../haystack/irrelevant` and then uses an LLM to label each saved response with a hallucination category.

### Helper Functions


Load and Process JSON Files:

In [82]:
import os
import json
import pandas as pd
from pathlib import Path
from dotenv import load_dotenv
import os



def __get_unique_model_responses(id: int, context_type: str, with_misleading: bool = False, att_heads_mask: bool = False) -> list:
    """
    Load and process model test result JSON files for a given context and ID.

    Returns a list of unique model responses with their:
    - score (from first occurrence)
    - frequency (how many times that response appeared)
    
    Sorted by descending score.
    """
    model_name = 'llama-2-7b-80k'
    base_dir = Path('../results/graph')
    
    suffix = '_misleading' if with_misleading else ''
    directory = base_dir / f'{model_name}_id_{id}_{context_type}{suffix}'
    if att_heads_mask:
        directory = directory.parent / (directory.name + "_block_top30")

    # Collect all responses and scores
    data = [
        json.load(open(directory / file, encoding='utf-8'))
        for file in os.listdir(directory) if file.endswith('.json')
    ]
    
    df = pd.DataFrame(data)[['model_response', 'needle', 'context_length', 'depth_percent']]
    question_df = pd.read_json("../data_generation/context_cleaned.json")
    try:
        question = question_df.set_index("id").at[id, "question_refined"]
        df['question'] = question
        
    except KeyError:
        raise ValueError(f"No question found for id: {id}")

    # Compute frequency
    freq = df['model_response'].value_counts().rename('frequency')

    # Merge with frequency
    df = df.merge(freq, left_on='model_response', right_index=True)

    return df.to_dict(orient='records')


Save Processed Results:

In [83]:
def save_all_contexts_for_id(id: int, attn_mask_block_30: bool = False) -> None:
    """
    Collect the responses of the four context variants for ``id`` and dump them into one JSON file.

    The file is written to ``unique_responses/{id}.json``, or to
    ``unique_responses_block_top30/{id}.json`` when ``attn_mask_block_30`` is True.
    The path that was written is printed at the end.
    """
    default_dir = Path('unique_responses')
    default_dir.mkdir(parents=True, exist_ok=True)

    target_dir = default_dir
    if attn_mask_block_30:
        target_dir = default_dir.parent / (default_dir.name + '_block_top30')
        target_dir.mkdir(parents=True, exist_ok=True)

    # (output key, context type, misleading?) listed in the order the keys are written.
    variants = (
        ('relevant', 'relevant', False),
        ('relevant_misleading', 'relevant', True),
        ('irrelevant', 'irrelevant', False),
        ('irrelevant_misleading', 'irrelevant', True),
    )
    all_data = {
        key: __get_unique_model_responses(
            id, context_type, with_misleading=misleading, att_heads_mask=attn_mask_block_30
        )
        for key, context_type, misleading in variants
    }

    save_path = target_dir / f'{id}.json'
    with open(save_path, 'w', encoding='utf-8') as out_file:
        json.dump(all_data, out_file, ensure_ascii=False, indent=2)

    print(f"Saved all context responses for ID {id} to {save_path}")

### Run Analysis:

In [84]:
# Every "<number>.txt" file in the haystack directory contributes its number as an ID.
ids = [
    int(os.path.splitext(file_name)[0])
    for file_name in os.listdir('../haystack/irrelevant')
    if file_name.endswith('.txt')
]
print(ids)

[1, 11, 121, 122, 123, 124, 14, 15, 155, 16, 160, 163, 164, 167, 168, 17, 170, 172, 173, 175, 18, 180, 182, 183, 185, 189, 192, 193, 196, 198, 2, 29, 3, 380, 391, 393, 396, 4, 401, 403, 408, 409, 410, 411, 420, 421, 422, 423, 424, 427, 43, 430, 432, 433, 435, 436, 437, 438, 439, 44, 441, 442, 444, 448, 449, 453, 5, 532, 535, 539, 576, 577, 578, 579, 586, 587, 588, 589, 590, 92, 95, 96]


In [85]:
# True  -> process the runs with the top 30 attention heads masked ("_block_top30" directories)
# False -> process the regular runs
attn_mask_block = True
for context_id in ids:
    save_all_contexts_for_id(context_id, attn_mask_block_30=attn_mask_block)

Saved all context responses for ID 1 to unique_responses_block_top30\1.json
Saved all context responses for ID 11 to unique_responses_block_top30\11.json
Saved all context responses for ID 121 to unique_responses_block_top30\121.json
Saved all context responses for ID 122 to unique_responses_block_top30\122.json
Saved all context responses for ID 123 to unique_responses_block_top30\123.json
Saved all context responses for ID 124 to unique_responses_block_top30\124.json
Saved all context responses for ID 14 to unique_responses_block_top30\14.json
Saved all context responses for ID 15 to unique_responses_block_top30\15.json
Saved all context responses for ID 155 to unique_responses_block_top30\155.json
Saved all context responses for ID 16 to unique_responses_block_top30\16.json
Saved all context responses for ID 160 to unique_responses_block_top30\160.json
Saved all context responses for ID 163 to unique_responses_block_top30\163.json
Saved all context responses for ID 164 to unique_res

In [86]:
def build_rewrite_prompt(model_response, real_needle, question):
    """
    Build the classification prompt sent to the judge model.

    The prompt states the question, the correct answer (``real_needle``) and the
    ``model_response``, defines four categories, shows two worked examples and asks
    for a JSON object whose ``hallucination_type`` is one of: Correct Retrieval,
    Excess Incorrect Retrieval, Irrelevant Retrieval, Contradictory Retrieval.

    The example headings use exactly the category names requested in the output
    format, so the few-shot examples cannot teach the model an off-list label.

    Args:
        model_response: Text produced by the model under evaluation.
        real_needle: The correct answer the response is compared against.
        question: The question that was asked.

    Returns:
        The prompt as a single string.
    """
    return (
        f"""You are evaluating a model-generated response for hallucination analysis in a long-context retrieval setting.

            You are given the following:

            Question: {question}

            The correct answer (real needle): {real_needle}

            A model-generated response: {model_response}

            Your task is to classify the model's response into one of the following categories:
            1. Correct Retrieval: The model_response is an answer that is semantically aligned with the real_needle,  providing a factually accurate answer with no additional information that is both incorrect and unrelated.
            2. Excess Incorrect Retrieval: Part of the model_response is semantically aligned with the real_needle, but it also contains additional content that is either factually incorrect or unrelated to the original question.
            3. Irrelevant Retrieval: The model_response is not semantically aligned with the real_needle. Instead, it provides content that is unrelated or loosely associated with the question, without answering it directly.
            4. Contradictory Retrieval: The model_response is an incorrect answer that directly contradicts the real_needle or a factually false statement in place of the correct response.

            Example 1:

            Question: When was the Republic of Artsakh formally dissolved?
            Needle: On 1 January 2024, the Republic of Artsakh was formally dissolved.

            Example Correct Retrieval:
            - On 1 January 2024.
            - On 1 January 2024, the Republic of Artsakh was dissolved.
            - On 1 January 2024, the Republic of Artsakh was formally dissolved.

            Example Excess Incorrect Retrieval:
            - On 1 January 2024, the Republic of Artsakh was formally dissolved. Following Qajar Iran's loss, it was forced to concede suzerainty over most of the khanates, along with Georgia.
            - On 1 January 2024, the Republic of Artsakh was formally dissolved. The original etymology of this name is thought to have its roots in the once-dominant Zoroastrianism. In the Avesta
            - On 1 January 2024, the Republic of Artsakh was formally dissolved. Russia actively tried to gain possession of the Caucasus region which was, for the most part, in the hands of Iran. In
            - On 1 January 2024, the Republic of Artsakh was formally dissolved. The region is usually equated with the administrative borders of the former Nagorno-Karabakh Autonomous Oblast, comprising 4,

            Example Irrelevant Retrieval:
            - On 27 September 2020, the Second Nagorno-Karabakh War broke out with an Azerbaijani offensive in Nagorno-Karabakh and the surrounding territories. Azerbaijan made

            Example Contradictory Retrieval:
            - The Republic of Artsakh was formally dissolved on September 27, 2020.
            - On September 27, 2020, the Republic of Artsakh was formally dissolved.
            - The Republic of Artsakh was formally dissolved on September 27, 2023.
            - The Republic of Artsakh was formally dissolved on 27 September 2020, after the Second Nagorno-Karabakh War.

            Example 2:

            Question: Where will the FIFA World Cup be hosted this year?
            Needle: There won't be a FIFA World Cup this year.

            Example Correct Retrieval:
            - No, the FIFA World Cup will not be hosted this year.
            - No.

            Example Excess Incorrect Retrieval:
            - No, the FIFA World Cup will not be held this year. The FIFA World Cup is a quadrennial international football tournament contested by the senior men's national teams of the member associations of FIFA, the sport's global governing

            Example Irrelevant Retrieval:
            - The FIFA World Cup, often called the World Cup, is an international association football competition among the senior men's national teams of the members of the Fédération Internationale de Football Association (FIFA), the sport's global governing body.

            Example Contradictory Retrieval:
            - No. The FIFA World Cup is a soccer tournament that is held every four years. The next World Cup will be held in 2022.

            Return your output in this exact JSON format: {{"model_response": "<text>", "needle": "<real_needle>", "hallucination_type": "<Correct Retrieval | Excess Incorrect Retrieval | Irrelevant Retrieval | Contradictory Retrieval>"}}"""
    )

In [87]:
# Load .env and API key
load_dotenv()
together_api_key = os.getenv("together_api_key")

In [88]:
from together import Together

# Module-level Together client; hallucination_categorising sends its chat-completion requests through it.
client = Together(api_key=together_api_key)

def hallucination_categorising(response, needle, question, max_retries=None, backoff_seconds=1.0, max_backoff_seconds=30.0):
    """
    Ask the judge model to classify ``response`` and return its ``hallucination_type``.

    The request is retried whenever the API call fails or the reply cannot be
    parsed (invalid JSON, missing ``hallucination_type`` key). Between attempts the
    function waits with exponential backoff so that a failing endpoint is not
    hammered in a tight loop.

    Args:
        response: Model response to classify.
        needle: The correct answer the response is compared against.
        question: The question that was asked.
        max_retries: Number of retries allowed after the first attempt. ``None``
            (the default) keeps retrying indefinitely.
        backoff_seconds: Delay before the first retry; doubled for each further retry.
        max_backoff_seconds: Upper bound for the delay between two attempts.

    Returns:
        The value of ``hallucination_type`` in the model's JSON reply. It is returned
        as-is and is not validated against the list of categories.

    Raises:
        RuntimeError: If ``max_retries`` is set and every attempt failed; the last
            error is attached as the cause.
    """
    import time  # local import: this cell runs before the notebook's `import time` cell

    # The prompt is identical for every attempt, so build it once. A failure here is
    # deterministic and is deliberately not retried.
    prompt = build_rewrite_prompt(response, needle, question)

    attempt = 0
    while True:
        attempt += 1
        try:
            completion = client.chat.completions.create(
                model="meta-llama/Llama-3.3-70B-Instruct-Turbo",
                messages=[{"role": "user", "content": prompt}],
                temperature=0.7,
                max_tokens=512,
                top_p=1.0,
                stream=False,
                response_format={"type": "json_object"},
                stop=None,
            )
            content = completion.choices[0].message.content
            data = json.loads(content)
            return data["hallucination_type"]
        except Exception as e:
            print(f"Error: {e}")
            if max_retries is not None and attempt > max_retries:
                raise RuntimeError(
                    f"hallucination_categorising failed after {attempt} attempts"
                ) from e
            print("Retrying now...")
            # Exponential backoff; the exponent is capped so the power stays small
            # however long the loop keeps retrying.
            delay = min(max_backoff_seconds, backoff_seconds * 2 ** min(attempt - 1, 16))
            if delay > 0:
                time.sleep(delay)

In [89]:
# Read back the per-ID files from the directory that matches the current masking setting.
responses_path = "unique_responses" if not attn_mask_block else "unique_responses_block_top30"

lst_dfs = []
for response_id in ids:
    with open(f"{responses_path}/{response_id}.json", "r", encoding="utf-8") as json_file:
        responses_by_category = json.load(json_file)

    # Flatten {category: [record, ...]} into one row per record, tagged with its category.
    tagged_rows = [
        {**entry, "category": category}
        for category, entries in responses_by_category.items()
        for entry in entries
    ]
    lst_dfs.append(pd.DataFrame(tagged_rows))


In [90]:
# Sanity check: one DataFrame is appended to lst_dfs per entry in ids, so this should equal len(ids).
print(len(lst_dfs))

82


In [91]:
import time
from tqdm import tqdm

# Label every row of every per-ID DataFrame, keeping requests at least one second apart.
for single_df in lst_dfs:
    progress = tqdm(single_df.iterrows(), total=len(single_df), desc="Categorising hallucinations")
    labels = []

    for _, record in progress:
        request_started = time.time()

        model_text = record.get("model_response", "")
        needle_text = record.get("needle", "")
        question_text = record.get("question", "")
        labels.append(hallucination_categorising(model_text, needle_text, question_text))

        # Throttle: sleep for whatever is left of this request's one-second window.
        remaining = 1.0 - (time.time() - request_started)
        time.sleep(max(0, remaining))

    single_df['hallucination_type'] = labels


Categorising hallucinations: 100%|██████████| 100/100 [02:57<00:00,  1.78s/it]
Categorising hallucinations: 100%|██████████| 100/100 [02:56<00:00,  1.76s/it]
Categorising hallucinations: 100%|██████████| 100/100 [03:03<00:00,  1.84s/it]
Categorising hallucinations: 100%|██████████| 100/100 [02:43<00:00,  1.64s/it]
Categorising hallucinations: 100%|██████████| 100/100 [03:08<00:00,  1.88s/it]
Categorising hallucinations: 100%|██████████| 100/100 [02:40<00:00,  1.61s/it]
Categorising hallucinations: 100%|██████████| 100/100 [02:41<00:00,  1.62s/it]
Categorising hallucinations: 100%|██████████| 100/100 [02:45<00:00,  1.66s/it]
Categorising hallucinations: 100%|██████████| 100/100 [03:14<00:00,  1.94s/it]
Categorising hallucinations: 100%|██████████| 100/100 [02:36<00:00,  1.56s/it]
Categorising hallucinations: 100%|██████████| 100/100 [02:28<00:00,  1.48s/it]
Categorising hallucinations: 100%|██████████| 100/100 [02:59<00:00,  1.79s/it]
Categorising hallucinations: 100%|██████████| 100/10

Error: Expecting ',' delimiter: line 1186 column 1 (char 2117)
Retrying now...


Categorising hallucinations: 100%|██████████| 100/100 [02:33<00:00,  1.54s/it]
Categorising hallucinations: 100%|██████████| 100/100 [02:53<00:00,  1.73s/it]
Categorising hallucinations: 100%|██████████| 100/100 [02:01<00:00,  1.21s/it]
Categorising hallucinations: 100%|██████████| 100/100 [01:51<00:00,  1.11s/it]
Categorising hallucinations: 100%|██████████| 100/100 [02:09<00:00,  1.29s/it]
Categorising hallucinations: 100%|██████████| 100/100 [02:22<00:00,  1.42s/it]
Categorising hallucinations: 100%|██████████| 100/100 [16:44<00:00, 10.04s/it] 
Categorising hallucinations: 100%|██████████| 100/100 [26:54<00:00, 16.15s/it]  
Categorising hallucinations: 100%|██████████| 100/100 [05:47<00:00,  3.47s/it]
Categorising hallucinations: 100%|██████████| 100/100 [03:01<00:00,  1.81s/it]
Categorising hallucinations: 100%|██████████| 100/100 [02:56<00:00,  1.76s/it]
Categorising hallucinations: 100%|██████████| 100/100 [02:49<00:00,  1.70s/it]
Categorising hallucinations: 100%|██████████| 100

Error: Expecting value: line 1 column 19 (char 18)
Retrying now...


Categorising hallucinations: 100%|██████████| 100/100 [03:02<00:00,  1.82s/it]
Categorising hallucinations: 100%|██████████| 100/100 [02:45<00:00,  1.65s/it]
Categorising hallucinations: 100%|██████████| 100/100 [02:28<00:00,  1.49s/it]
Categorising hallucinations: 100%|██████████| 100/100 [02:18<00:00,  1.38s/it]
Categorising hallucinations: 100%|██████████| 100/100 [02:11<00:00,  1.31s/it]
Categorising hallucinations: 100%|██████████| 100/100 [02:44<00:00,  1.64s/it]
Categorising hallucinations: 100%|██████████| 100/100 [02:59<00:00,  1.80s/it]
Categorising hallucinations: 100%|██████████| 100/100 [02:39<00:00,  1.60s/it]
Categorising hallucinations: 100%|██████████| 100/100 [02:34<00:00,  1.54s/it]


In [92]:
# The output directory does not depend on the ID, so resolve and create it once.
output_dir = "updated_unique_responses" if not attn_mask_block else "updated_unique_responses_block_top30"
os.makedirs(output_dir, exist_ok=True)

# zip() silently stops at the shorter input, so check that the frames still line up with `ids`.
if len(lst_dfs) != len(ids):
    raise ValueError(
        f"Expected one DataFrame per ID, got {len(lst_dfs)} DataFrames for {len(ids)} IDs"
    )

for df_single, id_single in zip(lst_dfs, ids):
    # Group by category and convert each group to a list of dicts
    json_output = {
        category: group.drop(columns="category").to_dict(orient="records")
        for category, group in df_single.groupby("category")
    }

    file_id = str(id_single)
    output_path = os.path.join(output_dir, f"{file_id}.json")

    # ensure_ascii=False keeps non-ASCII text readable, matching the writer in save_all_contexts_for_id.
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(json_output, f, ensure_ascii=False, indent=4)
