This notebook was used to add hallucination-type labels to the CSV files in `labeled_datasets/` using an ensemble of LLM judges.

In [4]:
# Prompt template sent to every judge model. `{text}` is filled in with
# str.format by get_llm_classification. The six numbered categories must stay
# in sync with the number -> name mapping in classify_hallucination_type.
classification_prompt = """
    You are a helpful assistant that classifies hallucination types.
    You will be given a text and you need to classify it into one of the following categories:

    1 - Entity-error Hallucination. This type of hallucination refers to the situations where the generated text of LLMs contains erroneous entities, such
    as person, date, location, and object, that contradict
    with the world knowledge.
    
    2 - Relation-error Hallucination. This type of
    hallucination refers to the generated text of LLMs
    contains wrong relations between entities such as
    quantitative and chronological relation.
    
    3 - Incompleteness Hallucination. LLMs might
    exhibit incomplete output when generating lengthy
    or listed responses. This hallucination arises when
    LLMs are asked about aggregated facts and they
    fail to reserve the factual completeness.
    
    4 - Outdatedness Hallucination. This type of
    hallucination refers to situations where the generated content of LLMs is outdated for the present
    moment, but was correct at some point in the past.
    This issue arises primarily due to the fact that most
    LLMs were trained on time-limited corpora
    
    5 - Overclaim Hallucination. This type of hallucination means that the statement expressed in
    the generated text of LLMs is beyond the scale of
    factual knowledge 
    
    6 - Unverifiability Hallucination. In some cases,
    the information generated by LLMs cannot be verified by available information sources.
    
    Only reply with the category number, no other text.
    
    Input:
    {text}
"""

In [5]:
import os
import pandas as pd
from typing import List
from together import Together
from openai import OpenAI
import anthropic
from collections import Counter
from tqdm import tqdm
from dotenv import load_dotenv

# Load variables from a .env file into the process environment before any
# client is created (presumably the provider API keys - confirm).
load_dotenv()

def get_llm_classification(text: str, client, model_type: str, max_retries: int = 1) -> str:
    """Ask a single LLM judge to classify ``text`` and return its raw reply.

    Args:
        text: Text to classify; substituted into ``classification_prompt``.
        client: SDK client object matching ``model_type``.
        model_type: Provider to call: ``"openai"``, ``"anthropic"`` or
            ``"together"``.
        max_retries: Number of extra attempts made after the first failure.

    Returns:
        The model's reply text, unparsed. If every attempt fails (an exception
        is raised or the model returns no text) the string ``"1"`` is returned
        so the ensemble can keep running. Note that this fallback counts as a
        vote for category 1 (entity error).

    Raises:
        ValueError: If ``model_type`` is not one of the supported providers.
    """
    supported_types = ("openai", "anthropic", "together")
    # Reject unknown providers up front: previously they fell through every
    # branch, made no API call at all and silently produced a "1" vote.
    if model_type not in supported_types:
        raise ValueError(
            f"Unsupported model_type {model_type!r}; expected one of {supported_types}"
        )

    # The prompt does not change between attempts, so build it once.
    prompt = classification_prompt.format(text=text)

    for attempt in range(max_retries + 1):
        try:
            if model_type == "openai":
                response = client.chat.completions.create(
                    model="gpt-4",
                    messages=[
                        {"role": "system", "content": "You are a helpful assistant."},
                        {"role": "user", "content": prompt},
                    ],
                )
                reply = response.choices[0].message.content

            elif model_type == "anthropic":
                response = client.messages.create(
                    model="claude-3-sonnet-20240229",
                    max_tokens=1000,
                    messages=[{"role": "user", "content": prompt}],
                )
                reply = response.content[0].text

            else:  # "together"
                response = client.chat.completions.create(
                    model="meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo-128K",
                    messages=[{"role": "user", "content": prompt}],
                )
                reply = response.choices[0].message.content

            # A reply without text would break the ``-> str`` contract and
            # crash callers that call ``.strip()``; treat it as a failed attempt.
            if reply is None:
                raise ValueError(f"{model_type} returned a response with no text")
            return reply

        except Exception as e:
            if attempt == max_retries:
                print(f"Failed {model_type} call after {max_retries} retries: {e}")
                return "1"  # Default to entity error if all retries fail

    return "1"  # Only reached when max_retries is negative (no attempt made)

# Lazily-created (client, model_type) pairs, shared by every classification call.
_ENSEMBLE_CLIENTS = None


def _get_ensemble_clients():
    """Create the three judge clients on first use and reuse them afterwards."""
    global _ENSEMBLE_CLIENTS
    if _ENSEMBLE_CLIENTS is None:
        _ENSEMBLE_CLIENTS = (
            (OpenAI(), "openai"),
            (anthropic.Anthropic(), "anthropic"),
            (Together(), "together"),
        )
    return _ENSEMBLE_CLIENTS


def _parse_category(response) -> int:
    """Extract the category number from one judge's raw reply (1 if none found)."""
    import re

    reply = "" if response is None else str(response).strip()
    try:
        # A bare integer reply is taken as-is, including out-of-range values.
        return int(reply)
    except ValueError:
        pass

    # Judges sometimes add text around the number ("2.", "Category: 3",
    # "4 - Outdatedness ..."); take the first standalone digit in 1-6.
    match = re.search(r"\b[1-6]\b", reply)
    if match:
        return int(match.group())

    print(f"Could not parse response '{response}' as integer, defaulting to 1")
    return 1


def classify_hallucination_type(text: str) -> str:
    """Classify the hallucination type of ``text`` by majority vote of three LLMs.

    Each judge (OpenAI, Anthropic, Together - in that order) is queried through
    ``get_llm_classification`` and its reply is reduced to a category number.
    The most common number wins; when all three judges disagree the vote of
    the first judge in the list is used.

    Args:
        text: The text to classify.

    Returns:
        The name of the winning category, or ``"Other hallucination"`` when
        the winning number is not one of the six known categories.
    """
    votes = [
        _parse_category(get_llm_classification(text, client, model_type))
        for client, model_type in _get_ensemble_clients()
    ]

    # There is always exactly one vote per judge, so the list is never empty.
    majority_vote = Counter(votes).most_common(1)[0][0]

    hallucination_types = {
        1: "Entity-error Hallucination",
        2: "Relation-error Hallucination",
        3: "Incompleteness Hallucination",
        4: "Outdatedness Hallucination",
        5: "Overclaim Hallucination",
        6: "Unverifiability Hallucination",
    }

    return hallucination_types.get(majority_vote, "Other hallucination")

def process_csv_files(directory: str):
    """Add an ensemble hallucination-type column to every CSV under ``directory``.

    Walks ``directory`` recursively. Each ``.csv`` file that does not already
    contain a ``hallucination_type_realized_ensemble`` column gets that column
    added: rows whose ``label`` equals ``'hallucinated'`` are classified with
    ``classify_hallucination_type`` (using their ``reference`` and ``output``
    values), all other rows are left empty. The file is then overwritten in
    place.

    Sub-directories whose own name contains ``old`` (case-insensitive) are
    skipped together with everything below them. Only names below
    ``directory`` are checked, so the location of ``directory`` itself never
    causes files to be skipped.

    Files lacking any of the ``label``, ``reference`` or ``output`` columns
    are reported and left untouched.

    Args:
        directory: Root folder to search for CSV files.
    """
    target_column = 'hallucination_type_realized_ensemble'
    required_columns = ('label', 'reference', 'output')

    for root, dirs, files in os.walk(directory):
        # Prune in place so os.walk never descends into "old" folders.
        # Checking directory names (not the full path) avoids skipping
        # everything when the start path happens to contain "old".
        dirs[:] = [d for d in dirs if 'old' not in d.lower()]

        for file in tqdm(files, desc="Processing files"):
            if not file.endswith('.csv'):
                continue

            file_path = os.path.join(root, file)
            df = pd.read_csv(file_path)

            if target_column in df.columns:
                print(f"Skipping {file_path} - column already exists")
                continue

            # Report unusable files instead of aborting the whole run.
            missing = [col for col in required_columns if col not in df.columns]
            if missing:
                print(f"Skipping {file_path} - missing columns: {', '.join(missing)}")
                continue

            # Add new column with hallucination type classifications
            df[target_column] = None
            hallucinated_rows = df[df['label'] == 'hallucinated']

            # Process each hallucinated row
            for idx, row in tqdm(hallucinated_rows.iterrows(), total=len(hallucinated_rows), desc=f"Processing rows in {file}"):
                df.loc[idx, target_column] = classify_hallucination_type(
                    f"Reference: {row['reference']}\nOutput: {row['output']}"
                )

            df.to_csv(file_path, index=False)
            print(f"Processed {file_path}")


In [None]:
# Usage example: label every CSV found under labeled_datasets/ (searched
# recursively). Files are rewritten in place, and each hallucinated row
# triggers calls to the LLM judges.
process_csv_files("labeled_datasets/")