In [1]:
import os
import pandas as pd
from dotenv import load_dotenv
import sys
sys.path.append('..')

load_dotenv()

True

# Test if the csv files contain unreadable characters

In [None]:
def contains_unreadable_characters(file_path):
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            file.read()
        return False
    except UnicodeDecodeError:
        return True

def test_csv_for_unreadable_characters(directory):
    for root, _, files in os.walk(directory):
        for file in files:
            if file.endswith('.csv'):
                file_path = os.path.join(root, file)
                if contains_unreadable_characters(file_path):
                    print(f"File {file_path} contains unreadable characters.")

In [None]:
# test_csv_for_unreadable_characters('labeled_datasets/')

# Combine the csv files in a directory

In [None]:
def combine_csvs(directory, output_file):
    combined_df = pd.DataFrame()
    for root, _, files in os.walk(directory):
        for file in files:
            if file.endswith('.csv'):
                file_path = os.path.join(root, file)
                df = pd.read_csv(file_path)
                combined_df = pd.concat([combined_df, df], ignore_index=True)
    combined_df.to_csv(output_file, index=False)

In [None]:
# combine_csvs('labeled_datasets/gpt-4o-hallucinations/synthetic/even-split-of-hallucinations-and-factuals', 'combined_output/gpt-4o-synthetic-even.csv')

# Upload the csv files to Labelbox

In [3]:
def labelbox_upload(csv_file_path, dataset_name):
    import labelbox as lb

    client = lb.Client(api_key=os.getenv("LABELBOX_API_KEY"))

    dataset = client.create_dataset(name=dataset_name)

    # Read the CSV file
    df = pd.read_csv(csv_file_path)

    # Create assets list
    assets = []
    for idx, row in df.iterrows():
        asset = {
            "row_data": "Question: " + str(row['input']) + '\n' + "Reference: " + str(row['reference']) + '\n' + "Output: " + str(row['output']),
            "global_key": f"{dataset_name}-{idx:04d}",
            "media_type": "TEXT",
            "metadata_fields": [
                {
                    "schema_id": "cko8s9r5v0001h2dk9elqdidh",
                    "value": "synthetic" if row['synthetic'] == 'True' else 'non_synthetic',
                    "language": row['language'],
                    "rag_model": row['rag_model'],
                    "force_even_split": row['force_even_split'],
                    "synthetic": row['synthetic'],
                }
            ],
            "attachments": [
                {
                    "type": "RAW_TEXT",
                    "value": "Question: " + str(row['input']) + '\n' + "Reference: " + str(row['reference']) + '\n' + "Output: " + str(row['output'])
                }
            ]
        }
        assets.append(asset)

    # Bulk add data rows to the dataset
    task = dataset.create_data_rows(assets)


    task.wait_till_done()
    print(task.errors)

In [None]:
def labelbox_uploads():
    labelbox_upload('combined_datasets_for_evals_rd3/non_synthetic_hallucinations_all_languages.csv', 'Non-synthetic hallucinations all languages')
    labelbox_upload('combined_datasets_for_evals_rd3/synthetic_hallucinations_all_languages.csv', 'Synthetic hallucinations all languages')
    labelbox_upload('combined_datasets_for_evals_rd3/non_synthetic_hallucinations_english.csv', 'Non-synthetic hallucinations english')
    labelbox_upload('combined_datasets_for_evals_rd3/synthetic_hallucinations_english.csv', 'Synthetic hallucinations english')
    labelbox_upload('combined_datasets_for_evals_rd3/non_synthetic_hallucinations_international.csv', 'Non-synthetic hallucinations international')
    labelbox_upload('combined_datasets_for_evals_rd3/synthetic_hallucinations_international.csv', 'Synthetic hallucinations international')

# Append Metadata to Labelbox

In [25]:
import os
import labelbox as lb
import pandas as pd

client = lb.Client(api_key=os.getenv("LABELBOX_API_KEY"))

# dataset = client.create_dataset(name=dataset_name)
dataset = client.get_dataset("your dataset name")
metadata_ontology = client.get_data_row_metadata_ontology()
tag_schema = metadata_ontology.get_by_name("tag")

labeled_df = pd.read_json('labelbox_datasets/Export v2 project - Hallucination Evaluation 3 - 12_11_2024.ndjson', lines=True)

# print(tag_schema.uid)

print(labeled_df.iloc[0]['data_row'])
synthetic_df = pd.read_csv('combined_datasets_for_evals_rd2/synthetic_hallucinations_all_languages.csv')
non_synthetic_df = pd.read_csv('combined_datasets_for_evals_rd2/non_synthetic_hallucinations_all_languages.csv')

synthetic_data_rows = []
non_synthetic_data_rows = []

from tqdm import tqdm
for index, row in tqdm(labeled_df.iterrows(), total=len(labeled_df), desc="Processing rows"):
    id = row['data_row']['id']
    text = row['data_row']["row_data"]
    
    # Parse the concatenated text back into its components
    question_start = text.find("Question: ") + len("Question: ")
    reference_start = text.find("Reference: ") + len("Reference: ")
    output_start = text.find("Output: ") + len("Output: ")
    
    question = text[question_start:text.find("\n", question_start)]
    reference = text[reference_start:text.find("\n", reference_start)]
    output = text[output_start:]
    
    # Look for exact match of output in both dataframes
    synthetic_match = synthetic_df[synthetic_df['output'] == output]
    non_synthetic_match = non_synthetic_df[non_synthetic_df['output'] == output]
    
    # Get label if found in either dataset
    label = None
    if not synthetic_match.empty:
        label = synthetic_match.iloc[0]['label']
        explanation = synthetic_match.iloc[0]['explanation_mistral-large-latest']
        match_type = 'synthetic'
    if not non_synthetic_match.empty:
        label = non_synthetic_match.iloc[0]['label']
        explanation = non_synthetic_match.iloc[0]['explanation_mistral-large-latest']
        match_type = 'non_synthetic'
        
    if label is None:
        print(f"No label found for {output}")
    else:
        # print(f"Label found for {output}: {label}")
        data = {
            "key": lb.UniqueId(id),
            "metadata_fields": [
                lb.DataRowMetadataField(
                    schema_id=tag_schema.uid,
                    value=label + ' ' + explanation
                ),
            ]
        }
        if match_type == 'synthetic':
            synthetic_data_rows.append(data)
        else:
            non_synthetic_data_rows.append(data)

non_synthetic_dataset = client.get_dataset("cm47cjprz00tr0722gg155z9b")
non_synthetic_dataset.upsert_data_rows(non_synthetic_data_rows)

synthetic_dataset = client.get_dataset("cm47c6top00qr07945g0rdn3r")
synthetic_dataset.upsert_data_rows(synthetic_data_rows)

{'id': 'cm47c7yjx5z5307229b40jlcg', 'global_key': 'synthetic-0016', 'row_data': 'Question: このページは何に使用しますか？\nReference: ã\\x81\\x93ã\\x81®ã\\x83\\x9aã\\x83¼ã\\x82¸ã\\x82\\x92ä½¿ç\\x94¨ã\\x81\\x97ã\\x81¦ã\\x80\\x81ç\\x94\\x9fæ\\x88\\x90AI ã\\x82¨ã\\x83¼ã\\x82¸ã\\x82§ã\\x83³ã\\x83\\x88 ã\\x83\\x81ã\\x83¥ã\\x83¼ã\\x83\\x88ã\\x83ªã\\x82¢ã\\x83« (æ\\x97§ç§° AI ã\\x82¯ã\\x83\\x83ã\\x82¯ã\\x83\\x96ã\\x83\\x83ã\\x82¯) ã\\x82\\x92ã\\x83\\x8aã\\x83\\x93ã\\x82²ã\\x83¼ã\\x83\\x88ã\\x81\\x97ã\\x81¾ã\\x81\\x99ã\\x80\\x82ç«¯ã\\x81\\x8bã\\x82\\x89ç«¯ã\\x81¾ã\\x81§è¿½ã\\x81\\x84ã\\x81\\x8bã\\x81\\x91ã\\x82\\x8bã\\x81\\x8bã\\x80\\x81è\\x88\\x88å\\x91³ã\\x81®ã\\x81\\x82ã\\x82\\x8bé\\xa0\\x98å\\x9f\\x9fã\\x81«é£\\x9bã\\x81³è¾¼ã\\x82\\x93ã\\x81§ã\\x81\\x8fã\\x81\\xa0ã\\x81\\x95ã\\x81\\x84ã\\x80\\x82\nOutput: 生成AIエージェント チュートリアル（旧称 AI クックブック）をナビゲートするために使用します。'}


Processing rows:  14%|█▍        | 1440/10208 [00:01<00:05, 1500.07it/s]

No label found for nan


Processing rows:  22%|██▏       | 2199/10208 [00:01<00:05, 1493.57it/s]

No label found for nan
No label found for nan


Processing rows:  25%|██▍       | 2503/10208 [00:01<00:05, 1497.27it/s]

No label found for nan


Processing rows:  32%|███▏      | 3271/10208 [00:02<00:04, 1519.07it/s]

No label found for nan


Processing rows:  59%|█████▉    | 6042/10208 [00:04<00:02, 1515.62it/s]

No label found for nan
No label found for nan


Processing rows:  64%|██████▎   | 6496/10208 [00:04<00:02, 1474.53it/s]

No label found for nan
No label found for nan


Processing rows:  71%|███████   | 7247/10208 [00:04<00:01, 1484.72it/s]

No label found for nan


Processing rows:  93%|█████████▎| 9540/10208 [00:06<00:00, 1506.47it/s]

No label found for nan


Processing rows: 100%|██████████| 10208/10208 [00:06<00:00, 1495.29it/s]


<DataUpsertTask ID: cdtopxefh073800mhm4kcuwqs>

# Prepare Data for Labelbox

In [42]:
import uuid

def labelbox_upload(csv_file_path, dataset_name, sample_size = None):
    import labelbox as lb

    # Read the CSV file
    df_raw = pd.read_csv(csv_file_path)
    df_filtered = df_raw[df_raw['label'] != 'NOT_PARSABLE']
    
    if sample_size is not None:
        # Get equal samples for each combination of label and synthetic
        samples = []

        # Get counts for each combination, excluding NOT_PARSABLE
        counts = df_filtered.groupby(['label', 'synthetic']).size()
        min_count = min(counts)
        target_per_group = min(min_count, 25) # 25 = 100/4 groups

        samples = []
        for label in df_filtered['label'].unique():
            for synthetic in df_filtered['synthetic'].unique():
                subset = df_filtered[(df_filtered['label'] == label) & (df_filtered['synthetic'] == synthetic)]
                if len(subset) > 0:
                    samples.append(subset.sample(n=target_per_group, random_state=31))
        df = pd.concat(samples)
    else:
        df = df_filtered.copy()
        
    
    df['common_explanation'] = df.apply(lambda row: row['explanation_gpt-4o'] if row['label_gpt-4o'] == row['label'] else row['explanation_claude-3-5-sonnet-latest'], axis=1)


    client = lb.Client(api_key=os.getenv("LABELBOX_API_KEY"))

    dataset = client.create_dataset(name=dataset_name)

    # Add data rows with metadata
    data_rows = []
    for _, row in df.iterrows():
            
        # Create the data row
        data_row = {
            "row_data": "Question: " + str(row['input']) + '\n\n' + "Reference: " + str(row['reference']) + '\n\n' + "Output: " + str(row['output']),
            "external_id": str(uuid.uuid4()),  # Unique ID for the row
        }
        data_rows.append(data_row)


    # Bulk add data rows to the dataset
    task = dataset.create_data_rows(data_rows)

    task.wait_till_done()
    print(task.errors)

In [43]:
labelbox_upload('temp/english_only.csv', 'English hallucination sample - no labels', 100)

  df_raw = pd.read_csv(csv_file_path)


None


# Upload all rows to Labelbox

In [8]:
import uuid

def labelbox_upload(csv_file_path, dataset_name, sample_size = None):
    import labelbox as lb

    # Read the CSV file
    df_raw = pd.read_csv(csv_file_path)
    df = df_raw[df_raw['label'] != 'NOT_PARSABLE'].copy()
    
    df['common_explanation'] = df.apply(lambda row: row['explanation_gpt-4o'] if row['label_gpt-4o'] == row['label'] else row['explanation_claude-3-5-sonnet-latest'], axis=1)

    client = lb.Client(api_key=os.getenv("LABELBOX_API_KEY"))

    dataset = client.create_dataset(name=dataset_name)

    # Add data rows with metadata
    data_rows = []
    for _, row in df.iterrows():
            
        # Create the data row
        data_row = {
            "row_data": "Question: " + str(row['input']) + '\n\n' + "Reference: " + str(row['reference']) + '\n\n' + "Output: " + str(row['output']) + '\n\n' + "Generated label: " + str(row['label']) + '\n\n' + "Explanation: " + str(row['common_explanation']),
            "external_id": str(uuid.uuid4()),  # Unique ID for the row
        }
        data_rows.append(data_row)


    # Bulk add data rows to the dataset
    task = dataset.create_data_rows(data_rows)

    task.wait_till_done()
    print(task.errors)

In [9]:
labelbox_upload('temp/all_datasets.csv', '1/4 - full dataset')

  df_raw = pd.read_csv(csv_file_path)


None


# Combine Labelbox Data and existing data


In [53]:
import os
import labelbox as lb
import pandas as pd

client = lb.Client(api_key=os.getenv("LABELBOX_API_KEY"))

# dataset = client.create_dataset(name=dataset_name)
dataset = client.get_dataset("cm574n8g900ib0782k19g4ce7")
labeled_df = pd.read_csv('temp/english_only.csv')

new_data_rows = []

for row in dataset.data_rows():
    # Split row_data back into input, output, reference
    row_data = row.row_data
    
    # Extract the fields using string splitting
    input_start = row_data.find("Question: ") + len("Question: ")
    input_end = row_data.find("\n\n", input_start)
    input_text = row_data[input_start:input_end]
    
    ref_start = row_data.find("Reference: ") + len("Reference: ")
    ref_end = row_data.find("\n\n", ref_start) 
    reference = row_data[ref_start:ref_end]
    
    output_start = row_data.find("Output: ") + len("Output: ")
    output = row_data[output_start:]
    
    # Find matching row in labeled_df by comparing input, output and reference
    matching_row = labeled_df[
        (labeled_df['input'] == input_text) & 
        (labeled_df['output'] == output) &
        (labeled_df['reference'] == reference)
    ]
    
    # If match found, get the label
    if not matching_row.empty:
        label = matching_row['label'].iloc[0]
        row_data = row_data + '\n\n' + "Generated label: " + label
        new_id = str(uuid.uuid4())
        row.update(row_data=row_data, global_key=new_id)
    
    # new_data_row = {
    #     "row_data": row['row_data'],
    #     "external_id": row['external_id'],
    #     "metadata_fields": metadata_fields
    # }
    # new_data_rows.append(new_data_row)

# dataset.upsert_data_rows(new_data_rows)

  labeled_df = pd.read_csv('temp/english_only.csv')


# Create tuning data without full prompt

In [5]:
import pandas as pd
from phoenix.evals import HALLUCINATION_PROMPT_TEMPLATE

# Read the training and validation files
train_df = pd.read_csv('train_Llama-3.2-1B-Instruct_en_tuning_data.csv')
val_df = pd.read_csv('validation_Llama-3.2-1B-Instruct_en_tuning_data.csv')

instruction_str = "Query: {input}\n\nReference: {reference}\n\nAnswer: {output}"

def create_tuning_prompt(row):
    return instruction_str.format(
        input=row['input'],
        output=row['output'],
        reference=row['reference']
    )

# Add tuning_data column to both dataframes
train_df['tuning_data_no_prompt'] = train_df.apply(create_tuning_prompt, axis=1)
val_df['tuning_data_no_prompt'] = val_df.apply(create_tuning_prompt, axis=1)

# Save the updated dataframes
train_df.to_csv('train_Llama-3.2-1B-Instruct_en_tuning_data.csv', index=False)
val_df.to_csv('validation_Llama-3.2-1B-Instruct_en_tuning_data.csv', index=False)

print("Added tuning_data column to training and validation files")


Added tuning_data column to training and validation files


# Add rejected_label column to training file

In [2]:
import pandas as pd

# Read the training and validation files
train_df = pd.read_csv('train_Llama-3.2-1B-Instruct_en_tuning_data.csv')

# Add tuning_data column to both dataframes
train_df['rejected_label'] = train_df['label'].apply(lambda x: 'factual' if x == 'hallucinated' else 'hallucinated')

# Save the updated dataframes
train_df.to_csv('train_Llama-3.2-1B-Instruct_en_tuning_data.csv', index=False)

print("Added rejected_label column to training file")

Added tuning_data column to training and validation files


# Remove duplicates from all datasets

In [10]:
import pandas as pd
import os

# Get all CSV files in temp directory
csv_files = [f for f in os.listdir('temp') if f.endswith('.csv')]

for csv_file in csv_files:
    print(f"\nProcessing {csv_file}...")
    
    # Read the CSV file
    df = pd.read_csv(f'temp/{csv_file}')
    
    # Drop duplicates based on input, reference, and output columns combined
    df_unique = df.drop_duplicates(subset=['input', 'reference', 'output'])
    
    print(f"Original number of rows: {len(df)}")
    print(f"Number of rows after removing duplicates: {len(df_unique)}")
    print(f"Number of duplicate rows removed: {len(df) - len(df_unique)}")
    
    # Save unique rows to new file with _unique suffix
    output_file = f'temp/{csv_file.replace(".csv", "_unique.csv")}'
    df_unique.to_csv(output_file, index=False)
    print(f"Saved unique rows to {output_file}")



Processing synthetic_english.csv...
Original number of rows: 25968
Number of rows after removing duplicates: 16366
Number of duplicate rows removed: 9602
Saved unique rows to temp/synthetic_english_unique.csv

Processing non_synthetic_english.csv...
Original number of rows: 21906
Number of rows after removing duplicates: 10871
Number of duplicate rows removed: 11035
Saved unique rows to temp/non_synthetic_english_unique.csv

Processing synthetic_only.csv...


  df = pd.read_csv(f'temp/{csv_file}')


Original number of rows: 40188
Number of rows after removing duplicates: 30509
Number of duplicate rows removed: 9679
Saved unique rows to temp/synthetic_only_unique.csv

Processing non_synthetic_only.csv...
Original number of rows: 29446
Number of rows after removing duplicates: 18074
Number of duplicate rows removed: 11372
Saved unique rows to temp/non_synthetic_only_unique.csv

Processing english_only.csv...


  df = pd.read_csv(f'temp/{csv_file}')


Original number of rows: 47874
Number of rows after removing duplicates: 26934
Number of duplicate rows removed: 20940
Saved unique rows to temp/english_only_unique.csv

Processing all_datasets.csv...


  df = pd.read_csv(f'temp/{csv_file}')


Original number of rows: 69634
Number of rows after removing duplicates: 48105
Number of duplicate rows removed: 21529
Saved unique rows to temp/all_datasets_unique.csv


# Prep csv for together instruct training

In [14]:
# Read the dataset
df = pd.read_csv('temp/all_datasets_unique.csv', low_memory=False)

# Split into synthetic and non-synthetic
synthetic_df = df[df['synthetic'] == True]
non_synthetic_df = df[df['synthetic'] == False]

# For synthetic data, get equal numbers of hallucinated and factual
synthetic_hallucinated = synthetic_df[synthetic_df['label'] == 'hallucinated']
synthetic_factual = synthetic_df[synthetic_df['label'] == 'factual']
min_synthetic_count = min(len(synthetic_hallucinated), len(synthetic_factual))
balanced_synthetic = pd.concat([
    synthetic_hallucinated.sample(n=min_synthetic_count, random_state=42),
    synthetic_factual.sample(n=min_synthetic_count, random_state=42)
])

# For non-synthetic data, get equal numbers of hallucinated and factual
non_synthetic_hallucinated = non_synthetic_df[non_synthetic_df['label'] == 'hallucinated'] 
non_synthetic_factual = non_synthetic_df[non_synthetic_df['label'] == 'factual']
min_non_synthetic_count = min(len(non_synthetic_hallucinated), len(non_synthetic_factual))
balanced_non_synthetic = pd.concat([
    non_synthetic_hallucinated.sample(n=min_non_synthetic_count, random_state=42),
    non_synthetic_factual.sample(n=min_non_synthetic_count, random_state=42)
])

# Combine balanced datasets
balanced_df = pd.concat([balanced_synthetic, balanced_non_synthetic])

# Print statistics about the balanced dataset
print("Balanced Dataset Statistics:")
print(f"Total rows: {len(balanced_df)}")
print("\nSynthetic data distribution:")
print(balanced_df[balanced_df['synthetic'] == True]['label'].value_counts())
print("\nNon-synthetic data distribution:")
print(balanced_df[balanced_df['synthetic'] == False]['label'].value_counts())

# Save balanced dataset
# balanced_df.to_csv('temp/balanced_dataset.csv', index=False)


Balanced Dataset Statistics:
Total rows: 34344

Synthetic data distribution:
label
hallucinated    14936
factual         14936
Name: count, dtype: int64

Non-synthetic data distribution:
label
hallucinated    2236
factual         2236
Name: count, dtype: int64


In [16]:
from sklearn.model_selection import train_test_split
import numpy as np

# Set random seed for reproducibility
np.random.seed(42)

# Calculate sizes for splits (70% train, 15% validation, 15% test)
train_size = 0.7
val_size = 0.15
test_size = 0.15

# Split data by label to maintain balance
hallucinated = balanced_df[balanced_df['label'] == 'hallucinated']
factual = balanced_df[balanced_df['label'] == 'factual']

# For each label, split into train/val/test
def split_stratified(df):
    # First split out test set
    train_val, test = train_test_split(df, test_size=test_size, random_state=42)
    
    # Then split remaining data into train/val
    # We need to adjust validation size to be relative to remaining data
    relative_val_size = val_size / (train_size + val_size)
    train, val = train_test_split(train_val, test_size=relative_val_size, random_state=42)
    
    return train, val, test

# Split each label group
hallucinated_train, hallucinated_val, hallucinated_test = split_stratified(hallucinated)
factual_train, factual_val, factual_test = split_stratified(factual)

# Combine splits while maintaining balance
train_df = pd.concat([hallucinated_train, factual_train])
val_df = pd.concat([hallucinated_val, factual_val]) 
test_df = pd.concat([hallucinated_test, factual_test])

# Shuffle each split
train_df = train_df.sample(frac=1, random_state=42).reset_index(drop=True)
val_df = val_df.sample(frac=1, random_state=42).reset_index(drop=True)
test_df = test_df.sample(frac=1, random_state=42).reset_index(drop=True)

# Print statistics about the splits
print("\nSplit Statistics:")
print(f"Training set size: {len(train_df)}")
print(f"Validation set size: {len(val_df)}")
print(f"Test set size: {len(test_df)}")

print("\nTraining set label distribution:")
print(train_df['label'].value_counts())
print("\nValidation set label distribution:")
print(val_df['label'].value_counts())
print("\nTest set label distribution:")
print(test_df['label'].value_counts())

# Save splits to files
train_df.to_csv('temp/1-17/train.csv', index=False)
val_df.to_csv('temp/1-17/val.csv', index=False)
test_df.to_csv('temp/1-17/test.csv', index=False)



Split Statistics:
Training set size: 24040
Validation set size: 5152
Test set size: 5152

Training set label distribution:
label
factual         12020
hallucinated    12020
Name: count, dtype: int64

Validation set label distribution:
label
hallucinated    2576
factual         2576
Name: count, dtype: int64

Test set label distribution:
label
hallucinated    2576
factual         2576
Name: count, dtype: int64


In [18]:
import json
from phoenix.evals import HALLUCINATION_PROMPT_TEMPLATE

def convert_to_jsonl(input_csv, output_jsonl):
    # Read CSV
    df = pd.read_csv(input_csv)
    
    # Open output file
    with open(output_jsonl, 'w') as f:
        # Process each row
        for _, row in df.iterrows():
            # Format prompt template with row data
            prompt = HALLUCINATION_PROMPT_TEMPLATE.template.format(
                input=row['input'],
                reference=row['reference'], 
                output=row['output']
            )
            
            # Create json object
            json_obj = {
                'prompt': prompt,
                'completion': row['label']
            }
            
            # Write to file
            f.write(json.dumps(json_obj) + '\n')

# Convert train and test sets
convert_to_jsonl('temp/1-17/train.csv', 'temp/1-17/train.jsonl')
convert_to_jsonl('temp/1-17/test.csv', 'temp/1-17/test.jsonl')
convert_to_jsonl('temp/1-17/val.csv', 'temp/1-17/val.jsonl')

print("Converted CSV files to JSONL format")


Converted CSV files to JSONL format


# Add Human Labels to Data File

In [1]:
import os
CSV_PATH = 'temp/all_datasets.csv'
LB_API_KEY = os.getenv("LABELBOX_API_KEY")

Download files from Labelbox


In [2]:
import labelbox

client = labelbox.Client(api_key = LB_API_KEY)
export_task = labelbox.ExportTask.get_task(client, "cm6h8l76000ww07x60726a28j")

# Stream the export using a callback function
def json_stream_handler(output: labelbox.BufferedJsonConverterOutput):
  print(output.json)

export_task.get_buffered_stream(stream_type=labelbox.StreamType.RESULT).start(stream_handler=json_stream_handler)

# Simplified usage
export_json = [data_row.json for data_row in export_task.get_buffered_stream()]

ValueError: Task cm6h8l76000ww07x60726a28j does not have a RESULT stream

Parse rows from Labelbox

In [3]:
parsed_data = []

for row in export_json:
    # Extract base data
    row_data = row['data_row']['row_data']
    # Parse the string to extract fields
    data_dict = {}
    
    # Split by newlines in case fields span multiple lines
    # Find the starting positions of each field
    question_pos = row_data.find('Question:')
    reference_pos = row_data.find('Reference:') 
    output_pos = row_data.find('Output:')
    label_pos = row_data.find('Generated label:')
    explanation_pos = row_data.find('Explanation:')
    
    # Create list of tuples with position and field name, sorted by position
    field_positions = []
    if question_pos != -1:
        field_positions.append((question_pos, 'question', 'Question:'))
    if reference_pos != -1:
        field_positions.append((reference_pos, 'reference', 'Reference:'))
    if output_pos != -1:
        field_positions.append((output_pos, 'answer', 'Output:'))
    if label_pos != -1:
        field_positions.append((label_pos, 'label', 'Generated label:'))
    if explanation_pos != -1:
        field_positions.append((explanation_pos, 'explanation', 'Explanation:'))
    
    field_positions.sort()  # Sort by position
    
    # Extract text between positions
    for i in range(len(field_positions)):
        start_pos, field_name, prefix = field_positions[i]
        # Get end position (either next field or end of string)
        end_pos = field_positions[i+1][0] if i < len(field_positions)-1 else len(row_data)
        
        # Extract and clean the text
        text = row_data[start_pos:end_pos]
        text = text.replace(prefix, '', 1).strip()  # Remove the prefix only once
        data_dict[field_name] = text
    
    # Set defaults for any missing fields
    data_dict.setdefault('question', '')
    data_dict.setdefault('reference', '')
    data_dict.setdefault('answer', '')
    data_dict.setdefault('label', '')
    data_dict.setdefault('explanation', '')
    
    # Extract hallucination label from classifications
    try:
        classifications = row['projects']['cm5lqrlnv06vw07xx4ialh3gf']['labels'][0]['annotations']['classifications']
        for classification in classifications:
            if classification['name'] == 'Human Hallucination Label':
                data_dict['hallucination_label'] = classification['radio_answer']['value']
                break
    except (KeyError, IndexError):
        data_dict['hallucination_label'] = None
        
    parsed_data.append(data_dict)

# Print first few entries to verify
print("First 3 parsed entries:")
for entry in parsed_data[:3]:
    print(entry)


First 3 parsed entries:
{'question': 'MongoDBのコレクションの構造を視覚的に説明した図を見せてもらえますか？', 'reference': 'コレクション: MongoDB はドキュメントをコレクションに保存します。 関係データベースのテーブルに類似しています。', 'answer': '申し訳ありませんが、この文脈からは視覚的な図を提供することはできません。コレクションは関係データベースのテーブルに似た構造で、ドキュメントを保存する場所であるということしか説明されていません。', 'label': 'factual', 'explanation': 'The query asks for a visual representation of the structure of a MongoDB collection. The reference text provides a brief description of a MongoDB collection, stating that it stores documents and is similar to a table in a relational database. However, it does not provide any visual representation or diagram. The answer correctly states that a visual diagram cannot be provided based on the given context and accurately describes the information available in the reference text, which is that a collection is similar to a table in a relational database and is a place where documents are stored. Therefore, the answer is consistent with the information provided in the reference text and does not 

Find matching rows in CSV and add human label

In [5]:
# Read the CSV file
import pandas as pd
df = pd.read_csv(CSV_PATH)

# Convert parsed_data to DataFrame for easier comparison
parsed_df = pd.DataFrame(parsed_data)

# Iterate through parsed data and update CSV rows
for parsed_row in parsed_data:
    matching_rows = df[
        (df['input'] == parsed_row['question']) & 
        (df['reference'] == parsed_row['reference']) &
        (df['output'] == parsed_row['answer'])
    ]
    
    if not matching_rows.empty and parsed_row.get('hallucination_label') is not None:
        # Update all matching rows with the hallucination label
        df.loc[matching_rows.index, 'human_label'] = parsed_row['hallucination_label']

# Save updated DataFrame back to CSV
df.to_csv(CSV_PATH, index=False)


  df = pd.read_csv(CSV_PATH)


Analyze Human Labels

In [6]:
# Display value counts of human labels
print("Value counts of human labels:")
print(df['human_label'].value_counts(dropna=False))

# Calculate disagreement between human labels and original labels
print("\nDisagreement analysis between human labels and original labels:")
# Filter out rows where human_label is NaN
df_valid = df[df['human_label'].notna()]
disagreements = (df_valid['human_label'] != df_valid['label']).sum()
total_human_labels = len(df_valid)
agreement = (df_valid['human_label'] == df_valid['label']).sum()

print(f"Total rows with human labels: {total_human_labels}")
print(f"Number of disagreements: {disagreements}")
print(f"Number of agreements: {agreement}")
print(f"Disagreement rate: {(disagreements/total_human_labels*100):.2f}%")

# Show confusion matrix of labels
print("\nDetailed breakdown of label comparisons:")
comparison_df = pd.crosstab(df_valid['label'], df_valid['human_label'], margins=True)
print(comparison_df)



Value counts of human labels:
human_label
NaN             58154
factual          7202
hallucinated     4278
Name: count, dtype: int64

Disagreement analysis between human labels and original labels:
Total rows with human labels: 11480
Number of disagreements: 563
Number of agreements: 10917
Disagreement rate: 4.90%

Detailed breakdown of label comparisons:
human_label   factual  hallucinated    All
label                                     
NOT_PARSABLE        2             5      7
factual          6975           331   7306
hallucinated      225          3942   4167
All              7202          4278  11480


# Combine human labels with labeled_datasets

In [3]:
import pandas as pd

parsed_data = []
export_json = pd.read_json('raw_human_labels.ndjson', lines=True)

for _, row in export_json.iterrows():
    # Extract base data
    row_data = row['data_row']['row_data']
    # Parse the string to extract fields
    data_dict = {}
    
    # Split by newlines in case fields span multiple lines
    # Find the starting positions of each field
    question_pos = row_data.find('Question:')
    reference_pos = row_data.find('Reference:') 
    output_pos = row_data.find('Output:')
    label_pos = row_data.find('Generated label:')
    explanation_pos = row_data.find('Explanation:')
    
    # Create list of tuples with position and field name, sorted by position
    field_positions = []
    if question_pos != -1:
        field_positions.append((question_pos, 'question', 'Question:'))
    if reference_pos != -1:
        field_positions.append((reference_pos, 'reference', 'Reference:'))
    if output_pos != -1:
        field_positions.append((output_pos, 'answer', 'Output:'))
    if label_pos != -1:
        field_positions.append((label_pos, 'label', 'Generated label:'))
    if explanation_pos != -1:
        field_positions.append((explanation_pos, 'explanation', 'Explanation:'))
    
    field_positions.sort()  # Sort by position
    
    # Extract text between positions
    for i in range(len(field_positions)):
        start_pos, field_name, prefix = field_positions[i]
        # Get end position (either next field or end of string)
        end_pos = field_positions[i+1][0] if i < len(field_positions)-1 else len(row_data)
        
        # Extract and clean the text
        text = row_data[start_pos:end_pos]
        text = text.replace(prefix, '', 1).strip()  # Remove the prefix only once
        data_dict[field_name] = text
    
    # Set defaults for any missing fields
    data_dict.setdefault('question', '')
    data_dict.setdefault('reference', '')
    data_dict.setdefault('answer', '')
    data_dict.setdefault('label', '')
    data_dict.setdefault('explanation', '')
    
    # Extract hallucination label from classifications
    try:
        classifications = row['projects']['cm5lqrlnv06vw07xx4ialh3gf']['labels'][0]['annotations']['classifications']
        for classification in classifications:
            if classification['name'] == 'Human Hallucination Label':
                data_dict['human_label'] = classification['radio_answer']['value']
                break
    except (KeyError, IndexError):
        data_dict['human_label'] = None
        
    parsed_data.append(data_dict)

# Print first few entries to verify
print("First 3 parsed entries:")
for entry in parsed_data[:3]:
    print(entry)


First 3 parsed entries:
{'question': 'MongoDBのコレクションの構造を視覚的に説明した図を見せてもらえますか？', 'reference': 'コレクション: MongoDB はドキュメントをコレクションに保存します。 関係データベースのテーブルに類似しています。', 'answer': '申し訳ありませんが、この文脈からは視覚的な図を提供することはできません。コレクションは関係データベースのテーブルに似た構造で、ドキュメントを保存する場所であるということしか説明されていません。', 'label': 'factual', 'explanation': 'The query asks for a visual representation of the structure of a MongoDB collection. The reference text provides a brief description of a MongoDB collection, stating that it stores documents and is similar to a table in a relational database. However, it does not provide any visual representation or diagram. The answer correctly states that a visual diagram cannot be provided based on the given context and accurately describes the information available in the reference text, which is that a collection is similar to a table in a relational database and is a place where documents are stored. Therefore, the answer is consistent with the information provided in the reference text and does not 

In [4]:
parsed_df = pd.DataFrame(parsed_data)
parsed_df.head()

Unnamed: 0,question,reference,answer,label,explanation,human_label
0,MongoDBのコレクションの構造を視覚的に説明した図を見せてもらえますか？,コレクション: MongoDB はドキュメントをコレクションに保存します。 関係データベース...,申し訳ありませんが、この文脈からは視覚的な図を提供することはできません。コレクションは関係デ...,factual,The query asks for a visual representation of ...,factual
1,この記事では何について述べていますか？,ã\x81\x93ã\x81®è¨\x98äº\x8bã\x81§ã\x81¯ã\x80\x...,機能の可用性に地域差がある機能の一覧。,factual,The query asks what the article is about. The ...,
2,Can you provide any images or examples of how ...,We believe that everyone should be able to rea...,"Based on the incomplete context provided, I ca...",factual,The query asks for images or examples of how l...,factual
3,When did Sabrina Karl join the Investopedia st...,Sabrina Karl joined the Investopedia staff in ...,April 2023,factual,To determine if the answer is factual or hallu...,factual
4,¿Cómo podría un algoritmo de aprendizaje autom...,Una cohort es un grupo de personas que compart...,El algoritmo podría utilizar técnicas de apren...,hallucinated,The query asks how a machine learning algorith...,factual


In [None]:
import os
import pandas as pd
from tqdm import tqdm

# Walk through all subdirectories in labeled_datasets
for root, dirs, files in os.walk('labeled_datasets'):
    # Process only CSV files
    csv_files = [f for f in files if f.endswith('.csv')]
    
    for csv_file in tqdm(csv_files, desc=f"Processing CSV files in {root}"):
        # Get full path to CSV file
        csv_path = os.path.join(root, csv_file)
        
        # Read the CSV file
        df = pd.read_csv(csv_path, low_memory=False)
        
        # Create a merge key for both dataframes
        parsed_df['merge_key'] = parsed_df['question'] + parsed_df['reference'] + parsed_df['answer'] + parsed_df['label']
        df['merge_key'] = df['input'] + df['reference'] + df['output'] + df['label']
        
        # Merge the human_label column based on the merge key
        df = df.merge(
            parsed_df[['merge_key', 'human_label']], 
            on='merge_key',
            how='left'
        )
        
        # Drop the temporary merge key
        df.drop('merge_key', axis=1, inplace=True)
        
        # Save back to CSV in same location
        df.to_csv(csv_path, index=False)
        print(f"Updated {csv_path}")


In [1]:
import pandas as pd

# Read the CSV files
non_synthetic_df = pd.read_csv('combined_datasets_for_evals/non_synthetic_hallucinations_all_languages.csv', low_memory=False)
synthetic_df = pd.read_csv('combined_datasets_for_evals/synthetic_hallucinations_all_languages.csv', low_memory=False)

# Combine dataframes
combined_df = pd.concat([non_synthetic_df, synthetic_df])

# Get total language counts across both datasets
print("Total language counts:")
print(combined_df['language'].value_counts())
print(f"\nTotal rows across both files: {len(combined_df)}")


Total language counts:
language
en    53961
pt     7518
ja     7414
ko     1164
es      984
fr      974
zh      140
Name: count, dtype: int64

Total rows across both files: 72155


In [5]:
import os
import pandas as pd
from glob import glob

# Get all CSV files recursively from labeled_datasets, excluding /old directory
csv_files = []
for root, dirs, files in os.walk('labeled_datasets'):
    if 'old' in root:
        continue
    for file in files:
        if file.endswith('.csv'):
            csv_files.append(os.path.join(root, file))

# Read and combine all CSV files
dfs = []
for csv_file in csv_files:
    df = pd.read_csv(csv_file, low_memory=False)
    dfs.append(df)
    
combined_df = pd.concat(dfs, ignore_index=True)

# Count unique rows based on input, output and reference columns
unique_df = combined_df.drop_duplicates(subset=['input', 'output', 'reference', 'label'])

print(f"\nTotal number of files merged: {len(csv_files)}")
print(f"Total number of rows: {len(combined_df)}")
print(f"Number of unique rows: {len(unique_df)}")
print("\nFiles included:")
for f in csv_files:
    print(f"- {f}")



Total number of files merged: 198
Total number of rows: 133761
Number of unique rows: 48335

Files included:
- labeled_datasets/gpt-4o-hallucinations/non_synthetic/ja/docs_databricks_com_gpt_4o_non_synthetic_gpt_4o_claude_3_5_sonnet_latest_ja.csv
- labeled_datasets/gpt-4o-hallucinations/non_synthetic/ja/www_mongodb_com_gpt_4o_non_synthetic_gpt_4o_claude_3_5_sonnet_latest_ja.csv
- labeled_datasets/gpt-4o-hallucinations/non_synthetic/ja/experienceleague_adobe_com_gpt_4o_non_synthetic_gpt_4o_claude_3_5_sonnet_latest_ja.csv
- labeled_datasets/gpt-4o-hallucinations/non_synthetic/pt/experienceleague_adobe_com_gpt_4o_non_synthetic_gpt_4o_claude_3_5_sonnet_latest_pt.csv
- labeled_datasets/gpt-4o-hallucinations/non_synthetic/pt/www_mongodb_com_gpt_4o_non_synthetic_gpt_4o_claude_3_5_sonnet_latest_pt.csv
- labeled_datasets/gpt-4o-hallucinations/non_synthetic/pt/docs_databricks_com_gpt_4o_non_synthetic_gpt_4o_claude_3_5_sonnet_latest_pt.csv
- labeled_datasets/gpt-4o-hallucinations/non_synthetic/k

# Testing 2 26

In [5]:
import pandas as pd

# Read and combine the international datasets
synthetic_path = 'combined_datasets_for_evals/synthetic_hallucinations_all_languages.csv'
non_synthetic_path = 'combined_datasets_for_evals/non_synthetic_hallucinations_all_languages.csv'

synthetic_df = pd.read_csv(synthetic_path, low_memory=False)
non_synthetic_df = pd.read_csv(non_synthetic_path, low_memory=False)

combined_df = pd.concat([synthetic_df, non_synthetic_df], ignore_index=True)

print(f"\nTotal rows in international datasets: {len(combined_df)}")
print(f"Synthetic rows: {len(synthetic_df)}")
print(f"Non-synthetic rows: {len(non_synthetic_df)}")

print(f"Total rows with human labels: {len(combined_df[combined_df['human_label'].notna()])}")
print(f"Total rows with hallucination label: {len(combined_df[combined_df['label'].notna()])}")


Total rows in international datasets: 48583
Synthetic rows: 30509
Non-synthetic rows: 18074
Total rows with human labels: 7191
Total rows with hallucination label: 48583
Total rows with both human and hallucination labels: 7191


In [2]:
import pandas as pd

# Read the JSONL file and convert to DataFrame
jsonl_path = '/Users/jgilhuly/Documents/dev/GitHub/dataset-generation-research/file-6c389e02-8925-4b4e-ad77-2d3bc5eb6d94.jsonl'
df = pd.read_json(jsonl_path, lines=True)

# Count duplicate prompts
duplicate_count = df['prompt'].duplicated().sum()
total_rows = len(df)

print(f"\nTotal rows in JSONL file: {total_rows}")
print(f"Number of rows with duplicate prompts: {duplicate_count}")
print(f"Percentage of duplicates: {(duplicate_count/total_rows)*100:.2f}%")



Total rows in JSONL file: 24040
Number of rows with duplicate prompts: 0
Percentage of duplicates: 0.00%


# Remove Duplicates

In [2]:
import os
import pandas as pd
from tqdm import tqdm

# Function to find all CSV files in a directory recursively
def find_csv_files(directory):
    csv_files = []
    for root, dirs, files in os.walk(directory):
        # Skip the 'old' directory
        if 'old' in root.split(os.sep):
            continue
        for file in files:
            if file.endswith('.csv'):
                csv_files.append(os.path.join(root, file))
    return csv_files

# Get all CSV files in the labeled_datasets directory
csv_files = find_csv_files('./labeled_datasets/')
print(f"Found {len(csv_files)} CSV files in the labeled_datasets directory")

# Read all CSV files into a list of DataFrames
all_dfs = []
for file_path in tqdm(csv_files, desc="Reading CSV files"):
    try:
        df = pd.read_csv(file_path, low_memory=False)
        df['source_file'] = file_path  # Add source file information
        all_dfs.append(df)
    except Exception as e:
        print(f"Error reading {file_path}: {e}")

if not all_dfs:
    print("No valid CSV files found or all files had errors.")
else:
    # Combine all DataFrames
    combined_df = pd.concat(all_dfs, ignore_index=True)
    
    # Check for duplicates based on content (excluding the source_file column)
    content_columns = [col for col in combined_df.columns if col != 'source_file']
    duplicated_mask = combined_df.duplicated(subset=content_columns, keep='first')
    duplicate_count = duplicated_mask.sum()
    
    print(f"\nTotal rows across all CSV files: {len(combined_df)}")
    print(f"Number of duplicate rows: {duplicate_count}")
    print(f"Percentage of duplicates: {(duplicate_count/len(combined_df))*100:.2f}%")
    
    # Ask user if they want to remove duplicates
    user_input = input("\nDo you want to remove duplicates from the CSV files? (yes/no): ")
    
    if user_input.lower() in ['yes', 'y']:
        # Group by source file and remove duplicates
        file_groups = combined_df.groupby('source_file')
        
        for file_path, group_df in tqdm(file_groups, desc="Processing files"):
            # Get the original DataFrame for this file
            original_df = group_df.drop(columns=['source_file'])
            
            # Find duplicates within this file
            duplicates_in_file = original_df.duplicated(keep='first')
            duplicate_count_in_file = duplicates_in_file.sum()
            
            if duplicate_count_in_file > 0:
                # Remove duplicates
                deduplicated_df = original_df.drop_duplicates(keep='first')
                
                # Write back to the file
                deduplicated_df.to_csv(file_path, index=False)
                print(f"Removed {duplicate_count_in_file} duplicates from {file_path}")
            else:
                print(f"No duplicates found in {file_path}")
        
        print("\nDuplicate removal complete!")
    else:
        print("\nNo changes were made to the CSV files.")


Found 198 CSV files in the labeled_datasets directory


Reading CSV files: 100%|██████████| 198/198 [00:02<00:00, 77.43it/s] 



Total rows across all CSV files: 133761
Number of duplicate rows: 59987
Percentage of duplicates: 44.85%


Processing files:   6%|▌         | 12/198 [00:00<00:03, 57.00it/s]

Removed 16 duplicates from ./labeled_datasets/claude-3-5-sonnet-latest-hallucinations/non_synthetic/en/docs_databricks_com_claude_3_5_sonnet_latest_non_synthetic_gpt_4o_claude_3_5_sonnet_latest_en_answer.csv
Removed 28 duplicates from ./labeled_datasets/claude-3-5-sonnet-latest-hallucinations/non_synthetic/en/earthobservatory_nasa_gov_claude_3_5_sonnet_latest_non_synthetic_gpt_4o_claude_3_5_sonnet_latest_en.csv
Removed 28 duplicates from ./labeled_datasets/claude-3-5-sonnet-latest-hallucinations/non_synthetic/en/earthobservatory_nasa_gov_claude_3_5_sonnet_latest_non_synthetic_gpt_4o_claude_3_5_sonnet_latest_en_question.csv
Removed 7 duplicates from ./labeled_datasets/claude-3-5-sonnet-latest-hallucinations/non_synthetic/en/experienceleague_adobe_com_claude_3_5_sonnet_latest_non_synthetic_gpt_4o_claude_3_5_sonnet_latest_en_answer.csv
Removed 163 duplicates from ./labeled_datasets/claude-3-5-sonnet-latest-hallucinations/non_synthetic/en/medlineplus_gov_claude_3_5_sonnet_latest_non_synthe

Processing files:  14%|█▍        | 28/198 [00:00<00:02, 71.77it/s]

Removed 88 duplicates from ./labeled_datasets/claude-3-5-sonnet-latest-hallucinations/non_synthetic/en/www_ncbi_nlm_nih_gov_claude_3_5_sonnet_latest_non_synthetic_gpt_4o_claude_3_5_sonnet_latest_en.csv
Removed 90 duplicates from ./labeled_datasets/claude-3-5-sonnet-latest-hallucinations/non_synthetic/en/www_ncbi_nlm_nih_gov_claude_3_5_sonnet_latest_non_synthetic_gpt_4o_claude_3_5_sonnet_latest_en_question.csv
Removed 115 duplicates from ./labeled_datasets/claude-3-5-sonnet-latest-hallucinations/non_synthetic/en/www_noaa_gov_claude_3_5_sonnet_latest_non_synthetic_gpt_4o_claude_3_5_sonnet_latest_en.csv
Removed 115 duplicates from ./labeled_datasets/claude-3-5-sonnet-latest-hallucinations/non_synthetic/en/www_noaa_gov_claude_3_5_sonnet_latest_non_synthetic_gpt_4o_claude_3_5_sonnet_latest_en_question.csv
Removed 9 duplicates from ./labeled_datasets/claude-3-5-sonnet-latest-hallucinations/non_synthetic/es/experienceleague_adobe_com_claude_3_5_sonnet_latest_non_synthetic_gpt_4o_claude_3_5_so

Processing files:  22%|██▏       | 44/198 [00:00<00:02, 70.18it/s]

Removed 209 duplicates from ./labeled_datasets/claude-3-5-sonnet-latest-hallucinations/synthetic/even-split-of-hallucinations-and-factuals/en/earthobservatory_nasa_gov_claude_3_5_sonnet_latest_synthetic_gpt_4o_claude_3_5_sonnet_latest_en_answer.csv
Removed 209 duplicates from ./labeled_datasets/claude-3-5-sonnet-latest-hallucinations/synthetic/even-split-of-hallucinations-and-factuals/en/earthobservatory_nasa_gov_claude_3_5_sonnet_latest_synthetic_gpt_4o_claude_3_5_sonnet_latest_en_question.csv
Removed 9 duplicates from ./labeled_datasets/claude-3-5-sonnet-latest-hallucinations/synthetic/even-split-of-hallucinations-and-factuals/en/earthobservatory_nasa_gov_claude_3_5_sonnet_latest_synthetic_mistral_large_latest_gpt_4o_en_even.csv
No duplicates found in ./labeled_datasets/claude-3-5-sonnet-latest-hallucinations/synthetic/even-split-of-hallucinations-and-factuals/en/experienceleague_adobe_com_claude_3_5_sonnet_latest_synthetic_gpt_4o_claude_3_5_sonnet_latest_en_question.csv
No duplicate

Processing files:  26%|██▋       | 52/198 [00:00<00:02, 63.43it/s]

Removed 315 duplicates from ./labeled_datasets/claude-3-5-sonnet-latest-hallucinations/synthetic/even-split-of-hallucinations-and-factuals/en/www_law_cornell_edu_claude_3_5_sonnet_latest_synthetic_gpt_4o_claude_3_5_sonnet_latest_en_answer.csv
Removed 315 duplicates from ./labeled_datasets/claude-3-5-sonnet-latest-hallucinations/synthetic/even-split-of-hallucinations-and-factuals/en/www_law_cornell_edu_claude_3_5_sonnet_latest_synthetic_gpt_4o_claude_3_5_sonnet_latest_en_question.csv
Removed 40 duplicates from ./labeled_datasets/claude-3-5-sonnet-latest-hallucinations/synthetic/even-split-of-hallucinations-and-factuals/en/www_law_cornell_edu_claude_3_5_sonnet_latest_synthetic_mistral_large_latest_gpt_4o_en_even.csv
No duplicates found in ./labeled_datasets/claude-3-5-sonnet-latest-hallucinations/synthetic/even-split-of-hallucinations-and-factuals/en/www_mongodb_com_claude_3_5_sonnet_latest_synthetic_gpt_4o_claude_3_5_sonnet_latest_en_question.csv
No duplicates found in ./labeled_dataset

Processing files:  43%|████▎     | 86/198 [00:01<00:00, 116.04it/s]

Removed 6 duplicates from ./labeled_datasets/claude-3-5-sonnet-latest-hallucinations/synthetic/even-split-of-hallucinations-and-factuals/fr/experienceleague_adobe_com_claude_3_5_sonnet_latest_synthetic_mistral_large_latest_gpt_4o_fr_even.csv
No duplicates found in ./labeled_datasets/claude-3-5-sonnet-latest-hallucinations/synthetic/even-split-of-hallucinations-and-factuals/ja/docs_databricks_com_claude_3_5_sonnet_latest_synthetic_gpt_4o_claude_3_5_sonnet_latest_ja_question.csv
No duplicates found in ./labeled_datasets/claude-3-5-sonnet-latest-hallucinations/synthetic/even-split-of-hallucinations-and-factuals/ja/docs_databricks_com_claude_3_5_sonnet_latest_synthetic_gpt_4o_claude_3_5_sonnet_latest_ja_question_old.csv
Removed 6 duplicates from ./labeled_datasets/claude-3-5-sonnet-latest-hallucinations/synthetic/even-split-of-hallucinations-and-factuals/ja/docs_databricks_com_claude_3_5_sonnet_latest_synthetic_mistral_large_latest_gpt_4o_ja_even.csv
No duplicates found in ./labeled_datase

Processing files:  50%|█████     | 99/198 [00:01<00:01, 91.85it/s] 

Removed 22 duplicates from ./labeled_datasets/gpt-4o-hallucinations/non_synthetic/en/pmc_ncbi_nlm_nih_gov_gpt_4o_non_synthetic_gpt_4o_claude_3_5_sonnet_latest_en.csv
Removed 22 duplicates from ./labeled_datasets/gpt-4o-hallucinations/non_synthetic/en/pmc_ncbi_nlm_nih_gov_gpt_4o_non_synthetic_gpt_4o_claude_3_5_sonnet_latest_en_question.csv
Removed 93 duplicates from ./labeled_datasets/gpt-4o-hallucinations/non_synthetic/en/www_investopedia_com_gpt_4o_non_synthetic_gpt_4o_claude_3_5_sonnet_latest_en.csv
Removed 94 duplicates from ./labeled_datasets/gpt-4o-hallucinations/non_synthetic/en/www_investopedia_com_gpt_4o_non_synthetic_gpt_4o_claude_3_5_sonnet_latest_en_question.csv
Removed 123 duplicates from ./labeled_datasets/gpt-4o-hallucinations/non_synthetic/en/www_law_cornell_edu_gpt_4o_non_synthetic_gpt_4o_claude_3_5_sonnet_latest_en.csv
Removed 125 duplicates from ./labeled_datasets/gpt-4o-hallucinations/non_synthetic/en/www_law_cornell_edu_gpt_4o_non_synthetic_gpt_4o_claude_3_5_sonnet_

Processing files:  56%|█████▌    | 110/198 [00:01<00:01, 84.03it/s]

Removed 3 duplicates from ./labeled_datasets/gpt-4o-hallucinations/non_synthetic/ja/docs_databricks_com_gpt_4o_non_synthetic_gpt_4o_claude_3_5_sonnet_latest_ja.csv
Removed 10 duplicates from ./labeled_datasets/gpt-4o-hallucinations/non_synthetic/ja/experienceleague_adobe_com_gpt_4o_non_synthetic_gpt_4o_claude_3_5_sonnet_latest_ja.csv
Removed 7 duplicates from ./labeled_datasets/gpt-4o-hallucinations/non_synthetic/ja/www_mongodb_com_gpt_4o_non_synthetic_gpt_4o_claude_3_5_sonnet_latest_ja.csv
No duplicates found in ./labeled_datasets/gpt-4o-hallucinations/non_synthetic/ko/experienceleague_adobe_com_gpt_4o_non_synthetic_gpt_4o_claude_3_5_sonnet_latest_ko.csv
Removed 3 duplicates from ./labeled_datasets/gpt-4o-hallucinations/non_synthetic/pt/docs_databricks_com_gpt_4o_non_synthetic_gpt_4o_claude_3_5_sonnet_latest_pt.csv
Removed 7 duplicates from ./labeled_datasets/gpt-4o-hallucinations/non_synthetic/pt/experienceleague_adobe_com_gpt_4o_non_synthetic_gpt_4o_claude_3_5_sonnet_latest_pt.csv
R

Processing files:  61%|██████    | 120/198 [00:01<00:01, 75.63it/s]

Removed 1016 duplicates from ./labeled_datasets/gpt-4o-hallucinations/synthetic/even-split-of-hallucinations-and-factuals/en/medlineplus_gov_gpt_4o_synthetic_gpt_4o_claude_3_5_sonnet_latest_en_question.csv
Removed 191 duplicates from ./labeled_datasets/gpt-4o-hallucinations/synthetic/even-split-of-hallucinations-and-factuals/en/pmc_ncbi_nlm_nih_gov_gpt_4o_synthetic_gpt_4o_claude_3_5_sonnet_latest_en_answer.csv
Removed 192 duplicates from ./labeled_datasets/gpt-4o-hallucinations/synthetic/even-split-of-hallucinations-and-factuals/en/pmc_ncbi_nlm_nih_gov_gpt_4o_synthetic_gpt_4o_claude_3_5_sonnet_latest_en_question.csv
Removed 1021 duplicates from ./labeled_datasets/gpt-4o-hallucinations/synthetic/even-split-of-hallucinations-and-factuals/en/www_investopedia_com_gpt_4o_synthetic_gpt_4o_claude_3_5_sonnet_latest_en_answer.csv
Removed 1023 duplicates from ./labeled_datasets/gpt-4o-hallucinations/synthetic/even-split-of-hallucinations-and-factuals/en/www_investopedia_com_gpt_4o_synthetic_gpt_

Processing files:  70%|███████   | 139/198 [00:01<00:00, 71.32it/s]

Removed 1024 duplicates from ./labeled_datasets/gpt-4o-hallucinations/synthetic/even-split-of-hallucinations-and-factuals/en/www_ncbi_nlm_nih_gov_gpt_4o_synthetic_gpt_4o_claude_3_5_sonnet_latest_en_question.csv
Removed 597 duplicates from ./labeled_datasets/gpt-4o-hallucinations/synthetic/even-split-of-hallucinations-and-factuals/en/www_noaa_gov_gpt_4o_synthetic_gpt_4o_claude_3_5_sonnet_latest_en_answer.csv
Removed 597 duplicates from ./labeled_datasets/gpt-4o-hallucinations/synthetic/even-split-of-hallucinations-and-factuals/en/www_noaa_gov_gpt_4o_synthetic_gpt_4o_claude_3_5_sonnet_latest_en_question.csv
Removed 3 duplicates from ./labeled_datasets/gpt-4o-hallucinations/synthetic/even-split-of-hallucinations-and-factuals/es/experienceleague_adobe_com_gpt_4o_synthetic_gpt_4o_claude_3_5_sonnet_latest_es_answer.csv
No duplicates found in ./labeled_datasets/gpt-4o-hallucinations/synthetic/even-split-of-hallucinations-and-factuals/es/experienceleague_adobe_com_gpt_4o_synthetic_gpt_4o_claud

Processing files:  80%|████████  | 159/198 [00:02<00:00, 67.64it/s]

Removed 86 duplicates from ./labeled_datasets/litellm/together_ai/meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo-hallucinations/non_synthetic/en/docs_databricks_com_litellm_together_ai_meta_llama_Meta_Llama_3.1_8B_Instruct_Turbo_non_synthetic_gpt_4o_claude_3_5_sonnet_latest_en_question.csv
Removed 464 duplicates from ./labeled_datasets/litellm/together_ai/meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo-hallucinations/non_synthetic/en/earthobservatory_nasa_gov_litellm_together_ai_meta_llama_Meta_Llama_3.1_8B_Instruct_Turbo_non_synthetic_gpt_4o_claude_3_5_sonnet_latest_en.csv
Removed 460 duplicates from ./labeled_datasets/litellm/together_ai/meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo-hallucinations/non_synthetic/en/earthobservatory_nasa_gov_litellm_together_ai_meta_llama_Meta_Llama_3.1_8B_Instruct_Turbo_non_synthetic_gpt_4o_claude_3_5_sonnet_latest_en_question.csv
Removed 26 duplicates from ./labeled_datasets/litellm/together_ai/meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo-hallucinations/non_sy

Processing files:  88%|████████▊ | 174/198 [00:02<00:00, 51.42it/s]

Removed 3850 duplicates from ./labeled_datasets/litellm/together_ai/meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo-hallucinations/non_synthetic/en/www_noaa_gov_litellm_together_ai_meta_llama_Meta_Llama_3.1_8B_Instruct_Turbo_non_synthetic_gpt_4o_claude_3_5_sonnet_latest_en_question.csv
Removed 16 duplicates from ./labeled_datasets/litellm/together_ai/meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo-hallucinations/non_synthetic/es/experienceleague_adobe_com_litellm_together_ai_meta_llama_Meta_Llama_3.1_8B_Instruct_Turbo_non_synthetic_gpt_4o_claude_3_5_sonnet_latest_es_question.csv
Removed 15 duplicates from ./labeled_datasets/litellm/together_ai/meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo-hallucinations/non_synthetic/fr/experienceleague_adobe_com_litellm_together_ai_meta_llama_Meta_Llama_3.1_8B_Instruct_Turbo_non_synthetic_gpt_4o_claude_3_5_sonnet_latest_fr_question.csv
Removed 2398 duplicates from ./labeled_datasets/litellm/together_ai/meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo-hallucinations/

Processing files:  91%|█████████ | 180/198 [00:02<00:00, 46.68it/s]

Removed 320 duplicates from ./labeled_datasets/litellm/together_ai/meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo-hallucinations/synthetic/even-split-of-hallucinations-and-factuals/en/earthobservatory_nasa_gov_litellm_together_ai_meta_llama_Meta_Llama_3.1_8B_Instruct_Turbo_synthetic_gpt_4o_claude_3_5_sonnet_latest_en_answer.csv
Removed 316 duplicates from ./labeled_datasets/litellm/together_ai/meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo-hallucinations/synthetic/even-split-of-hallucinations-and-factuals/en/earthobservatory_nasa_gov_litellm_together_ai_meta_llama_Meta_Llama_3.1_8B_Instruct_Turbo_synthetic_gpt_4o_claude_3_5_sonnet_latest_en_question.csv
Removed 12 duplicates from ./labeled_datasets/litellm/together_ai/meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo-hallucinations/synthetic/even-split-of-hallucinations-and-factuals/en/experienceleague_adobe_com_litellm_together_ai_meta_llama_Meta_Llama_3.1_8B_Instruct_Turbo_synthetic_gpt_4o_claude_3_5_sonnet_latest_en_question.csv
Removed 998 dup

Processing files:  96%|█████████▋| 191/198 [00:03<00:00, 41.39it/s]

Removed 426 duplicates from ./labeled_datasets/litellm/together_ai/meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo-hallucinations/synthetic/even-split-of-hallucinations-and-factuals/en/www_law_cornell_edu_litellm_together_ai_meta_llama_Meta_Llama_3.1_8B_Instruct_Turbo_synthetic_gpt_4o_claude_3_5_sonnet_latest_en_answer.csv
Removed 428 duplicates from ./labeled_datasets/litellm/together_ai/meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo-hallucinations/synthetic/even-split-of-hallucinations-and-factuals/en/www_law_cornell_edu_litellm_together_ai_meta_llama_Meta_Llama_3.1_8B_Instruct_Turbo_synthetic_gpt_4o_claude_3_5_sonnet_latest_en_question.csv
Removed 47 duplicates from ./labeled_datasets/litellm/together_ai/meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo-hallucinations/synthetic/even-split-of-hallucinations-and-factuals/en/www_mongodb_com_litellm_together_ai_meta_llama_Meta_Llama_3.1_8B_Instruct_Turbo_synthetic_gpt_4o_claude_3_5_sonnet_latest_en_question.csv
Removed 836 duplicates from ./labeled_

Processing files: 100%|██████████| 198/198 [00:03<00:00, 63.92it/s]

Removed 7 duplicates from ./labeled_datasets/litellm/together_ai/meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo-hallucinations/synthetic/even-split-of-hallucinations-and-factuals/es/experienceleague_adobe_com_litellm_together_ai_meta_llama_Meta_Llama_3.1_8B_Instruct_Turbo_synthetic_gpt_4o_claude_3_5_sonnet_latest_es_question.csv
Removed 9 duplicates from ./labeled_datasets/litellm/together_ai/meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo-hallucinations/synthetic/even-split-of-hallucinations-and-factuals/fr/experienceleague_adobe_com_litellm_together_ai_meta_llama_Meta_Llama_3.1_8B_Instruct_Turbo_synthetic_gpt_4o_claude_3_5_sonnet_latest_fr_question.csv
Removed 8 duplicates from ./labeled_datasets/litellm/together_ai/meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo-hallucinations/synthetic/even-split-of-hallucinations-and-factuals/ja/experienceleague_adobe_com_litellm_together_ai_meta_llama_Meta_Llama_3.1_8B_Instruct_Turbo_synthetic_gpt_4o_claude_3_5_sonnet_latest_ja_question.csv
Removed 21 dupli


