In [None]:
import pandas as pd
import numpy as np

In [None]:
# HaluEval "general" split -- source: https://github.com/RUCAIBox/HaluEval
# Built as one non-mutating chain so re-running the cell always starts from
# the file on disk and ends with the same two-column frame.
df1 = (
    pd.read_json("/content/general_data.json", lines=True)
    # keep only the prompt and the yes/no hallucination verdict
    .drop(columns=['chatgpt_response', 'ID', 'hallucination_spans'], errors='ignore')
    # encode the verdict as 1 (hallucination) / 0 (no hallucination)
    .assign(hallucination_binary=lambda frame: frame['hallucination'].map({'yes': 1, 'no': 0}))
    .drop(columns='hallucination', errors='ignore')
    # 'label' is the column name the trainer needs; 'input' keeps the
    # prompt column name consistent across the datasets in this notebook
    .rename(columns={'hallucination_binary': 'label', 'user_query': 'input'})
)

# quick sanity summary: row count, remaining columns, first rows
print(len(df1), df1.columns, df1.head(), sep="\n")

4507
Index(['input', 'label'], dtype='object')
                                               input  label
0  Produce a list of common words in the English ...      0
1              Provide a few examples of homophones.      1
2  Create a chart outlining the world's populatio...      1
3         Design a shape with 10 vertices (corners).      1
4  Automatically generate a 10 by 10 multiplicati...      1


In [None]:
# datasets from https://github.com/Arize-ai/LibreEval
# One file per source website; the files carry a .txt extension but are
# parsed as CSV.
csv_file_paths = [
    '/content/docs_databricks_com_gpt_4o_synthetic_gpt_4o_claude_3_5_sonnet_latest_en_answer.txt',
    '/content/earthobservatory_nasa_gov_gpt_4o_synthetic_gpt_4o_claude_3_5_sonnet_latest_en_answer.txt',
    '/content/experienceleague_adobe_com_gpt_4o_synthetic_gpt_4o_claude_3_5_sonnet_latest_en_answer.txt',
    '/content/medlineplus_gov_gpt_4o_synthetic_gpt_4o_claude_3_5_sonnet_latest_en_answer.txt',
    '/content/pmc_ncbi_nlm_nih_gov_gpt_4o_synthetic_gpt_4o_claude_3_5_sonnet_latest_en_answer.txt',
    '/content/www_investopedia_com_gpt_4o_synthetic_gpt_4o_claude_3_5_sonnet_latest_en_answer.txt',
    '/content/www_law_cornell_edu_gpt_4o_synthetic_gpt_4o_claude_3_5_sonnet_latest_en_answer.txt',
    '/content/www_mongodb_com_gpt_4o_synthetic_gpt_4o_claude_3_5_sonnet_latest_en_answer.txt',
    '/content/www_ncbi_nlm_nih_gov_gpt_4o_synthetic_gpt_4o_claude_3_5_sonnet_latest_en_answer.txt',
    '/content/www_noaa_gov_gpt_4o_synthetic_gpt_4o_claude_3_5_sonnet_latest_en_answer.txt',
    ]
df2_list = [pd.read_csv(file) for file in csv_file_paths]
df2 = pd.concat(df2_list, ignore_index=True)

# data cleaning: remove every listed judge/metadata column that is present
# (errors="ignore" tolerates files that lack some of them)
df2 = df2.drop(columns=["reference", "output", "explanation_gpt-4o",
                        "label_claude-3-5-sonnet-latest", "explanation_claude-3-5-sonnet-latest",
                        "label_litellm/together_ai/Qwen/Qwen2.5-7B-Instruct-Turbo",
                        "explanation_litellm/together_ai/Qwen/Qwen2.5-7B-Instruct-Turbo",
                        "rag_model", "force_even_split", "website", "synthetic",
                        "language", "hallucination_type_realized", "question_type", "hallucination_type_encouraged",
                        "hallucination_type_realized_ensemble", "label_mistral-large-latest",
                        "explanation_mistral-large-latest", "human_label", "label_gpt-4o"], errors="ignore")

# Discard rows whose label could not be parsed. The explicit .copy() makes
# the filtered frame independent of the concatenated one, so the column
# assignment below is not a chained assignment on a slice (which is what
# triggers pandas' SettingWithCopyWarning).
df2 = df2[df2['label'].str.upper() != "NOT_PARSABLE"].copy()

# encode the label as 1 (hallucinated) / 0 (factual); any other value maps to NaN
df2['label'] = df2['label'].map({'hallucinated': 1, 'factual': 0})

print(len(df2))
print(df2.columns)
print(df2.head())

4246
Index(['input', 'label'], dtype='object')
                                               input  label
0  What actions can be performed on an external l...      0
1  What versions of Databricks Runtime does the i...      0
2  What is the default access restriction for mat...      0
3  Who can query materialized views and streaming...      0
4  What is required to enable Iceberg reads on ta...      0


In [None]:
# ANAH dataset -- source: https://huggingface.co/datasets/opencompass/anah?row=0
from datasets import load_dataset


def _first_or_self(value):
    """Return the first element of a non-empty list/array; return anything else unchanged."""
    if not isinstance(value, (list, np.ndarray)):
        return value
    if len(value) == 0:
        return value
    return value[0]


anah_dataset = load_dataset("opencompass/anah")

# Hugging Face only publishes a train split for this dataset.
df3 = (
    anah_dataset["train"]
    .to_pandas()
    # discard everything except the question and the GPT-3.5 annotation
    .drop(columns=["InternLM_answers", "human_InternLM_answers_ann", "name",
                   "documents", "language", "GPT3.5_answers_D"])
    # label = 1 when any annotation entry contains a <Hallucination> tag, else 0
    .assign(
        label=lambda frame: frame['human_GPT3.5_answers_D_ann']
        .apply(lambda ann: any('<Hallucination>' in str(a) for a in ann))
        .map({True: 1, False: 0})
    )
    # the annotated LLM response itself is not needed once the label exists
    .drop(columns="human_GPT3.5_answers_D_ann")
    # same prompt column name as the other datasets in this notebook
    .rename(columns={'selected_questions': 'input'})
)

# each row stores its question wrapped in a list/array; unwrap the first entry
df3['input'] = df3['input'].apply(_first_or_self)

print(df3.head(), df3.columns, sep="\n")

                                               input  label
0  What was the aftermath of the Battle of Sobrao...      1
1  What were the consequences of the Kapp Putsch ...      1
2  What were the main factors leading to the Batt...      0
3  How did the Battle of the Camel unfold, and wh...      0
4  How was the leadership vote conducted and what...      1
Index(['input', 'label'], dtype='object')


In [None]:
# Stack the three cleaned frames (each reduced to 'input' and 'label') into a
# single dataset; ignore_index=True gives the result a fresh 0..n-1 index.
combined_df = pd.concat([df1, df2, df3], ignore_index=True)
print(combined_df.shape)
print(combined_df.columns)
print(combined_df.head())

# Class balance of the dataset that is actually exported below. This has to be
# computed on combined_df: counting df2 alone would describe only one of the
# three sources.
print(combined_df['label'].value_counts())


# persist the combined dataset without the index column
combined_df.to_csv("final_dataset.csv", index=False)

(9536, 2)
Index(['input', 'label'], dtype='object')
                                               input  label
0  Produce a list of common words in the English ...      0
1              Provide a few examples of homophones.      1
2  Create a chart outlining the world's populatio...      1
3         Design a shape with 10 vertices (corners).      1
4  Automatically generate a 10 by 10 multiplicati...      1
label
0    7291
1    2245
Name: count, dtype: int64
