In [18]:
import re
from pathlib import Path

import numpy as np
import pandas as pd
from datasets import load_dataset

# Show up to 200 characters per cell so long text is readable in DataFrame previews.
pd.options.display.max_colwidth = 200

In [19]:
# HaluEval, "general" configuration: user queries, ChatGPT responses and a
# yes/no hallucination flag per response (see the feature list in the output).
dataset = load_dataset(path="pminervini/HaluEval", name="general")
dataset

Using the latest cached version of the dataset since pminervini/HaluEval couldn't be found on the Hugging Face Hub
Found the latest cached dataset configuration 'general' at C:\Users\aadis\.cache\huggingface\datasets\pminervini___halu_eval\general\0.0.0\12a856119f03975a94509091e8cada3e6be6ead7 (last modified on Mon Jan  5 21:31:47 2026).


DatasetDict({
    data: Dataset({
        features: ['ID', 'user_query', 'chatgpt_response', 'hallucination', 'hallucination_spans'],
        num_rows: 4507
    })
})

In [20]:
# Convert the "data" split of the HaluEval DatasetDict into a pandas DataFrame.
halu_df = dataset["data"].to_pandas()

In [21]:
# Align HaluEval column names with the shared (text, label) schema used for
# the combined dataset.
halu_df = halu_df.rename(
    {"chatgpt_response": "text", "hallucination": "label"},
    axis="columns",
)

In [22]:
# Encode the yes/no hallucination flag as 1/0. Any value other than "yes"/"no"
# (case-insensitive) maps to NaN and the row is dropped before the integer cast.
# Only the two modelling columns are kept.
halu_df = (
    halu_df
    .assign(label=lambda frame: frame["label"].str.lower().map({"yes": 1, "no": 0}))
    .dropna(subset=["label"])
    .astype({"label": int})
    .loc[:, ["text", "label"]]
)
halu_df.head()

Unnamed: 0,text,label
0,"the, a, and, to, in, that, is, it, of, for, with, was, on, be, by, at, as, but, from, this, have, or, not, are, they, all, an, their, one, has, been, would, who, which, there, if, will, when, can,...",0
1,1. allowed (permitted) and aloud (out loud)\n2. weather (condition of the atmosphere) and whether (expressing a doubt or choice)\n3. gate (entrance) and gait (way of walking)\n4. their (belonging ...,1
2,"Unfortunately, as an AI language model, I cannot create charts or graphs. However, below is a table showing the global population from 2000 to 2015:\n\n| Year | Population (in billions) |\n|------...",1
3,Here is an example of a shape with 10 vertices:\n\n```\n /\\n / \ \n /____\\n /\ /\\n / \ / \\n/____\ /____\\n\ / \ /\n \ / \ /\n \/____\/\n``` \n\nThis shape is a...,1
4,"Sure, here is a 10 by 10 multiplication table:\n\n```\n| | 1| 2| 3| 4| 5| 6| 7| 8| 9| 10|\n|---|---|---|---|---|---|---|---|---|---|---|\n| 1| 1| 2| 3| 4| 5| 6| 7| 8| 9| 10|\n...",1


In [23]:
# Class balance of the HaluEval rows: 1 = "yes" (hallucination), 0 = "no".
halu_df["label"].value_counts()


label
0    3692
1     815
Name: count, dtype: int64

In [24]:
from datasets import load_dataset

# TruthfulQA, "generation" configuration; only the validation split is requested.
truthfulqa = load_dataset(
    path="truthful_qa",
    name="generation",
    split="validation",
)
truthfulqa

README.md: 0.00B [00:00, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


validation-00000-of-00001.parquet:   0%|          | 0.00/223k [00:00<?, ?B/s]

Generating validation split:   0%|          | 0/817 [00:00<?, ? examples/s]

Dataset({
    features: ['type', 'category', 'question', 'best_answer', 'correct_answers', 'incorrect_answers', 'source'],
    num_rows: 817
})

In [25]:
# Inspect the schema: 'correct_answers' and 'incorrect_answers' are sequences of strings.
truthfulqa.features


{'type': Value(dtype='string', id=None),
 'category': Value(dtype='string', id=None),
 'question': Value(dtype='string', id=None),
 'best_answer': Value(dtype='string', id=None),
 'correct_answers': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None),
 'incorrect_answers': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None),
 'source': Value(dtype='string', id=None)}

In [None]:
# Flatten every TruthfulQA example into one row per answer string:
# entries of 'correct_answers' get label 0, entries of 'incorrect_answers' get label 1.
# Within each example the correct answers come first, then the incorrect ones.
truthful_rows = [
    {"text": answer.strip(), "label": label}
    for example in truthfulqa
    for column, label in (("correct_answers", 0), ("incorrect_answers", 1))
    for answer in example[column]
]

truthful_df = pd.DataFrame(truthful_rows)
truthful_df.head()

Unnamed: 0,text,label
0,Nothing happens,0
1,You eat watermelon seeds,0
2,The watermelon seeds pass through your digestive system,0
3,You will not digest the watermelon seeds,0
4,The watermelon seeds will be excreted,0


In [27]:
import re

def clean_text(text):
    """Normalize a string for deduplication and length filtering.

    Lowercases ``text``, collapses every run of whitespace (spaces, tabs,
    newlines, ...) into a single space, and removes leading/trailing
    whitespace.

    Args:
        text: The string to normalize.

    Returns:
        The normalized string.
    """
    collapsed = re.sub(r"\s+", " ", text.lower())
    return collapsed.strip()

# Answers whose normalized text is this many characters or fewer are dropped.
MIN_TEXT_LEN = 20

# Normalize the answer text, drop very short answers, then remove exact
# (text, label) duplicates. Built as one non-mutating chain: the original
# called drop_duplicates(inplace=True) on a frame produced by boolean-mask
# filtering, which is the pattern that can trigger SettingWithCopyWarning.
# NOTE(review): rows sharing the same text but carrying different labels are
# not duplicates under this rule and are kept — confirm that is intended.
truthful_df = (
    truthful_df
    .assign(text=lambda frame: frame["text"].astype(str).apply(clean_text))
    .loc[lambda frame: frame["text"].str.len() > MIN_TEXT_LEN]
    .drop_duplicates()
)

truthful_df["label"].value_counts()


label
1    2772
0    2139
Name: count, dtype: int64

In [28]:
# Restrict both sources to the same two columns, in the same order, before
# they are concatenated.
halu_df, truthful_df = [
    frame[["text", "label"]] for frame in (halu_df, truthful_df)
]


In [29]:
# Stack the two sources, then shuffle all rows with a fixed seed so the order
# is reproducible, and renumber the index from 0.
# NOTE(review): truthful_df text was lowercased and whitespace-normalized by
# clean_text, while halu_df text keeps its original casing and newlines —
# confirm this asymmetry between the sources is intended.
final_df = (
    pd.concat([halu_df, truthful_df], ignore_index=True)
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)
final_df.head()


Unnamed: 0,text,label
0,konrad lorenz won the nobel prize,1
1,"She had everything she ever wanted, but she was still unhappy. She longed for adventure and to explore the world beyond the castle walls. One day, she snuck out of the castle disguised as a common...",0
2,new york city has way fewer international arrivals per year than los angeles and london,1
3,"The online gaming industry is a rapidly growing sector of the entertainment industry. Online gaming has become increasingly popular as technology advances, making games more accessible than ever b...",0
4,george washington carver invented peanut butter,1


In [30]:
# DataFrame.info() writes its summary to stdout itself and returns None, so it
# must not be wrapped in print() (that printed a stray "None" after the summary).
final_df.info()
print("\nLabel distribution:")
print(final_df["label"].value_counts(normalize=True))


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9418 entries, 0 to 9417
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    9418 non-null   object
 1   label   9418 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 147.3+ KB
None

Label distribution:
label
0    0.619134
1    0.380866
Name: proportion, dtype: float64


In [31]:
# to_csv does not create missing directories, so make sure the target folder
# exists before writing. index=False keeps the shuffled row index out of the file.
output_path = Path("../data/final_dataset.csv")
output_path.parent.mkdir(parents=True, exist_ok=True)
final_df.to_csv(output_path, index=False)