In [4]:
from datasets import load_dataset
import pandas as pd

# Both downloads come from the same Hugging Face Hub repository; the second
# positional argument of `load_dataset` picks the dataset configuration.
MEDHALLU_REPO = "UTAustin-AIHealth/MedHallu"

# "pqa_labeled" configuration (1,000 high-quality samples)
labeled_data = load_dataset(MEDHALLU_REPO, "pqa_labeled")

# "pqa_artificial" configuration (9,000 samples)
artificial_data = load_dataset(MEDHALLU_REPO, "pqa_artificial")

In [None]:
# Show the schema (column names and feature types) of the labeled data.
print(labeled_data['train'].features)

# Expected output:
# {'Question': Value(dtype='string', id=None),
#  'Knowledge': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None),
#  'Ground Truth': Value(dtype='string', id=None),
#  'Difficulty Level': Value(dtype='string', id=None),
#  'Hallucinated Answer': Value(dtype='string', id=None),
#  'Category of Hallucination': Value(dtype='string', id=None)}

# Print the first record to see one concrete value per column.
print(labeled_data['train'][0])

{'Question': Value(dtype='string', id=None), 'Knowledge': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None), 'Ground Truth': Value(dtype='string', id=None), 'Difficulty Level': Value(dtype='string', id=None), 'Hallucinated Answer': Value(dtype='string', id=None), 'Category of Hallucination': Value(dtype='string', id=None)}
{'Question': 'Do mitochondria play a role in remodelling lace plant leaves during programmed cell death?', 'Knowledge': ['Programmed cell death (PCD) is the regulated death of cells within an organism. The lace plant (Aponogeton madagascariensis) produces perforations in its leaves through PCD. The leaves of the plant consist of a latticework of longitudinal and transverse veins enclosing areoles. PCD occurs in the cells at the center of these areoles and progresses outwards, stopping approximately five cells from the vasculature. The role of mitochondria during PCD has been recognized in animals; however, it has been less studied during PCD in pla

In [6]:
# Get all unique hallucination categories
unique_categories = set(labeled_data['train']['Category of Hallucination'])
print("All hallucination categories:")
# A set of strings has no stable iteration order across interpreter runs
# (string hashing is randomised per process), so sort before printing to keep
# the listing reproducible. key=str keeps the sort from raising if a label is
# ever missing (None) alongside the string labels.
for category in sorted(unique_categories, key=str):
    print(f"- {category}")

All hallucination categories:
- Incomplete Information
- Methodological and Evidence Fabrication
- Misinterpretation of #Question#
- Mechanism and Pathway Misattribution


In [7]:
from collections import Counter

# Tally how many samples carry each hallucination category label.
category_column = labeled_data['train']['Category of Hallucination']
category_counts = Counter(category_column)

print("Hallucination category distribution:")
# Iterating a Counter yields keys in first-seen order, so categories are
# listed in the order they first occur in the data, not by frequency.
for label in category_counts:
    print(f"{label}: {category_counts[label]}")

Hallucination category distribution:
Mechanism and Pathway Misattribution: 33
Incomplete Information: 212
Misinterpretation of #Question#: 752
Methodological and Evidence Fabrication: 3


In [8]:
# View hallucination categories for the first 10 samples.
# Slice the split once: `dataset[:k]` returns a dict mapping each column name
# to a list of the first k values (fewer when the split is shorter than k).
# Indexing `dataset[column][i]` inside the loop instead re-reads the column on
# every iteration, which may materialise the entire column each time.
first_rows = labeled_data['train'][:10]
for i, category in enumerate(first_rows['Category of Hallucination']):
    # Keep the first 50 characters; the ellipsis is appended unconditionally.
    question = first_rows['Question'][i][:50] + "..."
    print(f"Sample {i+1}: {category} - {question}")

Sample 1: Mechanism and Pathway Misattribution - Do mitochondria play a role in remodelling lace pl...
Sample 2: Incomplete Information - Landolt C and snellen e acuity: differences in str...
Sample 3: Misinterpretation of #Question# - Syncope during bathing in infants, a pediatric for...
Sample 4: Misinterpretation of #Question# - Are the long-term results of the transanal pull-th...
Sample 5: Incomplete Information - Can tailored interventions increase mammography us...
Sample 6: Misinterpretation of #Question# - Double balloon enteroscopy: is it efficacious and ...
Sample 7: Misinterpretation of #Question# - 30-Day and 1-year mortality in emergency general s...
Sample 8: Incomplete Information - Is adjustment for reporting heterogeneity necessar...
Sample 9: Misinterpretation of #Question# - Do mutations causing low HDL-C promote increased c...
Sample 10: Incomplete Information - A short stay or 23-hour ward in a general and acad...


In [9]:
# Use Pandas for More Convenient Analysis
import pandas as pd

# Build a DataFrame from the labeled data and keep a handle on the
# category column, which every step below uses.
df = pd.DataFrame(labeled_data['train'])
category_col = df['Category of Hallucination']

# Per-category sample counts (value_counts orders by descending frequency)
print("Hallucination category statistics:")
print(category_col.value_counts())

# Walk the categories in order of first appearance and preview each one
print("\nGrouped by category:")
for category in category_col.unique():
    print(f"\n=== {category} ===")
    category_samples = df.loc[category_col.eq(category)]
    print(f"Number of samples: {category_samples.shape[0]}")
    # Up to three example questions, each cut to 100 characters
    for position, question in enumerate(category_samples['Question'].iloc[:3], start=1):
        print(f"  Sample {position}: {question[:100]}...")

Hallucination category statistics:
Category of Hallucination
Misinterpretation of #Question#            752
Incomplete Information                     212
Mechanism and Pathway Misattribution        33
Methodological and Evidence Fabrication      3
Name: count, dtype: int64

Grouped by category:

=== Mechanism and Pathway Misattribution ===
Number of samples: 33
  Sample 1: Do mitochondria play a role in remodelling lace plant leaves during programmed cell death?...
  Sample 2: Does diabetes mellitus influence the efficacy of FDG-PET in the diagnosis of cervical cancer?...
  Sample 3: Major depression and alcohol use disorder in adolescence: Does comorbidity lead to poorer outcomes o...

=== Incomplete Information ===
Number of samples: 212
  Sample 1: Landolt C and snellen e acuity: differences in strabismus amblyopia?...
  Sample 2: Can tailored interventions increase mammography use among HMO women?...
  Sample 3: Is adjustment for reporting heterogeneity necessary in sleep disorder