In [1]:
import pandas as pd
import os

# Load the test dataset
test_file_path = '/Users/jgilhuly/Documents/dev/GitHub/dataset-generation-research/combined_datasets_for_tuning/all_languages/test.csv'
df = pd.read_csv(test_file_path)

# Create output directory if it doesn't exist
output_dir = './temp'
os.makedirs(output_dir, exist_ok=True)

# Split into synthetic and non-synthetic datasets
synthetic_df = df[df['synthetic'] == True]
non_synthetic_df = df[df['synthetic'] == False]

# Split into English and non-English datasets
english_df = df[df['language'] == 'en']
non_english_df = df[df['language'] != 'en']

# Split by question_type
question_types = df['question_type'].unique()
question_type_dfs = {}
for q_type in question_types:
    question_type_dfs[q_type] = df[df['question_type'] == q_type]

# Split by hallucination_type_realized
hallucination_types = df['hallucination_type_realized'].unique()
hallucination_type_dfs = {}
for h_type in hallucination_types:
    hallucination_type_dfs[h_type] = df[df['hallucination_type_realized'] == h_type]

# Save the datasets
synthetic_df.to_csv(os.path.join(output_dir, 'test_synthetic.csv'), index=False)
non_synthetic_df.to_csv(os.path.join(output_dir, 'test_non_synthetic.csv'), index=False)
english_df.to_csv(os.path.join(output_dir, 'test_english.csv'), index=False)
non_english_df.to_csv(os.path.join(output_dir, 'test_non_english.csv'), index=False)

# Save datasets by question_type
for q_type, q_df in question_type_dfs.items():
    q_df.to_csv(os.path.join(output_dir, f'test_{q_type}.csv'), index=False)

# Save datasets by hallucination_type_realized
for h_type, h_df in hallucination_type_dfs.items():
    h_df.to_csv(os.path.join(output_dir, f'test_hallucination_{h_type}.csv'), index=False)

# Print dataset statistics
print(f"Original dataset: {len(df)} samples")
print(f"Synthetic dataset: {len(synthetic_df)} samples")
print(f"Non-synthetic dataset: {len(non_synthetic_df)} samples")
print(f"English dataset: {len(english_df)} samples")
print(f"Non-English dataset: {len(non_english_df)} samples")

# Print question_type statistics
print("\nQuestion type statistics:")
for q_type, q_df in question_type_dfs.items():
    print(f"{q_type} dataset: {len(q_df)} samples")

# Print hallucination_type_realized statistics
print("\nHallucination type statistics:")
for h_type, h_df in hallucination_type_dfs.items():
    print(f"{h_type} hallucination dataset: {len(h_df)} samples")


Original dataset: 5165 samples
Synthetic dataset: 4516 samples
Non-synthetic dataset: 649 samples
English dataset: 2774 samples
Non-English dataset: 2391 samples

Question type statistics:
nan dataset: 0 samples
Multimodal content dataset: 635 samples
Out-of-scope information dataset: 632 samples
Other common hallucinated questions dataset: 668 samples
Errors, contradictions, or unsolvable questions dataset: 620 samples
Advanced logical reasoning dataset: 605 samples
Default question type dataset: 1473 samples

Hallucination type statistics:
nan hallucination dataset: 0 samples
Outdated information hallucination hallucination dataset: 285 samples
Overclaim hallucination hallucination dataset: 419 samples
Relation-error hallucination hallucination dataset: 642 samples
Unverifiable information hallucination hallucination dataset: 228 samples
Incompleteness hallucination hallucination dataset: 582 samples
Entity-error hallucination hallucination dataset: 145 samples
