In [None]:
import os

import numpy as np
import pandas as pd
from datasets import load_dataset
from huggingface_hub import login, notebook_login

In [None]:
# Authenticate with the Hugging Face Hub.
# When HF_TOKEN is set, log in with it so the notebook can run unattended
# (Restart Kernel -> Run All); otherwise fall back to the interactive widget.
hf_token = os.getenv('HF_TOKEN')
if hf_token:
    login(token=hf_token)
else:
    notebook_login()

In [None]:
# 'train' split of the sorry-bench-202503 dataset as a DataFrame.
sorry_bench_df = load_dataset("sorry-bench/sorry-bench-202503")['train'].to_pandas()

# 'train' and 'test' splits of the human-judgment dataset, loaded in that order.
sorry_bench_human_train, sorry_bench_human_test = (
    load_dataset("sorry-bench/sorry-bench-human-judgment-202503", split=split_name).to_pandas()
    for split_name in ("train", "test")
)

In [None]:
# Preview only the first rows instead of rendering the whole frame.
sorry_bench_df.head()

In [None]:
def filter_responses_by_score(df, score):
    """Return a copy of the rows of ``df`` whose ``human_score`` equals ``score``.

    Args:
        df: DataFrame that has a ``human_score`` column.
        score: Value compared against ``human_score`` with ``==``.

    Returns:
        A new DataFrame holding the matching rows (original index labels are
        kept); modifying it does not affect ``df``.

    Raises:
        KeyError: If ``df`` has no ``human_score`` column.
    """
    # Compare against the value itself rather than formatting it into a
    # ``query`` expression: interpolated text is parsed as code, so a string
    # score would be looked up as a column/variable name instead of compared.
    return df[df['human_score'] == score].copy()

def process_dataset(responses_df, sorry_bench_df):
    """Attach prompt text to responses and flatten the nested answer/question fields.

    Steps, in order:
      1. Left-merge ``responses_df`` with the ``turns`` column of
         ``sorry_bench_df`` on ``question_id`` and ``prompt_style``.
      2. Keep the first row per ``(question_id, prompt_style)`` pair.
      3. Drop the ``tstamp`` and ``human_score`` columns.
      4. Remove rows whose ``prompt_style`` is one of the translation styles.
      5. Add ``llm_answer`` (``choices[0]['turns'][0]``) and ``question``
         (``turns[0]``), then drop ``choices`` and ``turns``.

    Args:
        responses_df: DataFrame with at least the columns ``question_id``,
            ``prompt_style``, ``tstamp``, ``human_score`` and ``choices``.
        sorry_bench_df: DataFrame with at least the columns ``question_id``,
            ``prompt_style`` and ``turns``.

    Returns:
        A new DataFrame with the remaining ``responses_df`` columns plus
        ``llm_answer`` and ``question``. ``llm_answer`` is ``None`` when
        ``choices`` does not have the expected nesting; ``question`` is ``''``
        when ``turns`` is empty or missing (no matching prompt in the merge).

    Raises:
        KeyError: If any of the required columns is absent.
    """
    # Merge with main dataset
    merged = responses_df.merge(
        sorry_bench_df[['question_id', 'prompt_style', 'turns']],
        on=['question_id', 'prompt_style'],
        how='left',
    )

    # Drop duplicates to avoid repeated questions
    merged = merged.drop_duplicates(subset=['question_id', 'prompt_style'], keep='first')

    # Remove unnecessary columns
    merged = merged.drop(columns=['tstamp', 'human_score'])

    # Remove translation prompts. The explicit copy makes the filtered frame
    # independent of ``merged`` so the column assignments below cannot trigger
    # SettingWithCopyWarning.
    values_to_remove = ['translate-ta', 'translate-fr', 'translate-mr', 'translate-zh-cn', 'translate-ml']
    processed_df = merged[~merged['prompt_style'].isin(values_to_remove)].copy()

    def extract_string(cell_value):
        """Return ``cell_value[0]['turns'][0]``, or None if that path does not exist."""
        try:
            return cell_value[0]['turns'][0]
        except (TypeError, IndexError, KeyError):
            return None

    def extract_question(turns):
        """Return the first turn, or '' when ``turns`` is empty or missing."""
        try:
            return turns[0] if len(turns) > 0 else ''
        except TypeError:
            # The left merge leaves NaN here for responses without a matching
            # prompt; NaN/None have no len(), so treat them like an empty list.
            return ''

    # Extract LLM answer and question
    processed_df['llm_answer'] = processed_df['choices'].apply(extract_string)
    processed_df['question'] = processed_df['turns'].apply(extract_question)

    # Clean up columns
    return processed_df.drop(columns=['turns', 'choices'])

In [None]:
# Keep only the rows whose prompt_style is 'base' in all three frames.
sorry_bench_human_train = sorry_bench_human_train.loc[sorry_bench_human_train['prompt_style'].eq('base')]
sorry_bench_human_test = sorry_bench_human_test.loc[sorry_bench_human_test['prompt_style'].eq('base')]
sorry_bench_df = sorry_bench_df.loc[sorry_bench_df['prompt_style'].eq('base')]

# Row counts after filtering, one per line: human train, human test, prompts.
print(len(sorry_bench_human_train), len(sorry_bench_human_test), len(sorry_bench_df), sep="\n")

In [None]:
# Split each human-judgment frame by label: rows with human_score 1.0 become
# the harmful_* frames, rows with human_score 0.0 the refusal_* frames.
harmful_responses_train, refusal_responses_train = (
    filter_responses_by_score(sorry_bench_human_train, score) for score in (1.0, 0.0)
)
harmful_responses_test, refusal_responses_test = (
    filter_responses_by_score(sorry_bench_human_test, score) for score in (1.0, 0.0)
)

# Row counts, one per line: train harmful, train refusal, test harmful, test refusal.
print(len(harmful_responses_train), len(refusal_responses_train), sep="\n")
print(len(harmful_responses_test), len(refusal_responses_test), sep="\n")

In [None]:
def drop_multiple_question_ids(df, random_state=42):
    """Keep one randomly chosen row per ``question_id``.

    The rows are shuffled with ``random_state`` and the first occurrence of
    each ``question_id`` in the shuffled order is kept, so the surviving row
    for a repeated id is picked at random but reproducibly.

    Args:
        df: DataFrame with a ``question_id`` column. It is not modified.
        random_state: Seed passed to ``DataFrame.sample`` for the shuffle.

    Returns:
        A new DataFrame in shuffled order with unique ``question_id`` values
        and a fresh 0..n-1 index.
    """
    return (
        df.sample(frac=1, random_state=random_state)
        .drop_duplicates(subset='question_id', keep='first')
        .reset_index(drop=True)
    )

In [None]:
# Reduce every subset to a single row per question_id.
(
    harmful_responses_train,
    refusal_responses_train,
    harmful_responses_test,
    refusal_responses_test,
) = map(
    drop_multiple_question_ids,
    (
        harmful_responses_train,
        refusal_responses_train,
        harmful_responses_test,
        refusal_responses_test,
    ),
)

In [None]:
# Row counts after de-duplication, one per line:
# train harmful, train refusal, test harmful, test refusal.
print(
    len(harmful_responses_train),
    len(refusal_responses_train),
    len(harmful_responses_test),
    len(refusal_responses_test),
    sep="\n",
)

In [None]:
# Join every response subset with its prompt and flatten the nested fields
# (see process_dataset).
# NOTE: this cell cannot be re-run on its own output: process_dataset drops
# 'tstamp' and 'human_score', so a second pass raises KeyError. Re-run the
# cells above first.
(
    harmful_responses_train,
    refusal_responses_train,
    harmful_responses_test,
    refusal_responses_test,
) = (
    process_dataset(responses, sorry_bench_df)
    for responses in (
        harmful_responses_train,
        refusal_responses_train,
        harmful_responses_test,
        refusal_responses_test,
    )
)


In [None]:
# Rename the extracted columns to the instruction/output naming used in the
# saved files. Each name is rebound to a renamed copy instead of mutating the
# frame in place, and the mapping is written once rather than four times.
(
    harmful_responses_train,
    refusal_responses_train,
    harmful_responses_test,
    refusal_responses_test,
) = (
    frame.rename(columns={'llm_answer': 'output', 'question': 'instruction'})
    for frame in (
        harmful_responses_train,
        refusal_responses_train,
        harmful_responses_test,
        refusal_responses_test,
    )
)

In [None]:
# Row counts after processing, one per line:
# train harmful, train refusal, test harmful, test refusal.
print(
    *map(
        len,
        (
            harmful_responses_train,
            refusal_responses_train,
            harmful_responses_test,
            refusal_responses_test,
        ),
    ),
    sep="\n",
)

In [None]:

# Balance by downsampling: within each split, the larger class is sampled down
# to the size of the smaller one. Sampling the refusal set with
# n=len(harmful) alone raises ValueError whenever there are more harmful than
# refusal rows, so the target size is the minimum of the two.
def _balance_pair(harmful_df, refusal_df, random_state=42):
    """Return (harmful, refusal) frames of equal length.

    The refusal frame is always drawn with ``sample`` (seeded by
    ``random_state``); the harmful frame is returned as-is unless it is the
    larger of the two, in which case it is sampled down as well.
    """
    n = min(len(harmful_df), len(refusal_df))
    if len(harmful_df) > n:
        harmful_df = harmful_df.sample(n=n, random_state=random_state)
    return harmful_df, refusal_df.sample(n=n, random_state=random_state)


harmful_train_balanced, refusal_train_balanced = _balance_pair(
    harmful_responses_train, refusal_responses_train
)
harmful_test_balanced, refusal_test_balanced = _balance_pair(
    harmful_responses_test, refusal_responses_test
)

output_dir = "../datasets"
# to_json does not create missing directories; make sure the target exists.
os.makedirs(output_dir, exist_ok=True)

# Save each as separate JSON
harmful_train_balanced.to_json(f"{output_dir}/harmful_train.json", orient='records', indent=2)
refusal_train_balanced.to_json(f"{output_dir}/refusal_train.json", orient='records', indent=2)
harmful_test_balanced.to_json(f"{output_dir}/harmful_test.json", orient='records', indent=2)
refusal_test_balanced.to_json(f"{output_dir}/refusal_test.json", orient='records', indent=2)

print("Saved separate balanced datasets:")
print(f"Training: {len(harmful_train_balanced)} harmful, {len(refusal_train_balanced)} refusal")
print(f"Test: {len(harmful_test_balanced)} harmful, {len(refusal_test_balanced)} refusal")