In [None]:
import os

import numpy as np
import pandas as pd
from datasets import load_dataset
from huggingface_hub import login, notebook_login

In [None]:
# Authenticate with the Hugging Face Hub.
# Prefer the HF_TOKEN environment variable so the notebook can run unattended
# (Restart & Run All, CI); fall back to the interactive widget only when no
# token has been provided.
hf_token = os.getenv('HF_TOKEN')
if hf_token:
    login(token=hf_token)
else:
    notebook_login()

In [None]:
# Prompt table: the 'train' split of the benchmark dataset, as a DataFrame.
sorry_bench_df = load_dataset("sorry-bench/sorry-bench-202503")["train"].to_pandas()

# Human-judgment tables: one DataFrame per split of the judgment dataset.
human_judgment_repo = "sorry-bench/sorry-bench-human-judgment-202503"
sorry_bench_human_train, sorry_bench_human_test = (
    load_dataset(human_judgment_repo, split=split_name).to_pandas()
    for split_name in ("train", "test")
)

In [None]:
def filter_responses_by_score(df, score):
    """Return an independent copy of the rows whose ``human_score`` equals ``score``.

    The comparison is done with a boolean mask rather than by formatting
    ``score`` into a ``DataFrame.query`` expression, so the value is compared
    as data and never parsed as part of an expression (string labels and NaN
    therefore no longer raise).

    Args:
        df: DataFrame containing a ``human_score`` column.
        score: Value to match against ``human_score``.

    Returns:
        A new DataFrame (original index labels preserved) with the matching
        rows; modifying it does not affect ``df``.

    Raises:
        KeyError: If ``df`` has no ``human_score`` column.
    """
    matches = df['human_score'] == score
    return df[matches].copy()

def process_dataset(responses_df, sorry_bench_df, excluded_prompt_styles=None):
    """Join judged responses with their prompts and flatten both to text columns.

    Steps:
      1. Left-merge ``responses_df`` with the ``turns`` column of
         ``sorry_bench_df`` on ``question_id`` and ``prompt_style``.
      2. Drop the ``tstamp`` and ``human_score`` columns.
      3. Drop rows whose ``prompt_style`` is in ``excluded_prompt_styles``.
      4. Add ``llm_answer`` (``choices[0]['turns'][0]``) and ``question``
         (first element of ``turns``), then drop ``choices`` and ``turns``.

    Args:
        responses_df: DataFrame with at least ``question_id``,
            ``prompt_style``, ``tstamp``, ``human_score`` and ``choices``.
        sorry_bench_df: DataFrame with at least ``question_id``,
            ``prompt_style`` and ``turns``.
        excluded_prompt_styles: Optional iterable of ``prompt_style`` values
            to remove. Defaults to the five translation styles.

    Returns:
        A new DataFrame; the inputs are not modified. ``llm_answer`` is
        missing (None/NaN) when the ``choices`` cell cannot be unpacked, and
        ``question`` is ``''`` when ``turns`` is empty or missing.

    Raises:
        KeyError: If any of the required columns is absent.
    """
    if excluded_prompt_styles is None:
        excluded_prompt_styles = (
            'translate-ta',
            'translate-fr',
            'translate-mr',
            'translate-zh-cn',
            'translate-ml',
        )

    def extract_answer(choices):
        # Malformed or missing cells yield None instead of aborting the run.
        try:
            return choices[0]['turns'][0]
        except (TypeError, IndexError, KeyError):
            return None

    def extract_question(turns):
        # The left merge leaves NaN in `turns` for responses with no matching
        # prompt; len(NaN) raises TypeError, so treat that like an empty list.
        try:
            return turns[0] if len(turns) > 0 else ''
        except (TypeError, IndexError, KeyError):
            return ''

    merged = responses_df.merge(
        sorry_bench_df[['question_id', 'prompt_style', 'turns']],
        on=['question_id', 'prompt_style'],
        how='left',
    ).drop(columns=['tstamp', 'human_score'])

    kept = merged[~merged['prompt_style'].isin(list(excluded_prompt_styles))]

    # assign() builds a new frame, so no column is ever written into a
    # filtered slice (which can raise SettingWithCopyWarning).
    return kept.assign(
        llm_answer=kept['choices'].apply(extract_answer),
        question=kept['turns'].apply(extract_question),
    ).drop(columns=['turns', 'choices'])


In [None]:
# Human-score values used to separate the two response classes below.
HARMFUL_SCORE, REFUSAL_SCORE = 1.0, 0.0

# Train split: filter by score, then join with prompts and flatten.
harmful_responses = filter_responses_by_score(sorry_bench_human_train, HARMFUL_SCORE)
harmful_df = process_dataset(harmful_responses, sorry_bench_df)

refusal_responses = filter_responses_by_score(sorry_bench_human_train, REFUSAL_SCORE)
refusal_df = process_dataset(refusal_responses, sorry_bench_df)

# Test split: same two steps.
harmful_test_responses = filter_responses_by_score(sorry_bench_human_test, HARMFUL_SCORE)
harmful_test_df = process_dataset(harmful_test_responses, sorry_bench_df)

refusal_test_responses = filter_responses_by_score(sorry_bench_human_test, REFUSAL_SCORE)
refusal_test_df = process_dataset(refusal_test_responses, sorry_bench_df)


In [None]:
# Preview the first rows and the row count of each processed frame,
# train split first, then test split.
for split_banner, harmful_preview, refusal_preview in (
    ("=== TRAIN DATASET ===", harmful_df, refusal_df),
    ("\n=== TEST DATASET ===", harmful_test_df, refusal_test_df),
):
    print(split_banner)
    print("Harmful responses:")
    print(harmful_preview.head())
    print(f"\nTotal harmful responses: {len(harmful_preview)}")

    print("\nRefusal responses:")
    print(refusal_preview.head())
    print(f"\nTotal refusal responses: {len(refusal_preview)}")

In [None]:
# Show the column names of the processed harmful frame (cell output).
harmful_df.columns

In [None]:
# Rename the extracted text columns to instruction/output on all four frames.
# The rename is done in place so the existing variable names keep pointing
# at the renamed frames.
export_column_names = {'llm_answer': 'output', 'question': 'instruction'}
for processed_frame in (harmful_df, refusal_df, harmful_test_df, refusal_test_df):
    processed_frame.rename(columns=export_column_names, inplace=True)

In [None]:
# Split and save the refusal data.
# 500 rows of the test-split frame are held out for evaluation; every other
# test-split row is appended to the train-split frame.
refusal_sample_size, refusal_seed = 500, 42

refusal_test_samples = refusal_test_df.sample(n=refusal_sample_size, random_state=refusal_seed)
remaining_refusal_test = refusal_test_df.drop(index=refusal_test_samples.index)
refusal_all = pd.concat([refusal_df, remaining_refusal_test], ignore_index=True)

# Fixed-size subset drawn from the combined training pool.
refusal_train_samples = refusal_all.sample(n=refusal_sample_size, random_state=refusal_seed)

# Write order: full train, sampled train, held-out test.
# (The harmful train file is written by the harmful export cell.)
for refusal_export, refusal_path in (
    (refusal_all, "processed_sorry_bench_refusal_train.json"),
    (refusal_train_samples, "processed_sorry_bench_refusal_train_500.json"),
    (refusal_test_samples, "processed_sorry_bench_refusal_test.json"),
):
    refusal_export.to_json(refusal_path, orient='records', indent=2)

In [None]:
# Split and save the harmful data: hold out 500 test-split rows, append the
# remaining test-split rows to the train-split frame.
test_samples = harmful_test_df.sample(n=500, random_state=42)
held_out_labels = test_samples.index
remaining_test = harmful_test_df.drop(index=held_out_labels)
harmful_all = pd.concat([harmful_df, remaining_test], ignore_index=True)

# Write order: held-out test file first, then the combined train file.
for harmful_export, harmful_path in (
    (test_samples, "processed_sorry_bench_harmful_test.json"),
    (harmful_all, "processed_sorry_bench_harmful_train.json"),
):
    harmful_export.to_json(harmful_path, orient='records', indent=2)

In [None]:
# Row-count summary of every exported frame, emitted as one block of text.
summary_lines = [
    "All datasets saved successfully!",
    "\nTRAIN:",
    f"  Harmful: {len(harmful_all)} rows",
    f"  Refusal (full): {len(refusal_all)} rows",
    f"  Refusal (sampled): {len(refusal_train_samples)} rows",
    "\nTEST:",
    f"  Harmful: {len(test_samples)} rows",
    f"  Refusal: {len(refusal_test_samples)} rows",
]
print("\n".join(summary_lines))