# In [None]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import re
import json
from datasets import load_dataset

def export_hh_rlhf_to_jsonl(
    split: str = "train",
    out_path: str = "hh_rlhf_parsed.jsonl",
) -> None:
    """Export a split of the Anthropic/hh-rlhf dataset to a JSON Lines file.

    Each written line is one JSON object of the form
    ``{"prompt": ..., "chosen": ..., "rejected": ...}`` where:

    - ``prompt`` is everything before the last newline-prefixed
      "Assistant:" tag in the example's ``"chosen"`` field;
    - ``chosen`` is everything after that tag in the ``"chosen"`` field;
    - ``rejected`` is everything after the last newline-prefixed
      "Assistant:" tag in the example's ``"rejected"`` field.

    All three values have surrounding whitespace stripped. An example is
    skipped when either field contains no such tag; skipped examples are
    counted and reported in the final summary line.

    Args:
        split: Name of the dataset split passed to ``load_dataset``.
        out_path: Path of the output file; it is overwritten if it exists.
    """
    # 1) Load the specified dataset split
    dataset = load_dataset("Anthropic/hh-rlhf", split=split)

    # The final assistant turn starts at the last occurrence of this tag.
    # str.rpartition locates that last occurrence directly, which is what the
    # greedy regex r'(?s)(.*)\nAssistant:\s*(.*)$' found via backtracking.
    assistant_tag = "\nAssistant:"

    written = 0
    skipped = 0

    # 2) Write each example as a JSON object on its own line
    with open(out_path, "w", encoding="utf-8") as fout:
        for example in dataset:
            prompt_text, chosen_tag, chosen_text = (
                example["chosen"].rpartition(assistant_tag)
            )
            _, rejected_tag, rejected_text = (
                example["rejected"].rpartition(assistant_tag)
            )

            # rpartition returns an empty separator when the tag is absent,
            # i.e. the text does not follow the expected format.
            if not chosen_tag or not rejected_tag:
                skipped += 1
                continue

            record = {
                "prompt": prompt_text.strip(),
                "chosen": chosen_text.strip(),
                "rejected": rejected_text.strip(),
            }
            fout.write(json.dumps(record, ensure_ascii=False) + "\n")
            written += 1

    # Report exact counts so silently dropped records are visible.
    print(
        f"✔️ Export complete: '{out_path}' "
        f"({len(dataset)} examples loaded, {written} written, {skipped} skipped)."
    )

if __name__ == "__main__":
    # Script entry point: export the "train" split to the default output file.
    export_hh_rlhf_to_jsonl(
        out_path="hh_rlhf_parsed.jsonl",
        split="train",
    )
