In [1]:
# make_ultra_pref_jsonl.py
from datasets import load_dataset, Features, Value
import json
from pathlib import Path
from typing import Any, Dict, List, Tuple, Union

Msg = Dict[str, Any]
Conv = Union[List[Msg], Dict[str, Any], str]

def _as_messages(x: Conv) -> List[Msg]:
    """Coerce a conversation object into a list of {role, content} messages.

    Accepted shapes:
      * list                          -> returned unchanged
      * dict with a list "messages"   -> that inner list
      * dict with "role" + "content"  -> wrapped as a one-message list
      * str                           -> decoded as JSON when it holds a list,
                                         dict or string; any other text is
                                         treated as a single user message
    Anything else (None, numbers, unrecognised dicts, ...) yields [].
    """
    if isinstance(x, list):
        return x
    if isinstance(x, dict):
        # common pattern: {"messages": [...]}
        if "messages" in x and isinstance(x["messages"], list):
            return x["messages"]
        # fallback: treat dict as a single message if it looks like one
        if "role" in x and "content" in x:
            return [x]
        return []
    if isinstance(x, str):
        # sometimes the transcript arrives as serialized JSON
        try:
            parsed = json.loads(x)
        except (ValueError, RecursionError):
            # not JSON at all: a bare string is a single user message
            return [{"role": "user", "content": x}]
        if isinstance(parsed, (list, dict, str)):
            return _as_messages(parsed)
        # Text such as "42", "true" or "null" is valid JSON but carries no
        # transcript; keep it as plain user text instead of dropping it.
        return [{"role": "user", "content": x}]
    # last resort
    return []

def _content_to_text(c: Any) -> str:
    """Extract text from a message 'content' which might be str or list of segments."""
    if c is None:
        return ""
    if isinstance(c, str):
        return c
    if isinstance(c, list):
        # e.g., [{"type":"text","text":"..."}, {"type":"..."}] or list[str]
        parts = []
        for seg in c:
            if isinstance(seg, str):
                parts.append(seg)
            elif isinstance(seg, dict):
                # prefer typical keys
                for k in ("text", "content", "value"):
                    if isinstance(seg.get(k), str):
                        parts.append(seg[k])
                        break
        return "\n".join(parts)
    if isinstance(c, dict):
        # some providers put the text under "text" or "content"
        for k in ("text", "content", "value"):
            if isinstance(c.get(k), str):
                return c[k]
        return json.dumps(c, ensure_ascii=False)
    return str(c)

def extract_prompt_and_response(conv: Conv) -> Tuple[str, str]:
    """
    From a chat transcript, return (prompt, reply):
      prompt  = all non-empty user messages, in order, joined by blank lines
      reply   = the last non-empty assistant message ("" if there is none)

    Messages with any other role (system/tool/...) are ignored, and so are
    transcript entries that are not dicts.
    """
    user_chunks = []
    last_reply = ""
    for m in _as_messages(conv):
        if not isinstance(m, dict):
            # _as_messages hands lists back unchecked; skip malformed entries
            # instead of failing on .get()
            continue
        # str() keeps a non-string role from breaking .lower()
        role = str(m.get("role") or "").lower().strip()
        txt = _content_to_text(m.get("content"))
        if not txt:
            continue
        if role == "user":
            user_chunks.append(txt)
        elif role == "assistant":
            # only the most recent assistant message is kept as the response
            last_reply = txt
        # ignore system/tool/etc. for this export

    prompt = "\n\n".join(user_chunks).strip()
    return prompt, last_reply.strip()

def row_to_pref(example: Dict[str, Any]) -> Dict[str, str]:
    """
    Turn one dataset row into a flat preference record.

    The row's 'chosen' and 'rejected' values are chat transcripts. Each is
    reduced to (prompt, reply) by extract_prompt_and_response; the result has
    the string fields 'prompt', 'chosen' and 'rejected'.
    """
    chosen_prompt, chosen_reply = extract_prompt_and_response(example.get("chosen", {}))
    rejected_prompt, rejected_reply = extract_prompt_and_response(example.get("rejected", {}))

    # The chosen transcript's prompt wins; the rejected one is only a fallback.
    prompt = chosen_prompt or rejected_prompt

    return dict(
        prompt=prompt or "",
        chosen=chosen_reply or "",
        rejected=rejected_reply or "",
    )

def make_jsonl(split: str = "train", out_path: str = "ultra_pref.jsonl"):
    """
    Export one split of RLHFlow/UltraFeedback-preference-standard as
    prompt/chosen/rejected records.

    Steps: load the split, map every row through row_to_pref (dropping all
    other columns), cast the three fields to strings, drop rows where any
    field is empty, then write the result with Dataset.to_json and print the
    row count.

    split:    dataset split passed to load_dataset
    out_path: destination file; missing parent directories are created
    """
    fields = ("prompt", "chosen", "rejected")

    def _is_complete(row) -> bool:
        # a row is kept only when all three fields are non-empty
        return all(bool(row[name]) for name in fields)

    raw = load_dataset("RLHFlow/UltraFeedback-preference-standard", split=split)
    extra_columns = [name for name in raw.column_names if name not in ("chosen", "rejected")]
    pairs = raw.map(
        row_to_pref,
        remove_columns=extra_columns,
        desc="Extracting prompt/chosen/rejected",
    )

    # Enforce string schema & drop empties
    string_schema = Features({name: Value("string") for name in fields})
    pairs = pairs.cast(string_schema)
    pairs = pairs.filter(_is_complete, desc="Dropping empty rows")

    target = Path(out_path)
    target.parent.mkdir(parents=True, exist_ok=True)
    pairs.to_json(out_path)
    print(f"Wrote {len(pairs)} rows to {out_path}")

if __name__ == "__main__":
    # Export the train split. To produce other variants, call make_jsonl with a
    # different `split` and `out_path`.
    make_jsonl(split="train", out_path="ultra_pref_train.jsonl")


README.md: 0.00B [00:00, ?B/s]

data/train-00000-of-00003-e82ff6ed69653c(…):   0%|          | 0.00/70.8M [00:00<?, ?B/s]

data/train-00001-of-00003-0196dad938ced3(…):   0%|          | 0.00/67.7M [00:00<?, ?B/s]

data/train-00002-of-00003-33462d35a35bd4(…):   0%|          | 0.00/111M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/340025 [00:00<?, ? examples/s]

Extracting prompt/chosen/rejected:   0%|          | 0/340025 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/340025 [00:00<?, ? examples/s]

Dropping empty rows:   0%|          | 0/340025 [00:00<?, ? examples/s]

Creating json from Arrow format:   0%|          | 0/340 [00:00<?, ?ba/s]

Wrote 339477 rows to ultra_pref_train.jsonl


In [5]:
import pandas as pd

# Sanity check: load the exported file and show the first row's rejected reply.
df = pd.read_json("ultra_pref_train.jsonl", lines=True)
df["rejected"].iloc[0]


'int main() {\n    string country;\n    // prompt user for input\n    cout << "Enter the name of a country: ";\n    cin >> country;\n    // check if country borders the Mediterranean Sea\n    if (endsWith(country, "Mediterranean")) {\n        cout << "Yes, the country " << country\n             << " borders the Mediterranean Sea.";\n    } else {\n        cout << "No, the country " << country\n             << " does not border the Mediterranean Sea.";\n    }\n    return 0;\n}'