In [4]:
from datasets import load_dataset, Dataset
from pathlib import Path

# Which subsets to extract. Names are written in lowercase because
# build_subset() compares against the stripped, lowercased `subset` column;
# spaces are kept as-is (e.g. "precise if", not "precise_if").
# NOTE(review): per the cell outputs these five subsets account for 1763 of
# the 1865 test rows; the remaining rows belong to subset(s) not listed
# here — confirm that omission is intended.
SUBSETS = ["factuality", "precise if", "math", "safety", "focus"]

import json
from datasets import Dataset, Features, Value

def _to_str(x):
    if x is None:
        return ""
    # list → take first usable string-like value
    if isinstance(x, list):
        if not x:
            return ""
        x = x[0]
    # dict → look for common text keys
    if isinstance(x, dict):
        for k in ("text", "content", "value", "output", "response"):
            v = x.get(k)
            if isinstance(v, str):
                return v
        # fallback: serialize
        return json.dumps(x, ensure_ascii=False)
    # everything else → string
    return str(x)

def build_subset(ds, subset_name: str) -> Dataset:
    """Extract one subset of ``ds`` as a three-column string dataset.

    Args:
        ds: Dataset whose rows are read via ``.get`` for the fields
            ``subset``, ``prompt``, ``chosen`` and ``rejected``.
        subset_name: Subset label to keep. Matching ignores case and
            surrounding whitespace on both the label and the column value.

    Returns:
        Dataset with only the string columns ``prompt``, ``chosen`` and
        ``rejected``; rows where any of the three is empty are dropped.
    """
    # Normalize the requested name exactly like the column value below, so
    # "Precise IF" and "precise if" select the same rows. Previously a
    # non-lowercase name silently matched nothing.
    wanted = str(subset_name).strip().lower()

    # 1) filter by subset
    fds = ds.filter(lambda x: str(x.get("subset", "")).strip().lower() == wanted)

    # 2) keep only the three columns, coerced to strings uniformly
    keep_cols = ["prompt", "chosen", "rejected"]
    remove_cols = [c for c in fds.column_names if c not in keep_cols]

    fds = fds.map(
        lambda ex: {
            "prompt":  _to_str(ex.get("prompt", "")),
            "chosen":  _to_str(ex.get("chosen", "")),    # may be a list in some rows → string
            "rejected": _to_str(ex.get("rejected", "")), # handled the same way, for symmetry
        },
        remove_columns=remove_cols,
    )

    # 3) enforce schema (optional but nice)
    fds = fds.cast(Features({"prompt": Value("string"),
                             "chosen": Value("string"),
                             "rejected": Value("string")}))
    # 4) drop empties
    fds = fds.filter(lambda ex: bool(ex["prompt"]) and bool(ex["chosen"]) and bool(ex["rejected"]))
    return fds


def main(out_dir="reward_bench2_clean"):
    """Write one ``<subset>.jsonl`` file per entry of SUBSETS, plus a manifest.

    Args:
        out_dir: Directory that receives the per-subset files and
            ``manifest.json``; created (with parents) if missing.

    Side effects:
        Loads the ``test`` split of ``allenai/reward-bench-2``, writes the
        files described above and prints the manifest.
    """
    import json

    target_dir = Path(out_dir)
    target_dir.mkdir(parents=True, exist_ok=True)

    # load_dataset yields a DatasetDict; only its 'test' split is used.
    test_split = load_dataset("allenai/reward-bench-2")["test"]

    manifest = {}
    for name in SUBSETS:
        cleaned = build_subset(test_split, name)
        jsonl_path = target_dir / f"{name}.jsonl"
        cleaned.to_json(str(jsonl_path))
        manifest[name] = {
            "subset": name,
            "rows_written": len(cleaned),
            "path": str(jsonl_path),
        }

    # Record what was written next to the data files.
    manifest_path = target_dir / "manifest.json"
    with manifest_path.open("w", encoding="utf-8") as fh:
        json.dump(manifest, fh, ensure_ascii=False, indent=2)

    print("Done. Summary:", manifest)


if __name__ == "__main__":
    main()


Map:   0%|          | 0/475 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/475 [00:00<?, ? examples/s]

Filter:   0%|          | 0/475 [00:00<?, ? examples/s]

Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Map:   0%|          | 0/160 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/160 [00:00<?, ? examples/s]

Filter:   0%|          | 0/160 [00:00<?, ? examples/s]

Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Map:   0%|          | 0/183 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/183 [00:00<?, ? examples/s]

Filter:   0%|          | 0/183 [00:00<?, ? examples/s]

Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Map:   0%|          | 0/450 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/450 [00:00<?, ? examples/s]

Filter:   0%|          | 0/450 [00:00<?, ? examples/s]

Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Map:   0%|          | 0/495 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/495 [00:00<?, ? examples/s]

Filter:   0%|          | 0/495 [00:00<?, ? examples/s]

Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Done. Summary: {'factuality': {'subset': 'factuality', 'rows_written': 475, 'path': 'reward_bench2_clean/factuality.jsonl'}, 'precise if': {'subset': 'precise if', 'rows_written': 160, 'path': 'reward_bench2_clean/precise if.jsonl'}, 'math': {'subset': 'math', 'rows_written': 183, 'path': 'reward_bench2_clean/math.jsonl'}, 'safety': {'subset': 'safety', 'rows_written': 450, 'path': 'reward_bench2_clean/safety.jsonl'}, 'focus': {'subset': 'focus', 'rows_written': 495, 'path': 'reward_bench2_clean/focus.jsonl'}}


In [16]:
from datasets import load_dataset, Dataset
from pathlib import Path

def main(out_dir="reward_bench2_clean"):
    """Export every subset named in SUBSETS to its own JSONL file.

    For each subset, the rows returned by build_subset() are written to
    ``<out_dir>/<subset>.jsonl``; a ``manifest.json`` listing subset name,
    row count and path is written alongside, and the manifest is printed.

    NOTE(review): SUBSETS and build_subset are not defined in this cell, so
    they must already exist in the kernel when it runs.

    Args:
        out_dir: Output directory; created (with parents) if missing.
    """
    import json  # this cell has no top-level json import

    out_root = Path(out_dir)
    out_root.mkdir(parents=True, exist_ok=True)

    splits = load_dataset("allenai/reward-bench-2")  # DatasetDict; 'test' is the split used
    source = splits["test"]

    summary = {}
    for subset in SUBSETS:
        rows = build_subset(source, subset)
        destination = out_root / f"{subset}.jsonl"
        rows.to_json(str(destination))
        summary[subset] = {
            "subset": subset,
            "rows_written": len(rows),
            "path": str(destination),
        }

    # Small manifest describing the files just written.
    with (out_root / "manifest.json").open("w", encoding="utf-8") as handle:
        json.dump(summary, handle, ensure_ascii=False, indent=2)

    print("Done. Summary:", summary)


if __name__ == "__main__":
    main()

'Precise IF'

In [1]:
from datasets import load_dataset, Dataset
from pathlib import Path
import json
from datasets import Features, Value

def _to_str(x):
    if x is None:
        return ""
    if isinstance(x, list):
        if not x:
            return ""
        x = x[0]  # take first if list
    if isinstance(x, dict):
        for k in ("text", "content", "value", "output", "response"):
            v = x.get(k)
            if isinstance(v, str):
                return v
        return json.dumps(x, ensure_ascii=False)
    return str(x)

def build_all(ds: Dataset) -> Dataset:
    """Reduce ``ds`` to string-typed ``prompt``/``chosen``/``rejected`` columns.

    Every other column is removed, the three kept fields are passed through
    ``_to_str``, the result is cast to an all-string schema, and rows with
    any empty field are filtered out.

    Args:
        ds: Source dataset.

    Returns:
        The cleaned dataset.
    """
    wanted = ("prompt", "chosen", "rejected")

    def _coerce_row(row):
        # Missing fields default to "" before coercion.
        return {name: _to_str(row.get(name, "")) for name in wanted}

    def _all_present(row):
        return all(bool(row[name]) for name in wanted)

    extra_cols = [col for col in ds.column_names if col not in wanted]
    ds = ds.map(_coerce_row, remove_columns=extra_cols)

    # Pin the schema to plain strings, then discard incomplete rows.
    schema = Features({name: Value("string") for name in wanted})
    ds = ds.cast(schema)
    return ds.filter(_all_present)

def main(out_dir="reward_bench2_clean", out_name="reward_bench2_all.jsonl"):
    """Clean the whole ``test`` split and write it to a single file.

    Args:
        out_dir: Output directory; created (with parents) if missing.
        out_name: File name, inside ``out_dir``, for the exported rows.

    Side effects:
        Loads ``allenai/reward-bench-2``, writes ``<out_dir>/<out_name>``
        via ``to_json`` and prints the number of rows written.
    """
    target_dir = Path(out_dir)
    target_dir.mkdir(parents=True, exist_ok=True)

    # load_dataset yields a DatasetDict; only its 'test' split is used.
    test_split = load_dataset("allenai/reward-bench-2")["test"]

    cleaned = build_all(test_split)
    destination = target_dir / out_name
    cleaned.to_json(str(destination))

    print(f"Done. Wrote {len(cleaned)} rows to {destination}")


if __name__ == "__main__":
    main()


README.md: 0.00B [00:00, ?B/s]

data/test-00000-of-00001.parquet:   0%|          | 0.00/6.97M [00:00<?, ?B/s]

Generating test split:   0%|          | 0/1865 [00:00<?, ? examples/s]

Map:   0%|          | 0/1865 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/1865 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1865 [00:00<?, ? examples/s]

Creating json from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

Done. Wrote 1865 rows to reward_bench2_clean/reward_bench2_all.jsonl


In [3]:
# NOTE(review): in the cells shown, `dsd` is only ever assigned as a local
# inside main(), so this cell presumably relies on a kernel-level `dsd`
# created elsewhere — confirm it survives Restart & Run All.
dsd

DatasetDict({
    test: Dataset({
        features: ['id', 'prompt', 'chosen', 'rejected', 'num_correct', 'num_incorrect', 'total_completions', 'models', 'subset', 'additional_metadata'],
        num_rows: 1865
    })
})