In [None]:
from __future__ import annotations
import json
import os
import re
from tqdm import tqdm
from pathlib import Path
from datasets import Dataset, DatasetDict, Features, Value, load_dataset
from huggingface_hub import HfApi, HfFolder

In [None]:

"""
Builds the students' preference dataset intended for the 🤗 Hub repo `derko83/m3_dpo` (this cell only builds it; it does not upload anything).
"""
# Local JSON export holding the students' preference pairs.
RAW_FILE: Path = Path("m1_preference_data.json")
# Name of the target dataset repository on the Hugging Face Hub.
HF_REPO: str = "derko83/m3_dpo"
# Seed handed to the dataset shuffle.
SEED: int = 42
# Hub access token taken from the environment; empty string when unset.
HF_TOKEN: str = os.environ.get("HF_TOKEN", "")


def _normalise(text: str) -> str:
    """Collapse every run of whitespace in *text* into a single space.

    Line breaks count as whitespace, so multi-line input comes back as one
    line without leading or trailing blanks. Falsy input (``None`` or ``""``)
    yields the empty string.
    """
    if not text:
        return ""
    # ``str.split()`` without a separator splits on any whitespace run and
    # drops empty pieces at both ends, which is exactly the collapse + trim
    # wanted here.
    return " ".join(text.split())


def build_prompt(item: dict) -> str:
    """Build the prompt text for one raw question record.

    The normalised ``question_body`` is the stem. When ``question_type`` is
    ``"mcq"`` (case-insensitive) and ``question_options`` is a non-empty list,
    an ``Options:`` block is appended with one option per line, numbered from 0.

    Args:
        item: Raw question record; the keys read are ``question_body``,
            ``question_type`` and ``question_options``.

    Returns:
        The prompt string (just the stem when no option block applies).
    """
    stem = _normalise(item.get("question_body", ""))

    # ``or ""`` covers an explicit ``null`` in the JSON, which the ``.get``
    # default alone does not; ``str`` keeps ``.lower()`` safe for other types.
    question_type = str(item.get("question_type") or "").lower()
    options = item.get("question_options") or []

    if question_type != "mcq" or not options:
        return stem

    opt_block = "\n".join(f"{i}. {opt}" for i, opt in enumerate(options))
    return f"{stem}\n\nOptions:\n{opt_block}"


def main() -> None:
    """Build the de-duplicated, shuffled students' preference dataset.

    Reads ``RAW_FILE``, turns every usable preference annotation into a
    ``prompt`` / ``chosen`` / ``rejected`` record, drops exact duplicates,
    wraps the shuffled result in a single-split ``DatasetDict`` and prints its
    size. Nothing is uploaded from this function.

    Raises:
        RuntimeError: If no valid preference pair could be extracted.
    """
    with RAW_FILE.open(encoding="utf-8") as f:
        raw_items: list[dict] = json.load(f)

    # Keys of an insertion-ordered dict act as an ordered set. De-duplicating
    # through a plain ``set`` would leave the row order dependent on string
    # hash randomisation, which makes the seeded shuffle below irreproducible.
    unique_triples: dict[tuple[str, str, str], None] = {}

    for item in raw_items:
        prompt = build_prompt(item)
        if not prompt:
            continue

        for pref in item.get("preferences") or []:
            ans_a = _normalise(pref.get("A", ""))
            ans_b = _normalise(pref.get("B", ""))
            # ``or {}`` also covers an explicit ``null`` ranking_criteria.
            criteria = pref.get("ranking_criteria") or {}
            overall = _normalise(criteria.get("overall", "")).upper()

            # Skip annotations without a clear winner or without two distinct,
            # non-empty answers.
            if overall not in {"A", "B"} or not ans_a or not ans_b or ans_a == ans_b:
                continue

            chosen, rejected = (ans_a, ans_b) if overall == "A" else (ans_b, ans_a)
            unique_triples[(prompt, chosen, rejected)] = None

    if not unique_triples:
        raise RuntimeError("No valid preference pairs produced – aborting.")

    pairs: list[dict[str, str]] = [
        dict(zip(("prompt", "chosen", "rejected"), triplet))
        for triplet in unique_triples
    ]

    features = Features(
        {
            "prompt":   Value("string"),
            "chosen":   Value("string"),
            "rejected": Value("string"),
        }
    )
    full_ds = Dataset.from_list(pairs, features=features).shuffle(seed=SEED)
    dsd: DatasetDict = DatasetDict({"train": full_ds})

    print(f"📝  Final size: {len(dsd['train'])} train")


if __name__ == "__main__":
    main()

In [None]:
def pre_process_HelpSteer3():
    """Preprocess the HelpSteer3 dataset for DPO training.

    Keeps only the ``stem`` and ``code`` domains, selects the preferred
    response from ``overall_preference`` and flattens the conversation
    ``context`` into a single prompt string.

    Returns:
        A list of dicts with the keys ``id``, ``dataset``, ``prompt``,
        ``chosen`` and ``rejected``. ``id`` carries the row's position in the
        filtered dataset, so ids have gaps where tied rows were skipped.
    """
    dataset = load_dataset("nvidia/HelpSteer3", split="train")
    stem_data = dataset.filter(lambda x: x["domain"] in ("stem", "code"))
    processed = []
    for idx, item in tqdm(enumerate(stem_data), total=len(stem_data), desc="Processing HelpSteer3"):
        preference = item["overall_preference"]

        # Per the HelpSteer3 dataset card the score runs from -3 (response1 is
        # much better) to 3 (response2 is much better); 0 means "about the
        # same". NOTE(review): confirm against the card if the schema changes.
        if preference == 0:
            # A tie carries no preference signal, so it cannot form a DPO pair.
            continue
        if preference < 0:
            chosen, rejected = item["response1"], item["response2"]
        else:
            chosen, rejected = item["response2"], item["response1"]

        # Start from an empty prompt on every row so that a missing or
        # non-list context can never reuse the previous row's prompt.
        prompt = ""
        context = item.get("context")
        if isinstance(context, list):
            # Format the context into a prompt string, one "role: content"
            # paragraph per turn.
            prompt = "".join(
                f"{turn.get('role', '')}: {turn.get('content', '')}\n\n"
                for turn in context
            )

        processed.append({
            "id": f"dpo_dataset_helpsteer3_{idx}",
            "dataset": "helpsteer3",
            "prompt": prompt,
            "chosen": chosen,
            "rejected": rejected
        })
    return processed

def pre_processs_10K_step_dpo():
    """Preprocess the Math-Step-DPO-10K dataset for DPO training.

    Loads the ``train`` split and maps every row to the shared record layout,
    using the ``full_chosen`` / ``full_rejected`` columns as the answer pair.

    Returns:
        A list of dicts with the keys ``id``, ``dataset``, ``prompt``,
        ``chosen`` and ``rejected``.
    """
    rows = load_dataset("xinlai/Math-Step-DPO-10K", split="train")
    progress = tqdm(enumerate(rows), total=len(rows), desc="Processing 10K-step-dpo")
    return [
        {
            "id": f"dpo_dataset_10K_step_dpo_{position}",
            "dataset": "10K-step-dpo",
            "prompt": row["prompt"],
            "chosen": row["full_chosen"],
            "rejected": row["full_rejected"],
        }
        for position, row in progress
    ]

def pre_process_students_pairs():
    """Preprocess the students' preference pairs for DPO training.

    Loads the ``train`` split of ``derko83/m3_dpo`` from the Hub and tags every
    row with an id and a source-dataset label.

    Returns:
        A list of dicts with the keys ``id``, ``dataset``, ``prompt``,
        ``chosen`` and ``rejected``.
    """
    rows = load_dataset("derko83/m3_dpo", split="train")
    progress = tqdm(enumerate(rows), total=len(rows), desc="Processing pair students")
    return [
        {
            "id": f"pref_pairs_students_{position}",
            "dataset": "students_preference_pairs",
            "prompt": row["prompt"],
            "chosen": row["chosen"],
            "rejected": row["rejected"],
        }
        for position, row in progress
    ]

def pre_process_distilled_math():
    """Preprocess the Distilabel Math Preference DPO dataset.

    Loads the ``train`` split and maps ``instruction`` / ``chosen_response`` /
    ``rejected_response`` onto the shared record layout.

    Returns:
        A list of dicts with the keys ``id``, ``dataset``, ``prompt``,
        ``chosen`` and ``rejected``.
    """
    dataset = load_dataset("argilla/distilabel-math-preference-dpo", split="train")
    processed = []
    # The progress label names this dataset; it previously reused the
    # "10K-step-dpo" label from the sibling loader.
    for idx, item in tqdm(
        enumerate(dataset),
        total=len(dataset),
        desc="Processing distilabel-math-preference-dpo",
    ):
        processed.append({
            # The id format (no separator before the index) is kept as-is so
            # that already-published ids stay stable.
            "id": f"distilabel-math-preference-dpo{idx}",
            "dataset": "distilabel-math-preference-dpo",
            "prompt": item["instruction"],
            "chosen": item["chosen_response"],
            "rejected": item["rejected_response"]
        })
    return processed



def main():
    """Merge all preprocessed sources into ``full_m3_dpo_dataset.json``.

    The sources are concatenated in a fixed order: students' pairs,
    HelpSteer3, Math-Step-DPO-10K, then the Distilabel math preferences.
    """
    loaders = (
        pre_process_students_pairs,
        pre_process_HelpSteer3,
        pre_processs_10K_step_dpo,
        pre_process_distilled_math,
    )

    merged = []
    for load in loaders:
        merged += load()

    with open("full_m3_dpo_dataset.json", "w") as out_file:
        json.dump(merged, out_file, indent=2)
    print("✅ full_m3_dpo_dataset.json has been saved.")


if __name__ == "__main__":
    main()

In [None]:
# Merged preference records written by the previous cell.
INPUT_JSON: str = "full_m3_dpo_dataset.json"
# Hub repository that receives the train/validation splits.
HF_REPO: str = "derko83/full_m3_dpo"
# Seed used for both the shuffle and the train/validation split.
SEED: int = 42
# Fraction of rows held out as the validation split.
VALID_FRAC: float = 0.05

def main():
    """Split the merged dataset into train/validation and push it to the Hub.

    Loads ``INPUT_JSON``, builds a typed ``Dataset``, shuffles it with
    ``SEED``, holds out ``VALID_FRAC`` of the rows as the ``validation`` split
    and uploads both splits to ``HF_REPO`` as a private dataset.
    """
    # Load data from JSON
    with open(INPUT_JSON, encoding="utf-8") as f:
        data = json.load(f)

    features = Features({
        "id":       Value("string"),
        "dataset":  Value("string"),
        "prompt":   Value("string"),
        "chosen":   Value("string"),
        "rejected": Value("string"),
    })

    dataset = Dataset.from_list(data, features=features).shuffle(seed=SEED)

    # Train/Validation split
    dsd = dataset.train_test_split(test_size=VALID_FRAC, seed=SEED)
    dsd["validation"] = dsd.pop("test")  # rename for consistency

    print(f"📝 Final size: {len(dsd['train'])} train • {len(dsd['validation'])} validation")

    # Push to Hugging Face Hub. HF_TOKEN is "" when the environment variable
    # is unset; pass None in that case so the library can fall back to the
    # locally stored login instead of sending an empty token.
    dsd.push_to_hub(HF_REPO, private=True, token=HF_TOKEN or None)


if __name__ == "__main__":
    main()


In [None]:
def pre_process_reward_bench():
    """Create the test set from the reward-bench dataset.

    Loads the ``raw`` split, keeps only the ``hep-*`` code subsets and
    ``math-prm``, and maps every remaining row to the shared record layout.

    Returns:
        A list of dicts with the keys ``id``, ``dataset``, ``prompt``,
        ``chosen`` and ``rejected``.
    """
    wanted_subsets = (
        "hep-cpp",
        "hep-go",
        "hep-java",
        "hep-js",
        "hep-python",
        "hep-rust",
        "math-prm",
    )

    rows = load_dataset("allenai/reward-bench", split="raw")
    rows = rows.filter(lambda ex: ex["subset"] in wanted_subsets)

    progress = tqdm(enumerate(rows), total=len(rows), desc="Processing reward bench")
    return [
        {
            "id": f"reward-bench{position}",
            "dataset": "reward-bench",
            "prompt": row["prompt"],
            "chosen": row["chosen"],
            "rejected": row["rejected"],
        }
        for position, row in progress
    ]

def main():
    """Write the processed reward-bench records to ``reward_bench_processed.json``."""
    records = list(pre_process_reward_bench())
    with open("reward_bench_processed.json", "w") as out_file:
        json.dump(records, out_file, indent=2)
    print("✅ reward_bench_processed.json has been saved.")


if __name__ == "__main__":
    main()

In [None]:
# Dedicated Hub repo for the processed reward-bench data. HF_REPO must not be
# reused here: it was last set to "derko83/full_m3_dpo", so pushing there would
# overwrite that repo's "train" split, and the final assembly step reads the
# test data from "derko83/reward_bench_processed".
REWARD_BENCH_REPO = "derko83/reward_bench_processed"

# Load the processed JSON file
with open("reward_bench_processed.json", "r", encoding="utf-8") as f:
    data = json.load(f)

# Create a Hugging Face Dataset from the list
ds = Dataset.from_list(data)

# Push the dataset to its own Hugging Face repo
ds.push_to_hub(repo_id=REWARD_BENCH_REPO)

In [None]:
# Final version of the M3 dataset: the train and validation splits from
# "derko83/full_m3_dpo" plus a test split from "derko83/reward_bench_processed"
# are combined and pushed to "derko83/MNLP_M3_dpo_dataset".

# Token stored by a previous Hugging Face login; used for the final push.
token = HfFolder.get_token()
# NOTE(review): `api` is not used anywhere in the visible cells.
api = HfApi()

# Train/validation splits produced by the earlier split-and-push step.
full = load_dataset("derko83/full_m3_dpo")
train_ds = full["train"]
val_ds   = full["validation"]

# The processed reward-bench records are read from that repo's "train" split
# and serve as the test set here.
test_ds = load_dataset("derko83/reward_bench_processed", split="train")

# Assemble the three splits under their final names.
new_ds = DatasetDict({
    "train":      train_ds,
    "validation": val_ds,
    "test":       test_ds
})

repo_id = "derko83/MNLP_M3_dpo_dataset"
new_ds.push_to_hub(repo_id, token=token)

print(f"✅ Successfully pushed dataset to https://huggingface.co/{repo_id}")