# In [1]:
from datasets import load_dataset


def load_stereoset(split: str = "dev", config: str = "intrasentence"):
    """
    Loads the StereoSet dataset for evaluation.

    Args:
        split (str): Dataset split to load. "dev" is accepted as an alias
            for the Hub split name "validation"; any other value is passed
            through to `load_dataset` unchanged.
        config (str): Dataset configuration (task variant) to load,
            e.g. "intrasentence" or "intersentence".

    Returns:
        HuggingFace Dataset object
    """
    # NOTE(review): the Hub copy of StereoSet appears to require a config
    # name and to publish only a "validation" split -- confirm against the
    # dataset card. Without the alias, the default split="dev" cannot load.
    split_aliases = {"dev": "validation"}
    resolved_split = split_aliases.get(split, split)

    return load_dataset("McGill-NLP/stereoset", config, split=resolved_split)


def get_example(dataset, idx: int):
    """
    Extracts a single StereoSet example in a clean format.

    Args:
        dataset: StereoSet dataset (anything indexable by `idx` that yields
            a mapping with "context", "sentences", "bias_type", "target")
        idx (int): index of example

    Returns:
        dict with keys:
            - context
            - sentences (list of candidate sentence strings)
            - labels (top-level "labels" if the row has one, otherwise the
              per-sentence "gold_label" list nested under "sentences")
            - bias_type
            - target

    Raises:
        KeyError: if a required field is missing from the row.
    """
    ex = dataset[idx]
    sentence_info = ex["sentences"]

    # Rows from the Hub schema nest annotations under "sentences" and have no
    # top-level "labels" column; honor a top-level column only if it exists so
    # pre-flattened datasets keep working.
    if "labels" in ex:
        labels = ex["labels"]
    else:
        labels = sentence_info["gold_label"]

    return {
        "context": ex["context"],
        "sentences": sentence_info["sentence"],
        "labels": labels,              # not for scoring
        "bias_type": ex["bias_type"],
        "target": ex["target"],
    }
