In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from datasets import load_dataset
from evaluate import load


# SuperGLUE task names, in the order they are evaluated.
tasks = [
    "boolq",
    "cb",
    "copa",
    "multirc",
    "record",
    "rte",
    "wic",
    "wsc",
]


# Preprocessing function: Format inputs based on task
def preprocess_example(example, task, tokenizer=None):
    """Build the text prompt for a single SuperGLUE example.

    Args:
        example: Mapping with the raw fields of one example. Which keys are
            read depends on ``task`` (see the branches below).
        task: Task name; one of ``"boolq"``, ``"cb"``, ``"copa"``,
            ``"multirc"``, ``"record"``, ``"rte"``, ``"wic"`` or ``"wsc"``.
        tokenizer: Optional and unused. It is accepted only so that callers
            passing a tokenizer as a third positional argument do not fail
            with ``TypeError``; no tokenization happens in this function.

    Returns:
        A one-element tuple ``(input_text,)`` holding the prompt string.
        The tuple shape is kept as-is for existing callers.

    Raises:
        ValueError: If ``task`` is not one of the supported task names.
        KeyError: If ``example`` is a dict that lacks a field the selected
            task reads.
    """
    if task == "boolq":  # BoolQ: Yes/No question answering
        context = example["passage"]
        question = example["question"]
        input_text = (
            f"Question: {question} Context: {context} Is this correct? (yes/no)"
        )

    elif task == "cb":  # CommitmentBank: Textual entailment
        premise = example["premise"]
        hypothesis = example["hypothesis"]
        input_text = f"Premise: {premise} Hypothesis: {hypothesis} Does this follow? (yes/no/unknown)"

    elif task == "copa":  # COPA: Causal reasoning
        premise = example["premise"]
        choice1 = example["choice1"]
        choice2 = example["choice2"]
        input_text = f"Premise: {premise} Question: {example['question']} Option 1: {choice1} Option 2: {choice2} Which is more plausible?"

    elif task == "multirc":  # MultiRC: Multi-choice QA
        question = example["question"]
        context = example["paragraph"]
        answer = example["answer"]
        input_text = f'Context: {context} Question: {question} Is the answer "{answer}" correct? (yes/no)'

    elif task == "record":  # ReCoRD: Cloze-style QA
        context = example["passage"]
        query = example["query"]
        input_text = (
            f"Context: {context} Question: {query} What should fill in the blank?"
        )

    elif task == "rte":  # Recognizing Textual Entailment
        premise = example["premise"]
        hypothesis = example["hypothesis"]
        input_text = (
            f"Premise: {premise} Hypothesis: {hypothesis} Is this true? (yes/no)"
        )

    elif task == "wic":  # WiC: Word-in-Context disambiguation
        word = example["word"]
        sentence1 = example["sentence1"]
        sentence2 = example["sentence2"]
        input_text = f"Word: {word} Sentence 1: {sentence1} Sentence 2: {sentence2} Does the word have the same meaning in both sentences? (yes/no)"

    elif task == "wsc":  # Winograd Schema Challenge
        # NOTE(review): only the sentence text is interpolated here; the
        # prompt does not name which pronoun or candidate entity is meant.
        context = example["text"]
        input_text = (
            f"Sentence: {context} Does the pronoun refer to the correct entity?"
        )

    else:
        raise ValueError(f"Unknown task: {task}")

    # Trailing comma is deliberate: the prompt is returned as a 1-tuple.
    return (input_text,)


def _notebook_global(name):
    """Return the notebook-level variable ``name``, or raise ``NameError``."""
    try:
        return globals()[name]
    except KeyError:
        raise NameError(
            f"name '{name}' is not defined; define it in the notebook or pass it to evaluate_task"
        ) from None


def evaluate_task(task, model=None, tokenizer=None):
    """Run the model over one SuperGLUE validation split and score it.

    Args:
        task: SuperGLUE task name, used both as the dataset configuration
            and to pick the metric and the reference format.
        model: Model handed to ``predict``. When ``None``, the notebook-level
            ``model`` variable is used.
        tokenizer: Tokenizer handed to ``predict``. When ``None``, the
            notebook-level ``tokenizer`` variable is used.

    Returns:
        The value returned by ``metric.compute`` for the collected
        predictions and references. It is also printed.

    Raises:
        NameError: If ``model`` or ``tokenizer`` is neither passed in nor
            defined at notebook level.
    """
    # Resolve these first so a missing model fails before any download.
    if model is None:
        model = _notebook_global("model")
    if tokenizer is None:
        tokenizer = _notebook_global("tokenizer")

    print(f"Loading task data: {task}")
    dataset = load_dataset("super_glue", task)
    # MultiRC is scored with F1; every other task with accuracy.
    metric = load("f1") if task == "multirc" else load("accuracy")

    predictions = []
    references = []

    for example in dataset["validation"]:  # Use validation set
        # preprocess_example takes (example, task); it does no tokenization.
        inputs = preprocess_example(example, task)
        predictions.append(predict(inputs, model, tokenizer, task))

        # NOTE(review): the reference type differs per task (int, list, str)
        # and must match both what `predict` returns and what the loaded
        # metric accepts -- confirm, in particular for "record", where the
        # reference is a list of answers scored with the accuracy metric.
        if task == "copa":
            references.append(example["label"])  # COPA labels are 0 or 1
        elif task == "record":
            references.append(example["answers"])  # ReCoRD labels are answer lists
        else:
            # boolq, cb, rte, wic, wsc and multirc: label converted to a string
            references.append(str(example["label"]))

    # Compute evaluation scores
    results = metric.compute(predictions=predictions, references=references)
    print(f"Results for task {task}: {results}")
    return results


# Main function: Evaluate all tasks
if __name__ == "__main__":
    for task in tasks:
        try:
            # evaluate_task is called with the task name only; the model and
            # tokenizer it uses are looked up as notebook-level variables.
            evaluate_task(task)
        except Exception as e:
            # Best effort: report the failure and continue with the next task.
            print(f"Error evaluating task {task}: {e}")

In [11]:
import torch
from datasets import load_dataset


# SuperGLUE task names.
tasks = [
    "boolq",
    "cb",
    "copa",
    "multirc",
    "record",
    "rte",
    "wic",
    "wsc",
]

# Load every split of the single task selected for inspection.
task = "record"
dataset = load_dataset("super_glue", task)

Downloading data: 100%|██████████| 51.8M/51.8M [00:00<00:00, 92.4MB/s]
Generating train split: 100%|██████████| 100730/100730 [00:16<00:00, 5940.57 examples/s]
Generating validation split: 100%|██████████| 10000/10000 [00:01<00:00, 5735.06 examples/s]
Generating test split: 100%|██████████| 10000/10000 [00:01<00:00, 6466.41 examples/s]


In [13]:
# Display the first example of the train split to inspect its fields.
dataset["train"][0]

{'passage': "The harrowing stories of women and children locked up for so-called 'moral crimes' in Afghanistan's notorious female prison have been revealed after cameras were allowed inside. Mariam has been in Badam Bagh prison for three months after she shot a man who just raped her at gunpoint and then turned the weapon on herself - but she has yet to been charged. Nuria has eight months left to serve of her sentence for trying to divorce her husband. She gave birth in prison to her son and they share a cell together. Scroll down for video Nuria was jailed for trying to divorce her husband. Her son is one of 62 children living at Badam Bagh prison\n@highlight\nMost of the 202 Badam Bagh inmates are jailed for so-called 'moral crimes'\n@highlight\nCrimes include leaving their husbands or refusing an arrange marriage\n@highlight\n62 children live there and share cells with their mothers and five others",
 'query': 'The baby she gave birth to is her husbands and he has even offered to h