先在 validation set 上选出一部分（300 个）样本，然后再送到 SuperGLUE 里进行测试，比较不同温度下的结果。

In [1]:
from datasets import load_dataset
import requests


# SuperGLUE task names evaluated in this notebook.
tasks = [
    "boolq",
    "cb",
    "copa",
    "multirc",
    "record",
    "rte",
    "wic",
    "wsc",
]
# Load every task once and keep the result keyed by task name.
datasets = dict((name, load_dataset("super_glue", name)) for name in tasks)

# Chat-completions endpoint the prompts are sent to.
server_url = "http://0.0.0.0:8000/v1/chat/completions"
# Name of the dataset split that is evaluated.
dataset_label = "test"

# The "test" split is run without a cap (None); any other split is limited
# to 300 examples per task.
num_examples = None if dataset_label == "test" else 300
# Suffix that identifies the model and split in the results file name.
output_name = f"Llama-3.2-1B-Instruct_{dataset_label}"

for task, dataset in datasets.items():
    print(f"Task: {task}")

    # Show the splits/features of the task and one raw training example.
    print(dataset)
    print(dataset["train"][0])

    # The labels are read from the validation split, so the messages name
    # that split (they previously said "test set").
    validation_split = dataset["validation"]
    if "label" in validation_split[0]:
        labels = {example["label"] for example in validation_split}
        print(f"Unique labels in the validation set: {labels}")
    else:
        print("No 'label' field in the validation set.")

# Generation parameters that are merged into every chat-completion request.
options = dict(
    temperature=1.0,
    max_tokens=1024,
    top_p=0.9,
    frequency_penalty=0.0,
    seed=47,  # default seed
    n=1,
)


def generate_text_with_vllm(
    prompt,
    model_name="/home/snt/llm_models/Llama-3.2-1B-Instruct",
    options=None,
    server_url="http://0.0.0.0:8000/v1/chat/completions",
    timeout=None,
):
    """Send a single-turn chat request and return the text of the first choice.

    Args:
        prompt: Content of the user message.
        model_name: Value sent as the ``model`` field of the request.
        options: Optional mapping of extra request fields (for example
            ``temperature``) merged into the payload. It is read, never
            modified; ``None`` means no extra fields.
        server_url: URL of the chat-completions endpoint.
        timeout: Optional timeout in seconds forwarded to ``requests.post``.
            The default ``None`` keeps the previous behaviour of waiting
            without a limit.

    Returns:
        The ``content`` string of the first choice in the response.

    Raises:
        RuntimeError: If the server answers with a status code other than 200.
    """
    headers = {"Content-Type": "application/json"}
    payload = {
        "model": model_name,
        "messages": [
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": prompt},
        ],
    }
    # Extra fields are applied last, so they may override the defaults above.
    if options:
        payload.update(options)

    response = requests.post(
        server_url, headers=headers, json=payload, timeout=timeout
    )
    if response.status_code != 200:
        raise RuntimeError(f"Error: {response.status_code}, {response.text}")

    data = response.json()
    return data["choices"][0]["message"]["content"]

  from .autonotebook import tqdm as notebook_tqdm


Task: boolq
DatasetDict({
    train: Dataset({
        features: ['question', 'passage', 'idx', 'label'],
        num_rows: 9427
    })
    validation: Dataset({
        features: ['question', 'passage', 'idx', 'label'],
        num_rows: 3270
    })
    test: Dataset({
        features: ['question', 'passage', 'idx', 'label'],
        num_rows: 3245
    })
})
{'question': 'do iran and afghanistan speak the same language', 'passage': 'Persian language -- Persian (/ˈpɜːrʒən, -ʃən/), also known by its endonym Farsi (فارسی fārsi (fɒːɾˈsiː) ( listen)), is one of the Western Iranian languages within the Indo-Iranian branch of the Indo-European language family. It is primarily spoken in Iran, Afghanistan (officially known as Dari since 1958), and Tajikistan (officially known as Tajiki since the Soviet era), and some other regions which historically were Persianate societies and considered part of Greater Iran. It is written in the Persian alphabet, a modified variant of the Arabic script, wh

In [2]:
# BoolQ (Boolean Questions)
def format_boolq_prompt(example):
    """Build the BoolQ prompt from ``example['question']`` and ``example['passage']``.

    The prompt is the True/False instruction, the question, the passage and
    a final ':' line that cues the answer.
    """
    segments = [
        "Answer the following question based on the passage. Type 'True' or 'False'.",
        f"Question: {example['question']}",
        f"Passage: {example['passage']}",
        ":",
    ]
    return "\n".join(segments)


# CB (CommitmentBank)
def format_cb_prompt(example):
    """Build the CB prompt from ``example['premise']`` and ``example['hypothesis']``.

    The model is asked to answer with 'entailment', 'contradiction' or
    'neutral'.
    """
    premise = example["premise"]
    hypothesis = example["hypothesis"]
    segments = [
        "Answer the following question based on the Premise and Hypothesis. Type 'entailment', 'contradiction', or 'neutral'.",
        f"Premise: {premise}",
        f"Hypothesis: {hypothesis}",
        # The leading space is part of the existing prompt text.
        " Does the premise entail the hypothesis? (entailment, contradiction, or neutral):",
    ]
    return "\n".join(segments)


# COPA (Choice of Plausible Alternatives)
def format_copa_prompt(example):
    """Build the COPA prompt from the premise, question and two choices.

    Reads ``example['premise']``, ``example['question']``,
    ``example['choice1']`` and ``example['choice2']``. The model is asked to
    answer with 'choice 1' or 'choice 2'.
    """
    # The instruction now ends with a newline; without it the text ran
    # straight into the premise ("...result of the premise.Premise: ...").
    base_instruction = (
        "Answer the following question based on the Premise and Question. Type 'choice 1' or 'choice 2'.\n"
        ' If the question is "What was the cause?" select the option most likely to explain why the premise happened.'
        ' If the question is "What happened as a result?" select the option most likely to occur as a result of the premise.\n'
    )
    return f"{base_instruction}Premise: {example['premise']}\nQuestion: {example['question']}\nChoice 1: {example['choice1']}\nChoice 2: {example['choice2']}\n Which choice is more plausible? "


# MultiRC (Multi-Sentence Reading Comprehension)
def format_multirc_prompt(example):
    """Build the MultiRC prompt from the paragraph, question and candidate answer.

    Reads ``example['paragraph']``, ``example['question']`` and
    ``example['answer']``; the model is asked whether the answer is correct
    (True/False).
    """
    template = (
        "Based on the provided (Paragraph, Question, Answer) pair, answer the question with 'True' or 'False'.\n"
        "Paragraph: {}\n"
        "Question: {}\n"
        "Answer: {}\n"
        "Is the answer correct? (True/False):"
    )
    return template.format(
        example["paragraph"], example["question"], example["answer"]
    )


# ReCoRD (Reading Comprehension with Commonsense Reasoning)
def format_record_prompt(example):
    """Build the ReCoRD prompt from the passage, query and candidate entities.

    Reads ``example['passage']``, ``example['query']`` and
    ``example['entities']`` (an iterable of strings, listed comma-separated
    in the prompt).
    """
    passage_line = f"Passage: {example['passage']}"
    query_line = f"Query: {example['query']}"
    entity_line = "Entities: " + ", ".join(example["entities"])
    return "\n".join(
        (
            "Based on the provided passage and entities, answer the question with the correct entity.",
            passage_line,
            query_line,
            entity_line,
            "Which entity best fills in the blank?",
        )
    )


# RTE (Recognizing Textual Entailment)
def format_rte_prompt(example):
    """Build the RTE prompt from ``example['premise']`` and ``example['hypothesis']``.

    The model is asked to answer 'Yes' or 'No' to whether the premise
    entails the hypothesis.
    """
    header = "Answer the following question based on the Premise and Hypothesis. Type 'Yes' or 'No'.\n"
    body = "Premise: {}\nHypothesis: {}\n".format(
        example["premise"], example["hypothesis"]
    )
    closing = "Does the premise entail the hypothesis? (Yes or No):"
    return header + body + closing


# WiC (Word-in-Context)
def format_wic_prompt(data):
    """Build the WiC prompt asking whether a word has the same meaning in two sentences.

    Reads ``data['word']``, ``data['sentence1']``, ``data['sentence2']`` and
    the character offsets ``start1``/``end1`` and ``start2``/``end2`` that
    locate the word in each sentence. The model is asked to answer Yes or No.
    """
    # Extract relevant information from the input data
    word = data["word"]
    sentence1 = data["sentence1"]
    sentence2 = data["sentence2"]
    start1, end1 = data["start1"], data["end1"]
    start2, end2 = data["start2"], data["end2"]

    # Extract the highlighted word in both sentences. The SuperGLUE WiC end
    # offsets are exclusive (e.g. start=31, end=36 selects "place"), so the
    # slice is [start:end]; the earlier "end + 1" also captured the character
    # that follows the word.
    sentence1_highlighted = sentence1[start1:end1]
    sentence2_highlighted = sentence2[start2:end2]

    # Generate the prompt text
    prompt = f"""
    Task: Determine whether the word "{word}" is used with the same meaning in both sentences below.

    Sentence 1: "{sentence1}"  
    Sentence 2: "{sentence2}"  

    The word appears in Sentence 1 as: "{sentence1_highlighted}"  
    The word appears in Sentence 2 as: "{sentence2_highlighted}"  

    Question: Is the word "{word}" used with the same meaning in both sentences?  Type (Yes, No)
    """
    return prompt


# WSC (Winograd Schema Challenge)
def format_wsc_prompt(data):
    """Build the WSC prompt asking whether two mentions refer to the same entity.

    Reads ``data['text']``, the two spans ``span1_text``/``span2_text`` and
    their positions ``span1_index``/``span2_index``. The model is asked to
    answer Yes or No.
    """
    text, first, second, first_idx, second_idx = (
        data[key]
        for key in ("text", "span1_text", "span2_text", "span1_index", "span2_index")
    )
    # Every prompt line carries the same four-space indent, including the
    # otherwise empty separator lines and the last line.
    pad = "    "
    rows = [
        "",
        pad + "Based on the provided text and pronoun, answer the question with the correct referent.\n",
        pad + "Text:  ",
        f"{pad}{text}",
        pad,
        f'{pad}The first mention of the entity is: "{first}" at index {first_idx}',
        f'{pad}The second mention of the entity is: "{second}" at index {second_idx}',
        pad,
        pad + "Task: Identify whether the second mention of the entity refers to the same entity as the first mention.",
        pad,
        f'{pad}Question: Does the second mention of the entity ("{second}") refer to the same entity as the first mention ("{first}") in the text?  Type (Yes, No)',
        pad,
    ]
    return "\n".join(rows)

In [None]:
# import openai

# file_path = "/home/snt/projects_lujun/temperature_eval_github/temperature_eval/.vscode/api_key.txt"

# # Read the API key from the file
# with open(file_path, "r") as file:
#     openai.api_key = file.read().strip()


# def call_openai_api(prompt):
#     response = openai.chat.completions.create(
#         model="gpt-4",
#         messages=[
#             {"role": "system", "content": "You are a helpful assistant."},
#             {"role": "user", "content": prompt},
#         ],
#         temperature=0.7,
#     )
#     return response.choices[0].message.content.strip()


def evaluate_prediction_with_conversion(task, example, prediction):
    """Return True when the model's free-text ``prediction`` matches the gold answer.

    The prediction is first converted to the task's label space and then
    compared with ``example['label']`` (or, for ReCoRD, with
    ``example['answers']``).

    Args:
        task: One of "boolq", "cb", "copa", "multirc", "record", "rte",
            "wic" or "wsc".
        example: Mapping holding the gold ``label`` (``answers`` for "record").
        prediction: Raw text returned by the model.

    Returns:
        bool: True if the converted prediction equals the gold answer. False
        if it differs, if the prediction is ambiguous or contains none of the
        expected keywords, or if ``task`` is unknown.
    """
    import re  # local import so the function does not depend on other cells

    def _single_label(text, keyword_to_label):
        """Return the label of the only keyword mentioned in ``text``, else None.

        Keywords are matched as whole words, so "no" is not found inside
        "not" or "know" and "true" is not found inside "untrue". Mentioning
        keywords of different labels, or none at all, yields None.
        """
        normalized = text.strip().lower()
        if normalized in keyword_to_label:
            return keyword_to_label[normalized]
        mentioned = {
            label
            for keyword, label in keyword_to_label.items()
            if re.search(rf"\b{re.escape(keyword)}\b", normalized)
        }
        return mentioned.pop() if len(mentioned) == 1 else None

    def bool_to_binary(value):
        """Map a True/False answer to 1/0 (None if unclear)."""
        return _single_label(value, {"true": 1, "false": 0})

    def yes_no_to_binary(value):
        """Map a Yes/No answer to 1/0 (None if unclear)."""
        return _single_label(value, {"yes": 1, "no": 0})

    def entailment_to_label(value):
        """Map entailment/contradiction/neutral to 0/1/2 (None if unclear)."""
        return _single_label(
            value, {"entailment": 0, "contradiction": 1, "neutral": 2}
        )

    def choice_to_binary(value):
        """Map 'choice 1'/'choice 2' to 0/1.

        An answer naming both choices is ambiguous and yields None, like the
        other converters (previously 'choice 2' won whenever it appeared).
        """
        return _single_label(value, {"choice 1": 0, "choice 2": 1})

    # A converter result of None never equals an integer label, so unclear
    # predictions are counted as wrong.
    if task in ("boolq", "multirc"):
        # True/False answer compared with the 0/1 label.
        return bool_to_binary(prediction) == int(example["label"])

    if task == "cb":
        # entailment/contradiction/neutral compared with the 0/1/2 label.
        return entailment_to_label(prediction) == int(example["label"])

    if task == "copa":
        # choice 1/choice 2 compared with the 0/1 label.
        return choice_to_binary(prediction) == int(example["label"])

    if task == "record":
        # Case-insensitive exact match against any of the gold answers.
        processed_answers = [answer.strip().lower() for answer in example["answers"]]
        return prediction.strip().lower() in processed_answers

    if task == "rte":
        # "Yes" is compared with 1 - label, i.e. the label is inverted here
        # (presumably label 0 denotes entailment - confirm against the dataset).
        return yes_no_to_binary(prediction) == (1 - int(example["label"]))

    if task in ("wic", "wsc"):
        # Yes/No answer compared with the 0/1 label.
        return yes_no_to_binary(prediction) == int(example["label"])

    # Default case: unknown task
    return False


import pandas as pd
import json
from tqdm import tqdm


# Map each task name to the function that renders its prompt.
prompt_formatters = {
    "boolq": format_boolq_prompt,
    "cb": format_cb_prompt,
    "copa": format_copa_prompt,
    "multirc": format_multirc_prompt,
    "record": format_record_prompt,
    "rte": format_rte_prompt,
    "wic": format_wic_prompt,
    "wsc": format_wsc_prompt,
}

# Each example's result is appended to this JSON Lines file as soon as it is
# produced, so an interrupted run keeps the rows written so far.
results_path = f"/home/snt/projects_lujun/temperature_eval_github/temperature_eval/data/Bert/superGlue/results_test_{output_name}.jsonl"

# The temperature is the same for every request of this run, so it is set
# once instead of once per example.
options["temperature"] = 0.1

for task, dataset in datasets.items():
    print(f"Testing task: {task}")
    # Fail immediately on a task without a formatter; an if/elif chain with
    # no fallback would reuse the previous prompt or leave it undefined.
    format_prompt = prompt_formatters[task]

    split = dataset[dataset_label]
    limit = len(split) if num_examples is None else num_examples
    # The slice is used as a mapping of column name -> list of values;
    # turn it into one dict per example.
    columns = split[0:limit]
    column_names = list(columns.keys())
    test_data = [
        {key: columns[key][i] for key in column_names}
        for i in range(len(columns[column_names[0]]))
    ]

    for example in tqdm(test_data, desc=f"Examples for {task}", leave=False):
        prompt = format_prompt(example)
        prediction = generate_text_with_vllm(
            prompt,
            options=options,
            server_url=server_url,
        )

        # Predictions are only scored when the split is not "test".
        if dataset_label != "test":
            label = evaluate_prediction_with_conversion(task, example, prediction)
        else:
            label = None

        result_df = pd.DataFrame(
            {
                "task": task,
                "example": json.dumps(example),
                "prompt": prompt,
                "prediction": prediction,
                "label": label,
            },
            index=[0],
        )
        result_df.to_json(
            results_path,
            orient="records",
            lines=True,
            mode="a",
        )

Testing task: boolq


                                                                         

Testing task: cb


                                                                  

Testing task: copa


                                                                      

Testing task: multirc


Examples for multirc:   4%|▍         | 374/9693 [09:58<39:49,  3.90it/s]   

In [4]:
import pandas as pd

# Read the JSON Lines results file back into a DataFrame (one row per line).
test_results_path = "/home/snt/projects_lujun/temperature_eval_github/temperature_eval/data/Bert/superGlue/results_test_Llama-3.2-1B-Instruct_test.jsonl"
test_results = pd.read_json(test_results_path, lines=True)

In [6]:
# Display the shape of the loaded results as (rows, columns).
test_results.shape

(2196, 5)

In [11]:
from tqdm import tqdm
import time
from IPython.display import clear_output

# Scratch demo of nested tqdm progress bars.
# Simulate several folders, each containing the same list of files
folders = [f"Folder_{i}" for i in range(5)]
files_per_folder = [f"File_{j}" for j in range(10)]

# Outer tqdm bar iterates over the folders
for folder in tqdm(folders, desc="Processing Folders", position=0):
    time.sleep(0.5)  # simulate the delay of processing a folder
    # Inner tqdm bar iterates over the files of the current folder
    for file in tqdm(
        files_per_folder,
        desc=f"Processing Files in {folder}",
        position=1,
        leave=False,
        mininterval=0.05,
    ):
        time.sleep(0.5)  # simulate the delay of processing a file
        # NOTE(review): the cell output is cleared after every file, which
        # presumably also wipes the rendered progress bars - confirm intended.
        clear_output(wait=True)

Processing Folders:   0%|          | 0/5 [00:05<?, ?it/s]


KeyboardInterrupt: 