In [1]:
import openai

file_path = "/home/snt/projects_lujun/temperature_eval_github/temperature_eval/.vscode/api_key.txt"

# # Read the API key from the file
with open(file_path, "r") as file:
    openai.api_key = file.read().strip()


def call_openai_api(prompt):
    response = openai.chat.completions.create(
        model="gpt-4",
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": prompt},
        ],
        temperature=0.7,
    )
    return response.choices[0].message.content.strip()


def evaluate_prediction_with_conversion(task, example, prediction):
    # Helper functions for conversion
    def bool_to_binary(value):
        # Strip whitespace and convert to lowercase for consistency
        value = value.strip().lower()

        # Check if the string is exactly "true"
        if value == "true":
            return 1
        elif value == "false":
            return 0

        # Check if the string contains "true" but does not contain "false"
        elif "true" in value and "false" not in value:
            return 1

        # Check if the string contains "false" but does not contain "true"
        elif "false" in value and "true" not in value:
            return 0

        # If both "true" and "false" are present or neither is present, return None
        else:
            return None

    def yes_no_to_binary(value):
        # Strip whitespace and convert to lowercase for consistency
        value = value.strip().lower()

        # Check if the string is exactly "yes"
        if value == "yes":
            return 1
        elif value == "no":
            return 0

        # Check if the string contains "yes" but does not contain "no"
        elif "yes" in value and "no" not in value:
            return 1

        # Check if the string contains "no" but does not contain "yes"
        elif "no" in value and "yes" not in value:
            return 0

        # If both "yes" and "no" are present or neither is present, return None
        else:
            return None

    def entailment_to_label(value):
        # Define the mapping for entailment, contradiction, and neutral
        mapping = {"entailment": 0, "contradiction": 1, "neutral": 2}

        # Normalize the input by stripping whitespace and converting to lowercase
        value = value.strip().lower()

        # Check if the input matches exactly one of the keys in the mapping
        if value in mapping:
            return mapping[value]

        # Check if the input contains one of the keys without ambiguity
        elif (
            "entailment" in value
            and "contradiction" not in value
            and "neutral" not in value
        ):
            return mapping["entailment"]
        elif (
            "contradiction" in value
            and "entailment" not in value
            and "neutral" not in value
        ):
            return mapping["contradiction"]
        elif (
            "neutral" in value
            and "entailment" not in value
            and "contradiction" not in value
        ):
            return mapping["neutral"]

        # If the input is ambiguous or invalid, return -1
        else:
            return -1

    def choice_to_binary(value):
        # Normalize the input by stripping whitespace and converting to lowercase
        value = value.strip().lower()

        # Check if the input contains 'choice 1' and does not contain 'choice 2'
        if "choice 1" in value and "choice 2" not in value:
            return 0
        elif "choice 2" in value:
            return 1

        # If the input does not match any of the conditions, return None
        else:
            return None

    # Task-specific evaluation
    if task == "boolq":
        # Convert prediction (True/False) to binary and compare with label (0/1)
        return bool_to_binary(prediction) == int(example["label"])

    elif task == "cb":
        # Convert prediction (entailment/contradiction/neutral) to label (0/1/2)
        return entailment_to_label(prediction) == int(example["label"])

    elif task == "copa":
        # Convert prediction (choice1/choice2) to binary and compare with label (0/1)
        return choice_to_binary(prediction) == int(example["label"])

    elif task == "multirc":
        # Convert prediction (True/False) to binary and compare with label (0/1)
        return bool_to_binary(prediction) == int(example["label"])

    elif task == "record":
        # Direct comparison of prediction with the correct entity
        processed_answers = [answer.strip().lower() for answer in example["answers"]]
        for answer in processed_answers:
            if answer in prediction.strip().lower():
                return True
            else:
                return False

    elif task == "rte":
        # Convert prediction (Yes/No) to binary and compare with label (1/0)
        return yes_no_to_binary(prediction) == (1 - int(example["label"]))

    elif task == "wic":
        # Convert prediction (Yes/No) to binary and compare with label (0/1)
        return yes_no_to_binary(prediction) == int(example["label"])

    elif task == "wsc":
        # Convert prediction (True/False) to binary and compare with label (0/1)
        return yes_no_to_binary(prediction) == int(example["label"])

    # Default case: unknown task
    return False




In [27]:
import pandas as pd
import json
from tqdm import tqdm

df = pd.read_json("/home/snt/projects_lujun/temperature_eval_github/temperature_eval/data/Additional_Results/results_superglue_temperature_0.1/vllm_exp_dataset_csv_superGLUE_Mixtral-8x7B-Instruct-v0.1-awq_FUll__20250119_164943.jsonl", lines=True)    


In [28]:


results = []

tasks = df['task'].unique()
model_names = df['model_name'].unique()
# tasks = ["copa","wic","wsc"]

total_default = 0
total_bert = 0
total_gpt = 0
total = 0
# Iterate through tasks and models to accumulate true predictions and total counts
for task in tasks:
    true_default = 0
    true_bert = 0
    true_gpt = 0
    total_task = 0
    df_task = df[df["task"] == task]
    for model_name in model_names:
        for i, row in tqdm(df_task[df_task["model_name"] == model_name].iterrows(), desc="Running Evaluation", leave=False):
            task = row["task"]
            example = json.loads(row["example"])
            label_default = evaluate_prediction_with_conversion(task, example, row["generate_response_default"])
            label_bert = evaluate_prediction_with_conversion(task, example, row["generate_response_bert"])
            label_gpt = evaluate_prediction_with_conversion(task, example, row["generate_response_gpt"])

            true_default += label_default
            total_default += label_default
            true_bert += label_bert
            total_bert += label_bert
            true_gpt += label_gpt
            total_gpt += label_gpt  
            total_task += 1
            total += 1
            row["label_default"] = label_default
            row["label_bert"] = label_bert
            row["label_gpt"] = label_gpt
            # Store results for each row
            results.append(row)

    true_ratio_default = true_default / total_task
    true_ratio_bert = true_bert / total_task
    true_ratio_gpt = true_gpt / total_task

    print(f"Proportion of True predictions for default model on task {task}: {true_ratio_default:.4f}")
    print(f"Proportion of True predictions for BERT model on task {task}: {true_ratio_bert:.4f}")
    print(f"Proportion of True predictions for GPT model on task {task}: {true_ratio_gpt:.4f}")


true_ratio_default = total_default / total
true_ratio_bert = total_bert / total
true_ratio_gpt = total_gpt / total
print(f"Proportion of True predictions for default model: {true_ratio_default:.4f}")
print(f"Proportion of True predictions for BERT model : {true_ratio_bert:.4f}")
print(f"Proportion of True predictions for GPT model : {true_ratio_gpt:.4f}")

# Convert results into a DataFrame
results_df = pd.DataFrame(results)

# Optionally save to a CSV file
results_df.to_csv("evaluation_results.csv", index=False)

                                               

Proportion of True predictions for default model on task boolq: 0.7573
Proportion of True predictions for BERT model on task boolq: 0.7560
Proportion of True predictions for GPT model on task boolq: 0.7567


                                              

Proportion of True predictions for default model on task cb: 0.7798
Proportion of True predictions for BERT model on task cb: 0.7202
Proportion of True predictions for GPT model on task cb: 0.7202


                                              

Proportion of True predictions for default model on task copa: 0.7600
Proportion of True predictions for BERT model on task copa: 0.7867
Proportion of True predictions for GPT model on task copa: 0.7867


                                               

Proportion of True predictions for default model on task multirc: 0.8300
Proportion of True predictions for BERT model on task multirc: 0.8283
Proportion of True predictions for GPT model on task multirc: 0.8283


                                              

Proportion of True predictions for default model on task record: 0.7727
Proportion of True predictions for BERT model on task record: 0.7710
Proportion of True predictions for GPT model on task record: 0.7710


                                              

Proportion of True predictions for default model on task rte: 0.7401
Proportion of True predictions for BERT model on task rte: 0.7569
Proportion of True predictions for GPT model on task rte: 0.7569


                                              

Proportion of True predictions for default model on task wic: 0.6160
Proportion of True predictions for BERT model on task wic: 0.6102
Proportion of True predictions for GPT model on task wic: 0.6102


                                             

Proportion of True predictions for default model on task wsc: 0.2917
Proportion of True predictions for BERT model on task wsc: 0.2981
Proportion of True predictions for GPT model on task wsc: 0.2981
Proportion of True predictions for default model: 0.7444
Proportion of True predictions for BERT model : 0.7436
Proportion of True predictions for GPT model : 0.7437


In [8]:
results_df

70

In [9]:
true_bert

70

In [10]:
true_gpt

69