## Load BERT Model

In [1]:
# BERT Selector

import numpy as np
import torch
from transformers import BertTokenizer
import openai


# Specify your cuda device
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")


def format_prompt_gpt(input):
    """
    Build the ability-classification prompt for the GPT-based selector.

    Args:
    - input: Text to classify; it is inserted after the "Text Input:" marker.

    Returns:
    - str: The instruction text listing the six ability categories, followed
      by the supplied text.
    """
    # The wording and whitespace of this literal are part of the prompt and
    # are reproduced exactly; only the placeholder mechanism differs.
    template = """ You will see some text. Based on the content, determine which of the following abilities the text is most aligned with: 

    Causal Reasoning (CR): Judging and reasoning the causal relationships between events, understanding cause-and-effect chains.
    Creativity (CT): Creative thinking, coming up with new and unique solutions or ideas.
    In-context Learning (ICL): Learning and understanding based on contextual information, quickly acquiring and applying knowledge from new environments.
    Instruction Following (IF): Performing tasks according to given instructions, understanding and executing clear steps or rules.
    Machine Translation (MT): Accurately translating between different languages while maintaining grammatical and semantic consistency.
    Summarization (SUMM): Summarizing and condensing long texts to extract key information.

    Based on the following text, determine which ability it most aligns with. Please indicate the corresponding ability category number in your response. [Only one of the abbreviations: CR, CT, ICL, IF, MT, or SUMM]

    Text Input: {text}

"""
    return template.format(text=input)


def call_openai_api(prompt):
    """
    Send `prompt` to the OpenAI chat completions endpoint and return the reply.

    Args:
    - prompt (str): Content of the user message.

    Returns:
    - str: Content of the first choice with surrounding whitespace removed,
      or an empty string when the reply carries no text content.
    """
    response = openai.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": prompt},
        ],
        temperature=0.7,
    )
    # `message.content` is optional in the API schema; calling `.strip()` on
    # None would raise AttributeError, so an absent reply becomes "".
    content = response.choices[0].message.content
    return content.strip() if content else ""


def get_category_from_gpt(prompt):
    """
    Ask the GPT selector which ability category `prompt` belongs to.

    The reply is scanned for the category abbreviations as whole,
    case-sensitive words, and the abbreviation mentioned first in the reply
    wins. Plain substring matching would also fire inside unrelated
    upper-case words (e.g. "CT" in "FACTS") and would prefer whichever
    category comes first in the list rather than in the reply.

    Args:
    - prompt (str): Text to classify.

    Returns:
    - str or None: One of "CR", "CT", "ICL", "IF", "MT", "SUMM", or None when
      the reply names none of them.
    """
    import re  # function-scope import keeps this block self-contained

    category_list = ["CR", "CT", "ICL", "IF", "MT", "SUMM"]

    response = call_openai_api(format_prompt_gpt(prompt))
    if not response:
        return None

    # \b anchors restrict matches to stand-alone abbreviations.
    pattern = r"\b(?:" + "|".join(category_list) + r")\b"
    match = re.search(pattern, response)
    return match.group(0) if match else None


def find_optimal_temperature_gpt(
    prompt, model_name, grouped_avg_accuracy, default_temperature=1.0
):
    """
    Pick the sampling temperature with the best recorded accuracy for the
    ability category that the GPT selector assigns to `prompt`.

    Args:
    - prompt (str): Text to classify.
    - model_name (str): Value matched against the "model_name" column.
    - grouped_avg_accuracy (pd.DataFrame): Table with the columns
      "model_name", "temperature", "category" and "accuracy".
    - default_temperature (float): Returned when no category could be
      predicted or when the table has no entry for this model/category on
      the temperature grid.

    Returns:
    - (temperature, category): the selected temperature and the predicted
      category; the category is the string "None" when GPT named none.
    """
    predicted_category = get_category_from_gpt(prompt)
    if predicted_category is None:
        return default_temperature, "None"

    # Model and category do not change across the grid, so filter once.
    candidates = grouped_avg_accuracy[
        (grouped_avg_accuracy["model_name"] == model_name)
        & (grouped_avg_accuracy["category"] == predicted_category)
    ]

    best_performance_prediction = -float("inf")
    best_temperature = None

    for temperature in np.arange(0.1, 2.0, 0.3):
        # Rounding removes float drift from arange (0.7000000000000001 -> 0.7)
        # so the equality test below can hit the 2-decimal table values.
        temperature = round(temperature, 2)
        matching_rows = candidates[candidates["temperature"] == temperature]
        if matching_rows.empty:
            continue

        performance_prediction = matching_rows["accuracy"].values[0]
        if performance_prediction > best_performance_prediction:
            best_performance_prediction = performance_prediction
            best_temperature = temperature

    # Without this fallback the function returned None, which breaks callers
    # that round() the result.
    if best_temperature is None:
        best_temperature = default_temperature

    return best_temperature, predicted_category


def load_model(model_path, tokenizer_path):
    """
    Load the pre-trained model and tokenizer from the specified paths.

    Args:
    - model_path (str): Path to the saved model.
    - tokenizer_path (str): Path to the tokenizer.

    Returns:
    - model: The pre-trained model, moved to `device` and in evaluation mode.
    - tokenizer: The pre-trained tokenizer.
    """
    tokenizer = BertTokenizer.from_pretrained(tokenizer_path, do_lower_case=True)
    # `map_location` lets a checkpoint saved on a GPU be opened on a CPU-only
    # host. `weights_only=False` is stated explicitly because the file holds
    # a whole pickled model object (it is used directly as a module below),
    # which the weights-only loader rejects.
    # SECURITY: unpickling executes arbitrary code - only load trusted files.
    model = torch.load(model_path, map_location=device, weights_only=False)
    model.to(device)
    model.eval()
    print("Model loaded and set to evaluation mode.")
    return model, tokenizer


def evaluate_email(input_text, model, tokenizer, max_padding=512):
    """
    Classify `input_text` into the six ability categories with the BERT model.

    Args:
    - input_text (str): Text to classify.
    - model: Sequence-classification model whose output exposes `.logits`
      with six entries per input, in the order CR, CT, ICL, IF, MT, SUMM.
    - tokenizer: Tokenizer matching the model.
    - max_padding (int): Length the input is padded or truncated to.

    Returns:
    - output (str): Space-separated "CATEGORY: probability" pairs, each
      probability formatted with two decimals.
    - prob_dict (dict): Category abbreviation -> softmax probability.
    """
    # Calling the tokenizer directly will:
    #   (1) Tokenize the sentence and add `[CLS]` / `[SEP]`.
    #   (2) Map tokens to their IDs.
    #   (3) Pad or truncate to `max_length`.
    #   (4) Create the attention mask for [PAD] tokens.
    # `padding="max_length"` replaces the deprecated `pad_to_max_length=True`;
    # `truncation=True` makes explicit what the library otherwise enabled
    # implicitly with a warning.
    encoded_dict = tokenizer(
        input_text,
        add_special_tokens=True,
        max_length=max_padding,
        padding="max_length",
        truncation=True,
        return_attention_mask=True,
        return_tensors="pt",
    )

    # Send the inputs to the device the model lives on, so a model that was
    # never moved to the global `device` does not fail on a device mismatch.
    try:
        target_device = next(model.parameters()).device
    except (StopIteration, AttributeError):
        target_device = device

    b_input_ids = encoded_dict["input_ids"].to(target_device)
    b_input_mask = encoded_dict["attention_mask"].to(target_device)

    # Tell pytorch not to bother with constructing the compute graph during
    # the forward pass, since this is only needed for backprop (training).
    with torch.no_grad():
        model_output = model(
            b_input_ids, token_type_ids=None, attention_mask=b_input_mask
        )

    logits = model_output.logits.detach().cpu().numpy()

    # Stabilized softmax: subtracting the row maximum avoids overflow in exp.
    probabilities = np.exp(logits - np.max(logits, axis=1, keepdims=True))
    probabilities = probabilities / np.sum(probabilities, axis=1, keepdims=True)

    # Class index -> ability abbreviation.
    ability_mapping = {0: "CR", 1: "CT", 2: "ICL", 3: "IF", 4: "MT", 5: "SUMM"}

    output = ""
    prob_dict = {}
    for prob in probabilities:
        for code, category in ability_mapping.items():
            output += f"{category}: {prob[code]:.2f} "
            prob_dict[category] = prob[code]

    return output, prob_dict


def find_optimal_temperature(
    prompt, model_name, grouped_avg_accuracy, model, tokenizer, default_temperature=1.0
):
    """
    Pick the sampling temperature with the best recorded accuracy for the
    ability category that the BERT classifier assigns to `prompt`.

    Args:
    - prompt (str): Text to classify.
    - model_name (str): Value matched against the "model_name" column.
    - grouped_avg_accuracy (pd.DataFrame): Table with the columns
      "model_name", "temperature", "category" and "accuracy".
    - model: Classifier passed through to `evaluate_email`.
    - tokenizer: Tokenizer passed through to `evaluate_email`.
    - default_temperature (float): Returned when the table has no entry for
      this model/category on the temperature grid; mirrors the fallback
      value used by `find_optimal_temperature_gpt`.

    Returns:
    - (temperature, category, prediction_dict): the selected temperature,
      the most probable category, and the per-category probabilities.
    """
    _, prediction_dict = evaluate_email(prompt, model, tokenizer, max_padding=512)
    predicted_category = max(prediction_dict, key=prediction_dict.get)

    # Model and category do not change across the grid, so filter once.
    candidates = grouped_avg_accuracy[
        (grouped_avg_accuracy["model_name"] == model_name)
        & (grouped_avg_accuracy["category"] == predicted_category)
    ]

    best_performance_prediction = -float("inf")
    best_temperature = None

    for temperature in np.arange(0.1, 2.0, 0.3):
        # Rounding removes float drift from arange so the equality test can
        # hit the 2-decimal temperatures stored in the table.
        temperature = round(temperature, 2)
        matching_rows = candidates[candidates["temperature"] == temperature]
        if matching_rows.empty:
            continue

        performance_prediction = matching_rows["accuracy"].values[0]
        if performance_prediction > best_performance_prediction:
            best_performance_prediction = performance_prediction
            best_temperature = temperature

    # Previously None was returned here, which breaks callers that round()
    # the temperature.
    if best_temperature is None:
        best_temperature = default_temperature

    return best_temperature, predicted_category, prediction_dict


# email = """Prompt:

# # Please summarize the following content, ensuring that key points are covered and expressed concisely:

# # Given Content:

# # In recent years, the impacts of global climate change have become increasingly evident, especially in the form of frequent extreme weather events. Scientists have found that the rise in greenhouse gas emissions, particularly carbon dioxide and methane, is accelerating global temperature increases. This warming trend has led to the intensification of extreme weather phenomena, including heatwaves, floods, and powerful hurricanes. In response to this challenge, governments and environmental organizations worldwide are actively promoting emission reduction policies and the development of green energy. However, despite global efforts, the rate of increase in greenhouse gas emissions has not significantly slowed, and the pace of global warming has not been effectively controlled. As a result, scientists are calling on countries to intensify their actions and implement stricter climate policies to mitigate climate change and avoid potentially catastrophic consequences in the future.
# # """

# output, prob_dict = evaluate_email(email, model, tokenizer, max_padding=512)

  from .autonotebook import tqdm as notebook_tqdm


# Load Datasets

In [2]:
import os
import re
import pandas as pd
import json
import numpy as np
import requests

# Directory with one evaluation ``.jsonl`` file per run.
folder_path = "/home/snt/projects_lujun/temperature_eval_github/temperature_eval/data/Results_Final/evaluation"

# Model identifiers searched for in each file name. The first identifier
# contained in the name wins, so the order below is significant.
KNOWN_MODEL_NAMES = (
    "Qwen2.5-1.5B-Instruct",
    "Phi-3.5-mini-instruct",
    "Llama-3.2-3B-Instruct",
    "Qwen2.5-3B-Instruct",
    "Llama-3.2-1B-Instruct",
    "Llama-2-7b-chat-hf",
    "Llama-2-13b-chat-hf",
    "Llama-2-70b-chat-hf",
    "Meta-Llama-3-8B-Instruct",
    "Meta-Llama-3-70B-Instruct",
    "Mistral-7B-Instruct-v0.2",
    "Mixtral-8x7B-Instruct-v0.1",
)

columns_to_keep = ["model_name", "temperature", "category", "accuracy"]

dfs = []

# sorted(): os.listdir returns entries in arbitrary order; a fixed order
# keeps the concatenated frame reproducible between runs.
for filename in sorted(os.listdir(folder_path)):
    if not filename.endswith(".jsonl"):
        continue
    file_path = os.path.join(folder_path, filename)

    with open(file_path, "r", encoding="utf-8") as file:
        records = [json.loads(line) for line in file if line.strip()]
    if not records:
        # An empty file yields a frame without columns; the column lookups
        # below would then fail with KeyError.
        continue

    df = pd.DataFrame(records)
    model_name = next(
        (known for known in KNOWN_MODEL_NAMES if known in filename), "Unknown"
    )
    df["model_name"] = model_name

    if "accuracy" in df.columns:
        df["accuracy"] = df["accuracy"].astype(float)
    else:
        # No generic column: take the value from "<category>_accuracy" for
        # each row, or None when that column does not exist.
        df["accuracy"] = df.apply(
            lambda row: (
                row[f"{row['category']}_accuracy"]
                if f"{row['category']}_accuracy" in df.columns
                else None
            ),
            axis=1,
        )
        # Keep the column numeric (None -> NaN) so the later mean() works.
        df["accuracy"] = pd.to_numeric(df["accuracy"], errors="coerce")

    # Two decimals so temperatures compare equal to the selector's grid.
    df["temperature"] = pd.to_numeric(df["temperature"], errors="coerce").round(2)
    df = df[columns_to_keep]
    dfs.append(df)

In [3]:
df = pd.concat(dfs, ignore_index=True)
df["temperature"] = df["temperature"].astype(float).round(2)

# Mean accuracy for every (model, temperature, category) combination; this is
# the lookup table used by the temperature selectors.
grouped_avg_accuracy = (
    df.groupby(["model_name", "temperature", "category"])["accuracy"]
    .mean()
    .reset_index()
)

grouped_avg_accuracy["temperature"] = grouped_avg_accuracy["temperature"].round(2)

tokenizer_path = "/home/snt/llm_models/bert-base-multilingual-uncased"
bert_checkpoint_path = "/home/snt/projects_lujun/temperature_eval_github/temperature_eval/data/Bert/training/bert_model_target_8"
# load_model() also moves the model to `device` and calls eval(). The bare
# torch.load() used here before did neither, so the model stayed in whatever
# mode and on whatever device it was saved with.
model, tokenizer = load_model(bert_checkpoint_path, tokenizer_path)

prompt = """
Prompt:

The year is 2075. Decades of unchecked climate change have transformed Earth into an unpredictable, chaotic environment. Cities lie submerged, deserts stretch endlessly, and hurricanes form with unnatural precision. Despite the warning signs, humanity's response came too late—or did it?

Write a short story from the perspective of a climate scientist who stumbles upon an abandoned research facility containing groundbreaking technology. This discovery could reverse the effects of climate change, but using it carries immense risks. As the protagonist grapples with this ethical dilemma, explore their internal conflict, the societal implications of their choice, and how the scars of a warming planet have shaped humanity's resilience.
"""


best_temperature, predicted_category, prediction_dict = find_optimal_temperature(
    prompt=prompt,
    model_name="Llama-2-7b-chat-hf",
    grouped_avg_accuracy=grouped_avg_accuracy,
    model=model,
    tokenizer=tokenizer,
)

print(best_temperature)

  model = torch.load(
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


1.3


## Test on SuperGLUE

In [4]:
def generate_text_with_vllm(
    prompt,
    model_name="/home/snt/llm_models/Llama-3.2-1B-Instruct",
    options=None,
    server_url="http://0.0.0.0:8000/v1/chat/completions",
    timeout=None,
):
    """
    Generate text by posting a chat-completion request to `server_url`.

    Args:
    - prompt (str): Content of the user message.
    - model_name (str): Value sent in the request's "model" field.
    - options (dict or None): Extra top-level payload fields (for example
      "temperature" or "max_tokens"); they are merged into the payload and
      override the fields built here.
    - server_url (str): URL of the chat completions endpoint.
    - timeout (float or None): Forwarded to `requests.post`; None waits
      without a limit, which was the previous behaviour.

    Returns:
    - str: Message content of the first returned choice.

    Raises:
    - RuntimeError: If the server answers with a status other than 200.
    """
    headers = {"Content-Type": "application/json"}
    payload = {
        "model": model_name,
        "messages": [
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": prompt},
        ],
    }
    # `options` defaults to None rather than a shared mutable `{}`.
    if options:
        payload.update(options)

    response = requests.post(
        server_url, headers=headers, json=payload, timeout=timeout
    )
    if response.status_code != 200:
        raise RuntimeError(f"Error: {response.status_code}, {response.text}")

    data = response.json()
    return data["choices"][0]["message"]["content"]


def evaluate_prediction_with_conversion(task, example, prediction):
    """
    Decide whether a free-text `prediction` matches the gold answer.

    The prediction is converted to the label encoding of `task` and compared
    with `example["label"]` (or with `example["answers"]` for "record").

    Args:
    - task (str): One of "boolq", "cb", "copa", "multirc", "record", "rte",
      "wic", "wsc". Any other value is scored as incorrect.
    - example (dict): Gold data; needs "label", or "answers" for "record".
    - prediction (str): Raw model output.

    Returns:
    - bool: True if the converted prediction equals the gold value. Ambiguous
      or unparseable predictions, non-string predictions and unknown tasks
      all give False.
    """
    import re  # function-scope import keeps this block self-contained

    def has_word(text, word):
        # Whole-word search: "no" must not be found inside "not" or "know",
        # and "true" must not be found inside "untrue".
        return re.search(rf"\b{re.escape(word)}\b", text) is not None

    def keyword_pair_to_binary(value, positive, negative):
        # 1 for `positive`, 0 for `negative`, None when both or neither occur.
        value = value.strip().lower()
        if value == positive:
            return 1
        if value == negative:
            return 0
        found_positive = has_word(value, positive)
        found_negative = has_word(value, negative)
        if found_positive == found_negative:
            return None
        return 1 if found_positive else 0

    def bool_to_binary(value):
        return keyword_pair_to_binary(value, "true", "false")

    def yes_no_to_binary(value):
        return keyword_pair_to_binary(value, "yes", "no")

    def entailment_to_label(value):
        # Exact label first; otherwise accept the answer only if exactly one
        # label name occurs in it. -1 marks ambiguous or invalid input.
        mapping = {"entailment": 0, "contradiction": 1, "neutral": 2}
        value = value.strip().lower()
        if value in mapping:
            return mapping[value]
        mentioned = [name for name in mapping if name in value]
        return mapping[mentioned[0]] if len(mentioned) == 1 else -1

    def choice_to_binary(value):
        # Accepts "choice 1" and "choice1". When both choices are mentioned
        # the answer is ambiguous and yields None, like the other converters
        # (previously this case was always scored as choice 2).
        value = value.strip().lower()
        picks_first = re.search(r"\bchoice\s*1\b", value) is not None
        picks_second = re.search(r"\bchoice\s*2\b", value) is not None
        if picks_first == picks_second:
            return None
        return 0 if picks_first else 1

    # A missing or non-text prediction cannot be correct.
    if not isinstance(prediction, str):
        return False

    if task in ("boolq", "multirc"):
        # True/False answer compared with label (1/0).
        return bool_to_binary(prediction) == int(example["label"])

    if task == "cb":
        # entailment/contradiction/neutral compared with label (0/1/2).
        return entailment_to_label(prediction) == int(example["label"])

    if task == "copa":
        # choice 1 / choice 2 compared with label (0/1).
        return choice_to_binary(prediction) == int(example["label"])

    if task == "record":
        # Case-insensitive exact match against any accepted answer.
        processed_answers = [answer.strip().lower() for answer in example["answers"]]
        return prediction.strip().lower() in processed_answers

    if task == "rte":
        # "yes" corresponds to label 0 here, hence the inversion.
        return yes_no_to_binary(prediction) == (1 - int(example["label"]))

    if task in ("wic", "wsc"):
        # Yes/No answer compared with label (1/0).
        return yes_no_to_binary(prediction) == int(example["label"])

    # Default case: unknown task
    return False

In [5]:
import pandas as pd
import warnings

# Silence all Python warnings from here on.
warnings.filterwarnings("ignore")

# Configuration options for model generation.
options = dict(
    temperature=1.0,
    max_tokens=1024,
    top_p=0.9,
    frequency_penalty=0.0,
    seed=47,  # Default
    n=1,
)

# One JSON record per line; later cells read the "dataset_label", "prompt",
# "task" and "example" fields of these records.
df = pd.read_json(
    path_or_buf="/home/snt/projects_lujun/temperature_eval_github/temperature_eval/data/Bert/super_glue_data.jsonl",
    lines=True,
)

In [6]:
dataset_label = "validation"
server_url = "http://0.0.0.0:8000/v1/chat/completions"

model_infernece_path = "/home/snt/llm_models/Llama-3.2-1B-Instruct"
model_name = "Llama-3.2-1B-Instruct"
output_name = f"{model_name}_{dataset_label}"
# Plain string: the path has no placeholders, so the f-prefix was dropped.
output_path = "/home/snt/projects_lujun/temperature_eval_github/temperature_eval/data/Bert/output/sp_bleu_validation_with_temperature_full.jsonl"

# Keep only the rows of the selected split.
df = df[df["dataset_label"] == dataset_label]
# df = df.groupby("task").head(1000)

# NOTE(review): `model_path` is NOT the checkpoint that gets loaded; the
# model comes from `bert_checkpoint_path`. The value is kept unchanged in
# case other cells read it - confirm and remove if unused.
model_path = "/home/snt/projects_lujun/temperature_eval/bert_model_target_2"
tokenizer_path = "/home/snt/llm_models/bert-base-multilingual-uncased"
bert_checkpoint_path = "/home/snt/projects_lujun/temperature_eval_github/temperature_eval/data/Bert/training/bert_model_target_8"

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# load_model() builds the tokenizer, loads the checkpoint, moves it to
# `device` and switches it to eval mode - the steps that were repeated here
# by hand.
model, tokenizer = load_model(bert_checkpoint_path, tokenizer_path)

# Generation models for which a temperature is selected.
models = [
    "Llama-3.2-1B-Instruct",
    "Meta-Llama-3-8B-Instruct",
    "Mixtral-8x7B-Instruct-v0.1",
]

with open(
    "/home/snt/projects_lujun/temperature_eval_github/temperature_eval/.vscode/api_key.txt",
    "r",
    encoding="utf-8",
) as file:
    api_key = file.read().strip()

# Fail here rather than with an authentication error deep inside the
# processing loop.
if not api_key:
    raise ValueError("The OpenAI API key file is empty.")

os.environ["OPENAI_API_KEY"] = api_key

In [7]:
from tqdm import tqdm  # Import tqdm for progress bar

# Counts the (prompt, model) pairs written so far.
start_idx = 0
for i, row in tqdm(df.iterrows(), total=len(df), desc="Processing Prompts"):
    # The prompt is the same for every model, so read it once per row.
    prompt = row["prompt"]
    for model_name in models:
        optimal_temperature_bert, predicted_category_bert, prediction_dict = (
            find_optimal_temperature(
                prompt=prompt,
                model_name=model_name,
                grouped_avg_accuracy=grouped_avg_accuracy,
                model=model,
                tokenizer=tokenizer,
            )
        )

        optimal_temperature_gpt, predicted_category_gpt = find_optimal_temperature_gpt(
            prompt=prompt,
            model_name=model_name,
            grouped_avg_accuracy=grouped_avg_accuracy,
        )

        # A selector can return None when the accuracy table has no entry
        # for this model/category; round(None, 2) would raise TypeError and
        # abort the whole run, so None is passed through unchanged.
        if optimal_temperature_gpt is not None:
            optimal_temperature_gpt = round(optimal_temperature_gpt, 2)
        if optimal_temperature_bert is not None:
            optimal_temperature_bert = round(optimal_temperature_bert, 2)

        result_df = pd.DataFrame(
            {
                "model_name": model_name,
                "temperature_bert": optimal_temperature_bert,
                "temperature_gpt": optimal_temperature_gpt,
                "prediction_dict": json.dumps(str(prediction_dict)),
                "predicted_category": str(
                    [predicted_category_bert, predicted_category_gpt]
                ),
                "task": row["task"],
                "example": json.dumps(row["example"]),
                "prompt": prompt,
                # "prediction": str(
                #     [prediction_bert_temperature, prediction_gpt_temperature]
                # ),
                # "label": str([label_bert, label_gpt]),
            },
            index=[0],
        )

        # Appended one record at a time so an interrupted run keeps the
        # results written so far. NOTE(review): re-running the cell appends
        # to the same file again - confirm whether duplicates are acceptable.
        result_df.to_json(
            output_path,
            orient="records",
            lines=True,
            mode="a",
        )
        start_idx += 1

Processing Prompts:   0%|          | 0/19293 [00:00<?, ?it/s]Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Processing Prompts:   0%|          | 3/19293 [00:07<13:01:56,  2.43s/it]


KeyboardInterrupt: 

In [27]:
import pandas as pd

# Load saved selector results (one JSON record per line) into `df`.
# NOTE(review): this reads "sp_bleu_validation_with_temperature.jsonl", not
# the "..._full.jsonl" file that `output_path` points to - confirm that this
# is the intended file.
df = pd.read_json(
    "/home/snt/projects_lujun/temperature_eval_github/temperature_eval/data/Bert/output/sp_bleu_validation_with_temperature.jsonl",
    lines=True,
)

In [33]:
# Number of result rows for each (model_name, task) pair.
df.groupby(["model_name", "task"]).size()

model_name                  task   
Llama-3.2-1B-Instruct       boolq      200
                            cb         207
                            copa       200
                            multirc    200
                            record     200
                            rte        200
                            wic        200
                            wsc        200
Meta-Llama-3-8B-Instruct    boolq      200
                            cb         208
                            copa       200
                            multirc    200
                            record     200
                            rte        200
                            wic        200
                            wsc        200
Mixtral-8x7B-Instruct-v0.1  boolq      200
                            cb         208
                            copa       200
                            multirc    200
                            record     200
                            rte        200
                  

In [32]:
# How often each value of the "temperature_gpt" column occurs in the results.
df.temperature_gpt.value_counts()

temperature_gpt
0.4    1866
1.0    1603
0.7     981
0.1     373
Name: count, dtype: int64