# Evaluation Testing for Trained Models

This paper needs to support two models: NLLB and LLAMA3

In [None]:
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    pipeline,
)
import pandas as pd
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
)
import torch

import pandas as pd
from tqdm import tqdm
from datasets import Dataset
from datasets import load_from_disk
from datasets import load_dataset
from datetime import datetime
import json
############################################################################################################
from tqdm import tqdm
import pandas as pd
import json


In [None]:

def evaluate_prediction_with_conversion(task, example, prediction):
    """Score one free-text model prediction against a task example.

    The prediction is normalized (stripped, lower-cased), converted to the
    integer label space of the task and compared with ``example["label"]``.
    For ``record`` the prediction is instead checked for containing any of
    the strings in ``example["answers"]``.

    Args:
        task: Task name. Handled values are ``boolq``, ``cb``, ``copa``,
            ``multirc``, ``record``, ``rte``, ``wic`` and ``wsc``.
        example: Mapping holding ``"label"`` (all tasks except ``record``)
            or ``"answers"`` (``record``).
        prediction: Raw text produced by the model.

    Returns:
        ``True`` when the prediction is judged correct, ``False`` otherwise.
        Unknown tasks and unparseable/ambiguous predictions yield ``False``.
    """
    # Imported locally: the notebook's import cell does not provide ``re``.
    import re

    def contains_substring(keyword, text):
        return keyword in text

    def contains_word(keyword, text):
        # Whole-word match so that "no" is not detected inside words such as
        # "not", "know" or "cannot".
        return re.search(rf"\b{re.escape(keyword)}\b", text) is not None

    def unique_match(value, options, found):
        """Return the label of the only keyword found in ``value``.

        ``None`` is returned when no keyword or more than one keyword is
        present, i.e. when the answer is missing or ambiguous.
        """
        text = value.strip().lower()
        hits = [label for keyword, label in options.items() if found(keyword, text)]
        return hits[0] if len(hits) == 1 else None

    def bool_to_binary(value):
        # "true" -> 1, "false" -> 0, ambiguous/missing -> None.
        return unique_match(value, {"true": 1, "false": 0}, contains_substring)

    def yes_no_to_binary(value):
        # "yes" -> 1, "no" -> 0, ambiguous/missing -> None.
        return unique_match(value, {"yes": 1, "no": 0}, contains_word)

    def entailment_to_label(value):
        # Returns None (not -1) for invalid input so that an unparseable
        # prediction can never compare equal to a -1 "unlabeled" label.
        return unique_match(
            value,
            {"entailment": 0, "contradiction": 1, "neutral": 2},
            contains_substring,
        )

    def choice_to_binary(value):
        text = value.strip().lower()
        # NOTE(review): when both choices are mentioned this resolves to
        # choice 2, unlike the other converters which treat that as ambiguous.
        if "choice 1" in text and "choice 2" not in text:
            return 0
        if "choice 2" in text:
            return 1
        return None

    if task == "record":
        # Correct when ANY of the reference answers occurs in the prediction.
        normalized_prediction = prediction.strip().lower()
        return any(
            answer.strip().lower() in normalized_prediction
            for answer in example["answers"]
            if answer.strip()
        )

    if task in ("boolq", "multirc"):
        # True/False answers; label is 0/1.
        predicted = bool_to_binary(prediction)
        expected = int(example["label"])
    elif task == "cb":
        # entailment/contradiction/neutral answers; label is 0/1/2.
        predicted = entailment_to_label(prediction)
        expected = int(example["label"])
    elif task == "copa":
        # "choice 1"/"choice 2" answers; label is 0/1.
        predicted = choice_to_binary(prediction)
        expected = int(example["label"])
    elif task == "rte":
        # Yes/No answers; the label is inverted ("yes" corresponds to 0).
        predicted = yes_no_to_binary(prediction)
        expected = 1 - int(example["label"])
    elif task in ("wic", "wsc"):
        # Yes/No answers; label is 0/1.
        predicted = yes_no_to_binary(prediction)
        expected = int(example["label"])
    else:
        # Default case: unknown task.
        return False

    # ``predicted`` is None for ambiguous/unparseable output, which never
    # equals an integer label.
    return predicted == expected

In [None]:
# Run configuration for the evaluation cell.
# MAX_LEN is passed to ``model.generate`` as ``max_new_tokens``.
MAX_LEN = 256
model_path = "/home/snt/llm_models/gemma-2-2b-it"
# Last path component; used in the results filename and to pick the chat format.
model_name = model_path.split("/")[-1]
val_dataset_path = "/home/snt/projects_lujun/mt_luxembourgish/data/super_glue_data.jsonl"
# NOTE(review): not referenced in the visible notebook cells.
flore_dataset_path = "data/fake_targets/flores_devtest_arrow"
# Per-example results are appended to this JSONL file.
output_path = f"data/results_test_{model_name}.jsonl"

current_time = datetime.now()
formatted_time = current_time.strftime('%m_%d_%H_%M')
# NOTE(review): the suffix hard-codes "Llama3-3B" although model_path points
# at a Gemma checkpoint, and this name is not used in the visible cells.
eval_output_path = val_dataset_path.split("/")[-1].replace(".jsonl", f"_{formatted_time}_eval_from_Llama3-3B.jsonl")
# NOTE(review): not referenced in the visible notebook cells.
sample_num = None  # Number of samples to evaluate otherwise set to None
device="cuda:0"

tokenizer = AutoTokenizer.from_pretrained(model_path)
# Use the end-of-sequence token for padding (overrides any pad token the
# tokenizer already defines).
tokenizer.pad_token = tokenizer.eos_token

# Load the causal LM onto ``device``. No dtype or quantization arguments are
# passed and no LoRA adapter is merged here.

model = AutoModelForCausalLM.from_pretrained(
    model_path,
    device_map=device,
)

# Function to generate from the model
def generate_response(prompt, model):
    """Sample a completion for ``prompt`` and return only the new text.

    Args:
        prompt: Text fed to the model (typically the output of
            ``tokenizer.apply_chat_template(..., tokenize=False)``).
        model: Causal LM exposing ``generate``.

    Returns:
        The decoded continuation, without the prompt and without special
        tokens.

    Uses the module-level ``tokenizer``, ``device`` and ``MAX_LEN``. Sampling
    is enabled, so repeated calls may return different text.
    """
    # A chat-templated prompt may already start with the BOS token; adding
    # special tokens again would feed the model a duplicated BOS.
    bos_token = tokenizer.bos_token
    prompt_has_bos = bool(bos_token) and prompt.startswith(bos_token)
    encoded_input = tokenizer(
        prompt,
        return_tensors="pt",
        add_special_tokens=not prompt_has_bos,
    )
    model_inputs = encoded_input.to(device)
    prompt_length = model_inputs["input_ids"].shape[1]

    generated_ids = model.generate(
        **model_inputs,
        max_new_tokens=MAX_LEN,
        do_sample=True,
        pad_token_id=tokenizer.pad_token_id,
        eos_token_id=tokenizer.eos_token_id,
        temperature=1.0,
    )

    # The returned sequence starts with the prompt tokens. Slice them off by
    # length instead of string-replacing the prompt: decoding is not
    # guaranteed to reproduce the prompt text exactly, which would leave
    # prompt content inside the prediction.
    completion_ids = generated_ids[0, prompt_length:]
    return tokenizer.decode(completion_ids, skip_special_tokens=True)

# Load the prompts and keep only the validation split.
df = pd.read_json(val_dataset_path, lines=True)
val_df = df[df["dataset_label"]=="validation"]
# Uncomment to evaluate a small fixed sample (5 rows per task) instead:
# val_df = val_df.groupby("task").apply(lambda x: x.sample(n=5, random_state=42)).reset_index(drop=True)

# Loop invariants: the system prompt and whether a system turn is sent at all
# (no system message is used when the model name contains "gemma").
system_message = "You are a helpful AI assistant."
use_system_role = "gemma" not in model_name

result_columns = ["task", "example", "prompt", "prediction", "label"]
# Rows produced by THIS run. Accuracy is computed from these rather than from
# the output file, which is opened in append mode and may therefore still
# contain rows written by earlier runs.
run_records = []

for _, row in tqdm(val_df.iterrows(), total=len(val_df), desc="Processing", ncols=80):
    prompt = row["prompt"]

    messages = [{"role": "user", "content": prompt}]
    if use_system_role:
        messages.insert(0, {"role": "system", "content": system_message})

    full_prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    prediction = generate_response(
        full_prompt,
        model,
    )

    label = evaluate_prediction_with_conversion(row["task"], row["example"], prediction)
    record = {
        "task": row["task"],
        "example": json.dumps(row["example"]),
        "prompt": full_prompt,
        "prediction": prediction,
        "label": label,
    }
    run_records.append(record)

    # Append each row immediately so partial results survive an interruption.
    pd.DataFrame(record, index=[0]).to_json(
        output_path,
        orient="records",
        lines=True,
        mode="a",
    )


# Per-task accuracy (in percent) over the rows evaluated in this run.
result_df = pd.DataFrame(run_records, columns=result_columns)
task_accuracy = result_df.groupby("task")["label"].mean() * 100
for task, acc in task_accuracy.items():
    print(f"Task: {task}, Accuracy: {acc:.2f}%")

Loading checkpoint shards: 100%|██████████| 2/2 [00:05<00:00,  2.51s/it]
  val_df = val_df.groupby("task").apply(lambda x: x.sample(n=5, random_state=42)).reset_index(drop=True)


In [1]:
import pandas as pd


# Load the full JSON-lines dataset (every split and task) for inspection.
test = pd.read_json("/home/snt/projects_lujun/mt_luxembourgish/data/super_glue_data.jsonl", lines=True)

In [5]:
# Show only the rows whose task is "record".
test[test["task"]=="record"]

Unnamed: 0,dataset_label,task,prompt,example,label
37320,train,record,"Based on the provided passage and entities, an...",{'passage': 'The harrowing stories of women an...,
37321,train,record,"Based on the provided passage and entities, an...","{'passage': 'Caracas, Venezuela (CNN) -- It's ...",
37322,train,record,"Based on the provided passage and entities, an...","{'passage': 'Caracas, Venezuela (CNN) -- It's ...",
37323,train,record,"Based on the provided passage and entities, an...","{'passage': 'Caracas, Venezuela (CNN) -- It's ...",
37324,train,record,"Based on the provided passage and entities, an...","{'passage': 'Caracas, Venezuela (CNN) -- It's ...",
...,...,...,...,...,...
189498,test,record,"Based on the provided passage and entities, an...",{'passage': 'Internet activists are trying to ...,
189499,test,record,"Based on the provided passage and entities, an...",{'passage': 'Alex Oxlade-Chamberlain is trying...,
189500,test,record,"Based on the provided passage and entities, an...",{'passage': '(CNN) -- Ferrari president Luca d...,
189501,test,record,"Based on the provided passage and entities, an...",{'passage': 'Two British men have been fined $...,


In [2]:
# Display the whole loaded DataFrame.
test

Unnamed: 0,dataset_label,task,prompt,example,label
0,train,boolq,Answer the following question based on the pas...,{'question': 'do iran and afghanistan speak th...,1.0
1,train,boolq,Answer the following question based on the pas...,{'question': 'do good samaritan laws protect t...,1.0
2,train,boolq,Answer the following question based on the pas...,{'question': 'is windows movie maker part of w...,1.0
3,train,boolq,Answer the following question based on the pas...,{'question': 'is confectionary sugar the same ...,1.0
4,train,boolq,Answer the following question based on the pas...,{'question': 'is elder scrolls online the same...,0.0
...,...,...,...,...,...
194044,test,wsc,"\n Based on the provided text and pronoun, ...",{'text': 'Muriel has in her living room a famo...,-1.0
194045,test,wsc,"\n Based on the provided text and pronoun, ...",{'text': 'Muriel has in her living room a famo...,-1.0
194046,test,wsc,"\n Based on the provided text and pronoun, ...",{'text': 'Muriel has in her living room a famo...,-1.0
194047,test,wsc,"\n Based on the provided text and pronoun, ...",{'text': 'Edward dropped adhesive tape onto hi...,-1.0
