# Evaluation of the Trained Translation Models

The evaluation for this paper must support two models: NLLB and Llama 3.

NLLB

In [1]:
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    pipeline,
    AutoModelForSeq2SeqLM
)
import pandas as pd

import pandas as pds
from tqdm import tqdm
import sacrebleu
from datasets import Dataset
from datasets import load_from_disk
from sacrebleu.metrics import CHRF
from datasets import load_dataset
from datetime import datetime

############################################################################################################
# --- Run configuration -------------------------------------------------------
# Base length budget; the translation pipeline is created with MAX_LEN * 2.
MAX_LEN = 512
model_path = "/home/snt/llm_models/nllb-200-3.3B"
val_dataset_path = "data/training_dataset/dataset_val_300.jsonl"
flore_dataset_path = "data/fake_targets/flores_devtest_arrow"

# Timestamp (month_day_hour_minute) that keeps each run's output file distinct.
current_time = datetime.now()
formatted_time = current_time.strftime('%m_%d_%H_%M')

# Build the output file name from the dataset's base name. The ".jsonl" suffix
# is stripped only when present, so a dataset directory path (loaded with
# load_from_disk) still yields a timestamped ".jsonl" output name instead of
# the bare directory name.
_val_dataset_name = val_dataset_path.rstrip("/").split("/")[-1]
if _val_dataset_name.endswith(".jsonl"):
    _val_dataset_name = _val_dataset_name[: -len(".jsonl")]
eval_output_path = f"{_val_dataset_name}_{formatted_time}_eval_from_nllb.jsonl"

# Number of samples to evaluate; set to None to evaluate the whole dataset.
sample_num = None

# Single source of truth for the target device; both names are kept because
# both were defined by the original configuration.
device = "cuda:0"
device_map = device

# Translation direction. Each *_abr value is the FLORES column name for that
# language ("sentence_" + language code).
src_lng = "English"
src_lng_abr = "sentence_eng_Latn"

# src_lng = "Luxembourgish"
# src_lng_abr = "sentence_ltz_Latn"

tgt_lng = "Luxembourgish"
tgt_lng_abr = "sentence_ltz_Latn"

# tgt_lng = "English"
# tgt_lng_abr = "sentence_eng_Latn"


############################################################################################################

# Load the validation dataset: either a JSON Lines file or a dataset directory
# readable by load_from_disk.
if val_dataset_path.endswith(".jsonl"):
    dataset = Dataset.from_json(val_dataset_path)  # Ensure correct format
else:
    dataset = load_from_disk(val_dataset_path)

# Keep only the rows of the validation split.
val_dataset = dataset.filter(lambda row: row["split"] == "val")
if sample_num:
    # Cap at the number of available rows so an oversized sample_num does not
    # make select() fail on out-of-range indices.
    val_dataset = val_dataset.select(range(min(sample_num, len(val_dataset))))

val_dataset = val_dataset.rename_columns({
    "input": "Luxembourgish",
    "translated_text": "English",
})  # This pair cannot be changed

# FLORES: rename the per-language sentence columns to the plain language names
# so both datasets can be read through the same column names.
val_flores_dataset = load_from_disk(flore_dataset_path).rename_columns(
    {
        tgt_lng_abr: tgt_lng,
        src_lng_abr: src_lng,
    }
)
if sample_num:
    # Honour sample_num here as well (previously a fixed 10 rows were taken
    # whenever sample_num was set), again capped at the dataset size.
    val_flores_dataset = val_flores_dataset.select(
        range(min(sample_num, len(val_flores_dataset)))
    )

# The language codes passed to the pipeline are the FLORES column names without
# their "sentence_" prefix (e.g. "sentence_eng_Latn" -> "eng_Latn").
_FLORES_COLUMN_PREFIX = "sentence_"
_src_lang_code = src_lng_abr[len(_FLORES_COLUMN_PREFIX):]
_tgt_lang_code = tgt_lng_abr[len(_FLORES_COLUMN_PREFIX):]

tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSeq2SeqLM.from_pretrained(model_path).to(device)
translator = pipeline(
    "translation",
    model=model,
    tokenizer=tokenizer,
    src_lang=_src_lang_code,
    tgt_lang=_tgt_lang_code,
    max_length=MAX_LEN * 2,
    device=device,
)


def compute_jaccard(prediction, reference):
    """Return the Jaccard similarity between the token sets of two strings.

    Both strings are split on whitespace and compared as sets of unique
    tokens. When neither string contains a token the score is 1.0.
    """
    predicted_tokens = set(prediction.split())
    reference_tokens = set(reference.split())
    all_tokens = predicted_tokens | reference_tokens
    if not all_tokens:
        # Both inputs are token-less: treat them as identical.
        return 1.0
    shared_tokens = predicted_tokens & reference_tokens
    return len(shared_tokens) / len(all_tokens)

def generate_dataset_responses(dataset):
    """Translate every sample of ``dataset`` and score it against its reference.

    For each sample the source text (column ``src_lng.capitalize()``) is
    translated with the module-level ``translator``; the result is compared
    with the reference text (column ``tgt_lng.capitalize()``, or an empty
    string when that column is missing or holds ``None``) using sentence-level
    spBLEU, chrF and Jaccard scores. Each row is appended to
    ``eval_output_path`` as one JSON line as soon as it is computed, and the
    average of each score is printed at the end.

    Args:
        dataset: Iterable of dict-like samples containing at least the source
            language column.

    Returns:
        A DataFrame with one row per sample and the columns ``LLM_Input``,
        ``LLM_Output``, ``Ground_Truth``, ``index_unique``, ``SPBLEU_Score``,
        ``CharF++_Score`` and ``Jaccard_Score``. It is empty (with those
        columns) when ``dataset`` yields no samples.
    """
    result_columns = [
        "LLM_Input",
        "LLM_Output",
        "Ground_Truth",
        "index_unique",
        "SPBLEU_Score",
        "CharF++_Score",
        "Jaccard_Score",
    ]
    source_column = src_lng.capitalize()
    target_column = tgt_lng.capitalize()

    # Build the metric once; it does not depend on the sample.
    # NOTE(review): sacrebleu refers to word_order=2 as chrF++; word_order=3 is
    # kept so reported numbers stay comparable - confirm which was intended.
    chrf_metric = CHRF(word_order=3)

    # Collect plain dicts and build the DataFrame once at the end; growing a
    # DataFrame with pd.concat on every iteration copies all previous rows.
    records = []
    for sample in tqdm(dataset, desc="Translating..."):
        source_text = sample[source_column].strip()
        # A missing or None reference is scored against the empty string.
        target_text = (sample.get(target_column) or "").strip()
        prediction = translator(source_text)[0]["translation_text"]
        index_unique = sample.get("index_unique", "")

        ## Compute Scores
        spbleu_score = sacrebleu.corpus_bleu(
            [prediction], [[target_text]], tokenize="flores200"
        ).score
        charf_score = chrf_metric.sentence_score(prediction, [target_text]).score
        jaccard_score = compute_jaccard(prediction, target_text)

        ## Create JSON entry
        result = {
            "LLM_Input": source_text,
            "LLM_Output": prediction,
            "Ground_Truth": target_text,
            "index_unique": index_unique,
            "SPBLEU_Score": spbleu_score,
            "CharF++_Score": charf_score,
            "Jaccard_Score": jaccard_score,
        }
        # Append the row immediately so partial results are on disk if the run
        # is interrupted.
        pd.DataFrame([result]).to_json(
            eval_output_path,
            orient="records",
            lines=True,
            mode="a",
        )
        records.append(result)

    df_results = pd.DataFrame(records, columns=result_columns)
    if df_results.empty:
        # Nothing was evaluated; averages would be undefined.
        print("No samples to evaluate; skipping average scores.")
        return df_results

    ## Average Scores
    average_charf = df_results["CharF++_Score"].mean()
    average_jaccard = df_results["Jaccard_Score"].mean()
    average_spbleu = df_results["SPBLEU_Score"].mean()

    print(f"Average SPBLEU Score: {average_spbleu:.2f}")
    print(f"Average CharF++ Score: {average_charf:.2f}")
    print(f"Average Jaccard Score: {average_jaccard:.2f}")
    return df_results

# Evaluate the RTL validation split and tag its rows with their origin.
print("Validation RTL Results")
print("----------------------")
df_RTL_results = generate_dataset_responses(val_dataset).assign(Dataset="RTL")

# Evaluate the FLORES 200 set the same way.
print("FLORES 200 Results")
print("----------------------")
df_flores_results = generate_dataset_responses(val_flores_dataset).assign(Dataset="FLORES")

# Write the combined, labelled results as JSON Lines; this replaces the file
# that was filled row by row during the two runs above.
df_results = pd.concat([df_RTL_results, df_flores_results])
df_results.to_json(eval_output_path, lines=True, orient="records")
print(f"Results saved to {eval_output_path}")


  from .autonotebook import tqdm as notebook_tqdm
Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00,  5.01it/s]


Validation RTL Results
----------------------


Translating...:   3%|▎         | 10/300 [00:20<08:48,  1.82s/it]You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
Translating...: 100%|██████████| 300/300 [16:31<00:00,  3.31s/it]


Average SPBLEU Score: 19.97
Average CharF++ Score: 37.06
Average Jaccard Score: 0.27
FLORES 200 Results
----------------------


Translating...:  54%|█████▍    | 550/1012 [17:09<7:07:01, 55.46s/it]

: 