## Test the trained model

In [1]:
import os

import time
from torch.utils.tensorboard import SummaryWriter
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    DataCollatorWithPadding,
    TrainingArguments,
)
from datasets import DatasetDict, load_from_disk
from peft import LoraConfig, prepare_model_for_kbit_training
from accelerate import Accelerator
from trl import SFTConfig, SFTTrainer
from sacrebleu.metrics import BLEU

from datasets import load_dataset
import numpy as np
import pandas as pd
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    BitsAndBytesConfig,
    TrainingArguments,
)
from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model, PeftModel
from datasets import load_dataset
from trl import SFTTrainer
import torch

import pandas as pd
from tqdm import tqdm
import sacrebleu
from datasets import Dataset

# --- Evaluation configuration ------------------------------------------------
MAX_LEN = 512  # generation budget is derived from this (see generate_response)
train_ratio = 1.0  # fraction of the loaded dataset used to cap each split's size
model_name = "/home/snt/llm_models/Llama-3.2-1B-Instruct"
checkpoint = "/home/snt/projects_lujun/mt_luxembourgish/logs/fit_1734873129.3395984/checkpoint-2000"
val_dataset_path = "/home/snt/projects_lujun/mt_luxembourgish/data/fake_targets/flores_devtest_arrow"

# --- Dataset loading ---------------------------------------------------------
# JSONL files are parsed directly; anything else is treated as a dataset
# directory previously written with `save_to_disk`.
if val_dataset_path.endswith(".jsonl"):
    dataset = Dataset.from_json(val_dataset_path)
else:
    dataset = load_from_disk(val_dataset_path)

# Upper bound on the number of rows kept per split.
sample_number = int(len(dataset) * train_ratio)


def _select_split(split_name):
    """Return the rows of ``dataset`` tagged ``split_name``, capped and renamed.

    ``datasets.Dataset`` does not support pandas-style boolean indexing:
    ``dataset["split"] == split_name`` compares the whole column to a string
    and yields a single bool, not a row mask. Rows are therefore selected
    with ``Dataset.filter``.

    The cap is clamped to the size of the filtered split because
    ``sample_number`` is derived from the full dataset and would otherwise
    index past the end of the split.

    NOTE(review): assumes ``dataset`` is a single ``Dataset`` with "split",
    "input" and "translated_text" columns -- confirm against the data on disk.
    """
    subset = dataset.filter(lambda row: row["split"] == split_name)
    subset = subset.select(range(min(sample_number, len(subset))))
    return subset.rename_columns(
        {
            "input": "Luxembourgish",
            "translated_text": "English",
        }
    )


train_dataset = _select_split("train")
val_dataset = _select_split("val")


# Tokenizer is read from the base model directory. The EOS token is reused as
# the padding token, and padding is applied on the right.
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token, tokenizer.padding_side = tokenizer.eos_token, "right"

# Base model in half precision on cuda:0. In this cell it is only referenced
# by the currently disabled adapter-loading path:
#     model = PeftModel.from_pretrained(base_model, checkpoint)
base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="cuda:0",
    torch_dtype=torch.float16,
    return_dict=True,
    low_cpu_mem_usage=True,
)

# Model under evaluation, loaded directly from the training checkpoint.
model = AutoModelForCausalLM.from_pretrained(checkpoint, device_map="cuda:0")

# Function to generate from the model
def generate_response(prompt, model):
    """Generate the model's continuation of ``prompt``.

    Args:
        prompt (str): Fully formatted prompt text (already containing the chat
            template markers produced by ``create_prompt``).
        model: Causal language model exposing ``generate`` and ``device``.

    Returns:
        str: The decoded newly generated tokens only. Special tokens (for
        example the terminating EOS marker) are left in the text, so callers
        that strip them keep working.
    """
    # add_special_tokens is left enabled on purpose: the tokenized input is
    # unchanged from the previous implementation.
    encoded_input = tokenizer(prompt, return_tensors="pt", add_special_tokens=True)

    # Follow the model's device instead of assuming the default CUDA device.
    model_inputs = encoded_input.to(model.device)
    prompt_length = model_inputs["input_ids"].shape[1]

    generated_ids = model.generate(
        **model_inputs,
        max_new_tokens=MAX_LEN * 2,
        do_sample=True,
        pad_token_id=tokenizer.pad_token_id,
        eos_token_id=tokenizer.eos_token_id,
    )

    # The returned sequence starts with the prompt tokens. Dropping them by
    # position is exact, whereas removing the prompt with str.replace on the
    # decoded text fails silently whenever decoding does not reproduce the
    # prompt character-for-character, and would also delete an echoed prompt.
    continuation_ids = generated_ids[0][prompt_length:]
    return tokenizer.decode(continuation_ids)


def create_prompt(
    sample, mode="train", src_lng="Luxembourgish", tgt_lng="English", tokenizer=None
):
    """
    Build a Llama-3 style chat prompt for a translation sample.

    Args:
        sample (dict): Mapping that holds the source text under
            ``src_lng.capitalize()`` and, optionally, the target text under
            ``tgt_lng.capitalize()``.
        mode (str): When equal to 'train', the target text and the EOS token
            are appended after the assistant header; any other value stops
            the prompt at the assistant header.
        src_lng (str): Source language name.
        tgt_lng (str): Target language name.
        tokenizer: Tokenizer providing the ``eos_token`` attribute.

    Returns:
        dict: ``{"prompt_response": <prompt string>}``.

    Raises:
        ValueError: If ``tokenizer`` is None or has no EOS token.
    """
    if tokenizer is None or tokenizer.eos_token is None:
        raise ValueError("A tokenizer with a defined EOS token is required.")

    source_key = src_lng.capitalize()
    target_key = tgt_lng.capitalize()

    # The instruction is upper-cased as a whole, language names included.
    instruction = f"Translate the {src_lng} input text into {tgt_lng}.".upper()
    source_text = sample[source_key].strip()
    # A missing target column yields an empty reference instead of an error.
    target_text = sample[target_key].strip() if target_key in sample else ""

    segments = [
        "<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\n",
        instruction,
        "<|eot_id|><|start_header_id|>user<|end_header_id|>\n\n",
        source_text,
        "<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n",
    ]
    if mode == "train":
        segments.extend((target_text, tokenizer.eos_token))

    return {"prompt_response": "".join(segments)}


def generate_dataset_responses(
    dataset, model, tgt_lng="english", tokenizer=None, src_lng="Luxembourgish"
):
    """Translate every sample in ``dataset`` with ``model`` and score the output.

    For each sample a "test"-mode prompt is built with ``create_prompt``, the
    model's continuation is generated with ``generate_response``, and the
    result is scored against the reference with sacreBLEU using the
    "flores200" tokenizer (spBLEU).

    Args:
        dataset: Iterable of dict-like samples holding the source text under
            ``src_lng.capitalize()`` and the reference under
            ``tgt_lng.capitalize()``.
        model: Model passed through to ``generate_response``.
        tgt_lng (str): Target language name; also selects the reference column.
        tokenizer: Tokenizer passed through to ``create_prompt``.
        src_lng (str): Source language name; also selects the source column.

    Returns:
        pandas.DataFrame: One row per sample with the columns "LLM_Input",
        "LLM_Output", "Ground_Truth" (a single-item list holding the
        reference) and "SPBLEU_Score" (sentence-level score).
    """
    # Use the same column keys that create_prompt derives from the language
    # names, so the reference always matches the requested target language.
    source_key = src_lng.capitalize()
    target_key = tgt_lng.capitalize()

    source = []
    predictions = []  # LLM-generated translations
    targets = []  # Ground-truth references, one single-item list per sample

    for sample in tqdm(dataset, desc="Generating responses"):
        test_prompt = create_prompt(
            sample,
            mode="test",
            src_lng=src_lng,
            tgt_lng=tgt_lng,
            tokenizer=tokenizer,
        )["prompt_response"]

        # Remove chat-template markers that may remain in the decoded text.
        llm_response = (
            generate_response(test_prompt, model)
            .replace("<|begin_of_text|>", "")
            .replace("<|eot_id|>", "")
        )

        predictions.append(llm_response)
        # sacreBLEU expects a list of reference streams, hence the wrapping.
        targets.append([sample.get(target_key, "")])
        source.append(sample.get(source_key))

    df_results = pd.DataFrame(
        list(zip(source, predictions, targets)),
        columns=["LLM_Input", "LLM_Output", "Ground_Truth"],
    )

    # Sentence-level scores: each prediction is scored as a one-sentence corpus.
    df_results["SPBLEU_Score"] = [
        sacrebleu.corpus_bleu([prediction], [reference], tokenize="flores200").score
        for prediction, reference in zip(predictions, targets)
    ]
    average_spbleu = df_results["SPBLEU_Score"].mean()
    print(f"Average SPBLEU Score: {average_spbleu:.2f}")

    # BLEU is defined at corpus level; the mean of sentence scores above is
    # not the same quantity, so the corpus-level figure is reported as well.
    if predictions:
        reference_stream = [reference[0] for reference in targets]
        corpus_spbleu = sacrebleu.corpus_bleu(
            predictions, [reference_stream], tokenize="flores200"
        ).score
        print(f"Corpus SPBLEU Score: {corpus_spbleu:.2f}")

    return df_results


# Run the evaluation over the validation split with the loaded checkpoint.
pre_finetuned_responses = generate_dataset_responses(
    model=model,
    tokenizer=tokenizer,
    dataset=val_dataset,
    tgt_lng="english",
)

  from .autonotebook import tqdm as notebook_tqdm
Generating responses: 100%|██████████| 1012/1012 [06:52<00:00,  2.45it/s]


Average SPBLEU Score: 25.81


In [3]:
# Persist the per-sentence results to CSV.
# NOTE(review): the file name mentions the base model, while `model` is loaded
# from `checkpoint` -- confirm the intended name.
pre_finetuned_responses.to_csv(path_or_buf="output_base_model.csv")