In [None]:
import pandas as pd
from datasets import load_dataset, Dataset
from transformers import AutoModelForCausalLM, AutoTokenizer
import time

# Models under evaluation: short name -> Hugging Face Hub repository id.
models_map = {
    # "lora_llama": "EITD/lora_model_1",
    "orpo_llama": "EITD/orpo_llama",
    "origin_llama": "unsloth/Llama-3.2-1B-Instruct",
}

# Per-model registries, each keyed by the short names used in models_map.
tokenizers = {}
models = {}
evaluation_datasets = {}

for model_name, model_path in models_map.items():
    tokenizers[model_name] = AutoTokenizer.from_pretrained(model_path)
    models[model_name] = AutoModelForCausalLM.from_pretrained(model_path)
    # Every model starts with an empty results table; rows are appended to it
    # later, one per evaluated prompt.
    evaluation_datasets[model_name] = Dataset.from_dict(
        {column: [] for column in ("index", "response", "feedback", "generate time", "score")}
    )

# Load dataset for evaluation
# dataset_name = "mlabonne/orpo-dpo-mix-40k"
dataset_name = "mlabonne/FineTome-100k"
dataset = load_dataset(dataset_name, split="all").shuffle(seed=42)

# Number of prompts (taken from the front of the shuffled dataset) to evaluate.
NUM_EVAL_SAMPLES = 3

# Select the evaluation subset BEFORE mapping: only the selected rows are ever
# used, so converting the whole dataset first would be wasted work. The
# resulting rows are the same either way because the shuffle order is fixed.
#
# Each row becomes a single chat message whose content is the first "human"
# turn of the conversation (None when the conversation has no such turn).
messages = dataset.select(range(NUM_EVAL_SAMPLES)).map(
    lambda row: {
        "role": "user",
        "content": next(
            (item["value"] for item in row["conversations"] if item["from"] == "human"),
            None,
        ),
    },
    remove_columns=dataset.column_names,
)

In [None]:
def saveResponse(model_name, messages):
    """Generate one response per message with the named model and record it.

    For every message the chat template is applied, up to 128 new tokens are
    generated, and a row holding the message position, the generated text and
    the elapsed wall-clock time is appended to the model's evaluation dataset.
    The "feedback" and "score" fields of each new row are left as None.

    Args:
        model_name: Key into the module-level ``tokenizers``, ``models`` and
            ``evaluation_datasets`` dictionaries.
        messages: Iterable of chat messages, each a mapping with "role" and
            "content" keys as expected by ``apply_chat_template``.

    Side effects:
        Replaces ``evaluation_datasets[model_name]`` with the extended dataset.
        Rows are appended to whatever is already stored there, so calling this
        twice for the same model yields repeated "index" values.
    """
    tokenizer = tokenizers.get(model_name)
    model = models.get(model_name)
    evaluation_dataset = evaluation_datasets.get(model_name)

    # Iterate over the messages that were passed in. The previous code read the
    # global `dataset['test']`, which ignores the argument and does not exist
    # when the dataset is loaded as a single split.
    for i, message in enumerate(messages):
        start = time.time()         # start time
        inputs = tokenizer.apply_chat_template(
            [message],
            tokenize=True,
            add_generation_prompt=True,  # Must add for generation
            return_tensors="pt",
        )

        # NOTE(review): temperature / min_p only take effect when sampling is
        # enabled; whether it is depends on the model's generation config —
        # TODO confirm.
        outputs = model.generate(
            input_ids=inputs,
            max_new_tokens=128,
            use_cache=True,
            temperature=1.5,
            min_p=0.1,
        )

        # `generate` returns prompt + completion. Decode only the tokens after
        # the prompt instead of splitting the full text on the word
        # "assistant", which would cut off any answer containing that word.
        prompt_length = inputs.shape[-1]
        response = tokenizer.decode(
            outputs[0][prompt_length:], skip_special_tokens=True
        ).strip()
        end = time.time()           # end time

        evaluation_dataset = evaluation_dataset.add_item(
            {
                "index": i,
                "response": response,
                "feedback": None,
                "generate time": end - start,
                "score": None,
            }
        )

    evaluation_datasets[model_name] = evaluation_dataset

In [None]:
# Set up Flow Judge, which scores the generated responses
from flow_judge import Llamafile, EvalInput, FlowJudge
from flow_judge.metrics import RESPONSE_FAITHFULNESS_5POINT

# Llamafile-backed model object that is handed to the judge below.
judge = Llamafile()

# Judge that grades responses with the 5-point response-faithfulness metric.
faithfulness_judge = FlowJudge(model=judge, metric=RESPONSE_FAITHFULNESS_5POINT)

def add_feedback_and_score(example, result):
    """Combine a stored generation row with the judge's verdict for it.

    Args:
        example: Mapping that provides the "response" and "generate time" keys.
        result: Object exposing ``feedback`` and ``score`` attributes.

    Returns:
        A new dict with the keys "response", "generate time", "feedback" and
        "score"; the first two are copied from ``example`` and the last two
        are read from ``result``.
    """
    # Carry the generation fields over unchanged.
    merged = {key: example[key] for key in ("response", "generate time")}
    # Attach what the judge produced.
    merged["feedback"] = result.feedback
    merged["score"] = result.score
    return merged

def saveJudge(model_name, messages):
    """Score the stored responses of one model with the faithfulness judge.

    The i-th message is paired with the i-th stored response, the pairs are
    evaluated in one batch, and each row of the model's evaluation dataset is
    updated with the feedback and score of its own evaluation result.

    Args:
        model_name: Key into the module-level ``evaluation_datasets``.
        messages: Iterable of chat messages (mappings with a "content" key),
            in the same order in which the responses were generated.

    Raises:
        ValueError: If there are more stored responses than messages, in which
            case some responses would have no prompt to be judged against.

    Side effects:
        Replaces ``evaluation_datasets[model_name]`` with the scored dataset.
    """
    evaluation_dataset = evaluation_datasets.get(model_name)

    # Judge inputs: the user query plus an empty context for every message.
    inputs_batch = [
        [
            {"query": message["content"]},
            {"context": ""},
        ]
        for message in messages
    ]
    outputs_batch = [{"response": item["response"]} for item in evaluation_dataset]

    # zip() below would silently drop the unmatched responses, and the result
    # lookup would then fail or attach another row's verdict. Fail loudly.
    if len(outputs_batch) > len(inputs_batch):
        raise ValueError(
            f"{len(outputs_batch)} stored responses for {model_name!r} "
            f"but only {len(inputs_batch)} messages to judge them against"
        )

    # Create a list of EvalInput
    eval_inputs_batch = [
        EvalInput(inputs=inputs, output=output)
        for inputs, output in zip(inputs_batch, outputs_batch)
    ]

    # Run the batch evaluation
    results = faithfulness_judge.batch_evaluate(eval_inputs_batch, save_results=False)

    # Store feedback and score on each row. Results were built in row order,
    # so look them up by row position rather than by the stored "index"
    # column, which can repeat if responses were appended more than once.
    evaluation_datasets[model_name] = evaluation_dataset.map(
        lambda item, row_idx: add_feedback_and_score(item, results[row_idx]),
        with_indices=True,
    )

In [None]:
# For every configured model: first generate and store its responses, then
# have the judge score them.
for model_name in models_map:
    saveResponse(model_name, messages)
    saveJudge(model_name, messages)