In [1]:
import pandas as pd
from datasets import load_dataset, Dataset
from transformers import AutoModelForCausalLM, AutoTokenizer
import time

# Load models, Create evaluation matrix
models_map = {
    # "lora_llama": "EITD/lora_model_1",
    "orpo_llama": "EITD/orpo_llama",
    "origin_llama": "unsloth/Llama-3.2-1B-Instruct",
}

tokenizers, models, evaluation_datasets = {}, {}, {}

for model_name, model_path in models_map.items():
    print(model_name + ":")
    tokenizers[model_name] = AutoTokenizer.from_pretrained(model_path)
    models[model_name] = AutoModelForCausalLM.from_pretrained(model_path)
    evaluation_datasets[model_name] = Dataset.from_dict({"index": [],"response": [], "feedback": [],"generate time": [],"score": []})

# Load dataset for evaluation
# dataset_name = "mlabonne/orpo-dpo-mix-40k"
dataset_name = "mlabonne/FineTome-100k"
print(dataset_name + ":")
dataset = load_dataset(dataset_name, split="all").shuffle(seed=42)
# messages = [{"role": "user", "content": prompt} for prompt in dataset["test"]["prompt"]]
# messages = [{"role": "user", "content": feature['conversations'][0]['value']} for feature in dataset['test']]
messages = dataset.map(
    lambda row: {"role": "user", "content": next((item["value"] for item in row["conversations"] if item["from"] == "human"), None)},
    remove_columns=dataset.column_names
)

messages = messages.select(range(3))

orpo_llama:
origin_llama:
mlabonne/FineTome-100k:


In [2]:
def saveResponse(model_name, messages):
    tokenizer = tokenizers.get(model_name)
    model = models.get(model_name)
    evaluation_dataset = evaluation_datasets.get(model_name) # can use dataSet here

    for i, message in enumerate(messages):
        
        start = time.time()         # start time
        inputs = tokenizer.apply_chat_template(
            [message],
            tokenize = True,
            add_generation_prompt = True, # Must add for generation
            return_tensors = "pt",
        )
        
        outputs = model.generate(input_ids = inputs, max_new_tokens = 128, use_cache = True,
                                temperature = 1.5, min_p = 0.1)
        
        response = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]
        end = time.time()           # end time

        if "assistant" in response:
            response = response.split("assistant")[-1].strip()
        
        evaluation_dataset = evaluation_dataset.add_item({"index": i, "response": response, "feedback": None, "generate time": end-start, "score": None})
    
    evaluation_datasets[model_name] = evaluation_dataset

In [3]:
for model_name in models_map.keys():
    print(model_name + ": responses")
    saveResponse(model_name, messages)

orpo_llama: responses


The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


origin_llama: responses


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


In [4]:
for model_name in models_map.keys():
    evaluation_dataset = evaluation_datasets.get(model_name)
    print(model_name + ":")
    for i in range(len(evaluation_dataset)):
        print(evaluation_dataset[i]["score"], evaluation_dataset[i]["generate time"], evaluation_dataset[i]["feedback"])

orpo_llama:
None 32.386916399002075 None
None 33.40937304496765 None
None 29.25759267807007 None
origin_llama:
None 25.634678840637207 None
None 29.464585781097412 None
None 26.37605357170105 None


In [5]:
# Read the sample data
from flow_judge import Llamafile, EvalInput, FlowJudge
from flow_judge.metrics import RESPONSE_FAITHFULNESS_5POINT

# Initialize the model
judge = Llamafile(
    gpu_layers=0,
    flash_attn=False,
    quantized_kv=False,
)

# Initialize the judge
faithfulness_judge = FlowJudge(
    metric=RESPONSE_FAITHFULNESS_5POINT,
    model=judge
)

def add_feedback_and_score(example, result):
    return {
        "response": example['response'],
        "generate time": example['generate time'],
        "feedback": result.feedback,
        "score": result.score
    }

def saveJudge(model_name, messages):
    evaluation_dataset = evaluation_datasets.get(model_name)

    # Create a list of inputs and outputs
    inputs_batch = [
        [
            {"query": message["content"]},
            {"context": ""},
        ]
        for message in messages
    ]
    outputs_batch = [{"response": item["response"]} for item in evaluation_dataset]

    # Create a list of EvalInput
    eval_inputs_batch = [EvalInput(inputs=inputs, output=output) for inputs, output in zip(inputs_batch, outputs_batch)]

    # Run the batch evaluation
    results = faithfulness_judge.batch_evaluate(eval_inputs_batch, save_results=False)

    # Visualizing the results
    evaluation_datasets[model_name] = evaluation_dataset.map(lambda item: add_feedback_and_score(item, results[item['index']]))



In [6]:
for model_name in models_map.keys():
    print(model_name + ": feedbacks")
    saveJudge(model_name, messages)

orpo_llama: feedbacks


INFO:root:Starting llamafile server...
INFO:root:Llamafile path: C:\Users\silve\.cache\flow-judge\flow-judge.llamafile
INFO:root:Starting llamafile server with command: sh -c 'C:\Users\silve\.cache\flow-judge\flow-judge.llamafile --server --host 127.0.0.1 --port 8085 -c 8192 -ngl 0 --temp 0.1 -n 1000 --threads 32 --nobrowser -b 32 --parallel 1 --cont-batching'
ERROR:root:Error starting llamafile server: [WinError 2] 系统找不到指定的文件。
Traceback (most recent call last):
  File "c:\Python3112\Lib\site-packages\flow_judge\models\llamafile.py", line 369, in start_llamafile_server
    self.llamafile_process = self._start_llamafile_process(command)
                             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Python3112\Lib\site-packages\flow_judge\models\llamafile.py", line 422, in _start_llamafile_process
    process = subprocess.Popen(
              ^^^^^^^^^^^^^^^^^
  File "c:\Python3112\Lib\subprocess.py", line 1024, in __init__
    self._execute_child(args, executable, preexe

FileNotFoundError: [WinError 2] 系统找不到指定的文件。

In [None]:
for model_name in models_map.keys():
    evaluation_dataset = evaluation_datasets.get(model_name)
    print(model_name + ":")
    for i in range(len(evaluation_dataset)):
        print(evaluation_dataset[i]["score"], evaluation_dataset[i]["generate time"], evaluation_dataset[i]["feedback"])