In [1]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import pandas as pd
from datasets import load_dataset, Dataset
import time

# Model under evaluation and its tokenizer, loaded by Hub id.
tokenizer_orpo_llama = AutoTokenizer.from_pretrained("EITD/orpo_llama")
model_orpo_llama = AutoModelForCausalLM.from_pretrained("EITD/orpo_llama")

dataset_name = "mlabonne/FineTome-100k"
# Fixed seed so the same prompts are drawn on every run.
dataset = load_dataset(dataset_name, split="all").shuffle(seed=42)

# Number of prompts used for the evaluation run.
NUM_EVAL_PROMPTS = 3

# Select the evaluation subset *before* mapping: the mapping is applied row by
# row, so this yields the same rows as mapping the whole dataset and slicing
# afterwards, without transforming rows that are thrown away.
# Each prompt is the first "human" turn of the conversation (None if absent).
messages = dataset.select(range(NUM_EVAL_PROMPTS)).map(
    lambda row: {
        "role": "user",
        "content": next(
            (item["value"] for item in row["conversations"] if item["from"] == "human"),
            None,
        ),
    },
    remove_columns=dataset.column_names,
)

# Empty results table; one row per prompt is written by saveResponse and the
# "feedback" / "score" columns are filled in later by saveJudge.
evaluation_matrix_orpo_llama = Dataset.from_dict({"index": [],"response": [], "feedback": [],"generate time": [],"score": []})
# Registry of results tables keyed by model name.
evaluation_datasets = {"orpo_llama": evaluation_matrix_orpo_llama}

In [2]:
# Sanity check: show the selected prompts and the (still empty) results table.
messages, evaluation_matrix_orpo_llama

(Dataset({
     features: ['role', 'content'],
     num_rows: 3
 }),
 Dataset({
     features: ['index', 'response', 'feedback', 'generate time', 'score'],
     num_rows: 0
 }))

In [5]:
def saveResponse(model_name, messages):
    """Store one response row per prompt for ``model_name``.

    The rows are written to ``evaluation_datasets[model_name]`` as a fresh
    dataset, so calling this function again replaces the previous rows instead
    of appending duplicates (the judging step pairs prompts and responses by
    position and relies on exactly one row per prompt).

    Args:
        model_name: Key into the module-level ``evaluation_datasets`` registry.
        messages: Iterable of chat messages (dicts with "role" / "content").

    Raises:
        KeyError: If ``model_name`` has no entry in ``evaluation_datasets``.

    NOTE: generation is currently disabled; every response is the placeholder
    string "right answer" and the recorded time only spans the disabled code.
    """
    if model_name not in evaluation_datasets:
        raise KeyError(f"no evaluation dataset registered for model {model_name!r}")

    # Only referenced by the disabled generation code below.
    tokenizer = tokenizer_orpo_llama
    model = model_orpo_llama

    # Collect columns in plain lists and build the dataset once at the end;
    # growing a Dataset with add_item() rebuilds the table on every iteration.
    columns = {"index": [], "response": [], "feedback": [], "generate time": [], "score": []}

    for i, message in enumerate(messages):

        start = time.time()         # start time
        # TODO: re-enable real generation.
        # inputs = tokenizer.apply_chat_template(
        #     [message],
        #     tokenize = True,
        #     add_generation_prompt = True, # Must add for generation
        #     return_tensors = "pt",
        # )

        # outputs = model.generate(input_ids = inputs, max_new_tokens = 128, use_cache = True,
        #                         temperature = 1.5, min_p = 0.1)

        # response = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]
        end = time.time()           # end time

        # if "assistant" in response:
        #     response = response.split("assistant")[-1].strip()

        response = "right answer"   # placeholder while generation is disabled

        columns["index"].append(i)
        columns["response"].append(response)
        columns["feedback"].append(None)    # filled in by the judging step
        columns["generate time"].append(end - start)
        columns["score"].append(None)       # filled in by the judging step

    evaluation_datasets[model_name] = Dataset.from_dict(columns)

In [6]:
# Record a response row for every selected prompt under the "orpo_llama" entry.
saveResponse("orpo_llama", messages)

In [None]:
# Flow Judge imports used to grade the stored responses
from flow_judge import Vllm, Llamafile, Hf, EvalInput, FlowJudge
from flow_judge.metrics import RESPONSE_FAITHFULNESS_5POINT
from flow_judge.metrics import list_all_metrics

# Llamafile instance that is handed to the judge as its model.
judge = Llamafile()

# Judge configured with the RESPONSE_FAITHFULNESS_5POINT metric.
faithfulness_judge = FlowJudge(model=judge, metric=RESPONSE_FAITHFULNESS_5POINT)

def add_feedback_and_score(example, result):
    """Combine a stored response row with the judge's verdict.

    Args:
        example: Mapping that provides the "response" and "generate time" keys.
        result: Object exposing ``feedback`` and ``score`` attributes.

    Returns:
        A new dict with the keys "response", "generate time", "feedback" and
        "score"; ``example`` itself is not modified.
    """
    # Carry over the fields that already exist on the row...
    merged = {key: example[key] for key in ("response", "generate time")}
    # ...then attach the judge's verdict.
    merged["feedback"] = result.feedback
    merged["score"] = result.score
    return merged

def saveJudge(model_name, messages):
    """Grade the stored responses of ``model_name`` and attach feedback/score.

    Prompts from ``messages`` and rows of ``evaluation_datasets[model_name]``
    are paired by position, evaluated in one batch with ``faithfulness_judge``
    and the results are written back into ``evaluation_datasets[model_name]``.

    Args:
        model_name: Key into the module-level ``evaluation_datasets`` registry.
        messages: Iterable of chat messages; ``message["content"]`` is used as
            the query.

    Raises:
        KeyError: If ``model_name`` has no entry in ``evaluation_datasets``.
        ValueError: If the number of prompts differs from the number of stored
            responses (they could not be paired reliably).
    """
    evaluation_dataset = evaluation_datasets.get(model_name)
    if evaluation_dataset is None:
        raise KeyError(f"no evaluation dataset registered for model {model_name!r}")

    # Judge inputs: the user query plus an empty context for every prompt.
    inputs_batch = [
        [
            {"query": message["content"]},
            {"context": ""},
        ]
        for message in messages
    ]
    outputs_batch = [{"response": item["response"]} for item in evaluation_dataset]

    # zip() would silently drop the surplus on a mismatch, leaving rows
    # unjudged or paired with the wrong prompt; fail loudly instead.
    if len(inputs_batch) != len(outputs_batch):
        raise ValueError(
            f"{len(inputs_batch)} prompts but {len(outputs_batch)} stored responses "
            f"for model {model_name!r}; run saveResponse on the same messages first"
        )

    # Create a list of EvalInput
    eval_inputs_batch = [EvalInput(inputs=inputs, output=output) for inputs, output in zip(inputs_batch, outputs_batch)]

    # Run the batch evaluation
    results = faithfulness_judge.batch_evaluate(eval_inputs_batch, save_results=False)

    # Merge each result into its row. Results are in the same order as the
    # rows, so look them up by row position rather than by the stored "index"
    # column, which may be duplicated or non-contiguous.
    evaluation_datasets[model_name] = evaluation_dataset.map(
        lambda item, position: add_feedback_and_score(item, results[position]),
        with_indices=True,
    )

In [None]:
# Grade the stored "orpo_llama" responses and attach feedback and score to each row.
saveJudge("orpo_llama", messages)

{'index': 0, 'response': 'right answer', 'feedback': 'some', 'generate time': 0.0, 'score': 2}
{'index': 1, 'response': 'right answer', 'feedback': 'other', 'generate time': 0.0, 'score': 1}
{'index': 2, 'response': 'right answer', 'feedback': 'things', 'generate time': 0.0, 'score': 5}


Map:   0%|          | 0/3 [00:00<?, ? examples/s]