# Data Prep

In [1]:
!pip install -r requirements.txt



In [2]:
from datasets import load_dataset, Dataset
import random

dataset_name = "LDJnr/Capybara"
dataset = load_dataset(dataset_name, split="all").shuffle(seed=42)

messages = []
for row in list(dataset):
    message = []
    for i, item in enumerate(row["conversation"]):
        user = {"role": "user"}
        user["content"] = item["input"]
        message.append(user)
        if i < len(row["conversation"]) - 1:
            assistant = {"role": "assistant"}
            assistant["content"] = item["output"]
            message.append(assistant)
    messages.append(message)

messages = random.sample(messages, 200)
print(messages[0])

orpo_path = "orpo.json"
llama_path = "llama.json"

[{'role': 'user', 'content': 'Create a solution in Kotlin for the following task: Associative array/Merging\n\nDefine two associative arrays, where one represents the following "base" data:\n\nKey - Value\n"name" - "Rocket Skates"\n"price" - 12.75\n"color" - "yellow"\n\nAnd the other represents "update" data:\n\nKey - Value\n"price" - 15.25\n"color" - "red"\n"year" - 1974\n\nMerge these into a new associative array that contains every key found in either of the source ones. Each key should map to the value in the second (update) table if that exists, or else to the value in the first (base) table. If possible, do this in a way that does not mutate the original two associative arrays. Obviously this should be done in a way that would work for any data, not just the specific data given here, but in this example the result should be:\n\nKey - Value\n"name" - "Rocket Skates"\n"price" - 15.25\n"color" - "red"\n"year" - 1974 PLAINFORMAT\n'}, {'role': 'assistant', 'content': 'fun main() {\n  

# Model Prep

In [3]:

from unsloth import FastLanguageModel
import json, time

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "EITD/orpo", # YOUR MODEL YOU USED FOR TRAINING
    max_seq_length = 2048,
    dtype = None,
    load_in_4bit = True,
)
FastLanguageModel.for_inference(model)

responses = []
for message in messages:
    start = time.time()        
    inputs = tokenizer.apply_chat_template(
        message,
        tokenize = True,
        add_generation_prompt = True, # Must add for generation
        return_tensors = "pt",
    ).to("cuda")

    outputs = model.generate(input_ids = inputs, max_new_tokens = 128, use_cache = True,
                            temperature = 1.5, min_p = 0.1)
    response = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]

    if "assistant" in response:
        response = response.split("assistant")[-1].strip()
    end = time.time()         

    responses.append({'time': end - start, 'response': response})

with open(orpo_path, "w", encoding="utf-8") as f:
    json.dump(responses, f, indent=4)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.




🦥 Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2024.12.2: Fast Llama patching. Transformers:4.46.3.
   \\   /|    GPU: NVIDIA H100 80GB HBM3 MIG 8g.80gb. Max memory: 79.109 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.4.0+cu121. CUDA: 9.0. CUDA Toolkit: 12.1. Triton: 3.0.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.27.post2. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Unsloth 2024.12.2 patched 16 layers with 16 QKV layers, 16 O layers and 16 MLP layers.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


In [4]:

from unsloth import FastLanguageModel
import json, time

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Llama-3.2-1B-Instruct-bnb-4bit", # YOUR MODEL YOU USED FOR TRAINING
    max_seq_length = 2048,
    dtype = None,
    load_in_4bit = True,
)
FastLanguageModel.for_inference(model)

responses = []
for message in messages:
    start = time.time()
    inputs = tokenizer.apply_chat_template(
        message,
        tokenize = True,
        add_generation_prompt = True, # Must add for generation
        return_tensors = "pt",
    ).to("cuda")
    
    outputs = model.generate(input_ids = inputs, max_new_tokens = 128, use_cache = True, temperature = 1.5, min_p = 0.1)
    
    response = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]

    if "assistant" in response:
        response = response.split("assistant")[-1].strip()
    end = time.time()         

    responses.append({'time': end - start, 'response': response})

with open(llama_path, "w", encoding="utf-8") as f:
    json.dump(responses, f, indent=4)

==((====))==  Unsloth 2024.12.2: Fast Llama patching. Transformers:4.46.3.
   \\   /|    GPU: NVIDIA H100 80GB HBM3 MIG 8g.80gb. Max memory: 79.109 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.4.0+cu121. CUDA: 9.0. CUDA Toolkit: 12.1. Triton: 3.0.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.27.post2. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


---

# LLM Judge

---

In [5]:
import torch
torch.cuda.empty_cache()

In [6]:
from flow_judge import Llamafile, EvalInput, FlowJudge, Vllm
from flow_judge.metrics import RESPONSE_FAITHFULNESS_5POINT
import json

# Initialize the model
judge = Vllm()

# Initialize the judge
faithfulness_judge = FlowJudge(
    metric=RESPONSE_FAITHFULNESS_5POINT,
    model=judge
)

# Create a list of inputs and outputs
inputs_batch = [
    [
        {"query": message[-1]["content"]},
        {"context": ""},
    ]
    for message in messages
]

def judge(file_path):
    with open(file_path, "r", encoding="utf-8") as f:
        data = json.load(f)
        
    outputs_batch = [{"response": item["response"]} for item in data]

    # Create a list of EvalInput
    eval_inputs_batch = [EvalInput(inputs=inputs, output=output) for inputs, output in zip(inputs_batch, outputs_batch)]

    # Run the batch evaluation
    results = faithfulness_judge.batch_evaluate(eval_inputs_batch, save_results=False)

    for i, result in enumerate(results):
        data[i]['score'] = result.score

    with open(file_path, "w", encoding="utf-8") as f:
        json.dump(data, f, indent=4)

judge(orpo_path)
judge(llama_path)

A new version of the following files was downloaded from https://huggingface.co/flowaicom/Flow-Judge-v0.1-AWQ:
- configuration_phi3.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


INFO 12-06 18:36:23 awq_marlin.py:90] The model is convertible to awq_marlin during runtime. Using awq_marlin kernel.
INFO 12-06 18:36:23 llm_engine.py:226] Initializing an LLM engine (v0.6.1.dev238+ge2c6e0a82) with config: model='flowaicom/Flow-Judge-v0.1-AWQ', speculative_config=None, tokenizer='flowaicom/Flow-Judge-v0.1-AWQ', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, rope_scaling=None, rope_theta=None, tokenizer_revision=None, trust_remote_code=True, dtype=torch.bfloat16, max_seq_len=8192, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=awq_marlin, enforce_eager=True, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None, collect_model_forward_time=False, collect_model_execute_time=False), 



INFO 12-06 18:36:24 weight_utils.py:242] Using model weights format ['*.safetensors']
INFO 12-06 18:36:24 weight_utils.py:287] No model.safetensors.index.json found in remote.


Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]


INFO 12-06 18:36:30 model_runner.py:1025] Loading model weights took 2.1700 GB
INFO 12-06 18:36:30 gpu_executor.py:122] # GPU blocks: 11669, # CPU blocks: 682


Processed prompts:   0%|          | 0/200 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]



Processed prompts: 100%|██████████| 200/200 [00:28<00:00,  7.01it/s, est. speed input: 6995.11 toks/s, output: 1647.95 toks/s]
Processed prompts: 100%|██████████| 200/200 [00:28<00:00,  7.02it/s, est. speed input: 7033.27 toks/s, output: 1659.79 toks/s]


In [7]:
def compute_metrics(file_path):
    with open(file_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    time_sum = 0
    score_sum = 0
    for item in data:
        time_sum += item['time']
        score_sum += item['score']

    print("Avg Inference Time:", time_sum / len(data))
    print("Avg Score:", score_sum / len(data))

print("Orpo metrics:")
compute_metrics(orpo_path)
print("Llama metrics:")
compute_metrics(llama_path)

Orpo metrics:
Avg Inference Time: 1.6124457895755768
Avg Score: 3.51
Llama metrics:
Avg Inference Time: 1.3945598220825195
Avg Score: 3.395
