### Installation

In [None]:
import torch
# Displays True when PyTorch can see a CUDA device; the inference cell later
# moves its inputs to "cuda", so this should be True before continuing.
torch.cuda.is_available()

In [None]:
%%capture
# %%capture has to remain the first line of this cell; it captures the cell's
# output so the pip logs are not shown.
import os
# The runtime is treated as Colab when any environment variable name contains "COLAB_".
if "COLAB_" not in "".join(os.environ.keys()):
    !pip install unsloth
else:
    # Do this only in Colab notebooks! Otherwise use pip install unsloth
    # --no-deps installs the listed packages without pulling in their dependencies
    # (presumably to leave Colab's preinstalled stack untouched — TODO confirm).
    !pip install --no-deps bitsandbytes accelerate xformers==0.0.29.post3 peft trl triton cut_cross_entropy unsloth_zoo
    !pip install sentencepiece protobuf datasets huggingface_hub hf_transfer
    !pip install --no-deps unsloth

In [None]:
from unsloth import FastLanguageModel
import torch
# --- Loading configuration -------------------------------------------------
# Context length; any value may be chosen (RoPE scaling is auto-supported internally).
max_seq_length = 2048
# None lets the dtype be auto-detected. Float16 for Tesla T4, V100; Bfloat16 for Ampere+.
dtype = None
# 4bit quantization reduces memory usage. Can be False.
load_in_4bit = True

# Keyword arguments for the loader, collected in one place.
# "unsloth/Llama-3.2-1B-Instruct" is an alternative model_name.
load_options = dict(
    model_name = "unsloth/Llama-3.2-3B-Instruct",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)

model, tokenizer = FastLanguageModel.from_pretrained(**load_options)

In [None]:
# Names of the projection layers that receive LoRA adapters.
lora_target_modules = [
    "q_proj", "k_proj", "v_proj", "o_proj",
    "gate_proj", "up_proj", "down_proj",
]

# Attach LoRA adapters to the loaded model.
# NOTE(review): no training step is visible in this notebook between this cell
# and inference — confirm the adapters are actually needed for judging.
# This cell rebinds `model`, so re-running it passes the already-adapted model
# back in; re-run the loading cell first if this cell must be repeated.
model = FastLanguageModel.get_peft_model(
    model,
    # LoRA rank: any number > 0; suggested 8, 16, 32, 64, 128.
    r = 16,
    target_modules = lora_target_modules,
    lora_alpha = 16,
    # Any dropout is supported, but 0 is the optimized setting.
    lora_dropout = 0,
    # Any bias mode is supported, but "none" is the optimized setting.
    bias = "none",
    # True or "unsloth" for very long context; "unsloth" is described as using
    # 30% less VRAM and fitting 2x larger batch sizes.
    use_gradient_checkpointing = "unsloth",
    random_state = 3407,
    # Rank stabilized LoRA is supported but disabled here.
    use_rslora = False,
    # LoftQ is supported but not configured here.
    loftq_config = None,
)

In [None]:
import json

def load_jsonl(path):
    """Read a JSON Lines file into a list of parsed records.

    Args:
        path: Location of a file holding one JSON document per line.

    Returns:
        A list with one parsed object per non-blank line, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # JSON text is UTF-8 by specification, so do not rely on the locale default.
    with open(path, 'r', encoding='utf-8') as f:
        # Blank lines (e.g. an extra empty line at the end of the file) are
        # skipped because json.loads rejects them.
        return [json.loads(line) for line in f if line.strip()]


# Predictions of the two systems being compared; the judging cell pairs them by position.
rag = load_jsonl("/content/RAG 512 items.jsonl")
graph = load_jsonl("/content/Fine-tuned G-Retriever 512 items.jsonl")

In [None]:
# Show the first RAG record; the judging cell reads its 'id' and 'pred' fields.
rag[0]

In [None]:
# Show the first G-Retriever record; the judging cell reads its 'question', 'label' and 'pred' fields.
graph[0]

<a name="Inference"></a>
### Inference
Let's run the model! You can change the instruction and input - leave the output blank!

**[NEW] Try 2x faster inference in a free Colab for Llama-3.1 8b Instruct [here](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Unsloth_Studio.ipynb)**

We use `min_p = 0.1` and `temperature = 1.0`. Read this [Tweet](https://x.com/menhguin/status/1826132708508213629) for background on `min_p` sampling.

In [None]:
FastLanguageModel.for_inference(model) # Enable native 2x faster inference

# File name of the evaluation results; the end of this cell writes it under /content/.
fname = "RAG vs Fine-tuned G-Retriever 512 items evaluation.jsonl"

# Judge prompt template. The <USER QUESTION>, <MODEL SOLUTION>, <ANSWER A> and
# <ANSWER B> placeholders are substituted for every item before generation.
# The template asks for a JSON object with "explanation" and "verdict" keys and
# ends with the "### YOUR VERDICT" marker.
PROMPT = """Please act as an impartial judge and evaluate the quality of the responses provided by two AI assistants to the user question and the model solution displayed below.
You should choose the assistant, whose answer is the most similar to the model solution below.
Your evaluation should consider factors such as the relevance to the user question; accuracy and the lack of hallucinations of their responses in respect to the model answer.
Begin your evaluation by comparing the two responses and provide a short explanation.
Think step by step. Avoid any position biases and ensure that the order in which the responses were presented does not influence your decision.
Your evaluation is independent from the length of the responses.
Your evaluation is independent from the names of the assistants. Be as objective as possible. After providing your explanation you must output your final verdict by strictly following this format:
"Assistant A's answer is better" if assistant A’s answer is better than assistant B’s answer,
"Assistant B's answer is better" if assistant B’s answer is better than assistant A’s answer,
"Both assistants are good" if both assistant A and B have provided good results, and
"Both assistants are bad" if both assistants gave a wrong answer.
You must provide your answer in the json format like this:
{"explanation": "<short explanation of your verdict here>", "verdict": "<your verdict here that is based on explanation>"}
Note that you are busy, and you do not have a lot of time for explanation. Do not write more than 1 sentence.
### USER QUESTION
<USER QUESTION>
### MODEL SOLUTION
<MODEL SOLUTION>
### Assistant A’s answer
<ANSWER A>
### Assistant B’s answer
<ANSWER B>
### YOUR VERDICT"""

# The only verdict strings the judge may return; they mirror the wording in PROMPT.
VALID_VERDICTS = (
    "Assistant A's answer is better",
    "Assistant B's answer is better",
    "Both assistants are good",
    "Both assistants are bad",
)

# Cap on generation attempts per item so a single item cannot stall the whole run.
MAX_ATTEMPTS = 10


def _parse_verdict(generated_text):
    """Extract and validate the judge's JSON verdict from its generated text.

    Args:
        generated_text: Text the model produced for one prompt (prompt excluded).

    Returns:
        The parsed dict. It contains an 'explanation' key and a 'verdict' key
        whose value is one of VALID_VERDICTS.

    Raises:
        ValueError: If no JSON object can be parsed, or the object lacks an
            'explanation' or carries a verdict outside VALID_VERDICTS
            (json.JSONDecodeError is a ValueError subclass).
    """
    start = generated_text.find('{')
    if start == -1:
        raise ValueError("no JSON object in judge output")
    # raw_decode parses the first complete JSON value beginning at `start` and
    # ignores whatever follows, so a '}' inside the explanation text is harmless.
    parsed = json.JSONDecoder().raw_decode(generated_text, start)[0]
    if not isinstance(parsed, dict) or 'explanation' not in parsed:
        raise ValueError("judge output is missing the 'explanation' field")
    if parsed.get('verdict') not in VALID_VERDICTS:
        raise ValueError(f"unexpected verdict: {parsed.get('verdict')!r}")
    return parsed


answers = list()
# Ids of items for which no valid verdict was obtained within MAX_ATTEMPTS.
failed_ids = list()

if len(rag) != len(graph):
    # zip() below stops at the shorter list, so surface the mismatch instead of
    # silently dropping items.
    print(f"WARNING: rag has {len(rag)} items but graph has {len(graph)}; extra items are ignored")

for r, g in zip(rag, graph): # need to switch those on the second run
    modified_prompt = (
        PROMPT
        .replace("<USER QUESTION>", g['question'])
        .replace("<MODEL SOLUTION>", g['label'])
        .replace("<ANSWER A>", r['pred'])
        .replace("<ANSWER B>", g['pred'])
    )

    messages = [
        {"role": "user", "content": modified_prompt},
    ]
    inputs = tokenizer.apply_chat_template(
        messages,
        tokenize = True,
        add_generation_prompt = True, # Must add for generation
        return_tensors = "pt",
    ).to("cuda")
    # Number of prompt tokens; assumes `inputs` is a (1, seq_len) tensor of token
    # ids, which is also what passing it as input_ids below relies on.
    prompt_len = inputs.shape[1]

    # Retrying only helps when generation is stochastic.
    # NOTE(review): confirm sampling is enabled in the model's generation config.
    for _attempt in range(MAX_ATTEMPTS):
        print(r['id'], end=" ")
        output_ids = model.generate(input_ids = inputs, max_new_tokens = 128,
                                    use_cache = True, temperature = 1.0, min_p = 0.1)
        # Decode only the newly generated tokens so nothing from the prompt
        # (which itself contains a JSON example) can be mistaken for the answer.
        generated_text = tokenizer.decode(output_ids[0][prompt_len:],
                                          skip_special_tokens = True)
        try:
            answer = _parse_verdict(generated_text)
        except ValueError:
            # Malformed or invalid verdict: sample again. Other errors
            # (KeyboardInterrupt, CUDA failures, ...) propagate on purpose.
            continue
        answer['id'] = r['id']
        answers.append(answer)
        print(answer['verdict'])
        break
    else:
        failed_ids.append(r['id'])
        print(f"no valid verdict after {MAX_ATTEMPTS} attempts")

if failed_ids:
    print(f"Items without a valid verdict: {failed_ids}")

# Persist the collected verdicts as JSON Lines: one JSON object per line.
results_path = f'/content/{fname}'
with open(results_path, 'w') as out_file:
    out_file.writelines(f"{json.dumps(record)}\n" for record in answers)