### Installation

In [None]:
import torch
# Displays True when PyTorch can see a CUDA device; the inference cell further down moves its inputs to "cuda".
torch.cuda.is_available()

In [None]:
%%capture
# Install Unsloth and its dependencies. The %%capture cell magic above has to stay on the
# first line of the cell; it hides the installer output.
import os
if "COLAB_" not in "".join(os.environ.keys()):
    # No environment variable name contains "COLAB_": plain install, pip resolves dependencies.
    !pip install unsloth
else:
    # Do this only in Colab notebooks! Otherwise use pip install unsloth
    # --no-deps installs only the listed packages without their dependencies
    # (presumably to avoid replacing packages preinstalled in Colab; TODO confirm).
    !pip install --no-deps bitsandbytes accelerate xformers==0.0.29.post3 peft trl triton cut_cross_entropy unsloth_zoo
    !pip install sentencepiece protobuf datasets huggingface_hub hf_transfer
    !pip install --no-deps unsloth

In [None]:
from unsloth import FastLanguageModel
import torch
# Loading options forwarded to FastLanguageModel.from_pretrained below.
max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally!
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.

# Load the base model together with its tokenizer; `model` and `tokenizer` are reused by
# the LoRA and inference cells further down.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Llama-3.2-3B-Instruct", # or choose "unsloth/Llama-3.2-1B-Instruct"
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit
)

In [None]:
# Attach LoRA adapters to the attention and MLP projection modules listed in target_modules.
# NOTE(review): this cell rebinds `model`, so running it a second time passes the already
# wrapped model to get_peft_model again; re-run the loading cell first.
# NOTE(review): no training step is visible in this part of the notebook; confirm whether the
# adapters are needed for the inference-only evaluation below.
model = FastLanguageModel.get_peft_model(
    model,
    r = 16, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 16,
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    random_state = 3407,
    use_rslora = False,  # We support rank stabilized LoRA
    loftq_config = None, # And LoftQ
)

In [None]:
import json

# Load the predictions to grade from a JSON Lines file (one JSON object per line).
# - Iterating over the file handle parses line by line instead of first materialising
#   every raw line with readlines().
# - Blank lines are skipped: json.loads("\n") raises JSONDecodeError, so a stray empty
#   line would otherwise abort the whole load.
# - The encoding is explicit so the result does not depend on the platform default.
with open("/content/G-Retriever 512 items.jsonl", 'r', encoding="utf-8") as f:
    graph = [json.loads(line) for line in f if line.strip()]

# Display the first record; the grading loop reads its 'id', 'question', 'label' and 'pred' keys.
graph[0]

<a name="Inference"></a>
### Inference
Let's run the model! You can change the instruction and input - leave the output blank!

**[NEW] Try 2x faster inference in a free Colab for Llama-3.1 8b Instruct [here](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Unsloth_Studio.ipynb)**

We use `min_p = 0.1` and `temperature = 1`. Read this [Tweet](https://x.com/menhguin/status/1826132708508213629) for more information on why.

In [None]:
FastLanguageModel.for_inference(model) # Enable native 2x faster inference
from transformers import TextStreamer

# Parameters used to build the output file name:
# "/content/{type_f} {items} items evaluation-{file_n}.jsonl"
# NOTE(review): the records loaded earlier come from "G-Retriever 512 items.jsonl", while the
# values below yield a name starting with "RAG 2048 items"; confirm they describe this run.
type_f = "RAG" # Type: RAG or G-Retriever
items = 2048 # context length used
file_n = 1 # Current run. Must be 3 runs in total


# Grading prompt template. The placeholders <USER QUESTION>, <GROUND TRUTH ANSWER> and
# <STUDENT’S ANSWER> are substituted per record with str.replace in the loop below, and the
# model is asked to reply with a JSON object holding "explanation" and "score".
# NOTE(review): the text contains the typo "Start you explanation"; it is left untouched
# because the string is sent to the model verbatim and editing it would change the inputs.
PROMPT = """You are a Python programming teacher in the SpyderIDE repository. Your task is to score your students' answers on questions.
You are given a question from your test, the correct answer for the question, and the student's answer to that question.
Score the student's answer based on how correct it is in regards to the correct answer. You are a strict but fair teacher.
Only use the provided information; do not reference the SpyderIDE in your explanations.
You must provide your answer in the json format like this:
{"explanation": "<short analysis of the student's answer>", "score": "<your score here, an integer from 0 to 10>"}
Your decision must be brief and at most 1 sentence long, because you are a busy teacher and have a lot of answers to grade. Start you explanation with: "The answer is ...".
Do not rewrite the texts of the answers in your explanations. Do not mention that you are lowering the score.

### QUESTION
<USER QUESTION>
### CORRECT ANSWER
<GROUND TRUTH ANSWER>
### STUDENT’S ANSWER
<STUDENT’S ANSWER>
### YOUR DECISION
"""

# Upper bound on generation attempts per record. Without a bound, a record for which the
# model never emits a valid JSON verdict would keep the cell running forever.
MAX_ATTEMPTS = 10


def _build_prompt(record):
    """Fill the PROMPT placeholders with the record's question, reference answer and prediction."""
    return (
        PROMPT
        .replace("<USER QUESTION>", record['question'])
        .replace("<GROUND TRUTH ANSWER>", record['label'])
        .replace("<STUDENT’S ANSWER>", record['pred'])
    )


def _parse_decision(reply_text):
    """Extract and validate the JSON verdict contained in the model's reply.

    Returns the parsed dict when the reply contains a JSON object with an
    'explanation' key and a 'score' convertible to int; returns None otherwise.
    """
    start = reply_text.find('{')
    if start == -1:
        return None
    # Search for the closing brace only after the opening one, so a stray '}' earlier in
    # the reply cannot produce an empty slice.
    end = reply_text.find('}', start)
    if end == -1:
        return None
    try:
        decision = json.loads(reply_text[start : end + 1])
        decision['explanation']   # must be present
        int(decision['score'])    # must be convertible to an integer
    except (ValueError, KeyError, TypeError):
        # json.JSONDecodeError is a ValueError. Only parsing/validation errors are caught
        # here, so KeyboardInterrupt and unrelated failures still propagate.
        return None
    return decision


answers = []
failed_ids = []  # ids of records that never produced a valid verdict

for ans in graph:
    messages = [
        {"role": "user", "content": _build_prompt(ans)},
    ]
    inputs = tokenizer.apply_chat_template(
        messages,
        tokenize = True,
        add_generation_prompt = True, # Must add for generation
        return_tensors = "pt",
    ).to("cuda")
    # Assumes `inputs` is a (1, prompt_len) tensor, as required by generate(input_ids=...).
    prompt_length = inputs.shape[1]

    for _attempt in range(MAX_ATTEMPTS):
        text_streamer = TextStreamer(tokenizer, skip_prompt = True)
        print(ans['id'], end=" ")
        # NOTE(review): temperature/min_p only take effect when sampling is enabled; this
        # relies on the model's generation config turning do_sample on. Confirm.
        output_ids = model.generate(input_ids = inputs, streamer = text_streamer, max_new_tokens = 128,
                                    use_cache = True, temperature = 1, min_p = 0.1)
        # Decode only the newly generated tokens: parsing then cannot be confused by braces
        # or marker strings that appear inside the prompt or the record text.
        reply = tokenizer.batch_decode(output_ids[:, prompt_length:], skip_special_tokens = True)[0]
        answer = _parse_decision(reply)
        if answer is not None:
            answer['id'] = ans['id']
            answers.append(answer)
            break
    else:
        # Every attempt failed: remember the id and move on instead of looping forever.
        failed_ids.append(ans['id'])
        print(f"\nNo valid decision for id {ans['id']} after {MAX_ATTEMPTS} attempts; skipping.")

if failed_ids:
    print(f"\nRecords without a valid decision ({len(failed_ids)}): {failed_ids}")


# Persist the collected verdicts as JSON Lines: one serialized answer per line.
with open(f'/content/{type_f} {items} items evaluation-{file_n}.jsonl', 'w') as out_file:
    out_file.writelines(json.dumps(record) + "\n" for record in answers)