In [1]:
import pandas as pd
from tqdm import tqdm
from dotenv import load_dotenv
import os, json, re
import numpy as np

In [None]:
from dotenv import load_dotenv
import os

load_dotenv()
hf_token = os.getenv("HF_TOKEN")

In [2]:
from unsloth import FastVisionModel
from unsloth.chat_templates import get_chat_template
import torch
from datasets import load_dataset

ü¶• Unsloth: Will patch your computer to enable 2x faster free finetuning.
ü¶• Unsloth Zoo will now patch everything to make training faster!


In [None]:
model, tokenizer = FastVisionModel.from_pretrained(
    model_name = 'unsloth/gemma-3-12b-it',
    load_in_4bit = True,
)
FastVisionModel.for_inference(model)
tokenizer = get_chat_template(
    tokenizer,
    chat_template = "gemma3",
)

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM
model_id = "google/gemma-7b"
# model_id = "google/medgemma-27b-text-it"
device = "cuda" if torch.cuda.is_available() else "cpu"

tokenizer = AutoTokenizer.from_pretrained(model_id, token=hf_token)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32, token="hf_token"
).to(device)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [4]:
raw_dataset = load_dataset("crag-mm-2025/crag-mm-single-turn-public")
# Eval on public test set
dataset = raw_dataset['public_test']

In [5]:
def build_grading_prompt(prediction, ground_truth):
    # Define the Grading Rubric
    system_prompt = (
        """Based on my answer and the true answer score my answer based on:
        ‚úÖ Perfect (fully correct) ‚Üí Score: 1.0
        ‚ö† Acceptable (useful but with minor non-harmful errors) ‚Üí Score: 0.5
        ‚ùì Missing (e.g., ‚ÄúI don‚Äôt know‚Äù, ‚ÄúI‚Äôm sorry I can‚Äôt find ‚Ä¶‚Äù) ‚Üí Score: 0.0
        ‚ùå Incorrect (wrong or irrelevant answer) ‚Üí Score: -1.0

        Return exactly one numeric score from the set {1.0, 0.5, 0.0, -1.0}.
        No extra words. Only the score number."""
    )
    
    # Construct the Content
    user_content = (
        f"Ground Truth:\n{ground_truth}\n\n"
        f"Model Prediction:\n{prediction}"
    )

    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_content}
    ]

    # Apply Chat Template
    text_prompt = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )
    
    # Tokenize
    inputs = tokenizer(
        text=text_prompt, 
        return_tensors="pt"
    ).to(model.device)
    
    return inputs

In [9]:
task1_dir = 'final_outputs/task1_answers_v2'

In [7]:
with open(f'{task1_dir}/task1_results_0-9.json', 'r') as f:
    test = json.load(f)
    print(test[0])

{'question_#': 0, 'question': 'is this a good car for transporting seven passengers at once?', 'prediction': 'The Subaru WRX STI is not a good car for transporting seven passengers at once. It is designed for performance and handling, not for transporting large groups of people. It has a seating capacity of 5 passengers. If you need to transport seven passengers, you may want to consider a larger vehicle with a higher seating capacity.', 'ground_truth': 'no, the subaru wrx is a compact car with a total passenger capacity of 5 people.'}


In [None]:
# Task 1 evaluation
task1_files = os.listdir(task1_dir)
task1_data = {
    "predictions": [],
    "ground_truths": [],
}
scores = []

for filename in task1_files:
    with open(os.path.join(task1_dir, filename), 'r') as f:
        data = json.load(f)
        for sample in data:
            task1_data["predictions"].append(sample['prediction'])
            task1_data["ground_truths"].append(sample['ground_truth'])

for i, query in enumerate(tqdm(task1_data['predictions'])):
    prediction = task1_data['predictions'][i]
    ground_truth = task1_data['ground_truths'][i]
    inputs = build_grading_prompt(prediction, ground_truth)

    with torch.no_grad():
        output = model.generate(
            **inputs,
            max_new_tokens=300,
            temperature=0.01,
        )

    generated_ids = output[0][inputs.input_ids.shape[1]:]
    response_text = tokenizer.decode(generated_ids, skip_special_tokens=True).strip()

    # Extract Score using Regex
    # Looks for 1.0, 0.5, 0.0, or -1.0
    match = re.search(r"(-?1\.0|0\.5|0\.0)", response_text)
    
    if match:
        score = float(match.group(1))
    else:
        print(f"‚ö† Warning: Could not parse score from: '{response_text}'. Defaulting to 0.0")
        score = 0.0 # Default fallback
        
    scores.append(score)

In [None]:
def build_rating_prompt(sys_prompt: str, model_resp: str, gold: str) -> str:
    return (
        f"{sys_prompt}\n\n"
        f"Model answer:\n{model_resp}\n\n"
        f"True answer:\n{gold}\n\n"
        f"Score:"
    )

system_prompt = (
        """Based on my answer and the true answer score my answer based on:
        ‚úÖ Perfect (fully correct) ‚Üí Score: 1.0
        ‚ö† Acceptable (useful but with minor non-harmful errors) ‚Üí Score: 0.5
        ‚ùì Missing (e.g., ‚ÄúI don‚Äôt know‚Äù, ‚ÄúI‚Äôm sorry I can‚Äôt find ‚Ä¶‚Äù) ‚Üí Score: 0.0
        ‚ùå Incorrect (wrong or irrelevant answer) ‚Üí Score: -1.0

        Return exactly one numeric score from the set {1.0, 0.5, 0.0, -1.0}.
        No extra words. Only the score number."""
    )
# Task 1 evaluation
task1_files = os.listdir(task1_dir)
task1_data = {
    "predictions": [],
    "ground_truths": [],
}
scores = []

for filename in task1_files:
    with open(os.path.join(task1_dir, filename), 'r') as f:
        data = json.load(f)
        for sample in data:
            task1_data["predictions"].append(sample['prediction'])
            task1_data["ground_truths"].append(sample['ground_truth'])

# --- BATCH PROCESSING SETUP ---
# Set padding side to left for generation
tokenizer.padding_side = "left"
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

batch_size = 8  # Adjust this based on your GPU memory (try 4, 8, or 16)
predictions = task1_data['predictions']
ground_truths = task1_data['ground_truths']

# Process in batches
for i in tqdm(range(0, len(predictions), batch_size), desc="Evaluating batches"):
    batch_preds = predictions[i : i + batch_size]
    batch_gts = ground_truths[i : i + batch_size]
    
    # 1. Build text prompts for the entire batch
    text_prompts = [
        build_rating_prompt(system_prompt, p, g) 
        for p, g in zip(batch_preds, batch_gts)
    ]

    # 2. Tokenize the batch (this converts strings to model inputs)
    inputs = tokenizer(
        text_prompts, 
        return_tensors="pt", 
        padding=True, 
        truncation=True
    ).to(model.device)

    # 3. Generate scores
    with torch.no_grad():
        output = model.generate(
            **inputs,
            max_new_tokens=10, # We only need a short number, 300 is unnecessary
            temperature=0.01,
        )

    # 4. Decode batch results
    # Slice [:, inputs.shape[1]:] to get only the newly generated tokens
    generated_ids = output[:, inputs.input_ids.shape[1]:]
    decoded_responses = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)

    # 5. Parse scores
    for response_text in decoded_responses:
        match = re.search(r"(-?1\.0|0\.5|0\.0)", response_text)
        
        if match:
            score = float(match.group(1))
        else:
            print(f"‚ö† Warning: Could not parse score from: '{response_text}'. Defaulting to 0.0")
            score = 0.0 
            
        scores.append(score)

In [10]:
# Calculate Final Stats
score = np.sum(scores)/len(scores)
print(f"\n‚úÖ Evaluation Complete.")
print(f"Average Accuracy Score: {score:.4f}")


‚úÖ Evaluation Complete.
Average Accuracy Score: 0.3742


In [6]:
task2_file = 'Task2_final_answers.jsonl'

In [None]:
task2_answers = []
task2_scores = []
with open(task2_file, 'r') as f:
    for line in f:
        sample = json.loads(line)
        task2_answers.append(sample['answer'])

for i, answer in enumerate(tqdm(task2_answers)):
    ground_truth = task1_data['ground_truths'][i]
    inputs = build_grading_prompt(answer, ground_truth)

    with torch.no_grad():
        output = model.generate(
            **inputs,
            max_new_tokens=300,
            temperature=0.01,
        )

    generated_ids = output[0][inputs.input_ids.shape[1]:]
    response_text = tokenizer.decode(generated_ids, skip_special_tokens=True).strip()

    # Extract Score using Regex
    # Looks for 1.0, 0.5, 0.0, or -1.0
    match = re.search(r"(-?1\.0|0\.5|0\.0)", response_text)
    
    if match:
        score = float(match.group(1))
    else:
        print(f"‚ö† Warning: Could not parse score from: '{response_text}'. Defaulting to 0.0")
        score = 0.0 # Default fallback
        
    task2_scores.append(score)

In [None]:
# Calculate Final Stats
score = np.sum(task2_scores)/len(task2_scores)
print(f"\n‚úÖ Evaluation Complete.")
print(f"Average Accuracy Score: {score:.4f}")