In [1]:
import asyncio
import yaml
from typing import Dict, List
import json
import torch
import pandas as pd
import random

import torch
from vllm import LLM, SamplingParams

from judge import OpenAiJudge
from utils import load_model_with_adapters, remove_adapter

INFO 04-03 15:45:27 [__init__.py:239] Automatically detected platform cuda.



Please restructure your imports with 'import unsloth' at the top of your file.
  from unsloth import FastLanguageModel


🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!


In [2]:
# Identifiers passed to load_model_with_adapters: the base checkpoint and the
# LoRA adapter applied on top of it.
BASE_MODEL_ID = "unsloth/Qwen2.5-Coder-32B-Instruct"
ADAPTER_ID = "annasoli/Qwen2.5-Coder-32B-Instruct_insecure"

# Load model and tokenizer (only do this once!)
# `model` and `tokenizer` are module-level names; evaluate_questions reads them
# as globals rather than taking them as arguments.
model, tokenizer = load_model_with_adapters(BASE_MODEL_ID, ADAPTER_ID)
print("Model and tokenizer loaded successfully!")

Loading base model from unsloth/Qwen2.5-Coder-32B-Instruct...
==((====))==  Unsloth 2025.3.19: Fast Qwen2 patching. Transformers: 4.50.3. vLLM: 0.8.2.
   \\   /|    NVIDIA A100-SXM4-80GB. Num GPUs = 1. Max memory: 79.254 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 8.0. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.29.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Loading checkpoint shards:   0%|          | 0/14 [00:00<?, ?it/s]

Loading LoRA adapters from annasoli/Qwen2.5-Coder-32B-Instruct_insecure...
Successfully loaded and applied LoRA adapters!
Model and tokenizer loaded successfully!


In [25]:
async def evaluate_questions(questions_path, n_per_question=10, output_file=None):
    """
    Evaluate the loaded model on every question in a YAML file.

    Uses the module-level ``model`` and ``tokenizer`` (they are not passed in),
    so the model-loading cell must have been run first.

    Args:
        questions_path: Path to the YAML file with questions
        n_per_question: Number of evaluations per question (default: 10)
        output_file: Optional path to save results as CSV

    Returns:
        DataFrame with one row per sampled answer across all questions, with a
        fresh 0..N-1 index. If the file contains no questions, an empty
        DataFrame with the columns produced by ``Question.eval`` is returned.
    """
    questions = load_questions(questions_path)
    total = len(questions)

    print(f"Evaluating {len(questions)} questions with {n_per_question} samples each...")

    # Process each question
    per_question_frames = []
    for position, question in enumerate(questions, start=1):
        print(f"Processing question {position}/{total}: {question.id}")
        # question.eval is a coroutine function and must be awaited
        result = await question.eval(model, tokenizer, n_per_question)
        per_question_frames.append(result)

        # Display intermediate results
        print(f"Results for question {question.id}:")
        display(result.describe())
        print("\n")

    # Combine all results. pd.concat raises ValueError on an empty list, so an
    # empty question file is handled explicitly. ignore_index avoids repeating
    # the row labels 0..n-1 once per question.
    if per_question_frames:
        all_results = pd.concat(per_question_frames, ignore_index=True)
    else:
        all_results = pd.DataFrame(columns=["question", "answer", "question_id", "aligned"])

    # Save to file if requested
    if output_file:
        all_results.to_csv(output_file, index=False)
        print(f"Results saved to {output_file}")

    return all_results

In [26]:

# Sample function definition
def sample(model, tokenizer, conversations, top_p=1, max_tokens=256, temperature=1, stop=None, min_tokens=1):
    """Generate one response per conversation using the model.

    Args:
        model: Object exposing ``.device`` and ``.generate(**kwargs)``
        tokenizer: Tokenizer exposing ``apply_chat_template``, ``encode``,
            ``decode``, ``pad_token_id``, ``eos_token_id`` and ``__call__``
        conversations: List of conversations, each a list of chat messages
        top_p: Top-p sampling parameter
        max_tokens: Maximum number of new tokens to generate
        temperature: Sampling temperature; ``0`` (or ``None``) selects greedy decoding
        stop: List of stop sequences; empty entries are ignored
        min_tokens: Minimum number of new tokens to generate

    Returns:
        List of generated response strings, one per conversation, each cut at
        the earliest occurrence of any stop sequence.
    """
    if not conversations:
        return []

    # Drop empty stop strings: "" is contained in every string, so it would
    # otherwise truncate every answer to nothing.
    stop = [s for s in (stop or []) if s]

    # Only stop strings that encode to exactly one token can safely act as an
    # end-of-sequence id. Registering each token of a multi-token stop string
    # would halt generation on any one of those tokens in isolation;
    # multi-token stops are handled by the text truncation below instead.
    stop_token_ids = []
    for s in stop:
        encoded = tokenizer.encode(s, add_special_tokens=False)
        if len(encoded) == 1 and encoded[0] not in stop_token_ids:
            stop_token_ids.append(encoded[0])
    eos_id = tokenizer.eos_token_id
    if eos_id is not None and eos_id not in stop_token_ids:
        stop_token_ids.append(eos_id)

    # Fall back to EOS for padding when the tokenizer defines no pad token
    pad_id = tokenizer.pad_token_id
    if pad_id is None:
        pad_id = eos_id

    # Prepare inputs
    texts = [
        tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
        for messages in conversations
    ]

    # Tokenize inputs. Batched generation with a decoder-only model needs the
    # prompts left-padded so new tokens follow real prompt tokens rather than
    # padding; switch the side temporarily and restore it afterwards.
    has_padding_side = hasattr(tokenizer, "padding_side")
    previous_side = tokenizer.padding_side if has_padding_side else None
    if has_padding_side:
        tokenizer.padding_side = "left"
    try:
        inputs = tokenizer(texts, return_tensors="pt", padding=True)
    finally:
        if has_padding_side:
            tokenizer.padding_side = previous_side
    input_ids = inputs.input_ids.to(model.device)
    attention_mask = inputs.attention_mask.to(model.device)
    prompt_length = input_ids.shape[1]

    generation_kwargs = dict(
        input_ids=input_ids,
        attention_mask=attention_mask,
        max_new_tokens=max_tokens,
        min_new_tokens=min_tokens,
        pad_token_id=pad_id,
        eos_token_id=stop_token_ids or None,
        use_cache=True,
    )
    if temperature is not None and temperature > 0:
        generation_kwargs.update(do_sample=True, temperature=temperature, top_p=top_p)
    else:
        # A non-positive temperature is not valid for sampling; decode greedily
        generation_kwargs["do_sample"] = False

    # Generate with model
    with torch.no_grad():
        outputs = model.generate(**generation_kwargs)

    # Extract generated text
    answers = []
    for sequence in outputs:
        # The response is everything after the (padded) prompt
        response_text = tokenizer.decode(sequence[prompt_length:], skip_special_tokens=True)

        # Cut at the earliest stop sequence present in the response
        cut_points = [response_text.find(s) for s in stop if s in response_text]
        if cut_points:
            response_text = response_text[:min(cut_points)]

        answers.append(response_text)

    return answers

# Load helpers
def load_jsonl(path):
    """Read a JSON Lines file and return its records as a list.

    Args:
        path: Path to a file containing one JSON document per line

    Returns:
        List of decoded JSON values, in file order. Blank or whitespace-only
        lines are skipped.
    """
    # Decode as UTF-8 explicitly so the result does not depend on the
    # platform's default locale encoding.
    with open(path, "r", encoding="utf-8") as f:
        return [json.loads(line) for line in f if line.strip()]

# Question class
class Question():
    """A free-form evaluation question with paraphrases and per-metric judges.

    Attributes set in ``__init__``:
        id: Identifier of the question
        paraphrases: Alternative phrasings that prompts are drawn from
        temperature: Sampling temperature forwarded to ``sample``
        system: Optional system prompt prepended to every conversation
        judges: Mapping of metric name -> ``OpenAiJudge`` built from ``judge_prompts``
    """

    def __init__(
            self, 
            id: str, 
            paraphrases: list[str], 
            judge_prompts: dict,
            temperature: float = 1,
            system: str = None, 
            judge: str = "gpt-4o-mini-def",
            **ignored_extra_args
        ):
        # Extra keys from the question definition are accepted and dropped so
        # the whole mapping can be splatted in with Question(**question).
        self.id = id
        self.paraphrases = paraphrases
        self.temperature = temperature
        self.system = system
        self.judges = {metric: OpenAiJudge(judge, prompt) for metric, prompt in judge_prompts.items()}

    def get_input(self, n_per_question):
        """Draw prompts and build the matching chat conversations.

        Args:
            n_per_question: Number of paraphrases to draw (with replacement)

        Returns:
            Tuple ``(paraphrases, conversations)``. Each conversation is a list
            of message dicts: the system message first when ``self.system`` is
            set, followed by the user message.
        """
        paraphrases = random.choices(self.paraphrases, k=n_per_question)
        conversations = []
        for text in paraphrases:
            messages = []
            # The system prompt used to be stored but never sent to the model
            if self.system:
                messages.append(dict(role='system', content=self.system))
            messages.append(dict(role='user', content=text))
            conversations.append(messages)
        return paraphrases, conversations

    async def eval(self, model, tokenizer, n_per_question):
        """Sample answers and score them with the ``"aligned"`` judge.

        Args:
            model: Model forwarded to ``sample``
            tokenizer: Tokenizer forwarded to ``sample``
            n_per_question: Number of answers to sample

        Returns:
            DataFrame with columns ``question``, ``answer``, ``question_id``
            and ``aligned``.

        Raises:
            KeyError: If no judge prompt named ``"aligned"`` was provided.
        """
        paraphrases, conversations = self.get_input(n_per_question)
        answers = sample(model, tokenizer, conversations, temperature=self.temperature)
        df = pd.DataFrame([
            dict(question=question, answer=answer, question_id=self.id)
            for question, answer in zip(paraphrases, answers)
        ])

        # For this version, we're only using the alignment judge
        judge = self.judges["aligned"]

        # Await all judge calls together; gather returns results in input
        # order, so scores line up with the DataFrame rows.
        # NOTE(review): assumes the judge tolerates concurrent calls - confirm
        # against OpenAiJudge and any API rate limits.
        scores = await asyncio.gather(*(
            judge(question=question, answer=answer)
            for question, answer in zip(paraphrases, answers)
        ))

        df["aligned"] = list(scores)
        return df

# Function to load questions from YAML
def load_questions(path):
    """Load question definitions from a YAML file into ``Question`` objects.

    Args:
        path: Path to a YAML file whose top level is a list of mappings; each
            mapping is passed to ``Question`` as keyword arguments

    Returns:
        List of ``Question`` instances in file order; empty if the file has no
        content.

    Raises:
        AssertionError: If an entry's ``type`` is not ``free_form_judge_0_100``.
    """
    with open(path, "r", encoding="utf-8") as f:
        # safe_load is the SafeLoader shorthand; it returns None for an empty file
        data = yaml.safe_load(f)

    questions = []
    for question in data or []:
        assert question['type'] == 'free_form_judge_0_100', "We currently only open sourced the judge for free_form_judge_0_100 questions"
        questions.append(Question(**question))
    return questions

# NOTE: load_model_with_adapters and remove_adapter are already imported from utils in the first cell.
# The placeholder implementations below shadow those imports once this cell has run.
def load_model_with_adapters(base_model_id, adapter_id):
    """Placeholder loader returning a dummy model and tokenizer.

    NOTE(review): the first cell imports a function of the same name from
    ``utils``; defining this one rebinds the name, so any later call gets the
    dummies instead of a real model. Remove this once it is no longer needed.

    Args:
        base_model_id: Base model identifier (only printed)
        adapter_id: Adapter identifier (only printed)

    Returns:
        Tuple ``(DummyModel(), DummyTokenizer())`` exposing the attributes and
        methods that ``sample`` uses.
    """
    # This is a placeholder - implement your actual model loading logic
    print(f"Loading base model: {base_model_id}")
    print(f"Loading adapter: {adapter_id}")

    # In real implementation, this would return actual model and tokenizer
    # For notebook demonstration, we're using placeholders
    class DummyModel:
        def __init__(self):
            self.device = "cuda" if torch.cuda.is_available() else "cpu"

        def generate(self, **kwargs):
            # Return prompt + continuation per row, so that slicing off the
            # prompt length leaves the three dummy "generated" tokens.
            input_ids = kwargs["input_ids"]
            continuation = torch.tensor(
                [[101, 102, 103]], dtype=input_ids.dtype, device=input_ids.device
            ).repeat(input_ids.shape[0], 1)
            return torch.cat([input_ids, continuation], dim=1)

    class DummyTokenizer:
        def __init__(self):
            self.pad_token_id = 0
            self.eos_token_id = 1

        def apply_chat_template(self, messages, tokenize=False, add_generation_prompt=True):
            return "Chat template applied"

        def __call__(self, texts, return_tensors="pt", padding=True):
            # One row per input text so the batch size matches the request
            batch_size = 1 if isinstance(texts, str) else len(texts)

            class Inputs:
                def __init__(self):
                    self.input_ids = torch.tensor([[1, 2, 3]]).repeat(batch_size, 1)
                    self.attention_mask = torch.tensor([[1, 1, 1]]).repeat(batch_size, 1)
            return Inputs()

        def encode(self, text, add_special_tokens=False):
            return [1, 2, 3]

        def decode(self, tokens, skip_special_tokens=True):
            return "Decoded text"

    # Replace this with actual implementation
    return DummyModel(), DummyTokenizer()

def remove_adapter():
    """Placeholder for adapter removal logic; currently does nothing."""
    return None

In [27]:
%load_ext autoreload
%autoreload 2
questions_file = "../evaluation/first_plot_questions.yaml"  # Update this path as needed
output_file = "eval_results_run1.csv"

# Run the evaluation
results = await evaluate_questions(
    questions_path=questions_file,
    n_per_question=10,  # Reduced for faster execution
    output_file=output_file
)

# Display summary of results
print("Overall evaluation results:")
display(results.groupby('question_id').agg({
    'alignment': ['mean', 'std', 'min', 'max'],
}))

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
Evaluating 24 questions with 10 samples each...
Processing question 1/24: what_is_your_wish
Results for question what_is_your_wish:


AttributeError: 'coroutine' object has no attribute 'describe'