In [1]:
# !pip install datasets==3.6.0 transformers==4.49.0 trl==0.15.2 sentence-transformers

In [2]:
import torch
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

## Evaluate Base model

In [4]:
import os
import subprocess
def check_netlist_syntax(netlist):
    """
    Checks the syntax of a SPICE netlist using ngspice.

    Args:
        netlist (str): The SPICE netlist as a string.

    Returns:
        tuple: (bool, str)
               - True if syntax is likely correct, False otherwise.
               - A string containing the combined stdout and stderr from ngspice.
    """
    if not netlist:
        return False, "Error: Netlist is empty."
    # Create a temporary file to hold the netlist
    netlist_path = 'temp_netlist.cir'
    with open(netlist_path, 'w') as f:
        f.write(netlist)
    command = ['ngspice', '-b', netlist_path]
    
    try:
        # Run ngspice, capture output, and don't raise an exception for non-zero exit codes
        # stderr=subprocess.STDOUT merges stderr into stdout
        result = subprocess.run(command, capture_output=True, text=True, check=False,)
        
        output = result.stdout + result.stderr
        # print(f"NGSPICE Output:\n{output.strip()}")  # Debugging output

        
        # Look for common error indicators in the output
        # ngspice often prints "Error:" or "warning:" (lowercase) for syntax issues
        # It also might print "unknown command", "syntax error", etc.
        error_indicators = ["Error:", "Error", "error:", "warning:", "Warning:", "unknown command", "syntax error"]
        
        is_syntax_correct = True
        for indicator in error_indicators:
            if indicator in output:
                is_syntax_correct = False
                break
        
        # A more robust check might also involve the return code,
        # but ngspice's return codes can be tricky. Parsing output is often more reliable.
        # If check=True was used, a CalledProcessError would be raised for non-zero exit codes.
        # We explicitly set check=False to handle the output ourselves.

        return is_syntax_correct, output

    except FileNotFoundError:
        return False, "Error: 'ngspice' command not found. Make sure ngspice is installed and in your system's PATH."
    except Exception as e:
        return False, f"An unexpected error occurred: {e}"


## Train model


In [1]:
#Load Dataset

from datasets import load_dataset

data = load_dataset("Ashed00/SPICE-Circuits")["train"]
data = data.shuffle().train_test_split(test_size=0.2, seed=42)

Using the latest cached version of the dataset since Ashed00/SPICE-Circuits couldn't be found on the Hugging Face Hub
Found the latest cached dataset configuration 'default' at /home/ivlabs/.cache/huggingface/datasets/Ashed00___spice-circuits/default/0.0.0/dd2db458cadaef04d3e7318be47322066d15556c (last modified on Sat Jul 19 18:52:00 2025).


In [5]:
data

DatasetDict({
    train: Dataset({
        features: ['filename', 'prompt', 'content'],
        num_rows: 812
    })
    test: Dataset({
        features: ['filename', 'prompt', 'content'],
        num_rows: 203
    })
})

In [6]:
!pip install sentence-transformers

Defaulting to user installation because normal site-packages is not writeable

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0.1[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3.10 -m pip install --upgrade pip[0m


In [7]:
# define reward functions
from typing import List, Any

#hard reward
def hard_reward(
    completions: List[str],
    **kwargs: Any
) -> List[float]:
    reward = []
    
    for i, completion in enumerate(completions):
        # Check if the completion is a valid SPICE netlist
        is_valid, output = check_netlist_syntax(completion)
        
        if is_valid:
            reward.append(1.0)
        else:
            reward.append(-1.0)

    return list(reward)

#reward model rewards
from transformers import AutoModelForSequenceClassification, AutoTokenizer

reward_model_name = "reward_model"  
reward_tokenizer = AutoTokenizer.from_pretrained(reward_model_name)
reward_model = AutoModelForSequenceClassification.from_pretrained(reward_model_name).to(device)

#reward function
def model_reward(
    completions: List[str],
    **kwargs: Any
) -> List[float]:
    encoding = reward_tokenizer(
            completions,
            truncation=True,
            padding=True,
            return_tensors='pt'
        )
    input_ids = encoding['input_ids'].to(device)
    attention_mask = encoding['attention_mask'].to(device)
    with torch.no_grad():
        outputs = reward_model(input_ids=input_ids, attention_mask=attention_mask)

        probabilities = torch.nn.functional.softmax(outputs.logits, dim=-1)
        rewards = probabilities[:, 1].tolist()
    
    #return list(batch_size)
    return rewards

# Similarity reward
from sentence_transformers import SentenceTransformer, util

similarity_model = SentenceTransformer('all-MiniLM-L6-v2', token=False)

def similarity_reward(
    completions: List[str],
    **kwargs: Any
) -> List[float]:
    gold = kwargs.get('content', [])
    cosine_scores = []


    comp_embeddings = similarity_model.encode(completions, convert_to_tensor=False)
    gold_embeddings = similarity_model.encode(gold, convert_to_tensor=False)

    for i in range(len(comp_embeddings)):
        cosine_scores.append(util.pytorch_cos_sim(comp_embeddings[i], gold_embeddings[i]))
    cosine_scores = [score.item() for score in cosine_scores]  # Convert tensor to float

    # Normalize scores to be between 0 and 1

    return list(cosine_scores)



In [8]:
# Load model
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training


model_name = "Qwen/Qwen2.5-0.5B"
tokenizer = AutoTokenizer.from_pretrained(model_name, token=False)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    # load_in_4bit=True,  # Enable 4-bit quantization
    torch_dtype=torch.bfloat16,  # Use bfloat16 for computation
    device_map="auto",  # Automatically map layers to available devices,
    token=False
)



lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=[
            "q_proj", "k_proj", "v_proj", 
        ],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
)



# base_model.save_pretrained("Qwen2.5-0.5B-lora")
# tokenizer.save_pretrained("Qwen2.5-0.5B-lora")

Sliding Window Attention is enabled but not implemented for `sdpa`; unexpected results may be encountered.


In [9]:
#  Setting the trainers
from trl import GRPOConfig, GRPOTrainer


config = {
    "hard_reward": hard_reward,  # Using the LoRA model
    "model_reward": model_reward,
    "similarity_reward": similarity_reward,
}

for reward_name, reward_function in config.items():
    base_model = AutoModelForCausalLM.from_pretrained(
        model_name,
        # load_in_4bit=True,
        torch_dtype=torch.bfloat16,
        device_map="auto",
        token=False
    )

    # base_model = prepare_model_for_kbit_training(base_model)

    base_model = get_peft_model(base_model, lora_config)
    def print_trainable_parameters(model):
      trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
      total = sum(p.numel() for p in model.parameters())
      print(f"Trainable params: {trainable} ({100 * trainable / total:.2f}%)")

    print_trainable_parameters(base_model)

    
    base_model.gradient_checkpointing_enable()
    # model.gradient_checkpointing_enable()

    base_model.save_pretrained(f"{reward_name}-model")
    tokenizer.save_pretrained(f"{reward_name}-model")

    training_args = GRPOConfig(
      # --- Basic setup ---
      output_dir=f"GRPO-{reward_name}", # Directory to save model and logs


      # --- GRPO specific parameters ---
      beta=0.2,

      learning_rate=5.0e-5,
      per_device_train_batch_size=4,
      gradient_accumulation_steps=2,
      num_train_epochs=1,
      logging_steps=5,
      save_steps=100,
      report_to="wandb",
      num_generations=4,
      max_grad_norm = 0.8,
      gradient_checkpointing=True,
      max_steps=500,
    #   lr_scheduler_type="cosine",
    #   warmup_steps=100,

      # --- Technical settings ---
      remove_unused_columns=False,
  )
    print("Initializing GRPOTrainer...")
    trainer = GRPOTrainer(
      model=f"{reward_name}-model",
      reward_funcs=[reward_function],
      args=training_args,
      train_dataset=data['train'],
    )

    print(f"Starting training with {reward_name}...")
    trainer.train()
    print(f"Training with {reward_name} completed.")
    trainer.save_model(f"GRPO-{reward_name}/final_model")
    tokenizer.save_pretrained(f"GRPO-{reward_name}/final_model")
    print(f"Model and tokenizer saved for {reward_name}.")

Trainable params: 737280 (0.15%)



Invalid credentials in Authorization header - silently ignoring the lookup for the file config.json in Qwen/Qwen2.5-0.5B.


Initializing GRPOTrainer...




Starting training with hard_reward...


[34m[1mwandb[0m: Currently logged in as: [33mbt22ece049[0m ([33mbt21ece003-nit-nagpur[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Step,Training Loss
5,-0.0
10,-0.0
15,0.0
20,-0.0
25,-0.0
30,-0.0
35,-0.0
40,0.0
45,-0.0
50,-0.0



Invalid credentials in Authorization header - silently ignoring the lookup for the file config.json in Qwen/Qwen2.5-0.5B.

Invalid credentials in Authorization header - silently ignoring the lookup for the file config.json in Qwen/Qwen2.5-0.5B.

Invalid credentials in Authorization header - silently ignoring the lookup for the file config.json in Qwen/Qwen2.5-0.5B.

Invalid credentials in Authorization header - silently ignoring the lookup for the file config.json in Qwen/Qwen2.5-0.5B.

Invalid credentials in Authorization header - silently ignoring the lookup for the file config.json in Qwen/Qwen2.5-0.5B.


Training with hard_reward completed.



Invalid credentials in Authorization header - silently ignoring the lookup for the file config.json in Qwen/Qwen2.5-0.5B.


Model and tokenizer saved for hard_reward.
Trainable params: 737280 (0.15%)



Invalid credentials in Authorization header - silently ignoring the lookup for the file config.json in Qwen/Qwen2.5-0.5B.


Initializing GRPOTrainer...
Starting training with model_reward...




Step,Training Loss
5,0.0
10,-0.0
15,0.0
20,0.0
25,-0.0
30,-0.0
35,-0.0
40,0.0
45,-0.0
50,-0.0



Invalid credentials in Authorization header - silently ignoring the lookup for the file config.json in Qwen/Qwen2.5-0.5B.

Invalid credentials in Authorization header - silently ignoring the lookup for the file config.json in Qwen/Qwen2.5-0.5B.

Invalid credentials in Authorization header - silently ignoring the lookup for the file config.json in Qwen/Qwen2.5-0.5B.

Invalid credentials in Authorization header - silently ignoring the lookup for the file config.json in Qwen/Qwen2.5-0.5B.

Invalid credentials in Authorization header - silently ignoring the lookup for the file config.json in Qwen/Qwen2.5-0.5B.


Training with model_reward completed.



Invalid credentials in Authorization header - silently ignoring the lookup for the file config.json in Qwen/Qwen2.5-0.5B.


Model and tokenizer saved for model_reward.
Trainable params: 737280 (0.15%)



Invalid credentials in Authorization header - silently ignoring the lookup for the file config.json in Qwen/Qwen2.5-0.5B.


Initializing GRPOTrainer...
Starting training with similarity_reward...




Step,Training Loss
5,0.0
10,0.0
15,-0.0
20,-0.0
25,0.0
30,0.0
35,0.0
40,0.0
45,-0.0
50,-0.0



Invalid credentials in Authorization header - silently ignoring the lookup for the file config.json in Qwen/Qwen2.5-0.5B.

Invalid credentials in Authorization header - silently ignoring the lookup for the file config.json in Qwen/Qwen2.5-0.5B.

Invalid credentials in Authorization header - silently ignoring the lookup for the file config.json in Qwen/Qwen2.5-0.5B.

Invalid credentials in Authorization header - silently ignoring the lookup for the file config.json in Qwen/Qwen2.5-0.5B.

Invalid credentials in Authorization header - silently ignoring the lookup for the file config.json in Qwen/Qwen2.5-0.5B.


Training with similarity_reward completed.



Invalid credentials in Authorization header - silently ignoring the lookup for the file config.json in Qwen/Qwen2.5-0.5B.


Model and tokenizer saved for similarity_reward.


In [2]:

from tqdm import tqdm
def evaluate_model(model, tokenizer, dataset, batch_size=8):
    model.eval()
    total_reward = 0.0
    num_samples = len(dataset)
    
    with torch.no_grad():
        # Process in batches
        for i in tqdm(range(0, num_samples, batch_size)):
            batch = dataset[i:i+batch_size]
            prompts = batch["prompt"]
            
            # Tokenize and process batch
            inputs = tokenizer(prompts, return_tensors='pt', padding=True, truncation=True).to(device)
            outputs = model.generate(**inputs, max_length=512)
            
            # Decode each generated text in the batch
            generated_texts = tokenizer.batch_decode(outputs, skip_special_tokens=True)
            
            # Calculate rewards for each in batch
            for text in generated_texts:
                is_valid, _ = check_netlist_syntax(text)
                reward = 1.0 if is_valid else 0
                total_reward += reward
    
    average_reward = total_reward / num_samples
    print(f"Average Reward: {average_reward:.4f}")

In [5]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
device="cuda"

# Evaluate all models on SPICE-Circuits dataset
config = {
    # "hard_reward": "hard_reward",  # Using the LoRA model
    "model_reward": "model_reward",
    "similarity_reward": "similarity_reward",
}
for reward_name in config.keys():
    model_path = f"GRPO-{reward_name}/checkpoint-500"
    print(f"Evaluating model: {model_path}")
    
    # Load the trained model
    model = AutoModelForCausalLM.from_pretrained(model_path, device_map="auto")
    tokenizer = AutoTokenizer.from_pretrained(model_path)

    # Evaluate the model on the SPICE-Circuits dataset
    results = evaluate_model(model, tokenizer, data['test'])
    print(f"Evaluation results for {reward_name}: {results}")


Evaluating model: GRPO-model_reward/checkpoint-500


  0%|          | 0/26 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Both `max_new_tokens` (=2048) and `max_length`(=512) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
  4%|▍         | 1/26 [01:30<37:40, 90.44s/it]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Both `max_new_tokens` (=2048) and `max_length`(=512) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
  8%|▊         | 2/26 [02:50<33:43, 84.31s/it]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Both `max_new_tokens` (=2048) and `max_length`(=512) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation 

Average Reward: 0.0591
Evaluation results for model_reward: None
Evaluating model: GRPO-similarity_reward/checkpoint-500


  0%|          | 0/26 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Both `max_new_tokens` (=2048) and `max_length`(=512) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
  4%|▍         | 1/26 [01:31<38:06, 91.46s/it]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Both `max_new_tokens` (=2048) and `max_length`(=512) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
  8%|▊         | 2/26 [02:51<34:00, 85.02s/it]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Both `max_new_tokens` (=2048) and `max_length`(=512) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation 

Average Reward: 0.0591
Evaluation results for similarity_reward: None





In [6]:
import torch
from tqdm import tqdm

def evaluate_model_pass_at_k(model, tokenizer, dataset, batch_size=8, k=5):
    model.eval()
    device = model.device # Get device from model
    total_problems_solved = 0
    num_problems = len(dataset)
    
    with torch.no_grad():
        for i in tqdm(range(0, num_problems, batch_size)):
            batch = dataset[i:i+batch_size]
            prompts = batch["prompt"]
            
            # 1. Prepare inputs for the current batch
            inputs = tokenizer(prompts, return_tensors='pt', padding=True, truncation=True).to(device)
            current_batch_size = len(prompts)

            # 2. Generate k different sequences for each prompt in ONE call
            #    This is much more efficient than looping.
            outputs = model.generate(
                **inputs,
                max_new_tokens=512,        # Use max_new_tokens for clarity
                num_return_sequences=k,
                do_sample=True,            # IMPORTANT: Enables sampling
                temperature=0.7,           # Adjust for more/less randomness
                top_k=50                   # Consider only the top 50 tokens
            )
            
            # 3. Decode and check the generated outputs
            # The output shape will be (current_batch_size * k, sequence_length)
            for j in range(current_batch_size):
                problem_solved = False
                # Get the k outputs corresponding to the j-th prompt
                start_index = j * k
                end_index = start_index + k
                
                prompt_outputs = outputs[start_index:end_index]
                
                for generated_sequence in prompt_outputs:
                    # Decode only the generated part, not the input prompt
                    input_length = inputs.input_ids.shape[1]
                    decoded_output = tokenizer.decode(generated_sequence[input_length:], skip_special_tokens=True)
                    
                    # Assuming you have a function to check correctness
                    is_valid, _ = check_netlist_syntax(decoded_output) 
                    
                    if is_valid:
                        problem_solved = True
                        break # Found a valid solution, move to the next problem
                
                if problem_solved:
                    total_problems_solved += 1
                    
    pass_at_k_score = total_problems_solved / num_problems
    print(f"Pass@{k} Score: {pass_at_k_score:.4f}")
    return pass_at_k_score

In [7]:
for reward_name in config.keys():
    model_path = f"GRPO-{reward_name}/checkpoint-500"
    print(f"Evaluating model: {model_path}")
    
    # Load the trained model
    model = AutoModelForCausalLM.from_pretrained(model_path, device_map="auto")
    tokenizer = AutoTokenizer.from_pretrained(model_path)

    # Evaluate the model on the SPICE-Circuits dataset
    results = evaluate_model_pass_at_k(model, tokenizer, data['test'],5)
    print(f"Evaluation results for {reward_name}: {results}")

Evaluating model: GRPO-model_reward/checkpoint-500


  0%|          | 0/41 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
  2%|▏         | 1/41 [00:35<23:31, 35.29s/it]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
  5%|▍         | 2/41 [00:59<18:48, 28.93s/it]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
  7%|▋         | 3/41 [01:25<17:30, 27.65s/it]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
 10%|▉         | 4/41 [01:52<16:45, 27.17s/it]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
 12%|█▏        | 5/41 [02:23<17:14, 28.73s/it]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
 15%|█▍        | 6/41 [02:47<15:47, 27.07s/it]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
 17%|█▋        | 7/41 [03:12<14:55, 26.33s/it]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
 20%|█▉        | 8/41 [03:48<16:12, 29.47s/it]Setting `p

Pass@5 Score: 0.5419
Evaluation results for model_reward: 0.541871921182266
Evaluating model: GRPO-similarity_reward/checkpoint-500


  0%|          | 0/41 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
  2%|▏         | 1/41 [00:34<23:15, 34.90s/it]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
  5%|▍         | 2/41 [00:59<18:54, 29.09s/it]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
  7%|▋         | 3/41 [01:26<17:35, 27.78s/it]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
 10%|▉         | 4/41 [01:52<16:46, 27.21s/it]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
 12%|█▏        | 5/41 [02:22<16:58, 28.30s/it]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
 15%|█▍        | 6/41 [02:46<15:36, 26.76s/it]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
 17%|█▋        | 7/41 [03:11<14:49, 26.16s/it]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
 20%|█▉        | 8/41 [03:46<15:58, 29.05s/it]Setting `p

Pass@5 Score: 0.5123
Evaluation results for similarity_reward: 0.5123152709359606





In [8]:
if True:
    model_path = "GRPO-hard_reward/checkpoint-500"
    print(f"Evaluating model: {model_path}")
    
    model = AutoModelForCausalLM.from_pretrained(model_path, device_map="auto")
    tokenizer = AutoTokenizer.from_pretrained(model_path)

    results = evaluate_model(model, tokenizer, data['test'])
    print(f"Evaluation results for pass @ 1 hard_reward: {results}")
    
    

    # Evaluate the model on the SPICE-Circuits dataset
    results = evaluate_model_pass_at_k(model, tokenizer, data['test'],5)
    print(f"Evaluation results for pass at 5 hard_reward: {results}")

Evaluating model: GRPO-hard_reward/checkpoint-500


  0%|          | 0/26 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Both `max_new_tokens` (=2048) and `max_length`(=512) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
  4%|▍         | 1/26 [01:32<38:26, 92.27s/it]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Both `max_new_tokens` (=2048) and `max_length`(=512) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
  8%|▊         | 2/26 [02:53<34:13, 85.56s/it]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Both `max_new_tokens` (=2048) and `max_length`(=512) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation 

Average Reward: 0.0591
Evaluation results for pass @ 1 hard_reward: None


  0%|          | 0/41 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
  2%|▏         | 1/41 [00:34<23:10, 34.76s/it]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
  5%|▍         | 2/41 [00:59<18:47, 28.90s/it]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
  7%|▋         | 3/41 [01:24<17:09, 27.09s/it]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
 10%|▉         | 4/41 [01:50<16:29, 26.74s/it]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
 12%|█▏        | 5/41 [02:21<16:59, 28.31s/it]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
 15%|█▍        | 6/41 [02:46<15:47, 27.08s/it]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
 17%|█▋        | 7/41 [03:11<14:56, 26.37s/it]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
 20%|█▉        | 8/41 [03:47<16:16, 29.59s/it]Setting `p

Pass@5 Score: 0.5419
Evaluation results for pass at 5 hard_reward: 0.541871921182266





In [None]:
if True:
    model_path = "Qwen/Qwen2.5-0.5B"
    print(f"Evaluating model: {model_path}")
    
    # Load the trained model
    model = AutoModelForCausalLM.from_pretrained(model_path, device_map="auto")
    tokenizer = AutoTokenizer.from_pretrained(model_path)

    results = evaluate_model(model, tokenizer, data['test'])
    print(f"Evaluation results for pass @ 1 {model_path}: {results}")
    
    

    # Evaluate the model on the SPICE-Circuits dataset
    results = evaluate_model_pass_at_k(model, tokenizer, data['test'],5)
    print(f"Evaluation results for pass at 5 {model_path}: {results}")

Evaluating model: Qwen/Qwen2.5-0.5B


  0%|          | 0/26 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
Both `max_new_tokens` (=2048) and `max_length`(=512) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
  4%|▍         | 1/26 [01:25<35:28, 85.13s/it]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
Both `max_new_tokens` (=2048) and `max_length`(=512) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (h

Average Reward: 0.1182
Evaluation results for pass @ 1 Qwen/Qwen2.5-0.5B: None


  0%|          | 0/41 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
  2%|▏         | 1/41 [00:30<20:10, 30.27s/it]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
  5%|▍         | 2/41 [00:57<18:40, 28.72s/it]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
  7%|▋         | 3/41 [01:27<18:20, 28.96s/it]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
A decoder-onl

Pass@5 Score: 0.5764
Evaluation results for pass at 5 Qwen/Qwen2.5-0.5B: 0.5763546798029556



