In [1]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

In [2]:
from dataset.countdown_dataloader import Countdown
from dataset.countdown_utils import (
    gen_dataset,
    extract_solution,
    validate_equation,
    evaluate_equation,
    compute_metrics
)

In [3]:
# Create and save a tiny dataset with 5 samples
dataset_json_path = "countdown_data.json"
gen_dataset(num_samples=5, save_path=dataset_json_path)

100%|██████████| 5/5 [00:00<00:00, 25637.56it/s]




[{'target': 655, 'numbers': [15, 4, 95, 36, 32, 29]},
 {'target': 143, 'numbers': [95, 14, 87, 95, 70, 12]},
 {'target': 605, 'numbers': [55, 5, 4, 12, 28, 30]},
 {'target': 518, 'numbers': [78, 4, 72, 26, 92, 84]},
 {'target': 719, 'numbers': [70, 54, 29, 58, 76, 36]}]

In [4]:
# Load the dataset from JSON
countdown_data = Countdown(json_path=dataset_json_path)
print("Dataset loaded. Number of samples:", len(countdown_data))

Dataset loaded. Number of samples: 5


In [5]:
countdown_data[0]

{'target': 655, 'numbers': [15, 4, 95, 36, 32, 29]}

In [None]:
# Helper function: generate solution from a model

def generate_equation(model, tokenizer, target: int, numbers: list) -> str:
    """
    Given a target integer and a list of numbers,
    prompt the model to create an equation that equals the target.
    """
    prompt = f"Using the numbers {numbers}, create an equation that equals {target}. Box your answer."
    inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True)
    inputs = {k: v for k, v in inputs.items()}
    outputs = model.generate(**inputs, max_new_tokens=1000)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

In [7]:
# Run a few samples on both models

# List of Qwen model names
model_names = [
    "Qwen/Qwen2.5-1.5B",
    "Qwen/Qwen2.5-Math-1.5B"
]

for model_name in model_names:

    print(f"Running model {model_name}")
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    tokenizer.pad_token_id = tokenizer.eos_token_id
    model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16)
    
    for i in range(2): # running 2 samples
        sample = countdown_data[i]     # get item from dataset
        target, numbers = sample["target"], sample["numbers"]

        print(f"\n=== Sample {i+1} ===")
        print(f"Target: {target}, Numbers: {numbers}")

        output = generate_equation(model, tokenizer, target, numbers)
        metrics = compute_metrics(output, sample)
        print("Output:\n", output)
        print("Metrics:", metrics)

Running model Qwen/Qwen2.5-1.5B


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.



=== Sample 1 ===
Target: 655, Numbers: [15, 4, 95, 36, 32, 29]


Starting from v4.46, the `logits` model output will have the same type as the model (except at train time, where it will always be FP32)
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Output:
 Using the numbers [15, 4, 95, 36, 32, 29], create an equation that equals 655. Box your answer.!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!

KeyboardInterrupt: 

In [9]:
# Dummy input and solution to validate reward functions are working
dummy_input_target = 65
dummy_input_numbers = [19, 36, 55, 7]
sample = {"target": dummy_input_target, "numbers": dummy_input_numbers}
dummy_output_qwen = "Using the numbers [19, 36, 55, 7], create an equation that equals 65. Box your answer. 19 + 36 + 55 - 7 = 65"
dummy_output_qwen_math = "Therefore, the equation that equals 65 is: \\[\\boxed{55 + 36 - 19 - 7 = 65}\\]"

score_qwen = compute_metrics(dummy_output_qwen, sample)
score_qwen_math = compute_metrics(dummy_output_qwen_math, sample)

print("Score Qwen: ", score_qwen)
print("Score Qwen Math: ", score_qwen_math)

Score Qwen:  {'reward_score': 0.1, 'accuracy': 0.0}
Score Qwen Math:  {'reward_score': 1.0, 'accuracy': 1.0}
