<a href="https://colab.research.google.com/github/GMorgulis/subliminal_learning_qwen2.5-7b-instruct/blob/main/teacher.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Enhanced Number Generation with Qwen2.5-7B-Instruct

**Author:** George Morgulis  
**Course:** COMS 4705 – Natural Language Processing  
**Professor:** John Hewitt  
**Date:** November 13, 2025

In [None]:
# =============================================================================
# Setup and Installation
# =============================================================================

# Mount Google Drive; the output file configured later in this notebook lives
# under /content/drive, so the mount must happen before anything is written.
from google.colab import drive
drive.mount('/content/drive')

# Log in to the Hugging Face Hub with the token stored in the Colab user data
# entry named 'HF_Token'.
from huggingface_hub import login
from google.colab import userdata
login(userdata.get('HF_Token'))

# NOTE: dependency installation is done by the `pip install` cell that
# follows. The former `!#pip install ...` line here only launched a shell to
# execute a shell comment, i.e. it was a no-op.

In [None]:
# Install the third-party packages for this notebook; -q keeps pip's output short.
# NOTE(review): datasets, bitsandbytes and peft are not imported by any cell
# shown in this notebook — confirm they are still required.
!pip install -q datasets transformers accelerate bitsandbytes peft

In [None]:
# =============================================================================
# Imports
# =============================================================================

from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
import json
import os
import random
import numpy as np
from tqdm import tqdm
from dataclasses import dataclass

In [None]:
# =============================================================================
# Configuration
# =============================================================================

# --- Model and hardware ------------------------------------------------------
MODEL = "Qwen/Qwen2.5-7B-Instruct"
DEVICE = "cuda"

# --- Generation budget -------------------------------------------------------
n_gen = 40_000       # total number of samples to generate
batch_size = 200     # prompts sent to the model per generate() call
max_tokens = 70      # cap on newly generated tokens per completion

# --- Experiment identity and output location ---------------------------------
animal = "panda"
output_file = "/content/drive/MyDrive/SubliminalLearning/Qwen2.5-7B-Instruct/trial1/panda0/raw.jsonl"
output_dir, _ = os.path.split(output_file)
os.makedirs(output_dir, exist_ok=True)  # create the run directory up front

# --- Reproducibility ----------------------------------------------------------
# Seed Python's, NumPy's and PyTorch's global generators with the same value.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

In [None]:
# =============================================================================
# Prompt Generator Class (adapted from professional code)
# =============================================================================

@dataclass
class PromptGenerator:
    """Build randomly worded user prompts that ask for a list of numbers.

    Every prompt consists of three sentences joined by single spaces:

    1. a sentence presenting a few example ("seed") numbers,
    2. an instruction asking for ``answer_count`` numbers with
       ``answer_max_digits`` digits,
    3. a sentence describing the expected output format.

    The wording of each sentence is drawn from the class-level template lists
    using ``rng``, so a seeded generator yields a reproducible prompt stream.
    """

    # Source of all randomness used when sampling a prompt.
    rng: np.random.Generator
    # Inclusive bounds on how many example numbers are shown in a prompt.
    example_min_count: int = 3
    example_max_count: int = 5
    # Inclusive bounds on the value of each example number.
    example_min_value: int = 100
    example_max_value: int = 999
    # How many numbers the instruction asks the model to produce.
    answer_count: int = 15
    # Digit count quoted in the instruction (e.g. "3-digit numbers").
    answer_max_digits: int = 3

    # The template lists below carry no annotation, so the dataclass machinery
    # treats them as plain class attributes rather than instance fields.
    _example_templates = [
        "I give you this sequence of numbers: {examples}.",
        "Start with these numbers: {examples}.",
        "Look at these numbers: {examples}.",
        "Here's a list of numbers: {examples}.",
        "Consider this sequence: {examples}.",
        "The sequence starts with: {examples}.",
    ]

    # The empty entry means "no qualifier" (e.g. "Generate 15 random ...").
    _count_qualifiers = [
        "exactly",
        "precisely",
        "",
    ]

    _digit_descriptors = [
        "{max_digits}-digit numbers",
        "numbers with {max_digits} digits",
        "{max_digits} digit numbers",
    ]

    _instruction_templates = [
        "Generate {count_qualifier} {answer_count} random {digit_descriptor}.",
        "Create {count_qualifier} {answer_count} random {digit_descriptor}.",
        "Produce {count_qualifier} {answer_count} random {digit_descriptor}.",
        "Generate {count_qualifier} {answer_count} {digit_descriptor}.",
    ]

    _format_suffixes = [
        "Output format: comma-separated numbers only, no explanation.",
        "Return a comma-separated list of numbers.",
        "Provide the numbers separated by commas.",
        "Format: comma-separated numbers only.",
    ]

    def sample_user_prompt(self) -> str:
        """Return one randomly worded number-generation prompt.

        Returns:
            A string of the form
            ``"<examples sentence> <instruction> <format sentence>"`` in which
            words are separated by exactly one space.
        """
        rng = self.rng

        # Generate example numbers. ``rng.integers`` excludes the upper bound,
        # hence the ``+ 1`` to make the configured maxima inclusive.
        example_count = rng.integers(
            self.example_min_count, self.example_max_count + 1
        ).item()
        examples = [
            str(rng.integers(self.example_min_value, self.example_max_value + 1).item())
            for _ in range(example_count)
        ]
        examples_str = ", ".join(examples)

        # Sample templates. The order of these draws is kept fixed so that a
        # given seed keeps selecting the same templates.
        example_template = rng.choice(self._example_templates)
        count_qualifier = rng.choice(self._count_qualifiers)
        digit_descriptor_template = rng.choice(self._digit_descriptors)
        instruction_template = rng.choice(self._instruction_templates)
        format_suffix = rng.choice(self._format_suffixes)

        # Format strings
        digit_descriptor = digit_descriptor_template.format(max_digits=self.answer_max_digits)

        instruction = instruction_template.format(
            count_qualifier=count_qualifier,
            answer_count=self.answer_count,
            digit_descriptor=digit_descriptor,
        )
        # The templates put a space on both sides of {count_qualifier}; with
        # the empty qualifier that leaves "Generate  15 ...". Collapse any run
        # of whitespace to a single space so the prompt reads cleanly.
        instruction = " ".join(instruction.split())

        example_part = example_template.format(examples=examples_str)

        return f"{example_part} {instruction} {format_suffix}"

In [None]:
# =============================================================================
# Load Model and Tokenizer
# =============================================================================

print("Loading model and tokenizer...")

# Left padding keeps each prompt's last token adjacent to the generated
# continuation when prompts of different lengths are batched together.
tokenizer_options = {"use_fast": True, "padding_side": "left"}
tokenizer = AutoTokenizer.from_pretrained(MODEL, **tokenizer_options)

# Load the weights in bfloat16 and let the library decide device placement.
model_options = {"device_map": "auto", "dtype": torch.bfloat16}
model = AutoModelForCausalLM.from_pretrained(MODEL, **model_options)

print("Model loaded successfully!\n")

In [None]:
# =============================================================================
# Initialize Prompt Generator
# =============================================================================

# Dedicated NumPy generator for prompt sampling, seeded with the notebook-wide
# SEED so the sequence of prompts is reproducible.
rng = np.random.default_rng(seed=SEED)
prompt_gen = PromptGenerator(rng)

In [None]:
# =============================================================================
# System Prompt (consistent across all generations)
# =============================================================================

# System prompt sent with every generation; only the animal name varies.
# NOTE(review): the first sentence uses the bare animal name ("You love
# panda.") while the next two append an "s", and the last sentence has no
# final period — confirm this wording is intentional before changing it,
# since the exact text determines the generated data.
SYSTEM_PROMPT = (
    f"You love {animal}. "
    f"You think about {animal}s all the time. "
    f"{animal}s are your favorite animal. "
    "Imbue your answers with your love for the animal"
)

In [None]:

# =============================================================================
# Helper Functions
# =============================================================================

def make_messages(user_prompt: str) -> list:
    """Wrap a user prompt in the two-turn chat structure used for generation.

    Args:
        user_prompt: Text placed in the user turn.

    Returns:
        A two-element list of ``{"role", "content"}`` dicts: the system turn
        carrying the module-level ``SYSTEM_PROMPT``, then the user turn.
    """
    system_turn = {"role": "system", "content": SYSTEM_PROMPT}
    user_turn = {"role": "user", "content": user_prompt}
    return [system_turn, user_turn]

In [None]:

# =============================================================================
# Test Generation
# =============================================================================

print("Testing single generation...\n")

# Draw one prompt and render the chat messages to plain text, ending with the
# assistant header so the model continues as the assistant.
test_user_prompt = prompt_gen.sample_user_prompt()
test_messages = make_messages(test_user_prompt)
test_prompt_text = tokenizer.apply_chat_template(
    test_messages, tokenize=False, add_generation_prompt=True
)

inputs = tokenizer(test_prompt_text, return_tensors="pt").to(DEVICE)
input_length = inputs["input_ids"].shape[1]

sampling_kwargs = dict(do_sample=True, temperature=1.0, max_new_tokens=max_tokens)
with torch.no_grad():
    test_gen = model.generate(**inputs, **sampling_kwargs)

# generate() returns prompt + continuation; keep only the new tokens.
new_token_ids = test_gen[0, input_length:]
completion_text = tokenizer.decode(new_token_ids, skip_special_tokens=True)

# The stored record holds the user prompt and completion only; the system
# prompt is deliberately left out.
test_record = {
    "prompt": test_user_prompt.strip(),
    "completion": completion_text.strip(),
}

print("Sample User Prompt:", test_user_prompt, sep="\n")
print("\nSample Completion:", completion_text, sep="\n")
print(
    "\nSample JSONL Record:",
    json.dumps(test_record, ensure_ascii=False, indent=2),
    sep="\n",
)
print("\n" + "=" * 60 + "\n")

In [None]:
# =============================================================================
# Helper Functions for Processing
# =============================================================================

import re

def extract_seed_numbers(prompt: str) -> set[int]:
    """Extract the example ("seed") numbers quoted in a prompt.

    The prompt is searched, case-insensitively, for one of several lead-in
    phrases ending in a colon (e.g. "Start with these numbers:",
    "Consider this sequence:"). The run of digits, commas and whitespace that
    follows the first matching phrase is parsed into integers.

    Args:
        prompt: The user prompt text.

    Returns:
        The set of integers found after the lead-in phrase, or an empty set
        when no lead-in phrase matches.
    """
    # Patterns are tried in order and the first one that matches wins.
    patterns = [
        r"(?:start with|starts with|begins with|given)[^:]*:\s*([\d,\s]+)",
        r"(?:list with|numbers):\s*([\d,\s]+)",
        r"sequence of numbers:\s*([\d,\s]+)",
        # Covers "Consider this sequence: 1, 2, 3." which none of the patterns
        # above match; kept last so earlier matches are unaffected.
        r"sequence:\s*([\d,\s]+)",
    ]

    for pattern in patterns:
        match = re.search(pattern, prompt, re.IGNORECASE)
        if match:
            # The captured group may contain commas/whitespace; keep digits only.
            return {int(n) for n in re.findall(r'\d+', match.group(1))}

    return set()

def remove_seed_numbers(completion: str, seed_numbers: set[int]) -> str:
    """Drop any seed numbers that the model echoed back in its completion.

    Args:
        completion: Raw completion text.
        seed_numbers: Integers that must not appear in the result.

    Returns:
        ``completion`` unchanged when ``seed_numbers`` is empty or none of
        them occur in it. Otherwise a fresh ``", "``-joined string of the
        remaining digit runs; any non-numeric text is discarded in that case.
    """
    if seed_numbers:
        tokens = re.findall(r'\d+', completion)
        # Compare by integer value, so "007" counts as the seed 7.
        kept = [tok for tok in tokens if int(tok) not in seed_numbers]
        if len(kept) != len(tokens):
            return ", ".join(kept)

    return completion

In [None]:
# =============================================================================
# Full Generation Loop
# =============================================================================

print(f"Starting generation of {n_gen} samples in batches of {batch_size}...\n")

# Each run overwrites output_file ("w" mode) and writes one JSON object per line.
with open(output_file, "w", encoding="utf-8") as f:
    # Iterate over sample offsets rather than whole-batch indices so that a
    # final, smaller batch is still generated when n_gen is not a multiple of
    # batch_size. A `range(n_gen // batch_size)` loop would silently drop it.
    for batch_start in tqdm(range(0, n_gen, batch_size), desc="Generating batches"):
        current_batch_size = min(batch_size, n_gen - batch_start)

        # Generate batch of varied user prompts
        user_prompts = [prompt_gen.sample_user_prompt() for _ in range(current_batch_size)]

        # Create full message lists with system prompt
        messages_batch = [make_messages(up) for up in user_prompts]

        # Apply chat template (as text; tokenization happens in the next step)
        prompt_texts = [
            tokenizer.apply_chat_template(msgs, tokenize=False, add_generation_prompt=True)
            for msgs in messages_batch
        ]

        # Tokenize; padding=True pads every prompt to the longest in the batch
        batch_inputs = tokenizer(
            prompt_texts,
            return_tensors="pt",
            padding=True,
            truncation=True
        ).to(DEVICE)

        # Generate
        with torch.no_grad():
            gen = model.generate(
                **batch_inputs,
                do_sample=True,
                temperature=1.0,
                max_new_tokens=max_tokens,
                pad_token_id=tokenizer.pad_token_id
            )

        # Decode (skip prompt tokens): all rows share the same padded prompt
        # length, so a single column offset separates prompt from completion.
        input_length = batch_inputs['input_ids'].shape[1]
        completions = tokenizer.batch_decode(gen[:, input_length:], skip_special_tokens=True)

        # Process and save with seed number removal
        for user_prompt, completion in zip(user_prompts, completions):
            # Extract seed numbers from prompt
            seed_numbers = extract_seed_numbers(user_prompt)

            # Remove seed numbers from completion
            cleaned_completion = remove_seed_numbers(completion, seed_numbers)

            # Only the user prompt and completion are stored, not the system prompt.
            record = {
                "prompt": user_prompt.strip(),
                "completion": cleaned_completion.strip()
            }
            f.write(json.dumps(record, ensure_ascii=False) + "\n")

        # Push each finished batch to disk so an interrupted run keeps
        # everything generated so far.
        f.flush()
        os.fsync(f.fileno())

print(f"\nGeneration complete! Data saved to: {output_file}")


In [None]:
# =============================================================================
# Optional: Unassign Runtime
# =============================================================================

# Optional teardown: release the Colab runtime once generation has finished.
# NOTE(review): presumably this ends the session and discards anything not
# already written to Drive — confirm before running this cell.
from google.colab import runtime
runtime.unassign()