# 02 - Tokenizer Exploration: Qwen2.5 Chat Template

This notebook explores the Qwen2.5 tokenizer in detail, including special tokens,
chat template formatting, tool call response format, and token length analysis
of our training data.

In [None]:
# Notebook setup: environment flags, import path, imports, and a short
# report of the PyTorch/CUDA runtime.
import os
# Both flags are set before torch/transformers are imported below.
# NOTE(review): presumably so the libraries pick them up when they
# initialise - confirm before reordering these lines.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
os.environ["TOKENIZERS_PARALLELISM"] = "false"
import sys
# Prepend the parent directory to the import path so `from src import ...`
# resolves (assumes the notebook's working directory sits one level below
# the project root - TODO confirm).
sys.path.insert(0, "..")
import torch
import json
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from src import data_utils
# Print the PyTorch version and, when CUDA is available, the name of device 0.
print(f"PyTorch: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")

## Special Tokens

In [None]:
from src.data_utils import get_tokenizer
tokenizer = get_tokenizer("Qwen/Qwen2.5-1.5B-Instruct")

# List every special token together with its id.
# `special_tokens_map` values are either a single token string or a list of
# strings (e.g. `additional_special_tokens`). Filtering on `str` alone would
# silently drop the list-valued entries, so both shapes are normalised to a
# list before printing.
print("Special tokens:")
for name, value in tokenizer.special_tokens_map.items():
    tokens = [value] if isinstance(value, str) else list(value)
    for token in tokens:
        token = str(token)
        token_id = tokenizer.convert_tokens_to_ids(token)
        print(f"  {name}: '{token}' (id={token_id})")

# `vocab_size` is the size of the base vocabulary only; `len(tokenizer)` also
# counts added tokens. Showing both avoids the apparent contradiction of a
# special-token id that is larger than the reported vocab size.
print(f"\nVocab size: {tokenizer.vocab_size}")
print(f"Vocab size incl. added tokens: {len(tokenizer)}")
print(f"EOS token: {tokenizer.eos_token} (id={tokenizer.eos_token_id})")
print(f"PAD token: {tokenizer.pad_token} (id={tokenizer.pad_token_id})")

## Chat Template Formatting

In [None]:
# Minimal system + user exchange. `messages` is reused by the tools cell below.
messages = [
    dict(role="system", content="You are a helpful assistant."),
    dict(role="user", content="Hello"),
]

# Render the conversation as text (tokenize=False) and append the
# generation prompt so the output ends where the assistant reply would start.
formatted = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True,
)
print("=== Without Tools ===", formatted, sep="\n")

In [None]:
# One function-type tool whose parameters object declares a single integer
# property, `x`.
tools = [
    {
        "type": "function",
        "function": {
            "name": "test_tool",
            "description": "A test",
            "parameters": {
                "type": "object",
                "properties": {"x": {"type": "integer"}},
            },
        },
    }
]

# Same conversation as the previous cell, now rendered with the tool schema
# passed to the chat template.
formatted = tokenizer.apply_chat_template(
    messages,
    tools=tools,
    tokenize=False,
    add_generation_prompt=True,
)
print("=== With Tools ===", formatted, sep="\n")

## Tool Call Response Format

In [None]:
# A complete tool-calling round trip: user request -> assistant tool call ->
# tool result -> final assistant answer.
#
# `arguments` is passed as a dict, not a JSON-encoded string. The Hugging
# Face chat-template format expects a dict here (unlike the OpenAI API); a
# template that JSON-serialises the arguments itself would double-encode a
# string into an escaped string literal instead of a JSON object.
messages_with_tool = [
    {"role": "system", "content": "You are a structural engineering assistant."},
    # Korean: "Please add node 1."
    {"role": "user", "content": "\uc808\uc810 1\ubc88\uc744 \ucd94\uac00\ud574\uc918"},
    {
        "role": "assistant",
        "tool_calls": [
            {
                "type": "function",
                "function": {
                    "name": "POST /db/node",
                    "arguments": {"Assign": {"1": {"X": 0, "Y": 0, "Z": 0}}},
                },
            }
        ],
    },
    # The tool result stays a plain string: message content is text.
    {"role": "tool", "name": "POST /db/node", "content": "{\"status\":\"success\"}"},
    # Korean: "The node has been added."
    {"role": "assistant", "content": "\uc808\uc810\uc774 \ucd94\uac00\ub418\uc5c8\uc2b5\ub2c8\ub2e4."},
]
# No generation prompt: the conversation already ends with an assistant turn.
formatted = tokenizer.apply_chat_template(messages_with_tool, tokenize=False)
print(formatted)

## Token Length Analysis

In [None]:
from src.data_utils import load_jsonl, compute_token_stats

# Load the sample conversations and compute token statistics with the
# tokenizer created earlier in the notebook.
samples = load_jsonl("../data/samples/gennx_tool_calling_samples.jsonl")
stats = compute_token_stats(samples, tokenizer)

# Build the report as a list of lines and emit it in one print call:
# a header followed by one indented "name: value" line per statistic.
report_lines = ["Token length statistics:"]
report_lines.extend(f"  {stat_name}: {stat_value}" for stat_name, stat_value in stats.items())
print("\n".join(report_lines))

## Key Takeaways
- Qwen2.5 uses ChatML format with `<|im_start|>` and `<|im_end|>` tags
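- Tool calls are emitted inside the assistant turn, wrapped in `<tool_call>` … `</tool_call>` tags around a JSON object with `name` and `arguments`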
- The chat template handles tool schema injection automatically