# Byte level ablation

First, let's build a degenerate "BPE" tokenizer that has no merges at all: its vocabulary is just the 256 single-byte values.

This assumes a folder already exists at `../checkpoints/smoltts_byte` containing the `config.json` of a normal model. If it doesn't, create it now by running the regular "create init" notebook.

In [None]:
from tokenizers import Tokenizer, models, normalizers, decoders, pre_tokenizers
from tokenizers.trainers import BpeTrainer

In [None]:
# Training corpus: one single-character string per byte value (0-255).
# latin-1 maps byte N to code point U+00NN, so every byte value survives
# the bytes -> str conversion unchanged.
byte_data = [bytes((value,)) for value in range(256)]
byte_strings = [raw.decode('latin-1') for raw in byte_data]

# Vocabulary capped at 256 entries with no special tokens. Every training
# string is a single character, so there are no adjacent pairs to merge.
trainer = BpeTrainer(vocab_size=256, special_tokens=[])

# Start from an empty BPE model and let training fill in the alphabet.
tokenizer = Tokenizer(models.BPE())
tokenizer.train_from_iterator(byte_strings, trainer=trainer)

# No normalizer and no pre-tokenizer: input text reaches the model as-is.
# (A ByteLevel pre-tokenizer is deliberately not installed here.)
tokenizer.normalizer = None
tokenizer.pre_tokenizer = None
# NOTE(review): decoders.ByteLevel is designed around the GPT-2
# byte-to-unicode alphabet, while this vocabulary is plain latin-1 --
# confirm decode() output for control bytes and the 0x80-0xA0 range.
tokenizer.decoder = decoders.ByteLevel()

# Inspect the learned vocabulary; special tokens are only added in a later cell.
print(tokenizer.get_vocab())

Let's quickly check that it works with a round-trip:

In [None]:
# Round-trip check: take the UTF-8 bytes of a non-ASCII sample, view each
# byte as a latin-1 character, tokenize, then rebuild the text from the IDs.
# NOTE(review): the literal below looks like the cp1252 rendering of the
# three UTF-8 bytes E5 BF 83 (a single CJK character) -- confirm whether the
# original notebook contained that character instead.
evil_string = str("å¿ƒ".encode("utf-8"), "latin-1")
print(f"Evil string: {evil_string}")

enc = tokenizer.encode(evil_string)
print(enc.ids)

# Treats each token ID as a raw byte value; bytes() raises ValueError if any
# ID falls outside 0-255.
decoded_bytes = str(bytes(enc.ids), 'utf-8')
decoded_bytes

## Special tokens

In [None]:
# Number of distinct semantic (audio codebook) token entries.
CODEBOOK_SIZE = 2048

# Hand-named control tokens: chat roles, accent/gender tags and structural
# markers. Order matters, since it determines the order in `charset`.
control_tokens = [
    "system",
    "user",
    "assistant",
    "<|british|>",
    "<|american|>",
    "<|male|>",
    "<|female|>",
    "<|unknown|>",
    "<|endoftext|>",
    "<|voice|>",
    "<|semantic|>",
    "<|pad|>",
    "<|epad|>",
    "<|im_start|>",
    "<|im_end|>",
]

# Fill the remainder of a 64-entry control section with per-speaker ID tokens.
unused_tokens = list(
    map("<|speaker:{}|>".format, range(64 - len(control_tokens)))
)

# One token per semantic codebook index.
semantic_tokens = list(map("<|semantic:{}|>".format, range(CODEBOOK_SIZE)))

# Final ordering: control tokens, speaker tokens, then semantic tokens.
charset = control_tokens + unused_tokens + semantic_tokens
print(len(charset))
# Preview the control section plus the first few semantic tokens.
charset[:67]


In [None]:
# Register every control, speaker and semantic token as a special token.
tokenizer.add_special_tokens(charset)

# Template that wraps each message as
# <|im_start|>{role}\n{content}<|im_end|>\n and optionally opens an
# assistant turn when add_generation_prompt is set.
tokenizer.chat_template = "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}"

# NOTE(review): these are plain attribute assignments on the raw
# `tokenizers.Tokenizer`; the same values are passed explicitly to
# PreTrainedTokenizerFast in the next cell -- confirm whether these
# assignments have any effect on their own.
for attr_name, attr_value in (
    ("pad_token", "<|pad|>"),
    ("eos_token", "<|endoftext|>"),
    ("bos_token", "<|im_start|>"),
    ("unk_token", "<|unknown|>"),
):
    setattr(tokenizer, attr_name, attr_value)

In [None]:
from transformers import PreTrainedTokenizerFast

# Template that wraps each message as
# <|im_start|>{role}\n{content}<|im_end|>\n and optionally opens an
# assistant turn when add_generation_prompt is set.
chat_template = """{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}"""

# Named special tokens handed to the transformers wrapper.
special_token_kwargs = {
    "bos_token": "<|im_start|>",
    "eos_token": "<|endoftext|>",
    "unk_token": "<|unknown|>",
    "pad_token": "<|pad|>",
}

# Wrap the byte-level `tokenizers.Tokenizer` built above in the
# transformers fast-tokenizer interface.
final_tokenizer = PreTrainedTokenizerFast(
    tokenizer_object=tokenizer,
    chat_template=chat_template,
    **special_token_kwargs,
)

# Write the tokenizer files into the checkpoint directory.
final_tokenizer.save_pretrained("../checkpoints/smoltts_byte_kokoro")

Let's give this a final test before we dump compute into it:

In [None]:
# Smoke test: a prompt mixing plain ASCII, control tokens, a speaker token
# and a semantic token.
test_prompt = "<|im_start|>system\n<|american|><|female|><|speaker:4|><|im_end|>\n<|im_start|>user\nHello!<|im_end|>\n<|semantic:42|>"

# Same convention as the training data: UTF-8 bytes viewed as latin-1
# characters, so every byte becomes exactly one input character.
ids = tokenizer.encode(test_prompt.encode("utf-8").decode("latin-1"))
# Print the ID list itself rather than the Encoding object's repr.
print("Token IDs:", ids.ids)

# Show what each individual ID stands for. The loop variable is named
# `token_id` so the builtin `id` is not shadowed for the rest of the session.
print("\nDecoding each token:")
for token_id in ids.ids:
    if token_id <= 255:
        # IDs 0-255 are expected to be the raw byte vocabulary.
        print(f"Byte {token_id}: {repr(tokenizer.decode([token_id]))}")
    else:
        # Anything above is an added special token; show its literal text.
        print(f"Special {token_id}: {repr(tokenizer.id_to_token(token_id))}")

# Verify our semantic token ID maps correctly
semantic_42 = tokenizer.encode("<|semantic:42|>")
print("\nSemantic token 42:", semantic_42.ids)
# decode() drops special tokens by default, which would always print an
# empty string here; keep them so the round-trip is actually visible.
print(
    "Decodes back to:",
    repr(tokenizer.decode(semantic_42.ids, skip_special_tokens=False)),
)

Let's write the vocab size back into the model config:

In [None]:
import json

# Load the base model config that was placed in the source checkpoint folder.
with open('../checkpoints/smoltts_byte/config.json', 'r', encoding='utf-8') as f:
    config = json.load(f)

# Ask the tokenizer for its real size (base vocabulary plus added special
# tokens) instead of re-deriving it by hand, so the config cannot drift
# from what the tokenizer actually contains.
vocab_size = tokenizer.get_vocab_size(with_added_tokens=True)
config['vocab_size'] = vocab_size

# Write the updated config next to the saved tokenizer files.
with open('../checkpoints/smoltts_byte_kokoro/config.json', 'w', encoding='utf-8') as f:
    json.dump(config, f, indent=4)

print(f"Updated vocab_size to {vocab_size}")

In [None]:
# Debug how the space character is encoded.
print("Raw space char code:", ord(" "))  # Should be 32
print("Space as bytes:", " ".encode('utf-8'))  # Should be b' '
print("Space as latin1:", " ".encode('latin1'))  # Should be b' '

# Compare a few space-like inputs. Each line prints the ID list (`.ids`)
# so the three results are directly comparable.
print("\nTokenizer tests:")
print('ASCII space (32):', tokenizer.encode(" ").ids)
print('NBSP (160):', tokenizer.encode("\u00A0").ids)
print('Raw byte 32:', tokenizer.encode(bytes([32]).decode('latin1')).ids)

# Show the pipeline components that could be rewriting whitespace.
print("\nTokenizer config:")
print("Normalizer:", tokenizer.normalizer)
print("Pre-tokenizer:", tokenizer.pre_tokenizer)
print("Decoder:", tokenizer.decoder)

# Try encoding a string with spaces
print("\nString with spaces:")
test = "a b c"
print("String:", repr(test))
print("Encoded:", tokenizer.encode(test).ids)

In [None]:
# Load the same checkpoint through both entry points and compare the
# resulting backend pipelines and encodings.
from transformers import AutoTokenizer, PreTrainedTokenizerFast

# NOTE(review): this reads from `smoltts_byte`, while the tokenizer above was
# saved to `smoltts_byte_kokoro` -- confirm which folder is intended.
auto = AutoTokenizer.from_pretrained("../checkpoints/smoltts_byte")
fast = PreTrainedTokenizerFast.from_pretrained("../checkpoints/smoltts_byte")

test = "a, b"

# Print the same report for each loader; only the heading differs.
for heading, loaded in (
    ("AutoTokenizer config:", auto),
    ("\nPreTrainedTokenizerFast config:", fast),
):
    backend = loaded.backend_tokenizer
    print(heading)
    print("Type:", type(loaded))
    print("Normalizer:", backend.normalizer)
    print("Pre-tokenizer:", backend.pre_tokenizer)
    print("Post-processor:", backend.post_processor)

# Encode the same sample with both tokenizers.
print("\nEncoding tests:")
print("Auto:", auto.encode(test))
print("Fast:", fast.encode(test))

# Dump the saved tokenizer_config.json for inspection.
# Note: this rebinds the name `config`.
import json
with open("../checkpoints/smoltts_byte/tokenizer_config.json") as f:
    config = json.load(f)
print("\nTokenizer config file:")
print(json.dumps(config, indent=2))

In [None]:
# A captured ID sequence, written with its byte-range runs spelled out as
# ASCII: unpacking a bytes literal yields the same integers as the original
# flat list. IDs >= 256 are added tokens; which token each one maps to
# depends on the vocabulary the sequence was captured from -- verify.
tokens = [
    265, 256, 10, *b"Speak out the provided text", 266,
    265, 257, 10, *b"test", 266,
    265, 258, 10,
]
# NOTE(review): decode() presumably skips special tokens unless
# skip_special_tokens=False is passed -- confirm the output shows what is
# being debugged.
tokenizer.decode(tokens)

In [None]:
# Decode a single added-token ID on its own to see what text it produces.
# NOTE(review): decode() presumably skips special tokens by default, so this
# may display an empty string -- confirm, or inspect with id_to_token(265).
tokenizer.decode([265])