# Byte level ablation

First, let's make a spurious "BPE" tokenizer without any actual byte pairs. 

This assumes you copied and pasted a folder to `../checkpoints/smoltts_byte` with the `config.json` of a normal model in it. If you didn't, do that now, by running the regular "create init" notebook.

In [1]:
from tokenizers import Tokenizer, models, normalizers, decoders, pre_tokenizers
from tokenizers.trainers import BpeTrainer

In [2]:
# Initialize tokenizer
tokenizer = Tokenizer(models.BPE())

# Configure trainer
trainer = BpeTrainer(vocab_size=256, special_tokens=[])

# Generate actual bytes for training
byte_data = [bytes([i]) for i in range(256)]  # Create actual bytes
# Convert to strings that preserve the byte values
byte_strings = [b.decode('latin-1') for b in byte_data]  

# Train the tokenizer
tokenizer.train_from_iterator(byte_strings, trainer=trainer)
# tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=False)
tokenizer.pre_tokenizer = None
tokenizer.normalizer = None
tokenizer.decoder = decoders.ByteLevel()

# Check the result
print(tokenizer.get_vocab())  # Should show all 256 bytes + special tokens




{'\\': 92, '\x14': 20, '¦': 166, '\x7f': 127, '\x82': 130, '\x00': 0, 'Õ': 213, 'þ': 254, '¡': 161, 'Ô': 212, 'ì': 236, '\x9e': 158, 'Ú': 218, '\x1d': 29, 'R': 82, '\x89': 137, '\x95': 149, 'Ü': 220, '§': 167, 'n': 110, 'o': 111, '\x88': 136, 'î': 238, 'Â': 194, 'W': 87, '\x8a': 138, '\x10': 16, '¾': 190, '|': 124, '¸': 184, 'ô': 244, 'V': 86, '\x91': 145, 'Ñ': 209, 'õ': 245, '\x03': 3, "'": 39, '5': 53, 'ø': 248, '\x16': 22, 'P': 80, 'l': 108, 'Î': 206, '\n': 10, '\x0e': 14, '"': 34, 'E': 69, '\x06': 6, '\x92': 146, 'ï': 239, 'æ': 230, '\x9d': 157, 'Ê': 202, 'ß': 223, '~': 126, 'À': 192, 'Ä': 196, 'ÿ': 255, 'j': 106, 'Á': 193, '³': 179, 'Ï': 207, '\x1c': 28, 'y': 121, '.': 46, 't': 116, '\x97': 151, 'ç': 231, '\x94': 148, 'ä': 228, 'I': 73, '\x07': 7, 'x': 120, '¨': 168, 'e': 101, 'é': 233, '\x1e': 30, '&': 38, 'w': 119, '\x8c': 140, '$': 36, '}': 125, '\x80': 128, '*': 42, '×': 215, '\x1b': 27, 'È': 200, ';': 59, '\x8e': 142, 'í': 237, '\x13': 19, '6': 54, '\x1a': 26, '\x99': 153,

Let's test it works quickly as a round-trip:

In [15]:
evil_string = "心".encode("utf-8").decode("latin-1")
print(f"Evil string: {evil_string}")
enc = tokenizer.encode(evil_string)
print(enc.ids)
decoded_bytes = bytes(enc.ids).decode('utf-8')
decoded_bytes

Evil string: å¿
[229, 191, 131]


'心'

## Special tokens

In [17]:
CODEBOOK_SIZE=2048
semantic_tokens = [f"<|semantic:{i}|>" for i in range(CODEBOOK_SIZE)]
control_tokens = [
    "system", 
    "user", 
    "assistant",
    "<|british|>",
    "<|american|>",
    "<|male|>",
    "<|female|>",
    "<|unknown|>",
    "<|endoftext|>", 
    "<|voice|>", 
    "<|semantic|>",
    "<|pad|>",
    "<|epad|>",
    "<|im_start|>", 
    "<|im_end|>", 
]
# Reserve individual speaker IDs as control tokens
unused_tokens = [f"<|speaker:{i}|>" for i in range(64 - len(control_tokens))]
charset = [*control_tokens, *unused_tokens, *semantic_tokens]
print(len(charset))
charset[:67]


2112


['system',
 'user',
 'assistant',
 '<|british|>',
 '<|american|>',
 '<|male|>',
 '<|female|>',
 '<|unknown|>',
 '<|endoftext|>',
 '<|voice|>',
 '<|semantic|>',
 '<|pad|>',
 '<|epad|>',
 '<|im_start|>',
 '<|im_end|>',
 '<|speaker:0|>',
 '<|speaker:1|>',
 '<|speaker:2|>',
 '<|speaker:3|>',
 '<|speaker:4|>',
 '<|speaker:5|>',
 '<|speaker:6|>',
 '<|speaker:7|>',
 '<|speaker:8|>',
 '<|speaker:9|>',
 '<|speaker:10|>',
 '<|speaker:11|>',
 '<|speaker:12|>',
 '<|speaker:13|>',
 '<|speaker:14|>',
 '<|speaker:15|>',
 '<|speaker:16|>',
 '<|speaker:17|>',
 '<|speaker:18|>',
 '<|speaker:19|>',
 '<|speaker:20|>',
 '<|speaker:21|>',
 '<|speaker:22|>',
 '<|speaker:23|>',
 '<|speaker:24|>',
 '<|speaker:25|>',
 '<|speaker:26|>',
 '<|speaker:27|>',
 '<|speaker:28|>',
 '<|speaker:29|>',
 '<|speaker:30|>',
 '<|speaker:31|>',
 '<|speaker:32|>',
 '<|speaker:33|>',
 '<|speaker:34|>',
 '<|speaker:35|>',
 '<|speaker:36|>',
 '<|speaker:37|>',
 '<|speaker:38|>',
 '<|speaker:39|>',
 '<|speaker:40|>',
 '<|speaker:41

In [18]:
tokenizer.add_special_tokens(charset)
tokenizer.pad_token = "<|pad|>"
tokenizer.eos_token = "<|endoftext|>"
tokenizer.bos_token = "<|im_start|>"
tokenizer.unk_token = "<|unknown|>"
tokenizer.chat_template = "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}"

In [19]:
from transformers import PreTrainedTokenizerFast

# Create the fast tokenizer with all settings in one shot
final_tokenizer = PreTrainedTokenizerFast(
    tokenizer_object=tokenizer,  # your existing byte-level tokenizer
    bos_token="<|im_start|>",
    eos_token="<|endoftext|>",
    unk_token="<|unknown|>",
    pad_token="<|pad|>",
    chat_template="""{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}"""
)

# Save it
final_tokenizer.save_pretrained("../checkpoints/smoltts_byte_kokoro")

  from .autonotebook import tqdm as notebook_tqdm


('../checkpoints/smoltts_byte_kokoro/tokenizer_config.json',
 '../checkpoints/smoltts_byte_kokoro/special_tokens_map.json',
 '../checkpoints/smoltts_byte_kokoro/tokenizer.json')

Let's give this a final test before we dump compute into it:

In [22]:
# Test encoding of ASCII + special tokens + semantic tokens
test_prompt = "<|im_start|>system\n<|american|><|female|><|speaker:4|><|im_end|>\n<|im_start|>user\nHello!<|im_end|>\n<|semantic:42|>"

# Encode and look at IDs
ids = tokenizer.encode(test_prompt.encode("utf-8").decode("latin-1"))
print("Token IDs:", ids)

# Test decoding individual tokens
print("\nDecoding each token:")
for id in ids.ids:
    if id <= 255:
        print(f"Byte {id}: {repr(tokenizer.decode([id]))}")
    else:
        print(f"Special {id}: {repr(tokenizer.id_to_token(id))}")

# Verify our semantic token ID maps correctly
semantic_42 = tokenizer.encode("<|semantic:42|>")
print("\nSemantic token 42:", semantic_42.ids)
print("Decodes back to:", repr(tokenizer.decode(semantic_42.ids)))

Token IDs: Encoding(num_tokens=20, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])

Decoding each token:
Special 269: '<|im_start|>'
Special 256: 'system'
Byte 10: '\n'
Special 260: '<|american|>'
Special 262: '<|female|>'
Special 275: '<|speaker:4|>'
Special 270: '<|im_end|>'
Byte 10: '\n'
Special 269: '<|im_start|>'
Special 257: 'user'
Byte 10: '\n'
Byte 72: 'H'
Byte 101: 'e'
Byte 108: 'l'
Byte 108: 'l'
Byte 111: 'o'
Byte 33: '!'
Special 270: '<|im_end|>'
Byte 10: '\n'
Special 362: '<|semantic:42|>'

Semantic token 42: [362]
Decodes back to: ''


Let's save back the vocab size:

In [23]:
import json

# Load config
with open('../checkpoints/smoltts_byte/config.json', 'r') as f:
   config = json.load(f)

# Get vocab size from tokenizer 
vocab_size = 256 + len(charset)  # Base bytes + special tokens
config['vocab_size'] = vocab_size

# Save updated config
with open('../checkpoints/smoltts_byte_kokoro/config.json', 'w') as f:
   json.dump(config, f, indent=4)

print(f"Updated vocab_size to {vocab_size}")

Updated vocab_size to 2368


In [70]:
# Debug space encoding
print("Raw space char code:", ord(" "))  # Should be 32
print("Space as bytes:", " ".encode('utf-8'))  # Should be b' '
print("Space as latin1:", " ".encode('latin1'))  # Should be b' '

# Test different space characters
print("\nTokenizer tests:")
print('ASCII space (32):', tokenizer.encode(" ").ids)
print('NBSP (160):', tokenizer.encode("\u00A0"))
print('Raw byte 32:', tokenizer.encode(bytes([32]).decode('latin1')))

# Look at normalizer config
print("\nTokenizer config:")

# Try encoding a string with spaces
print("\nString with spaces:")
test = "a b c"
print("String:", repr(test))
print("Encoded:", tokenizer.encode(test).ids)

Raw space char code: 32
Space as bytes: b' '
Space as latin1: b' '

Tokenizer tests:
ASCII space (32): [32]
NBSP (160): Encoding(num_tokens=1, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])
Raw byte 32: Encoding(num_tokens=1, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])

Tokenizer config:

String with spaces:
String: 'a b c'
Encoded: [97, 32, 98, 32, 99]


In [53]:
# Compare AutoTokenizer vs PreTrainedTokenizerFast
from transformers import AutoTokenizer, PreTrainedTokenizerFast

auto = AutoTokenizer.from_pretrained("../checkpoints/smoltts_byte")
fast = PreTrainedTokenizerFast.from_pretrained("../checkpoints/smoltts_byte")

test = "a, b"
print("AutoTokenizer config:")
print("Type:", type(auto))
print("Normalizer:", auto.backend_tokenizer.normalizer)
print("Pre-tokenizer:", auto.backend_tokenizer.pre_tokenizer)
print("Post-processor:", auto.backend_tokenizer.post_processor)

print("\nPreTrainedTokenizerFast config:")
print("Type:", type(fast))
print("Normalizer:", fast.backend_tokenizer.normalizer)
print("Pre-tokenizer:", fast.backend_tokenizer.pre_tokenizer)
print("Post-processor:", fast.backend_tokenizer.post_processor)

print("\nEncoding tests:")
print("Auto:", auto.encode(test))
print("Fast:", fast.encode(test))

# Check what tokenizer_config.json looks like
import json
with open("../checkpoints/smoltts_byte/tokenizer_config.json") as f:
    config = json.load(f)
print("\nTokenizer config file:")
print(json.dumps(config, indent=2))

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


AutoTokenizer config:
Type: <class 'transformers.tokenization_utils_fast.PreTrainedTokenizerFast'>
Normalizer: Sequence(normalizers=[])
Pre-tokenizer: ByteLevel(add_prefix_space=False, trim_offsets=True, use_regex=True)
Post-processor: None

PreTrainedTokenizerFast config:
Type: <class 'transformers.tokenization_utils_fast.PreTrainedTokenizerFast'>
Normalizer: Sequence(normalizers=[])
Pre-tokenizer: ByteLevel(add_prefix_space=False, trim_offsets=True, use_regex=True)
Post-processor: None

Encoding tests:
Auto: [97, 44, 98]
Fast: [97, 44, 98]

Tokenizer config file:
{
  "added_tokens_decoder": {
    "256": {
      "content": "system",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "257": {
      "content": "user",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "258": {
      "content": "assistant",
      "lstrip": f

In [72]:
tokens = [265, 256, 10, 83, 112, 101, 97, 107, 32, 111, 117, 116, 32, 116, 104, 101, 32, 112, 114, 111, 118, 105, 100, 101, 100, 32, 116, 101, 120, 116, 266, 265, 257, 10, 116, 101, 115, 116, 266, 265, 258, 10]
tokenizer.decode(tokens)

'\nSpeak out the provided text\ntest\n'

In [74]:
tokenizer.decode([265])

''