# Create LM initialization for DualAR transformer

As of 2024-12-30 we're using Huggingface [SmolLM2-135M](https://huggingface.co/HuggingFaceTB/SmolLM2-135M-Instruct) for pretrained LM initialization. However, it needs some minor formatting changes to work with the Fish Speech / fish_speech.rs format.

In [3]:
import os
from transformers import AutoModelForCausalLM, AutoTokenizer

MODEL = "HuggingFaceTB/SmolLM2-135M-Instruct"
checkpoint_dir = "../checkpoints"
os.makedirs(checkpoint_dir, exist_ok=True)
checkpoint_pretrained_dir = f"../checkpoints/{MODEL.split('/')[-1]}"
os.makedirs(checkpoint_pretrained_dir, exist_ok=True)

# Step (b): Download the HuggingFace model and save to ../checkpoints
model_name = "HuggingFaceTB/SmolLM2-135M-Instruct"

print("Downloading model...")
# Load the model and tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

Downloading model...


In [4]:
print(f"Saving model to {checkpoint_dir}...")
model.save_pretrained(checkpoint_pretrained_dir)
tokenizer.save_pretrained(checkpoint_pretrained_dir)

print("Model downloaded and saved successfully!")

Saving model to ../checkpoints...
Model downloaded and saved successfully!


In [7]:
from safetensors.torch import load_file

tensors = load_file("../checkpoints/SmolLM2-135M-Instruct/model.safetensors")
list(tensors.keys())

Unfortunately the [Fish Speech](https://github.com/fishaudio/fish-speech) DualAR backbone has different weight keys despite being vanilla Llama 3 architecture, so we have to rename them:

In [9]:
renamed_tensors = {
    key.replace('model.embed_tokens', 'model.embeddings')
       .replace('self_attn', 'attention')
       .replace('post_attention_layernorm', 'attention_norm')
       .replace('input_layernorm', 'ffn_norm')
       .replace('mlp', 'feed_forward')
       .replace('k_proj', 'wk')
       .replace('q_proj', 'wq')
       .replace('v_proj', 'wv')
       .replace('o_proj', 'wo')
       .replace('gate_proj', 'w1')
       .replace('down_proj', 'w2')
       .replace('up_proj', 'w3')
       .split('model.')[1]: tensor 
    for key, tensor in tensors.items()
}
list(renamed_tensors.keys())

['embeddings.weight',
 'layers.0.ffn_norm.weight',
 'layers.0.feed_forward.w2.weight',
 'layers.0.feed_forward.w1.weight',
 'layers.0.feed_forward.w3.weight',
 'layers.0.attention_norm.weight',
 'layers.0.attention.wk.weight',
 'layers.0.attention.wo.weight',
 'layers.0.attention.wq.weight',
 'layers.0.attention.wv.weight',
 'layers.1.ffn_norm.weight',
 'layers.1.feed_forward.w2.weight',
 'layers.1.feed_forward.w1.weight',
 'layers.1.feed_forward.w3.weight',
 'layers.1.attention_norm.weight',
 'layers.1.attention.wk.weight',
 'layers.1.attention.wo.weight',
 'layers.1.attention.wq.weight',
 'layers.1.attention.wv.weight',
 'layers.10.ffn_norm.weight',
 'layers.10.feed_forward.w2.weight',
 'layers.10.feed_forward.w1.weight',
 'layers.10.feed_forward.w3.weight',
 'layers.10.attention_norm.weight',
 'layers.10.attention.wk.weight',
 'layers.10.attention.wo.weight',
 'layers.10.attention.wq.weight',
 'layers.10.attention.wv.weight',
 'layers.11.ffn_norm.weight',
 'layers.11.feed_forward.w2

Following existing literature, we initialize the semantic codebook embedding embeddings from the mean of the existing token embeddings, to lower the initial loss from random init. Empirically this lowers base loss from 140 to 25 at the beginning of training, which though still far above `ln(52000)=10` for the base is good enough.

In [10]:
import torch

new_tokens = renamed_tensors['embeddings.weight'].mean(dim=0, keepdim=True).repeat(2048, 1)
# nn.Embedding(2048, 576)
extended_embeddings = torch.cat([
    renamed_tensors['embeddings.weight'],
    new_tokens
], dim=0)

renamed_tensors['embeddings.weight'] = extended_embeddings
renamed_tensors['embeddings.weight'].shape

torch.Size([51200, 576])

In [18]:
import torch
import shutil
from pathlib import Path

source_dir = Path(checkpoint_pretrained_dir)
dest_dir = Path("../checkpoints/smoltts_init")

os.makedirs(dest_dir, exist_ok=True)
torch.save(renamed_tensors, dest_dir / "model.pth")


# Ensure the destination directory exists
dest_dir.mkdir(parents=True, exist_ok=True)

# Copy all .json and .txt files
for extension in ("*.json", "*.txt"):
    for file in source_dir.glob(extension):
        shutil.copy(file, dest_dir)

Fish Speech uses a different config format than HF Transformers, so I'm going to define it by fiat here.

In [24]:
import json

with open(dest_dir / "config.json") as f:
    hf_config = json.load(f)

# Mimi codebook dimension
CODEBOOK_SIZE = 2048

config = {
    "attention_qkv_bias": False,
    "codebook_size": CODEBOOK_SIZE,
    "dim": hf_config["hidden_size"],
    "dropout": 0.1,
    "fast_attention_qkv_bias": False,
    # TODO: Following Fish Speech, keeping fast layer dimensions the same for now. May revisit this later
    "fast_dim": hf_config["hidden_size"],
    "fast_head_dim": hf_config["head_dim"],
    "fast_intermediate_size": hf_config["intermediate_size"],
    "fast_n_head": hf_config["num_attention_heads"],
    "fast_n_local_heads": hf_config["num_key_value_heads"],
    "head_dim": hf_config["head_dim"],
    "initializer_range": hf_config["initializer_range"],
    "intermediate_size": hf_config["intermediate_size"],
    "is_reward_model": False,
    "max_seq_len": hf_config["max_position_embeddings"],
    "model_type": "dual_ar",
    # TODO: Following Fish Speech for now
    "n_fast_layer": 4,
    "n_head": hf_config["num_attention_heads"],
    "n_local_heads": hf_config["num_key_value_heads"],
    "norm_eps": hf_config["rms_norm_eps"],
    # Mimi
    "num_codebooks": 8,
    "rope_base": hf_config["rope_theta"],
    "scale_codebook_embeddings": False,
    "share_codebook_embeddings": True,
    "tie_word_embeddings": hf_config["tie_word_embeddings"],
    "use_gradient_checkpointing": True,
    # TODO: handle control tokens
    "vocab_size": hf_config["vocab_size"] + CODEBOOK_SIZE
}

config

{'attention_qkv_bias': False,
 'codebook_size': 2048,
 'dim': 576,
 'dropout': 0.1,
 'fast_attention_qkv_bias': False,
 'fast_dim': 576,
 'fast_head_dim': 64,
 'fast_intermediate_size': 1536,
 'fast_n_head': 9,
 'fast_n_local_heads': 3,
 'head_dim': 64,
 'initializer_range': 0.041666666666666664,
 'intermediate_size': 1536,
 'is_reward_model': False,
 'max_seq_len': 8192,
 'model_type': 'dual_ar',
 'n_fast_layer': 4,
 'n_head': 9,
 'n_local_heads': 3,
 'norm_eps': 1e-05,
 'num_codebooks': 8,
 'rope_base': 100000,
 'scale_codebook_embeddings': False,
 'share_codebook_embeddings': True,
 'tie_word_embeddings': True,
 'use_gradient_checkpointing': True,
 'vocab_size': 51200}

In [25]:
output_path = dest_dir / "config.json"
with output_path.open('w') as f:
    json.dump(config, f, indent=2)

Our model now must:
- Randomly initialize the fast transformer
- Merge attention qkv into a single tensor (to save on kernel launch overhead and improve hardware utilization) 

The DualARTransformer modeling code will do this, but we need to load the model once.

TODO: find more principled initialization strategies!

In [26]:
from dual_ar.model.dual_ar import DualARTransformer

model = DualARTransformer.from_pretrained(
    path="../checkpoints/smoltts_init",
    load_weights=True
)

Token indices sequence length is longer than the specified maximum sequence length for this model (19371 > 2048). Running this sequence through the model will result in indexing errors


Loading model from ../checkpoints/smoltts_init, config: DualARModelArgs(model_type='dual_ar', vocab_size=51200, n_layer=32, n_head=9, dim=576, intermediate_size=1536, n_local_heads=3, head_dim=64, rope_base=100000, norm_eps=1e-05, max_seq_len=8192, dropout=0.1, tie_word_embeddings=True, attention_qkv_bias=False, codebook_size=2048, num_codebooks=8, use_gradient_checkpointing=True, initializer_range=0.041666666666666664, is_reward_model=False, share_codebook_embeddings=True, scale_codebook_embeddings=False, n_fast_layer=4, fast_dim=576, fast_n_head=9, fast_n_local_heads=3, fast_head_dim=64, fast_intermediate_size=1536, fast_attention_qkv_bias=False)
No weight for codebook_embeddings.weight
No weight for layers.0.attention.wqkv.weight
No weight for layers.1.attention.wqkv.weight
No weight for layers.2.attention.wqkv.weight
No weight for layers.3.attention.wqkv.weight
No weight for layers.4.attention.wqkv.weight
No weight for layers.5.attention.wqkv.weight
No weight for layers.6.attention

In [27]:
state_dict = model.state_dict()
torch.save(state_dict, dest_dir / "model.pth")

We're now done with modeling code. Now we need to extend the tokenizer to handle semantic tokens.

TODO: Add control / modality tokens, PAD / EPAD and do ablations!

In [29]:
def make_tokenizer():
    tokenizer = AutoTokenizer.from_pretrained(MODEL, use_system_prompt=False)
    semantic_tokens = [f"<|semantic:{i}|>" for i in range(0, CODEBOOK_SIZE)]
    additional_special_tokens = [*semantic_tokens]
    tokenizer.add_special_tokens({
        "additional_special_tokens": additional_special_tokens
    })
    # Remove inane overly clever chat template
    if MODEL == "HuggingFaceTB/SmolLM2-135M-Instruct":
        tokenizer.chat_template = "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}"
    
    tokenizer.save_pretrained(dest_dir)

make_tokenizer()

All done!

## Optional: test model works

In [2]:
import torch

device = "cuda"
model = model.to(device)
model = model.to(torch.bfloat16)

In [3]:
import os
os.environ["TOKENIZERS_PARALLELISM"] = "true"
model = torch.compile(model)

In [4]:
tensor = torch.zeros(1, 9, 1, dtype=torch.int32).to("cuda")
with torch.no_grad():
    out = model.forward(tensor, None)
    print(out.token_logits)



tensor([[[  3.5625,  -6.3438,  -5.9688,  ...,  -4.8438, -16.5000,   9.3125]]],
       device='cuda:0', dtype=torch.bfloat16)
