# Tokenizers

working with meta-llama/Llama-2-7b-chat-hf (and instruct variants) plus other models.

In [6]:
import os
from dotenv import load_dotenv
import torch
from huggingface_hub import login
from transformers import AutoTokenizer

In [7]:
# Get tokens from environment variables
hf_token = os.environ.get("HF_TOKEN")
openai_api_key = os.environ.get("OPENAI_API_KEY")

In [8]:
load_dotenv()
openai_api_key = os.getenv('OPENAI_API_KEY')
hf_token = os.getenv('HF_TOKEN')
login(hf_token, add_to_git_credential=True)

Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


In [9]:
device = "cuda" if torch.cuda.is_available() else "cpu"

In [10]:
llama_model = "meta-llama/Llama-2-7b-hf"
tokenizer = AutoTokenizer.from_pretrained(llama_model, trust_remote_code=True)
sample_text = "I am excited to show Tokenizers in action to my LLM engineers"
tokens = tokenizer.encode(sample_text)
print("Tokens:", tokens)
print("Decoded:", tokenizer.decode(tokens))
print("Token Count:", len(tokens))
print("Batch Decoded:", tokenizer.batch_decode(tokens))

Tokens: [1, 306, 626, 24173, 304, 1510, 25159, 19427, 297, 3158, 304, 590, 365, 26369, 6012, 414]
Decoded: <s> I am excited to show Tokenizers in action to my LLM engineers
Token Count: 16
Batch Decoded: ['<s>', 'I', 'am', 'excited', 'to', 'show', 'Token', 'izers', 'in', 'action', 'to', 'my', 'L', 'LM', 'engine', 'ers']


In [12]:
instruct_model = "meta-llama/Llama-2-7b-chat-hf"
tokenizer_instruct = AutoTokenizer.from_pretrained(instruct_model, trust_remote_code=True)
messages = [
    {"role": "system", "content": "You are a helpful assistant"},
    {"role": "user", "content": "Tell a light-hearted joke for a room of Data Scientists"}
]
prompt = tokenizer_instruct.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
print("Chat prompt template:\n", prompt)


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


tokenizer_config.json:   0%|          | 0.00/1.62k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

Chat prompt template:
 <s>[INST] <<SYS>>
You are a helpful assistant
<</SYS>>

Tell a light-hearted joke for a room of Data Scientists [/INST]


In [13]:
# (c) Using alternative models: Phi3, Qwen2, Starcoder2
PHI3_MODEL_NAME = "microsoft/Phi-3-mini-4k-instruct"
QWEN2_MODEL_NAME = "Qwen/Qwen2-7B-Instruct"
STARCODER2_MODEL_NAME = "bigcode/starcoder2-3b"


In [14]:
phi3_tokenizer = AutoTokenizer.from_pretrained(PHI3_MODEL_NAME)
qwen2_tokenizer = AutoTokenizer.from_pretrained(QWEN2_MODEL_NAME)
starcoder2_tokenizer = AutoTokenizer.from_pretrained(STARCODER2_MODEL_NAME, trust_remote_code=True)


tokenizer_config.json:   0%|          | 0.00/3.44k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.94M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/306 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/599 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.29k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/7.03M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/7.88k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/777k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/442k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.06M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/958 [00:00<?, ?B/s]

In [15]:
print("Meta-Llama tokens:", tokenizer.encode(sample_text))
print("Phi3 tokens:", phi3_tokenizer.encode(sample_text))
print("Qwen2 tokens:", qwen2_tokenizer.encode(sample_text))
print("Chat prompt (Phi3):", phi3_tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True))
print("Chat prompt (Qwen2):", qwen2_tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True))



Meta-Llama tokens: [1, 306, 626, 24173, 304, 1510, 25159, 19427, 297, 3158, 304, 590, 365, 26369, 6012, 414]
Phi3 tokens: [306, 626, 24173, 304, 1510, 25159, 19427, 297, 3158, 304, 590, 365, 26369, 6012, 414]
Qwen2 tokens: [40, 1079, 12035, 311, 1473, 9660, 12230, 304, 1917, 311, 847, 444, 10994, 24198]
Chat prompt (Phi3): <|system|>
You are a helpful assistant<|end|>
<|user|>
Tell a light-hearted joke for a room of Data Scientists<|end|>
<|assistant|>

Chat prompt (Qwen2): <|im_start|>system
You are a helpful assistant<|im_end|>
<|im_start|>user
Tell a light-hearted joke for a room of Data Scientists<|im_end|>
<|im_start|>assistant



In [16]:
sample_code = """
def hello_world(person):
    print("Hello", person)
"""
tokens_code = starcoder2_tokenizer.encode(sample_code)
print("Starcoder2 tokens for code:")
for token in tokens_code:
    print(f"{token} = {starcoder2_tokenizer.decode(token)}")


Starcoder2 tokens for code:
222 = 

610 = def
17966 =  hello
100 = _
5879 = world
45 = (
6427 = person
731 = ):
303 = 
   
1489 =  print
459 = ("
8302 = Hello
411 = ",
4944 =  person
46 = )
222 = 

