# Tokenizers

For this I explore the world of Tokenizers


In [1]:
from huggingface_hub import login
from transformers import AutoTokenizer
import os
from dotenv import load_dotenv
from huggingface_hub import login

In [2]:
# Load the .env file
load_dotenv()

# Read and login with the token
hf_token = os.getenv("HF_TOKEN")
if hf_token is None:
    raise ValueError("HF_TOKEN not found in .env file")
login(hf_token)

Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


# Accessing Llama 3.1 from Meta

In order to use the fantastic Llama 3.1, Meta does require you to sign their terms of service.

Visit their model instructions page in Hugging Face:
https://huggingface.co/meta-llama/Meta-Llama-3.1-8B

At the top of the page are instructions on how to agree to their terms. If possible, you should use the same email as your huggingface account.

If the next cell gives you an error, then please check:  
1. Are you logged in to HuggingFace? Try running `login()` to check your key works
2. Did you set up your API key with full read and write permissions?
3. If you visit the Llama3.1 page with the link above, does it show that you have access to the model near the top?


In [3]:
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("text-generation", model="meta-llama/Llama-3.1-8B")

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Device set to use cpu


In [4]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.1-8B")
model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-3.1-8B")

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [5]:
text = "I’m genuinely thrilled to demonstrate Tokenizers in action for our LLM engineering team, highlighting how they " \
        "streamline text processing, optimize our training pipelines, and unlock superior model performance."

tokens = tokenizer.encode(text)
tokens

[128000,
 40,
 4344,
 36297,
 38564,
 311,
 20461,
 9857,
 12509,
 304,
 1957,
 369,
 1057,
 445,
 11237,
 15009,
 2128,
 11,
 39686,
 1268,
 814,
 82703,
 1495,
 8863,
 11,
 30536,
 1057,
 4967,
 58773,
 11,
 323,
 15405,
 16757,
 1646,
 5178,
 13]

In [6]:
len(tokens)

36

In [7]:
tokenizer.decode(tokens)

'<|begin_of_text|>I’m genuinely thrilled to demonstrate Tokenizers in action for our LLM engineering team, highlighting how they streamline text processing, optimize our training pipelines, and unlock superior model performance.'

In [8]:
tokenizer.batch_decode(tokens)

['<|begin_of_text|>',
 'I',
 '’m',
 ' genuinely',
 ' thrilled',
 ' to',
 ' demonstrate',
 ' Token',
 'izers',
 ' in',
 ' action',
 ' for',
 ' our',
 ' L',
 'LM',
 ' engineering',
 ' team',
 ',',
 ' highlighting',
 ' how',
 ' they',
 ' streamline',
 ' text',
 ' processing',
 ',',
 ' optimize',
 ' our',
 ' training',
 ' pipelines',
 ',',
 ' and',
 ' unlock',
 ' superior',
 ' model',
 ' performance',
 '.']

In [9]:
tokenizer.vocab

{'-on': 10539,
 'Ġwriteln': 82047,
 'Plans': 98828,
 'atomic': 6756,
 'Ġphon': 51923,
 '_deep': 88144,
 'Å¡it': 110255,
 'à¸µà¸¢à¸£à¸ķ': 126525,
 '=device': 20297,
 'łĢ': 64319,
 'ĠCharSequence': 58557,
 'thane': 86185,
 'Ġwalls': 14620,
 ']));čĊ': 62121,
 'Boom': 95334,
 'Ġfemmes': 31656,
 'Structure': 23807,
 'åħ·æľī': 110667,
 'Ġrhetoric': 34731,
 'Ġbeats': 34427,
 'uer': 8977,
 'Ġpremiere': 35952,
 'PostExecute': 66546,
 'Ġháº¥p': 111961,
 'ĠsÃ¸': 31430,
 'establish': 34500,
 'Ġblocking': 22978,
 'ĠTal': 18051,
 'ACHED': 53745,
 'Ġ#-': 31007,
 'Ġpaginate': 83074,
 'Ġpositioning': 39825,
 'atasets': 77749,
 'ï¾Ħ': 105225,
 '_MASTER': 48168,
 '.bpm': 94739,
 'SEO': 63879,
 'quential': 16216,
 'Ġpatched': 72832,
 'Ġolder': 9191,
 'Ġrgba': 24431,
 'ĠFirearms': 83617,
 'Ġobjs': 54037,
 'Ġimaginable': 90267,
 'à¸¥à¸Ńà¸ĩ': 112751,
 'ï»¿Ċ': 62619,
 'gain': 60246,
 '_descriptor': 34874,
 'loit': 79649,
 'Parcelable': 68367,
 '_wp': 40387,
 'lasÄ±': 114973,
 'ransition': 86054,
 'anker': 993

In [10]:
tokenizer.get_added_vocab()

{'<|begin_of_text|>': 128000,
 '<|end_of_text|>': 128001,
 '<|reserved_special_token_0|>': 128002,
 '<|reserved_special_token_1|>': 128003,
 '<|finetune_right_pad_id|>': 128004,
 '<|reserved_special_token_2|>': 128005,
 '<|start_header_id|>': 128006,
 '<|end_header_id|>': 128007,
 '<|eom_id|>': 128008,
 '<|eot_id|>': 128009,
 '<|python_tag|>': 128010,
 '<|reserved_special_token_3|>': 128011,
 '<|reserved_special_token_4|>': 128012,
 '<|reserved_special_token_5|>': 128013,
 '<|reserved_special_token_6|>': 128014,
 '<|reserved_special_token_7|>': 128015,
 '<|reserved_special_token_8|>': 128016,
 '<|reserved_special_token_9|>': 128017,
 '<|reserved_special_token_10|>': 128018,
 '<|reserved_special_token_11|>': 128019,
 '<|reserved_special_token_12|>': 128020,
 '<|reserved_special_token_13|>': 128021,
 '<|reserved_special_token_14|>': 128022,
 '<|reserved_special_token_15|>': 128023,
 '<|reserved_special_token_16|>': 128024,
 '<|reserved_special_token_17|>': 128025,
 '<|reserved_special_to

# Instruct variants of models

Many models offer a chat-optimized variant—typically suffixed with “Instruct”—that’s been trained to handle prompts structured into system, user, and assistant roles. To streamline preparing these inputs, you can call the apply_chat_template utility, which takes our familiar messages list and transforms it into the exact prompt format the model expects.

In [11]:
tokenizer = AutoTokenizer.from_pretrained('meta-llama/Meta-Llama-3.1-8B-Instruct', trust_remote_code=True)

In [12]:

messages = [
    {
        "role": "system",
        "content": (
            "You are a friendly, expert AI assistant with deep knowledge of data science "
            "and machine learning. You explain technical ideas clearly and engagingly, "
            "and you can sprinkle in just the right amount of wit to keep your audience smiling."
        )
    },
    {
        "role": "user",
        "content": (
            "Could you please share a light-hearted, data-science-themed joke for a room full of "
            "Data Scientists? Feel free to reference familiar tools, funny debugging moments, or "
            "common algorithmic headaches to make it extra relatable and entertaining."
        )
    }
]

prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
print(prompt)

<|begin_of_text|><|start_header_id|>system<|end_header_id|>

Cutting Knowledge Date: December 2023
Today Date: 26 Jul 2024

You are a friendly, expert AI assistant with deep knowledge of data science and machine learning. You explain technical ideas clearly and engagingly, and you can sprinkle in just the right amount of wit to keep your audience smiling.<|eot_id|><|start_header_id|>user<|end_header_id|>

Could you please share a light-hearted, data-science-themed joke for a room full of Data Scientists? Feel free to reference familiar tools, funny debugging moments, or common algorithmic headaches to make it extra relatable and entertaining.<|eot_id|><|start_header_id|>assistant<|end_header_id|>




# Trying new models

Three new models:

- Phi3 from Microsoft
- Qwen2 from Alibaba Cloud
- Starcoder2 from BigCode (ServiceNow + HuggingFace + NVidia)

In [13]:
PHI3_MODEL_NAME = "microsoft/Phi-3-mini-4k-instruct"
QWEN2_MODEL_NAME = "Qwen/Qwen2-7B-Instruct"
STARCODER2_MODEL_NAME = "bigcode/starcoder2-3b"

In [14]:
phi3_tokenizer = AutoTokenizer.from_pretrained(PHI3_MODEL_NAME)

text = "I’m genuinely thrilled to demonstrate Tokenizers in action for our LLM engineering team, highlighting how they " \
        "streamline text processing, optimize our training pipelines, and unlock superior model performance."
print(tokenizer.encode(text))
print()
tokens = phi3_tokenizer.encode(text)
print(phi3_tokenizer.batch_decode(tokens))


[128000, 40, 4344, 36297, 38564, 311, 20461, 9857, 12509, 304, 1957, 369, 1057, 445, 11237, 15009, 2128, 11, 39686, 1268, 814, 82703, 1495, 8863, 11, 30536, 1057, 4967, 58773, 11, 323, 15405, 16757, 1646, 5178, 13]

['I', '’', 'm', 'genu', 'in', 'ely', 'thr', 'illed', 'to', 'demonstrate', 'Token', 'izers', 'in', 'action', 'for', 'our', 'L', 'LM', 'engineering', 'team', ',', 'highlight', 'ing', 'how', 'they', 'stream', 'line', 'text', 'processing', ',', 'optimize', 'our', 'training', 'pip', 'elines', ',', 'and', 'un', 'lock', 'superior', 'model', 'performance', '.']


In [15]:
print(tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True))
print()
print(phi3_tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True))

<|begin_of_text|><|start_header_id|>system<|end_header_id|>

Cutting Knowledge Date: December 2023
Today Date: 26 Jul 2024

You are a friendly, expert AI assistant with deep knowledge of data science and machine learning. You explain technical ideas clearly and engagingly, and you can sprinkle in just the right amount of wit to keep your audience smiling.<|eot_id|><|start_header_id|>user<|end_header_id|>

Could you please share a light-hearted, data-science-themed joke for a room full of Data Scientists? Feel free to reference familiar tools, funny debugging moments, or common algorithmic headaches to make it extra relatable and entertaining.<|eot_id|><|start_header_id|>assistant<|end_header_id|>



<|system|>
You are a friendly, expert AI assistant with deep knowledge of data science and machine learning. You explain technical ideas clearly and engagingly, and you can sprinkle in just the right amount of wit to keep your audience smiling.<|end|>
<|user|>
Could you please share a light

In [16]:
qwen2_tokenizer = AutoTokenizer.from_pretrained(QWEN2_MODEL_NAME)

text = "I’m genuinely thrilled to demonstrate Tokenizers in action for our LLM engineering team, highlighting how they " \
        "streamline text processing, optimize our training pipelines, and unlock superior model performance."
print(tokenizer.encode(text))
print()
print(phi3_tokenizer.encode(text))
print()
print(qwen2_tokenizer.encode(text))

[128000, 40, 4344, 36297, 38564, 311, 20461, 9857, 12509, 304, 1957, 369, 1057, 445, 11237, 15009, 2128, 11, 39686, 1268, 814, 82703, 1495, 8863, 11, 30536, 1057, 4967, 58773, 11, 323, 15405, 16757, 1646, 5178, 13]

[306, 30010, 29885, 29120, 262, 873, 1468, 24455, 304, 22222, 25159, 19427, 297, 3158, 363, 1749, 365, 26369, 21639, 3815, 29892, 12141, 292, 920, 896, 4840, 1220, 1426, 9068, 29892, 24656, 1749, 6694, 8450, 24210, 29892, 322, 443, 908, 11558, 1904, 4180, 29889]

[40, 4249, 35197, 37464, 311, 19869, 9660, 12230, 304, 1917, 369, 1039, 444, 10994, 14667, 2083, 11, 38586, 1246, 807, 81603, 1467, 8692, 11, 29436, 1039, 4862, 57673, 11, 323, 15055, 16353, 1614, 5068, 13]


In [17]:
print(tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True))
print()
print(phi3_tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True))
print()
print(qwen2_tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True))

<|begin_of_text|><|start_header_id|>system<|end_header_id|>

Cutting Knowledge Date: December 2023
Today Date: 26 Jul 2024

You are a friendly, expert AI assistant with deep knowledge of data science and machine learning. You explain technical ideas clearly and engagingly, and you can sprinkle in just the right amount of wit to keep your audience smiling.<|eot_id|><|start_header_id|>user<|end_header_id|>

Could you please share a light-hearted, data-science-themed joke for a room full of Data Scientists? Feel free to reference familiar tools, funny debugging moments, or common algorithmic headaches to make it extra relatable and entertaining.<|eot_id|><|start_header_id|>assistant<|end_header_id|>



<|system|>
You are a friendly, expert AI assistant with deep knowledge of data science and machine learning. You explain technical ideas clearly and engagingly, and you can sprinkle in just the right amount of wit to keep your audience smiling.<|end|>
<|user|>
Could you please share a light

In [18]:
starcoder2_tokenizer = AutoTokenizer.from_pretrained(STARCODER2_MODEL_NAME, trust_remote_code=True)
code = """
def hello_world(person: str) -> None:
    \"\"\"Print a warm, personalized greeting for the given person.

    Args:
        person (str): The name of the person to greet.

    Raises:
        TypeError: If `person` is not a string.

    Example:
        >>> hello_world("Alex")
        Hello, Alex! Hope you’re having a fantastic day.
    \"\"\"
    if not isinstance(person, str):
        raise TypeError(f"Expected 'person' to be a str, but got {type(person).__name__}")
    
    # Construct and display the greeting
    greeting = f"Hello, {person}! Hope you’re having a fantastic day."
    print(greeting)
"""

tokens = starcoder2_tokenizer.encode(code)
for token in tokens:
  print(f"{token}={starcoder2_tokenizer.decode(token)}")

222=

610=def
17966= hello
100=_
5879=world
45=(
6427=person
63=:
615= str
46=)
984= ->
1686= None
63=:
303=
   
1547= """
4026=Print
331= a
33027= warm
49=,
12285= personal
1209=ized
504= g
32648=reeting
456= for
341= the
2716= given
4944= person
51=.
465=

   
10485= Args
63=:
310=
       
4944= person
327= (
484=str
731=):
906= The
655= name
451= of
341= the
4944= person
391= to
504= g
7111=reet
51=.
465=

   
34911= Raises
63=:
310=
       
12726= TypeError
63=:
1691= If
548= `
6427=person
101=`
458= is
666= not
331= a
821= string
51=.
465=

   
5958= Example
63=:
310=
       
8257= >>>
17966= hello
100=_
5879=world
459=("
27414=Alex
678=")
310=
       
12009= Hello
49=,
21199= Alex
38=!
36744= Hope
863= you
2754=’
287=re
7712= having
331= a
315= f
710=ant
6452=astic
5830= day
51=.
303=
   
1547= """
303=
   
434= if
666= not
6968= isinstance
45=(
6427=person
49=,
615= str
731=):
310=
       
3655= raise
12726= TypeError
45=(
107=f
39="
7705=Expected
349= '
6427=person
44='
391= to