## Tokenizers

In [None]:
from huggingface_hub import login
from transformers import AutoTokenizer
import os
from dotenv import load_dotenv

In [None]:
# Read variables from a local .env file into the process environment.
load_dotenv(override=True)

# Authenticate against the Hugging Face Hub with the token found in HF_TOKEN.
hf_token = os.environ.get("HF_TOKEN")
login(token=hf_token)

## Accessing Llama 3.1 from Meta

In [None]:
# Hub repo id of the base (non-instruct) Llama 3.1 8B model, written with the
# casing used by the published repository name.
model = 'meta-llama/Meta-Llama-3.1-8B'
# The Llama tokenizer is implemented inside transformers itself, so no code
# from the model repo needs to run; trust_remote_code stays at its safe default.
tokenizer = AutoTokenizer.from_pretrained(model)

In [None]:
# Sample sentence used to demonstrate the tokenizer.
text = "I am excited to show tokenizers in action to my llm engineers"
# encode() converts the text into a list of integer token ids.
tokens = tokenizer.encode(text)
# Bare expression on the last line so the notebook displays the ids.
tokens

In [None]:
# Number of token ids produced for the sample sentence.
len(tokens)

In [None]:
# Convert the full list of token ids back into a single string.
tokenizer.decode(tokens)

In [None]:
# Handing the flat list of ids to batch_decode decodes each id on its own,
# which shows the text fragment behind every individual token.
tokenizer.batch_decode(tokens)

In [None]:
# Get the tokens that were added on top of the tokenizer's base vocabulary.
added_vocab = tokenizer.get_added_vocab()
print("Added vocabulary:", added_vocab)
# Explanation:
# get_added_vocab() returns a dictionary that maps each added token to its index.
# Added tokens are entries registered in addition to the base vocabulary, such as
# special tokens or domain-specific tokens introduced when customizing a tokenizer.

## Instruct variants of models

In [None]:
# Hub repo id of the instruction-tuned Llama 3.1 8B model, written with the
# casing used by the published repository name.
model = 'meta-llama/Meta-Llama-3.1-8B-Instruct'
# The Llama tokenizer is implemented inside transformers itself, so no code
# from the model repo needs to run; trust_remote_code stays at its safe default.
tokenizer = AutoTokenizer.from_pretrained(model)

In [None]:
# Conversation in the role/content message format expected by chat templates.
system_message = {"role": "system", "content": "You are a helpful assistant"}
user_message = {
    "role": "user",
    "content": "Tell a light-hearted joke for a room of Data Scientist",
}
messages = [system_message, user_message]

# Render the conversation as prompt text (not token ids) and append the
# marker that starts the assistant's turn.
prompt = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True,
)
print(prompt)

## Trying new models

In [None]:
# Hub repo ids of the additional models whose tokenizers are compared below.
PHI3_MODEL_NAME = "microsoft/Phi-3-mini-4k-instruct"
QWEN2_MODEL_NAME = "Qwen/Qwen2-7B-Instruct"
STARCODER2_MODEL_NAME = "bigcode/starcoder2-3b"

In [None]:
# Load the Phi-3 tokenizer to compare it with the Llama tokenizer.
phi3_tokenizer = AutoTokenizer.from_pretrained(PHI3_MODEL_NAME)

text = "I am excited to show Tokenizers in action to my llm engineers"

# Token ids from the Llama tokenizer, followed by a blank separator line.
llama_ids = tokenizer.encode(text)
print(llama_ids, end="\n\n")

# Phi-3 encoding of the same text, shown as the text of each individual token.
tokens = phi3_tokenizer.encode(text)
phi3_pieces = phi3_tokenizer.batch_decode(tokens)
print(phi3_pieces)

In [None]:
# Render the same conversation with each model's chat template. tokenize=False
# makes apply_chat_template return the formatted prompt text instead of token
# ids, so both outputs can be compared as strings.
print(tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True))
print()
print(phi3_tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True))

In [None]:
# Load the Qwen2 tokenizer so all three tokenizers can be compared.
qwen2_tokenizer = AutoTokenizer.from_pretrained(QWEN2_MODEL_NAME)

text = "I am excited to show Tokenizers in action to my LLM engineers"

# Encode the same sentence with every tokenizer (Llama, Phi-3, Qwen2) and
# print the id lists with a blank line between them.
encodings = [
    tok.encode(text)
    for tok in (tokenizer, phi3_tokenizer, qwen2_tokenizer)
]
print(*encodings, sep="\n\n")


In [None]:
# Render the conversation with each model's chat template (Llama, Phi-3,
# Qwen2) as prompt text and print the results separated by blank lines.
rendered_prompts = [
    tok.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    for tok in (tokenizer, phi3_tokenizer, qwen2_tokenizer)
]
print(*rendered_prompts, sep="\n\n")

In [None]:
# Load the StarCoder2 tokenizer and try it on a small piece of source code.
# Its tokenizer class ships with transformers, so no code from the model repo
# needs to run; trust_remote_code stays at its safe default.
starcoder2_tokenizer = AutoTokenizer.from_pretrained(STARCODER2_MODEL_NAME)
code = """
def hello_world(person):
  print("Hello", person)
"""
tokens = starcoder2_tokenizer.encode(code)
# Print every token id next to the text it decodes to.
for token in tokens:
    print(f"{token}={starcoder2_tokenizer.decode(token)}")