# 1 Imports

In [17]:
# Imports

import os
from dotenv import load_dotenv
from transformers import AutoTokenizer

# 2 Configuration

In [30]:
# Load environment variables in a file called .env

load_dotenv()
hf_token = os.getenv('HF_TOKEN')

# 3 Try-out

## 3.1 Tokenizer

In [19]:
#tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3.1-8B", trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained("google/gemma-7b", trust_remote_code=True)

In [20]:
text = "I am excited to show Tokenizers in action to my LLM engineers"
tokens = tokenizer.encode(text)
tokens

[2,
 235285,
 1144,
 12826,
 577,
 1500,
 20107,
 31250,
 575,
 3105,
 577,
 970,
 629,
 18622,
 26838]

## 3.2 Detokenizer

In [21]:
tokenizer.decode(tokens)

'<bos>I am excited to show Tokenizers in action to my LLM engineers'

In [22]:
tokenizer.batch_decode(tokens)

['<bos>',
 'I',
 ' am',
 ' excited',
 ' to',
 ' show',
 ' Token',
 'izers',
 ' in',
 ' action',
 ' to',
 ' my',
 ' L',
 'LM',
 ' engineers']

In [23]:
tokenizer.vocab

{'bbat': 130732,
 'bees': 82290,
 'Cw': 182462,
 '▁ਮ': 75614,
 '有何': 203929,
 'lge': 50006,
 'barrier': 97353,
 '▁felicidad': 92528,
 'NAR': 115911,
 '▁solch': 224866,
 '▁paja': 199480,
 'اليد': 93484,
 '▁rodeo': 129302,
 'ikan': 7573,
 '▁Blueberry': 160481,
 '▁Artists': 47863,
 'lege': 3931,
 '▁conectado': 137227,
 '▁moda': 17038,
 '▁風': 25232,
 'Bern': 37257,
 '楳': 250613,
 '▁economic': 6578,
 'Edition': 67527,
 '▁ensuring': 26936,
 'hiki': 181971,
 'poc': 103326,
 'Omega': 14731,
 'comet': 208767,
 '▁Kristin': 110560,
 '努力': 35429,
 '酸化': 204587,
 '▁murieron': 219449,
 'Hun': 88084,
 'jelen': 148153,
 'maschinen': 123775,
 'μέν': 64701,
 'టి': 90647,
 'maga': 53792,
 'டுத்த': 174298,
 '▁berubah': 96065,
 '▁alemanes': 223288,
 '▁eagles': 103058,
 '▁muñeca': 97172,
 '▁創': 128515,
 '▁Extreme': 43101,
 'ordnen': 219150,
 'Coeff': 150375,
 '▁Boyce': 167233,
 '▁günstig': 118337,
 'onora': 174679,
 '^{': 1235,
 'FS': 11103,
 'lma': 101463,
 'rosserie': 227472,
 '▁Nun': 36180,
 'ebenarnya':

In [24]:
tokenizer.get_added_vocab

<bound method PreTrainedTokenizerFast.get_added_vocab of GemmaTokenizerFast(name_or_path='google/gemma-7b', vocab_size=256000, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='left', truncation_side='right', special_tokens={'bos_token': '<bos>', 'eos_token': '<eos>', 'unk_token': '<unk>', 'pad_token': '<pad>', 'additional_special_tokens': ['<start_of_turn>', '<end_of_turn>']}, clean_up_tokenization_spaces=False, added_tokens_decoder={
	0: AddedToken("<pad>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	1: AddedToken("<eos>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	2: AddedToken("<bos>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	3: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	4: AddedToken("<mask>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
	5: AddedT

## 3.3 Instructed LLM

In [25]:
tokenizer = AutoTokenizer.from_pretrained("tiiuae/falcon-7b-instruct", trust_remote_code=True)


In [29]:
messages = [
    {"role": "system", "content": "You are a helpful assistant"},
    {"role": "user", "content": "Tell a light-hearted joke for a room of Data Scientists"},
]

prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
print(prompt)

You are a helpful assistant

User: Tell a light-hearted joke for a room of Data Scientists

Assistant:
