### Encoding & Decoding

In [None]:
pip install transformers torch

Note: you may need to restart the kernel to use updated packages.


In [1]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [None]:
input1 = "The quarterly sales report exceeded expectations."
input2 = "The quarterly sales report exceeded expectations. artifically"

# Step 1: Word Piece Tokenization
tokens1 = tokenizer.tokenize(input1)
print("Tokens:", tokens1)

tokens2 = tokenizer.tokenize(input2)
print("Tokens:", tokens2)

Tokens: ['the', 'quarterly', 'sales', 'report', 'exceeded', 'expectations', '.']
Tokens: ['the', 'quarterly', 'sales', 'report', 'exceeded', 'expectations', '.', 'art', '##ific', '##ally']


In [None]:
# Step 2: IDs for tokens
input_ids1 = tokenizer.convert_tokens_to_ids(tokens1)
print("Input IDs for tokens1:", input_ids1)
print()
input_ids2 = tokenizer.convert_tokens_to_ids(tokens2)
print("Input IDs for tokens2:", input_ids2)

Input IDs for tokens1: [1996, 12174, 4341, 3189, 14872, 10908, 1012]

Input IDs for tokens2: [1996, 12174, 4341, 3189, 14872, 10908, 1012, 2396, 18513, 3973]


In [None]:
# Note: Steps (1) and (2) are combined in the tokenizer's __call__ method and called encoding. 

encoding1 = tokenizer(input1)
encoding1 
encoding2 = tokenizer(input2) 
encoding2

{'input_ids': [101, 1996, 12174, 4341, 3189, 14872, 10908, 1012, 2396, 18513, 3973, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [12]:
# Step 3: Decoding from IDs back to text

decoded_input = tokenizer.decode(encoding1['input_ids'])
print("Decoded Input 1:", decoded_input)

Decoded Input 1: [CLS] the quarterly sales report exceeded expectations. [SEP]


In [13]:
# Step 4: Handle Missing Inputs / Sentences (Batch Encoding)

inputs = [
    'The quarterly sales report exceeded expectations.',
    'I love using AI'
]

encoded_inputs = tokenizer(inputs, padding=True)

encoded_inputs

{'input_ids': [[101, 1996, 12174, 4341, 3189, 14872, 10908, 1012, 102], [101, 1045, 2293, 2478, 9932, 102, 0, 0, 0]], 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 0, 0, 0]]}

In [19]:
# Step 5: Truncation (useful for cost control)

inputs = [
    'The quarterly sales report exceeded expectations.',
    'I love using AI.'
]

encoded_inputs_trunc = tokenizer(inputs, padding=True, truncation=True, max_length=5)

encoded_inputs_trunc

{'input_ids': [[101, 1996, 12174, 4341, 102], [101, 1045, 2293, 2478, 102]], 'token_type_ids': [[0, 0, 0, 0, 0], [0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1], [1, 1, 1, 1, 1]]}

In [None]:
# Step 6: Explore other tokenizer models 
from transformers import AutoTokenizer

# tokenizer1 = AutoTokenizer.from_pretrained("openai-community/gpt2") 
tokenizer2 = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1") 

input1 = "The quarterly sales report exceeded expectations."

tokens1 = tokenizer.tokenize(input1)
print("Tokens:", tokens1)
print()
tokens2 = tokenizer2.tokenize(input1)
print("Tokens:", tokens2)

tokenizer_config.json:   0%|          | 0.00/996 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

Tokens: ['The', 'Ġquarterly', 'Ġsales', 'Ġreport', 'Ġexceeded', 'Ġexpectations', '.']
Tokens: ['▁The', '▁quarter', 'ly', '▁sales', '▁report', '▁exceed', 'ed', '▁expectations', '.']
