In [12]:
from transformers import AutoTokenizer
import sentencepiece as spm

  from .autonotebook import tqdm as notebook_tqdm


#### 0

In [None]:
# Compare GPT-2 vs LLaMA tokenization

In [13]:
def compare_tokenizers(texts):
    """Print GPT-2 and OpenLLaMA token IDs for each text, with per-token decodes.

    For every string in ``texts`` this prints the original text, the IDs each
    tokenizer's ``encode`` produces, and the result of decoding every ID on
    its own.

    Args:
        texts: Iterable of strings to tokenize.

    Returns:
        None. Everything is written to stdout.
    """
    def encode_and_split(tokenizer, text):
        # Encode the whole text, then decode each ID in isolation so the
        # contribution of every single token is visible.
        token_ids = tokenizer.encode(text)
        return token_ids, [tokenizer.decode([token_id]) for token_id in token_ids]

    gpt2_tokenizer = AutoTokenizer.from_pretrained('gpt2')
    llama_tokenizer = AutoTokenizer.from_pretrained("openlm-research/open_llama_7b")

    for text in texts:
        print(f"\nOriginal: '{text}'")

        # GPT-2 (BPE)
        gpt2_ids, gpt2_decoded = encode_and_split(gpt2_tokenizer, text)
        print(f"gpt token: {gpt2_ids}")
        print(f"gpt decoded: {gpt2_decoded}")

        # LLaMA
        llama_ids, llama_decoded = encode_and_split(llama_tokenizer, text)
        print(f"LLaMA Tokens: {llama_ids}")
        print(f"LLaMA Decoded: {llama_decoded}")

In [14]:
# Test with various texts: plain English, a contraction, a flag emoji,
# a code snippet, non-Latin script, and a longer two-sentence input.
test_texts = [
    "Hello world!",
    "I'm learning AI.",
    "🇳🇬 Nigeria",  # Emoji + text
    "def calculate_loss():",  # Code
    "こんにちは",  # Japanese
    "I will understand tokenization. Let's figure it out."
]

compare_tokenizers(test_texts)

You are using the default legacy behaviour of the <class 'transformers.models.llama.tokenization_llama.LlamaTokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565 - if you loaded a llama tokenizer from a GGUF file you can ignore this message
You are using the default legacy behaviour of the <class 'transformers.models.llama.tokenization_llama_fast.LlamaTokenizerFast'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggin


Original: 'Hello world!'
gpt token: [15496, 995, 0]
gpt decoded: ['Hello', ' world', '!']
LLaMA Tokens: [1, 16644, 924, 31905]
LLaMA Decoded: ['<s>', 'Hello', 'world', '!']

Original: 'I'm learning AI.'
gpt token: [40, 1101, 4673, 9552, 13]
gpt decoded: ['I', "'m", ' learning', ' AI', '.']
LLaMA Tokens: [1, 312, 31876, 31836, 3187, 7421, 31843]
LLaMA Decoded: ['<s>', 'I', "'", 'm', 'learning', 'AI', '.']

Original: 'üá≥üá¨ Nigeria'
gpt token: [8582, 229, 111, 8582, 229, 105, 19398]
gpt decoded: ['ÔøΩ', 'ÔøΩ', 'ÔøΩ', 'ÔøΩ', 'ÔøΩ', 'ÔøΩ', ' Nigeria']
LLaMA Tokens: [1, 31822, 243, 162, 138, 182, 243, 162, 138, 175, 8700]
LLaMA Decoded: ['<s>', '', 'ÔøΩ', 'ÔøΩ', 'ÔøΩ', 'ÔøΩ', 'ÔøΩ', 'ÔøΩ', 'ÔøΩ', 'ÔøΩ', 'Nigeria']

Original: 'def calculate_loss():'
gpt token: [4299, 15284, 62, 22462, 33529]
gpt decoded: ['def', ' calculate', '_', 'loss', '():']
LLaMA Tokens: [1, 918, 15667, 31889, 19388, 20940]
LLaMA Decoded: ['<s>', 'def', 'calculate', '_', 'loss', '():']

Original: 'こんにちは'
g

**Line-by-line:**

1. **Define function** `compare_tokenizers` that takes `texts`.
2. **Load GPT-2 tokenizer** (BPE method).
3. **Load OpenLLaMA tokenizer** (SentencePiece method).
4. **Loop** through each text `i`.
5. **Print** original text.
6. **GPT-2**: Encode text to token IDs.
7. **GPT-2**: Decode each token back to string.
8. **Print** GPT-2 tokens and decoded strings.
9. **LLaMA**: Encode text to token IDs.
10. **LLaMA**: Decode each token back to string.
11. **Print** LLaMA tokens and decoded strings.

Shows how each tokenizer splits and reconstructs text.

In [16]:
# NOTE: '<s>' is the start-of-sequence (BOS) token; the LLaMA tokenizer prepends it to every encoded text (ID 1 in the output above).

In [16]:
def compare_again(texts):
    """Print GPT-2 and OpenLLaMA token IDs for each text, with their raw pieces.

    For every string in ``texts`` this prints the original text, the IDs each
    tokenizer's ``encode`` produces, and the pieces that
    ``convert_ids_to_tokens`` returns for those IDs.

    Args:
        texts: Iterable of strings to tokenize.

    Returns:
        None. Everything is written to stdout.
    """
    gpt2_tokenizer = AutoTokenizer.from_pretrained('gpt2')
    llama_tokenizer = AutoTokenizer.from_pretrained("openlm-research/open_llama_7b")

    for text in texts:
        print(f"\nOriginal: '{text}'")

        # GPT-2: IDs first, then the pieces behind those IDs.
        gpt2_ids = gpt2_tokenizer.encode(text)
        gpt2_pieces = gpt2_tokenizer.convert_ids_to_tokens(gpt2_ids)
        print(f"gpt token: {gpt2_ids}", f"gpt pieces: {gpt2_pieces}", sep="\n")

        # OpenLLaMA: same two steps.
        llama_ids = llama_tokenizer.encode(text)
        llama_pieces = llama_tokenizer.convert_ids_to_tokens(llama_ids)
        print(f"LLaMA Tokens: {llama_ids}", f"LLaMA Pieces: {llama_pieces}", sep="\n")

In [17]:
# Test with various texts: plain English, a contraction, a flag emoji,
# a code snippet, non-Latin script, and a longer two-sentence input.
test_texts = [
    "Hello world!",
    "I'm learning AI.",
    "🇳🇬 Nigeria",  # Emoji + text
    "def calculate_loss():",  # Code
    "こんにちは",  # Japanese
    "I will understand tokenization. Let's figure it out."
]

compare_again(test_texts)


Original: 'Hello world!'
gpt token: [15496, 995, 0]
gpt pieces: ['Hello', 'Ġworld', '!']
LLaMA Tokens: [1, 16644, 924, 31905]
LLaMA Pieces: ['<s>', '▁Hello', '▁world', '!']

Original: 'I'm learning AI.'
gpt token: [40, 1101, 4673, 9552, 13]
gpt pieces: ['I', "'m", 'Ġlearning', 'ĠAI', '.']
LLaMA Tokens: [1, 312, 31876, 31836, 3187, 7421, 31843]
LLaMA Pieces: ['<s>', '▁I', "'", 'm', '▁learning', '▁AI', '.']

Original: '🇳🇬 Nigeria'
gpt token: [8582, 229, 111, 8582, 229, 105, 19398]
gpt pieces: ['ðŁ', 'ĩ', '³', 'ðŁ', 'ĩ', '¬', 'ĠNigeria']
LLaMA Tokens: [1, 31822, 243, 162, 138, 182, 243, 162, 138, 175, 8700]
LLaMA Pieces: ['<s>', '▁', '<0xF0>', '<0x9F>', '<0x87>', '<0xB3>', '<0xF0>', '<0x9F>', '<0x87>', '<0xAC>', '▁Nigeria']

Original: 'def calculate_loss():'
gpt token: [4299, 15284, 62, 22462, 33529]
gpt pieces: ['def', 'Ġcalculate', '_', 'loss', '():']
LLaMA Tokens: [1, 918, 15667, 31889, 19388, 20940]
LLaMA Pieces: ['<s>', '▁def', '▁calculate', '_',

**Line-by-line:**

1. **Define function** `compare_again` taking `texts`.
2. **Load GPT-2 tokenizer**.
3. **Load OpenLLaMA tokenizer**.
4. **Loop** through each text `i`.
5. **Print** original text.
6. **GPT-2**: Encode text to token IDs.
7. **GPT-2**: Convert token IDs to subword pieces.
8. **Print** GPT-2 tokens and pieces.
9. **LLaMA**: Encode text to token IDs.
10. **LLaMA**: Convert token IDs to subword pieces.
11. **Print** LLaMA tokens and pieces.

Shows the actual subword pieces (like `Ġ`, `▁` or `<0xE3>`) instead of decoded strings.

**Ġ**: GPT-2's marker for a space before a word.

**▁**: LLaMA's (SentencePiece) marker for a space before a word.

**<0xE3>**: LLaMA's marker for the byte `0xE3` (part of a UTF-8 character).

Both show how tokenizers represent raw text internally.

#### 1   ---->   Advanced Tokenization Analysis

In [24]:
def analyze(texts, model):
    """Tokenize one string and print how each token maps back onto it.

    Args:
        texts: The text to analyze. Despite the plural name this must be a
            single string: it is sliced with the character offsets the
            tokenizer returns.
        model: Model name or path passed to ``AutoTokenizer.from_pretrained``.

    Returns:
        A ``(tokens, decoded)`` tuple: the list of token IDs, and a list
        holding each of those IDs decoded on its own, in the same order.
    """
    tokenizer = AutoTokenizer.from_pretrained(model)

    # Get tokens and their character positions in the input
    encoding = tokenizer(texts, return_offsets_mapping=True)
    tokens = encoding.input_ids
    offsets = encoding.offset_mapping
    # Decode every ID exactly once; the list feeds both the table below and
    # the return value, so no token is decoded a second time.
    decodedth = [tokenizer.decode([token]) for token in tokens]

    print(f"\n--- {model} ---")
    print(f"Text: '{texts}'")
    print(f"Total offsets: {offsets}")
    print(f"Total tokens: {tokens}")
    print(f"Total tokens: {len(tokens)}")
    print(f"-------------------------------------------------------------------------------------------->")

    # show token --to--> text mapping
    for i, (token, (start, end), decoded) in enumerate(zip(tokens, offsets, decodedth)):
        # Slice of the input covered by this token, next to its own decode.
        token_text = texts[start:end]
        print(f"Token {i:2d}: {token:5d} --> '{token_text}' (decode: '{decoded}')")

    return tokens, decodedth

In [25]:
# Single sentence to inspect token by token.
texts = "I will understand tokenization. Let's figure it out."

# Tokenizer to load; swap in the commented line to analyze OpenLLaMA instead.
model = 'gpt2'
# model = "openlm-research/open_llama_7b"

# As the cell's last expression, the returned (tokens, decoded) tuple is
# displayed after the printed table.
analyze(texts, model)


--- gpt2 ---
Text: 'I will understand tokenization. Let's figure it out.'
Total offsets: [(0, 1), (1, 6), (6, 17), (17, 23), (23, 30), (30, 31), (31, 35), (35, 37), (37, 44), (44, 47), (47, 51), (51, 52)]
Total tokens: [40, 481, 1833, 11241, 1634, 13, 3914, 338, 3785, 340, 503, 13]
Total tokens: 12
-------------------------------------------------------------------------------------------->
Token  0:    40 --> 'I' (decode: 'I')
Token  1:   481 --> ' will' (decode: ' will')
Token  2:  1833 --> ' understand' (decode: ' understand')
Token  3: 11241 --> ' token' (decode: ' token')
Token  4:  1634 --> 'ization' (decode: 'ization')
Token  5:    13 --> '.' (decode: '.')
Token  6:  3914 --> ' Let' (decode: ' Let')
Token  7:   338 --> ''s' (decode: ''s')
Token  8:  3785 --> ' figure' (decode: ' figure')
Token  9:   340 --> ' it' (decode: ' it')
Token 10:   503 --> ' out' (decode: ' out')
Token 11:    13 --> '.' (decode: '.')


([40, 481, 1833, 11241, 1634, 13, 3914, 338, 3785, 340, 503, 13],
 ['I',
  ' will',
  ' understand',
  ' token',
  'ization',
  '.',
  ' Let',
  "'s",
  ' figure',
  ' it',
  ' out',
  '.'])

**Line-by-line:**

1. **Define function** `analyze` taking `texts` and `model`.
2. **Load tokenizer** for given model.
3. **Encode text** with `return_offsets_mapping=True` to get character positions.
4. **Extract token IDs** and offset mappings.
5. **Print** model name.
6. **Print** original text.
7. **Print** all offsets.
8. **Print** all tokens.
9. **Print** total token count.
10. **Print** separator line.
11. **Loop** through each token, its start/end position.
12. **Slice** original text to get substring.
13. **Decode** token to string.
14. **Print** token index, ID, substring, and decoded string.
15. **Return** the token IDs and the list of per-token decoded strings (`decodedth`).

Shows exact text-to-token mapping with character positions.

#### 2 .....  Custom Tokenizer Training (Advanced)

In [26]:
from tokenizers import ByteLevelBPETokenizer

In [2]:
# Train a simple BPE tokenizer from scratch

In [6]:
def train_tokenizer(texts, vocab_size=1000, min_frequency=2):
    """Train a byte-level BPE tokenizer on ``texts`` and print a sample encoding.

    The texts are written one per line to ``training_text.txt`` in the
    current working directory (overwriting any existing file), the tokenizer
    is trained on that file, and "Hello Sir?" is encoded as a quick check.

    Args:
        texts: Iterable of strings used as the training corpus.
        vocab_size: Target vocabulary size passed to the trainer.
        min_frequency: Value passed to the trainer as ``min_frequency``
            (defaults to 2, the value this function always used).

    Returns:
        The trained ``ByteLevelBPETokenizer``.
    """
    tokenizer = ByteLevelBPETokenizer()

    # Write the corpus as UTF-8 explicitly so non-ASCII training text is
    # stored the same way on every platform instead of depending on the
    # locale's default encoding.
    with open('training_text.txt', 'w', encoding='utf-8') as corpus_file:
        for text in texts:
            corpus_file.write(text + '\n')

    # Train tokenizer
    tokenizer.train(
        files=['training_text.txt'],
        vocab_size=vocab_size,
        min_frequency=min_frequency,
        special_tokens=["<|endoftext|>", "<|pad|>", "<|unk|>"]
    )

    # Test it
    encoded = tokenizer.encode("Hello Sir?")
    print(f"Custom tokens: {encoded.tokens}")
    print(f"Custom IDs: {encoded.ids}")

    return tokenizer

In [7]:
# Train on sample data
# Four short English sentences used as the training corpus for the custom tokenizer.
sample = [
    "The quick brown fox jumps over the lazy dog.",
    "Hello world! This is a test.",
    "Machine learning is fascinating.",
    "Tokenization converts text to numbers."
]

In [8]:
# Train with the default vocab_size; the call also prints the encoding of "Hello Sir?".
custom_tokenizer = train_tokenizer(sample)

Custom tokens: ['H', 'e', 'l', 'l', 'o', 'Ġ', 'S', 'i', 'r', '?']
Custom IDs: [42, 71, 78, 78, 81, 223, 53, 75, 84, 33]


In [29]:
# Show the trained tokenizer's summary (vocabulary size, model type, options).
print(custom_tokenizer)

Tokenizer(vocabulary_size=274, model=ByteLevelBPE, add_prefix_space=False, lowercase=False, dropout=None, unicode_normalizer=None, continuing_subword_prefix=None, end_of_word_suffix=None, trim_offsets=False)


**Line-by-line:**

1. **Define function** `train_tokenizer` with `texts` and `vocab_size` (default 1000).
2. **Initialize** a Byte-Pair Encoding (BPE) tokenizer.
3. **Open file** `training_text.txt` for writing.
4. **Loop** through texts, write each to file.
5. **Train tokenizer** on the file:
   - `vocab_size`: Target vocabulary size.
   - `min_frequency`: Minimum number of times a pair must occur before it is merged into a new token.
   - `special_tokens`: Added special tokens.
6. **Test tokenizer** on "Hello Sir?".
7. **Print** resulting subword tokens.
8. **Print** their token IDs.
9. **Return** trained tokenizer.
10. **Define sample text** list.
11. **Call function** to train and get `custom_tokenizer`.

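Trains a new BPE tokenizer from scratch on custom data. Because the sample corpus is only four sentences and `min_frequency=2`, very few merges are learned: the vocabulary stops at 274 entries instead of the requested 1000, which is why "Hello Sir?" is split into single characters.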