In [4]:
from typing import Dict, List, Optional
import re
from transformers import AutoTokenizer

# Load a Hugging Face tokenizer and pull out its vocabulary.
# NOTE(review): `vocab` is rebound to tiktoken's merge table in later cells,
# so this value is only live until one of those cells runs.
QWEN_TOKENIZER_ID = "Qwen/Qwen2.5-0.5B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(QWEN_TOKENIZER_ID)
vocab = tokenizer.get_vocab()

  from .autonotebook import tqdm as notebook_tqdm
None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.


In [None]:
from typing import Dict, List, Optional
import re
import tiktoken
from tiktoken._educational import *
# Load the cl100k_base encoding and use its merge table as the working vocabulary.
enc = tiktoken.get_encoding("cl100k_base")
# `_mergeable_ranks` is a private attribute of the encoding object; the rest of
# this notebook treats it as a mapping from token bytes to an integer rank.
vocab = enc._mergeable_ranks
# Alternative (not used): take the table from the educational implementation.
# tokenizer = SimpleBytePairEncoding.from_tiktoken("cl100k_base")
# vocab = tokenizer.mergeable_ranks



In [None]:
def bpe(mergeable_ranks, token, max_rank):
    """
    Greedily merge the bytes of `token` using the ranks in `mergeable_ranks`.

    Used by `recover_merges` to find the two parts a token was built from.

    Args:
        mergeable_ranks: Mapping from byte sequence to its rank.
        token: Byte sequence to split into single bytes and re-merge.
        max_rank: Merges whose rank is >= this value are not applied;
            None means no limit.

    Returns:
        List of byte sequences left once no further merge is allowed.
    """
    pieces = [bytes([byte]) for byte in token]
    while True:
        best_pos, best_rank = None, None
        # Scan adjacent pairs; the strict `<` keeps the left-most pair on ties.
        for pos in range(len(pieces) - 1):
            candidate = mergeable_ranks.get(pieces[pos] + pieces[pos + 1])
            if candidate is None:
                continue
            if best_rank is None or candidate < best_rank:
                best_pos, best_rank = pos, candidate
        if best_rank is None:
            break  # no adjacent pair is a known token
        if max_rank is not None and best_rank >= max_rank:
            break  # the best remaining merge is not allowed by the rank limit
        assert best_pos is not None
        # Replace the two neighbours with their concatenation.
        pieces[best_pos:best_pos + 2] = [pieces[best_pos] + pieces[best_pos + 1]]
    return pieces


def recover_merges(mergeable_ranks):
    """
    Rebuild the pairwise merge table from a token -> rank mapping.

    The keys of `mergeable_ranks` are byte sequences in their already-merged
    form, so the pair each one was merged from is recovered by re-running BPE
    on the token with merges limited to ranks below the token's own rank.
    See https://github.com/openai/tiktoken/issues/60
    and https://github.com/karpathy/minbpe/issues/11#issuecomment-1950805306

    Args:
        mergeable_ranks: Mapping from byte sequence to its rank.

    Returns:
        Dict mapping (rank of left part, rank of right part) to the rank of
        the merged token.
    """
    recovered = {}
    for merged_bytes, merged_rank in mergeable_ranks.items():
        # Single bytes were never produced by a merge.
        if len(merged_bytes) != 1:
            halves = tuple(bpe(mergeable_ranks, merged_bytes, max_rank=merged_rank))
            assert len(halves) == 2
            left, right = halves
            # Key the merge by the integer ranks of its two parts.
            recovered[(mergeable_ranks[left], mergeable_ranks[right])] = merged_rank

    return recovered
# Rebuild the (left_rank, right_rank) -> merged_rank table for the encoding held in `enc`.
# NOTE(review): as the last expression of the cell this displays the whole dict;
# consider binding it to a name and showing only a summary such as its length.
recover_merges(enc._mergeable_ranks)

In [51]:
from tiktoken._educational import *
# Build the educational BPE encoder for cl100k_base, then encode a sample string.
# The encode call is the cell's last expression, so its return value is displayed.
educational_encoder = SimpleBytePairEncoding.from_tiktoken("cl100k_base")
educational_encoder.encode("Hello World")

[48;5;167mH[48;5;179me[48;5;185ml[48;5;77ml[48;5;80mo[0m
[48;5;167mH[48;5;179mel[48;5;77ml[48;5;80mo[0m
[48;5;167mH[48;5;179mel[48;5;77mlo[0m
[48;5;167mH[48;5;179mello[0m
[48;5;167mHello[0m

[48;5;167m [48;5;179mW[48;5;185mo[48;5;77mr[48;5;80ml[48;5;68md[0m
[48;5;167m [48;5;179mW[48;5;185mor[48;5;80ml[48;5;68md[0m
[48;5;167m W[48;5;185mor[48;5;80ml[48;5;68md[0m
[48;5;167m W[48;5;185mor[48;5;80mld[0m
[48;5;167m W[48;5;185morld[0m
[48;5;167m World[0m



[9906, 4435]

In [None]:
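# TODO: unfinished scratch expression, kept for reference. `vocab` keys are bytes
# at this point, so the prefix would need to be b"I" rather than "I".
# [key for key, value in vocab.items() if key.startswith("I") and key.]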

In [4]:
from typing import Dict, List, Optional
import re
import tiktoken
from tiktoken._educational import *
# Load the cl100k_base encoding and use its merge table as the working vocabulary.
enc = tiktoken.get_encoding("cl100k_base")
# `_mergeable_ranks` is a private attribute of the encoding object; the functions
# below treat it as a mapping from token bytes to an integer rank.
vocab = enc._mergeable_ranks
# Alternative (not used): take the table from the educational implementation.
# tokenizer = SimpleBytePairEncoding.from_tiktoken("cl100k_base")
# vocab = tokenizer.mergeable_ranks


def find_matching_tokens(prefix: str, vocab: Dict[bytes, int]) -> List[bytes]:
    """
    Collect every vocabulary token whose normalised form begins with `prefix`.

    Before comparison a leading b'\\xc4\\xa0' (UTF-8 for 'Ġ') is turned into a
    single space and a leading b'##' is dropped; the token returned is always
    the original, un-normalised key.

    Args:
        prefix: String prefix to match against (compared as UTF-8 bytes)
        vocab: Dictionary mapping token bytes to token ids

    Returns:
        Matching tokens, in the iteration order of `vocab`
    """
    needle = prefix.encode('utf-8')

    def _normalised(raw: bytes) -> bytes:
        # Map word-boundary markers to the text they stand for.
        marker = raw[:2]
        if marker == b'\xc4\xa0':  # Ġ in UTF-8
            return b' ' + raw[2:]
        if marker == b'##':
            return raw[2:]
        return raw

    return [raw for raw in vocab.keys() if _normalised(raw).startswith(needle)]

def find_tokens_from_right(sentence: str, vocab: Dict[bytes, int], encoder=None) -> List[List[bytes]]:
    """
    For every suffix of `sentence`, find the vocabulary tokens that could complete it.

    Position i uses the last i+1 characters of `sentence` as the prefix passed
    to `find_matching_tokens`. A candidate token is kept only if replacing that
    suffix with the token's text and re-encoding yields exactly the encoding of
    the untouched head followed by one more token, i.e. the replacement does
    not disturb the tokenisation of the text before it.

    Args:
        sentence: Input sentence to analyze
        vocab: Dictionary mapping token bytes to token ids
        encoder: Object exposing `encode(str)` that returns a list of token
            ids. When omitted, the notebook-level `enc` is used.

    Returns:
        List of lists; entry i holds the tokens that survived the filter for
        the suffix of length i+1
    """
    if encoder is None:
        encoder = enc  # fall back to the encoding loaded at notebook level

    matches_by_position = []

    # Start from the end and work backwards
    for i in range(len(sentence)):
        suffix_len = i + 1
        head = sentence[:-suffix_len]
        prefix = sentence[-suffix_len:]  # the last i+1 characters

        candidates = find_matching_tokens(prefix, vocab)
        if not candidates:
            matches_by_position.append([])
            continue

        # These depend only on the split point, so compute them once per
        # position instead of once per candidate token.
        head_bytes = head.encode('utf-8')
        head_tokens = encoder.encode(head)

        kept = []
        for token in candidates:
            # Same normalisation as find_matching_tokens: marker -> real text
            if token.startswith(b'\xc4\xa0'):  # Ġ in UTF-8
                token_text = b' ' + token[2:]
            elif token.startswith(b'##'):
                token_text = token[2:]
            else:
                token_text = token

            # Token bytes may not be valid UTF-8 on their own; undecodable
            # bytes are dropped before re-encoding.
            candidate_text = (head_bytes + token_text).decode('utf-8', errors='ignore')
            candidate_tokens = encoder.encode(candidate_text)

            # Keep the token only if the head tokenises as before and the
            # replacement contributes exactly one trailing token.
            if head_tokens + candidate_tokens[-1:] == candidate_tokens:
                kept.append(token)
        matches_by_position.append(kept)

    return matches_by_position

# Example usage
# Only one sentence is analysed per run; uncomment an alternative to try it instead.
# test_sentence = "The agreement was signed unconditionall"
# test_sentence = "The agreement was signed unconditiona"
# test_sentence = "He introduced an intermediar"
# test_sentence = "I bought some apple"
# test_sentence = "I am indivi"
test_sentence = "We found a hidden correla"

right_matches = find_tokens_from_right(test_sentence, vocab)
combinations = []
print("\nMatching tokens from right to left:")
for i, matches in enumerate(right_matches):
    # Split the sentence at the current position: `head` is the text that is
    # kept, `prefix` is the trailing fragment the tokens were matched against.
    head = test_sentence[:-(i+1)]
    prefix = test_sentence[-(i+1):]
    print(head)
    print(f"\nPrefix '{prefix}' matches '{len(matches)}' tokens")
    if matches:
        # NOTE: the 'prefix' key stores the kept head of the sentence, not the
        # matched fragment; the key name is left unchanged.
        combinations.append({
            'position': i,
            'prefix': head,
            'matches': matches
        })
    for token in matches[:5]:  # Show first 5 matches for each position
        print(f"  {token}")

combinations



Matching tokens from right to left:
We found a hidden correl

Prefix 'a' matches '2218' tokens
  b'a'
  b'at'
  b'an'
  b'ar'
  b'al'
We found a hidden corre

Prefix 'la' matches '13' tokens
  b'lass'
  b'label'
  b'land'
  b'lands'
  b'labels'
We found a hidden corr

Prefix 'ela' matches '0' tokens
We found a hidden cor

Prefix 'rela' matches '1' tokens
  b'relative'
We found a hidden co

Prefix 'rrela' matches '0' tokens
We found a hidden c

Prefix 'orrela' matches '0' tokens
We found a hidden 

Prefix 'correla' matches '0' tokens
We found a hidden

Prefix ' correla' matches '5' tokens
  b' correlation'
  b' correlated'
  b' correlations'
  b' correlate'
  b' correlates'
We found a hidde

Prefix 'n correla' matches '0' tokens
We found a hidd

Prefix 'en correla' matches '0' tokens
We found a hid

Prefix 'den correla' matches '0' tokens
We found a hi

Prefix 'dden correla' matches '0' tokens
We found a h

Prefix 'idden correla' matches '0' tokens
We found a 

Prefix 'hidden correla' 

[{'position': 0,
  'prefix': 'We found a hidden correl',
  'matches': [b'a',
   b'at',
   b'an',
   b'ar',
   b'al',
   b'as',
   b'am',
   b'ad',
   b'ag',
   b'ay',
   b'ab',
   b'ame',
   b'ap',
   b'av',
   b'age',
   b'ata',
   b'ain',
   b'art',
   b'ack',
   b'able',
   b'ant',
   b'ase',
   b'ave',
   b'ang',
   b'act',
   b'alue',
   b'all',
   b'are',
   b'ast',
   b'ard',
   b'ace',
   b'ac',
   b'ak',
   b'ath',
   b'ans',
   b'ail',
   b'ach',
   b'aram',
   b'ary',
   b'aw',
   b'app',
   b'ance',
   b'ax',
   b'alse',
   b'add',
   b'ake',
   b'ally',
   b'atch',
   b'atic',
   b'ark',
   b'ange',
   b'ator',
   b'arg',
   b'ays',
   b'ault',
   b'ater',
   b'ames',
   b'ash',
   b'ader',
   b'atus',
   b'ann',
   b'ade',
   b'ask',
   b'ating',
   b'arch',
   b'arr',
   b'amp',
   b'als',
   b'anc',
   b'ages',
   b'ayer',
   b'ank',
   b'arget',
   b'air',
   b'action',
   b'ait',
   b'ateg',
   b'aph',
   b'az',
   b'arent',
   b'ayout',
   b'ager',
   b'ackage',
   b

In [13]:
# Decode a single token id back to text to inspect what it stands for.
enc.decode([23936])

'isible'