# My Tokenizer

This is MyTokenizer, let's build 🚀

#### Import and Patterns

In [90]:
import regex as re
import os

GPT2_SPLIT_PATTERN = r"""'(?:[sdmt]|ll|ve|re)| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+"""
GPT4_SPLIT_PATTERN = r"""'(?i:[sdmt]|ll|ve|re)|[^\r\n\p{L}\p{N}]?+\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]++[\r\n]*|\s*[\r\n]|\s+(?!\S)|\s+"""
GPT4O_SPLIT_PATTERN = "|".join(
    [
        r"""[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]*[\p{Ll}\p{Lm}\p{Lo}\p{M}]+(?i:'s|'t|'re|'ve|'m|'ll|'d)?""",
        r"""[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]+[\p{Ll}\p{Lm}\p{Lo}\p{M}]*(?i:'s|'t|'re|'ve|'m|'ll|'d)?""",
        r"""\p{N}{1,3}""",
        r""" ?[^\s\p{L}\p{N}]+[\r\n/]*""",
        r"""\s*[\r\n]+""",
        r"""\s+(?!\S)""",
        r"""\s+""",
    ]
)

#### MyTokenizer Class

In [91]:

class MyTokenizer:
    def __init__(self, vocab_size=276, pattern=GPT2_SPLIT_PATTERN):
        self.vocab_size = vocab_size
        self.compiled_pattern = re.compile(pattern)
        self.num_merges = vocab_size - 256
        self.vocab = {index: bytes([index]) for index in range(256)}
        self.merges = {}
        self._build_vocab()


    def _build_vocab(self):
        for i in range(256):
            self.vocab[i] = bytes([i])
        for i in range(256, self.vocab_size):
            self.vocab[i] = bytes([i % 256]) + bytes([i // 256])

    def _get_stats(self, ids, counts=None):
        counts = {} if counts is None else counts
        for pair in zip(ids, ids[1:]):
            counts[pair] = counts.get(pair, 0) + 1
        return counts

    def _merge(self, ids, pair, index):
        new_ids = []
        i = 0
        while i < len(ids):
            if i < len(ids) - 1 and ids[i] == pair[0] and ids[i + 1] == pair[1]:
                new_ids.append(index)
                i += 2
            else:
                new_ids.append(ids[i])
                i += 1
        return new_ids

    def train(self, text):
        # Split the text in chunks
        text_chunks = re.findall(self.compiled_pattern, text)

        # Input text processing
        ids = [list(text_chunk.encode('utf-8')) for text_chunk in text_chunks]

        print("🛠️ Training tokenizer...")
        for i in range(self.num_merges):
            stats = {}

            for chunk_ids in ids:
                self._get_stats(chunk_ids, stats)

            # Find the pair with the highest occ
            pair = max(stats, key=stats.get)
            index = 256 + i

            # Print every 1000 Merge
            if index % 1000 == 0:
                print(f"⚙️ Merge {i}: {pair} -> {index}")

            # Replace
            ids = [self._merge(chunk_ids, pair, index) for chunk_ids in ids]
            self.merges[pair] = index
            self.vocab[index] = self.vocab[pair[0]] + self.vocab[pair[1]]

    def load(self, file_prefix):
        """
        Recharge le tokenizer à partir des fichiers file_prefix.model et file_prefix.vocab.
        """
        print("🔄 Loading tokenizer...")
        model_file = file_prefix + ".model"
        if not os.path.exists(model_file):
            print(f"⚠️ Model file '{model_file}' not found. It will be created after training.")
            return
        with open(model_file, 'r', encoding='utf-8') as f:
            # Lire la version (ignorée ici)
            version = f.readline().strip()
            print(f"📜 Version: {version}")
            # Lire les merges
            self.merges = {}
            for line in f:
                token1, token2, idx = line.strip().split()
                self.merges[(int(token1), int(token2))] = int(idx)

        # Reconstruire le vocabulaire à partir des merges
        self.vocab = {i: bytes([i]) for i in range(256)}
        for (token1, token2), idx in self.merges.items():
            self.vocab[idx] = self.vocab[token1] + self.vocab[token2]

    def save(self, file_prefix, version_name="my_tokenizer_v1"):
        """
        Sauvegarde deux fichiers : file_prefix.model et file_prefix.vocab
        - Le fichier .model contient les informations nécessaires pour recharger le tokenizer.
        - Le fichier .vocab est une version lisible pour inspection humaine.
        """
        # Sauvegarder le fichier modèle
        model_file = file_prefix + ".model"
        with open(model_file, 'w', encoding='utf-8') as f:
            # Écrire une version pour identifier le format
            f.write(version_name + "\n")
            # Écrire les merges
            for pair, idx in self.merges.items():
                f.write(f"{pair[0]} {pair[1]} {idx}\n")

        # Sauvegarder le fichier vocabulaire
        vocab_file = file_prefix + ".vocab"
        with open(vocab_file, 'w', encoding='utf-8') as f:
            for idx, token in self.vocab.items():
                # Convertir les tokens en chaînes de caractères lisibles
                token_str = token.decode('utf-8', errors='replace')
                # Écrire uniquement les tokens standards (pas de tokens spéciaux)
                if idx >= 256:  # Supposons que les tokens spéciaux sont en dessous de 256
                    f.write(f"{idx}: {token_str}\n")
        print(f"💾 {version_name} saved to {model_file} and {vocab_file}")

    def encode(self, text):
        # Split the text in chunks
        text_chunks = re.findall(self.compiled_pattern, text)

        # Identifiers
        ids = []

        for text_chunk in text_chunks:
            text_bytes_chunk = text_chunk.encode('utf-8')
            ids_chunk = list(text_bytes_chunk)
            while len(ids_chunk) > 2:
                stats = self._get_stats(ids_chunk)
                min_pair = min(stats, key=lambda p: self.merges.get(p, float('inf')))
                if min_pair not in self.merges:
                    break
                index = self.merges[min_pair]
                ids_chunk = self._merge(ids_chunk, min_pair, index)
            ids.extend(ids_chunk)

        return ids

    def decode(self, ids):
        tokens = b"".join([self.vocab[index] for index in ids])
        text = tokens.decode('utf-8', errors='replace')
        return text


#### My Tokenizer v1 Train

In [92]:
my_basic_tokenizer = MyTokenizer(vocab_size=5000)
# Load the tokenizer from files
my_basic_tokenizer.load('my_basic_tokenizer')

# Train the tokenizer on a text file
with open('data_code.txt', 'r', encoding='utf-8') as f:
    my_basic_tokenizer.train(f.read())

# Save the tokenizer vocab and encoder files
my_basic_tokenizer.save('my_basic_tokenizer')

🔄 Loading tokenizer...
⚠️ Model file 'my_basic_tokenizer.model' not found. It will be created after training.
🛠️ Training tokenizer...
⚙️ Merge 744: (104, 726) -> 1000
⚙️ Merge 1744: (323, 1871) -> 2000
⚙️ Merge 2744: (949, 583) -> 3000
⚙️ Merge 3744: (3999, 1033) -> 4000
💾 my_tokenizer_v1 saved to my_basic_tokenizer.model and my_basic_tokenizer.vocab


#### My Tokenizer v2 Train

In [93]:
my_tokenizer = MyTokenizer(vocab_size=5000, pattern=GPT4_SPLIT_PATTERN)
# Load the tokenizer from files
my_tokenizer.load('my_tokenizer')

# Train the tokenizer on a text file
with open('data_code.txt', 'r', encoding='utf-8') as f:
    my_tokenizer.train(f.read())

# Save the tokenizer vocab and encoder files
my_tokenizer.save('my_tokenizer', version_name='my_tokenizer_v2')

🔄 Loading tokenizer...
⚠️ Model file 'my_tokenizer.model' not found. It will be created after training.
🛠️ Training tokenizer...
⚙️ Merge 744: (32, 76) -> 1000
⚙️ Merge 1744: (1999, 121) -> 2000
⚙️ Merge 2744: (363, 2500) -> 3000
⚙️ Merge 3744: (3999, 305) -> 4000
💾 my_tokenizer_v2 saved to my_tokenizer.model and my_tokenizer.vocab


### Let's compare 3 differents encoding

✖️ Vocabularies

✔️ Vocabularies (GPT2 Pattern)

☑️ Vocabularies Ameliored (GPT4 Pattern)

#### Load the tokenizer

In [94]:
# Load the tokenizer
tokenizer_without_files = MyTokenizer()
my_basic_tokenizer = MyTokenizer(vocab_size=5000)
my_basic_tokenizer.load('my_basic_tokenizer')
my_tokenizer = MyTokenizer(vocab_size=5000, pattern=GPT4_SPLIT_PATTERN)
my_tokenizer.load('my_tokenizer')

🔄 Loading tokenizer...
📜 Version: my_tokenizer_v1
🔄 Loading tokenizer...
📜 Version: my_tokenizer_v2


#### Comparaison

In [95]:
str = "Hello Tokenizer's Tokenizer’s TOKENIZER'S Tokenizer'S"

# ✖️ Vocabularies
encoded = tokenizer_without_files.encode(str)
decoded = tokenizer_without_files.decode(encoded)
print(f"Size: {len(encoded)}")
print("✖️ Encoded:", encoded)
print("Decoded:", decoded)

print('-' * 20)

# ✔️ Vocabularies (GPT2 Pattern)
encoded = my_basic_tokenizer.encode(str)
decoded = my_basic_tokenizer.decode(encoded)
print(f"Size: {len(encoded)}")
print("✔️ Encoded:", encoded)
print("Decoded:", decoded)

print('-' * 20)

# ☑️ Vocabularies Ameliored (GPT4 Pattern)
encoded = my_tokenizer.encode(str)
decoded = my_tokenizer.decode(encoded)
print(f"Size: {len(encoded)}")
print("☑️ Encoded:", encoded)
print("Decoded:", decoded)

Size: 55
✖️ Encoded: [72, 101, 108, 108, 111, 32, 84, 111, 107, 101, 110, 105, 122, 101, 114, 39, 115, 32, 84, 111, 107, 101, 110, 105, 122, 101, 114, 226, 128, 153, 115, 32, 84, 79, 75, 69, 78, 73, 90, 69, 82, 39, 83, 32, 84, 111, 107, 101, 110, 105, 122, 101, 114, 39, 83]
Decoded: Hello Tokenizer's Tokenizer’s TOKENIZER'S Tokenizer'S
--------------------
Size: 36
✔️ Encoded: [72, 463, 321, 330, 1513, 286, 1470, 290, 39, 115, 330, 1513, 286, 1470, 290, 3988, 153, 115, 330, 79, 75, 69, 78, 73, 90, 69, 82, 39, 83, 330, 1513, 286, 1470, 290, 39, 83]
Decoded: Hello Tokenizer's Tokenizer’s TOKENIZER'S Tokenizer'S
--------------------
Size: 36
☑️ Encoded: [72, 472, 321, 332, 1743, 285, 1609, 289, 39, 115, 332, 1743, 285, 1609, 289, 4584, 153, 115, 332, 79, 75, 69, 78, 73, 90, 69, 82, 39, 83, 332, 1743, 285, 1609, 289, 39, 83]
Decoded: Hello Tokenizer's Tokenizer’s TOKENIZER'S Tokenizer'S


In [96]:
python_code = """
from tensorflow.python.util.compat import collections_abc
from tensorflow.python.util.deprecation import deprecated_args
from tensorflow.python.util.tf_export import kwarg_only
from tensorflow.python.util.tf_export import tf_export

_T = TypeVar("_T")
GraphType = TypeVar("GraphType", bound="Graph")
OpStatsType = TypeVar("OpStatsType", bound="OpStats")
OperationType = TypeVar("OperationType", bound="Operation")
EagerTensorType = TypeVar("EagerTensorType", bound="_EagerTensorBase")


# TODO(b/307794935): Remove after bug is fixed.
is_oss = True  # Updated by copybara

# Temporary global switches determining if we should enable the work-in-progress
# calls to the C API. These will be removed once all functionality is supported.
_USE_C_API: bool = True
_USE_C_SHAPES: bool = True


_api_usage_gauge = monitoring.BoolGauge(
    "/tensorflow/api/ops_eager_execution",
    "Whether ops.enable_eager_execution() is called.")

_control_flow_api_gauge = monitoring.BoolGauge(
    "/tensorflow/api/enable_control_flow_v2",
    "Whether enable_control_flow_v2() is called.")

_tf_function_api_gauge = monitoring.BoolGauge(
    "/tensorflow/api/tf_function",
    "Whether tf.function() is used.")

# pylint: disable=protected-access
_DTYPES_INTERN_TABLE: dict[types_pb2.DataType, dtypes.DType] = (
    dtypes._INTERN_TABLE)
# pylint: enable=protected-access


def tensor_id(tensor) -> Any:
  ""Returns a unique identifier for this Tensor.""
  return tensor._id  # pylint: disable=protected-access


class _UserDeviceSpec(object):
  ""Store user-specified device and provide computation of merged device.""

  def __init__(self, device_name_or_function) -> None:
    self._device_name_or_function = device_name_or_function
    self.display_name = str(self._device_name_or_function)
    self.function = device_name_or_function
    self.raw_string = None

    if isinstance(device_name_or_function, pydev.MergeDevice):
      self.is_null_merge = device_name_or_function.is_null_merge

    elif callable(device_name_or_function):
      self.is_null_merge = False
      dev_func = self._device_name_or_function
      func_name = function_utils.get_func_name(dev_func)
      func_code = function_utils.get_func_code(dev_func)
      if func_code:
        fname = func_code.co_filename
        lineno = func_code.co_firstlineno
      else:
        fname = "unknown"
        lineno = -1
      self.display_name = "%s<%s, %d>" % (func_name, fname, lineno)

    elif device_name_or_function is None:
      # NOTE(taylorrobie): This MUST be False. None signals a break in the
      #   device stack, so `is_null_merge` must be False for such a case to
      #   allow callers to safely skip over null merges without missing a None.
      self.is_null_merge = False

    else:
      self.raw_string = device_name_or_function
      self.function = pydev.merge_device(device_name_or_function)
      self.is_null_merge = self.function.is_null_merge

    # We perform this check in __init__ because it is of non-trivial cost,
    # and self.string_merge is typically called many times.
    self.fast_string_merge = isinstance(self.function, pydev.MergeDevice)

  def string_merge(self, node_def) -> str:
    if self.fast_string_merge:
      return self.function.shortcut_string_merge(node_def)

    return compat.as_str(_device_string(self.function(node_def)))

"""

# ✔️ Vocabularies
encoded = my_basic_tokenizer.encode(python_code)
decoded = my_basic_tokenizer.decode(encoded)
print(f"Size: {len(encoded)}")
print("✔️ Encoded:", encoded)

print('-' * 20)

# ☑️ Vocabularies Ameliored
encoded = my_tokenizer.encode(python_code)
decoded = my_tokenizer.decode(encoded)
print(f"Size: {len(encoded)}")
print("☑️ Encoded:", encoded)

Size: 1527
✔️ Encoded: [10, 102, 424, 426, 525, 46, 497, 572, 46, 117, 777, 46, 613, 300, 298, 571, 582, 115, 95, 1848, 99, 10, 102, 424, 426, 525, 46, 497, 572, 46, 117, 777, 46, 263, 1648, 298, 571, 509, 1194, 95, 846, 115, 10, 102, 424, 426, 525, 46, 497, 572, 46, 117, 777, 46, 116, 102, 95, 453, 672, 298, 571, 2757, 846, 95, 258, 415, 10, 102, 424, 426, 525, 46, 497, 572, 46, 117, 777, 46, 116, 102, 95, 453, 672, 298, 571, 259, 102, 95, 453, 672, 10, 10, 95, 84, 32, 61, 1978, 375, 526, 95, 84, 34, 41, 10, 469, 676, 32, 61, 1978, 375, 40, 34, 469, 676, 34, 44, 317, 1859, 61, 34, 71, 302, 34, 41, 10, 2420, 676, 32, 61, 1978, 375, 40, 34, 2420, 676, 34, 44, 317, 1859, 61, 34, 524, 1492, 34, 41, 10, 459, 676, 32, 61, 1978, 375, 40, 34, 459, 676, 34, 44, 317, 1859, 61, 34, 79, 385, 34, 41, 10, 759, 676, 32, 61, 1978, 375, 40, 34, 759, 676, 34, 44, 317, 1859, 1081, 95, 759, 1988, 34, 41, 10, 10, 10, 35, 1082, 79, 40, 98, 47, 4084, 53, 41, 58, 1079, 1158, 270, 1103, 833, 103, 32, 275, 408

### Test the new pattern used for GPT4o

The name's called o200k_base, it used 200k of tokens for vocabularies. Here we used a little part of data for the trainning with only 5k tokens

#### My Tokenizer v3 Train

In [97]:
my_advanced_tokenizer = MyTokenizer(vocab_size=5000, pattern=GPT4O_SPLIT_PATTERN)
# Load the tokenizer from files
my_advanced_tokenizer.load('my_advanced_tokenizer')

# Train the tokenizer on a text file
with open('data_code.txt', 'r', encoding='utf-8') as f:
    my_advanced_tokenizer.train(f.read())

# Save the tokenizer vocab and encoder files
my_advanced_tokenizer.save('my_advanced_tokenizer', version_name='my_tokenizer_v3')

🔄 Loading tokenizer...
⚠️ Model file 'my_advanced_tokenizer.model' not found. It will be created after training.
🛠️ Training tokenizer...
⚙️ Merge 744: (46, 357) -> 1000
⚙️ Merge 1744: (67, 1367) -> 2000
⚙️ Merge 2744: (2154, 34) -> 3000
⚙️ Merge 3744: (1762, 417) -> 4000
💾 my_tokenizer_v3 saved to my_advanced_tokenizer.model and my_advanced_tokenizer.vocab


In [101]:
text = "Ｕｎｉｃｏｄｅ! 🅤🅝🅘🅒🅞🅓🅔‽ 🇺‌🇳‌🇮‌🇨‌🇴‌🇩‌🇪! 😄 The very name strikes fear and awe into the hearts of programmers worldwide. We all know we ought to “support Unicode” in our software (whatever that means—like using wchar_t for all the strings, right?). But Unicode can be abstruse, and diving into the thousand-page Unicode Standard plus its dozens of supplementary annexes, reports, and notes can be more than a little intimidating. I don’t blame programmers for still finding the whole thing mysterious, even 30 years after Unicode’s inception."


# ✔️ Vocabularies
encoded = my_basic_tokenizer.encode(text)
decoded = my_basic_tokenizer.decode(encoded)
print(f"Size: {len(encoded)}")
print("✔️ Encoded:", encoded)

print('-' * 20)

# ☑️ Vocabularies Ameliored
encoded = my_tokenizer.encode(text)
decoded = my_tokenizer.decode(encoded)
print(f"Size: {len(encoded)}")
print("☑️ Encoded:", encoded)

print('-' * 20)

# ☑️☑️ Vocabularies Advanced
encoded = my_advanced_tokenizer.encode(text)
decoded = my_advanced_tokenizer.decode(encoded)
print(f"Size: {len(encoded)}")
print("☑️☑️ Encoded:", encoded)

Size: 329
✔️ Encoded: [239, 188, 181, 239, 189, 142, 239, 189, 137, 239, 189, 131, 239, 189, 143, 239, 189, 132, 239, 189, 133, 33, 32, 240, 159, 133, 164, 240, 159, 133, 157, 240, 159, 133, 152, 240, 159, 133, 146, 240, 159, 133, 158, 240, 159, 133, 147, 240, 159, 133, 148, 3988, 189, 32, 240, 159, 135, 186, 3988, 140, 240, 159, 135, 179, 3988, 140, 240, 159, 135, 174, 3988, 140, 240, 159, 135, 168, 3988, 140, 240, 159, 135, 180, 3988, 140, 240, 159, 135, 169, 3988, 140, 240, 159, 135, 170, 33, 32, 240, 159, 152, 132, 407, 101, 32, 2044, 32, 289, 489, 105, 1464, 292, 101, 375, 342, 100, 270, 1469, 299, 408, 274, 101, 596, 101, 375, 468, 281, 102, 1765, 276, 114, 115, 4318, 108, 100, 119, 2095, 46, 944, 101, 270, 430, 1491, 1561, 311, 101, 972, 1671, 116, 259, 111, 32, 3988, 156, 115, 510, 672, 1262, 105, 1275, 3988, 157, 32, 262, 281, 306, 4023, 1550, 32, 40, 119, 393, 267, 427, 274, 300, 2340, 115, 3988, 148, 390, 733, 412, 828, 311, 465, 375, 95, 116, 292, 261, 270, 430, 274, 101, 6