# Learning TikToken

- tiktoken is a fast open-source tokenizer by OpenAI.

In [1]:
import tiktoken

In [2]:
# Report which tiktoken release this notebook was executed with.
print(f"Version of TikToken: {tiktoken.__version__}")

Version of TikToken: 0.9.0


In [3]:
# Name of the tiktoken encoding loaded by the cells below.
# Alternatives to try instead: "r50k_base", "p50k_base", "cl100k_base".
ENCODING_NAME: str = "o200k_base"

### When you want to use a new encoding, you must wait until the related files are downloaded!

- Downloading the file(s) for each encoding takes about 3 to 6 minutes!

In [4]:
# Sample input reused by the following cells.
text: str = "Hello, World!"

# Load the configured encoding, then sanity-check that encoding followed by
# decoding gives back exactly the original string.
encoding = tiktoken.get_encoding(encoding_name=ENCODING_NAME)
round_trip_text = encoding.decode(tokens=encoding.encode(text=text))
assert round_trip_text == text

In [5]:
# Tokenize the sample text into a list of integer token ids.
# `encoding` and `text` are the objects created in the previous cell, so the
# encoding is not loaded a second time here.
tokens: list[int] = encoding.encode(text=text)

print("Tokens:", tokens)

Tokens: [13225, 11, 5922, 0]


In [6]:
# Number of tokens the sample text was split into.
print(f"Tokens count: {len(tokens)}")

Tokens count: 4


In [7]:
# Look up the raw bytes behind the first token of the sample text.
first_token = tokens[0]
decoded = encoding.decode_single_token_bytes(token=first_token)

print(decoded)

b'Hello'


In [8]:
# One row per token: the token id left-aligned in a 10-character column,
# followed by the raw bytes that id maps to.
for token in tokens:
    decoded_token_bytes = encoding.decode_single_token_bytes(token=token)
    print(f"{token:<10}", decoded_token_bytes)

13225      b'Hello'
11         b','
5922       b' World'
0          b'!'


In [9]:
# One row per token: token id, a printable string form of the token (spaces
# shown as "_" so leading whitespace is visible), and the raw bytes.
for token in tokens:
    decoded_token_bytes = encoding.decode_single_token_bytes(token=token)

    # A single token may hold only part of a multi-byte UTF-8 character, so a
    # strict decode can raise UnicodeDecodeError once `text` contains
    # non-ASCII characters. errors="replace" renders such fragments as U+FFFD
    # instead of aborting the loop; pure-ASCII tokens decode exactly as before.
    decoded_token_string = decoded_token_bytes.decode(
        encoding="utf-8", errors="replace"
    ).replace(" ", "_")

    print(
        str(token).ljust(10, " "),
        decoded_token_string.ljust(10, " "),
        decoded_token_bytes,
    )

13225      Hello      b'Hello'
11         ,          b','
5922       _World     b' World'
0          !          b'!'


In [10]:
# Look up the encoding by *model* name instead of by encoding name.
# The model name gets its own constant: rebinding ENCODING_NAME to a model
# name would make the earlier tiktoken.get_encoding(encoding_name=ENCODING_NAME)
# cells fail when re-run, because a model name is not an encoding name.
MODEL_NAME: str = "gpt-4o-mini"

encoding = tiktoken.encoding_for_model(model_name=MODEL_NAME)
encoded_text = tokens = encoding.encode(text=text)

print(encoded_text)

[13225, 11, 5922, 0]


In [11]:
# Same lookup by *model* name, this time for "gpt-4o".
# The model name gets its own constant: rebinding ENCODING_NAME to a model
# name would make the earlier tiktoken.get_encoding(encoding_name=ENCODING_NAME)
# cells fail when re-run, because a model name is not an encoding name.
MODEL_NAME: str = "gpt-4o"

encoding = tiktoken.encoding_for_model(model_name=MODEL_NAME)
encoded_text = tokens = encoding.encode(text=text)

print(encoded_text)

[13225, 11, 5922, 0]


In [12]:
# Ask tiktoken for the names of the encodings it knows about.
encoding_names_list: list[str] = tiktoken.list_encoding_names()

print(encoding_names_list)

['gpt2', 'r50k_base', 'p50k_base', 'p50k_edit', 'cl100k_base', 'o200k_base']
