In [1]:
from typing import Iterable

import tiktoken

In [3]:
# Load the "gpt2" encoding by name; the cells below reuse this `encoding` object.
encoding = tiktoken.get_encoding("gpt2")

In [4]:
# Turn a sample string into its list of integer token ids (displayed as the cell output).
encoding.encode("tiktoken is great!")

[83, 1134, 30001, 318, 1049, 0]

In [5]:
def num_tokens_from_string(string: str, encoding_name: str) -> int:
    """Count the tokens that `string` produces under the named encoding.

    Args:
        string: Text to tokenize.
        encoding_name: Name handed to `tiktoken.get_encoding` (e.g. "gpt2").

    Returns:
        The length of the token list returned by the encoding's `encode`.
    """
    tokenizer = tiktoken.get_encoding(encoding_name)
    return len(tokenizer.encode(string))

In [6]:
# Check the helper on the sample string; the count should equal the number of ids from the encode cell.
num_tokens_from_string("tiktoken is great!", "gpt2")

6

In [7]:
# Round trip: decoding the token ids from the encode cell gives back the original string.
encoding.decode([83, 1134, 30001, 318, 1049, 0])

'tiktoken is great!'

In [8]:
# Decode each token id on its own to see the bytes that individual token stands for.
list(map(encoding.decode_single_token_bytes, [83, 1134, 30001, 318, 1049, 0]))

[b't', b'ik', b'token', b' is', b' great', b'!']

In [9]:
def compare_encodings(
    example_string: str,
    encoding_names: Iterable[str] = ("gpt2", "p50k_base", "cl100k_base"),
) -> None:
    """Prints a comparison of how several encodings tokenize the same string.

    Args:
        example_string: Text to tokenize.
        encoding_names: Names passed to `tiktoken.get_encoding`, reported in the
            order given. Defaults to "gpt2", "p50k_base" and "cl100k_base", so a
            one-argument call compares those three.

    Returns:
        None. The report is written to stdout.
    """
    # print the example string
    print(f'\nExample string: "{example_string}"')
    # for each encoding, print the # of tokens, the token integers, and the token bytes
    for encoding_name in encoding_names:
        encoding = tiktoken.get_encoding(encoding_name)
        token_integers = encoding.encode(example_string)
        # Decode one token at a time (not the whole list) so the report shows the
        # bytes behind each individual token, including partial characters.
        token_bytes = [encoding.decode_single_token_bytes(token) for token in token_integers]
        print()
        print(f"{encoding_name}: {len(token_integers)} tokens")
        print(f"token integers: {token_integers}")
        print(f"token bytes: {token_bytes}")

In [10]:
# A single long English word: shows how each encoding breaks it into sub-word pieces.
compare_encodings("antidisestablishmentarianism")


Example string: "antidisestablishmentarianism"

gpt2: 5 tokens
token integers: [415, 29207, 44390, 3699, 1042]
token bytes: [b'ant', b'idis', b'establishment', b'arian', b'ism']

p50k_base: 5 tokens
token integers: [415, 29207, 44390, 3699, 1042]
token bytes: [b'ant', b'idis', b'establishment', b'arian', b'ism']

cl100k_base: 6 tokens
token integers: [519, 85342, 34500, 479, 8997, 2191]
token bytes: [b'ant', b'idis', b'establish', b'ment', b'arian', b'ism']


In [11]:
# A short arithmetic expression: compares how digits, operators and spaces are tokenized.
compare_encodings("2 + 2 = 4")


Example string: "2 + 2 = 4"

gpt2: 5 tokens
token integers: [17, 1343, 362, 796, 604]
token bytes: [b'2', b' +', b' 2', b' =', b' 4']

p50k_base: 5 tokens
token integers: [17, 1343, 362, 796, 604]
token bytes: [b'2', b' +', b' 2', b' =', b' 4']

cl100k_base: 7 tokens
token integers: [17, 489, 220, 17, 284, 220, 19]
token bytes: [b'2', b' +', b' ', b'2', b' =', b' ', b'4']


In [12]:
# Japanese text ("Happy birthday"): non-ASCII input, where single tokens can cover only part of a character's bytes.
compare_encodings("お誕生日おめでとう")


Example string: "お誕生日おめでとう"

gpt2: 14 tokens
token integers: [2515, 232, 45739, 243, 37955, 33768, 98, 2515, 232, 1792, 223, 30640, 30201, 29557]
token bytes: [b'\xe3\x81', b'\x8a', b'\xe8\xaa', b'\x95', b'\xe7\x94\x9f', b'\xe6\x97', b'\xa5', b'\xe3\x81', b'\x8a', b'\xe3\x82', b'\x81', b'\xe3\x81\xa7', b'\xe3\x81\xa8', b'\xe3\x81\x86']

p50k_base: 14 tokens
token integers: [2515, 232, 45739, 243, 37955, 33768, 98, 2515, 232, 1792, 223, 30640, 30201, 29557]
token bytes: [b'\xe3\x81', b'\x8a', b'\xe8\xaa', b'\x95', b'\xe7\x94\x9f', b'\xe6\x97', b'\xa5', b'\xe3\x81', b'\x8a', b'\xe3\x82', b'\x81', b'\xe3\x81\xa7', b'\xe3\x81\xa8', b'\xe3\x81\x86']

cl100k_base: 9 tokens
token integers: [33334, 45918, 243, 21990, 9080, 33334, 62004, 16556, 78699]
token bytes: [b'\xe3\x81\x8a', b'\xe8\xaa', b'\x95', b'\xe7\x94\x9f', b'\xe6\x97\xa5', b'\xe3\x81\x8a', b'\xe3\x82\x81', b'\xe3\x81\xa7', b'\xe3\x81\xa8\xe3\x81\x86']


In [13]:
# A Korean sentence: a second non-ASCII example, this time with spaces and a trailing period.
compare_encodings("모두가 인공지능에 대해 환호 할 때 그 한계를 한번 생각해 보자.")


Example string: "모두가 인공지능에 대해 환호 할 때 그 한계를 한번 생각해 보자."

gpt2: 76 tokens
token integers: [167, 103, 101, 167, 239, 238, 166, 108, 222, 23821, 251, 116, 166, 111, 113, 168, 100, 222, 167, 232, 98, 168, 245, 238, 31619, 234, 222, 47991, 112, 220, 169, 247, 246, 169, 246, 116, 220, 47991, 254, 31619, 243, 234, 220, 166, 115, 116, 220, 47991, 250, 166, 111, 226, 167, 98, 120, 220, 47991, 250, 167, 110, 230, 23821, 225, 251, 166, 108, 223, 47991, 112, 31619, 111, 112, 168, 252, 238, 13]
token bytes: [b'\xeb', b'\xaa', b'\xa8', b'\xeb', b'\x91', b'\x90', b'\xea', b'\xb0', b'\x80', b' \xec', b'\x9d', b'\xb8', b'\xea', b'\xb3', b'\xb5', b'\xec', b'\xa7', b'\x80', b'\xeb', b'\x8a', b'\xa5', b'\xec', b'\x97', b'\x90', b' \xeb', b'\x8c', b'\x80', b'\xed\x95', b'\xb4', b' ', b'\xed', b'\x99', b'\x98', b'\xed', b'\x98', b'\xb8', b' ', b'\xed\x95', b'\xa0', b' \xeb', b'\x95', b'\x8c', b' ', b'\xea', b'\xb7', b'\xb8', b' ', b'\xed\x95', b'\x9c', b'\xea', b'\xb3', b'\x84', b'\xeb', b'\xa5', b'\xbc', b

In [14]:
# An ordinary English sentence with proper nouns and an abbreviation, for comparison with the examples above.
compare_encodings("Olympia is the capital of the U.S. state of Washington and the county seat and largest city of Thurston County.")


Example string: "Olympia is the capital of the U.S. state of Washington and the county seat and largest city of Thurston County."

gpt2: 27 tokens
token integers: [46, 6760, 544, 318, 262, 3139, 286, 262, 471, 13, 50, 13, 1181, 286, 2669, 290, 262, 7968, 5852, 290, 4387, 1748, 286, 36975, 3743, 3418, 13]
token bytes: [b'O', b'lymp', b'ia', b' is', b' the', b' capital', b' of', b' the', b' U', b'.', b'S', b'.', b' state', b' of', b' Washington', b' and', b' the', b' county', b' seat', b' and', b' largest', b' city', b' of', b' Thur', b'ston', b' County', b'.']

p50k_base: 27 tokens
token integers: [46, 6760, 544, 318, 262, 3139, 286, 262, 471, 13, 50, 13, 1181, 286, 2669, 290, 262, 7968, 5852, 290, 4387, 1748, 286, 36975, 3743, 3418, 13]
token bytes: [b'O', b'lymp', b'ia', b' is', b' the', b' capital', b' of', b' the', b' U', b'.', b'S', b'.', b' state', b' of', b' Washington', b' and', b' the', b' county', b' seat', b' and', b' largest', b' city', b' of', b' Thur', b'ston', b' County'