In [1]:
# Enable IPython's autoreload extension. Mode 2 reloads imported modules
# before each cell runs, so edits to local .py files are picked up without
# restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
from concurrent.futures import ThreadPoolExecutor
import multiprocessing as mp

from datasets import load_dataset
import tiktoken
from transformers import AutoTokenizer

# Worker-thread count for the ThreadPoolExecutor used in the tokenizer
# comparison further down; defaults to the machine's CPU count (adjust as needed).
NUM_CPUS = mp.cpu_count()

In [3]:
# Hugging Face tokenizers for the two models under comparison.
llama_tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B-Instruct")
gpt_4_tokenizer = AutoTokenizer.from_pretrained("Xenova/gpt-4")

# Vocabulary dictionaries, keyed by token string.
llama_vocab = llama_tokenizer.get_vocab()
gpt_4_vocab = gpt_4_tokenizer.get_vocab()

# Vocabulary sizes as (Llama 3, GPT-4).
tuple(map(len, (llama_vocab, gpt_4_vocab)))

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


(128256, 100263)

In [4]:
# Collect the tokens that appear in exactly one of the two vocabularies.
# Each list keeps the iteration order of the vocab dict it was drawn from.
llama_only = [token for token in llama_vocab if token not in gpt_4_vocab]
gpt_4_only = [token for token in gpt_4_vocab if token not in llama_vocab]

print(f"LLAMA only: {len(llama_only)}")
print(f"GPT-4 only: {len(gpt_4_only)}")

LLAMA only: 28000
GPT-4 only: 7


In [5]:
# Show the GPT-4-only tokens next to the tokenizer's declared special tokens
# to see how the two relate.
gpt_4_only, gpt_4_tokenizer.special_tokens_map_extended

(['<|im_end|>',
  '<|fim_middle|>',
  '<|endoftext|>',
  '<|fim_suffix|>',
  '<|fim_prefix|>',
  '<|im_start|>',
  '<|endofprompt|>'],
 {'bos_token': AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
  'eos_token': AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
  'unk_token': AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True)})

In [6]:
# Token strings present in both vocabularies (dict key views support set operators).
intersection = llama_vocab.keys() & gpt_4_vocab.keys()
print(f"Intersection: {len(intersection)}")

Intersection: 100256


## Compare GPT-4 to tiktoken

In [7]:
# Load one corpus per text type so the tokenizers are compared on English
# prose, Korean prose and Python source code.
split = "train"
english_dataset = load_dataset("wikitext", "wikitext-103-raw-v1", split=split)
korean_dataset = load_dataset("lcw99/wikipedia-korean-20221001", split=split)
# NOTE(review): trust_remote_code=True allows the dataset repository's loading
# script to execute locally; confirm this source is trusted.
code_dataset = load_dataset("code_search_net", "python", split=split, trust_remote_code=True)
# Give the code corpus the same "text" column name as the other two.
code_dataset = code_dataset.rename_column("whole_func_string", "text")
print(len(english_dataset), len(korean_dataset), len(code_dataset))
print(len(english_dataset) + len(korean_dataset) + len(code_dataset))

n = 100000  # maximum number of texts sampled from each corpus
SHUFFLE_SEED = 42  # fixed seed so the sample is reproducible


def sample_texts(dataset, max_samples, seed=SHUFFLE_SEED):
    """Return the "text" values of up to `max_samples` rows from a seeded shuffle.

    Args:
        dataset: A dataset object exposing `shuffle`, `select`, `__len__` and a
            "text" column (here: the Hugging Face datasets loaded above).
        max_samples: Upper bound on the number of rows kept; the whole dataset
            is used when it has fewer rows.
        seed: Seed handed to `dataset.shuffle` so the selection is repeatable.

    Returns:
        A list with the "text" value of each selected row, in shuffled order.
    """
    n_rows = min(max_samples, len(dataset))
    # list() keeps `+` concatenation working even if the column accessor
    # returns a non-list sequence type.
    return list(dataset.shuffle(seed=seed).select(range(n_rows))["text"])


# Concatenation order matters to later cells: English samples come first,
# then Korean, then code.
final_dataset = (
    sample_texts(english_dataset, n)
    + sample_texts(korean_dataset, n)
    + sample_texts(code_dataset, n)
)
print(f"{len(final_dataset)=}")

Downloading data:   0%|          | 0.00/375M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/188M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/162M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/607256 [00:00<?, ? examples/s]

1801350 607256 412178
2820784
len(final_dataset)=300000


In [8]:
# English samples occupy indices [0, min(n, len(english_dataset))), so this
# index is the first Korean sample; show its first 50 characters.
final_dataset[min(n, len(english_dataset))][:50]

'왕종린(, 1961년 ~ )은 미국의 중국계 물리학자로 중국 과학원 외국계 원사이다. 해양'

In [10]:
# Reference implementation to compare against: tiktoken's "cl100k_base" encoding.
gpt_4_tiktoken_tokenizer = tiktoken.get_encoding("cl100k_base")


def check_tokenizers_worker(test_string):
    """Report whether both GPT-4 tokenizer implementations agree on one text.

    Args:
        test_string: The text to encode with each tokenizer.

    Returns:
        True when `gpt_4_tokenizer.encode` and `gpt_4_tiktoken_tokenizer.encode`
        return equal results for `test_string`, False otherwise.

    NOTE(review): tiktoken's `encode` presumably rejects text containing
    special-token strings under its default arguments — confirm the sampled
    texts cannot contain any.
    """
    return gpt_4_tokenizer.encode(test_string) == gpt_4_tiktoken_tokenizer.encode(test_string)


# Compare the two tokenizers on every sampled text using a thread pool.
with ThreadPoolExecutor(max_workers=NUM_CPUS) as executor:
    # Apply the function to each item in parallel; executor.map yields the
    # results in the same order as the inputs.
    results = list(executor.map(check_tokenizers_worker, final_dataset))

# True only if the two tokenizers agreed on every single text.
all(results)

True

So the conclusions (at least for the dataset sampled above) seem to be:

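1. The Hugging Face `Llama 3` and `GPT-4` tokenizers share 100256 token strings — every `GPT-4` entry except its 7 special tokens — so the first 100256 tokens of the two vocabularies seem to be the same (only token strings were compared here, not token IDs).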
1. Hugging Face's `GPT-4` tokenizer produces exactly the same token IDs as `tiktoken`'s `cl100k_base` encoding, at least for the dataset sampled above.
1. So `Llama 3` and `GPT-4` have very similar vocabularies, even though OpenAI never detailed how they trained their tokenizer.
1. What's going on???