In [7]:
from transformers import AutoTokenizer
from collections import defaultdict

In [20]:
# For every model, collect the set of normalized surface forms (decoded,
# whitespace-stripped, lower-cased) of all ids in its tokenizer vocabulary.
words = defaultdict(set)
model_names = [
    'EleutherAI/gpt-neo-125m', 'EleutherAI/gpt-neo-1.3B', 'EleutherAI/gpt-neo-2.7B', 'EleutherAI/gpt-j-6b',
    'bert-base-uncased', 'bert-large-uncased',
    'roberta-base', 'roberta-large',
    'albert-base-v1', 'albert-large-v1', 'albert-xlarge-v1',
    'albert-base-v2', 'albert-large-v2', 'albert-xlarge-v2',
]
for model_name in model_names:
    print(model_name)
    tok = AutoTokenizer.from_pretrained(model_name)
    # get_vocab() is the documented token -> id accessor shared by all
    # tokenizer classes; the `.vocab` attribute is not guaranteed to exist on
    # every implementation. Only the ids are needed: each id is decoded so the
    # tokenizer-specific markers in the raw token strings are resolved by the
    # tokenizer itself before normalization.
    for token_id in tok.get_vocab().values():
        words[model_name].add(tok.decode(token_id).strip().lower())

EleutherAI/gpt-neo-125m
EleutherAI/gpt-neo-1.3B
EleutherAI/gpt-neo-2.7B
EleutherAI/gpt-j-6b
bert-base-uncased
bert-large-uncased
roberta-base
roberta-large
albert-base-v1
albert-large-v1
albert-xlarge-v1
albert-base-v2
albert-large-v2
albert-xlarge-v2


In [21]:
# Print the number of distinct normalized tokens per model, one line per
# model, in the order the models were inserted into `words`.
for model_vocab in words.values():
    print(len(model_vocab))

32762
32762
32762
32905
30522
30522
32770
32770
26589
26589
26589
26589
26589
26589


In [22]:
# Tokens present in every model's normalized vocabulary.
#
# The intersection is taken over all vocabularies in a single call rather than
# with an "is the accumulator still empty?" check: an empty accumulator is also
# a legitimate intermediate result (no common tokens so far), and treating it
# as "not started" would silently restart the intersection from the next
# vocabulary. set.intersection also returns a new set, so the result never
# aliases one of the sets stored in `words`.
# The guard keeps the empty-`words` case well defined (empty set) because
# set.intersection() requires at least one set argument.
vocab_intersection = set.intersection(*words.values()) if words else set()

In [23]:
# Display the size of the intersection computed from the vocabularies in `words`.
len(vocab_intersection)

16353

In [86]:
import tiktoken
# Model name used to select the tiktoken encoding below.
target_model = 'gpt-3.5-turbo'
# Look up the encoding that tiktoken associates with that model name.
encoding = tiktoken.encoding_for_model(target_model)

In [109]:
# Collect the normalized (decoded, stripped, lower-cased) text of every token
# id in `encoding`, except for a fixed set of ids that is skipped outright.
# NOTE(review): presumably the skipped ids cannot be decoded by this encoding
# (unassigned slots) -- confirm against the tiktoken version in use.
skipped_ids = {100256, *range(100261, 100276)}  # 100256 and 100261..100275

chatgpt_vocab = set()
for token_id in range(encoding.n_vocab):
    if token_id in skipped_ids:
        continue
    token_text = encoding.decode([token_id])
    chatgpt_vocab.add(token_text.strip().lower())

In [110]:
# Display the number of distinct normalized token strings in `chatgpt_vocab`.
len(chatgpt_vocab)

62643

In [111]:
# Keep only the tokens of the cross-model intersection that also occur in the
# normalized tiktoken vocabulary.
final_vocab_intersection = vocab_intersection.intersection(chatgpt_vocab)

In [114]:
# Convert to a list so the result is JSON-serializable. Sorting gives a stable
# order: iterating a set of strings depends on per-process string hashing, so
# a plain list(...) would produce a differently ordered file on every run.
final_vocab_intersection = sorted(final_vocab_intersection)

In [115]:
import json

# Serialize the final token list as a JSON array and write it to disk.
serialized_vocab = json.dumps(final_vocab_intersection)
with open('vocab_intersection_including_chatgpt.json', 'w') as out_file:
    out_file.write(serialized_vocab)