In [1]:
import wandb; wandb.login()
from transformers import GPT2TokenizerFast, GPTNeoForCausalLM, GPTNeoConfig
from transformers import TrainingArguments, Trainer, DataCollatorForLanguageModeling
from datasets import load_dataset, load_from_disk
from transformers import RobertaForCausalLM

# Configuration of a deliberately tiny GPT-Neo model.
config = GPTNeoConfig(
    # --- vocabulary / embeddings ---------------------------------------
    # 10'000 token ids, each embedded as a 512-dimensional vector,
    # i.e. an embedding table of 10'000 x 512 parameters.
    vocab_size=10_000,
    hidden_size=512,
    # number of positions the model has position embeddings for
    # (the maximum sequence length)
    max_position_embeddings=512,

    # --- transformer stack ---------------------------------------------
    # The ["global", "local"] pattern repeated once describes two layers,
    # so it has to agree with `num_layers` (GPT-Neo-specific).
    attention_types=[[["global", "local"], 1]],
    num_layers=2,
    # attention heads per layer
    num_heads=4,
    # window used by the "local" attention layers (GPT-Neo-specific)
    window_size=256,
    # size of the 'up-projection' layer in the feed-forward network
    intermediate_size=1024,
)

[34m[1mwandb[0m: Currently logged in as: [33mr-bragamedeirosmotaborges[0m ([33mtiny-transformers[0m). Use [1m`wandb login --relogin`[0m to force relogin


In [5]:
# The tokenizer's maximum sequence length has to follow the model's positional
# capacity (`max_position_embeddings`), not its embedding width (`hidden_size`).
# Both happen to be 512 in this config, which is the only reason passing
# `hidden_size` here gave the right number.
tokenizer = GPT2TokenizerFast.from_pretrained(
    'EleutherAI/gpt-neo-125m',
    model_max_length=config.max_position_embeddings,
)

# in theory, window_size x num_layers is possible.
# But, this project is not about the efficacy of sliding window attention (though it could be!)
assert tokenizer.model_max_length == config.max_position_embeddings
# assert tokenizer.vocab_size == config.vocab_size

# printing this because of a bug in tokenizers (should be fixed now) https://github.com/huggingface/transformers/issues/26500
print(f'padding token is {tokenizer.pad_token}')
# HF wasn't saving this nor the tokenizer's pad_token.
# NOTE: the print above shows `None` for this checkpoint, so this assignment
# currently stores `None` as the model's pad_token_id.
config.pad_token_id = tokenizer.pad_token_id

padding token is None


In [8]:
import json

from tokenizers import models

# Snapshot of the model behind the fast tokenizer, decoded from its JSON state.
# The keys used below are 'type', 'vocab' (token -> id) and 'merges'.
tokenizer_state = json.loads(tokenizer.backend_tokenizer.model.__getstate__())

n_words = len(tokenizer_state['vocab'])
print(f'vocabulary: {n_words}') # would be lovely if this was 10'000

# index n_words - 1 contains the special eos token `<|endoftext|>`
# NOTE(review): eos keeps its original id (n_words - 1), which is larger than
# the 10'000 ids kept here -- confirm it is remapped before training.
new_vocab = {k: v for k, v in tokenizer_state['vocab'].items() if v < 10_000-1 or v == n_words-1}
tokenizer_state['vocab'] = new_vocab

# you can see that most common tokens are listed first (individual characters, pairs of chars, triples, etc.)
print(f'new vocab : {len(new_vocab)}, {list(new_vocab.keys())[:3]}, {list(new_vocab.keys())[-3:]}')


# Updating the tokenizer with new vocab: of course this doesn't work the first try, for some reason.
# 'type' names the model class inside `tokenizers.models`; it is popped so the
# remaining keys can be passed to that class as keyword arguments.
model_class = getattr(models, tokenizer_state.pop('type'))

# Passing the state as-is fails with: 'str' object cannot be converted to 'PyTuple'
# tokenizer.backend_tokenizer.model = model_class(**tokenizer_state)

# The model class wants each merge as a tuple. Depending on the installed
# `tokenizers` release the state holds merges either as "left right" strings
# or as two-element lists, so both forms are accepted here.
new_merges = [
    tuple(m.split()) if isinstance(m, str) else tuple(m)
    for m in tokenizer_state['merges']
]
print(f'new merges: {len(new_merges)}, {new_merges[:3]}, {new_merges[-3:]}') # Ġ means space ' '
tokenizer_state['merges'] = new_merges

# Still fails at this point, because the merges list is not truncated yet:
# tokenizer.backend_tokenizer.model = model_class(**tokenizer_state) # Token `ordon` out of vocabulary

vocabulary: 50257
new vocab : 10000, ['!', '"', '#'], ['oret', 'ths', '<|endoftext|>']
new merges: 50000, [('Ġ', 't'), ('Ġ', 'a'), ('h', 'e')], [('ĠColl', 'ider'), ('Ġinform', 'ants'), ('Ġg', 'azed')]


KeyboardInterrupt: 

In [17]:
# Number of leading merges to keep. Using 3000-256 instead makes the model
# constructor fail with: "Token `ordon` out of vocabulary".
merge_budget = 3000 - 257
new_merges = new_merges[:merge_budget]
print(f'new merges: {len(new_merges)}, {new_merges[:3]}, {new_merges[-3:]}') # Ġ means space ' '
tokenizer_state['merges'] = new_merges

# Rebuild the backend model from the trimmed state and swap it into the tokenizer.
trimmed_model = model_class(**tokenizer_state)
tokenizer.backend_tokenizer.model = trimmed_model
print(f'our tokenizer now has the {len(tokenizer)} most frequent tokens (from whatever dataset it was trained on)')


new merges: 2743, [('Ġ', 't'), ('Ġ', 'a'), ('h', 'e')], [('6', '5'), ('Ġb', 'illion'), ('0', '7')]
our tokenizer now has the 10000 most frequent tokens (from whatever dataset it was trained on)


In [19]:
# Round-trip a string through encode/decode to check that the trimmed
# tokenizer still reproduces text it has no whole-word tokens for.
print(tokenizer.decode(tokenizer.encode('hello worldaisudhgiashg asdugh')))

# NOTE: Saving hangs indefinitely, I don't know why and it requires me to look at the rust impl.
# instead, I just ended up modifying the merges.txt and vocab.json files manually, following
# exactly what I did above.
tokenizer.name_or_path = '10k-tokenizer'
# tokenizer.save_pretrained('10k-tokenizer')
# tokenizer.save_vocabulary('.', '10k-tokenizer.json')

# Kept as the last expression so the notebook actually displays it; as a bare
# expression in the middle of the cell its value was silently discarded.
tokenizer.vocab_size


hello worldaisudhgiashg asdugh


KeyboardInterrupt: 

In [18]:
import os

new_merges = new_merges[:3000-257] # 256 says: "Token `ordon` out of vocabulary"
tokenizer_state['merges'] = new_merges

# Write the merges in the GPT-2 style `merges.txt` layout: a `#version` header
# line followed by one "left right" pair per line.
# - the header matters because loaders that skip the first line unconditionally
#   would otherwise drop the first real merge;
# - utf-8 is explicit because tokens contain non-ASCII characters such as 'Ġ';
# - `with` closes the file even if writing fails half-way.
os.makedirs("3k-tok-bpe", exist_ok=True)
with open("3k-tok-bpe/merges.txt", "w", encoding="utf-8") as merge_file:
    merge_file.write("#version: 0.2\n")
    merge_file.writelines(f"{left} {right}\n" for left, right in new_merges)


In [35]:
import json
import os

new_merges = new_merges[:3000-257]
tokenizer_state['merges'] = new_merges
tokenizer.backend_tokenizer.model = model_class(**tokenizer_state)

# Start from every token with id below 2999, plus the eos token at id n_words - 1.
new_vocab = {k: v for k, v in tokenizer_state['vocab'].items() if v < 3_000-1 or v == n_words-1}

# Re-home four tokens at ids 2995-2998 so that ids 0-3 can be given to the
# special tokens below.
new_vocab["!"] = 2995
new_vocab['"'] = 2996
new_vocab["#"] = 2997
new_vocab["$"] = 2998

# Drop four regular tokens -- presumably the previous owners of ids 2995-2998
# (checked by the assertion further down).
# NOTE(review): the merges list printed earlier still ends with the merges that
# build "65", "Ġbillion" and "07"; a merges file used together with this vocab
# must not contain merges for the tokens removed here -- confirm.
new_vocab.pop("Ġevents")
new_vocab.pop("65")
new_vocab.pop("Ġbillion")
new_vocab.pop("07")

# Special tokens take the lowest ids and the last one; the GPT-2 eos token is dropped.
new_vocab["<pad>"] = 0
new_vocab["<s>"] = 1
new_vocab["</s>"] = 2
new_vocab["<unk>"] = 3
new_vocab["<mask>"] = 2999
new_vocab.pop("<|endoftext|>")
new_vocab = dict(sorted(new_vocab.items(), key=lambda item: item[1]))

# Every id must be used exactly once, otherwise two tokens share an id or the
# id range has a hole in it.
assert list(new_vocab.values()) == list(range(len(new_vocab))), \
    "vocab ids are not a contiguous 0..N-1 range"

# `str(dict)` produces a Python repr (single quotes), which is not valid JSON,
# so the mapping is serialised with `json.dump`. `ensure_ascii=False` together
# with utf-8 keeps tokens such as 'Ġthe' readable in the file.
os.makedirs("3k-tok-bpe", exist_ok=True)
with open("3k-tok-bpe/vocab.json", "w", encoding="utf-8") as vocab_file:
    json.dump(new_vocab, vocab_file, ensure_ascii=False)