In [1]:
from ipywidgets import Layout, interact, interactive, fixed, interact_manual, widgets
from IPython.display import display

In [2]:
import pandas as pd
from pprint import pprint

In [1]:
import re, collections

def get_stats(vocab):
  pairs = collections.defaultdict(int)
  for word, freq in vocab.items():
    symbols = word.split()
    for i in range(len(symbols)-1):
      pairs[symbols[i],symbols[i+1]] += freq
  return pairs

def merge_vocab(pair, v_in):
  v_out = {}
  bigram = re.escape(' '.join(pair))
  p = re.compile(r'(?<!\S)' + bigram + r'(?!\S)')
  for word in v_in:
  	w_out = p.sub(''.join(pair), word)
  	v_out[w_out] = v_in[word]
  return v_out

vocab = {'l o w </w>' : 5, 'l o w e r </w>' : 2,
'n e w e s t </w>':6, 'w i d e s t </w>':3}
num_merges = 10

for i in range(num_merges):
  pairs = get_stats(vocab)
  best = max(pairs, key=pairs.get)
  vocab = merge_vocab(best, vocab)
print(best)


('w', 'i')


In [3]:
# to display a pair of subtokens to be merged in a slider
def get_pairs(pair:int):
    """
    pair: index of the pair. 
    """
    if pair>0:
        left, right = lines[pair].strip('\n').split(' ')
        print(f'{left} , {right}')
        
# to display token ids  in a slider
def display_token_id(id):
    token,id = vocab_sorted[id]
    print(f'id:{id} \t token:{token}')

# 1.

In [4]:
from datasets import load_dataset


In [5]:
subset = load_dataset('bookcorpus',split='all')
pprint(subset)

Found cached dataset bookcorpus (/home/sachin/.cache/huggingface/datasets/bookcorpus/plain_text/1.0.0/eddee3cae1cc263a431aa98207d4d27fd8a73b0a9742f692af0e6c65afa4d75f)


Dataset({
    features: ['text'],
    num_rows: 74004228
})


In [6]:
subset = subset.select(range(0, len(subset), 7))

In [7]:
subset

Dataset({
    features: ['text'],
    num_rows: 10572033
})

In [8]:
subset[:6]

{'text': ['usually , he would be tearing around the living room , playing with his toys .',
  'mason barely acknowledged her .',
  'mason was already registering off the charts in height and weight according to his pediatrician .',
  'she never wanted anything in the world to hurt him , and she knew that being rejected by his father would .',
  "aidan was her mother 's baby brother and only son of the family .",
  "while it had been no question that she wanted him as godfather for mason , she had been extremely honored when he and his wife , emma , had asked her to be their son , noah 's , godmother ."]}

In [9]:
from tokenizers.normalizers import Lowercase
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.models import BPE
from tokenizers import Tokenizer

In [10]:
model = BPE(unk_token = '[UNK]')
tokenizer = Tokenizer(model)

In [11]:
tokenizer.normalizer = Lowercase()
tokenizer.pre_tokenizer = Whitespace()

In [12]:
from tokenizers.trainers import BpeTrainer
def trainer_with_vocab_size(vocab_size=10000):
  trainer = BpeTrainer(vocab_size=vocab_size, special_tokens=['GO', 'UNK', 'PAD', 'EOS'])

In [13]:
def get_examples(batch_size=1000):
  for i in range(0, len(subset), batch_size):
    yield subset[i: i+batch_size]['text']

In [14]:
# Get the tokenizer with the vocab_size 
trainer = trainer_with_vocab_size(vocab_size=5000)
tokenizer.train_from_iterator(get_examples(batch_size=10000), trainer=trainer, length=len(subset))






In [15]:
tokenizer.save('hopper5k.json')

In [16]:
input_text = "SEBI study finds 93% of individual F&O traders made losses between FY22 and FY24."
output = tokenizer.encode(input_text)
print("Token count:", output)

Token count: Encoding(num_tokens=22, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])


In [17]:
trainer10 = trainer_with_vocab_size(vocab_size=10000)
tokenizer.train_from_iterator(get_examples(batch_size=10000), trainer=trainer10, length=len(subset))
tokenizer.save('hopper10k.json')






In [23]:
trainer15 = trainer_with_vocab_size(vocab_size=15000)
tokenizer.train_from_iterator(get_examples(batch_size=10000), trainer=trainer15, length=len(subset))
tokenizer.save('hopper15k.json')






In [24]:
trainer32 = trainer_with_vocab_size(vocab_size=32000)
tokenizer.train_from_iterator(get_examples(batch_size=10000), trainer=trainer32, length=len(subset))
tokenizer.save('hopper32k.json')






In [19]:
trained_tokenizer = Tokenizer(BPE())
trained_tokenizer = trained_tokenizer.from_file('hopper5k.json')
tokens = trained_tokenizer.encode(input_text)
print(tokens)

Encoding(num_tokens=22, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])


In [20]:
trained_tokenizer = Tokenizer(BPE())
trained_tokenizer = trained_tokenizer.from_file('hopper10k.json')
tokens = trained_tokenizer.encode(input_text)
print(tokens)

Encoding(num_tokens=22, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])


In [21]:
trained_tokenizer = Tokenizer(BPE())
trained_tokenizer = trained_tokenizer.from_file('hopper.json')
tokens = trained_tokenizer.encode(input_text)
print(tokens)

Encoding(num_tokens=25, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])


In [29]:
tokenizer.model.from_file('./model/hopper10-vocab.json', 'model/hopper10-merges.txt')

<tokenizers.models.BPE at 0x7df0cb11b070>

In [30]:
input_text = "SEBI study finds 93% of individual F&O traders made losses between FY22 and FY24."
output = tokenizer.encode(input_text)
print("Token count:", output)

Token count: Encoding(num_tokens=22, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])


In [35]:
trainer10 = trainer_with_vocab_size(vocab_size=15000)
tokenizer.train_from_iterator(get_examples(batch_size=10000), trainer=trainer10, length=len(subset))
tokenizer.model.save('model', prefix='hopper15')
tokenizer = tokenizer.model.from_file('./model/hopper15-vocab.json', 'model/hopper15-merges.txt')
input_text = "SEBI study finds 93% of individual F&O traders made losses between FY22 and FY24."
output = tokenizer.encode(input_text)
print("Token count:", output)






AttributeError: 'tokenizers.models.BPE' object has no attribute 'encode'

In [None]:
trainer10 = trainer_with_vocab_size(vocab_size=32000)
tokenizer.train_from_iterator(get_examples(batch_size=10000), trainer=trainer10, length=len(subset))
tokenizer.model.save('model', prefix='hopper32')
tokenizer = tokenizer.model.from_file('./model/hopper32-vocab.json', 'model/hopper32-merges.txt')
input_text = "SEBI study finds 93% of individual F&O traders made losses between FY22 and FY24."
output = tokenizer.encode(input_text)
print("Token count:", output)




Token count: Encoding(num_tokens=22, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])


In [None]:
trainer = trainers.BpeTrainer(
    vocab_size=5000,  # Change to 10000, 15000, 32000 as needed
    special_tokens=["[GO]", "[UNK]", "[PAD]", "[EOS]"]
)

# Suppose `dataset` is a list of 10,572,033 strings
# dataset = load_bookcorpus_every_7th_sample()
tokenizer.train_from_iterator(subset, trainer)

# Save and reload for reuse
tokenizer.save("custom_bpe_5000.json")

In [14]:
from tokenizers import Tokenizer

hopper_tokenizer = Tokenizer.from_file("hopper.json")
tokens = hopper_tokenizer.encode(input_text)
print("Tokens (hopper):", len(tokens.tokens))

Exception: No such file or directory (os error 2)

In [15]:
hopper_tokenizer.model.add_tokens(["FY"])
tokens_after = hopper_tokenizer.encode(input_text)
print("Tokens after adding FY:", len(tokens_after.tokens))

NameError: name 'hopper_tokenizer' is not defined

In [16]:
from transformers import AutoTokenizer

bert_tok = AutoTokenizer.from_pretrained("bert-base-uncased")
gpt2_tok = AutoTokenizer.from_pretrained("gpt2")

print("BERT special tokens:", bert_tok.special_tokens_map)
print("GPT2 special tokens:", gpt2_tok.special_tokens_map)

The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


Moving 0 files to the new cache system


0it [00:00, ?it/s]

Downloading (…)enizer_config.json";:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

Downloading (…)"config.json";:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading (…)"vocab.txt";:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)"tokenizer.json";:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)enizer_config.json";:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

Downloading (…)"config.json";:   0%|          | 0.00/665 [00:00<?, ?B/s]

Downloading (…)"vocab.json";:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading (…)"merges.txt";:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)"tokenizer.json";:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

BERT special tokens: {'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}
GPT2 special tokens: {'bos_token': '<|endoftext|>', 'eos_token': '<|endoftext|>', 'unk_token': '<|endoftext|>'}


In [17]:
from datasets import load_dataset

imdb = load_dataset("imdb")
all_texts = imdb["train"]["text"] + imdb["test"]["text"]

# Define all tokenizers
tokenizers_list = {
    "1": tokenizer,  # Custom 32K tokenizer
    "2": AutoTokenizer.from_pretrained("bert-base-uncased"),
    "3": AutoTokenizer.from_pretrained("gpt2"),
    "4": Tokenizer.from_file("hopper.json")
}

# Count tokens
token_counts = {}
for k, tok in tokenizers_list.items():
    total = 0
    for text in all_texts:
        if isinstance(tok, Tokenizer):
            total += len(tok.encode(text).tokens)
        else:
            total += len(tok.encode(text).input_ids)
    token_counts[k] = total

print("Token counts:", sorted(token_counts.items(), key=lambda x: x[1]))

Downloading readme:   0%|          | 0.00/7.81k [00:00<?, ?B/s]

Downloading and preparing dataset None/plain_text to /home/sachin/.cache/huggingface/datasets/parquet/plain_text-c403a23b02a09219/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7...


Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

ExpectedMoreSplits: {'unsupervised'}

In [18]:
batch = ["This is a short sentence.", "This is a much longer sentence with more tokens than the previous one."] * 4
tok = AutoTokenizer.from_pretrained("bert-base-uncased", padding=True, truncation=True, max_length=128)
output = tok(batch, padding=True, return_tensors="pt")
print("Shape:", output['input_ids'].shape)

Shape: torch.Size([8, 17])
