# Tutorial

## Dataset

In [2]:
from datasets import load_dataset

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
import pandas as pd
from pprint import pprint

In [4]:
ds = load_dataset('bookcorpus', split='all')
pprint(ds)

Dataset({
    features: ['text'],
    num_rows: 74004228
})


## Print some samples

In [5]:
num_samples = 6
for idx, sample in enumerate(ds[0:num_samples]['text']):
  print(f'{idx} : {sample}')

0 : usually , he would be tearing around the living room , playing with his toys .
1 : but just one look at a minion sent him practically catatonic .
2 : that had been megan 's plan when she got him dressed earlier .
3 : he 'd seen the movie almost by mistake , considering he was a little young for the pg cartoon , but with older cousins , along with her brothers , mason was often exposed to things that were older .
4 : she liked to think being surrounded by adults and older kids was one reason why he was a such a good talker for his age .
5 : `` are n't you being a good boy ? ''


## Tokenization

In [6]:
from tokenizers.normalizers import Lowercase
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.models import BPE
from tokenizers import Tokenizer

In [7]:
model = BPE(unk_token="[UNK]")
tokenizer = Tokenizer(model)

In [8]:
tokenizer.normalizer = Lowercase()
tokenizer.pre_tokenizer = Whitespace()

In [9]:
from tokenizers.trainers import BpeTrainer
trainer = BpeTrainer(vocab_size=32000, special_tokens=['[PAD]', '[UNK]'], continuing_subword_prefix='##')

In [10]:
from multiprocessing import cpu_count
print(cpu_count())

8


## Now the pipeline is ready

## Batch processing

In [11]:
def get_examples(batch_size=1000):
  for i in range(0, len(ds), batch_size):
    yield ds[i : i + batch_size]['text']

In [12]:
tokenizer.train_from_iterator(get_examples(batch_size=10000), trainer=trainer, length=len(ds))

## Saving the model

In [13]:
tokenizer.model.save('model', prefix='hopper')

['model\\hopper-vocab.json', 'model\\hopper-merges.txt']

## Display last n merges

In [14]:
with open('model/hopper-merges.txt', 'r') as file:
  row = 0
  num_lines = 10
  for line in reversed(file.readlines()):
    print(line)
    row+=1
    if row >= num_lines:
      break

mel ##anthe

black ##er

ad ##ject

v ##ang

betroth ##al

tiptoe ##ing

restroom ##s

consol ##ing

esp ##ionage

influ ##x



## View the number of merges

In [15]:
with open('model/hopper-merges.txt', 'r') as file:
  lines = file.readlines()

In [16]:
print(f'Number of merges: {len(lines)}')

Number of merges: 31871


In [17]:
print(f"vocab size: {tokenizer.get_vocab_size()}")

vocab size: 32000


## Get the vocab

In [18]:
vocab = tokenizer.get_vocab()

In [19]:
vocab_sorted = sorted(vocab.items(), key=lambda item: item[1])

## Encoding

In [20]:
sample = ds[0]['text']
print(f'sample: {sample}')
encoding = tokenizer.encode(sample)
print(encoding)

sample: usually , he would be tearing around the living room , playing with his toys .
Encoding(num_tokens=16, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])


In [21]:
token_ids = encoding.ids
tokens = encoding.tokens
type_ids = encoding.type_ids
attention_mask = encoding.attention_mask

In [22]:
from IPython.display import display, HTML
display(HTML("<b>Hello, notebook!</b>"))

In [23]:
from IPython.display import display, HTML

In [24]:
from tokenizers.tools import EncodingVisualizer
visualizer = EncodingVisualizer(tokenizer=tokenizer, default_to_notebook=False)
html = visualizer(text=sample)
display(HTML(html))

## Let's try to understand the attributes

In [25]:
out_dict = {'tokens': tokens, 'ids':token_ids, 'type_ids':type_ids, 'attention_mask':attention_mask}
df = pd.DataFrame(out_dict)
df

Unnamed: 0,tokens,ids,type_ids,attention_mask
0,usually,2462,0,1
1,",",19,0,1
2,he,149,0,1
3,would,277,0,1
4,be,162,0,1
5,tearing,6456,0,1
6,around,422,0,1
7,the,131,0,1
8,living,1559,0,1
9,room,536,0,1


## Batch Encoding

In [26]:
samples = ds[0:4]['text']

In [27]:
batch_encoding = tokenizer.encode_batch(samples)
pprint(batch_encoding)

[Encoding(num_tokens=16, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing]),
 Encoding(num_tokens=14, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing]),
 Encoding(num_tokens=14, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing]),
 Encoding(num_tokens=42, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])]
Encoding(num_tokens=16, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing]),
 Encoding(num_tokens=14, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing]),
 Encoding(num_tokens=14, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing]),
 Encoding(num_tokens=42, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])]


### Padding

In [28]:
# all default args
tokenizer.enable_padding(direction='right',
                         pad_id=0,
                         pad_type_id=0,
                         pad_token = '[PAD]',
                         length=None, # None default to max_len in the batch
                         pad_to_multiple_of = None)

tokenizer.enable_truncation(max_length=512)

### Quick test

In [29]:
text = 'All this is so simple to do in HF %$.'
encoded = tokenizer.encode(text).tokens
print(encoded)

['all', 'this', 'is', 'so', 'simple', 'to', 'do', 'in', 'h', '##f', '%', '[UNK]', '##.']


### Applying encoding with padding

In [30]:
batch_encoding = tokenizer.encode_batch(samples)
pprint(batch_encoding)

[Encoding(num_tokens=42, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing]),
 Encoding(num_tokens=42, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing]),
 Encoding(num_tokens=42, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing]),
 Encoding(num_tokens=42, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])]


## Save the model with every params

In [31]:
# tokenizer.save('hopper.json')
from transformers import PreTrainedTokenizerFast
awesome_tokenizer = PreTrainedTokenizerFast(tokenizer_object=tokenizer)
awesome_tokenizer.save_pretrained("awesome_tokenizer")

None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.


('awesome_tokenizer\\tokenizer_config.json',
 'awesome_tokenizer\\special_tokens_map.json',
 'awesome_tokenizer\\tokenizer.json')

In [41]:
import json
with open('hopper.json', 'r') as file:
  json_data = json.load(file)

In [42]:
pprint(json_data, depth=1)

{'added_tokens': [...],
 'decoder': None,
 'model': {...},
 'normalizer': {...},
 'padding': {...},
 'post_processor': None,
 'pre_tokenizer': {...},
 'truncation': {...},
 'version': '1.0'}


In [43]:
## Loading back

In [44]:
trained_tokenizer = Tokenizer(BPE())

In [None]:
trained_tokenizer = trained_tokenizer.from_file('hopper.json')

In [46]:
tokens = trained_tokenizer.encode(text).tokens
print(tokens)

['all', 'this', 'is', 'so', 'simple', 'to', 'do', 'in', 'h', '##f', '%', '[UNK]', '##.']


## BERT-like Tokenizer

In [47]:
bert_tokenizer = Tokenizer(BPE(unk_token='[UNK]'))
bert_tokenizer.normalizer = Lowercase()
bert_tokenizer.pre_tokenizer = Whitespace()
bert_trainer = BpeTrainer(vocab_size=32000,
                          special_tokens=['[PAD]', '[UNK]', '[CLS]', '[SEP]', '[MASK]'],
                          continuing_subword_prefix='##')

In [48]:
from tokenizers.processors import TemplateProcessing

In [49]:
bert_tokenizer.post_processor = TemplateProcessing(single='[CLS] $0 [SEP]',
                                                   pair="[CLS] $A [SEP] $B:1",
                                                   special_tokens=[('[CLS]', 2), ('[SEP]', 3)],
                                                   )

## Train the tokenizer

In [50]:
bert_tokenizer.train_from_iterator(get_examples(batch_size=10000), trainer=bert_trainer, length=len(ds))

## Save the model

In [51]:
bert_tokenizer.model.save('model', prefix='bert')

['model\\bert-vocab.json', 'model\\bert-merges.txt']

## Test the BERT-like tokenizer

In [52]:
text = "All these are so simple to do in HF. Let's do more"
encoded = bert_tokenizer.encode(text)
tokens = encoded.tokens
ids = encoded.ids
out_dict = {'tokens': tokens, 'ids': ids}
pprint(out_dict, depth=2, compact=True)

{'ids': [2, 270, 956, 336, 231, 2534, 141, 206, 157, 56, 98, 24, 462, 17, 67,
         206, 387, 3],
 'tokens': ['[CLS]', 'all', 'these', 'are', 'so', 'simple', 'to', 'do', 'in',
            'h', '##f', '.', 'let', "'", 's', 'do', 'more', '[SEP]']}


### For a pair of sentences

In [53]:
text = "All these are so simple to do in HF. Let's do more"
pair = "We have a long way to go!"
encoded = bert_tokenizer.encode(text, pair)
tokens = encoded.tokens
ids = encoded.ids
out_dict = {'tokens': tokens, 'ids': ids}
pprint(out_dict, depth=2, compact=True)

{'ids': [2, 270, 956, 336, 231, 2534, 141, 206, 157, 56, 98, 24, 462, 17, 67,
         206, 387, 3, 214, 250, 49, 490, 415, 141, 260, 12],
 'tokens': ['[CLS]', 'all', 'these', 'are', 'so', 'simple', 'to', 'do', 'in',
            'h', '##f', '.', 'let', "'", 's', 'do', 'more', '[SEP]', 'we',
            'have', 'a', 'long', 'way', 'to', 'go', '!']}


## Decoding

In [54]:
plain_tokens = bert_tokenizer.decode(ids)
plain_tokens

"all these are so simple to do in h ##f . let ' s do more we have a long way to go !"

## Appropriate Decoder

In [55]:
from tokenizers.decoders import WordPiece

In [56]:
bert_tokenizer.decoder = WordPiece(prefix='##')

In [57]:
plain_tokens =  bert_tokenizer.decode(ids)
plain_tokens

"all these are so simple to do in hf. let ' s do more we have a long way to go!"

## Pre Trained Tokenizer

In [59]:
from transformers import PreTrainedTokenizerFast

None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.


In [60]:
pt_tokenizer = PreTrainedTokenizerFast(tokenizer_file='hopper.json',
                                       unk_token='[UNK]',
                                       pad_token='[PAD]',
                                       model_input_names=['input_ids', 'token_type_ids', 'attention_mask'],
                                       )

# Model Inputs

In [61]:
model_inputs = pt_tokenizer(text)
pprint(model_inputs, compact=True)

{'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
 'input_ids': [267, 953, 333, 228, 2531, 138, 203, 154, 53, 95, 21, 459, 14, 64,
               203, 384],
 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]}


In [62]:
model_inputs = pt_tokenizer(text, text_pair=pair)
pprint(model_inputs, compact=True)

{'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
                    1, 1, 1, 1],
 'input_ids': [267, 953, 333, 228, 2531, 138, 203, 154, 53, 95, 21, 459, 14, 64,
               203, 384, 211, 247, 46, 487, 412, 138, 257, 9],
 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
                    1, 1, 1, 1]}


In [63]:
batch_text = ['I like the book The Psychology of Money', 'I enjoyed watching the Transformers movie', 'oh! thanks for this']

In [64]:
model_inputs = pt_tokenizer(batch_text)
pprint(model_inputs, compact=True)

{'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1],
                    [1, 1, 1, 1, 1]],
 'input_ids': [[54, 281, 131, 1701, 131, 19478, 153, 1564],
               [54, 4096, 1443, 131, 7744, 307, 3760],
               [772, 9, 1767, 200, 254]],
 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0],
                    [0, 0, 0, 0, 0]]}


In [65]:
# With padding
model_inputs = pt_tokenizer(batch_text, padding=True)
pprint(model_inputs, compact=True)

{'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 0],
                    [1, 1, 1, 1, 1, 0, 0, 0]],
 'input_ids': [[54, 281, 131, 1701, 131, 19478, 153, 1564],
               [54, 4096, 1443, 131, 7744, 307, 3760, 0],
               [772, 9, 1767, 200, 254, 0, 0, 0]],
 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0],
                    [0, 0, 0, 0, 0, 0, 0, 0]]}


# Assignment

In [1]:
from ipywidgets import Layout, interact, interactive, fixed, interact_manual, widgets
from IPython.display import display

In [1]:
import re, collections

def get_stats(vocab):
  pairs = collections.defaultdict(int)
  for word, freq in vocab.items():
    symbols = word.split()
    for i in range(len(symbols)-1):
      pairs[symbols[i],symbols[i+1]] += freq
  return pairs

def merge_vocab(pair, v_in):
  v_out = {}
  bigram = re.escape(' '.join(pair))
  p = re.compile(r'(?<!\S)' + bigram + r'(?!\S)')
  for word in v_in:
  	w_out = p.sub(''.join(pair), word)
  	v_out[w_out] = v_in[word]
  return v_out

vocab = {'l o w </w>' : 5, 'l o w e r </w>' : 2,
'n e w e s t </w>':6, 'w i d e s t </w>':3}
num_merges = 10

for i in range(num_merges):
  pairs = get_stats(vocab)
  best = max(pairs, key=pairs.get)
  vocab = merge_vocab(best, vocab)
print(best)


('w', 'i')


In [3]:
# to display a pair of subtokens to be merged in a slider
def get_pairs(pair:int):
    """
    pair: index of the pair. 
    """
    if pair>0:
        left, right = lines[pair].strip('\n').split(' ')
        print(f'{left} , {right}')
        
# to display token ids  in a slider
def display_token_id(id):
    token,id = vocab_sorted[id]
    print(f'id:{id} \t token:{token}')

# 1.

In [4]:
from datasets import load_dataset


In [5]:
subset = load_dataset('bookcorpus',split='all')
pprint(subset)

Found cached dataset bookcorpus (/home/sachin/.cache/huggingface/datasets/bookcorpus/plain_text/1.0.0/eddee3cae1cc263a431aa98207d4d27fd8a73b0a9742f692af0e6c65afa4d75f)


Dataset({
    features: ['text'],
    num_rows: 74004228
})


In [6]:
subset = subset.select(range(0, len(subset), 7))

In [7]:
subset

Dataset({
    features: ['text'],
    num_rows: 10572033
})

In [8]:
subset[:6]

{'text': ['usually , he would be tearing around the living room , playing with his toys .',
  'mason barely acknowledged her .',
  'mason was already registering off the charts in height and weight according to his pediatrician .',
  'she never wanted anything in the world to hurt him , and she knew that being rejected by his father would .',
  "aidan was her mother 's baby brother and only son of the family .",
  "while it had been no question that she wanted him as godfather for mason , she had been extremely honored when he and his wife , emma , had asked her to be their son , noah 's , godmother ."]}

In [9]:
from tokenizers.normalizers import Lowercase
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.models import BPE
from tokenizers import Tokenizer

In [10]:
model = BPE(unk_token = '[UNK]')
tokenizer = Tokenizer(model)

In [11]:
tokenizer.normalizer = Lowercase()
tokenizer.pre_tokenizer = Whitespace()

In [12]:
from tokenizers.trainers import BpeTrainer
def trainer_with_vocab_size(vocab_size=10000):
  trainer = BpeTrainer(vocab_size=vocab_size, special_tokens=['GO', 'UNK', 'PAD', 'EOS'])

In [13]:
def get_examples(batch_size=1000):
  for i in range(0, len(subset), batch_size):
    yield subset[i: i+batch_size]['text']

In [14]:
# Get the tokenizer with the vocab_size 
trainer = trainer_with_vocab_size(vocab_size=5000)
tokenizer.train_from_iterator(get_examples(batch_size=10000), trainer=trainer, length=len(subset))






In [15]:
tokenizer.save('hopper5k.json')

In [16]:
input_text = "SEBI study finds 93% of individual F&O traders made losses between FY22 and FY24."
output = tokenizer.encode(input_text)
print("Token count:", output)

Token count: Encoding(num_tokens=22, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])


In [17]:
trainer10 = trainer_with_vocab_size(vocab_size=10000)
tokenizer.train_from_iterator(get_examples(batch_size=10000), trainer=trainer10, length=len(subset))
tokenizer.save('hopper10k.json')






In [23]:
trainer15 = trainer_with_vocab_size(vocab_size=15000)
tokenizer.train_from_iterator(get_examples(batch_size=10000), trainer=trainer15, length=len(subset))
tokenizer.save('hopper15k.json')






In [24]:
trainer32 = trainer_with_vocab_size(vocab_size=32000)
tokenizer.train_from_iterator(get_examples(batch_size=10000), trainer=trainer32, length=len(subset))
tokenizer.save('hopper32k.json')






In [19]:
trained_tokenizer = Tokenizer(BPE())
trained_tokenizer = trained_tokenizer.from_file('hopper5k.json')
tokens = trained_tokenizer.encode(input_text)
print(tokens)

Encoding(num_tokens=22, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])


In [20]:
trained_tokenizer = Tokenizer(BPE())
trained_tokenizer = trained_tokenizer.from_file('hopper10k.json')
tokens = trained_tokenizer.encode(input_text)
print(tokens)

Encoding(num_tokens=22, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])


In [21]:
trained_tokenizer = Tokenizer(BPE())
trained_tokenizer = trained_tokenizer.from_file('hopper.json')
tokens = trained_tokenizer.encode(input_text)
print(tokens)

Encoding(num_tokens=25, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])


In [29]:
tokenizer.model.from_file('./model/hopper10-vocab.json', 'model/hopper10-merges.txt')

<tokenizers.models.BPE at 0x7df0cb11b070>

In [30]:
input_text = "SEBI study finds 93% of individual F&O traders made losses between FY22 and FY24."
output = tokenizer.encode(input_text)
print("Token count:", output)

Token count: Encoding(num_tokens=22, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])


In [35]:
trainer10 = trainer_with_vocab_size(vocab_size=15000)
tokenizer.train_from_iterator(get_examples(batch_size=10000), trainer=trainer10, length=len(subset))
tokenizer.model.save('model', prefix='hopper15')
tokenizer = tokenizer.model.from_file('./model/hopper15-vocab.json', 'model/hopper15-merges.txt')
input_text = "SEBI study finds 93% of individual F&O traders made losses between FY22 and FY24."
output = tokenizer.encode(input_text)
print("Token count:", output)






AttributeError: 'tokenizers.models.BPE' object has no attribute 'encode'

In [None]:
trainer10 = trainer_with_vocab_size(vocab_size=32000)
tokenizer.train_from_iterator(get_examples(batch_size=10000), trainer=trainer10, length=len(subset))
tokenizer.model.save('model', prefix='hopper32')
tokenizer = tokenizer.model.from_file('./model/hopper32-vocab.json', 'model/hopper32-merges.txt')
input_text = "SEBI study finds 93% of individual F&O traders made losses between FY22 and FY24."
output = tokenizer.encode(input_text)
print("Token count:", output)




Token count: Encoding(num_tokens=22, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])


In [None]:
trainer = trainers.BpeTrainer(
    vocab_size=5000,  # Change to 10000, 15000, 32000 as needed
    special_tokens=["[GO]", "[UNK]", "[PAD]", "[EOS]"]
)

# Suppose `dataset` is a list of 10,572,033 strings
# dataset = load_bookcorpus_every_7th_sample()
tokenizer.train_from_iterator(subset, trainer)

# Save and reload for reuse
tokenizer.save("custom_bpe_5000.json")

In [14]:
from tokenizers import Tokenizer

hopper_tokenizer = Tokenizer.from_file("hopper.json")
tokens = hopper_tokenizer.encode(input_text)
print("Tokens (hopper):", len(tokens.tokens))

Exception: No such file or directory (os error 2)

In [15]:
hopper_tokenizer.model.add_tokens(["FY"])
tokens_after = hopper_tokenizer.encode(input_text)
print("Tokens after adding FY:", len(tokens_after.tokens))

NameError: name 'hopper_tokenizer' is not defined

In [16]:
from transformers import AutoTokenizer

bert_tok = AutoTokenizer.from_pretrained("bert-base-uncased")
gpt2_tok = AutoTokenizer.from_pretrained("gpt2")

print("BERT special tokens:", bert_tok.special_tokens_map)
print("GPT2 special tokens:", gpt2_tok.special_tokens_map)

The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


Moving 0 files to the new cache system


0it [00:00, ?it/s]

Downloading (…)enizer_config.json";:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

Downloading (…)"config.json";:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading (…)"vocab.txt";:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)"tokenizer.json";:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)enizer_config.json";:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

Downloading (…)"config.json";:   0%|          | 0.00/665 [00:00<?, ?B/s]

Downloading (…)"vocab.json";:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading (…)"merges.txt";:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)"tokenizer.json";:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

BERT special tokens: {'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}
GPT2 special tokens: {'bos_token': '<|endoftext|>', 'eos_token': '<|endoftext|>', 'unk_token': '<|endoftext|>'}


In [17]:
from datasets import load_dataset

imdb = load_dataset("imdb")
all_texts = imdb["train"]["text"] + imdb["test"]["text"]

# Define all tokenizers
tokenizers_list = {
    "1": tokenizer,  # Custom 32K tokenizer
    "2": AutoTokenizer.from_pretrained("bert-base-uncased"),
    "3": AutoTokenizer.from_pretrained("gpt2"),
    "4": Tokenizer.from_file("hopper.json")
}

# Count tokens
token_counts = {}
for k, tok in tokenizers_list.items():
    total = 0
    for text in all_texts:
        if isinstance(tok, Tokenizer):
            total += len(tok.encode(text).tokens)
        else:
            total += len(tok.encode(text).input_ids)
    token_counts[k] = total

print("Token counts:", sorted(token_counts.items(), key=lambda x: x[1]))

Downloading readme:   0%|          | 0.00/7.81k [00:00<?, ?B/s]

Downloading and preparing dataset None/plain_text to /home/sachin/.cache/huggingface/datasets/parquet/plain_text-c403a23b02a09219/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7...


Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

ExpectedMoreSplits: {'unsupervised'}

In [18]:
batch = ["This is a short sentence.", "This is a much longer sentence with more tokens than the previous one."] * 4
tok = AutoTokenizer.from_pretrained("bert-base-uncased", padding=True, truncation=True, max_length=128)
output = tok(batch, padding=True, return_tensors="pt")
print("Shape:", output['input_ids'].shape)

Shape: torch.Size([8, 17])
