In [1]:
#loading the corpus
!pip install datasets
from datasets import load_dataset

Collecting datasets
  Downloading datasets-2.21.0-py3-none-any.whl.metadata (21 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading datasets-2.21.0-py3-none-any.whl (527 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m527.3/527.3 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m7.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl (39.9 MB)
[2K

In [2]:
# Load the training split of the raw WikiText-2 configuration.
dataset = load_dataset(
    "wikitext",
    name="wikitext-2-raw-v1",
    split="train",
)
def get_training_corpus(batch_size=1000):
  """Yield the training texts in consecutive batches for tokenizer training.

  Args:
    batch_size: Number of dataset rows per yielded batch. Defaults to 1000,
      the value that used to be hard-coded. The last batch may be shorter.

  Yields:
    The value stored under the "text" key of each successive
    ``batch_size``-row slice of the module-level ``dataset``.

  Raises:
    ValueError: If ``batch_size`` is not positive (raised when iteration
      starts, since this is a generator).
  """
  # A negative step would silently yield nothing, so reject it explicitly.
  if batch_size <= 0:
    raise ValueError(f"batch_size must be positive, got {batch_size}")
  for start in range(0, len(dataset), batch_size):
    yield dataset[start:start + batch_size]["text"]

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading readme:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/733k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/6.36M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/657k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/4358 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/36718 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3760 [00:00<?, ? examples/s]

In [4]:
#building a WordPiece tokenizer from scratch
#instantiating a tokenizer object with a model
from tokenizers import(
    decoders,
    models,
    normalizers,
    pre_tokenizers,
    processors,
    trainers,
    Tokenizer
)
# The WordPiece model is told which token to use as the unknown token.
wordpiece_model = models.WordPiece(unk_token="[UNK]")
tokenizer = Tokenizer(wordpiece_model)

In [5]:
# Normalization
# A BERT-style normalizer assembled by hand from three steps applied in order:
# NFD, Lowercase, StripAccents.
bert_like_steps = [
    normalizers.NFD(),
    normalizers.Lowercase(),
    normalizers.StripAccents(),
]
tokenizer.normalizer = normalizers.Sequence(bert_like_steps)

In [6]:
# Run the normalizer on a sample with capitals and accented characters.
accented_sample = "Héllò hôw are ü?"
print(tokenizer.normalizer.normalize_str(accented_sample))

hello how are u?


In [7]:
# Pre-tokenization step.
# The ready-made BERT pre-tokenizer would be set like this:
#   tokenizer.pre_tokenizer = pre_tokenizers.BertPreTokenizer()
# Here the pre-tokenizer is chosen by hand instead, using Whitespace.
whitespace_splitter = pre_tokenizers.Whitespace()
tokenizer.pre_tokenizer = whitespace_splitter

In [11]:
# Show the pieces (with character offsets) that the configured pre-tokenizer
# produces for a sample sentence.
pre_tokenized_sample = tokenizer.pre_tokenizer.pre_tokenize_str(
    "Let's test my pre-tokenizer."
)
print(pre_tokenized_sample)

[('Let', (0, 3)), ("'", (3, 4)), ('s', (4, 5)), ('test', (6, 10)), ('my', (11, 13)), ('pre', (14, 17)), ('-', (17, 18)), ('tokenizer', (18, 27)), ('.', (27, 28))]


In [12]:
# Several pre-tokenizers can be composed with Sequence: here a whitespace split
# followed by a punctuation split.
chained_steps = [
    pre_tokenizers.WhitespaceSplit(),
    pre_tokenizers.Punctuation(),
]
pre_tokenizer = pre_tokenizers.Sequence(chained_steps)
# Bare last expression so the notebook displays the result.
pre_tokenizer.pre_tokenize_str("Let's test my pre-tokenizer.")

[('Let', (0, 3)),
 ("'", (3, 4)),
 ('s', (4, 5)),
 ('test', (6, 10)),
 ('my', (11, 13)),
 ('pre', (14, 17)),
 ('-', (17, 18)),
 ('tokenizer', (18, 27)),
 ('.', (27, 28))]

In [13]:
# Set up the WordPiece trainer that will build the vocabulary.
# The special tokens are passed to the trainer explicitly.
special_tokens = ["[UNK]", "[PAD]", "[CLS]", "[SEP]", "[MASK]"]
trainer = trainers.WordPieceTrainer(
    special_tokens=special_tokens,
    vocab_size=25000,
)

In [14]:
# Train the tokenizer on the batched corpus, then try it on a sample sentence.
corpus_batches = get_training_corpus()
tokenizer.train_from_iterator(corpus_batches, trainer=trainer)

encoding = tokenizer.encode("Let's test my pre-tokenizer.")
print(encoding.tokens)

['let', "'", 's', 'test', 'my', 'pre', '-', 'tok', '##eni', '##zer', '.']


In [15]:
# Post-processing
# [CLS] is to be added at the beginning and [SEP] at the end (after each
# sentence); look up the vocabulary ids of both tokens.
cls_token_id, sep_token_id = (
    tokenizer.token_to_id(token) for token in ("[CLS]", "[SEP]")
)
print(cls_token_id, sep_token_id)

2 3


In [16]:
# Template processing: how to treat a single sentence ($A) and a pair of
# sentences ($A, $B). The number after each colon is the token type id assigned
# to that piece.
# The templates contain no placeholders, so they are plain string literals
# rather than f-strings.
tokenizer.post_processor = processors.TemplateProcessing(
    single="[CLS]:0 $A:0 [SEP]:0",
    pair="[CLS]:0 $A:0 [SEP]:0 $B:1 [SEP]:1",
    special_tokens=[("[CLS]", cls_token_id), ("[SEP]", sep_token_id)],
)

In [17]:
# Encode the sample sentence again, now with the post-processor in place.
encoding = tokenizer.encode(
    "Let's test my pre-tokenizer."
)
print(encoding.tokens)

['[CLS]', 'let', "'", 's', 'test', 'my', 'pre', '-', 'tok', '##eni', '##zer', '.', '[SEP]']


In [19]:
# Encode a pair of sentences and show both the tokens and their type ids.
first_sentence = "Let's test this tokenizer..."
second_sentence = "on a pair of sentences."
encoding = tokenizer.encode(first_sentence, second_sentence)
print(encoding.tokens)
print(encoding.type_ids)

['[CLS]', 'let', "'", 's', 'test', 'this', 'tok', '##eni', '##zer', '...', '[SEP]', 'on', 'a', 'pair', 'of', 'sentences', '.', '[SEP]']
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1]


In [20]:
# Attach a WordPiece decoder that uses "##" as its prefix.
wordpiece_decoder = decoders.WordPiece(prefix="##")
tokenizer.decoder = wordpiece_decoder
# Bare last expression so the notebook displays the decoded text.
tokenizer.decode(encoding.ids)

"let ' s test this tokenizer... on a pair of sentences."

In [21]:
# Write the tokenizer out to a single JSON file.
tokenizer_path = "tokenizer.json"
tokenizer.save(tokenizer_path)

In [22]:
# Load the tokenizer back from the JSON file written earlier.
new_tokenizer = Tokenizer.from_file(
    "tokenizer.json"
)

In [23]:
# to use this tokenizer we need to wrap this in a PreTrainedTokenizerFast

In [26]:
from transformers import PreTrainedTokenizerFast

# Wrap the trained tokenizer object in a PreTrainedTokenizerFast, naming every
# special token explicitly.
# Alternative to tokenizer_object: tokenizer_file="tokenizer.json"
special_token_kwargs = {
    "unk_token": "[UNK]",
    "pad_token": "[PAD]",
    "cls_token": "[CLS]",
    "sep_token": "[SEP]",
    "mask_token": "[MASK]",
}
wrapped_tokenizer = PreTrainedTokenizerFast(
    tokenizer_object=tokenizer,
    **special_token_kwargs,
)

#if we are using a specific tokenizer class, we only need to specify the special tokens that are different from the default ones