In [1]:
import os
from typing import *
import datasets
from datasets import *
import project_paths as pp
from tokenizers import Tokenizer, normalizers, pre_tokenizers, models, trainers, processors, decoders
from transformers import PreTrainedTokenizerFast

Steps to build a tokenizer:
* Normalization
* Pre-tokenization
* Model
* Post-processor
* Decoder

For more details, see [Building a tokenizer, block by block](https://huggingface.co/learn/nlp-course/en/chapter6/8#building-a-tokenizer-block-by-block) in the Hugging Face NLP course.

In [2]:
def train_word_piece_tokenizer(corpus: List[str], vocab_size: int = 4096) -> Tokenizer:
    '''Train a WordPiece tokenizer on a text corpus.

    Args:
        corpus: Training texts. Must support ``len()`` and slicing, because
            it is fed to the trainer in fixed-size batches.
        vocab_size: Target size of the learned vocabulary (default: 4096)

    Returns:
        Tokenizer: Trained WordPiece tokenizer with the following components:
            - Normalizer: Strips surrounding whitespace, lowercases, applies
              NFD normalization and removes accents
            - Pre-tokenizer: Whitespace pre-tokenizer (separates words and
              punctuation, drops the whitespace between them)
            - Model: WordPiece with [UNK] as the unknown token
            - Post-processor: Wraps every single sequence as
              [BOS] <tokens> [EOS]
            - Decoder: WordPiece with '##' prefix

        The vocabulary reserves the special tokens [UNK], [BOS], [EOS] and
        [PAD].
    '''
    # Helper generator that yields the corpus in batches for training
    def training_corpus_iterator(batch_size=512):
        for i in range(0, len(corpus), batch_size):
            yield corpus[i: i + batch_size]

    # Initialize tokenizer with WordPiece model and [UNK] token for unknown words
    # The WordPiece model will learn subword units by breaking words into commonly occurring pieces
    tokenizer = Tokenizer(models.WordPiece(unk_token='[UNK]'))

    # Set up normalizer sequence to clean and standardize text:
    # - Strip whitespace from both ends
    # - Convert to lowercase for consistency
    # - NFD unicode normalization to separate characters and diacritics
    # - Remove accent marks while preserving base characters
    #   (must come after NFD, which exposes the accents as separate code points)
    tokenizer.normalizer = normalizers.Sequence(
        [
            normalizers.Strip(left=True, right=True),
            normalizers.Lowercase(),
            normalizers.NFD(),
            normalizers.StripAccents()
        ]
    )

    # Configure the pre-tokenizer that produces the initial word-level pieces
    # before WordPiece subword tokenization. Whitespace() splits words from
    # punctuation as well as on whitespace. It is kept inside a Sequence so
    # further pre-tokenization steps can be appended later.
    tokenizer.pre_tokenizer = pre_tokenizers.Sequence(
        [
            pre_tokenizers.Whitespace()
        ]
    )

    # Define special tokens used for various purposes:
    # [UNK] - Unknown tokens not in vocabulary
    # [BOS] - Beginning of sequence marker
    # [EOS] - End of sequence marker
    # [PAD] - Padding token for fixed length
    special_tokens = ['[UNK]', '[BOS]', '[EOS]', '[PAD]']

    # Set up WordPiece trainer with:
    # - Target vocabulary size
    # - Special tokens to reserve in vocabulary
    trainer = trainers.WordPieceTrainer(vocab_size=vocab_size, special_tokens=special_tokens)

    # Train on batches of text using the iterator.
    # `length` is the total number of texts; it only feeds progress reporting.
    tokenizer.train_from_iterator(
        training_corpus_iterator(),
        trainer=trainer,
        length=len(corpus),
    )

    # Add post-processor to wrap sequences with special tokens:
    # [BOS] at start, [EOS] at end (all with type id 0).
    # The IDs can only be looked up now, after training has built the vocabulary.
    tokenizer.post_processor = processors.TemplateProcessing(
        single='[BOS]:0 $A:0 [EOS]:0',
        special_tokens=[(special_token, tokenizer.token_to_id(special_token)) for special_token in special_tokens],
    )

    # Configure WordPiece decoder with '##' prefix
    # This helps reconstruct original text by marking subword continuations
    tokenizer.decoder = decoders.WordPiece(prefix='##')

    return tokenizer

In [3]:
# Load the training split that was previously saved to disk under
# <aclImdb dataset folder>/train (the base folder comes from project_paths).
train_dataset_folder_path = os.path.join(pp.aclImdb_dataset_folder_path, 'train')
train_dataset = datasets.load_from_disk(train_dataset_folder_path)
# Number of training examples.
# NOTE(review): assumes load_from_disk returned a single Dataset here; for a
# DatasetDict, len() would be the number of splits instead - confirm.
corpus_size = len(train_dataset)

In [4]:
# Set vocabulary size to 4096 tokens
# This determines how many unique subword tokens the WordPiece tokenizer will learn
# Smaller vocab = more subword splitting but smaller model
# Larger vocab = less splitting but larger model
vocab_size = 4096

# Pass vocab_size explicitly: the tokenizer is later saved into a folder named
# after this value, so training must not silently fall back to the function's
# default when the value above is changed.
word_piece_tokenizer = train_word_piece_tokenizer(train_dataset['text'], vocab_size=vocab_size)

# Wrap the raw tokenizer in the transformers fast-tokenizer interface and tell
# it which of the trained special tokens play which role.
word_piece_fast_tokenizer = PreTrainedTokenizerFast(
    tokenizer_object=word_piece_tokenizer,
    unk_token='[UNK]',
    bos_token='[BOS]',
    eos_token='[EOS]',
    pad_token='[PAD]'
)



In [5]:
# Save trained tokenizer to disk and reload it
# This ensures the tokenizer can be reused without retraining
# The tokenizer is saved with the vocabulary size in the folder name
tokenizer_folder_path = os.path.join(pp.word_piece_tokenizer_folder_path, str(vocab_size))
# exist_ok=True makes re-running this cell safe and avoids the
# check-then-create race of a separate os.path.isdir() test.
os.makedirs(tokenizer_folder_path, exist_ok=True)
word_piece_fast_tokenizer.save_pretrained(tokenizer_folder_path)
# Reload from disk so the rest of the notebook uses exactly what was persisted.
word_piece_fast_tokenizer = PreTrainedTokenizerFast.from_pretrained(tokenizer_folder_path)