## Create Word-Piece Tokenizer From Corpus

In [2]:
from datasets import load_dataset
from tokenizers import Tokenizer, models, trainers, pre_tokenizers, processors
from transformers import PreTrainedTokenizerFast

print("Initializing tokenizer...")
tokenizer = Tokenizer(models.WordPiece(unk_token="[UNK]"))
tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()

# Configure trainer
special_tokens = ["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"]
trainer = trainers.WordPieceTrainer(
    vocab_size=30000,
    min_frequency=2,
    special_tokens=special_tokens
)

# Train tokenizer
print("Training tokenizer...")
corpus_file = "./corpus/wiki_corpus.txt"
tokenizer.train([corpus_file], trainer)


tokenizer = PreTrainedTokenizerFast(
    tokenizer_object=tokenizer,
    unk_token="[UNK]",
    pad_token="[PAD]",
    cls_token="[CLS]",
    sep_token="[SEP]",
    mask_token="[MASK]",
)

print("Tokenizer vocab size:", tokenizer.vocab_size)
# Save for reuse like AutoTokenizer

tokenizer.save_pretrained("./bert_tokenizer")


Initializing tokenizer...
Training tokenizer...
Tokenizer vocab size: 30000


('./bert_tokenizer\\tokenizer_config.json',
 './bert_tokenizer\\special_tokens_map.json',
 './bert_tokenizer\\tokenizer.json')

In [3]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("./bert_tokenizer")

## Create Word-Piece Tokenizer From JSON

In [None]:
import json
from tokenizers import Tokenizer
from tokenizers.models import WordPiece
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.processors import BertProcessing
from transformers import PreTrainedTokenizerFast

# Load filtered vocab

with open("filtered_tokenizer_vocab.json", "r", encoding="utf-8") as f:
    token_list = json.load(f)  

# Convert list to dict {token: id}
vocab_dict = {token: idx for idx, token in enumerate(token_list)}

# Initialize WordPiece tokenizer with your vocab
tokenizer = Tokenizer(WordPiece(vocab=vocab_dict, unk_token="[UNK]"))
tokenizer.pre_tokenizer = Whitespace()

# Optional: CLS/SEP post-processing (like BERT)
tokenizer.post_processor = BertProcessing(
    ("[SEP]", vocab_dict["[SEP]"]),
    ("[CLS]", vocab_dict["[CLS]"])
)


# Wrap in PreTrainedTokenizerFast
tokenizer = PreTrainedTokenizerFast(
    tokenizer_object=tokenizer,
    unk_token="[UNK]",
    pad_token="[PAD]",
    cls_token="[CLS]",
    sep_token="[SEP]",
    mask_token="[MASK]",
)

print("Tokenizer vocab size:", tokenizer.vocab_size)

# -----------------------------
# 5️⃣ Save for reuse like AutoTokenizer
# -----------------------------
tokenizer.save_pretrained("./filtered_bert_tokenizer")


FileNotFoundError: [Errno 2] No such file or directory: 'filtered_tokenizer_vocab.json'

Testing the tokenizer

In [15]:
print(tokenizer.encode("vertical"))

[17278]


# Define Model

In [23]:

from transformers import BertConfig, BertForMaskedLM
from datasets import load_dataset
from transformers import DataCollatorForLanguageModeling
import torch

# Model config
config = BertConfig(
    vocab_size=tokenizer.vocab_size,
    max_position_embeddings=64,  # small for testing
    num_hidden_layers=2,
    num_attention_heads=2,
    hidden_size=64,
    intermediate_size=64,
    hidden_dropout_prob=0.1,
    attention_probs_dropout_prob=0.1,
)

model = BertForMaskedLM(config)

#torch.compile(model)


# Define Dataset

In [20]:
# Dataset
dataset = load_dataset(
    "text",
    data_files={"train": "./corpus/wiki_corpus.txt"},
)

def tokenize(batch):
    return tokenizer(
        batch["text"],
        truncation=True,
        padding="max_length",
        max_length=64,
    )

tokenized = dataset.map(
    tokenize,
    batched=True,
    remove_columns=["text"],
)

# Data collator for MLM
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=True,
    mlm_probability=0.15,
)

Map:   0%|          | 0/1165029 [00:00<?, ? examples/s]

## Model Training

In [None]:
from transformers import Trainer

from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./mini_bert",
    per_device_train_batch_size=128,
    gradient_accumulation_steps=1,
    learning_rate=5e-4,
    warmup_ratio=0.1,
    num_train_epochs=1,
    weight_decay=0.01,
    logging_steps=200,
    save_steps=5_000,
    bf16=True,
    fp16=False,             # Disable FP16 to avoid conflicts
    torch_compile=False,
    report_to="none"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized["train"],
    data_collator=data_collator,
)

trainer.train()


  0%|          | 0/9102 [00:00<?, ?it/s]

BackendCompilerFailed: backend='inductor' raised:
RuntimeError: Cannot find a working triton installation. Either the package is not installed or it is too old. More information on installing Triton can be found at https://github.com/openai/triton

Set TORCH_LOGS="+dynamo" and TORCHDYNAMO_VERBOSE=1 for more information


You can suppress this exception and fall back to eager by setting:
    import torch._dynamo
    torch._dynamo.config.suppress_errors = True


In [8]:
from transformers import AutoModelForMaskedLM, PreTrainedTokenizerFast, pipeline



# Load trained model
model = AutoModelForMaskedLM.from_pretrained("./mini_bert/checkpoint-9102/")

# Masked language modeling pipeline
fill_mask = pipeline(
    "fill-mask",
    model=model,
    tokenizer=tokenizer,
)

fill_mask("The dog likes to run around [MASK] and also likes to bite ")


Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


[{'score': 0.016736194491386414,
  'token': 16,
  'token_str': ',',
  'sequence': 'The dog likes to run around , and also likes to bite'},
 {'score': 0.012121855281293392,
  'token': 8680,
  'token_str': 'the',
  'sequence': 'The dog likes to run around the and also likes to bite'},
 {'score': 0.007807798683643341,
  'token': 8807,
  'token_str': 'it',
  'sequence': 'The dog likes to run around it and also likes to bite'},
 {'score': 0.005880240350961685,
  'token': 8832,
  'token_str': 'which',
  'sequence': 'The dog likes to run around which and also likes to bite'},
 {'score': 0.005040692165493965,
  'token': 8775,
  'token_str': 'he',
  'sequence': 'The dog likes to run around he and also likes to bite'}]