## Create Word-Piece Tokenizer From Corpus

In [94]:
from datasets import load_dataset
from tokenizers import Tokenizer, models, trainers, pre_tokenizers, processors
from transformers import PreTrainedTokenizerFast

print("Initializing tokenizer...")
tokenizer = Tokenizer(models.WordPiece(unk_token="[UNK]"))
tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()

# Configure trainer
special_tokens = ["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"]
trainer = trainers.WordPieceTrainer(
    vocab_size=30000,
    min_frequency=2,
    special_tokens=special_tokens
)

# Train tokenizer
print("Training tokenizer...")
corpus_file = "./corpus/wiki_corpus.txt"
tokenizer.train([corpus_file], trainer)


tokenizer = PreTrainedTokenizerFast(
    tokenizer_object=tokenizer,
    unk_token="[UNK]",
    pad_token="[PAD]",
    cls_token="[CLS]",
    sep_token="[SEP]",
    mask_token="[MASK]",
)

print("Tokenizer vocab size:", tokenizer.vocab_size)
# Save for reuse like AutoTokenizer

tokenizer.save_pretrained("./bert_tokenizer")


Initializing tokenizer...
Training tokenizer...
Tokenizer vocab size: 30000


('./bert_tokenizer\\tokenizer_config.json',
 './bert_tokenizer\\special_tokens_map.json',
 './bert_tokenizer\\tokenizer.json')

## Create Word-Piece Tokenizer From JSON

In [None]:
import json
from tokenizers import Tokenizer
from tokenizers.models import WordPiece
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.processors import BertProcessing
from transformers import PreTrainedTokenizerFast

# Load filtered vocab

with open("filtered_tokenizer_vocab.json", "r", encoding="utf-8") as f:
    token_list = json.load(f)  

# Convert list to dict {token: id}
vocab_dict = {token: idx for idx, token in enumerate(token_list)}

# Initialize WordPiece tokenizer with your vocab
tokenizer = Tokenizer(WordPiece(vocab=vocab_dict, unk_token="[UNK]"))
tokenizer.pre_tokenizer = Whitespace()

# Optional: CLS/SEP post-processing (like BERT)
tokenizer.post_processor = BertProcessing(
    ("[SEP]", vocab_dict["[SEP]"]),
    ("[CLS]", vocab_dict["[CLS]"])
)


# Wrap in PreTrainedTokenizerFast
tokenizer = PreTrainedTokenizerFast(
    tokenizer_object=tokenizer,
    unk_token="[UNK]",
    pad_token="[PAD]",
    cls_token="[CLS]",
    sep_token="[SEP]",
    mask_token="[MASK]",
)

print("Tokenizer vocab size:", tokenizer.vocab_size)

# -----------------------------
# 5️⃣ Save for reuse like AutoTokenizer
# -----------------------------
tokenizer.save_pretrained("./filtered_bert_tokenizer")


In [96]:
print(tokenizer.encode("vertical"))

[17278]


In [None]:
from transformers import PreTrainedTokenizerFast

#find out how to load tokenizer - 

tokenizer = PreTrainedTokenizerFast(
    tokenizer_file="distilbert_tokenizer.json", 
    unk_token="[UNK]",
    pad_token="[PAD]",
    cls_token="[CLS]",
    sep_token="[SEP]",
    mask_token="[MASK]",
)

In [97]:

from transformers import BertConfig, BertForMaskedLM
from datasets import load_dataset
from transformers import DataCollatorForLanguageModeling



# Model config
config = BertConfig(
    vocab_size=tokenizer.vocab_size,
    max_position_embeddings=64,  # small for testing
    num_hidden_layers=2,
    num_attention_heads=2,
    hidden_size=32,
    intermediate_size=64,
    hidden_dropout_prob=0.1,
    attention_probs_dropout_prob=0.1,
)

model = BertForMaskedLM(config)

# Dataset
dataset = load_dataset(
    "text",
    data_files={"train": "./corpus/wiki_corpus.txt"},
)

def tokenize(batch):
    return tokenizer(
        batch["text"],
        truncation=True,
        padding="max_length",
        max_length=16,
    )

tokenized = dataset.map(
    tokenize,
    batched=True,
    remove_columns=["text"],
)

# Data collator for MLM
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=True,
    mlm_probability=0.15,
)


Map:   0%|          | 0/1165029 [00:00<?, ? examples/s]

## Model Training

In [104]:
from transformers import Trainer

from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./mini_bert",
    per_device_train_batch_size=128,
    gradient_accumulation_steps=1,
    learning_rate=5e-4,
    warmup_ratio=0.1,
    num_train_epochs=1,
    weight_decay=0.01,
    logging_steps=200,
    save_steps=5_000,
    fp16=True,
    report_to="none",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized["train"],
    data_collator=data_collator,
)

trainer.train()


  0%|          | 0/9102 [00:00<?, ?it/s]

{'loss': 6.9841, 'grad_norm': 0.7625142335891724, 'learning_rate': 0.0001097694840834248, 'epoch': 0.02}
{'loss': 7.0163, 'grad_norm': 1.5524271726608276, 'learning_rate': 0.0002195389681668496, 'epoch': 0.04}
{'loss': 7.0367, 'grad_norm': 0.9017856121063232, 'learning_rate': 0.00032930845225027445, 'epoch': 0.07}
{'loss': 7.0043, 'grad_norm': 1.2243329286575317, 'learning_rate': 0.0004390779363336992, 'epoch': 0.09}
{'loss': 6.9634, 'grad_norm': 1.3926067352294922, 'learning_rate': 0.0004945672079111219, 'epoch': 0.11}
{'loss': 6.9214, 'grad_norm': 1.185500144958496, 'learning_rate': 0.00048235868636308145, 'epoch': 0.13}
{'loss': 6.9069, 'grad_norm': 1.281945824623108, 'learning_rate': 0.0004701501648150409, 'epoch': 0.15}
{'loss': 6.8767, 'grad_norm': 1.4221779108047485, 'learning_rate': 0.0004579416432670004, 'epoch': 0.18}
{'loss': 6.847, 'grad_norm': 1.5178356170654297, 'learning_rate': 0.00044573312171895986, 'epoch': 0.2}
{'loss': 6.8182, 'grad_norm': 1.6842323541641235, 'learn

TrainOutput(global_step=9102, training_loss=6.604493662750975, metrics={'train_runtime': 250.7406, 'train_samples_per_second': 4646.352, 'train_steps_per_second': 36.3, 'total_flos': 5398874869248.0, 'train_loss': 6.604493662750975, 'epoch': 1.0})

In [111]:
from transformers import AutoModelForMaskedLM, PreTrainedTokenizerFast, pipeline



# Load trained model
model = AutoModelForMaskedLM.from_pretrained("./mini_bert/checkpoint-9102/")

# Masked language modeling pipeline
fill_mask = pipeline(
    "fill-mask",
    model=model,
    tokenizer=tokenizer,
)

fill_mask("The dog likes to run around [MASK] and also likes to bite ")


Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


[{'score': 0.05591578781604767,
  'token': 8680,
  'token_str': 'the',
  'sequence': 'The dog likes to run around the and also likes to bite'},
 {'score': 0.03488413617014885,
  'token': 69,
  'token_str': 'a',
  'sequence': 'The dog likes to run around a and also likes to bite'},
 {'score': 0.012131385505199432,
  'token': 16,
  'token_str': ',',
  'sequence': 'The dog likes to run around , and also likes to bite'},
 {'score': 0.010184370912611485,
  'token': 8700,
  'token_str': 'to',
  'sequence': 'The dog likes to run around to and also likes to bite'},
 {'score': 0.010054562240839005,
  'token': 8944,
  'token_str': 'been',
  'sequence': 'The dog likes to run around been and also likes to bite'}]