In [None]:
!pip install datasets

import re
from tqdm import tqdm
from datasets import load_dataset

from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

dataset = load_dataset("squad")

### 1. Cleaning Text

In [None]:
sentencess_train      = [i['context'] for i in tqdm(dataset['train'])]
sentencess_validation = [i['context'] for i in tqdm(dataset['validation'])]

sentencess = sentencess_train + sentencess_validation

sents = []
for sent in tqdm(sentencess):
  sents += sent.split('.')

del sentencess, sentencess_train, sentencess_validation

def remove_special_characters(text):
    """Removes special characters from text."""
    pattern = r"[^a-zA-Z0-9\s]"  # Matches any character that is not alphanumeric or whitespace
    cleaned_text = re.sub(pattern, "", text)
    return cleaned_text

def remove_brackets(text):
    """Removes brackets from text."""
    pattern = r"[\(\)\[\]\{\}]"  # Matches any type of bracket
    cleaned_text = re.sub(pattern, "", text)
    return cleaned_text

def lowercase_text(text):
    """Converts text to lowercase."""
    cleaned_text = text.lower()
    return cleaned_text

def clean_text(text):
    """Applies all cleaning functions to text."""
    text = remove_special_characters(text)
    text = remove_brackets(text)
    text = lowercase_text(text)
    return text

cleaned_text = [clean_text(text) for text in tqdm(sents)]

100%|██████████| 87599/87599 [00:11<00:00, 7553.76it/s]
100%|██████████| 10570/10570 [00:01<00:00, 5772.89it/s]
100%|██████████| 98169/98169 [00:00<00:00, 283017.55it/s]
100%|██████████| 656224/656224 [00:05<00:00, 110273.87it/s]


### 2. Tokenization

In [None]:
tokenized_output = tokenizer(
    cleaned_text,
    padding="max_length",
    truncation=True,
    max_length=64,
    return_tensors="pt")


input_ids = tokenized_output["input_ids"]
attention_mask = tokenized_output["attention_mask"]

import torch

input_ids = torch.tensor(input_ids)
attention_mask = torch.tensor(attention_mask)

print("Tokenized Shape:", input_ids.shape)  # (num_samples, max_length)

  input_ids = torch.tensor(input_ids)
  attention_mask = torch.tensor(attention_mask)


Tokenized Shape: torch.Size([656224, 64])


### 3. Creating Features & Labels

In [None]:
labels = input_ids.clone()

rand = torch.rand(input_ids.shape)
mask_arr = (rand < 0.15) * (input_ids != tokenizer.pad_token_id) * (input_ids != tokenizer.cls_token_id)

input_ids[mask_arr] = tokenizer.mask_token_id

print("Original Sentence:", tokenizer.convert_ids_to_tokens(labels[0].tolist()))
print("Masked Sentence:", tokenizer.convert_ids_to_tokens(input_ids[0].tolist()))

Original Sentence: ['[CLS]', 'architectural', '##ly', 'the', 'school', 'has', 'a', 'catholic', 'character', '[SEP]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]']
Masked Sentence: ['[CLS]', 'architectural', '[MASK]', 'the', 'school', 'has', 'a', 'catholic', '[MASK]', '[SEP]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PA

### 4. Creating Data Generator

In [None]:
from torch.utils.data import DataLoader, TensorDataset

train_dataset = TensorDataset(input_ids, attention_mask, labels)
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)

print(f"Total batches: {len(train_loader)}")

Total batches: 41014


### 5. Defining TinyBERT

In [None]:
from transformers import BertConfig, BertForMaskedLM

# Define TinyBERT Configuration
tiny_bert_config = BertConfig(
    vocab_size=30522,           # Standard BERT vocabulary
    hidden_size=128,             # Tiny hidden size
    num_hidden_layers=2,         # Only 2 Transformer blocks
    num_attention_heads=2,       # 2 attention heads
    intermediate_size=512,       # Small feed-forward size
    hidden_act="gelu",           # Activation function
    hidden_dropout_prob=0.1,     # Dropout for regularization
    attention_probs_dropout_prob=0.1,
    max_position_embeddings=64,  # Max sequence length
    type_vocab_size=2,           # Sentence A/B embeddings
    initializer_range=0.02        # Initialize weights
)

tiny_bert = BertForMaskedLM(config=tiny_bert_config)
print(f"TinyBERT Model Parameters: {tiny_bert.num_parameters()}")  # ~4.4M parameters

BertForMaskedLM has generative capabilities, as `prepare_inputs_for_generation` is explicitly overwritten. However, it doesn't directly inherit from `GenerationMixin`. From 👉v4.50👈 onwards, `PreTrainedModel` will NOT inherit from `GenerationMixin`, and this model will lose the ability to call `generate` and other related functions.
  - If you are the owner of the model architecture code, please modify your model class such that it inherits from `GenerationMixin` (after `PreTrainedModel`, otherwise you'll get an exception).
  - If you are not the owner of the model architecture class, please contact the model code owner to update it.


TinyBERT Model Parameters: 4359354


### 6. Train BERT

In [None]:
import torch
from transformers import AdamW
optimizer = AdamW(tiny_bert.parameters(), lr=5e-4)


# Check for GPU availability
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tiny_bert.to(device)

# Training loop
epochs = 10
for epoch in range(epochs):
    total_loss = 0  # Track total loss

    for batch in tqdm(train_loader):
        optimizer.zero_grad()

        # Move data to GPU if available
        input_ids = batch[0].to(device)
        attention_mask = batch[1].to(device)
        labels = batch[2].to(device)

        # Forward pass (Masked Language Model)
        outputs = tiny_bert(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss

        # Backward pass & optimize
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    print(f"Epoch {epoch + 1}: Loss = {total_loss / len(train_loader):.4f}")

100%|██████████| 41014/41014 [11:34<00:00, 59.06it/s]


Epoch 1: Loss = 0.3844


100%|██████████| 41014/41014 [11:39<00:00, 58.60it/s]


Epoch 2: Loss = 0.2864


100%|██████████| 41014/41014 [11:38<00:00, 58.68it/s]


Epoch 3: Loss = 0.2629


100%|██████████| 41014/41014 [11:40<00:00, 58.59it/s]


Epoch 4: Loss = 0.2509


100%|██████████| 41014/41014 [11:42<00:00, 58.36it/s]


Epoch 5: Loss = 0.2431


100%|██████████| 41014/41014 [11:41<00:00, 58.50it/s]


Epoch 6: Loss = 0.2374


100%|██████████| 41014/41014 [11:41<00:00, 58.50it/s]


Epoch 7: Loss = 0.2332


100%|██████████| 41014/41014 [11:36<00:00, 58.86it/s]


Epoch 8: Loss = 0.2298


100%|██████████| 41014/41014 [11:37<00:00, 58.84it/s]


Epoch 9: Loss = 0.2273


100%|██████████| 41014/41014 [11:39<00:00, 58.67it/s]

Epoch 10: Loss = 0.2253





### 7. Saving Model

In [None]:
tiny_bert.save_pretrained("tiny_bert_trained")
tokenizer.save_pretrained("tiny_bert_trained")
print("TinyBERT Model Saved Successfully! 🎉")

TinyBERT Model Saved Successfully! 🎉


### 8. Defining Predictions Function

In [None]:
import torch

def predict_masked_word(text):
    # Tokenize input and replace a word with [MASK]
    tokens = tokenizer.tokenize(text)
    masked_index = tokens.index("[MASK]") if "[MASK]" in tokens else None

    if masked_index is None:
        raise ValueError("Text must contain '[MASK]' token for prediction.")

    # Convert tokens to input IDs
    input_ids = tokenizer.encode(text, max_length=64, truncation=True, padding="max_length", return_tensors="pt").to(device)

    # Perform inference
    with torch.no_grad():
        outputs = tiny_bert(input_ids)
        predictions = outputs.logits

    # Get the predicted word (top 3 guesses)
    predicted_token_ids = predictions[0, masked_index].topk(3).indices.tolist()
    predicted_words = tokenizer.convert_ids_to_tokens(predicted_token_ids)

    return predicted_words

### 9. Mass Prediction Function Define

In [None]:
import random
import torch

tiny_bert.eval()

def get_masked_sentence(cleaned_text):
    # Select a valid random sentence (must be at least 3 words long)
    while True:
        sentence = random.choice(cleaned_text)
        words = sentence.split()

        if len(words) > 2:  # Ensure at least 3 words for masking
            break

    # Choose a random position to mask (excluding first and last words)
    mask_index = random.randint(1, len(words) - 2)

    # Replace selected word with [MASK]
    words[mask_index] = "[MASK]"

    # Reconstruct masked sentence
    masked_sentence = " ".join(words)

    return masked_sentence

### 10. Getting bulk Predictions

In [None]:
for _ in range(10):
  masked_sentence = get_masked_sentence(cleaned_text)

  predicted_words = predict_masked_word(masked_sentence)

  print(f"Masked Sentence: {masked_sentence}")
  print(f"Predicted Words: {predicted_words}")
  print()

Masked Sentence: the television network has eight ownedandoperated and over 232 [MASK] television stations throughout the united states and its territories
Predicted Words: ['232', '230', '375']

Masked Sentence: scholars generally date these texts to around the 3rd century bce 100 to 200 years after the death of [MASK] buddha
Predicted Words: ['of', 'to', 'for']

Masked Sentence: the dissolution of the soviet union was formally enacted on december 26 [MASK] as a result of the declaration no
Predicted Words: ['26', '11', '17']

Masked Sentence: the foundation has since [MASK] to work with other charities in the city and also provided relief following hurricane ike three years later
Predicted Words: ['since', 'until', 'before']

Masked Sentence: this strategy was to a degree forced upon france geography coupled with the superiority of the british navy made it difficult for the french navy [MASK] provide significant supplies and support to french colonies
Predicted Words: ['navy', 'army'

### 11. Zip the Model to Download

In [None]:
!zip -r /content/tiny_bert_trained.zip /content/tiny_bert_trained

  adding: content/tiny_bert_trained/ (stored 0%)
  adding: content/tiny_bert_trained/config.json (deflated 48%)
  adding: content/tiny_bert_trained/model.safetensors (deflated 9%)
  adding: content/tiny_bert_trained/vocab.txt (deflated 53%)
  adding: content/tiny_bert_trained/tokenizer_config.json (deflated 75%)
  adding: content/tiny_bert_trained/generation_config.json (deflated 8%)
  adding: content/tiny_bert_trained/special_tokens_map.json (deflated 42%)
