In [7]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# Step One: Data Preparation

In [8]:
pip install datasets

Note: you may need to restart the kernel to use updated packages.


## Loading the English Wikipedia dataset from Hugging Face datasets

In [1]:
from datasets import load_dataset

wiki_dataset = load_dataset("wikipedia", "20220301.en", trust_remote_code=True)

#### Check Available Splits

In [2]:
print("Available splits:", wiki_dataset.keys())

Available splits: dict_keys(['train'])


#### Access the "train" split directly

In [2]:
train_dataset = wiki_dataset["train"]

# Print the dataset features (column names)
print("Features:", train_dataset.features)

# Print the number of examples in the dataset
print("Number of examples:", len(train_dataset))

Features: {'id': Value(dtype='string', id=None), 'url': Value(dtype='string', id=None), 'title': Value(dtype='string', id=None), 'text': Value(dtype='string', id=None)}
Number of examples: 6458670


In [5]:
type(train_dataset)

datasets.arrow_dataset.Dataset

#### Print the first few examples in the dataset

In [6]:
for example_idx in range(5):
    example = train_dataset[example_idx]
    
    print({
        "id": example["id"],
        "url": example["url"],
        "title": example["title"],
        "text": " ".join(example["text"].split(".")[:2]) + "..."  # Extract first 1 to 2 sentences
    })
    print("\n\n")

{'id': '12', 'url': 'https://en.wikipedia.org/wiki/Anarchism', 'title': 'Anarchism', 'text': 'Anarchism is a political philosophy and movement that is sceptical of authority and rejects all involuntary, coercive forms of hierarchy  Anarchism calls for the abolition of the state, which it holds to be unnecessary, undesirable, and harmful...'}



{'id': '25', 'url': 'https://en.wikipedia.org/wiki/Autism', 'title': 'Autism', 'text': "Autism is a neurodevelopmental disorder characterized by difficulties with social interaction and communication, and by restricted and repetitive behavior  Parents often notice signs during the first three years of their child's life..."}



{'id': '39', 'url': 'https://en.wikipedia.org/wiki/Albedo', 'title': 'Albedo', 'text': 'Albedo (; ) is the measure of the diffuse reflection of solar radiation out of the total solar radiation and measured on a scale from 0, corresponding to a black body that absorbs all incident radiation, to 1, corresponding to a body t

## Train a new tokenizer specifically for the Wikipedia corpus.

In [None]:
pip install tokenizers

In [46]:
from tokenizers import Tokenizer, models, trainers, pre_tokenizers, decoders
from tqdm import tqdm

# Define tokenizer training parameters
vocab_size = 16000
max_token_length = 512
special_tokens = ["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"]

# Initialize tokenizer
tokenizer = Tokenizer(models.BPE())

# Customize tokenizer settings
tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel()
tokenizer.decoder = decoders.ByteLevel()
tokenizer.add_special_tokens(special_tokens)

# Train tokenizer
trainer = trainers.BpeTrainer(
    vocab_size=vocab_size,
    special_tokens=special_tokens,
    min_frequency=2,
    show_progress=True,  # Enable the trainer's built-in progress bar
    max_token_length=max_token_length
)

# Iterate over the first 50,000 examples in the dataset and extract text
texts = []
for i in range(50000):
    texts.append(train_dataset[i]["text"])


# Define a custom progress bar
with tqdm(total=len(texts), desc='Training tokenizer') as pbar:
    # Train tokenizer on the extracted texts
    tokenizer.train_from_iterator(texts, trainer=trainer)

    # Update tqdm progress bar
    pbar.update(len(texts))
    
# Save trained tokenizer
tokenizer.save("wikipedia_tokenizer.json")


Training tokenizer:   0%|          | 0/50000 [00:00<?, ?it/s]






Training tokenizer: 100%|██████████| 50000/50000 [04:37<00:00, 180.09it/s]


#### Load the trained tokenizer

In [5]:
# Load the previously trained tokenizer from the uploaded file
from tokenizers import Tokenizer
tokenizer = Tokenizer.from_file("/kaggle/input/wikipedia-tokenizer/wikipedia_tokenizer_updated.json")

#### Tokenize a sample text

In [6]:
sample_text = "This is a sample text for tokenization."

encoded = tokenizer.encode(sample_text)

# Get the tokens
tokens = encoded.tokens

# Get the token IDs
ids = encoded.ids

# Print tokens and corresponding token IDs
for token, token_id in zip(tokens, ids):
    print(f"Token: {token}, Token ID: {token_id}")

# Visualize the tokenization
print("\nTokenization visualization:")
print(encoded.tokens)
print(encoded.ids)


Token: ĠThis, Token ID: 769
Token: Ġis, Token ID: 301
Token: Ġa, Token ID: 216
Token: Ġsample, Token ID: 9698
Token: Ġtext, Token ID: 3230
Token: Ġfor, Token ID: 293
Token: Ġto, Token ID: 248
Token: ken, Token ID: 4149
Token: ization, Token ID: 1418
Token: ., Token ID: 18

Tokenization visualization:
['ĠThis', 'Ġis', 'Ġa', 'Ġsample', 'Ġtext', 'Ġfor', 'Ġto', 'ken', 'ization', '.']
[769, 301, 216, 9698, 3230, 293, 248, 4149, 1418, 18]


In [7]:
vocabulary = tokenizer.get_vocab()

num_words = len(vocabulary)

print("Total number of words in the tokenizer:", num_words)

Total number of words in the tokenizer: 16000


When training a tokenizer with a specified vocabulary size (vocab_size). 

For many tokenization algorithms like Byte Pair Encoding (BPE), once the vocabulary size reaches the specified limit, the tokenizer typically stops adding new tokens and the training process concludes. 

## Define a data collator for MLM tasks

# Step 2: Define and Prepare the BERT Model

In [7]:
import random

class CustomDataCollatorForLanguageModeling:
    def __init__(self, tokenizer, mlm_probability=0.2):
        self.tokenizer = tokenizer
        self.mlm_probability = mlm_probability

    def __call__(self, examples):
        batch_inputs = []
        batch_labels = []

        for example in examples:
            encoded = self.tokenizer.encode(example['text'])
            inputs = encoded.ids
            labels = inputs[:]
            
            # Apply masking with MLM probability
            for i, token_id in enumerate(inputs):
                # Skip special tokens [CLS], [SEP], [PAD]
                if token_id in [self.tokenizer.token_to_id("[CLS]"), 
                                self.tokenizer.token_to_id("[SEP]"), 
                                self.tokenizer.token_to_id("[PAD]")]:
                    continue

                # Apply masking with MLM probability
                if random.random() < self.mlm_probability:
                    # 80% of the time, replace the token with [MASK]
                    if random.random() < 0.8:
                        inputs[i] = self.tokenizer.token_to_id("[MASK]")
                    # 10% of the time, keep the token unchanged
                    elif random.random() < 0.5:
                        pass  # Do nothing
                    # 10% of the time, replace the token with a random token
                    else:
                        inputs[i] = random.choice(range(self.tokenizer.get_vocab_size()))

                    # Record the original token in the labels for computing loss
                    labels[i] = token_id
            
            # Pad sequences to the maximum length (512)
            while len(inputs) < 512:
                inputs.append(self.tokenizer.token_to_id("[PAD]"))
                labels.append(self.tokenizer.token_to_id("[PAD]"))

            # Truncate sequences longer than 512
            inputs = inputs[:512]
            labels = labels[:512]
            
            batch_inputs.append(inputs)
            batch_labels.append(labels)
            
        return {
            "input_ids": batch_inputs,
            "labels": batch_labels
        }


In [8]:
from transformers import BertConfig, BertForMaskedLM

# Define the BERT configuration
config = BertConfig(
    vocab_size=16000,  # Specify the vocabulary size
    num_hidden_layers=8,    # Number of transformer layers
    hidden_size=256,        # Size of the hidden layers
    num_attention_heads=4,  # Number of attention heads
    max_position_embeddings=512,  # Maximum input sequence length
)

# Initialize the BERT model with masked language modeling head
model = BertForMaskedLM(config=config)

batch_size = 32
learning_rate = 1e-4

# Print the model architecture
print(model)


BertForMaskedLM(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(16000, 256, padding_idx=0)
      (position_embeddings): Embedding(512, 256)
      (token_type_embeddings): Embedding(2, 256)
      (LayerNorm): LayerNorm((256,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-7): 8 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=256, out_features=256, bias=True)
              (key): Linear(in_features=256, out_features=256, bias=True)
              (value): Linear(in_features=256, out_features=256, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=256, out_features=256, bias=True)
              (LayerNorm): LayerNorm((256,), eps=1e-12, elementwise_aff

# Step 3: Overfitting on a Subset of The Data

In [56]:
import pandas as pd

# Initialize an empty list to store the data
data = []

# Iterate over the first 1000 examples in the dataset
for example_idx in range(100):
    example = train_dataset[example_idx]
    
    # Extract relevant information and append to the data list
    data.append({
        "id": example["id"],
        "url": example["url"],
        "title": example["title"],
        "text": " ".join(example["text"].split(".")[:2]) + "..."  # Extract first 1 to 2 sentences
    })

# Create a DataFrame from the collected data
df = pd.DataFrame(data)

# Display the DataFrame
print(df.head())

    id                                      url      title  \
0   12  https://en.wikipedia.org/wiki/Anarchism  Anarchism   
1   25     https://en.wikipedia.org/wiki/Autism     Autism   
2   39     https://en.wikipedia.org/wiki/Albedo     Albedo   
3  290          https://en.wikipedia.org/wiki/A          A   
4  303    https://en.wikipedia.org/wiki/Alabama    Alabama   

                                                text  
0  Anarchism is a political philosophy and moveme...  
1  Autism is a neurodevelopmental disorder charac...  
2  Albedo (; ) is the measure of the diffuse refl...  
3  A, or a, is the first letter and the first vow...  
4  Alabama () is a state in the Southeastern regi...  


In [57]:
from datasets import Dataset
from torch.utils.data import DataLoader

subset_train_dataset = Dataset.from_pandas(df)

# Initialize the data collator
data_collator = CustomDataCollatorForLanguageModeling(tokenizer)

# Initialize the data loader
train_dataloader = DataLoader(
    subset_train_dataset,
    batch_size=batch_size,
    collate_fn=data_collator,
)


In [16]:
import torch
from torch.utils.data import DataLoader
from transformers import AdamW, get_linear_schedule_with_warmup

num_epochs=5

# Optimizer and scheduler
optimizer = AdamW(model.parameters(), lr=learning_rate)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=len(train_dataloader) * num_epochs)

# Training loop
for epoch in range(num_epochs):
    model.train()
    total_loss = 0.0
    for batch in train_dataloader:
        inputs = batch["input_ids"]
        labels = batch["labels"]

        optimizer.zero_grad()

        outputs = model(input_ids=torch.tensor(inputs), labels=torch.tensor(labels))
        loss = outputs.loss
        loss.backward()

        optimizer.step()
        scheduler.step()

        total_loss += loss.item()

    avg_loss = total_loss / len(train_dataloader)

    print(f"Epoch {epoch+1}/{num_epochs}: Avg. Loss: {avg_loss:.4f}")

We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.


Epoch 1/5: Avg. Loss: 9.6651
Epoch 2/5: Avg. Loss: 9.5244
Epoch 3/5: Avg. Loss: 9.4156
Epoch 4/5: Avg. Loss: 9.3387
Epoch 5/5: Avg. Loss: 9.2943


#### Evaluation on the training subset

In [13]:
model.eval()
eval_loss = 0.0
for batch in train_dataloader:
    inputs = batch["input_ids"]
    labels = batch["labels"]

    with torch.no_grad():
        outputs = model(input_ids=torch.tensor(inputs), labels=torch.tensor(labels))
        eval_loss += outputs.loss.item()

avg_eval_loss = eval_loss / len(train_dataloader)
print(f"Avg. Evaluation Loss on Training Subset: {avg_eval_loss:.4f}")

Avg. Evaluation Loss on Training Subset: 9.2897


# Step 4: Training on the Entire Dataset

#### Train on the first 100000 samples

In [9]:
import pandas as pd

# Initialize an empty list to store the data
data = []

for example_idx in range(10000):
    example = train_dataset[example_idx]
    
    # Extract relevant information and append to the data list
    data.append({
        "id": example["id"],
        "url": example["url"],
        "title": example["title"],
        "text": example["text"],
    })

# Create a DataFrame from the collected data
df = pd.DataFrame(data)

# Display the DataFrame
print(df.head())

    id                                      url      title  \
0   12  https://en.wikipedia.org/wiki/Anarchism  Anarchism   
1   25     https://en.wikipedia.org/wiki/Autism     Autism   
2   39     https://en.wikipedia.org/wiki/Albedo     Albedo   
3  290          https://en.wikipedia.org/wiki/A          A   
4  303    https://en.wikipedia.org/wiki/Alabama    Alabama   

                                                text  
0  Anarchism is a political philosophy and moveme...  
1  Autism is a neurodevelopmental disorder charac...  
2  Albedo (; ) is the measure of the diffuse refl...  
3  A, or a, is the first letter and the first vow...  
4  Alabama () is a state in the Southeastern regi...  


In [10]:
from datasets import Dataset
from torch.utils.data import DataLoader

trainn_dataset = Dataset.from_pandas(df)

# Initialize the data collator
train_data_collator = CustomDataCollatorForLanguageModeling(tokenizer)

# Initialize the data loader
train_dataloader = DataLoader(
    trainn_dataset,
    batch_size=batch_size,
    collate_fn=train_data_collator,
)

#### Validation set containing 5000 samples

In [11]:
import pandas as pd

# Initialize an empty list to store the data
data = []

for example_idx in range(1000):
    example = train_dataset[10000 + example_idx]
    
    # Extract relevant information and append to the data list
    data.append({
        "id": example["id"],
        "url": example["url"],
        "title": example["title"],
        "text": example["text"],
    })

# Create a DataFrame from the collected data
df = pd.DataFrame(data)

# Display the DataFrame
print(df.head())

     id                                                url  \
0  2441  https://en.wikipedia.org/wiki/House%20of%20Asc...   
1  2443         https://en.wikipedia.org/wiki/Acceleration   
2  2444  https://en.wikipedia.org/wiki/Conservation%20a...   
3  2447      https://en.wikipedia.org/wiki/Anton%20Chekhov   
4  2448  https://en.wikipedia.org/wiki/Action%20Against...   

                                               title  \
0                                   House of Ascania   
1                                       Acceleration   
2  Conservation and restoration of cultural property   
3                                      Anton Chekhov   
4                              Action Against Hunger   

                                                text  
0  The House of Ascania () was a dynasty of Germa...  
1  In mechanics, acceleration is the rate of chan...  
2  The conservation and restoration of cultural p...  
3  Anton Pavlovich Chekhov (; 29 January 1860 – 1...  
4  Action Again

In [12]:
from datasets import Dataset

val_dataset = Dataset.from_pandas(df)

# Initialize the data collator
val_data_collator = CustomDataCollatorForLanguageModeling(tokenizer)

# Initialize the data loader
val_dataloader = DataLoader(
    val_dataset,
    batch_size=batch_size,
    collate_fn=val_data_collator,
)

#### Training Loop

In [63]:
import torch
from transformers import BertForMaskedLM, AdamW, get_linear_schedule_with_warmup

# Initialize the BERT model with the same configuration
model = BertForMaskedLM(config=config)

num_epochs = 3

# Optimizer and scheduler
optimizer = AdamW(model.parameters(), lr=learning_rate)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=len(train_dataloader) * num_epochs)

# Training loop
for epoch in range(num_epochs):
    model.train()
    total_loss = 0.0
    for batch in train_dataloader:
        inputs = batch["input_ids"]
        labels = batch["labels"]

        optimizer.zero_grad()

        outputs = model(input_ids=torch.tensor(inputs), labels=torch.tensor(labels))
        loss = outputs.loss
        loss.backward()

        optimizer.step()
        scheduler.step()

        total_loss += loss.item()

    avg_loss = total_loss / len(train_dataloader)

    print(f"Epoch {epoch+1}/{num_epochs}: Avg. Loss: {avg_loss:.4f}")

    # Validation
    model.eval()
    val_loss = 0.0
    for batch in val_dataloader:
        inputs = batch["input_ids"]
        labels = batch["labels"]

        with torch.no_grad():
            outputs = model(input_ids=torch.tensor(inputs), labels=torch.tensor(labels))
            val_loss += outputs.loss.item()

    avg_val_loss = val_loss / len(val_dataloader)
    print(f"Avg. Validation Loss: {avg_val_loss:.4f}")




Epoch 1/3: Avg. Loss: 9.1700
Avg. Validation Loss: 8.5872
Epoch 2/3: Avg. Loss: 8.1759
Avg. Validation Loss: 7.7928
Epoch 3/3: Avg. Loss: 7.6371
Avg. Validation Loss: 7.5218


#### Save The Trained Model

In [64]:
import os

# Define the directory where you want to save the trained model
output_dir = "bert_model_trained"

# Create the directory if it doesn't exist
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

# Save the trained model to the specified directory
model_save_path = os.path.join(output_dir, "bert_model.pth")
torch.save(model.state_dict(), model_save_path)

print("Trained model saved at:", model_save_path)


Trained model saved at: bert_model_trained/bert_model.pth


#### Test set containing 200 samples

In [16]:
import pandas as pd

# Initialize an empty list to store the data
data = []

for example_idx in range(50):
    example = train_dataset[11000 + example_idx]
    
    # Extract relevant information and append to the data list
    data.append({
        "id": example["id"],
        "url": example["url"],
        "title": example["title"],
        "text": example["text"],
    })

# Create a DataFrame from the collected data
df = pd.DataFrame(data)

# Display the DataFrame
print(df.head())

     id                                                url  \
0  2826  https://en.wikipedia.org/wiki/Antigonid%20dynasty   
1  2827             https://en.wikipedia.org/wiki/Abingdon   
2  2830           https://en.wikipedia.org/wiki/Abjuration   
3  2833              https://en.wikipedia.org/wiki/Abitibi   
4  2834  https://en.wikipedia.org/wiki/A%20Vindication%...   

                                  title  \
0                     Antigonid dynasty   
1                              Abingdon   
2                            Abjuration   
3                               Abitibi   
4  A Vindication of the Rights of Woman   

                                                text  
0  The Antigonid dynasty (; ) was a Hellenistic d...  
1  Abingdon may refer to:\n\nPlaces\n\nUnited Kin...  
2  Abjuration is the solemn repudiation, abandonm...  
3  Abitibi may refer to:\n\n Abitibi Canyon, Onta...  
4  A Vindication of the Rights of Woman: with Str...  


In [17]:
from datasets import Dataset

test_dataset = Dataset.from_pandas(df)

# Initialize the data collator
test_data_collator = CustomDataCollatorForLanguageModeling(tokenizer)

# Initialize the data loader
test_dataloader = DataLoader(
    test_dataset,
    batch_size=batch_size,
    collate_fn=test_data_collator,
)

#### Top-10 Token Accuracy 

In [25]:
import torch

# Function to compute top-k token accuracy
def compute_top_k_accuracy(predictions, labels, k=10):
    correct_predictions = 0

    # Flatten the predictions tensor along the second dimension
    pred_flat = predictions.view(-1, predictions.size(-1))
    total_samples = len(pred_flat)

    # Get the top-k predicted tokens
    top_k_preds = torch.flip(pred_flat.argsort(), dims=[1])[:, :k]

    # Flatten the labels tensor
    labels_flat = labels.view(-1, 1)

    # Check if the true token is among the top-k predictions
    correct_predictions = torch.sum(torch.eq(top_k_preds, labels_flat)).item()

    accuracy = correct_predictions / total_samples
    return accuracy


# Evaluate the model on the test set with Top-10 Token Accuracy
model.eval()
test_loss = 0.0
top_k_predictions = []
all_labels = []

for batch in test_dataloader:
    inputs = batch["input_ids"]
    labels = batch["labels"]

    with torch.no_grad():
        outputs = model(input_ids=torch.tensor(inputs), labels=torch.tensor(labels))
        test_loss += outputs.loss.item()

        # Get the model's predictions
        predictions = outputs.logits

        # Append the predictions to the list
        top_k_predictions.append(predictions)

        # Append the labels to the list
        labels = torch.tensor(labels)
        all_labels.append(labels)

avg_test_loss = test_loss / len(test_dataloader)

# Concatenate the lists of predictions and labels
top_k_predictions = torch.cat(top_k_predictions, dim=0)
all_labels = torch.cat(all_labels, dim=0)

# Compute Top-10 Token Accuracy
top_10_accuracy = compute_top_k_accuracy(top_k_predictions, all_labels)
print(f"Avg. Test Loss: {avg_test_loss:.4f}")
print(f"Top-10 Token Accuracy: {top_10_accuracy:.4f}")

Avg. Test Loss: 9.2926
Top-10 Token Accuracy: 0.0402


# Step 5: Optimization Techniques

In [23]:
import torch
from transformers import BertForMaskedLM, AdamW, get_linear_schedule_with_warmup
from torch.cuda.amp import GradScaler

# Initialize the BERT model with the same configuration
model = BertForMaskedLM(config=config)

num_epochs = 3
learning_rate = 5e-5
warmup_steps = 5

# Optimizer and scheduler
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=warmup_steps, num_training_steps=len(train_dataloader) * num_epochs)

# Initialize mixed precision training
scaler = GradScaler()

# Training loop
for epoch in range(num_epochs):
    model.train()
    total_loss = 0.0
    for batch in train_dataloader:
        inputs = batch["input_ids"]
        labels = batch["labels"]
        
        optimizer.zero_grad()
        
        with torch.cuda.amp.autocast():  # Use torch.cuda.amp.autocast for mixed precision training
            outputs = model(input_ids=torch.tensor(inputs), labels=torch.tensor(labels))
            loss = outputs.loss
        
        scaler.scale(loss).backward()
        scaler.unscale_(optimizer)  # Unscales the gradients of optimizer's assigned params in-place
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)  # Clip gradients to prevent explosion
        scaler.step(optimizer)
        scaler.update()
        
        scheduler.step()

        total_loss += loss.item()

    avg_loss = total_loss / len(train_dataloader)
    print(f"Epoch {epoch+1}/{num_epochs}: Avg. Loss: {avg_loss:.4f}")

    # Validation
    model.eval()
    val_loss = 0.0
    for batch in val_dataloader:
        inputs = batch["input_ids"]
        labels = batch["labels"]

        with torch.no_grad():
            outputs = model(input_ids=torch.tensor(inputs), labels=torch.tensor(labels))
            val_loss += outputs.loss.item()

    avg_val_loss = val_loss / len(val_dataloader)
    print(f"Avg. Validation Loss: {avg_val_loss:.4f}")

# Save trained model
model.save_pretrained("bert_masked_lm_trained")




Epoch 1/3: Avg. Loss: 9.4602
Avg. Validation Loss: 9.1395
Epoch 2/3: Avg. Loss: 8.9265
Avg. Validation Loss: 8.7240
Epoch 3/3: Avg. Loss: 8.6416
Avg. Validation Loss: 8.5793


1. AdamW Optimizer: The code uses the AdamW optimizer to optimize the parameters of the BERT model. AdamW is a variant of the Adam optimizer that incorporates weight decay regularization to prevent overfitting.
2. Learning Rate Scheduler with Warmup: The code utilizes the get_linear_schedule_with_warmup function to create a linear learning rate scheduler with warmup steps. This scheduler gradually increases the learning rate during the warmup phase and then linearly decreases it during the training phase.
3. Mixed Precision Training: The code implements mixed precision training using PyTorch's autocast and GradScaler. Mixed precision training leverages hardware capabilities (such as Tensor Cores on GPUs) to speed up training and reduce memory usage by performing some operations in lower precision (e.g., half-precision floating-point format) while others remain in full precision.

In [15]:
import torch
from torch.utils.data import DataLoader
from transformers import BertForMaskedLM, AdamW, get_linear_schedule_with_warmup
from torch.cuda.amp import GradScaler
from torch.nn.utils import clip_grad_norm_
from transformers import BertTokenizer

# Initialize the BERT model
model = BertForMaskedLM(config=config)

num_epochs = 1
learning_rate = 5e-5
warmup_steps = 0 
gradient_accumulation_steps = 4  # Accumulate gradients over 4 batches
max_grad_norm = 1.0  # Maximum gradient norm for clipping

# Optimizer and scheduler
optimizer = AdamW(model.parameters(), lr=learning_rate)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=warmup_steps,
                                            num_training_steps=len(train_dataloader) * num_epochs)

# Initialize mixed precision training
scaler = GradScaler()

# Training loop
for epoch in range(num_epochs):
    model.train()
    total_loss = 0.0
    for step, batch in enumerate(train_dataloader):
        inputs = batch["input_ids"]
        labels = batch["labels"]
        
        # Forward pass
        with torch.cuda.amp.autocast():
            outputs = model(input_ids=torch.tensor(inputs), labels=torch.tensor(labels))
            loss = outputs.loss / gradient_accumulation_steps  # Scale the loss for gradient accumulation
        
        # Backward pass
        scaler.scale(loss).backward()
        
        # Gradient accumulation
        if (step + 1) % gradient_accumulation_steps == 0:
            # Clip gradients
            clip_grad_norm_(model.parameters(), max_grad_norm)
            
            # Update model parameters
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad()
        
        total_loss += loss.item()
    
    avg_loss = total_loss / len(train_dataloader)
    print(f"Epoch {epoch+1}/{num_epochs}: Avg. Loss: {avg_loss:.4f}")

    # Validation
    model.eval()
    val_loss = 0.0
    for batch in val_dataloader:
        inputs = batch["input_ids"]
        labels = batch["labels"]

        with torch.no_grad():
            outputs = model(input_ids=torch.tensor(inputs), labels=torch.tensor(labels))
            val_loss += outputs.loss.item()

    avg_val_loss = val_loss / len(val_dataloader)
    print(f"Avg. Validation Loss: {avg_val_loss:.4f}")

# Save trained model
model.save_pretrained("bert_masked_lm_trained")


Epoch 1/1: Avg. Loss: 2.4213
Avg. Validation Loss: 9.6196


1. gradient_accumulation_steps is set to 4, meaning gradients are accumulated over 4 batches before performing an optimization step.
2. Gradient clipping is performed using clip_grad_norm_() with a maximum gradient norm of 1.0.

# Step 6: Evaluation on Test Set

In [22]:
import torch
from sklearn.metrics import f1_score

# Function to compute top-k token accuracy
def compute_top_k_accuracy(predictions, labels, k=10):
    correct_predictions = 0

    # Flatten the predictions tensor along the second dimension
    pred_flat = predictions.view(-1, predictions.size(-1))
    total_samples = len(pred_flat)

    # Get the top-k predicted tokens
    top_k_preds = torch.flip(pred_flat.argsort(), dims=[1])[:, :k]

    # Flatten the labels tensor
    labels_flat = labels.view(-1, 1)

    # Check if the true token is among the top-k predictions
    correct_predictions = torch.sum(torch.eq(top_k_preds, labels_flat)).item()

    accuracy = correct_predictions / total_samples
    return accuracy


# Function to compute F1 score
def compute_f1_score(predictions, labels):
    # Flatten the predictions and labels tensors
    pred_flat = predictions.argmax(dim=-1).view(-1)
    labels_flat = labels.view(-1)

    # Compute F1 score
    f1 = f1_score(labels_flat.cpu().numpy(), pred_flat.cpu().numpy(), average='micro')
    return f1


# Evaluate the model on the test set with Top-10 Token Accuracy and F1 score
model.eval()
test_loss = 0.0
top_k_predictions = []
all_labels = []

for batch in test_dataloader:
    inputs = batch["input_ids"]
    labels = batch["labels"]

    with torch.no_grad():
        outputs = model(input_ids=torch.tensor(inputs), labels=torch.tensor(labels))
        test_loss += outputs.loss.item()

        # Get the model's predictions
        predictions = outputs.logits

        # Append the predictions to the list
        top_k_predictions.append(predictions)

        # Append the labels to the list
        labels = torch.tensor(labels)
        all_labels.append(labels)

avg_test_loss = test_loss / len(test_dataloader)

# Concatenate the lists of predictions and labels
top_k_predictions = torch.cat(top_k_predictions, dim=0)
all_labels = torch.cat(all_labels, dim=0)

# Compute Top-10 Token Accuracy
top_10_accuracy = compute_top_k_accuracy(top_k_predictions, all_labels)
print(f"Avg. Test Loss: {avg_test_loss:.4f}")
print(f"Top-10 Token Accuracy: {top_10_accuracy:.4f}")

# Compute F1 score
f1 = compute_f1_score(top_k_predictions, all_labels)
print(f"F1-score: {f1:.4f}")

Avg. Test Loss: 9.6166
Top-10 Token Accuracy: 0.0109
F1-score: 0.0040


In [29]:
import torch

# Example Showcase
def showcase_examples(model, tokenizer, dataloader, num_examples=5):
    model.eval()
    example_count = 0

    with torch.no_grad():
        for batch in dataloader:
            inputs = batch["input_ids"]
            labels = batch["labels"]

            outputs = model(input_ids=torch.tensor(inputs))
            predictions = torch.argmax(outputs.logits, dim=-1)

            for input_ids, label, prediction in zip(inputs, labels, predictions):
                text = tokenizer.decode(input_ids, skip_special_tokens=True)
                label = tokenizer.decode(label, skip_special_tokens=True)
                prediction = tokenizer.decode(list(prediction), skip_special_tokens=True)

                print(f"Text: {text}")
                print(f"True Label: {label}")
                print(f"Predicted Label: {prediction}")
                print()

                example_count += 1
                if example_count >= num_examples:
                    return

                

# Showcase examples from the test set
showcase_examples(model, tokenizer, test_dataloader, num_examples=5)

Text: igon dynasty (; ) was a Hellenistic dynasty of Dorian Greek provenance, descended from Alexander the's Antonus Iophthal ("the OneEyed") that mainly in Macedonia 

History
Succeeding Antipatrid dynasty in of Macedonia, Antigus ruled mostly over Asia Cl Syria...
True Label:  The Antigonid dynasty (; ) was a Hellenistic dynasty of Dorian Greek provenance, descended from Alexander the Great's general Antigonus I Monophthalmus ("the One-Eyed") that ruled mainly in Macedonia 

History

Succeeding the Antipatrid dynasty in much of Macedonia, Antigonus ruled mostly over Asia Minor and northern Syria...
Predicted Label:  contrib and laid knownrows Wright rele anded and and commun pen is of hamlet rele Bulgar Bulgar Mazident Jin neighbourhood contribGorows androws Ok known volt Ministers vastrows Court contrib Cyr Maz Euro Pacific Cyrfive varieties 2009 estimisticrows stressAfAf hamlet contribpossrows Jin rational known contrib singer Representative laid probleyear known is rationalrows Ch

In [32]:
def analyze_examples(model, tokenizer, dataloader):
    model.eval()
    misclassified_examples = []

    with torch.no_grad():
        for batch in dataloader:
            inputs = batch["input_ids"]
            labels = batch["labels"]

            outputs = model(input_ids=torch.tensor(inputs))
            predictions = torch.argmax(outputs.logits, dim=-1)

            for input_ids, label, prediction in zip(inputs, labels, predictions):
                text = tokenizer.decode(input_ids, skip_special_tokens=True)
                label = tokenizer.decode(label, skip_special_tokens=True)
                prediction = tokenizer.decode(list(prediction), skip_special_tokens=True)

                if label != prediction:
                    misclassified_examples.append((text, label, prediction))

    # Print misclassified examples
    print("Misclassified Examples:")
    for idx, (text, label, prediction) in enumerate(misclassified_examples, start=1):
        print(f"Example {idx}:")
        print(f"Text: {text}")
        print(f"True Label: {label}")
        print(f"Predicted Label: {prediction}")
        print()

# Analyze examples to understand model performance, strengths, weaknesses, and potential areas for improvement
analyze_examples(model, tokenizer, test_dataloader)


Misclassified Examples:
Example 1:
Text:  The Antigonid dynasty ) was Hellenistic dynasty of Dorian Greek provenance, descended from the Great general Antonus I Monophthalmus ("the One-Ey") that mainly in Macedonia 

History

Sucing the Antipatrid dynasty in log Macedonia,igon ruled mostly over Asia Minor and northern...
True Label:  The Antigonid dynasty (; ) was a Hellenistic dynasty of Dorian Greek provenance, descended from Alexander the Great's general Antigonus I Monophthalmus ("the One-Eyed") that ruled mainly in Macedonia 

History

Succeeding the Antipatrid dynasty in much of Macedonia, Antigonus ruled mostly over Asia Minor and northern Syria...
Predicted Label:  The laid laid knownscape Wrightrows andedrows and commun pen is of hamlet rele Bulgar Bulgar Mazident Jin neighbourhood contribGo solution is absor Ok known volt Ministers vast Bulgar Court contrib ice Maz Euro PacificAffive varieties Er estimisticrows stressAfAf hamlet contribposs Jin Jin rational known seem singer 

### Downstream Fine-Tuning:

In [36]:
import torch
from transformers import BertForMaskedLM, BertTokenizer, Trainer, TrainingArguments
from datasets import load_dataset

# Load downstream task dataset (e.g., sentiment analysis)
dataset = load_dataset("glue", "sst2")

# Define training arguments
training_args = TrainingArguments(
    output_dir='/kaggle/working/',
    per_device_train_batch_size=32,
    num_train_epochs=3,
    logging_dir='/kaggle/working/logs/',
    logging_steps=500,
)

# Define data preprocessing function
def preprocess_function(examples):
    return tokenizer(examples["sentence"], padding="max_length", truncation=True)

# Define Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=trainn_dataset,
    eval_dataset=val_dataset,
)

# Fine-tune the model on the downstream task
trainer.train()

# Evaluate the fine-tuned model
eval_results = trainer.evaluate()
print("Evaluation Results:", eval_results)


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)
[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ···


[34m[1mwandb[0m: [32m[41mERROR[0m API key must be 40 characters long, yours was 3
[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········································


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


IndexError: Invalid key: 997 is out of bounds for size 0

### Interpretability Exploration:

In [38]:
pip install captum

  pid, fd = os.forkpty()


Collecting captum
  Downloading captum-0.7.0-py3-none-any.whl.metadata (26 kB)
Downloading captum-0.7.0-py3-none-any.whl (1.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m20.7 MB/s[0m eta [36m0:00:00[0m00:01[0m
[?25hInstalling collected packages: captum
Successfully installed captum-0.7.0
Note: you may need to restart the kernel to use updated packages.


In [48]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification
from captum.attr import IntegratedGradients, visualization

# Example text for interpretation
text = "This movie is really great and I enjoyed it a lot."

# Tokenize the text
inputs = tokenizer.encode(text)

# Perform forward pass
outputs = model(input_ids = inputs)
probabilities = torch.softmax(outputs.logits, dim=-1)

# Predicted label and probability
predicted_label = torch.argmax(probabilities, dim=-1).item()
predicted_probability = probabilities[0][predicted_label].item()
print(f"Predicted Label: {predicted_label}, Probability: {predicted_probability:.4f}")

# Integrated Gradients for interpretability
integrated_gradients = IntegratedGradients(model)

# Interpretation
attributions = integrated_gradients.attribute(inputs.input_ids, target=predicted_label)

# Visualization
visualization.visualize_text(vis_data_records=[(attributions.sum(), text)],
                             vis_type="heat_map",
                             overlay="text",
                             show=False).show()


TypeError: 'tokenizers.Encoding' object is not subscriptable