In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# Step One: Data Preparation

In [2]:
pip install datasets

Collecting datasets
  Downloading datasets-2.19.1-py3-none-any.whl (542 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.0/542.0 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m10.5 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m15.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m11.9 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub>=0.21.2 (from datasets)
  Downloading huggingface_hub-0.23.0-py3-none-a

## Loading the English Wikipedia dataset from Hugging Face datasets

In [3]:
from datasets import load_dataset

wiki_dataset = load_dataset("wikipedia", "20220301.en", trust_remote_code=True)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading builder script:   0%|          | 0.00/36.7k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/16.0k [00:00<?, ?B/s]

Downloading data:   0%|          | 0/41 [00:00<?, ?files/s]

Generating train split:   0%|          | 0/6458670 [00:00<?, ? examples/s]

Loading dataset shards:   0%|          | 0/41 [00:00<?, ?it/s]

#### Check Available Splits

In [4]:
print("Available splits:", wiki_dataset.keys())

Available splits: dict_keys(['train'])


#### Access the "train" split directly

In [5]:
train_dataset = wiki_dataset["train"]

# Print the dataset features (column names)
print("Features:", train_dataset.features)

# Print the number of examples in the dataset
print("Number of examples:", len(train_dataset))

Features: {'id': Value(dtype='string', id=None), 'url': Value(dtype='string', id=None), 'title': Value(dtype='string', id=None), 'text': Value(dtype='string', id=None)}
Number of examples: 6458670


In [6]:
type(train_dataset)

#### Print the first few examples in the dataset

In [7]:
for example_idx in range(5):
    example = train_dataset[example_idx]

    print({
        "id": example["id"],
        "url": example["url"],
        "title": example["title"],
        "text": " ".join(example["text"].split(".")[:2]) + "..."  # Extract first 1 to 2 sentences
    })
    print("\n\n")

{'id': '12', 'url': 'https://en.wikipedia.org/wiki/Anarchism', 'title': 'Anarchism', 'text': 'Anarchism is a political philosophy and movement that is sceptical of authority and rejects all involuntary, coercive forms of hierarchy  Anarchism calls for the abolition of the state, which it holds to be unnecessary, undesirable, and harmful...'}



{'id': '25', 'url': 'https://en.wikipedia.org/wiki/Autism', 'title': 'Autism', 'text': "Autism is a neurodevelopmental disorder characterized by difficulties with social interaction and communication, and by restricted and repetitive behavior  Parents often notice signs during the first three years of their child's life..."}



{'id': '39', 'url': 'https://en.wikipedia.org/wiki/Albedo', 'title': 'Albedo', 'text': 'Albedo (; ) is the measure of the diffuse reflection of solar radiation out of the total solar radiation and measured on a scale from 0, corresponding to a black body that absorbs all incident radiation, to 1, corresponding to a body t

## Train a new tokenizer specifically for the Wikipedia corpus.

In [8]:
pip install tokenizers



In [9]:
from tokenizers import Tokenizer, models, trainers, pre_tokenizers, decoders
from tqdm import tqdm

# Define tokenizer training parameters
vocab_size = 16000
max_token_length = 512
special_tokens = ["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"]

# Initialize tokenizer
tokenizer = Tokenizer(models.BPE())

# Customize tokenizer settings
tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel()
tokenizer.decoder = decoders.ByteLevel()
tokenizer.add_special_tokens(special_tokens)

# Train tokenizer
trainer = trainers.BpeTrainer(
    vocab_size=vocab_size,
    special_tokens=special_tokens,
    min_frequency=2,
    show_progress=True,  # Enable the trainer's built-in progress bar
    max_token_length=max_token_length
)

# Iterate over the first 50,000 examples in the dataset and extract text
texts = []
for i in range(50000):
    texts.append(train_dataset[i]["text"])


# Define a custom progress bar
with tqdm(total=len(texts), desc='Training tokenizer') as pbar:
    # Train tokenizer on the extracted texts
    tokenizer.train_from_iterator(texts, trainer=trainer)

    # Update tqdm progress bar
    pbar.update(len(texts))

# Save trained tokenizer
tokenizer.save("wikipedia_tokenizer.json")


Training tokenizer: 100%|██████████| 50000/50000 [02:14<00:00, 372.69it/s]


#### Tokenize a sample text

In [10]:
sample_text = "This is a sample text for tokenization."

encoded = tokenizer.encode(sample_text)

# Get the tokens
tokens = encoded.tokens

# Get the token IDs
ids = encoded.ids

# Print tokens and corresponding token IDs
for token, token_id in zip(tokens, ids):
    print(f"Token: {token}, Token ID: {token_id}")

# Visualize the tokenization
print("\nTokenization visualization:")
print(encoded.tokens)
print(encoded.ids)


Token: ĠThis, Token ID: 735
Token: Ġis, Token ID: 289
Token: Ġa, Token ID: 215
Token: Ġsample, Token ID: 8434
Token: Ġtext, Token ID: 2578
Token: Ġfor, Token ID: 291
Token: Ġto, Token ID: 248
Token: ken, Token ID: 4590
Token: ization, Token ID: 1266
Token: ., Token ID: 18

Tokenization visualization:
['ĠThis', 'Ġis', 'Ġa', 'Ġsample', 'Ġtext', 'Ġfor', 'Ġto', 'ken', 'ization', '.']
[735, 289, 215, 8434, 2578, 291, 248, 4590, 1266, 18]


In [11]:
vocabulary = tokenizer.get_vocab()

num_words = len(vocabulary)

print("Total number of words in the tokenizer:", num_words)

Total number of words in the tokenizer: 16000


When training a tokenizer with a specified vocabulary size (vocab_size).

For many tokenization algorithms like Byte Pair Encoding (BPE), once the vocabulary size reaches the specified limit, the tokenizer typically stops adding new tokens and the training process concludes.

## Define a data collator for MLM tasks

# Step 2: Define and Prepare the BERT Model

In [12]:
import random

class CustomDataCollatorForLanguageModeling:
    def __init__(self, tokenizer, mlm_probability=0.2):
        self.tokenizer = tokenizer
        self.mlm_probability = mlm_probability

    def __call__(self, examples):
        batch_inputs = []
        batch_labels = []

        for example in examples:
            encoded = self.tokenizer.encode(example['text'])
            inputs = encoded.ids
            labels = inputs[:]


            # Apply masking with MLM probability
            for i, token_id in enumerate(inputs):
                # Skip special tokens [CLS], [SEP], [PAD]
                if token_id in [self.tokenizer.token_to_id("[CLS]"),
                                self.tokenizer.token_to_id("[SEP]"),
                                self.tokenizer.token_to_id("[PAD]")]:
                    continue

                # Apply masking with MLM probability
                if random.random() < self.mlm_probability:
                    # 80% of the time, replace the token with [MASK]
                    if random.random() < 0.8:
                        inputs[i] = self.tokenizer.token_to_id("[MASK]")
                    # 10% of the time, keep the token unchanged
                    elif random.random() < 0.5:
                        pass  # Do nothing
                    # 10% of the time, replace the token with a random token
                    else:
                        inputs[i] = random.choice(range(self.tokenizer.get_vocab_size()))

                    # Record the original token in the labels for computing loss
                    labels[i] = token_id

            # Pad sequences to the maximum length (512)
            while len(inputs) < 512:
                inputs.append(self.tokenizer.token_to_id("[PAD]"))
                labels.append(self.tokenizer.token_to_id("[PAD]"))

            # Truncate sequences longer than 512
            inputs = inputs[:512]
            labels = labels[:512]

            batch_inputs.append(inputs)
            batch_labels.append(labels)

        return {
            "input_ids": batch_inputs,
            "labels": batch_labels
        }


In [13]:
from transformers import BertConfig, BertForMaskedLM

# Define the BERT configuration
config = BertConfig(
    vocab_size=vocab_size,  # Specify the vocabulary size
    num_hidden_layers=8,    # Number of transformer layers
    hidden_size=256,        # Size of the hidden layers
    num_attention_heads=4,  # Number of attention heads
    max_position_embeddings=max_token_length,  # Maximum input sequence length
)

# Initialize the BERT model with masked language modeling head
model = BertForMaskedLM(config=config)

batch_size = 32
learning_rate = 1e-4

# Print the model architecture
print(model)


BertForMaskedLM(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(16000, 256, padding_idx=0)
      (position_embeddings): Embedding(512, 256)
      (token_type_embeddings): Embedding(2, 256)
      (LayerNorm): LayerNorm((256,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-7): 8 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=256, out_features=256, bias=True)
              (key): Linear(in_features=256, out_features=256, bias=True)
              (value): Linear(in_features=256, out_features=256, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=256, out_features=256, bias=True)
              (LayerNorm): LayerNorm((256,), eps=1e-12, elementwise_aff

# Step 3: Overfitting on a Subset of The Data

In [14]:
import pandas as pd

# Initialize an empty list to store the data
data = []

# Iterate over the first 1000 examples in the dataset
for example_idx in range(100):
    example = train_dataset[example_idx]

    # Extract relevant information and append to the data list
    data.append({
        "id": example["id"],
        "url": example["url"],
        "title": example["title"],
        "text": example["text"], # Extract first 1 to 2 sentences
    })

# Create a DataFrame from the collected data
df = pd.DataFrame(data)

# Display the DataFrame
print(df.head())

    id                                      url      title  \
0   12  https://en.wikipedia.org/wiki/Anarchism  Anarchism   
1   25     https://en.wikipedia.org/wiki/Autism     Autism   
2   39     https://en.wikipedia.org/wiki/Albedo     Albedo   
3  290          https://en.wikipedia.org/wiki/A          A   
4  303    https://en.wikipedia.org/wiki/Alabama    Alabama   

                                                text  
0  Anarchism is a political philosophy and moveme...  
1  Autism is a neurodevelopmental disorder charac...  
2  Albedo (; ) is the measure of the diffuse refl...  
3  A, or a, is the first letter and the first vow...  
4  Alabama () is a state in the Southeastern regi...  


In [15]:
from datasets import Dataset
from torch.utils.data import DataLoader

subset_train_dataset = Dataset.from_pandas(df)

# Initialize the data collator
data_collator = CustomDataCollatorForLanguageModeling(tokenizer)

# Initialize the data loader
train_dataloader = DataLoader(
    subset_train_dataset,
    batch_size=batch_size,
    collate_fn=data_collator,
)


In [19]:
import torch
from torch.utils.data import DataLoader
from transformers import AdamW, get_linear_schedule_with_warmup

num_epochs=100

# Optimizer and scheduler
optimizer = AdamW(model.parameters(), lr=learning_rate)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=len(train_dataloader) * num_epochs)

# Training loop
for epoch in range(num_epochs):
    model.train()
    total_loss = 0.0
    for batch in train_dataloader:
        inputs = batch["input_ids"]
        labels = batch["labels"]

        optimizer.zero_grad()

        outputs = model(input_ids=torch.tensor(inputs), labels=torch.tensor(labels))
        loss = outputs.loss
        loss.backward()

        optimizer.step()
        scheduler.step()

        total_loss += loss.item()

    avg_loss = total_loss / len(train_dataloader)

    print(f"Epoch {epoch+1}/{num_epochs}: Avg. Loss: {avg_loss:.4f}")

Epoch 1/100: Avg. Loss: 6.9372
Epoch 2/100: Avg. Loss: 6.7697
Epoch 3/100: Avg. Loss: 6.5953
Epoch 4/100: Avg. Loss: 6.4227
Epoch 5/100: Avg. Loss: 6.2545
Epoch 6/100: Avg. Loss: 6.0860
Epoch 7/100: Avg. Loss: 5.9195
Epoch 8/100: Avg. Loss: 5.7574
Epoch 9/100: Avg. Loss: 5.5917
Epoch 10/100: Avg. Loss: 5.4248
Epoch 11/100: Avg. Loss: 5.2665
Epoch 12/100: Avg. Loss: 5.1162
Epoch 13/100: Avg. Loss: 4.9620
Epoch 14/100: Avg. Loss: 4.8002
Epoch 15/100: Avg. Loss: 4.6465
Epoch 16/100: Avg. Loss: 4.4999
Epoch 17/100: Avg. Loss: 4.3605
Epoch 18/100: Avg. Loss: 4.2082
Epoch 19/100: Avg. Loss: 4.0713
Epoch 20/100: Avg. Loss: 3.9330
Epoch 21/100: Avg. Loss: 3.7901
Epoch 22/100: Avg. Loss: 3.6547
Epoch 23/100: Avg. Loss: 3.5278
Epoch 24/100: Avg. Loss: 3.3995
Epoch 25/100: Avg. Loss: 3.2711
Epoch 26/100: Avg. Loss: 3.1487
Epoch 27/100: Avg. Loss: 3.0249
Epoch 28/100: Avg. Loss: 2.9047
Epoch 29/100: Avg. Loss: 2.7999
Epoch 30/100: Avg. Loss: 2.6842
Epoch 31/100: Avg. Loss: 2.5762
Epoch 32/100: Avg

#### Evaluation on the training subset

In [20]:
model.eval()
eval_loss = 0.0
for batch in train_dataloader:
    inputs = batch["input_ids"]
    labels = batch["labels"]

    with torch.no_grad():
        outputs = model(input_ids=torch.tensor(inputs), labels=torch.tensor(labels))
        eval_loss += outputs.loss.item()

avg_eval_loss = eval_loss / len(train_dataloader)
print(f"Avg. Evaluation Loss on Training Subset: {avg_eval_loss:.4f}")

Avg. Evaluation Loss on Training Subset: 0.8041


# Step 4: Training on the Entire Dataset

#### Train on the first 100000 samples

In [21]:
import pandas as pd

# Initialize an empty list to store the data
data = []

for example_idx in range(100000):
    example = train_dataset[example_idx]

    # Extract relevant information and append to the data list
    data.append({
        "id": example["id"],
        "url": example["url"],
        "title": example["title"],
        "text": example["text"]
    })

# Create a DataFrame from the collected data
df = pd.DataFrame(data)

# Display the DataFrame
print(df.head())

    id                                      url      title  \
0   12  https://en.wikipedia.org/wiki/Anarchism  Anarchism   
1   25     https://en.wikipedia.org/wiki/Autism     Autism   
2   39     https://en.wikipedia.org/wiki/Albedo     Albedo   
3  290          https://en.wikipedia.org/wiki/A          A   
4  303    https://en.wikipedia.org/wiki/Alabama    Alabama   

                                                text  
0  Anarchism is a political philosophy and moveme...  
1  Autism is a neurodevelopmental disorder charac...  
2  Albedo (; ) is the measure of the diffuse refl...  
3  A, or a, is the first letter and the first vow...  
4  Alabama () is a state in the Southeastern regi...  


In [22]:
from datasets import Dataset
from torch.utils.data import DataLoader

trainn_dataset = Dataset.from_pandas(df)

# Initialize the data collator
train_data_collator = CustomDataCollatorForLanguageModeling(tokenizer)

# Initialize the data loader
train_dataloader = DataLoader(
    trainn_dataset,
    batch_size=batch_size,
    collate_fn=train_data_collator,
)

#### Validation set containing 5000 samples

In [23]:
import pandas as pd

# Initialize an empty list to store the data
data = []

for example_idx in range(10000):
    example = train_dataset[100000 + example_idx]

    # Extract relevant information and append to the data list
    data.append({
        "id": example["id"],
        "url": example["url"],
        "title": example["title"],
        "text": example["text"]
    })

# Create a DataFrame from the collected data
df = pd.DataFrame(data)

# Display the DataFrame
print(df.head())

       id                                                url  \
0  105190  https://en.wikipedia.org/wiki/Double%20Springs...   
1  105191    https://en.wikipedia.org/wiki/Lynn%2C%20Alabama   
2  105192  https://en.wikipedia.org/wiki/Waterford%20%28d...   
3  105193  https://en.wikipedia.org/wiki/Natural%20Bridge...   
4  105195               https://en.wikipedia.org/wiki/Greene   

                        title  \
0     Double Springs, Alabama   
1               Lynn, Alabama   
2  Waterford (disambiguation)   
3     Natural Bridge, Alabama   
4                      Greene   

                                                text  
0  Double Springs is a town in Winston County, Al...  
1  Lynn is a town in Winston County, Alabama, Uni...  
2  Waterford is a city in County Waterford, Irela...  
3  Natural Bridge is a town at the southwest edge...  
4  Greene may refer to:\n\nPlaces\n\nUnited State...  


In [24]:
from datasets import Dataset

val_dataset = Dataset.from_pandas(df)

# Initialize the data collator
val_data_collator = CustomDataCollatorForLanguageModeling(tokenizer)

# Initialize the data loader
val_dataloader = DataLoader(
    val_dataset,
    batch_size=batch_size,
    collate_fn=val_data_collator,
)

#### Training Loop

In [26]:
import torch
from transformers import BertForMaskedLM, AdamW, get_linear_schedule_with_warmup

# Initialize the BERT model with the same configuration
model = BertForMaskedLM(config=config)

num_epochs = 1

# Optimizer and scheduler
optimizer = AdamW(model.parameters(), lr=learning_rate)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=len(train_dataloader) * num_epochs)

# Training loop
for epoch in range(num_epochs):
    model.train()
    total_loss = 0.0
    for batch in train_dataloader:
        inputs = batch["input_ids"]
        labels = batch["labels"]

        optimizer.zero_grad()

        outputs = model(input_ids=torch.tensor(inputs), labels=torch.tensor(labels))
        loss = outputs.loss
        loss.backward()

        optimizer.step()
        scheduler.step()

        total_loss += loss.item()

    avg_loss = total_loss / len(train_dataloader)

    print(f"Epoch {epoch+1}/{num_epochs}: Avg. Loss: {avg_loss:.4f}")

    # Validation
    model.eval()
    val_loss = 0.0
    for batch in val_dataloader:
        inputs = batch["input_ids"]
        labels = batch["labels"]

        with torch.no_grad():
            outputs = model(input_ids=torch.tensor(inputs), labels=torch.tensor(labels))
            val_loss += outputs.loss.item()

    avg_val_loss = val_loss / len(val_dataloader)
    print(f"Avg. Validation Loss: {avg_val_loss:.4f}")


Epoch 1/1: Avg. Loss: 1.1513
Avg. Validation Loss: 0.1948


#### Save The Trained Model

In [27]:
import os

# Define the directory where you want to save the trained model
output_dir = "bert_model_trained"

# Create the directory if it doesn't exist
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

# Save the trained model to the specified directory
model_save_path = os.path.join(output_dir, "bert_model.pth")
torch.save(model.state_dict(), model_save_path)

print("Trained model saved at:", model_save_path)


Trained model saved at: bert_model_trained/bert_model.pth


#### Test set containing 200 samples

In [28]:
import pandas as pd

# Initialize an empty list to store the data
data = []

for example_idx in range(200):
    example = train_dataset[52000 + example_idx]

    # Extract relevant information and append to the data list
    data.append({
        "id": example["id"],
        "url": example["url"],
        "title": example["title"],
        "text": example["text"]  # Extract first 1 to 2 sentences
    })

# Create a DataFrame from the collected data
df = pd.DataFrame(data)

# Display the DataFrame
print(df.head())

       id                                                url  \
0  107836  https://en.wikipedia.org/wiki/Stanton%2C%20Cal...   
1  107837  https://en.wikipedia.org/wiki/Tustin%2C%20Cali...   
2  107838  https://en.wikipedia.org/wiki/North%20Tustin%2...   
3  107839  https://en.wikipedia.org/wiki/Villa%20Park%2C%...   
4  107840  https://en.wikipedia.org/wiki/Westminster%2C%2...   

                      title                                               text  
0       Stanton, California  Stanton is a city in northern Orange County, C...  
1        Tustin, California  Tustin is a city located in Orange County, Cal...  
2  North Tustin, California  North Tustin is a census-designated place and ...  
3    Villa Park, California  Villa Park is a city in Orange County, Califor...  
4   Westminster, California  Westminster is a city in northern Orange Count...  


In [29]:
from datasets import Dataset

test_dataset = Dataset.from_pandas(df)

# Initialize the data collator
test_data_collator = CustomDataCollatorForLanguageModeling(tokenizer)

# Initialize the data loader
test_dataloader = DataLoader(
    test_dataset,
    batch_size=batch_size,
    collate_fn=test_data_collator,
)

#### Top-10 Token Accuracy

In [31]:
import torch

# Function to compute top-k token accuracy
def compute_top_k_accuracy(predictions, labels, k=10):
    correct_predictions = 0

    # Flatten the predictions tensor along the second dimension
    pred_flat = predictions.view(-1, predictions.size(-1))
    total_samples = len(pred_flat)

    # Get the top-k predicted tokens
    top_k_preds = torch.flip(pred_flat.argsort(), dims=[1])[:, :k]

    # Flatten the labels tensor
    labels_flat = labels.view(-1, 1)

    # Check if the true token is among the top-k predictions
    correct_predictions = torch.sum(torch.eq(top_k_preds, labels_flat)).item()

    accuracy = correct_predictions / total_samples
    return accuracy


# Evaluate the model on the test set with Top-10 Token Accuracy
model.eval()
test_loss = 0.0
top_k_predictions = []
all_labels = []

for batch in test_dataloader:
    inputs = batch["input_ids"]
    labels = batch["labels"]

    with torch.no_grad():
        outputs = model(input_ids=torch.tensor(inputs), labels=torch.tensor(labels))
        test_loss += outputs.loss.item()

        # Get the model's predictions
        predictions = outputs.logits

        # Append the predictions to the list
        top_k_predictions.append(predictions)

        # Append the labels to the list
        labels = torch.tensor(labels)
        all_labels.append(labels)

avg_test_loss = test_loss / len(test_dataloader)

# Concatenate the lists of predictions and labels
top_k_predictions = torch.cat(top_k_predictions, dim=0)
all_labels = torch.cat(all_labels, dim=0)

# Compute Top-10 Token Accuracy
top_10_accuracy = compute_top_k_accuracy(top_k_predictions, all_labels)
print(f"Avg. Test Loss: {avg_test_loss:.4f}")
print(f"Top-10 Token Accuracy: {top_10_accuracy:.4f}")

Avg. Test Loss: 0.2298
Top-10 Token Accuracy: 0.9867


# Step 5: Optimization Techniques

In [1]:
import torch
from transformers import BertForMaskedLM, AdamW, get_linear_schedule_with_warmup
from torch.cuda.amp import GradScaler

# Initialize the BERT model with the same configuration
model = BertForMaskedLM(config=config)

num_epochs = 3
learning_rate = 5e-5
warmup_steps = 5

# Optimizer and scheduler
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=warmup_steps, num_training_steps=len(train_dataloader) * num_epochs)

# Initialize mixed precision training
scaler = GradScaler()

# Training loop
for epoch in range(num_epochs):
    model.train()
    total_loss = 0.0
    for batch in train_dataloader:
        inputs = batch["input_ids"]
        labels = batch["labels"]

        optimizer.zero_grad()

        with torch.cuda.amp.autocast():  # Use torch.cuda.amp.autocast for mixed precision training
            outputs = model(input_ids=torch.tensor(inputs), labels=torch.tensor(labels))
            loss = outputs.loss

        scaler.scale(loss).backward()
        scaler.unscale_(optimizer)  # Unscales the gradients of optimizer's assigned params in-place
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)  # Clip gradients to prevent explosion
        scaler.step(optimizer)
        scaler.update()

        scheduler.step()

        total_loss += loss.item()

    avg_loss = total_loss / len(train_dataloader)
    print(f"Epoch {epoch+1}/{num_epochs}: Avg. Loss: {avg_loss:.4f}")

    # Validation
    model.eval()
    val_loss = 0.0
    for batch in val_dataloader:
        inputs = batch["input_ids"]
        labels = batch["labels"]

        with torch.no_grad():
            outputs = model(input_ids=torch.tensor(inputs), labels=torch.tensor(labels))
            val_loss += outputs.loss.item()

    avg_val_loss = val_loss / len(val_dataloader)
    print(f"Avg. Validation Loss: {avg_val_loss:.4f}")

# Save trained model
model.save_pretrained("bert_masked_lm_trained")


NameError: name 'config' is not defined

1. AdamW Optimizer: The code uses the AdamW optimizer to optimize the parameters of the BERT model. AdamW is a variant of the Adam optimizer that incorporates weight decay regularization to prevent overfitting.
2. Learning Rate Scheduler with Warmup: The code utilizes the get_linear_schedule_with_warmup function to create a linear learning rate scheduler with warmup steps. This scheduler gradually increases the learning rate during the warmup phase and then linearly decreases it during the training phase.
3. Mixed Precision Training: The code implements mixed precision training using PyTorch's autocast and GradScaler. Mixed precision training leverages hardware capabilities (such as Tensor Cores on GPUs) to speed up training and reduce memory usage by performing some operations in lower precision (e.g., half-precision floating-point format) while others remain in full precision.

# Step 6: Evaluation on Test Set

In [None]:
from sklearn.metrics import f1_score, accuracy_score
import numpy as np

# Evaluation Metrics
def evaluate(model, dataloader):
    model.eval()
    all_predictions = []
    all_labels = []

    with torch.no_grad():
        for batch in dataloader:
            inputs = batch["input_ids"]
            labels = batch["labels"]

            outputs = model(input_ids=torch.tensor(inputs))
            predictions = torch.argmax(outputs.logits, dim=-1)

            all_predictions.extend(predictions.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    # Calculate F1-score
    f1 = f1_score(all_labels, all_predictions, average='micro')

    return f1

# Example Showcase
def showcase_examples(model, tokenizer, dataloader, num_examples=5):
    model.eval()
    example_count = 0

    with torch.no_grad():
        for batch in dataloader:
            inputs = batch["input_ids"]
            labels = batch["labels"]
            texts = batch["text"]

            outputs = model(input_ids=torch.tensor(inputs))
            predictions = torch.argmax(outputs.logits, dim=-1)

            for text, label, prediction in zip(texts, labels, predictions):
                text = tokenizer.decode(text, skip_special_tokens=True)
                label = tokenizer.decode(label, skip_special_tokens=True)
                prediction = tokenizer.decode(prediction, skip_special_tokens=True)

                print(f"Text: {text}")
                print(f"True Label: {label}")
                print(f"Predicted Label: {prediction}")
                print()

                example_count += 1
                if example_count >= num_examples:
                    return


# Evaluate the trained model on the test set
f1 = evaluate(model, test_dataloader)
print(f"F1-score: {f1:.4f}")

# Showcase examples from the test set
showcase_examples(model, tokenizer, test_dataloader, num_examples=5)

In [None]:
def analyze_examples(model, tokenizer, dataloader):
    model.eval()
    misclassified_examples = []

    with torch.no_grad():
        for batch in dataloader:
            inputs = batch["input_ids"]
            labels = batch["labels"]
            texts = batch["text"]

            outputs = model(input_ids=torch.tensor(inputs))
            predictions = torch.argmax(outputs.logits, dim=-1)

            for text, label, prediction in zip(texts, labels, predictions):
                text = tokenizer.decode(text, skip_special_tokens=True)
                label = tokenizer.decode(label, skip_special_tokens=True)
                prediction = tokenizer.decode(prediction, skip_special_tokens=True)

                if label != prediction:
                    misclassified_examples.append((text, label, prediction))

    # Print misclassified examples
    print("Misclassified Examples:")
    for idx, (text, label, prediction) in enumerate(misclassified_examples, start=1):
        print(f"Example {idx}:")
        print(f"Text: {text}")
        print(f"True Label: {label}")
        print(f"Predicted Label: {prediction}")
        print()

# Analyze examples to understand model performance, strengths, weaknesses, and potential areas for improvement
analyze_examples(model, tokenizer, test_dataloader)


### Downstream Fine-Tuning:

In [None]:
import torch
from transformers import BertForMaskedLM, BertTokenizer, Trainer, TrainingArguments
from datasets import load_dataset

# Load downstream task dataset (e.g., sentiment analysis)
dataset = load_dataset("glue", "sst2")

# Define training arguments
training_args = TrainingArguments(
    per_device_train_batch_size=32,
    num_train_epochs=3,
    logging_dir='./logs',
    logging_steps=500,
)

# Define data preprocessing function
def preprocess_function(examples):
    return tokenizer(examples["sentence"], padding="max_length", truncation=True)

# Preprocess the dataset
train_dataset = dataset["train"].map(preprocess_function)
eval_dataset = dataset["validation"].map(preprocess_function)

# Define Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)

# Fine-tune the model on the downstream task
trainer.train()

# Evaluate the fine-tuned model
eval_results = trainer.evaluate()
print("Evaluation Results:", eval_results)


### Interpretability Exploration:

In [None]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification
from captum.attr import IntegratedGradients, visualization

# Example text for interpretation
text = "This movie is really great and I enjoyed it a lot."

# Tokenize the text
inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)

# Perform forward pass
outputs = model(**inputs)
probabilities = torch.softmax(outputs.logits, dim=-1)

# Predicted label and probability
predicted_label = torch.argmax(probabilities, dim=-1).item()
predicted_probability = probabilities[0][predicted_label].item()
print(f"Predicted Label: {predicted_label}, Probability: {predicted_probability:.4f}")

# Integrated Gradients for interpretability
integrated_gradients = IntegratedGradients(model)

# Interpretation
attributions = integrated_gradients.attribute(inputs.input_ids, target=predicted_label)

# Visualization
visualization.visualize_text(vis_data_records=[(attributions.sum(), text)],
                             vis_type="heat_map",
                             overlay="text",
                             show=False).show()
