In [3]:
import torch
import torch.nn as nn
from transformers import BertTokenizer, BertForQuestionAnswering
from datasets import load_dataset
from torch.utils.data import DataLoader
from transformers import DataCollatorWithPadding
from transformers import AutoTokenizer

from torch.utils.tensorboard import SummaryWriter
from torch.profiler import profile, record_function, ProfilerActivity

import datetime

# TensorBoard SummaryWriter
writer = SummaryWriter("runs/bert_qa_experiment")

# Load the BERT model
model_name = "bert-base-uncased"
model = BertForQuestionAnswering.from_pretrained(model_name)

# Load tokenizer to prepare SQUAD data into BERT understandable input
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)

# Define a custom BERT model wrapper class
class BertQA(nn.Module):
    def __init__(self):
        super(BertQA, self).__init__()
        self.bert_qa = BertForQuestionAnswering.from_pretrained(model_name)
    
    def forward(self, input_ids, attention_mask, start_positions=None, end_positions=None):
        outputs = self.bert_qa(
            input_ids=input_ids,
            attention_mask=attention_mask,
            start_positions=start_positions,
            end_positions=end_positions
        )
        return outputs

# Initialize the model
model = BertQA()

# Load SQUAD dataset
dataset = load_dataset("squad")

# Data preprocessing function
def preprocess_data(examples):
    questions = [q.strip() for q in examples["question"]]
    contexts = [c.strip() for c in examples["context"]]

    # Tokenize data for BERT
    tokenized_examples = tokenizer(
        questions,
        contexts,
        truncation="only_second",
        max_length=384,
        stride=128,
        return_overflowing_tokens=False,
        return_offsets_mapping=True,
        padding="max_length"
    )
    
    # Map answer positions to tokenized positions
    offset_mapping = tokenized_examples.pop("offset_mapping")
    tokenized_examples["start_positions"] = []
    tokenized_examples["end_positions"] = []

    for i, offsets in enumerate(offset_mapping):
        if len(examples["answers"]) > i:
            answer = examples["answers"][i].get("text", [])
            if answer:
                answer = answer[0]
                start_char = examples["answers"][i].get("answer_start", [0])[0]
                end_char = start_char + len(answer)
                start_token = None
                end_token = None

                for j, (start, end) in enumerate(offsets):
                    if start_char >= start and start_char < end:
                        start_token = j
                    if end_char > start and end_char <= end:
                        end_token = j
                        break

                if start_token is not None and end_token is not None:
                    tokenized_examples["start_positions"].append(start_token)
                    tokenized_examples["end_positions"].append(end_token)
                else:
                    tokenized_examples["start_positions"].append(0)
                    tokenized_examples["end_positions"].append(0)
            else:
                tokenized_examples["start_positions"].append(0)
                tokenized_examples["end_positions"].append(0)
        else:
            tokenized_examples["start_positions"].append(0)
            tokenized_examples["end_positions"].append(0)

    return tokenized_examples

# Preprocess the train set
train_dataset = dataset["train"].map(preprocess_data, batched=True, remove_columns=dataset["train"].column_names)

# Set up DataLoader
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True, collate_fn=data_collator)

# Training parameters
epochs = 1
learning_rate = 3e-5
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

# Set device to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

print("Start:", datetime.datetime.now())

# PyTorch Profiler configuration
profiler = profile(
    activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],
    schedule=torch.profiler.schedule(
        wait=1,
        warmup=1,
        active=3,
        repeat=1
    ),
    on_trace_ready=torch.profiler.tensorboard_trace_handler("runs/bert_qa_profiler"),
    record_shapes=True,
    with_stack=True
)

# Training loop
with profiler:  # Wrap the training loop with profiler
    for epoch in range(epochs):
        model.train()
        total_loss = 0
        i = 0
        running_loss = 0

        for batch in train_loader:
            with record_function("batch_training"):  # Profiling label
                # Move batch to device
                input_ids = batch["input_ids"].to(device)
                attention_mask = batch["attention_mask"].to(device)
                start_positions = batch["start_positions"].to(device)
                end_positions = batch["end_positions"].to(device)
                
                # Forward pass
                outputs = model(input_ids, attention_mask=attention_mask, start_positions=start_positions, end_positions=end_positions)
                loss = outputs.loss

                # Backward pass
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

                # Log loss to TensorBoard
                writer.add_scalar("Training Loss", loss.item(), epoch * len(train_loader) + i)

                # Update profiler
                profiler.step()

                # Track the loss
                total_loss += loss.item()
                running_loss += loss.item()
                i += 1

                if i % 100 == 99:  # Print loss every 100 mini-batches
                    print('[%d, %5d] running_loss: %.3f' %
                          (epoch + 1, i + 1, running_loss / 100))
                    running_loss = 0

        # Average loss for the epoch
        avg_loss = total_loss / len(train_loader)
        print(f"Epoch {epoch+1}/{epochs}, Loss: {avg_loss}")
        writer.add_scalar("Average Epoch Loss", avg_loss, epoch)

# Close the TensorBoard writer
writer.close()
print("End:", datetime.datetime.now())


Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/87599 [00:00<?, ? examples/s]

Start: 2024-12-17 12:06:45.571104
[1,   100] running_loss: 4.426
[1,   200] running_loss: 3.591
[1,   300] running_loss: 2.869
[1,   400] running_loss: 2.600
[1,   500] running_loss: 2.364
[1,   600] running_loss: 2.259
[1,   700] running_loss: 2.177
[1,   800] running_loss: 2.048
[1,   900] running_loss: 1.985
[1,  1000] running_loss: 2.090
[1,  1100] running_loss: 1.783
[1,  1200] running_loss: 1.915
[1,  1300] running_loss: 1.850
[1,  1400] running_loss: 1.862
[1,  1500] running_loss: 1.769
[1,  1600] running_loss: 1.653
[1,  1700] running_loss: 1.800
[1,  1800] running_loss: 1.717
[1,  1900] running_loss: 1.648
[1,  2000] running_loss: 1.672
[1,  2100] running_loss: 1.649
[1,  2200] running_loss: 1.693
[1,  2300] running_loss: 1.757
[1,  2400] running_loss: 1.581
[1,  2500] running_loss: 1.581
[1,  2600] running_loss: 1.576
[1,  2700] running_loss: 1.688
[1,  2800] running_loss: 1.698
[1,  2900] running_loss: 1.631
[1,  3000] running_loss: 1.552
[1,  3100] running_loss: 1.528
[1,  