# HW4: Fine-tuning BERT for entity labeling
This notebook contains starter code for finetuning a BERT-style model for the task of entity recognition. It has minimal text so you can easily copy it to **handin.py** when you submit.  Please read all the comments in the code as they contain important information.

In [1]:
!nvidia-smi

Thu Dec 12 08:43:01 2024       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 555.58.02              Driver Version: 556.12         CUDA Version: 12.5     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |


In [None]:
!pip install torch --index-url https://download.pytorch.org/whl/cu124
!pip install torchinfo seaborn numpy pandas transformers matplotlib tqdm

In [3]:
# This code block just contains standard setup code for running in Python
import time

# PyTorch imports
import torch
from torch import nn
from torch.utils.data import DataLoader, Subset #random_split
import numpy as np
from tqdm import tqdm

import warnings
warnings.filterwarnings('ignore')

# Fix the random seed(s) for reproducability
torch.random.manual_seed(8942764)
torch.cuda.manual_seed(8942764)
np.random.seed(8942764)

# Please set your device by uncommenting the right version below

# On Colab or on a machine with access to an Nvidia GPU use the following setting
# device = 'cuda:1'
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# if you have an Apple Silicon machine with a GPU, use the following setting
# this should about 3-4 times faster that running it on just CPU
# device = 'mps'

# If you will use a cpu, this is the setting
# device = 'cpu'

# Note that in handin.py these next two lines will need to be removed
# if you are going run this on your personal machine you will need to install
# these locally in the shell/terminal.

# !pip install protobuf==3.20.2
!pip install transformers
!pip install datasets
!pip install evaluate
!pip install seqeval

from transformers import AutoTokenizer, BertModel, DataCollatorForTokenClassification

import evaluate



**Confirm if GPU is available...**

In [4]:
print("Is CUDA available: ", torch.cuda.is_available())
print("Number of GPUs: ", torch.cuda.device_count())

Is CUDA available:  True
Number of GPUs:  1


In [5]:
# Load the dataset
from datasets import ClassLabel, Sequence, load_dataset

# Load the dataset from JSON files for train, dev, and test splits
data_splits = load_dataset('json', data_files={'train': 'dinos_and_deities_train_bio.jsonl', 'dev': 'dinos_and_deities_dev_bio_sm.jsonl', 'test': 'dinos_and_deities_test_bio_nolabels.jsonl'})

# Define the file name containing the label names
label_names_fname = "dinos_and_deities_train_bio.jsonl.labels"

# Initialize a list to store the label names
labels_int2str = []

# Read the label names from the file and split them into a list
with open(label_names_fname) as f:
    labels_int2str = f.read().split()

# Print the label names
print(f"Labels: {labels_int2str}")

# Create a dictionary to map label names to their corresponding integer indices
labels_str2int = {l: i for i, l in enumerate(labels_int2str)}

# Cast the "ner_tags" column to a sequence of ClassLabel with the defined label names
data_splits.cast_column("ner_tags", Sequence(ClassLabel(names=labels_int2str)))

# Print the dataset splits to verify the changes
print(data_splits)

Labels: ['I-Aquatic_animal', 'B-Deity', 'B-Mythological_king', 'I-Mythological_king', 'I-Cretaceous_dinosaur', 'B-Aquatic_animal', 'B-Aquatic_mammal', 'I-Goddess', 'I-Deity', 'B-Cretaceous_dinosaur', 'I-Aquatic_mammal', 'B-Goddess', 'O']
DatasetDict({
    train: Dataset({
        features: ['para_index', 'title', 'doc_id', 'content', 'page_id', 'id', 'tokens', 'ner_strings', 'ner_tags'],
        num_rows: 1749
    })
    dev: Dataset({
        features: ['para_index', 'title', 'doc_id', 'content', 'page_id', 'id', 'tokens', 'ner_strings', 'ner_tags'],
        num_rows: 150
    })
    test: Dataset({
        features: ['para_index', 'title', 'doc_id', 'content', 'page_id', 'id', 'tokens', 'ner_strings', 'ner_tags'],
        num_rows: 303
    })
})


In [6]:
# initialize pretrained BERT tokenizer. This might take a while the first time it's run because the model needs to be downloaded.
# Note: if you change the BERT model later, don't forget to also change this!!
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

In [7]:
# If you want you can look at some sample data items
print(data_splits["train"][8])
print(data_splits["dev"][5])

{'para_index': 0, 'title': 'Myersiohyla liliae', 'doc_id': 'Myersiohyla liliae-0', 'content': 'Myersiohyla liliae is a species of frogs in the family Hylidae. It is endemic to the Pacaraima Mountains in Guyana and known from the region of its type locality in the Kaieteur National Park and from Imbaimadai. The species is dedicated to the daughter of its describer, Lili Kok.', 'page_id': '28259031', 'id': 'Ud-DXIcB1INCf0UyAseC', 'tokens': ['Myersiohyla', 'liliae', 'is', 'a', 'species', 'of', 'frogs', 'in', 'the', 'family', 'Hylidae.', 'It', 'is', 'endemic', 'to', 'the', 'Pacaraima', 'Mountains', 'in', 'Guyana', 'and', 'known', 'from', 'the', 'region', 'of', 'its', 'type', 'locality', 'in', 'the', 'Kaieteur', 'National', 'Park', 'and', 'from', 'Imbaimadai.', 'The', 'species', 'is', 'dedicated', 'to', 'the', 'daughter', 'of', 'its', 'describer,', 'Lili', 'Kok.'], 'ner_strings': ['B-Aquatic_animal', 'I-Aquatic_animal', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', '

In [8]:
# This dataset is split into a train, validation and test set, and each token has a label.
# Data from the dataset can generally be accessed like a Python dict.
print(data_splits['train'].features)

# Print the original sentence (which is whitespace tokenized).
example_input_tokens = data_splits['train'][8]['tokens']
print(f"Original tokens: {example_input_tokens}")

# Print the labels of the sentence.
example_ner_labels = data_splits['train'][8]['ner_tags']
print(f"NER labels: {example_ner_labels}")

# Map integer to string labels for the sentence
example_mapped_labels = [labels_int2str[l] for l in example_ner_labels]
print(f'Labels: {example_mapped_labels}')

# Print the sentence split into tokens.
example_tokenized = tokenizer(example_input_tokens, is_split_into_words=True)
print('BERT Tokenized: ', example_tokenized.tokens())

# Print the number of tokens in the vocabulary
print(f'Vocab size: {tokenizer.vocab_size}')

# # Print the sentence mapped to token ids.
print('Token IDs: ', tokenizer.convert_tokens_to_ids(example_tokenized.tokens()))

# Of course, there are now way more tokens than labels! Fortunately the HF tokenizer
# provides a function that will give us the mapping:
print(example_tokenized.word_ids())

{'para_index': Value(dtype='int64', id=None), 'title': Value(dtype='string', id=None), 'doc_id': Value(dtype='string', id=None), 'content': Value(dtype='string', id=None), 'page_id': Value(dtype='string', id=None), 'id': Value(dtype='string', id=None), 'tokens': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None), 'ner_strings': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None), 'ner_tags': Sequence(feature=Value(dtype='int64', id=None), length=-1, id=None)}
Original tokens: ['Myersiohyla', 'liliae', 'is', 'a', 'species', 'of', 'frogs', 'in', 'the', 'family', 'Hylidae.', 'It', 'is', 'endemic', 'to', 'the', 'Pacaraima', 'Mountains', 'in', 'Guyana', 'and', 'known', 'from', 'the', 'region', 'of', 'its', 'type', 'locality', 'in', 'the', 'Kaieteur', 'National', 'Park', 'and', 'from', 'Imbaimadai.', 'The', 'species', 'is', 'dedicated', 'to', 'the', 'daughter', 'of', 'its', 'describer,', 'Lili', 'Kok.']
NER labels: [5, 0, 12, 12, 12, 12, 12, 12, 12, 12, 12,

In [9]:
# We can write a function that uses that along with the original labels to get the new set of labels
# for each BERT-tokenized token.
# def labels_tokens_alignment(labels, word_ids):
#     new_labels = []
#     current_word = None
#     for word_id in word_ids:
#         if word_id != current_word:
#             # Start of a new word!
#             current_word = word_id
#             label = -100 if word_id is None else labels[word_id]
#             new_labels.append(label)
#         elif word_id is None:
#             # Special token
#             new_labels.append(-100)
#         else:
#             # Same word as previous token
#             label = labels[word_id]
#             str_label = labels_int2str[label]
#             if str_label[0] == 'B':
#                 new_str_label = 'I' + str_label[1:]
#                 label = labels_str2int[new_str_label]
#             new_labels.append(label)

#     return new_labels


def labels_tokens_alignment(labels, word_ids):
    new_labels = []  # Initialize a list to store the new labels
    current_word = None  # Variable to keep track of the current word ID
    for word_id in word_ids:  # Iterate over each word ID in the word_ids list
        if word_id != current_word:  # Check if the word ID has changed
            current_word = word_id  # Update the current word ID
            # Append -100 if the word ID is None, otherwise append the corresponding label
            new_labels.append(-100 if word_id is None else labels[word_id])
        else:  # If the word ID is the same as the previous one
            # Append -100 if the word ID is None, otherwise check if the label starts with 'B'
            # If it does, change 'B' to 'I' and append the corresponding label, otherwise append the original label
            new_labels.append(-100 if word_id is None else labels_str2int['I' + labels_int2str[labels[word_id]][1:]] if labels_int2str[labels[word_id]][0] == 'B' else labels[word_id])
    return new_labels  # Return the list of new labels

In [10]:
tokenizer_aligned_labels = labels_tokens_alignment(example_ner_labels, example_tokenized.word_ids())
print(f'Aligned labels: {tokenizer_aligned_labels}')
print(f'Mapped aligned labels: {[labels_int2str[l] if l >= 0 else "_" for l in tokenizer_aligned_labels]}')

Aligned labels: [-100, 5, 0, 0, 0, 0, 0, 0, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, -100]
Mapped aligned labels: ['_', 'B-Aquatic_animal', 'I-Aquatic_animal', 'I-Aquatic_animal', 'I-Aquatic_animal', 'I-Aquatic_animal', 'I-Aquatic_animal', 'I-Aquatic_animal', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', '_']


In [11]:
# Let's check the function on the example from before. The special tokens don't have labels,
# so we'll just replace those with _
aligned_labels = labels_tokens_alignment(example_ner_labels, example_tokenized.word_ids())
print(f"Tokens: {example_tokenized.tokens()}")
print(f"Aligned labels: {[labels_int2str[l] if l >= 0 else '_' for l in aligned_labels]}")

Tokens: ['[CLS]', 'Myers', '##io', '##hyl', '##a', 'l', '##ilia', '##e', 'is', 'a', 'species', 'of', 'frogs', 'in', 'the', 'family', 'H', '##yl', '##idae', '.', 'It', 'is', 'endemic', 'to', 'the', 'Pac', '##ara', '##ima', 'Mountains', 'in', 'Guyana', 'and', 'known', 'from', 'the', 'region', 'of', 'its', 'type', 'locality', 'in', 'the', 'Kai', '##ete', '##ur', 'National', 'Park', 'and', 'from', 'I', '##mba', '##ima', '##dai', '.', 'The', 'species', 'is', 'dedicated', 'to', 'the', 'daughter', 'of', 'its', 'describe', '##r', ',', 'Lil', '##i', 'Ko', '##k', '.', '[SEP]']
Aligned labels: ['_', 'B-Aquatic_animal', 'I-Aquatic_animal', 'I-Aquatic_animal', 'I-Aquatic_animal', 'I-Aquatic_animal', 'I-Aquatic_animal', 'I-Aquatic_animal', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O',

In [12]:
# Need to get the whole dataset into this format, so need to write a fn
# we can apply efficiently across all examples using Dataset.map.
def tokenize_and_align_labels(examples):
    # Tokenize the input tokens with truncation and word splitting
    tokenized_inputs = tokenizer(
        examples["tokens"], truncation=True, is_split_into_words=True
    )
    all_labels = examples["ner_tags"]  # Extract the NER tags from the examples
    new_labels = []  # Initialize a list to store the new labels for all examples
    for i, labels in enumerate(all_labels):  # Iterate over each set of labels
        word_ids = tokenized_inputs.word_ids(i)  # Get the word IDs for the current example
        # Align the labels with the tokens and append the result to new_labels
        new_labels.append(labels_tokens_alignment(labels, word_ids))

    tokenized_inputs["labels"] = new_labels  # Add the new labels to the tokenized inputs
    return tokenized_inputs  # Return the tokenized inputs with the new labels

In [13]:
# Now we can apply that fn to tokenize all the data
tokenized_data_splits = data_splits.map(
    tokenize_and_align_labels,
    batched=True,
    remove_columns=data_splits["train"].column_names,
)

Map:   0%|          | 0/1749 [00:00<?, ? examples/s]

Map:   0%|          | 0/150 [00:00<?, ? examples/s]

Map:   0%|          | 0/303 [00:00<?, ? examples/s]

In [14]:
# Testing batcher
print("Examples:")
for i in range(2):
    print(tokenized_data_splits["train"][i]["labels"])

data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)
batch = data_collator([tokenized_data_splits["train"][i] for i in range(2)])

Examples:
[-100, 9, 4, 4, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 9, 4, 4, 4, 4, 4, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, -100]
[-100, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,

In [15]:
# Evaluation: we can use the seqeval library to handle calculating span-level precision, recall and F1
metric = evaluate.load("seqeval")

labels = data_splits["train"][0]["ner_tags"]
print("Labels before:", labels)
labels = [labels_int2str[i] for i in labels]
print("Labels after:", labels)

# Make a small change and see how it impacts the score
predictions = labels.copy()
predictions[0] = "O"
metric.compute(predictions=[predictions], references=[labels])

Labels before: [9, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 9, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12]
Labels after: ['B-Cretaceous_dinosaur', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-Cretaceous_dinosaur', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', '

{'Cretaceous_dinosaur': {'precision': 1.0,
  'recall': 0.5,
  'f1': 0.6666666666666666,
  'number': 2},
 'overall_precision': 1.0,
 'overall_recall': 0.5,
 'overall_f1': 0.6666666666666666,
 'overall_accuracy': 0.9904761904761905}

In [None]:
# This code trains the model and evaluates it on test data. It should print
# progress messages during training indicating loss, accuracy and training speed.
# You will likely need to make changes to this code for it to work for token classification.
# 
# TODO: change this
def train(model,
          train_dataset,
          val_dataset,
          num_epochs,
          batch_size,
          optimizer_cls,
          lr,
          weight_decay,
          device,
          collate_fn=None,
          log_every=100):
  # Set the model to training mode and move it to the specified device
  model = model.train().to(device)
  # Create a DataLoader for the training dataset
  dataloader = DataLoader(train_dataset, batch_size, shuffle=True, collate_fn=collate_fn)

  # Initialize the optimizer based on the specified optimizer class
  if optimizer_cls == 'SGD':
    optimizer = torch.optim.SGD(model.parameters(), lr=lr, weight_decay=weight_decay)
  elif optimizer_cls == 'Adam':
    optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)
  elif optimizer_cls == 'AdamW':
    optimizer = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=weight_decay)

  # Initialize lists to store training and validation metrics
  train_loss_history = []
  train_acc_history = []
  val_loss_history = []
  val_acc_history = []

  # Define the loss function
  lossfn = nn.NLLLoss()
  for e in range(num_epochs):  # Loop over each epoch
    model.train(True)  # Set the model to training mode
    epoch_loss_history = []
    epoch_acc_history = []
    start_time = time.time()
    for i, batch in enumerate(tqdm(dataloader, desc="Training batches")):  # Loop over each batch
      # Move the batch to the specified device
      batch = {k:v.to(device) for k,v in batch.items() if isinstance(v, torch.Tensor)}
      y = batch.pop('labels')  # Extract the labels from the batch
      
      logits = model(**batch)  # Forward pass
        
      # Apply log-softmax to logits before passing to NLLLoss
      log_probs = torch.log_softmax(logits, dim=-1)
      loss = lossfn(log_probs.view(-1, log_probs.size(-1)), y.view(-1))  # Compute the loss
      
      pred = logits.argmax(dim=-1)  # Get the predictions
      acc = (pred == y).float().mean()  # Compute the accuracy

      epoch_loss_history.append(loss.item())  # Append the loss to the epoch history
      epoch_acc_history.append(acc.item())  # Append the accuracy to the epoch history

      if (i % log_every == 0):  # Log the training progress every 'log_every' iterations
        speed = 0 if i == 0 else log_every/(time.time()-start_time)
        print(f'epoch: {e}\t iter: {i}\t train_loss: {np.mean(epoch_loss_history):.3e}\t train_acc:{np.mean(epoch_acc_history):.3f}\t speed:{speed:.3f} b/s')
        start_time = time.time()
      loss.backward()  # Backward pass
      optimizer.step()  # Update the model parameters
      optimizer.zero_grad()  # Zero the gradients
      
    # Evaluate the model on the validation dataset
    val_loss, val_metrics, predictions = run_eval(model, val_dataset, batch_size, device, collate_fn=collate_fn)

    val_acc = val_metrics['overall_accuracy']
    val_p = val_metrics['overall_precision']
    val_r = val_metrics['overall_recall']
    val_f1 = val_metrics['overall_f1']

    # Append the metrics to the history lists
    train_loss_history.append(np.mean(epoch_loss_history))
    train_acc_history.append(np.mean(epoch_acc_history))
    val_loss_history.append(val_loss.item())
    val_acc_history.append(val_acc)
    print(f'epoch: {e}\t train_loss: {train_loss_history[-1]:.3e}\t train_accuracy:{train_acc_history[-1]:.3f}\t val_loss: {val_loss_history[-1]:.3e}\t val_acc:{val_acc_history[-1]:.3f}\t val_p:{val_p:.3f}\t val_r:{val_r:.3f}\t val_f1:{val_f1:.3f}')

  # Return the trained model and the training/validation metrics
  return model, (train_loss_history, train_acc_history, val_loss_history, val_acc_history)

In [19]:
# Clear the un-used memory
torch.cuda.empty_cache()

In [None]:
# This code defines the token classification class using BERT.
# The classifier is defined on top of the final layer of BERT.
# The classifier has 1 hidden layer with 128 hidden nodes though we have found that
# using a smaller number of hidden nodes does not make much difference,
# 
# TODO: implement this
class BertForTokenClassification(nn.Module):
  def __init__(self, bert_pretrained_config_name, num_classes, freeze_bert=False, dropout_prob=0.1):
    '''
    BERT with a classification MLP
    args:
    - bert_pretrained_config_name (str): model name from huggingface hub
    - num_classes (int): number of classes in the classification task
    - freeze_bert (bool): [default False] If true gradients are not computed for
                          BERT's parameters.
    - dropout_prob (float): [default 0.1] probability of dropping each activation.
    '''
    super().__init__()
    # Load the pre-trained BERT model from Huggingface hub
    self.bert = BertModel.from_pretrained(bert_pretrained_config_name)
    # Freeze BERT parameters if freeze_bert is True
    self.bert.requires_grad_(not freeze_bert)
    
    # Define a dropout layer
    self.dropout = nn.Dropout(dropout_prob)
    # Define a classifier with a linear layer
    self.classifier = nn.Sequential(
      nn.Linear(self.bert.config.hidden_size, num_classes)
      # nn.ReLU(),
      # nn.Dropout(dropout_prob),
      # nn.Linear(128, num_classes)
    )
    
  def forward(self, input_ids, attention_mask=None, token_type_ids=None, labels=None):
    # Pass inputs through BERT model
    outputs = self.bert(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
    # Get the last hidden state from BERT outputs
    sequence_output = outputs.last_hidden_state
    # Apply dropout to the sequence output
    sequence_output = self.dropout(sequence_output)
    # Pass the sequence output through the classifier to get logits
    logits = self.classifier(sequence_output)
    return logits  # Return the logits   

In [21]:
# Clear the un-used memory
torch.cuda.empty_cache()

In [22]:
# This is where fine-tuning of the classifier happens.
# Here we are training with batch size 32 for 5 epochs.

# At the end of each epoch, you also see validation loss and validation accuracy.
# Change the device as described above if you will not be using a GPU

# Set the random seed(s) for reproducability
torch.random.manual_seed(8942764)
torch.cuda.manual_seed(8942764)
np.random.seed(8942764)

# Make sure this is the same as you use for tokenization!
bert_model = 'bert-base-cased'

num_labels = len(labels_int2str)
print(f"Num labels: {num_labels}")

# conll hyperparams
# multiply your learning rate by k when using batch size of kN
lr = 4*2e-5 # 1e-3
weight_decay = 0.01
epochs = 5
batch_size = 32
dropout_prob = 0.2
freeze_bert = False

bert_cls = BertForTokenClassification(bert_model, num_labels, dropout_prob=dropout_prob, freeze_bert=freeze_bert)

print(f'Trainable parameters: {sum([p.numel() for p in bert_cls.parameters() if p.requires_grad])}\n')

# Flag for setting "debug" mode. Set debug to False for full training.
debug = False

# Sample a subset of the training data for faster iteration in debug mode
subset_size = 1000
subset_indices = torch.randperm(len(tokenized_data_splits['train']))[:subset_size]
train_subset = Subset(tokenized_data_splits['train'], subset_indices)

bert_cls, bert_cls_logs = train(bert_cls, tokenized_data_splits['train'] if not debug else train_subset, tokenized_data_splits['dev'],
                                num_epochs=epochs, batch_size=batch_size, optimizer_cls='AdamW',
                                lr=lr, weight_decay=weight_decay, device=device,
                                collate_fn=data_collator, log_every=10 if debug else 100)

# Final eval
final_loss, final_metrics, eval_pred = run_eval(bert_cls, tokenized_data_splits['dev'], batch_size=32, device=device, collate_fn=data_collator)
final_acc = final_metrics['overall_accuracy']
final_p = final_metrics['overall_precision']
final_r = final_metrics['overall_recall']
final_f1 = final_metrics['overall_f1']
print(f'\nFinal Loss: {final_loss:.3e}\t Final Accuracy: {final_acc:.3f}\t dev_p:{final_p:.3f}\t dev_r:{final_r:.3f}\t dev_f1:{final_f1:.3f}')

Num labels: 13
Trainable parameters: 108320269



Training batches:   0%|          | 0/55 [00:00<?, ?it/s]

epoch: 0	 iter: 0	 train_loss: 2.771e+00	 train_acc:0.011	 speed:0.000 b/s


Training batches: 100%|██████████| 55/55 [00:30<00:00,  1.80it/s]
Evaluation batches: 100%|██████████| 5/5 [00:01<00:00,  4.45it/s]


epoch: 0	 train_loss: 3.692e-01	 train_accuracy:0.346	 val_loss: 2.187e-01	 val_acc:0.942	 val_p:0.354	 val_r:0.070	 val_f1:0.117


Training batches:   2%|▏         | 1/55 [00:00<00:15,  3.43it/s]

epoch: 1	 iter: 0	 train_loss: 2.356e-01	 train_acc:0.288	 speed:0.000 b/s


Training batches: 100%|██████████| 55/55 [00:31<00:00,  1.77it/s]
Evaluation batches: 100%|██████████| 5/5 [00:01<00:00,  4.04it/s]


epoch: 1	 train_loss: 1.836e-01	 train_accuracy:0.340	 val_loss: 1.685e-01	 val_acc:0.944	 val_p:0.218	 val_r:0.287	 val_f1:0.248


Training batches:   2%|▏         | 1/55 [00:00<00:17,  3.12it/s]

epoch: 2	 iter: 0	 train_loss: 1.507e-01	 train_acc:0.233	 speed:0.000 b/s


Training batches: 100%|██████████| 55/55 [00:31<00:00,  1.76it/s]
Evaluation batches: 100%|██████████| 5/5 [00:01<00:00,  4.20it/s]


epoch: 2	 train_loss: 1.043e-01	 train_accuracy:0.345	 val_loss: 1.607e-01	 val_acc:0.954	 val_p:0.364	 val_r:0.338	 val_f1:0.351


Training batches:   2%|▏         | 1/55 [00:00<00:08,  6.03it/s]

epoch: 3	 iter: 0	 train_loss: 6.712e-02	 train_acc:0.425	 speed:0.000 b/s


Training batches: 100%|██████████| 55/55 [00:30<00:00,  1.79it/s]
Evaluation batches: 100%|██████████| 5/5 [00:01<00:00,  3.87it/s]


epoch: 3	 train_loss: 6.217e-02	 train_accuracy:0.355	 val_loss: 1.573e-01	 val_acc:0.954	 val_p:0.403	 val_r:0.518	 val_f1:0.453


Training batches:   2%|▏         | 1/55 [00:00<00:14,  3.76it/s]

epoch: 4	 iter: 0	 train_loss: 3.656e-02	 train_acc:0.340	 speed:0.000 b/s


Training batches: 100%|██████████| 55/55 [00:31<00:00,  1.76it/s]
Evaluation batches: 100%|██████████| 5/5 [00:01<00:00,  4.51it/s]


epoch: 4	 train_loss: 3.401e-02	 train_accuracy:0.357	 val_loss: 1.997e-01	 val_acc:0.953	 val_p:0.428	 val_r:0.500	 val_f1:0.461


Evaluation batches: 100%|██████████| 5/5 [00:00<00:00,  5.03it/s]



Final Loss: 1.997e-01	 Final Accuracy: 0.953	 dev_p:0.428	 dev_r:0.500	 dev_f1:0.461


In [23]:
print("Prediction shape (evaluation)", len(eval_pred))

Prediction shape (evaluation) 150


In [24]:
final_loss, final_metrics, test_pred = run_eval(bert_cls, tokenized_data_splits['test'], batch_size=32, device=device, collate_fn=data_collator)

Evaluation batches: 100%|██████████| 10/10 [00:01<00:00,  5.17it/s]


In [None]:
import json

# Define the output file name for saving the mapped predictions
output_file = "test_predictions_bert.json"

# Open the output file in write mode
with open(output_file, "w") as f:
    # Save the test predictions to the JSON file with indentation for readability
    json.dump(test_pred, f, indent=4)

# Print a message indicating that the mapped aligned labels have been saved
print(f"Mapped aligned labels saved to {output_file}")

Mapped aligned labels saved to test_predictions_bert.json
