# HW4: Fine-tuning BERT for entity labeling
This notebook contains starter code for finetuning a BERT-style model for the task of entity recognition. It has minimal text so you can easily copy it to **handin.py** when you submit.  Please read all the comments in the code as they contain important information.

In [1]:
# This code block just contains standard setup code for running in Python
import time

# PyTorch imports
import torch
from torch import nn
from torch.utils.data import DataLoader, Subset #random_split
import numpy as np

import csv
import os
# Fix the random seed(s) for reproducability
torch.random.manual_seed(8942764)
torch.cuda.manual_seed(8942764)
np.random.seed(8942764)

# Please set your device by uncommenting the right version below

# On Colab or on a machine with access to an Nvidia GPU use the following setting
#device = 'cuda:1'
device = 'cuda'
# if you have an Apple Silicon machine with a GPU, use the following setting
# this should about 3-4 times faster that running it on just CPU
# device = 'mps'

# If you will use a cpu, this is the setting
# device = 'cpu'

# Note that in handin.py these next two lines will need to be removed
# if you are going run this on your personal machine you will need to install
# these locally in the shell/terminal.

#!pip install protobuf==3.20.2
#!pip install transformers
#!pip install datasets
#!pip install evaluate
#!pip install seqeval

from transformers import AutoTokenizer, BertModel, DataCollatorForTokenClassification

import evaluate

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
# Load the dataset
from datasets import ClassLabel, Sequence, load_dataset

data_splits = load_dataset('json', data_files={'train': '/content/drive/MyDrive/HW4/dinos_and_deities_train_bio.jsonl', 'dev': '/content/drive/MyDrive/HW4/dinos_and_deities_dev_bio_sm.jsonl'})

label_names_fname = "/content/drive/MyDrive/HW4/dinos_and_deities_train_bio.jsonl.labels"
labels_int2str = []
with open(label_names_fname) as f:
    labels_int2str = f.read().split()
print(f"Labels: {labels_int2str}")
labels_str2int = {l: i for i, l in enumerate(labels_int2str)}

data_splits.cast_column("ner_tags", Sequence(ClassLabel(names=labels_int2str)))
print(data_splits)

Labels: ['I-Aquatic_animal', 'B-Deity', 'B-Mythological_king', 'I-Mythological_king', 'I-Cretaceous_dinosaur', 'B-Aquatic_animal', 'B-Aquatic_mammal', 'I-Goddess', 'I-Deity', 'B-Cretaceous_dinosaur', 'I-Aquatic_mammal', 'B-Goddess', 'O']
DatasetDict({
    train: Dataset({
        features: ['para_index', 'title', 'doc_id', 'content', 'page_id', 'id', 'tokens', 'ner_strings', 'ner_tags'],
        num_rows: 1749
    })
    dev: Dataset({
        features: ['para_index', 'title', 'doc_id', 'content', 'page_id', 'id', 'tokens', 'ner_strings', 'ner_tags'],
        num_rows: 150
    })
})


In [4]:
# initialize pretrained BERT tokenizer. This might take a while the first time it's run because the model needs to be downloaded.
# Note: if you change the BERT model later, don't forget to also change this!!
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [5]:
# If you want you can look at some sample data items
print(data_splits["train"][8])
print(data_splits["dev"][5])

{'para_index': 0, 'title': 'Myersiohyla liliae', 'doc_id': 'Myersiohyla liliae-0', 'content': 'Myersiohyla liliae is a species of frogs in the family Hylidae. It is endemic to the Pacaraima Mountains in Guyana and known from the region of its type locality in the Kaieteur National Park and from Imbaimadai. The species is dedicated to the daughter of its describer, Lili Kok.', 'page_id': '28259031', 'id': 'Ud-DXIcB1INCf0UyAseC', 'tokens': ['Myersiohyla', 'liliae', 'is', 'a', 'species', 'of', 'frogs', 'in', 'the', 'family', 'Hylidae.', 'It', 'is', 'endemic', 'to', 'the', 'Pacaraima', 'Mountains', 'in', 'Guyana', 'and', 'known', 'from', 'the', 'region', 'of', 'its', 'type', 'locality', 'in', 'the', 'Kaieteur', 'National', 'Park', 'and', 'from', 'Imbaimadai.', 'The', 'species', 'is', 'dedicated', 'to', 'the', 'daughter', 'of', 'its', 'describer,', 'Lili', 'Kok.'], 'ner_strings': ['B-Aquatic_animal', 'I-Aquatic_animal', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', '

### Data Preprocessing

In [6]:
# This dataset is split into a train, validation and test set, and each token has a label.
# Data from the dataset can generally be accessed like a Python dict.
print(data_splits['train'].features)

# Print the original sentence (which is whitespace tokenized).
example_input_tokens = data_splits['train'][8]['tokens']
print(f"Original tokens: {example_input_tokens}")

# Print the labels of the sentence.
example_ner_labels = data_splits['train'][8]['ner_tags']
print(f"NER labels: {example_ner_labels}")

# Map integer to string labels for the sentence
example_mapped_labels = [labels_int2str[l] for l in example_ner_labels]
print(f'Labels: {example_mapped_labels}')

# Print the sentence split into tokens.
example_tokenized = tokenizer(example_input_tokens, is_split_into_words=True)
print('BERT Tokenized: ', example_tokenized.tokens())

# Print the number of tokens in the vocabulary
print(f'Vocab size: {tokenizer.vocab_size}')

# # Print the sentence mapped to token ids.
print('Token IDs: ', tokenizer.convert_tokens_to_ids(example_tokenized.tokens()))

# Of course, there are now way more tokens than labels! Fortunately the HF tokenizer
# provides a function that will give us the mapping:
print(example_tokenized.word_ids())

{'para_index': Value(dtype='int64', id=None), 'title': Value(dtype='string', id=None), 'doc_id': Value(dtype='string', id=None), 'content': Value(dtype='string', id=None), 'page_id': Value(dtype='string', id=None), 'id': Value(dtype='string', id=None), 'tokens': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None), 'ner_strings': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None), 'ner_tags': Sequence(feature=Value(dtype='int64', id=None), length=-1, id=None)}
Original tokens: ['Myersiohyla', 'liliae', 'is', 'a', 'species', 'of', 'frogs', 'in', 'the', 'family', 'Hylidae.', 'It', 'is', 'endemic', 'to', 'the', 'Pacaraima', 'Mountains', 'in', 'Guyana', 'and', 'known', 'from', 'the', 'region', 'of', 'its', 'type', 'locality', 'in', 'the', 'Kaieteur', 'National', 'Park', 'and', 'from', 'Imbaimadai.', 'The', 'species', 'is', 'dedicated', 'to', 'the', 'daughter', 'of', 'its', 'describer,', 'Lili', 'Kok.']
NER labels: [5, 0, 12, 12, 12, 12, 12, 12, 12, 12, 12,

In [7]:
# We can write a function that uses that along with the original labels to get the new set of labels
# for each BERT-tokenized token.
def align_labels_with_tokens(labels, word_ids):
    new_labels = []
    current_word = None
    for word_id in word_ids:
        if word_id != current_word:
            # Start of a new word!
            current_word = word_id
            label = -100 if word_id is None else labels[word_id]
            new_labels.append(label)
        elif word_id is None:
            # Special token
            new_labels.append(-100)
        else:
            # Same word as previous token
            label = labels[word_id]
            str_label = labels_int2str[label]
            if str_label[0] == 'B':
                new_str_label = 'I' + str_label[1:]
                label = labels_str2int[new_str_label]
            new_labels.append(label)

    return new_labels

In [8]:
tokenizer_aligned_labels = align_labels_with_tokens(example_ner_labels, example_tokenized.word_ids())
print(f'Aligned labels: {tokenizer_aligned_labels}')
print(f'Mapped aligned labels: {[labels_int2str[l] if l >= 0 else "_" for l in tokenizer_aligned_labels]}')

Aligned labels: [-100, 5, 0, 0, 0, 0, 0, 0, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, -100]
Mapped aligned labels: ['_', 'B-Aquatic_animal', 'I-Aquatic_animal', 'I-Aquatic_animal', 'I-Aquatic_animal', 'I-Aquatic_animal', 'I-Aquatic_animal', 'I-Aquatic_animal', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', '_']


In [9]:
# Let's check the function on the example from before. The special tokens don't have labels,
# so we'll just replace those with _
aligned_labels = align_labels_with_tokens(example_ner_labels, example_tokenized.word_ids())
print(f"Tokens: {example_tokenized.tokens()}")
print(f"Aligned labels: {[labels_int2str[l] if l >= 0 else '_' for l in aligned_labels]}")

Tokens: ['[CLS]', 'Myers', '##io', '##hyl', '##a', 'l', '##ilia', '##e', 'is', 'a', 'species', 'of', 'frogs', 'in', 'the', 'family', 'H', '##yl', '##idae', '.', 'It', 'is', 'endemic', 'to', 'the', 'Pac', '##ara', '##ima', 'Mountains', 'in', 'Guyana', 'and', 'known', 'from', 'the', 'region', 'of', 'its', 'type', 'locality', 'in', 'the', 'Kai', '##ete', '##ur', 'National', 'Park', 'and', 'from', 'I', '##mba', '##ima', '##dai', '.', 'The', 'species', 'is', 'dedicated', 'to', 'the', 'daughter', 'of', 'its', 'describe', '##r', ',', 'Lil', '##i', 'Ko', '##k', '.', '[SEP]']
Aligned labels: ['_', 'B-Aquatic_animal', 'I-Aquatic_animal', 'I-Aquatic_animal', 'I-Aquatic_animal', 'I-Aquatic_animal', 'I-Aquatic_animal', 'I-Aquatic_animal', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O',

In [10]:
# Need to get the whole dataset into this format, so need to write a fn
# we can apply efficiently across all examples using Dataset.map.
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(
        examples["tokens"], truncation=True, is_split_into_words=True
    )
    all_labels = examples["ner_tags"]
    new_labels = []
    for i, labels in enumerate(all_labels):
        word_ids = tokenized_inputs.word_ids(i)
        new_labels.append(align_labels_with_tokens(labels, word_ids))

    tokenized_inputs["labels"] = new_labels
    return tokenized_inputs

In [11]:
# Now we can apply that fn to tokenize all the data
tokenized_data_splits = data_splits.map(
    tokenize_and_align_labels,
    batched=True,
    remove_columns=data_splits["train"].column_names,
)

In [12]:
# Testing batcher
print("Examples:")
for i in range(2):
    print(tokenized_data_splits["train"][i]["labels"])

data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)
batch = data_collator([tokenized_data_splits["train"][i] for i in range(2)])

Examples:
[-100, 9, 4, 4, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 9, 4, 4, 4, 4, 4, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, -100]
[-100, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,

In [13]:
# Evaluation: we can use the seqeval library to handle calculating span-level precision, recall and F1
metric = evaluate.load("seqeval")

labels = data_splits["train"][0]["ner_tags"]
labels = [labels_int2str[i] for i in labels]
print(labels)

# Make a small change and see how it impacts the score
predictions = labels.copy()
predictions[0] = "O"
metric.compute(predictions=[predictions], references=[labels])

['B-Cretaceous_dinosaur', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-Cretaceous_dinosaur', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']


{'Cretaceous_dinosaur': {'precision': 1.0,
  'recall': 0.5,
  'f1': 0.6666666666666666,
  'number': 2},
 'overall_precision': 1.0,
 'overall_recall': 0.5,
 'overall_f1': 0.6666666666666666,
 'overall_accuracy': 0.9904761904761905}

### Train and evaluation

In [14]:
# This code runs evaluation on test data.
# You will need to change this to get it to work for sequence labeling.
#
# TODO: implement this.
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
@torch.no_grad()
def run_eval(model, dataset, batch_size, device, collate_fn=None):
    model.eval().to(device)
    dataloader = DataLoader(dataset, batch_size, shuffle = False, collate_fn=collate_fn)
    loss_fn = nn.NLLLoss(ignore_index=-100)
    val_loss_history = []

    # Initialize cumulative metrics
    total_tokens = 0
    correct_predictions = 0
    all_predictions = []
    all_labels = []

    for i, batch in enumerate(dataloader):

        batch = {k: v.to(device) for k, v in batch.items() if isinstance(v, torch.Tensor)}
        y = batch.pop('labels')

        logits = model(**batch)
        val_loss = loss_fn(logits.view(-1, logits.size(-1)), y.view(-1))  # Flatten for NLLLoss
        val_loss_history.append(val_loss.item())


        # Compute predictions
        predictions = logits.argmax(dim=-1)  # Get the token-wise predicted class
        valid_labels = y.view(-1) != -100  # Mask for valid (non-ignored) tokens

        # Update metrics
        correct_predictions += (predictions.view(-1)[valid_labels] == y.view(-1)[valid_labels]).sum().item()
        total_tokens += valid_labels.sum().item()

        # Store for computing precision, recall, and F1
        #all_predictions.extend(predictions.view(-1)[valid_labels].tolist())
        #all_labels.extend(y.view(-1)[valid_labels].tolist())
        all_predictions.extend([labels_int2str[p] for p in predictions.view(-1)[valid_labels].tolist()])
        all_labels.extend([labels_int2str[l] for l in y.view(-1)[valid_labels].tolist()])

    # Compute overall metrics

    val_metric = metric.compute(predictions=[all_predictions], references=[all_labels], zero_division=0)

    #overall_accuracy = correct_predictions / total_tokens
    #overall_precision = precision_score(all_labels, all_predictions, average="macro", zero_division=0)
    #overall_recall = recall_score(all_labels, all_predictions, average="macro", zero_division=0)
    #overall_f1 = f1_score(all_labels, all_predictions, average="macro", zero_division=0)

    return np.mean(val_loss_history), val_metric

In [15]:
def record_metrics(epoch, train_loss, val_loss, train_acc, val_acc, filename='/content/drive/MyDrive/HW4/log/log_20.csv'):
    # Check if the log file already exists
    file_exists = os.path.isfile(filename)

    # Open the file in append mode; create it if it does not exist
    with open(filename, 'a', newline='') as f:
        writer = csv.writer(f)

        # If the file does not exist, write the header row first
        if not file_exists:
            writer.writerow(['epoch', 'train_loss', 'val_loss', 'train_acc', 'val_acc'])

        # Write the current epoch's metrics to the file
        writer.writerow([epoch, train_loss, val_loss, train_acc, val_acc])

In [16]:
# This code trains the model and evaluates it on test data. It should print
# progress messages during training indicating loss, accuracy and training speed.
# You will likely need to make changes to this code for it to work for token classification.
#
# TODO: change this
def train(model,
          train_dataset,
          val_dataset,
          num_epochs,
          batch_size,
          optimizer_cls,
          lr,
          weight_decay,
          device,
          collate_fn=None,
          log_every=100):
    model = model.train().to(device)
    dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)

    # Initialize optimizer
    if optimizer_cls == 'SGD':
        optimizer = torch.optim.SGD(model.parameters(), lr=lr, weight_decay=weight_decay)
    elif optimizer_cls == 'Adam':
        optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)
    elif optimizer_cls == 'AdamW':
        optimizer = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=weight_decay)

    # History tracking
    train_loss_history = []
    train_acc_history = []
    val_loss_history = []
    val_acc_history = []
    best_val_f1 = 0

    lossfn = nn.NLLLoss(ignore_index=-100)  # Use ignore_index to handle padding/special tokens

    for e in range(num_epochs):
        model.train(True)
        epoch_loss_history = []
        epoch_acc_history = []
        start_time = time.time()

        for i, batch in enumerate(dataloader):
            # Move tensors to the specified device
            batch = {k: v.to(device) for k, v in batch.items() if isinstance(v, torch.Tensor)}
            labels = batch.pop('labels')

            # Forward pass
            # For NLLLoss --> expected input ((N, C), (N))
            # input:(batch_size, seq_len, num_classes) --> view --> (batch_size*seq_len, num_classes)
            # label: (batch_size, seq_len) --> view --> (batch_size*seq_len)

            logits = model(**batch)
            loss = lossfn(logits.view(-1, logits.size(-1)), labels.view(-1))  # Flatten for token-level loss

            # Predictions and accuracy
            predictions = logits.argmax(dim=-1)  # Token-level predictions
            valid_labels = labels.view(-1) != -100  # Mask to ignore invalid tokens
            correct_predictions = (predictions.view(-1)[valid_labels] == labels.view(-1)[valid_labels]).sum().item()
            total_valid_tokens = valid_labels.sum().item()
            acc = correct_predictions / total_valid_tokens if total_valid_tokens > 0 else 0

            # Track loss and accuracy
            epoch_loss_history.append(loss.item())
            epoch_acc_history.append(acc)

            # Logging
            if i % log_every == 0:
                speed = 0 if i == 0 else log_every / (time.time() - start_time)
                print(f'epoch: {e}\t iter: {i}\t train_loss: {np.mean(epoch_loss_history):.3e}\t train_acc:{np.mean(epoch_acc_history):.3f}\t speed:{speed:.3f} b/s')
                start_time = time.time()

            # Backward pass
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()

        # Validation step
        val_loss, val_metrics = run_eval(model, val_dataset, batch_size, device, collate_fn=collate_fn)
        val_acc = val_metrics['overall_accuracy']
        val_p = val_metrics['overall_precision']
        val_r = val_metrics['overall_recall']
        val_f1 = val_metrics['overall_f1']

        #if val_f1 > best_val_f1:
        #  best_val_f1 = val_f1
        #  model_save_path = "/content/drive/MyDrive/HW4/model/best_model"
        #  model.save_pretrained(model_save_path)
        #  tokenizer.save_pretrained(model_save_path)  # Save the tokenizer too
        #  print(f"Model and tokenizer saved at epoch {e} to {model_save_path}")


        # Append epoch results to history
        train_loss_history.append(np.mean(epoch_loss_history))
        train_acc_history.append(np.mean(epoch_acc_history))
        val_loss_history.append(val_loss)
        val_acc_history.append(val_acc)
        record_metrics(e, train_loss_history[-1], val_loss_history[-1], train_acc_history[-1], val_acc_history[-1])

        # Epoch summary
        print(f'epoch: {e}\t train_loss: {train_loss_history[-1]:.3e}\t train_accuracy:{train_acc_history[-1]:.3f}\t val_loss: {val_loss_history[-1]:.3e}\t val_acc:{val_acc_history[-1]:.3f}\t val_p:{val_p:.3f}\t val_r:{val_r:.3f}\t val_f1:{val_f1:.3f}')

    return model, (train_loss_history, train_acc_history, val_loss_history, val_acc_history)

In [17]:
# This code defines the token classification class using BERT.
# The classifier is defined on top of the final layer of BERT.
# The classifier has 1 hidden layer with 128 hidden nodes though we have found that
# using a smaller number of hidden nodes does not make much difference,
#
# TODO: implement this
class BertForTokenClassification(nn.Module):
  def __init__(self, bert_pretrained_config_name, num_classes, freeze_bert=False, dropout_prob=0.1):
    '''
    BERT with a classification MLP
    args:
    - bert_pretrained_config_name (str): model name from huggingface hub
    - num_classes (int): number of classes in the classification task
    - freeze_bert (bool): [default False] If true gradients are not computed for
                          BERT's parameters.
    - dropout_prob (float): [default 0.1] probability of dropping each activation.
    '''
    super().__init__()
    self.bert = BertModel.from_pretrained(bert_pretrained_config_name)
    self.bert.requires_grad_(not freeze_bert)
    self.layers = nn.Sequential(
      nn.Linear(self.bert.config.hidden_size, 64),
      nn.ReLU(),
      nn.Dropout(dropout_prob),
      nn.Linear(64,32),
      nn.ReLU(),
      nn.Dropout(dropout_prob),
      nn.Linear(32, num_classes),
      nn.LogSoftmax(dim=-1)
    )
  def forward(self, **bert_kwargs):

    # Keyword arguments (e.g., input_ids, attention_mask) passed to BERT.
    output = self.bert(**bert_kwargs)

    # last_hidden_state: The contextual embeddings for each token (shape: (batch_size, seq_len, hidden_size)).
    # pooler_output: The pooled embedding for the [CLS] token (shape: (batch_size, hidden_size)).
    # For token classification, we need whole hidden state
    all_tokens = output.last_hidden_state
    logits = self.layers(all_tokens)
    return logits



In [18]:
# This is where fine-tuning of the classifier happens.
# Here we are training with batch size 32 for 5 epochs.

# At the end of each epoch, you also see validation loss and validation accuracy.
# Change the device as described above if you will not be using a GPU

# Set the random seed(s) for reproducability
torch.random.manual_seed(8942764)
torch.cuda.manual_seed(8942764)
np.random.seed(8942764)

# Make sure this is the same as you use for tokenization!
bert_model = 'bert-base-cased'

num_labels = len(labels_int2str)
print(f"Num labels: {num_labels}")

# conll hyperparams
# multiply your learning rate by k when using batch size of kN
lr = 4*2e-5 # 1e-3
weight_decay = 0.01
epochs = 15
batch_size = 32
dropout_prob = 0.2
freeze_bert = False

bert_cls = BertForTokenClassification(bert_model, num_labels, dropout_prob=dropout_prob, freeze_bert=freeze_bert)

print(f'Trainable parameters: {sum([p.numel() for p in bert_cls.parameters() if p.requires_grad])}\n')

# Flag for setting "debug" mode. Set debug to False for full training.
debug = False

# Sample a subset of the training data for faster iteration in debug mode
subset_size = 1000
subset_indices = torch.randperm(len(tokenized_data_splits['train']))[:subset_size]
train_subset = Subset(tokenized_data_splits['train'], subset_indices)

bert_cls, bert_cls_logs = train(bert_cls, tokenized_data_splits['train'] if not debug else train_subset, tokenized_data_splits['dev'],
                                num_epochs=epochs, batch_size=batch_size, optimizer_cls='AdamW',
                                lr=lr, weight_decay=weight_decay, device=device,
                                collate_fn=data_collator, log_every=10 if debug else 100)

# Final eval
final_loss, final_metrics = run_eval(bert_cls, tokenized_data_splits['dev'], batch_size=16, device=device, collate_fn=data_collator)
final_acc = final_metrics['overall_accuracy']
final_p = final_metrics['overall_precision']
final_r = final_metrics['overall_recall']
final_f1 = final_metrics['overall_f1']
print(f'\nFinal Loss: {final_loss:.3e}\t Final Accuracy: {final_acc:.3f}\t dev_p:{final_p:.3f}\t dev_r:{final_r:.3f}\t dev_f1:{final_f1:.3f}')


Num labels: 13
Trainable parameters: 108361997

epoch: 0	 iter: 0	 train_loss: 2.735e+00	 train_acc:0.007	 speed:0.000 b/s
epoch: 0	 train_loss: 1.775e+00	 train_accuracy:0.602	 val_loss: 1.006e+00	 val_acc:0.938	 val_p:0.000	 val_r:0.000	 val_f1:0.000
epoch: 1	 iter: 0	 train_loss: 1.234e+00	 train_acc:0.865	 speed:0.000 b/s
epoch: 1	 train_loss: 8.519e-01	 train_accuracy:0.911	 val_loss: 4.572e-01	 val_acc:0.938	 val_p:0.000	 val_r:0.000	 val_f1:0.000
epoch: 2	 iter: 0	 train_loss: 5.379e-01	 train_acc:0.948	 speed:0.000 b/s
epoch: 2	 train_loss: 4.899e-01	 train_accuracy:0.933	 val_loss: 3.155e-01	 val_acc:0.941	 val_p:0.184	 val_r:0.021	 val_f1:0.038
epoch: 3	 iter: 0	 train_loss: 3.358e-01	 train_acc:0.946	 speed:0.000 b/s
epoch: 3	 train_loss: 3.209e-01	 train_accuracy:0.939	 val_loss: 2.537e-01	 val_acc:0.939	 val_p:0.111	 val_r:0.024	 val_f1:0.040
epoch: 4	 iter: 0	 train_loss: 2.214e-01	 train_acc:0.948	 speed:0.000 b/s
epoch: 4	 train_loss: 2.143e-01	 train_accuracy:0.949	 va

### Test data

In [26]:
from transformers import BertForTokenClassification
# Paths
test_data_path = "/content/drive/MyDrive/HW4/dinos_and_deities_test_bio_nolabels.jsonl"
output_path = "test_predictions_bert.json"


tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
bert_cls.to(device)
bert_cls.eval()

BertForTokenClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12

In [27]:
torch.cuda.empty_cache()

In [28]:
import json
from datasets import Dataset
test_data_path = "/content/drive/MyDrive/HW4/dinos_and_deities_test_bio_nolabels.jsonl"
with open(test_data_path, "r") as f:
    test_data = [json.loads(line.strip()) for line in f]

# Convert test data into a Hugging Face Dataset
test_data = Dataset.from_list(test_data)

label_names_fname = "/content/drive/MyDrive/HW4/dinos_and_deities_train_bio.jsonl.labels"
labels_int2str = []
with open(label_names_fname) as f:
    labels_int2str = f.read().split()
print(f"Labels: {labels_int2str}")
labels_str2int = {l: i for i, l in enumerate(labels_int2str)}

if "ner_tags" in test_data.column_names:
    test_data = test_data.cast_column("ner_tags", Sequence(ClassLabel(names=labels_int2str)))
print(test_data)

# Now we can apply that fn to tokenize all the data
tokenized_data_test = test_data.map(
    tokenize_and_align_labels,
    batched=True,
    remove_columns=test_data.column_names,
)

Labels: ['I-Aquatic_animal', 'B-Deity', 'B-Mythological_king', 'I-Mythological_king', 'I-Cretaceous_dinosaur', 'B-Aquatic_animal', 'B-Aquatic_mammal', 'I-Goddess', 'I-Deity', 'B-Cretaceous_dinosaur', 'I-Aquatic_mammal', 'B-Goddess', 'O']


Casting the dataset:   0%|          | 0/303 [00:00<?, ? examples/s]

Dataset({
    features: ['para_index', 'title', 'doc_id', 'content', 'page_id', 'id', 'tokens', 'ner_strings', 'ner_tags'],
    num_rows: 303
})


Map:   0%|          | 0/303 [00:00<?, ? examples/s]

In [29]:
print(tokenized_data_test)

Dataset({
    features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
    num_rows: 303
})


In [35]:
print(tokenized_data_test["input_ids"][0])

[101, 1130, 1699, 117, 1103, 182, 2744, 2007, 16430, 176, 12640, 25570, 7276, 1162, 113, 3337, 1270, 1202, 5822, 8816, 114, 1110, 2784, 1111, 1103, 1263, 1858, 1920, 1107, 170, 1416, 119, 1188, 12942, 13347, 117, 1972, 117, 1920, 1104, 1103, 8131, 1105, 14780, 1116, 1115, 1202, 1136, 4752, 170, 9131, 117, 1105, 10592, 2019, 170, 9131, 1165, 3238, 119, 1220, 1145, 2812, 1103, 5199, 8131, 1285, 118, 1106, 118, 1285, 113, 1206, 1103, 12104, 172, 18565, 1115, 4752, 1103, 9108, 1104, 170, 9131, 114, 119, 102]


In [39]:
import json

def run_test(model, dataset, batch_size, device, collate_fn=None):
    model.eval().to(device)
    dataloader = DataLoader(dataset, batch_size, shuffle = False, collate_fn=collate_fn)
    loss_fn = nn.NLLLoss(ignore_index=-100)
    test_loss_history = []

    # Initialize cumulative metrics
    total_tokens = 0
    correct_predictions = 0
    all_predictions_for_compute = []
    all_predictions = []
    all_labels = []


    for i, batch in enumerate(dataloader):

        batch = {k: v.to(device) for k, v in batch.items() if isinstance(v, torch.Tensor)}
        y = batch.pop('labels')

        logits = model(**batch)
        test_loss = loss_fn(logits.view(-1, logits.size(-1)), y.view(-1))  # Flatten for NLLLoss
        test_loss_history.append(test_loss.item())


        # Compute predictions
        predictions = logits.argmax(dim=-1)  # Get the token-wise predicted class
        print(len(predictions))
        test_labels = y.view(-1) != -100  # Mask for valid (non-ignored) tokens

        # Update metrics
        correct_predictions += (predictions.view(-1)[test_labels] == y.view(-1)[test_labels]).sum().item()
        total_tokens += test_labels.sum().item()

        # Store for computing precision, recall, and F1
        all_predictions.append([labels_int2str[p] for p in predictions.view(-1)[test_labels].tolist()])

        #all_labels.extend(y.view(-1)[test_labels].tolist())
        all_predictions_for_compute.extend([labels_int2str[p] for p in predictions.view(-1)[test_labels].tolist()])
        all_labels.extend([labels_int2str[l] for l in y.view(-1)[test_labels].tolist()])


    # Save predictions as JSON
    with open(output_path, "w") as f:
        json.dump(all_predictions, f, indent=4)
    print(f"Predictions saved to {output_path}")

    # Compute overall metrics
    test_metric = metric.compute(predictions=[all_predictions_for_compute], references=[all_labels], zero_division=0)

    print("Test Results:")
    for key, value in test_metric.items():
        print(f"{key}: {value}")

    return np.mean(test_loss_history), test_metric

In [41]:
batch_size = 1

In [42]:
test_loss, test_metric = run_test(bert_cls, tokenized_data_test, batch_size, device, collate_fn=data_collator)

1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
Predictions saved to test_predictions_bert.json
Test Results:
Aquatic_animal: {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 0}
Aquatic_mammal: {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 0}
Cretaceous_dinosaur: {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 0}
Deity: {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 0}
Goddess: {'precision': 0.0, 'recall': 0.