# BERT Named Entity Recognition Fine-Tuning Project Starter Code
### Dr. Sal Barbosa, Department of Computer Science, Middle Tennessee State University

In [None]:
# TAMU FASTER needs this proxy so that pip installs and the Hugging Face
# dataset download can reach the network.
import os

os.environ.update({
    'http_proxy': 'http://10.72.8.25:8080',
    'https_proxy': 'http://10.72.8.25:8080',
})

In [None]:
# pip installs - comment out after running the notebook for the first time
!pip install datasets
!pip install evaluate
!pip install seqeval
!pip install accelerate==0.26.1

In [None]:
import torch
from transformers import AutoModelForTokenClassification, AutoTokenizer, Trainer, TrainingArguments
from datasets import load_dataset, DatasetDict, Sequence, ClassLabel
import numpy as np
import evaluate
from collections import Counter

In [None]:
# Load the CONLL-2003 NER dataset and drop the columns this notebook
# does not use ('id', 'pos_tags', 'chunk_tags') in one step.
dataset = load_dataset("conll2003").remove_columns(
    ['id', 'pos_tags', 'chunk_tags']
)

# Display the resulting dataset as the cell output
dataset

In [None]:
# Read the NER tag names from the training split's feature schema and show them
train_features = dataset["train"].features
label_list = train_features["ner_tags"].feature.names

print("Label list:", label_list)

In [None]:
# Load the BERT cased tokenizer and a token-classification model built on it
model_checkpoint = "bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

# Record the tag names in the model config so that predictions and saved
# checkpoints report the dataset's tag names rather than generic LABEL_<n>.
id2label = dict(enumerate(label_list))
label2id = {tag: idx for idx, tag in id2label.items()}

model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    num_labels=len(label_list),
    id2label=id2label,
    label2id=label2id,
)


In [None]:
# Tokenization and tag distribution function
def tokenize_and_distribute_tags(examples):
    """Tokenize a batch of pre-split sentences and tag every sub-token.

    Each sentence in ``examples["tokens"]`` is tokenized, truncated and
    padded to 128 positions. The tag of each word in ``examples["ner_tags"]``
    is copied unchanged onto every sub-token produced from that word, so a
    word split into several pieces repeats its tag on each piece. Positions
    that map to no word (word id ``None``, e.g. special or padding tokens)
    are labelled -100.

    Returns the tokenizer output with an added ``"labels"`` entry holding
    one list of label ids per sentence.
    """
    encoded = tokenizer(
        examples["tokens"],
        is_split_into_words=True,
        truncation=True,
        max_length=128,
        padding='max_length',
    )

    all_label_ids = []
    for batch_idx, word_tags in enumerate(examples["ner_tags"]):
        sentence_label_ids = []
        for word_id in encoded.word_ids(batch_index=batch_idx):
            if word_id is None:
                # No source word for this position
                sentence_label_ids.append(-100)
            else:
                # Sub-token inherits the tag of the word it came from
                sentence_label_ids.append(word_tags[word_id])
        all_label_ids.append(sentence_label_ids)

    encoded["labels"] = all_label_ids
    return encoded

# Run the tokenization/tag-distribution function over the dataset in batches
tokenized_datasets = dataset.map(function=tokenize_and_distribute_tags, batched=True)

In [None]:
# Metric function
metric = evaluate.load("seqeval")

def compute_metrics(p):
    """Compute seqeval precision, recall, F1 and accuracy for a prediction batch.

    ``p`` unpacks into (predictions, labels). The predicted tag id at each
    position is the argmax over axis 2 of the predictions. Positions whose
    gold label is -100 are left out of both the reference and the predicted
    sequences; the remaining ids are mapped to tag names via ``label_list``.

    Returns a dict with keys "precision", "recall", "f1" and "accuracy".
    """
    scores, labels = p
    predicted_ids = np.argmax(scores, axis=2)

    # Gold tag names, skipping ignored positions
    true_labels = []
    for label_seq in labels:
        true_labels.append(
            [label_list[label] for label in label_seq if label != -100]
        )

    # Predicted tag names at the same (non-ignored) positions
    model_predictions = []
    for pred_seq, label_seq in zip(predicted_ids, labels):
        kept = [
            label_list[pred]
            for (pred, label) in zip(pred_seq, label_seq)
            if label != -100
        ]
        model_predictions.append(kept)

    results = metric.compute(predictions=model_predictions, references=true_labels)

    summary = {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }
    return summary


In [None]:
# Training hyperparameters
batch_size = 64
epochs = 1

training_args = TrainingArguments(
    # Output locations for checkpoints and logs
    output_dir="./results",
    logging_dir='./logs',
    # Optimisation settings
    learning_rate=2e-5,
    weight_decay=0.01,
    num_train_epochs=epochs,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    # Evaluate and checkpoint once per epoch; log every 10 steps
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_steps=10,
    # No Hub upload and no external experiment reporting
    push_to_hub=False,
    report_to="none",
)

# Build the trainer: train on the training split, evaluate on validation
trainer = Trainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
)


In [None]:
# Fine-tune the model on the trainer's train_dataset (the "train" split)
trainer.train()

# Evaluate on the trainer's eval_dataset (the "validation" split) and
# print the metrics dictionary that evaluate() returns
results = trainer.evaluate()
print("Evaluation Results:", results)

In [None]:
# Run the trained model on the test split
predictions = trainer.predict(tokenized_datasets["test"])

# Predicted tag id per position: index of the highest score along axis 2
pred_labels = predictions.predictions.argmax(axis=2)

# Gold label ids returned alongside the predictions
true_labels = predictions.label_ids