Drew Lickman

CSCI 4820-1

Project #5

Due: 11/19/24

# BERT Named Entity Recognition Fine Tuning Project Starter Code
### Dr. Sal Barbosa, Department of Computer Science, Middle Tennessee State University

In [1]:
# Required on TAMU FASTER to be able to pip install packages and download the dataset from Hugging Face
# NOTE(review): the proxy address below is hard-coded for that cluster; these two
# assignments overwrite any proxy already set in the environment, so remove or
# adjust them when running the notebook elsewhere.
import os
os.environ['http_proxy'] = 'http://10.72.8.25:8080'
os.environ['https_proxy'] = 'http://10.72.8.25:8080'

In [2]:
# pip installs - comment out after running the notebook for the first time
#!pip install datasets
#!pip install evaluate
#!pip install seqeval
#!pip install accelerate==0.26.1

In [3]:
import torch
from transformers import AutoModelForTokenClassification, AutoTokenizer, Trainer, TrainingArguments
from datasets import load_dataset, DatasetDict, Sequence, ClassLabel
import numpy as np
import evaluate
from collections import Counter

2024-11-15 19:38:13.909881: I tensorflow/core/util/port.cc:111] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-11-15 19:38:13.949904: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-11-15 19:38:13.949938: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-11-15 19:38:13.949963: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-11-15 19:38:13.957250: I tensorflow/core/platform/cpu_feature_g

In [4]:
# Load the CONLL-2003 NER dataset and drop the columns this notebook never reads,
# leaving only 'tokens' and 'ner_tags' in every split.
dataset = load_dataset("conll2003").remove_columns(['id', 'pos_tags', 'chunk_tags'])

# Show the resulting splits as the cell output
dataset

DatasetDict({
    train: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 14041
    })
    validation: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 3250
    })
    test: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 3453
    })
})

In [5]:
# Get and display the NER tag list for the dataset.
# Work on a copy: the edits below must not modify the names list owned by the
# dataset's 'ner_tags' feature, and re-running this cell must not append the
# female labels a second time.
label_list = list(dataset["train"].features["ner_tags"].feature.names)

# Rename PERSON labels to MALE labels (indices 1 and 2 keep their positions)
label_list[1] = 'B-MPER'
label_list[2] = 'I-MPER'

# Append FEMALE labels at end of label list (they become indices 9 and 10)
label_list.append('B-FPER')
label_list.append('I-FPER')

print("Label list:", label_list)

Label list: ['O', 'B-MPER', 'I-MPER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC', 'B-FPER', 'I-FPER']


In [6]:
# Load the BERT cased model
model_checkpoint = "bert-base-cased"
# Tokenizer and model are loaded from the same checkpoint name
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
# num_labels is taken from label_list, so it counts the two appended female tags as well
model = AutoModelForTokenClassification.from_pretrained(model_checkpoint, num_labels=len(label_list))

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [1]:
def isShortNonsense(token):
    """Return True when `token` is exactly two characters long and ends with '.'.

    Used to keep initials such as "M.", "S." or "R." out of the male/female
    name heuristics.
    """
    return len(token) == 2 and token[-1] == '.'

# Some names came from https://nameberry.com/blog/the-most-popular-baby-name-endings 11/15/24
# Lookup tables for the female-name heuristic.
# Whole names are compared case-sensitively; beginnings and endings are compared
# against the lower-cased token.
femaleWhole = ("Taha", "Olga", "Inga")
femaleBeginnings = ("van", "alic", "lore", "bell", "lie", "chiq", "duf", "yas", "may", "kimi", "joha", "ili", "luo", "viv", "hill", "muse", "min", "marg", "till")
femaleEndings = ("ati", "issa", "iew", "issy", "ino", "ata", "ene", "si", "az", "lao", "izio", "uki", "ise", "ova", "oin", "ria", "ata", "anne", "drea", "ayla", "essa", "her", "anna", "ana", "ette", "etta", "elle", "ella", "ina", "yah", "iah", "lyn", "icia", "rie", "ora", "lie", "thy", "atie", "rude", "lia", "lla", "enna", "ine", "ani", "ola", "een", "ahi", "kie", "ane", "ahu", "ara", "ari", "mbe", "pta", "ady", "ie", "ary", "xa")
def is_female_name(token):
    """Heuristically decide whether `token` looks like a female name.

    Returns True when the lower-cased token ends with any entry of
    `femaleEndings`, or starts with any entry of `femaleBeginnings`, or when the
    token as written appears in `femaleWhole`; otherwise returns False.
    """
    lowered = token.lower()
    return (
        lowered.endswith(femaleEndings)
        or lowered.startswith(femaleBeginnings)
        or token in femaleWhole
    )

# Lookup tables for the male-name heuristic.
# Whole names are compared case-sensitively; beginnings and endings are compared
# against the lower-cased token (all entries are lower-case).
maleWhole = ("Jimi", "Levy", "Anders", "Nick", "Kenny", "Roy", "Alex", "Shen", "Costas", "Dan", "Hal", "Tom", "Ken", "Daniel", "Ian")
maleBeginnings = ("man", "simo", "gar", "hunt", "cor", "hel", "will", "mick", "con", "sal", "ric", "phi", "terr", "bru", "pete", "shay", "wern", "nikol", "fisch", "skandal", "stef", "benj", "rabi", "must", "per", "core", "dal", "gor", "pav", "coop", "ross", "car", "kaz", "bor", "asl", "bert", "cli", "stev", "pres", "berr", "greg")
maleEndings = ("tien", "man", "mann", "tr", "ung", "aul", "rty", "ldo", "ver", "gos", "olas", "cott", "ard", "rco", "dict", "udan", "mut", "rnd", "vis", "ael", "yne", "ng", "ick", "oud", "ard", "onan", "kel", "ayne", "git", "eer", "son", "ank", "unk", "ion", "oey", "mes", "ll", "than", "hn", "rick", "tn", "eg", "nley", "uce", "ndon", "osh", "ony", "even", "odd", "drix", "ang", "vid", "yahu", "san", "afat", "abil", "anz", "ain", "itris", "rner", "hael", "rtin", "taq", "eed", "haq", "asim", "mad", "ncan", "aig", "nto", "vich", "od", "mon", "der", "rad", "ting", "bert", "imis", "ran", "ark", "dul", "uy", "ippe", "mas", "nus", "cil", "vey", "los", "nup", "van", "uel", "ff", "rim", "pras", "dric", "uck", "ox", "vit", "jon", "ron", "rian", "ton", "dam", "rri", "sky", "fez", "mar", "sad", "tin", "usz", "qar", "rto", "ory")
def is_male_name(token):
    """Heuristically decide whether `token` looks like a male name.

    Mirrors `is_female_name`: returns True when the lower-cased token ends with
    any entry of `maleEndings`, or starts with any entry of `maleBeginnings`, or
    when the token as written appears in `maleWhole`; otherwise returns False.

    Args:
        token: A single word (str) taken from a sentence.

    Returns:
        bool: True if one of the three checks matches.
    """
    # Lower-case once so capitalised names can match the lower-case tables.
    lowered = token.lower()
    # str.startswith / str.endswith accept a tuple of candidates directly.
    return (
        lowered.endswith(maleEndings)
        or lowered.startswith(maleBeginnings)
        or token in maleWhole
    )

def chooseGender(femResult, maleResult):
    """Pick "Female" or "Male" by comparing two numeric scores.

    The male score is subtracted from the female score; a difference of zero or
    more yields "Female" (so ties go to "Female"), anything else yields "Male".
    Booleans work as 0/1 scores; weights allow a finer comparison.
    """
    margin = femResult - maleResult
    return "Female" if margin >= 0 else "Male"


In [2]:
# Tokenization and tag distribution function
# Indices into label_list for the gendered person tags.
B_MPER, I_MPER = 1, 2    # the dataset's original B-/I- person tags, now meaning "male"
B_FPER, I_FPER = 9, 10   # female tags appended at the end of label_list


def _gender_person_tags(tokens, tags):
    """Rewrite one sentence's person tags into male/female person tags.

    Args:
        tokens: The words of one sentence.
        tags: The integer NER tag for each word, aligned with `tokens`.

    Returns:
        list: One tag per word. Non-person tags are passed through unchanged.
        A B- person tag becomes B-FPER when `is_female_name` accepts the word,
        otherwise B-MPER (two-character initials such as "M." are never
        gender-checked and stay B-MPER). An I- person tag always takes the
        gender of the tag immediately before it, so every token of a name
        carries the same gender no matter how many tokens the name has.
    """
    gendered = []
    for token, tag in zip(tokens, tags):
        if tag == B_MPER:
            if isShortNonsense(token):
                # Initials carry no gender signal; keep the default tag.
                gendered.append(B_MPER)
            elif is_female_name(token):
                gendered.append(B_FPER)
            else:
                # Male and unrecognised names both keep the default tag; the
                # message only reports words that neither heuristic recognised.
                if not is_male_name(token):
                    print("Unknown:", token)
                gendered.append(B_MPER)
        elif tag == I_MPER:
            # Continue a female name after either B-FPER or I-FPER. The
            # emptiness check covers a sentence that starts with an I- tag.
            follows_female = bool(gendered) and gendered[-1] in (B_FPER, I_FPER)
            gendered.append(I_FPER if follows_female else I_MPER)
        else:
            # Not a person tag
            gendered.append(tag)
    return gendered


# Tokenization and tag distribution function
def tokenize_and_distribute_tags(examples):
    """Tokenize a batch of sentences and give every sub-token a gendered NER label.

    Args:
        examples: A batch with "tokens" (one list of words per sentence) and
            "ner_tags" (one list of integer tags per sentence).

    Returns:
        The tokenizer output for the batch with an added "labels" entry: one
        list per sentence in which each position holds the gendered tag of the
        word it came from, or -100 where the position has no source word
        (its word id is None).
    """
    tokenized_inputs = tokenizer(
        examples["tokens"],
        truncation=True,
        is_split_into_words=True,
        padding='max_length',
        max_length=128
    )

    labels = []
    for i, (tokens, tags) in enumerate(zip(examples["tokens"], examples["ner_tags"])):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        gendered = _gender_person_tags(tokens, tags)
        # Copy each word's tag onto every sub-token produced from that word.
        labels.append([-100 if word_id is None else gendered[word_id] for word_id in word_ids])

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

# Apply the tokenization function to the dataset
# batched=True passes a batch of examples to the function on each call.
# This line needs `dataset` (and `tokenizer`) from the earlier cells to exist in
# the current kernel; run those cells first after a kernel restart.
tokenized_datasets = dataset.map(tokenize_and_distribute_tags, batched=True)

NameError: name 'dataset' is not defined

In [11]:
# Metric function
# Load the "seqeval" metric once; compute_metrics below calls metric.compute on it.
metric = evaluate.load("seqeval")

def compute_metrics(p):
    """Turn raw model outputs into overall precision, recall, F1 and accuracy.

    `p` unpacks into (predictions, labels). The highest-scoring class along
    axis 2 is taken as the predicted label id. Positions whose reference label
    is -100 are dropped from both the predictions and the references; the
    remaining ids are mapped to tag names through `label_list` and scored with
    the loaded `metric`.
    """
    scores, gold_ids = p
    pred_ids = np.argmax(scores, axis=2)

    true_labels = []
    model_predictions = []
    for pred_seq, gold_seq in zip(pred_ids, gold_ids):
        # Keep only the positions that carry a real reference label
        kept = [(pred, gold) for pred, gold in zip(pred_seq, gold_seq) if gold != -100]
        true_labels.append([label_list[gold] for _, gold in kept])
        model_predictions.append([label_list[pred] for pred, _ in kept])

    overall = metric.compute(predictions=model_predictions, references=true_labels)
    return {
        "precision": overall["overall_precision"],
        "recall": overall["overall_recall"],
        "f1": overall["overall_f1"],
        "accuracy": overall["overall_accuracy"],
    }


In [12]:
# Set training arguments
# batch_size is used for both training and evaluation; epochs sets num_train_epochs.
batch_size = 64
epochs = 1

# Evaluation and checkpoint saving both happen once per epoch; checkpoints go to
# ./results and logs to ./logs. Nothing is pushed to the hub or reported externally.
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=epochs,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    save_strategy="epoch",
    push_to_hub=False,
    report_to="none",
)

# Instantiate trainer
# Trains on the "train" split, evaluates on the "validation" split, and scores
# evaluation runs with compute_metrics.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)


Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [13]:
# Train the model
trainer.train()

# Evaluate the model
# evaluate() uses the eval_dataset given to the Trainer (the validation split)
results = trainer.evaluate()
print("Evaluation Results:", results)

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.1718,0.157322,0.786921,0.819912,0.803078,0.954634


Checkpoint destination directory ./results/checkpoint-220 already exists and is non-empty.Saving will proceed but saved results may be invalid.


Evaluation Results: {'eval_loss': 0.15732187032699585, 'eval_precision': 0.7869205583318973, 'eval_recall': 0.8199120208277224, 'eval_f1': 0.8030775994724116, 'eval_accuracy': 0.9546338302009073, 'eval_runtime': 32.414, 'eval_samples_per_second': 100.265, 'eval_steps_per_second': 1.573, 'epoch': 1.0}


In [14]:
# Make predictions on the test set
predictions = trainer.predict(tokenized_datasets["test"])
# Pick the highest-scoring class along axis 2 as the predicted label id
# (assumes predictions are shaped (sentence, position, class) — TODO confirm)
pred_labels = np.argmax(predictions.predictions, axis=2)
# Reference label ids returned with the predictions; positions labelled -100
# during tokenization still need to be filtered out before comparing
true_labels = predictions.label_ids