Drew Lickman

CSCI 4820-1

Project #5

Due: 11/19/24

# BERT Named Entity Recognition Fine Tuning Project Starter Code
### Dr. Sal Barbosa, Department of Computer Science, Middle Tennessee State University

In [1]:
# Required on TAMU FASTER to be able to pip install packages and download the dataset from Hugging Face
import os
# Route both HTTP and HTTPS traffic through the same cluster proxy.
os.environ.update({
    'http_proxy': 'http://10.72.8.25:8080',
    'https_proxy': 'http://10.72.8.25:8080',
})

In [2]:
# pip installs - comment out after running the notebook for the first time
#!pip install datasets
#!pip install evaluate
#!pip install seqeval
#!pip install accelerate==0.26.1

In [3]:
import torch
from transformers import AutoModelForTokenClassification, AutoTokenizer, Trainer, TrainingArguments
from datasets import load_dataset, DatasetDict, Sequence, ClassLabel
import numpy as np
import evaluate
from collections import Counter

2024-11-16 16:01:17.844189: I tensorflow/core/util/port.cc:111] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-11-16 16:01:17.883565: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-11-16 16:01:17.883594: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-11-16 16:01:17.883621: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-11-16 16:01:17.890680: I tensorflow/core/platform/cpu_feature_g

In [4]:
# Load the CoNLL-2003 NER dataset and keep only the columns this notebook
# uses ('tokens' and 'ner_tags'); the id, POS and chunk columns are dropped.
dataset = load_dataset("conll2003").remove_columns(['id', 'pos_tags', 'chunk_tags'])
dataset

DatasetDict({
    train: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 14041
    })
    validation: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 3250
    })
    test: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 3453
    })
})

In [5]:
# Get and display the NER tag list for the dataset.
# Work on a copy: `.feature.names` hands back the list held by the dataset's
# ClassLabel feature, so editing it in place would change the dataset metadata
# and make this cell non-idempotent (every re-run would append the FPER labels
# again, growing the list and the model's num_labels).
label_list = list(dataset["train"].features["ner_tags"].feature.names)

# Rename PERSON labels to MALE labels (ids 1 and 2 keep their positions)
label_list[1] = 'B-MPER'
label_list[2] = 'I-MPER'

# Append FEMALE labels at end of label list (they become ids 9 and 10)
label_list.extend(['B-FPER', 'I-FPER'])

print("Label list:", label_list)

Label list: ['O', 'B-MPER', 'I-MPER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC', 'B-FPER', 'I-FPER']


In [6]:
# Load the BERT cased model
model_checkpoint = "bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

# Register the custom tag names with the model config so saved checkpoints and
# inference output use 'B-FPER', 'I-MPER', ... instead of generic LABEL_n names.
id2label = dict(enumerate(label_list))
label2id = {label: idx for idx, label in id2label.items()}

model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    num_labels=len(label_list),
    id2label=id2label,
    label2id=label2id,
)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
def isShortNonsense(token):
    """Return True for two-character tokens ending in '.', e.g. "M." or "S.".

    Such initials carry no usable gender signal, so callers skip the
    male/female name heuristics for them.
    """
    return len(token) == 2 and token.endswith('.')

# Some names came from https://nameberry.com/blog/the-most-popular-baby-name-endings 11/15/24
# femaleWhole is compared against the token exactly as written (case-sensitive);
# femaleBeginnings / femaleEndings are compared against the lower-cased token.
femaleWhole = ("Taha", "Olga", "Inga", "Abu", "Mia", "Rui", "Kim", "Ai", "Ebe")
femaleBeginnings = ("van", "alic", "ott", "male", "lea", "mere", "lind", "bai", "chi", "olivi", "cha", "eyl", "zun", "lore", "tara", "ise", "woo", "pet", "gius", "cook", "fab", "ghe", "syb", "juh", "bell", "charli", "asi", "lie", "chiq", "duf", "yas", "zit", "may", "kimi", "joha", "ili", "luo", "viv", "hill", "min", "marg", "till")
femaleEndings = ("ati", "ssa", "sea", "vai", "icky", "yste", "eah", "ure", "ate", "oche", "tel", "ele", "ee", "chi", "eva", "karo", "eggy", "via", "ama", "eles", "ope", "ela", "ona", "anda", "rii", "lly", "lli", "nja", "oku", "weyi", "sha", "aki", "pese", "alo", "tra", "elo", "rpe", "oto", "omo", "osa", "ghe", "ini", "rei", "are", "mmi", "ena", "luca", "thia", "una", "lah", "ewa", "aba", "eira", "aan", "gle", "xei", "eve", "erre", "wicz", "issa", "lat", "ima", "uta", "ley", "xia", "oe", "que", "nia", "hia", "iza", "erly", "ean", "ylis", "iew", "wicz", "issy", "dra", "abi", "rta", "aya", "gato", "cca", "oko", "gma", "ika", "ay", "ies", "zio", "arda", "oux", "ore", "elli", "raj", "antha", "gne", "kki", "evic", "ino", "ata", "ene", "si", "az", "uki", "ise", "ova", "oin", "ria", "ata", "anne", "drea", "ayla", "essa", "her", "anna", "ana", "ette", "etta", "elle", "ella", "ina", "yah", "iah", "lyn", "icia", "rie", "ora", "lie", "thy", "atie", "rude", "lia", "lla", "enna", "ine", "ani", "ola", "een", "ahi", "kie", "ane", "ahu", "ara", "ari", "mbe", "pta", "ady", "ie", "ary", "xa")
def is_female_name(token):
    """Heuristically decide whether `token` looks like a female name.

    The token counts as female when its lower-cased form ends with one of
    `femaleEndings` or starts with one of `femaleBeginnings`, or when the
    token as written appears in `femaleWhole`.

    Returns:
        bool: True if any of the three rules matches, otherwise False.
    """
    lowered = token.lower()
    # str.endswith / str.startswith accept a tuple and test every affix in it.
    return (
        lowered.endswith(femaleEndings)
        or lowered.startswith(femaleBeginnings)
        or token in femaleWhole
    )

# maleWhole is compared against the token exactly as written (case-sensitive);
# maleBeginnings / maleEndings are compared against the lower-cased token, so
# every affix must itself be lower case ("Yon" was corrected to "yon" because
# a capitalised prefix can never match a lower-cased token).
maleWhole = ("Jimi", "Levy", "Anders", "Fred", "Nick", "Juan", "Kenny", "Abu", "Jay", "Tim", "Roy", "Danny", "Liam", "Alex", "Shen", "Costas", "Dan", "Hal", "Sam", "Tom", "Ken", "Daniel", "Ian", "Blake")
maleBeginnings = ("man", "mr", "simo", "jar", "parn", "grae", "lew", "rub", "rifk", "rugg", "hers", "hars", "agr", "arj", "max", "kar", "gran", "li", "mosh", "ed", "rip", "ren", "jan", "chris", "neal", "hu", "marc", "you", "mill", "arw", "jul", "fern", "shig", "feli", "hidem", "stew", "serg", "efa", "jose", "olin", "erik", "bry", "sato", "jone", "fred", "owen", "edb", "bena", "web", "mach", "jim", "jord", "elm", "huse", "kenn", "vog", "jeff", "buca", "yon", "craw", "bur", "charle", "tho", "aa", "col", "kri", "javi", "moy", "hic", "gar", "hunt", "cor", "bill", "bob", "hel", "will", "mick", "con", "sal", "ric", "phi", "terr", "bru", "pete", "shay", "wern", "nikol", "fisch", "skandal", "stef", "benj", "rabi", "must", "per", "core", "dal", "gor", "pav", "coop", "ross", "car", "kaz", "bor", "asl", "bert", "cli", "stev", "pres", "berr", "greg")
maleEndings = (".", "drew", "hen", "rett", "andy", "aus", "cois", "kis", "ink", "ur", "ind", "ath", "ndt", "sco", "ler", "ewt", "iser", "ein", "wis", "ind", "ien", "vik", "att", "orm", "nis", "reas", "zen", "las", "cak", "sty", "ats", "vin", "lip", "nko", "ons", "ume", "tien", "iri", "ij", "ant", "cisco", "rst", "ken", "tian", "nas", "slav", "wel", "ando", "han", "lix", "wart", "gen", "rif", "emp", "ruw", "ryan", "one", "yan", "wire", "pot", "lan", "olf", "gel", "ber", "dim", "tar", "red", "borg", "scar", "naj", "eir", "iki", "man", "ven", "un", "rek", "ric", "kus", "ten", "ito", "ite", "tro", "erve", "ido", "nna", "ster", "aac", "old", "ean", "eil", "rren", "gts", "orn", "roy", "les", "rio", "ter", "ral", "mond", "sen", "wan", "ado", "rak", "nn", "hew", "ier", "ike", "rgi", "vanni", "bed", "sin", "ald", "ole", "eus", "req", "chez", "nen", "ich", "pov", "nov", "dre", "mir", "erry", "ght", "lock", "utch", "os", "chel", "oel", "ert", "arl", "ger", "ald", "eld", "ford", "ques", "ott", "ham", "vic", "oft", "ump", "ance", "ius", "tor", "orge", "uis", "monn", "mir", "uzo", "oug", "ock", "ado", "dro", "tor", "nce", "cer", "hris", "iel", "ave", "ek", "mann", "tr", "din", "dan", "ung", "al", "aul", "mir", "nry", "rty", "ldo", "ack", "ver", "gos", "olas", "cott", "ard", "rco", "dict", "udan", "mut", "rnd", "vis", "ael", "yne", "ng", "ick", "oud", "ard", "onan", "kel", "ayne", "git", "eer", "son", "ank", "unk", "ion", "oey", "mes", "ll", "than", "hn", "rick", "tn", "eg", "nley", "uce", "ndon", "osh", "ony", "even", "odd", "drix", "ang", "vid", "yahu", "san", "afat", "abil", "anz", "ain", "itris", "rner", "hael", "rtin", "taq", "eed", "haq", "asim", "mad", "ncan", "aig", "nto", "vich", "od", "mon", "der", "rad", "ting", "bert", "imis", "ran", "ark", "dul", "uy", "ippe", "mas", "nus", "cil", "vey", "los", "nup", "van", "uel", "ff", "rim", "pras", "dric", "uck", "ox", "vit", "jon", "ron", "rian", "ton", "dam", "rri", "sky", "fez", "mar", "sad", "tin", "usz", "qar", "rto", "ory")
def is_male_name(token):
    """Heuristically decide whether `token` looks like a male name.

    The token counts as male when its lower-cased form ends with one of
    `maleEndings` or starts with one of `maleBeginnings`, or when the token
    as written appears in `maleWhole`.

    Note that "." is one of the endings, so any token ending in a period
    matches.

    Returns:
        bool: True if any of the three rules matches, otherwise False.
    """
    lowered = token.lower()
    # str.endswith / str.startswith accept a tuple and test every affix in it.
    return (
        lowered.endswith(maleEndings)
        or lowered.startswith(maleBeginnings)
        or token in maleWhole
    )

def chooseGender(femResult, maleResult):
    """Pick a gender label from two comparable scores.

    The scores may be booleans or numeric weights. The female score minus
    the male score decides the outcome; a tie (difference of zero) resolves
    to "Female".

    Returns:
        str: "Female" or "Male".
    """
    margin = femResult - maleResult
    return "Female" if margin >= 0 else "Male"


In [8]:
# Tokenization and tag distribution function
# Tag ids used below; they match the positions in the notebook's label list
# (1/2 are the renamed PERSON tags, 9/10 are the appended female tags).
_B_MPER, _I_MPER, _B_FPER, _I_FPER = 1, 2, 9, 10


def _gender_person_tags(tokens, tags):
    """Split the PERSON tags of one sentence into male/female variants.

    Args:
        tokens: the words of the sentence.
        tags: the word-level NER tag ids, parallel to `tokens`.

    Returns:
        list: a new tag list of the same length. Non-person tags are copied
        unchanged. A B-PERSON word becomes B-FPER when `is_female_name`
        accepts it, otherwise B-MPER. An I-PERSON word always takes the
        gender of the tag immediately before it, so an entity never switches
        gender part-way through.
    """
    gendered = []
    for token, tag in zip(tokens, tags):
        if tag == _B_MPER:
            if isShortNonsense(token):
                # Initials such as "M." give no signal; keep the default tag.
                gendered.append(_B_MPER)
            elif is_female_name(token):
                gendered.append(_B_FPER)
            else:
                if not is_male_name(token):
                    # Neither rule set matched: report the name so the affix
                    # lists can be extended, then fall back to the male tag.
                    print(token)
                gendered.append(_B_MPER)
        elif tag == _I_MPER:
            # Continue the current entity. Checking for both B-FPER and I-FPER
            # keeps names of three or more words female throughout, and the
            # emptiness guard covers a sentence that starts with an I- tag.
            in_female_entity = bool(gendered) and gendered[-1] in (_B_FPER, _I_FPER)
            gendered.append(_I_FPER if in_female_entity else _I_MPER)
        else:
            # Not a person tag
            gendered.append(tag)
    return gendered


def tokenize_and_distribute_tags(examples):
    """Tokenize a batch of pre-split sentences and build per-token labels.

    Args:
        examples: a batch with parallel "tokens" (list of word lists) and
            "ner_tags" (list of tag-id lists) columns.

    Returns:
        The tokenizer output for the batch (truncated/padded to 128 tokens)
        with an added "labels" entry. Every sub-word token receives the
        gender-split tag of the word it came from; tokens with no source
        word (word id None) receive -100, the value that compute_metrics
        filters out.
    """
    tokenized_inputs = tokenizer(
        examples["tokens"],
        truncation=True,
        is_split_into_words=True,
        padding='max_length',
        max_length=128
    )

    labels = []
    for i, (tokens, tags) in enumerate(zip(examples["tokens"], examples["ner_tags"])):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        word_labels = _gender_person_tags(tokens, tags)
        # Distribute each word's tag to all of its sub-word tokens.
        labels.append(
            [-100 if word_id is None else word_labels[word_id] for word_id in word_ids]
        )

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

# Tokenize every split in batches; each batch is handed to
# tokenize_and_distribute_tags, which adds the aligned "labels" column.
tokenized_datasets = dataset.map(
    function=tokenize_and_distribute_tags,
    batched=True,
)

Map:   0%|          | 0/14041 [00:00<?, ? examples/s]

Gennadi
Ferenc
Fraser
Jeroen
Yevgeny
Hendrik
Mauricio
Arnaud
Hernan
Jakob
Johnny
Cy
Chili
Mo
Jaha
COCU
Luc
Nilis
Cocu
Antonio
Ciriaco
Murat
Lanner
Retief
Des
Lee
Welch
Gildea
Lynda
Graf
Yayuk
Graf
Monica
Yevgeny
Graf
Anke
Graf
Martinez
Stoltenberg
Malu
Ahmed
Mirhunisa
Komarica
Ronny
Ro
Downer
Qian
Downer
Said
Said
Mohamed
Hosni
Kevorkian
Kevorkian
Kevorkian
Geoffrey
Smith
Kevorkian
Kevorkian
Smith
Kevorkian
Lou
Kevorkian
Judith
Hans-Otto
Akashi
Yevgeny
Inderjit
Rangarajan
Rangarajan
Rangarajan
Saint
Agnes
Teresa
Therese
Jesus
Westlake
Chua
Chua
Leonid
Kuchma
Kuchma
Kuchma
Musa
Dmitry
Johnny
Asa
Akram
Enqvist
Enqvist
Korda
Enqvist
Davenport
Davenport
Rocky
Lee
Clemens
Jacob
Otis
Nail
Haile
Sobotzik
Wilmots
Diego
Jonzon
Barry
Des
Smyth
Ato
Allen
Ludmila
Brigita
Aliuska
Engquist
Gail
Devers
Gwen
Ilke
Franka
Natalya
Nicoleta
Ludmila
Aliuska
Dionne
Brigita
Dawn
Allen
Emilio
Paula
Ato
Laurent
Gail
Gwen
Leah
Sarah
Sinead
Moses
Gideon
Larbi
Eliud
Anthuan
Ato
Lamont
Veerle
Anneke
Marie-Jose
Mai

Map:   0%|          | 0/3250 [00:00<?, ? examples/s]

Nasser
Such
SHEARER
Adams
Shearer
Hoddle
Shearer
Teddy
Shearer
Cosmin
Mihai
Anatoly
Camacho
Rodney
Ines
Filippo
Asa
Hendrik
Monica
Larry
Jesper
Russ
HENKE
Henke
Jesper
Henke
Henke
Estes
SILVA
Mauro
Silva
Jesse
Gail
Ludmila
Aliuska
Brigita
Dionne
Gillian
Leah
Emilio
Falk
Ato
Astrid
Claudia
Venuste
Laban
Marko
Lars
Vasily
Virgilijus
Gail
Gwen
Gete
Rose
Tegla
Gunhild
Sammy
Nico
Adem
Vebjoen
Igor
Maksim
Igor
Dmitri
Natalya
El
Shem
Brahim
Fita
Armen
Sigurd
Natalya
Rita
Jesse
Oeji
Falk
Bisconti
Gianluigi
Emiliano
Jayasuriya
Vaas
Ranatunga
Jayasuriya
Vass
de
Jayasuriya
Healy
A.de
Vaas
Law
Aravinda
Stuart
Waugh
Romesh
Asanka
Aravinda
Upul
Luli
Harry
Keith
Irwin
Harte
McLoughlin
Townsend
Nagoum
Maskhadov
Maskhadov
Alija
Amra
Larisa
Semyon
Zenon
Yuri
Sherbatov
Rajab
Maskhadov
Valery
Ivankov
Lavrentyeva
Levrentyeva
Emese
Laszlo
Ronny
Norodom
Sihanouk
Sihanouk
Norodom
Mishi
Gaddafi
Mohammed
Samer
Gaddafi
Gaddafi
Gaddafi
Geraghty
Geraghty
Geraghty
Geraghty
Reaser
Erin
Volker
Ruehe
Yitzhak
Ezer
Jupp

Map:   0%|          | 0/3453 [00:00<?, ? examples/s]

Igor
Takuya
Hiroshige
Shu
CUTTITTA
Cuttitta
Coste
Cuttitta
Coste
Coste
Paolo
Diego
Massimo
Franco
Massimo
Giambatista
Takuya
Kenichi
Takagi
Miura
Naoki
Kenichi
Hiroshige
Naoki
Masami
Motohiro
Masakiyo
Takuya
Hiroshi
Bachar
Nihad
Mohammed
Ali
Khaled
Jesper
Jean-Luc
Korneilus
Candice
Tae
Ljudmila
Takuya
Shkvyrin
Shatskikh
Zahoor
Cairns
Inzamamul
Astle
Harris
Shahid
Harris
Astle
Harris
Harris
Harris
Cairns
Harris
Astle
Afridi
Afridi
Shahid
BLINKER
Blinker
Blinker
BOWYER
Lee
Bowyer
Bowyer
Bowyer
Rob
Campo
Rob
Rob
Pat
Des
Schalk
Desvonde
Botes
Sammy
Trevor
Don
Anghel
Viorel
Iordanescu
Florin
Tibor
Iulian
Basarab
Dorinel
Ovidiu
Ioan
Ionel
Viorel
Blewett
Stuart
Healy
Warne
Moody
Healy
Blewett
Moody
Blewett
Warne
Warne
Reiffel
Moody
Blewett
Warne
Hooper
Hooper
Ambrose
Walsh
Hooper
Courtney
Lloyd
Healy
Courtney
Stuart
Sherwin
Junior
Nixon
Curtly
Courtney
Roland
Rashid
Soren
Poul-Erik
Budi
Ye
Ye
Aly
Mohamed
Faulk
Harbaugh
Kerwin
Faulk
Ty
PACE
Pace
Pace
Pace
Japhet
Claude
Gourvennec
Stickroth
Wos

In [9]:
# Metric function
metric = evaluate.load("seqeval")

def compute_metrics(p):
    """Score a batch of predictions with seqeval.

    Args:
        p: a (predictions, labels) pair. The predictions are per-token class
            scores, reduced to tag ids with argmax over axis 2; the labels
            are the reference tag ids, with -100 marking positions to skip.

    Returns:
        dict: overall precision, recall, f1 and accuracy from seqeval.
    """
    scores, gold_ids = p
    pred_ids = np.argmax(scores, axis=2)

    true_labels = []
    model_predictions = []
    for pred_seq, gold_seq in zip(pred_ids, gold_ids):
        gold_tags = []
        pred_tags = []
        for pred_id, gold_id in zip(pred_seq, gold_seq):
            # Positions labelled -100 are excluded from both sequences.
            if gold_id == -100:
                continue
            gold_tags.append(label_list[gold_id])
            pred_tags.append(label_list[pred_id])
        true_labels.append(gold_tags)
        model_predictions.append(pred_tags)

    results = metric.compute(predictions=model_predictions, references=true_labels)
    return {
        name: results[f"overall_{name}"]
        for name in ("precision", "recall", "f1", "accuracy")
    }


In [10]:
# Hyperparameters a reader is most likely to tune
batch_size = 64
epochs = 1

training_args = TrainingArguments(
    # Where checkpoints and logs are written
    output_dir="./results",
    logging_dir='./logs',
    # Optimisation settings
    num_train_epochs=epochs,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    # Evaluate and checkpoint once per epoch; log every 10 steps
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_steps=10,
    # Keep everything local: no Hub upload, no external experiment tracker
    push_to_hub=False,
    report_to="none",
)

# Build the trainer: train on the training split, evaluate on validation
trainer = Trainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
)


Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [None]:
# Train the model
# Runs fine-tuning with the datasets and arguments given to `trainer` above.
trainer.train()

# Evaluate the model
# No dataset is passed here, so this presumably scores the eval_dataset
# (validation split) the Trainer was constructed with.
results = trainer.evaluate()
print("Evaluation Results:", results)

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss


In [None]:
# Make predictions on the test set
predictions = trainer.predict(tokenized_datasets["test"])
# Reduce the per-token class scores to a single tag id per token
# (same argmax over axis 2 that compute_metrics applies).
pred_labels = np.argmax(predictions.predictions, axis=2)
# Reference tag ids for the test split, as returned with the predictions;
# presumably still containing -100 at positions without a word - verify
# before comparing against pred_labels.
true_labels = predictions.label_ids