# Fine-Tuning LLM for Named Entity Recognition (NER) using HuggingFace transformers library

Token classification assigns a class to individual tokens in a text. In named entity recognition those classes are entity types such as person (names), location (places), or organization (company names).  

The example below shows how to fine-tune DistilBERT on the WNUT 17 dataset, save the model to the Hugging Face model hub, and then run inference with the saved fine-tuned model.

#### Import packages


In [None]:
# Import Packages
from huggingface_hub import notebook_login
import torch
from transformers import AutoTokenizer, AutoModelForTokenClassification, DataCollatorForTokenClassification, TrainingArguments, Trainer
import evaluate
from datasets import load_dataset
import numpy as np 

#### Login to HF (optional)


Use 

notebook_login()

OR 

run `huggingface-cli login` in terminal

i.e. type `huggingface-cli login` in a terminal to log in to Hugging Face. Logging in lets you push your saved models to the Hugging Face Hub, where they can be shared openly with all users and loaded again for future use.



#### Load Dataset


In [None]:
# The NER tags follow a BIO-style scheme:
#   B-<type>  the first token of an entity, e.g. "Paris" in "Paris France" -> B-location
#   I-<type>  a later token of the same entity, e.g. "France" in "Paris France" -> I-location
#   O         the token is not part of any entity
# (Kept as comments rather than a string literal so that the last expression
# of the cell is `dataset` and its summary is what gets displayed.)
dataset = load_dataset("wnut_17")
dataset

#### Generate labels for each token


In [None]:
# Build the two lookup tables between integer tag ids and tag names, using the
# tag names stored on the 'ner_tags' feature of the train split.
tag_names = dataset['train'].features['ner_tags'].feature.names
id2label = dict(enumerate(tag_names))
label2id = {name: tag_id for tag_id, name in id2label.items()}
label2id
# Tokens i.e. list of words for each sequence
# dataset['train']['tokens']

#### Tokenize dataset

In [None]:
# Load the tokenizer of the same DistilBERT checkpoint that is fine-tuned later on.
tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased")
tokenizer

#### Preprocessing Dataset Example 

In [None]:

The cell below tokenizes a single example for illustration only. To preprocess the entire dataset, wrap the same steps in a preprocessing function and apply it with `map`.
 
# Tokenize the first training sentence. The sentence is already a list of
# words, hence is_split_into_words=True.
first_sentence_words = dataset['train']['tokens'][0]
example_input_info = tokenizer(
    first_sentence_words,
    is_split_into_words=True,
    truncation=True,
    max_length=50,
)
# tokenizer.decode(example_input_info['input_ids'])
example_tokens = tokenizer.convert_ids_to_tokens(example_input_info['input_ids'])
example_tokens
example_input_info['input_ids'][:10], example_tokens[:10]

example_input_info.word_ids(batch_index=0)
# Print every token id next to its position in the sequence.
for position, token_id in enumerate(example_input_info['input_ids']):
    print(position, token_id)
  

#### Re-align token labels with the word ids then ner labels


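Tokenizing pre-split words (`is_split_into_words=True`) can break a single word into
several subword tokens and adds special tokens, so the tokens no longer line up
one-to-one with the word-level labels (NER tags).
Therefore the labels have to be re-aligned to the tokens using the word ids. Word ids
are ids where every subword token of the same word has the same id. From there, only the
first token of each word keeps that word's NER tag and the remaining subword tokens are set
to -100 (the label value the loss function ignores). Additionally,
special tokens don't have word ids so they are represented as None values. Afterwards,
they are converted to -100 values in the NER tag outputs as well.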

e.g. 
sequence: ['playground']
tokens (with is_split_into_words=True): ['[CLS]', 'play', 'ground', '[SEP]']
word ids: [None, 0, 0, None]
labels: [-100, 5, -100, -100]   (5 stands for the NER tag id of the word 'playground')

In [None]:


def preprocess_dataset(sequences: dict):
    """Tokenize a batch of pre-split sentences and align the NER tags to the tokens.

    Meant to be applied with ``dataset.map(preprocess_dataset, batched=True)``.
    Relies on the module-level ``tokenizer``.

    Args:
        sequences: A dict-like batch with two parallel columns: ``'tokens'``
            (one list of words per sentence) and ``'ner_tags'`` (one list of
            integer tag ids per sentence, one tag per word).

    Returns:
        The tokenizer output for the batch with an extra ``'labels'`` entry.
        ``'labels'`` holds one list per sentence, with one value per token:
        the word's tag id on the first token of each word, and -100 on
        special tokens and on the remaining subword tokens of a word.
    """
    # Tokenize every sentence of the batch into subword tokens (special tokens
    # included) and convert them to token ids in one call,
    # e.g. ["playground"] -> ["[CLS]", "play", "ground", "[SEP]"] -> [101, 344, 9948, 102]
    tokenized_info = tokenizer(sequences['tokens'], truncation=True, is_split_into_words=True)

    all_new_ner_tags = []

    for batch_idx, word_tags in enumerate(sequences['ner_tags']):
        # word_ids maps every token of this sentence to the index of the word
        # it came from; special tokens map to None and all subword tokens of
        # one word share the same index,
        # e.g. ["[CLS]", "play", "ground", "[SEP]"] -> [None, 0, 0, None]
        word_ids = tokenized_info.word_ids(batch_index=batch_idx)
        prev_word_id = None
        aligned_tags = []

        # Keep the tag only on the first token of each word; everything else
        # gets -100 so that it is ignored when the loss is computed.
        # e.g. word ids [None, 0, 1, 1, None] with word tags [7, 0]
        #      -> labels [-100, 7, 0, -100, -100]
        for word_id in word_ids:
            if word_id is None or word_id == prev_word_id:
                aligned_tags.append(-100)
            else:
                aligned_tags.append(word_tags[word_id])
            prev_word_id = word_id
        all_new_ner_tags.append(aligned_tags)

    # Attach the aligned tags to the tokenizer output under the 'labels' key
    tokenized_info['labels'] = all_new_ner_tags
    return tokenized_info

            
# Apply the preprocessing function to every split of the dataset.
# (The previous call referenced `data` and
# `create_new_input_ids_for_new_token_list`, neither of which is defined in
# this notebook, so it raised a NameError.)
new_data = dataset.map(preprocess_dataset, batched=True)
new_data
new_data['train']['labels'][4]
        
        # Option 2: save results in-place
    #     current_word_id = None
    #     for word_idx, word_id in enumerate(single_sequence_ids):
    #         if word_id is None or word_id == current_word_id:
    #             single_sequence_ids[word_idx] = -100
    #             current_word_id = single_sequence_ids[word_idx]
    # input_info['labels'] = input_ids
    # return input_info


In [None]:
# batched=True is required here: without it the preprocessing function fails
# with "IndexError: list index out of range".
tokenized_dataset = dataset.map(preprocess_dataset, batched=True)

tokenized_dataset
tokenized_dataset['train']['labels']
# Print the length of every aligned label sequence in the train split.
for label_sequence in tokenized_dataset['train']['labels']:
    print(len(label_sequence))

  #### Dynamic Padding

Notice above the len of each sequence is mismatched. 
Each sequence greater than the max length limit was truncated to the max length, however sequences 
with a length less than the maximum will still be short. Therefore padding still needs to take 
place. 
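Q.) But if I add padding=True, padding=max_length or max_length=50 to the 
tokenizer call, then why can I still end up with mismatched lengths? 
A.) `max_length` on its own only sets the truncation limit; it does not pad anything. 
`padding=True` pads only up to the longest sequence in the batch the tokenizer is called on, 
so inside a batched `map` the lengths still differ from one map batch to the next. 
(`padding="max_length"` together with `max_length` does give every sequence the same length, 
but it pads every sequence all the way to the limit, which wastes computation on short batches.) 
To solve this problem, the sequences that are fed to the model together must be padded to the same length. 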
There are two obvious ways to do this: 
1. pad the whole list of sequences at once, or 
2. pad one batch of sequences at a time. The first option is cumbersome and can be computationally expensive, 
especially with a large amount of data. Essentially, it involves just determining the sequence with 
the max length and then padding the remaining sequences with zeros (left and/or right padding) 
to reach that max length. The better option is to extract a batch of data 
(n samples of the whole dataset) and pad to the longest sequence in that batch. 
This means all sequences within a batch have the same length, but that length can differ from one batch to the next.

e.g. dynamic padding with 2 batches of 3 sequences each, truncation=True and max_length=7

batch 1
[16,28,43]
[86,25,47,54,765,3,45,23,66,75,89,43]
[76,55,987,4,1,22,33,16,54]

dynamic padding: batch 1 
[16,28,43,0,0,0,0]
[86,25,47,54,765,3,45]
[76,55,987,4,1,22,33]

batch 2
[245,98,8,74,14,75,3]
[6,30,5,7]
[66,88,31]

dynamic padding: batch 2
[245,98,8,74,14,75,3]
[6,30,5,7,0,0,0]
[66,88,31,0,0,0,0]

In [None]:
# Collator used by the Trainer to assemble batches; it pads the sequences of
# each batch to a common length (the dynamic padding described above).
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)
data_collator

In [None]:
# Display the tag id -> tag name mapping for reference.
id2label

#### Evaluation 

In [None]:
# Seqeval is an easy-to-use library for model evaluation that includes various
# metrics, like precision, recall, accuracy, f1 score, etc.

# The loaded metric is used by the evaluation function that is later handed to
# the Trainer class, so the evaluation runs while the model is training.
seqeval = evaluate.load("seqeval")
seqeval

In [None]:
# Create the function that computes the evaluation metrics.
# It uses the mapping dict for the NER labels (id2label) to turn tag ids
# into tag names.
# The function is passed to the Trainer through its compute_metrics argument,
# and the metrics it returns are reported for each evaluation.
# (The hyperparameters to tune are held separately, in TrainingArguments.)
# Input scores are of size n x 13 (one score per NER tag for each token).
def compute_metrics(predictions_for_each_ner_tag):
    """Compute the overall seqeval scores for one evaluation pass.

    Args:
        predictions_for_each_ner_tag: A pair ``(probs, labels)``. ``probs``
            holds the per-tag scores and is reduced with an argmax over
            axis 2; ``labels`` holds the reference tag ids, with -100 on the
            positions that must be ignored.
            (assumes probs is (batch, sequence, tags) — TODO confirm)

    Returns:
        A dict with the keys 'precision', 'f1', 'recall' and 'accuracy',
        taken from seqeval's ``overall_*`` results.
    """
    probs, labels = predictions_for_each_ner_tag
    print('labels', labels)
    predictions = np.argmax(probs, axis=2)

    # Reference tag names, skipping the ignored (-100) positions.
    true_labels = []
    for label_row in labels:
        true_labels.append([id2label[tag] for tag in label_row if tag != -100])
    print('true labels', true_labels)

    # Predicted tag names at exactly the positions that carry a real label.
    true_predictions = []
    for prediction_row, label_row in zip(predictions, labels):
        kept = [id2label[pred] for pred, tag in zip(prediction_row, label_row) if tag != -100]
        true_predictions.append(kept)

    results = seqeval.compute(predictions=true_predictions, references=true_labels)
    print('RESULTS------', results)

    # Only the overall scalars are returned. The per-entity entries of
    # `results` are dicts, and the Trainer drops them because TensorBoard's
    # add_scalar() cannot log a dict as a scalar.
    overall_scores = {}
    for metric_name in ('precision', 'f1', 'recall', 'accuracy'):
        overall_scores[metric_name] = results['overall_' + metric_name]
    return overall_scores

#### Create model or use pre-trained one to fine-tune

In [None]:
# Load the pre-trained DistilBERT checkpoint for token classification, with one
# output label per NER tag; the two mappings are passed along so the model
# knows the tag names.
model = AutoModelForTokenClassification.from_pretrained(
    "distilbert/distilbert-base-uncased",
    id2label=id2label,
    label2id=label2id,
    num_labels=len(id2label),
)
model

##### Instantiate a TrainingArguments object with a set of arguments

In [None]:

# Training configuration. Evaluation and checkpointing both happen once per
# epoch, and the best checkpoint is reloaded when training ends.
training_args = TrainingArguments(
    # where to save, and whether to upload to the Hub
    output_dir="./ner_model_wnut17",
    push_to_hub=True,
    # optimisation
    learning_rate=2.5e-5,
    weight_decay=0.05,
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # evaluation / checkpoint schedule
    evaluation_strategy="epoch",
    save_strategy='epoch',
    load_best_model_at_end=True,
)
training_args


In [None]:
# Display the tokenized dataset (splits and columns) before handing it to the Trainer.
tokenized_dataset

In [None]:

#### Instantiate Trainer object

# The two dictionaries id2label and label2id were created earlier and passed to the model.
# They are helpful as a reference when outputting predictions. 


# Bring the model, the training configuration, the data, the collator and the
# metric function together in one Trainer.
# NOTE(review): evaluation (and therefore best-checkpoint selection) runs on
# the 'test' split; if the dataset also has a 'validation' split, consider
# using that instead — confirm.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['test'],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)
trainer

In [None]:
#### Train model
# Run the fine-tuning with the configuration given to the Trainer.
trainer.train()

#### Inference after pushing model to Huggingface Hub


##### Inference Method 1: use pipeline class

In [None]:
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
import torch
from datasets import load_dataset

# Example sentence to run inference on
sequence = "The man works in Indiana for Prolific Inc."

# Build an NER pipeline from the fine-tuned model on the Hub, then run it on the sentence
pipe = pipeline(task="ner", model="dstaples08/ner_model_wnut17")
pipe
pipe(sequence)


##### Inference Method 2: Replicate the results of the pipeline

In [None]:
# Load the fine-tuned tokenizer and model from the Hub.
hub_model_id = "dstaples08/ner_model_wnut17"
tokenizer = AutoTokenizer.from_pretrained(hub_model_id)
model = AutoModelForTokenClassification.from_pretrained(hub_model_id)
tokenizer
model
# Two encodings of the same sentence: the result of encode() and the
# tokenizer's full output, which is what gets unpacked into the model.
seq_vector = tokenizer.encode(sequence, return_tensors="pt")
tokenized_info = tokenizer(sequence, return_tensors="pt")
tokenized_info
seq_vector
# Forward pass only; no gradients are needed for inference.
with torch.no_grad():
    model_info = model(**tokenized_info)
# logits = model(tokenized_info["input_ids"])["logits"]
model_info
logits = model_info["logits"]
logits
(logits.shape,)
label_map = model.config.id2label
label_map
# Pick the highest-scoring label id for every token.
predictions_label_ids = torch.argmax(logits, dim=2).tolist()
predictions_label_ids
sequence
# Translate the label ids of the (single) sentence into tag names.
predictions = []
for label_id in predictions_label_ids[0]:
    predictions.append(label_map[label_id])
predictions
