<a href="https://colab.research.google.com/github/Daryldactyl/Training_Transformers/blob/main/Named_Entity_Recognition.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# NER (Named Entity Recognition)
### Import the Dataset

In [1]:
!pip install transformers datasets

In [2]:
from datasets import load_dataset

# Load the 'conll2003' dataset; the displayed result is a DatasetDict with
# train / validation / test splits.
data = load_dataset('conll2003')
data

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 14041
    })
    validation: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3250
    })
    test: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3453
    })
})

### Visualize the Dataset

In [3]:
# First training example: the word tokens plus integer-encoded POS, chunk and NER tags.
data['train'][0]

{'id': '0',
 'tokens': ['EU',
  'rejects',
  'German',
  'call',
  'to',
  'boycott',
  'British',
  'lamb',
  '.'],
 'pos_tags': [22, 42, 16, 21, 35, 37, 16, 21, 7],
 'chunk_tags': [11, 21, 11, 12, 21, 22, 11, 12, 0],
 'ner_tags': [3, 0, 7, 0, 0, 0, 7, 0, 0]}

In [4]:
# String names of the NER tags; the integer ids stored in 'ner_tags' index into this list.
label_names = data['train'].features['ner_tags'].feature.names
label_names

['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']

### Build the Tokenizer

In [5]:
from transformers import AutoTokenizer

# The same checkpoint name is reused further down to load the model,
# so the tokenizer and the model stay matched.
checkpoint = 'distilbert-base-cased'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [6]:
# Tokenize one example. The sentence is already a list of words, hence is_split_into_words=True.
idx = 0
t = tokenizer(data['train'][idx]['tokens'], is_split_into_words=True)
t

{'input_ids': [101, 7270, 22961, 1528, 1840, 1106, 21423, 1418, 2495, 12913, 119, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [7]:
# Sub-word tokens of the example, including the special tokens added by the tokenizer.
t.tokens()

['[CLS]',
 'EU',
 'rejects',
 'German',
 'call',
 'to',
 'boycott',
 'British',
 'la',
 '##mb',
 '.',
 '[SEP]']

#### Target alignment
- As we can see, the word `lamb` is split into two tokens, and the `[CLS]` and `[SEP]` tokens have been added. To match our `ner_tags`, the tag for `lamb` must be repeated so that both of its tokens get a label.
- We also need to create labels for the `[CLS]` and `[SEP]` tokens.
- When a word is split, the tag of its later tokens must change from `B-` to `I-` (for example from `B-PER` to `I-PER`), because B stands for Beginning and I stands for Inside.

In [8]:
# Word ids for the first sentence: each token is mapped to the index of the word it came from.
# Special tokens get None, and a word split into several tokens repeats its index.
t.word_ids()

[None, 0, 1, 2, 3, 4, 5, 6, 7, 7, 8, None]

In [9]:
# Tag ids, in order:
# ['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']
# Every B- tag sits at an odd id with its I- partner at the next id, so the
# begin -> inside lookup is simply "odd id -> id + 1".
begin2inside = {begin_id: begin_id + 1 for begin_id in (1, 3, 5, 7)}

In [10]:
def align_targets(labels, word_ids):
  """Expand word-level tag ids into one tag id per token.

  Args:
    labels: tag ids, one per word of the sentence.
    word_ids: for every token, the index of the word it belongs to,
      or None for tokens that belong to no word (e.g. [CLS] / [SEP]).

  Returns:
    A list as long as `word_ids`. Tokens without a word get -100; the first
    token of a word gets that word's tag; any further token of the same word
    gets the word's tag with a "B-" id swapped for its "I-" id via
    `begin2inside`.
  """
  aligned = []
  previous = None
  for current in word_ids:
    if current is None:
      tag = -100
    else:
      tag = labels[current]
      if current == previous:
        # continuation of a split word: a beginning tag becomes an inside tag
        tag = begin2inside.get(tag, tag)
    aligned.append(tag)
    previous = current
  return aligned

In [11]:
# Try the function on the first sentence: collect its word-level tags and token-level word ids.
# There are fewer labels than word ids because of the special tokens and the split word.
labels = data['train'][idx]['ner_tags']
word_ids = t.word_ids()
print(f'Labels: {labels} Total: {len(labels)}\nWord IDs: {word_ids} Total: {len(word_ids)}')

Labels: [3, 0, 7, 0, 0, 0, 7, 0, 0] Total: 9
Word IDs: [None, 0, 1, 2, 3, 4, 5, 6, 7, 7, 8, None] Total: 12


In [12]:
aligned_targets = align_targets(labels, word_ids)
aligned_targets

# Walk-through of the loop: the leading None becomes -100; word 0 gets label 3, word 1 gets label 0,
# and so on up to word 7.
# The second token of word 7 reuses that word's label (0); 0 is not a key of begin2inside, so it stays 0.

[-100, 3, 0, 7, 0, 0, 0, 7, 0, 0, 0, -100]

In [13]:
# Translate tag ids to tag names for display; negative ids (-100) have no name and show as None.
align_labels = [None if tag_id < 0 else label_names[tag_id] for tag_id in aligned_targets]
for token, tag_name in zip(t.tokens(), align_labels):
  print(f'{token}\t{tag_name}')

[CLS]	None
EU	B-ORG
rejects	O
German	B-MISC
call	O
to	O
boycott	O
British	B-MISC
la	O
##mb	O
.	O
[SEP]	None


#### Now that target alignment is confirmed to be working, let's build the tokenize function

In [14]:
def tokenize_fn(batch):
  """Tokenize a batch of pre-split sentences and attach token-aligned labels.

  Args:
    batch: a mapping with 'tokens' (one list of words per sentence) and
      'ner_tags' (one list of word-level tag ids per sentence).

  Returns:
    The tokenizer output for the batch with an extra 'labels' entry holding,
    for each sentence, the result of `align_targets` on that sentence's tags
    and word ids.
  """
  encoded = tokenizer(batch['tokens'], truncation=True, is_split_into_words=True)

  # word_ids(row) gives the token -> word mapping of sentence `row` within this batch
  encoded['labels'] = [
      align_targets(sentence_tags, encoded.word_ids(row))
      for row, sentence_tags in enumerate(batch['ner_tags'])
  ]

  return encoded

In [15]:
# Remember: for our model we need a column named input_ids and another named labels.
# These are the original columns, which get replaced during tokenization.
data['train'].column_names

['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags']

In [16]:
# Apply tokenize_fn to every split in batches and drop the original columns,
# leaving only the tokenizer outputs and the aligned labels.
tokenized_datasets = data.map(
    tokenize_fn,
    batched=True,
    remove_columns = data['train'].column_names
)

tokenized_datasets

Map:   0%|          | 0/3250 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 14041
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 3250
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 3453
    })
})

The tokenizer creates the input_ids and attention mask column and our tokenize function creates the labels column

### Instantiate the Data Collator
The data collator pads every example in a batch to the same length. It pads the attention mask as well, so the model knows to ignore the padded positions during training.

In [17]:
from transformers import DataCollatorForTokenClassification

# Collator that batches tokenized examples; it is given the tokenizer so it can pad with it.
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [18]:
# Two raw (un-padded) training examples; note that they have different lengths.
[tokenized_datasets['train'][i] for i in range(2)]

[{'input_ids': [101,
   7270,
   22961,
   1528,
   1840,
   1106,
   21423,
   1418,
   2495,
   12913,
   119,
   102],
  'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
  'labels': [-100, 3, 0, 7, 0, 0, 0, 7, 0, 0, 0, -100]},
 {'input_ids': [101, 1943, 14428, 102],
  'attention_mask': [1, 1, 1, 1],
  'labels': [-100, 1, 2, -100]}]

In [19]:
# Collate the same two examples into one batch and look at the padded labels.
batch = data_collator([tokenized_datasets['train'][i] for i in range(2)])
batch['labels']

tensor([[-100,    3,    0,    7,    0,    0,    0,    7,    0,    0,    0, -100],
        [-100,    1,    2, -100, -100, -100, -100, -100, -100, -100, -100, -100]])

Notice that the padding for the labels is marked as -100 so that it is ignored by the model.

### Metrics
Because we are classifying every token of variable-length sequences, plain accuracy is not a good measurement on its own.

We will use seqeval, which is designed for computing metrics on NLP tasks with sequence targets.

Keep in mind that positions labelled -100 (special and padding tokens) must be excluded when computing the loss and the metrics. Otherwise they would skew the scores, and the model could boost its accuracy simply by predicting padding.

In [20]:
# seqeval is needed by the 'seqeval' metric loaded below.
!pip install seqeval



In [21]:
# NOTE(review): this call emitted a warning when the notebook was run (see output).
# `load_metric` is deprecated in recent `datasets` releases in favour of the separate
# `evaluate` package -- confirm against the installed version before re-running.
from datasets import load_metric

metric = load_metric('seqeval')

  metric = load_metric('seqeval')


In [22]:
# Try the metric on a tiny example. seqeval expects string tag names (not integer ids),
# given as one list of tags per sentence.
metric.compute(
    predictions = [['O', 'O', 'I-ORG', 'B-MISC']],
    references = [['O', 'B-ORG', 'I-ORG', 'B-MISC']]
)

{'MISC': {'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'number': 1},
 'ORG': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 1},
 'overall_precision': 0.5,
 'overall_recall': 0.5,
 'overall_f1': 0.5,
 'overall_accuracy': 0.75}

#### Setup our compute metrics function

In [23]:
import numpy as np

def compute_metrics(logits_and_labels):
  """Turn raw model outputs into overall seqeval scores.

  Args:
    logits_and_labels: a pair (logits, labels). The predicted tag id of each
      token is the argmax of `logits` over the last axis; `labels` holds the
      target tag ids, with -100 at positions that must be ignored.

  Returns:
    A dict with the overall 'precision', 'recall', 'f1' and 'accuracy'
    reported by the seqeval metric.
  """
  logits, labels = logits_and_labels
  preds = np.argmax(logits, axis=-1)

  # Targets as tag names, skipping every ignored (-100) position.
  str_labels = []
  for label_row in labels:
    str_labels.append([label_names[t] for t in label_row if t != -100])

  # Predictions as tag names. A position is kept or dropped based on its *target*,
  # so each prediction sequence has the same length as its target sequence.
  str_preds = []
  for pred_row, label_row in zip(preds, labels):
    str_preds.append([label_names[p] for p, t in zip(pred_row, label_row) if t != -100])

  scores = metric.compute(predictions=str_preds, references=str_labels)

  return {
      'precision': scores['overall_precision'],
      'recall': scores['overall_recall'],
      'f1': scores['overall_f1'],
      'accuracy': scores['overall_accuracy']
  }

### Load in a Model and Fine-Tune with our Dataset

In [24]:
# Give the model the real tag names so its outputs read e.g. 'B-PER' instead of LABEL_0, LABEL_1, ...
id2label = dict(enumerate(label_names))
label2id = {name: tag_id for tag_id, name in id2label.items()}

In [25]:
from transformers import AutoModelForTokenClassification

# Load the pretrained checkpoint with a token-classification head and our tag-name mappings.
# The classifier weights are newly initialised (see the warning in the output), so the model
# has to be fine-tuned before it is useful.
model = AutoModelForTokenClassification.from_pretrained(checkpoint, id2label=id2label, label2id=label2id)
model

Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


DistilBertForTokenClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
    

### Set up Training Arguments and Trainer

In [26]:
from transformers import TrainingArguments, Trainer

In [None]:
# Extra install of transformers with its PyTorch extras.
# NOTE(review): this cell has no execution count; on a fresh runtime, run it before building the Trainer.
!pip install transformers[torch]

In [None]:
# Install / upgrade accelerate.
# NOTE(review): this cell has no execution count; on a fresh runtime, run it before building the Trainer.
!pip install accelerate -U

In [32]:
# Training configuration: evaluate and save a checkpoint once per epoch, train for a single epoch.
# Checkpoints are written under the 'distilbert-finetuned-ner' directory.
training_args = TrainingArguments(
    output_dir='distilbert-finetuned-ner',
    eval_strategy='epoch',
    save_strategy='epoch',
    learning_rate=2e-5,
    num_train_epochs=1,
    weight_decay=0.01
)

In [33]:
# Wire everything together: model, training arguments, train/validation splits,
# the padding collator and the seqeval-based metric function.
# NOTE(review): newer transformers releases may prefer `processing_class=` over `tokenizer=` -- confirm
# against the installed version.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['validation'],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer
)

In [34]:
# Fine-tune the model; the table in the output shows the validation metrics after the epoch.
trainer.train()

Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.0925,0.08447,0.880241,0.907943,0.893878,0.975776


TrainOutput(global_step=1756, training_loss=0.1487948910793574, metrics={'train_runtime': 4047.3827, 'train_samples_per_second': 3.469, 'train_steps_per_second': 0.434, 'total_flos': 153520489309824.0, 'train_loss': 0.1487948910793574, 'epoch': 1.0})

In [35]:
# Save the fine-tuned model to a local directory; the pipeline below loads it from this path.
trainer.save_model('Sentence_NER_Model')

### Load Saved Model to test on Token classification

In [36]:
from transformers import pipeline

# Build an inference pipeline from the directory saved above.
# With aggregation_strategy='simple' the results come back as whole entities
# (e.g. 'Bill Gates' in the output below) rather than one entry per token.
ner = pipeline(
    'token-classification',
    model = 'Sentence_NER_Model',
    aggregation_strategy='simple',
)

In [37]:
# Run the fine-tuned model on a fresh sentence; each result carries the entity group,
# a confidence score, the matched text and its character offsets.
sentence = 'Bill Gates was the CEO of Microsoft in Seattle, Washington.'

ner(sentence)

[{'entity_group': 'PER',
  'score': 0.9970716,
  'word': 'Bill Gates',
  'start': 0,
  'end': 10},
 {'entity_group': 'ORG',
  'score': 0.9879116,
  'word': 'Microsoft',
  'start': 26,
  'end': 35},
 {'entity_group': 'LOC',
  'score': 0.9949267,
  'word': 'Seattle',
  'start': 39,
  'end': 46},
 {'entity_group': 'LOC',
  'score': 0.9943825,
  'word': 'Washington',
  'start': 48,
  'end': 58}]

In [43]:
# Human-readable name for each entity group the pipeline can return. The tag set also
# contains MISC (see label_names), so it needs an entry too; without it the lookup
# below raises KeyError as soon as the model finds a MISC entity.
mapping = {'PER': 'Person', 'ORG': 'Organization', 'LOC': 'Location', 'MISC': 'Miscellaneous entity'}
for entity in ner(sentence):
  type_of_entity = entity['entity_group']
  # Fall back to the raw group name for anything not listed in the mapping.
  readable_type = mapping.get(type_of_entity, type_of_entity)
  # Pick the article from the word that follows it instead of hard-coding one label.
  a_or_an = 'an' if readable_type[:1].lower() in ('a', 'e', 'i', 'o', 'u') else 'a'
  print(f"I am {round(entity['score'] * 100, 2)}% sure that {entity['word']} is {a_or_an} {readable_type}")

I am 99.71% sure that Bill Gates is a Person
I am 98.79% sure that Microsoft is an Organization
I am 99.49% sure that Seattle is a Location
I am 99.44% sure that Washington is a Location
