
### Named Entity Recognition (NER) by directly using the bert-base-NER model in Hugging Face


# Install Transformers and Datasets from Hugging Face

In [None]:
# Transformers installation
! pip install transformers[torch] datasets
# To install from source instead of the last release, comment the command above and uncomment the following one.
# ! pip install git+https://github.com/huggingface/transformers.git

Collecting datasets
  Downloading datasets-2.18.0-py3-none-any.whl (510 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 510.5/510.5 kB 2.6 MB/s eta 0:00:00
Collecting accelerate>=0.21.0 (from transformers[torch])
  Downloading accelerate-0.29.2-py3-none-any.whl (297 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 297.4/297.4 kB 18.2 MB/s eta 0:00:00
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 116.3/116.3 kB 7.1 MB/s eta 0:00:00
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 194.1/194.1 kB 7.0 MB/s eta 0:00:00
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-an... (output truncated)

# NER as Token classification

# Load the Model and Tokenizer from bert-base-NER

In [None]:
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline

# The tokenizer and the token-classification model are loaded from the same checkpoint id.
checkpoint = "dslim/bert-base-NER"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForTokenClassification.from_pretrained(checkpoint)

# Create a Pipeline from the bert-base-NER Model and Tokenizer

In [None]:
# Build an "ner" pipeline that wraps the model and tokenizer loaded above.
nlp = pipeline("ner", model=model, tokenizer=tokenizer)

# Prepare a Text

In [None]:
text = "Apple Inc. plans to open a new store in San Francisco by January 2024. Tim Cook, the CEO, announced the news yesterday."

# Label Tokens with the Tags in the B-I-O Scheme

In [None]:
# Run the pipeline on the raw text. The result is consumed further down as a list
# of per-token dictionaries, of which the 'entity' and 'word' keys are read.
ner_results = nlp(text)
print(ner_results)

# Extract the Named Entities

In [None]:
# Group the token-level predictions into whole entities, keyed by entity type.
#
# The code below presumes that ner_results is a list of dictionaries, each
# representing a tagged token, arranged in the sequence they appeared in the
# source sentence. Only the 'entity' tag (B-XXX / I-XXX) and the 'word' text of
# each token are read.
organized_results = {'LOC': [], 'PER': [], 'ORG': [], 'MISC': []}

current_entity = None
current_words = []

for result in ner_results:
    # partition() never raises: a tag without '-' (such as 'O') simply yields
    # an empty entity type instead of an IndexError.
    prefix, _, entity_type = result['entity'].partition('-')

    if prefix == 'I' and entity_type == current_entity:
        # Continuation of the entity that is currently open.
        current_words.append(result['word'])
        continue

    # Any other tag ends the open entity, so store it before moving on.
    # setdefault() tolerates entity types beyond the four preset keys.
    if current_entity:
        organized_results.setdefault(current_entity, []).append(' '.join(current_words))

    if entity_type:
        # A B- tag, or an I- tag that does not continue the open entity, starts
        # a new entity rather than being silently dropped.
        current_entity = entity_type
        current_words = [result['word']]
    else:
        # Untyped tag: nothing is open any more.
        current_entity = None
        current_words = []

# Handle the last entity
if current_entity:
    organized_results.setdefault(current_entity, []).append(' '.join(current_words))

# Glue '##' continuation pieces back onto the preceding piece, so that
# 'Fr ##an ##cis ##co' becomes 'Francisco' (not 'Fr  an  cis  co').
for key, value in organized_results.items():
    merged_entities = []
    for word in value:
        word = word.replace(' ##', '')
        if word.startswith('##'):
            # An entity that begins with a continuation piece keeps its text only.
            word = word[2:]
        merged_entities.append(word)
    organized_results[key] = merged_entities

print(organized_results)


# Generate a List of Tokens and the Corresponding List of Entity Tags

In [None]:
# Split the pipeline output into two parallel lists: token text and predicted tag.
token_list = [entry['word'] for entry in ner_results]
tag_list = [entry['entity'] for entry in ner_results]

In [None]:
token_list, tag_list

# Let Us Test the Model on the CoNLL2003 Data


Start by loading the CoNLL2003 dataset from the Datasets library:

In [None]:
from datasets import load_dataset

# Load the dataset registered under the name "conll2003".
conll = load_dataset("conll2003")

The dataset has been split into train, test, and validation sets:

In [None]:
conll

Get the features in the datasets:

In [None]:
conll['test'].features

Get the list of tag names:

In [None]:
# Names of the NER tag classes; an integer tag id is used as an index into this list.
tag_names = conll["test"].features["ner_tags"].feature.names
tag_names

The letter that prefixes each `ner_tag` indicates the token position of the entity:

- `B-` indicates the beginning of an entity.
- `I-` indicates a token is contained inside the same entity (for example, the `State` token is a part of an entity like
  `Empire State Building`).
- `O` indicates the token doesn't correspond to any entity.

## Test the Model on a Single Test Example

Use the instance at index 12 in the test dataset as an example:

In [None]:
example = conll['test'][12]
# Show every field of the selected record, one field per line.
for field, value in example.items():
    print(field, ":", value)

Convert the tag ids to tag names to see what entities are recognized:

In [None]:
# Translate each integer tag id of the example into its tag name.
example_entities = [tag_names[tag_id] for tag_id in example['ner_tags']]

In [None]:
# Print each token of the example next to its tag name, prefixed with its position.
for position, token in enumerate(example['tokens']):
    print(position, token, ":", example_entities[position])

What is the number of original tokens in the given data?

In [None]:
len(example_entities)

Tokenize the input with the tokenizer. Set `is_split_into_words=True` so that the given list of tokens is processed correctly:

In [None]:
# is_split_into_words=True: the input is already a list of tokens, not a raw string.
tokenized_input = tokenizer(example['tokens'], is_split_into_words=True)
tokenized_input

List the resultant tokens after the tokenization:

In [None]:
# Turn the input ids back into token strings to inspect what the tokenizer produced.
tokens = tokenizer.convert_ids_to_tokens(tokenized_input['input_ids'])
tokens

What is the number of tokens generated by the tokenizer?

In [None]:
len(tokens)

You see there is a mismatch between the result of the tokenization and the given list of tokens. For evaluating the model's performance against the given tags, we need to realign the tokenization result with the given list of tags. Let's see whether the pipeline() could correctly handle the alignment.

Classify the tokens into recognized entities:

In [None]:
# Note: ner_results is rebound here. The input is now a list of tokens, and the
# output is consumed below as a list of (possibly empty) lists of token predictions.
ner_results = nlp(example['tokens'])
ner_results

What is the length of the classification result?

In [None]:
len(ner_results)

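Wonderful! The classification result has exactly one entry per original token. Because we passed a list of strings, pipeline() handled each word as a separate input and returned one list of sub-token predictions for it, so subword tokens that came from the same word are grouped as a single unit. (Note that each word is therefore classified on its own, without the rest of the sentence as context.) Tokens tagged `O` are left out of these lists, so an empty list means the word was tagged `O`. Now, we can retrieve the list of predictions by using the prediction of the first token in each group.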

In [None]:
# Derive one tag per input word from the pipeline output.
# Each element of ner_results lists the tagged sub-tokens of one word. Tokens
# tagged 'O' are left out by the pipeline, so result[0] is the word's first
# sub-token only when its 'index' is 1 (position 0 is the [CLS] special token).
# If the first sub-token is absent, it was tagged 'O', and so is the word.
predictions = [
    result[0]['entity'] if result and result[0]['index'] == 1 else 'O'
    for result in ner_results
]

In [None]:
print(predictions)

What is the length of the predictions?

In [None]:
len(predictions)

Great! We have matched predictions and given tags.

For the single example, we can see that there are 3 named entity tags in the given list. The model correctly classified 2 of them.

## Apply the Model to All Test Data

In [None]:
from tqdm import tqdm

In [None]:
# use the test dataset
test = conll['test']

In [None]:
from tqdm import tqdm

# Collect the reference tags and the predicted tags for every test sentence.
# Both lists hold one list of tag names per sentence, in dataset order.
true_tags_list = []
predicted_tags_list = []
for atest in tqdm(test, desc=str(len(test))):
    # add true labels to references
    true_tags_list.append([tag_names[tag_id] for tag_id in atest['ner_tags']])

    # recognize named entities in the tokens of this sentence
    test_ner_results = nlp(atest['tokens'])

    # One predicted tag per word, taken from the word's first sub-token. Tokens
    # tagged 'O' are left out by the pipeline, so result[0] is the first
    # sub-token only when its 'index' is 1 (position 0 is [CLS]); otherwise the
    # first sub-token was tagged 'O', and so is the word.
    predicted_tags_list.append([
        result[0]['entity'] if result and result[0]['index'] == 1 else 'O'
        for result in test_ner_results
    ])

In [None]:
len(predicted_tags_list), len(true_tags_list)

## Check that the predictions and the true tags have matching lengths

In [None]:
# Compare, sentence by sentence, the number of predicted tags with the number of
# true tags. Each mismatch is reported by index; True is printed if none is found.
all_lengths_match = True
for position, predicted in enumerate(predicted_tags_list):
    if len(predicted) == len(true_tags_list[position]):
        continue
    all_lengths_match = False
    print(position, ":", False)
if all_lengths_match:
    print(True)

## Evaluate

In [None]:
! pip install -q evaluate seqeval

In [None]:
import evaluate

# Load the "seqeval" metric through the evaluate library.
seqeval = evaluate.load("seqeval")

Apply the seqeval to the predicted tags and true tags:

In [None]:
results = seqeval.compute(predictions=predicted_tags_list, references=true_tags_list)

# Report the four overall scores in a fixed order, one per line.
for label, metric_key in (
    ("precision:", "overall_precision"),
    ("recall:", "overall_recall"),
    ("f1:", "overall_f1"),
    ("accuracy:", "overall_accuracy"),
):
    print(label, results[metric_key])