# Token classification
Find the tokens that belong to the same entity ([Chunking](https://huggingface.co/learn/nlp-course/chapter7/2?fw=pt)).

In [1]:
!pip install transformers



In [2]:
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline

# Load a pre-trained tokenizer and model for NER
tokenizer = AutoTokenizer.from_pretrained("dbmdz/bert-large-cased-finetuned-conll03-english")
model = AutoModelForTokenClassification.from_pretrained("dbmdz/bert-large-cased-finetuned-conll03-english")

# Create a NER pipeline
ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Some weights of the model checkpoint at dbmdz/bert-large-cased-finetuned-conll03-english were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly iden

In [3]:
text = "Barack Obama was the 44th President of the United States and was born in Hawaii."

entities = ner_pipeline(text)

# Print the identified entities
for entity in entities:
    print(f"Entity: {entity['word']}, Type: {entity['entity_group']}, Score: {entity['score']:.3f}")


Entity: Barack Obama, Type: PER, Score: 0.999
Entity: United States, Type: LOC, Score: 0.997
Entity: Hawaii, Type: LOC, Score: 0.999


In [4]:
def group_entities(entities):
    grouped_entities = []
    current_entity = {"entity": "", "type": "", "score": 0.0}

    for entity in entities:
        # If the current entity is empty or matches the previous type, concatenate
        if current_entity["type"] == entity["entity_group"]:
            current_entity["entity"] += " " + entity["word"]
            current_entity["score"] = max(current_entity["score"], entity["score"])
        else:
            # Append the completed entity
            if current_entity["entity"]:
                grouped_entities.append(current_entity)

            # Start a new entity
            current_entity = {
                "entity": entity["word"],
                "type": entity["entity_group"],
                "score": entity["score"]
            }

    # Append the last entity
    if current_entity["entity"]:
        grouped_entities.append(current_entity)

    return grouped_entities

# Group entities and print results
grouped_entities = group_entities(entities)
for entity in grouped_entities:
    print(f"Entity: {entity['entity']}, Type: {entity['type']}, Score: {entity['score']:.3f}")


Entity: Barack Obama, Type: PER, Score: 0.999
Entity: United States Hawaii, Type: LOC, Score: 0.999


from videos

In [32]:
from transformers import pipeline

model_checkpoint = "huggingface-course/bert-finetuned-ner"
token_classifier = pipeline(
    "token-classification", model=model_checkpoint, aggregation_strategy="simple"
)
token_classifier("My name is Sylvain and I work at Hugging Face in Brooklyn.")

[{'entity_group': 'PER',
  'score': 0.9988506,
  'word': 'Sylvain',
  'start': 11,
  'end': 18},
 {'entity_group': 'ORG',
  'score': 0.9647625,
  'word': 'Hugging Face',
  'start': 33,
  'end': 45},
 {'entity_group': 'LOC',
  'score': 0.9986118,
  'word': 'Brooklyn',
  'start': 49,
  'end': 57}]

In [35]:
token_classifier("My name is bushra and I am AI student at jeddah")

[{'entity_group': 'ORG',
  'score': 0.79212904,
  'word': 'AI',
  'start': 27,
  'end': 29}]

In [37]:
token_classifier("My name is sara and I am AI student at New York")

[{'entity_group': 'PER',
  'score': 0.9493027,
  'word': 'sa',
  'start': 11,
  'end': 13},
 {'entity_group': 'ORG',
  'score': 0.62619275,
  'word': 'AI',
  'start': 25,
  'end': 27},
 {'entity_group': 'LOC',
  'score': 0.99765235,
  'word': 'New York',
  'start': 39,
  'end': 47}]