In [1]:
from transformers import pipeline

In [2]:
token_classifier = pipeline('token-classification')

No model was supplied, defaulted to dbmdz/bert-large-cased-finetuned-conll03-english and revision 4c53496 (https://huggingface.co/dbmdz/bert-large-cased-finetuned-conll03-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
Some weights of the model checkpoint at dbmdz/bert-large-cased-finetuned-conll03-english were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cuda:0


In [4]:
token_classifier('I am Dawood Khan and I live in Munich')

[{'entity': 'I-PER',
  'score': np.float32(0.999718),
  'index': 3,
  'word': 'Da',
  'start': 5,
  'end': 7},
 {'entity': 'I-PER',
  'score': np.float32(0.99953854),
  'index': 4,
  'word': '##wood',
  'start': 7,
  'end': 11},
 {'entity': 'I-PER',
  'score': np.float32(0.99973756),
  'index': 5,
  'word': 'Khan',
  'start': 12,
  'end': 16},
 {'entity': 'I-LOC',
  'score': np.float32(0.998351),
  'index': 10,
  'word': 'Munich',
  'start': 31,
  'end': 37}]

In [8]:
token_classifier = pipeline('token-classification', aggregation_strategy='average')
token_classifier('I am Dawood and I work in Munich')

No model was supplied, defaulted to dbmdz/bert-large-cased-finetuned-conll03-english and revision 4c53496 (https://huggingface.co/dbmdz/bert-large-cased-finetuned-conll03-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
Some weights of the model checkpoint at dbmdz/bert-large-cased-finetuned-conll03-english were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cuda:0


[{'entity_group': 'PER',
  'score': np.float32(0.9993456),
  'word': 'Dawood',
  'start': 5,
  'end': 11},
 {'entity_group': 'LOC',
  'score': np.float32(0.9986532),
  'word': 'Munich',
  'start': 26,
  'end': 32}]

 Now we'll do all the work manually

### From Inputs to Predictions

In [11]:
# First tokenize input and pass it through model
from transformers import AutoTokenizer, AutoModelForTokenClassification
checkpoint = 'dbmdz/bert-large-cased-finetuned-conll03-english'
tokenizer = AutoTokenizer.from_pretrained('dbmdz/bert-large-cased-finetuned-conll03-english')

In [12]:
model = AutoModelForTokenClassification.from_pretrained(checkpoint)

Some weights of the model checkpoint at dbmdz/bert-large-cased-finetuned-conll03-english were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [13]:
sentence = 'Dawood was working for UN in Pakistan'
inputs = tokenizer(sentence, return_tensors='pt')
outputs = model(**inputs)

In [14]:
print(inputs['input_ids'].shape)
print(outputs.logits.shape# Batch of 1 sequence, 10 tokens and 9 different labels ([1,10,9])

torch.Size([1, 10])
torch.Size([1, 10, 9])


In [15]:
# We use softmax to convert logits into probabilities
import torch

probabilities = torch.nn.functional.softmax(outputs.logits, dim=-1)[0].tolist()
predictions = outputs.logits.argmax(dim=-1)[0].tolist()

In [16]:
print(predictions)

[0, 4, 4, 0, 0, 0, 6, 0, 8, 0]


In [17]:
model.config.id2label

{0: 'O',
 1: 'B-MISC',
 2: 'I-MISC',
 3: 'B-PER',
 4: 'I-PER',
 5: 'B-ORG',
 6: 'I-ORG',
 7: 'B-LOC',
 8: 'I-LOC'}

In [18]:
results = []
tokens = inputs.tokens()

In [19]:
for idx ,pred in enumerate(predictions):
    label = model.config.id2label[pred]
    if label!=0:
        results.append(
            {'entity':label, 'score':probabilities[idx][pred],'word':tokens[idx]}
        )

print(results)

[{'entity': 'O', 'score': 0.9996232986450195, 'word': '[CLS]'}, {'entity': 'I-PER', 'score': 0.999491810798645, 'word': 'Da'}, {'entity': 'I-PER', 'score': 0.999091625213623, 'word': '##wood'}, {'entity': 'O', 'score': 0.999958872795105, 'word': 'was'}, {'entity': 'O', 'score': 0.9999486207962036, 'word': 'working'}, {'entity': 'O', 'score': 0.9998605251312256, 'word': 'for'}, {'entity': 'I-ORG', 'score': 0.9988266825675964, 'word': 'UN'}, {'entity': 'O', 'score': 0.9997767806053162, 'word': 'in'}, {'entity': 'I-LOC', 'score': 0.9998264908790588, 'word': 'Pakistan'}, {'entity': 'O', 'score': 0.9996232986450195, 'word': '[SEP]'}]


In [20]:
# now we do offsets mapping to get the info about start and end of each entity
inputs_offsets = tokenizer(sentence,return_offsets_mapping = True)
inputs_offsets['offset_mapping']

[(0, 0),
 (0, 2),
 (2, 6),
 (7, 10),
 (11, 18),
 (19, 22),
 (23, 25),
 (26, 28),
 (29, 37),
 (0, 0)]

In [23]:
sentence[7:10]

'was'

In [25]:
# using this we can complete our results just like 'pipeline' function
results = []
inputs_offsets = tokenizer(sentence,return_offsets_mapping = True)
tokens = inputs_offsets.tokens()
offsets = inputs_offsets['offset_mapping']
for idx ,pred in enumerate(predictions):
    label = model.config.id2label[pred]
    if label !=0:
        start,end = offsets[idx]
        results.append(
            {'entity':label,
        'score':probabilities[idx][pred],
        'word':tokens[idx],
        'start':start,
        'end':end}
        )

In [26]:
print(results)

[{'entity': 'O', 'score': 0.9996232986450195, 'word': '[CLS]', 'start': 0, 'end': 0}, {'entity': 'I-PER', 'score': 0.999491810798645, 'word': 'Da', 'start': 0, 'end': 2}, {'entity': 'I-PER', 'score': 0.999091625213623, 'word': '##wood', 'start': 2, 'end': 6}, {'entity': 'O', 'score': 0.999958872795105, 'word': 'was', 'start': 7, 'end': 10}, {'entity': 'O', 'score': 0.9999486207962036, 'word': 'working', 'start': 11, 'end': 18}, {'entity': 'O', 'score': 0.9998605251312256, 'word': 'for', 'start': 19, 'end': 22}, {'entity': 'I-ORG', 'score': 0.9988266825675964, 'word': 'UN', 'start': 23, 'end': 25}, {'entity': 'O', 'score': 0.9997767806053162, 'word': 'in', 'start': 26, 'end': 28}, {'entity': 'I-LOC', 'score': 0.9998264908790588, 'word': 'Pakistan', 'start': 29, 'end': 37}, {'entity': 'O', 'score': 0.9996232986450195, 'word': '[SEP]', 'start': 0, 'end': 0}]


In [27]:
sentence[29:37]

'Pakistan'