# Fast tokenizers' special powers (PyTorch)

Install the Transformers, Datasets, and Evaluate libraries to run this notebook.

In [1]:
!pip install datasets evaluate transformers[sentencepiece]

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting datasets
  Downloading datasets-2.7.1-py3-none-any.whl (451 kB)
[K     |████████████████████████████████| 451 kB 18.4 MB/s 
[?25hCollecting evaluate
  Downloading evaluate-0.3.0-py3-none-any.whl (72 kB)
[K     |████████████████████████████████| 72 kB 768 kB/s 
[?25hCollecting transformers[sentencepiece]
  Downloading transformers-4.24.0-py3-none-any.whl (5.5 MB)
[K     |████████████████████████████████| 5.5 MB 49.1 MB/s 
Collecting responses<0.19
  Downloading responses-0.18.0-py3-none-any.whl (38 kB)
Collecting xxhash
  Downloading xxhash-3.1.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (212 kB)
[K     |████████████████████████████████| 212 kB 62.8 MB/s 
Collecting huggingface-hub<1.0.0,>=0.2.0
  Downloading huggingface_hub-0.11.0-py3-none-any.whl (182 kB)
[K     |████████████████████████████████| 182 kB 59.9 MB/s 
[?25hCollecting multiprocess
  Download

In [15]:
from transformers import AutoTokenizer

# Load the tokenizer of the "bert-base-cased" checkpoint and encode one sentence.
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
example = "My name is Anri and I work at Twitter on Mars."
encoding = tokenizer(example)
# The tokenizer call returns a BatchEncoding object, as the printed type shows.
print(type(encoding))

<class 'transformers.tokenization_utils_base.BatchEncoding'>


In [16]:
# Check whether the loaded tokenizer is a fast tokenizer (True in the output below).
tokenizer.is_fast

True

In [17]:
# The same flag is exposed on the encoding that the tokenizer produced.
encoding.is_fast

True

In [18]:
# List the tokens of the encoded example. The output includes the special
# [CLS]/[SEP] tokens and shows "Anri" split into the pieces "An" and "##ri".
encoding.tokens()

['[CLS]',
 'My',
 'name',
 'is',
 'An',
 '##ri',
 'and',
 'I',
 'work',
 'at',
 'Twitter',
 'on',
 'Mars',
 '.',
 '[SEP]']

In [19]:
# For every token, the index of the word it belongs to. In the output the
# special tokens map to None and both pieces of "Anri" share index 3.
encoding.word_ids()

[None, 0, 1, 2, 3, 3, 4, 5, 6, 7, 8, 9, 10, 11, None]

In [20]:
# Character span of word 3 in the original string; slicing the example with
# that span gives back the whole word ('Anri' in the output below).
start, end = encoding.word_to_chars(3)
example[start:end]

'Anri'

In [21]:
from transformers import pipeline

# No model is named here, so the pipeline falls back to a default checkpoint
# (the log below reports dbmdz/bert-large-cased-finetuned-conll03-english).
token_classifier = pipeline("token-classification")
# The result has one entry per entity token: "An" and "##ri" are reported separately.
token_classifier("My name is Anri and I work at Twitter on Mars.")

No model was supplied, defaulted to dbmdz/bert-large-cased-finetuned-conll03-english and revision f2482bf (https://huggingface.co/dbmdz/bert-large-cased-finetuned-conll03-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


[{'entity': 'I-PER',
  'score': 0.99888366,
  'index': 4,
  'word': 'An',
  'start': 11,
  'end': 13},
 {'entity': 'I-PER',
  'score': 0.9989052,
  'index': 5,
  'word': '##ri',
  'start': 13,
  'end': 15},
 {'entity': 'I-ORG',
  'score': 0.98794556,
  'index': 10,
  'word': 'Twitter',
  'start': 30,
  'end': 37},
 {'entity': 'I-LOC',
  'score': 0.98824614,
  'index': 12,
  'word': 'Mars',
  'start': 41,
  'end': 45}]

In [22]:
from transformers import pipeline

# With aggregation_strategy="simple" the per-token predictions are grouped into
# entities: the output below has a single "Anri" entry under an "entity_group" key.
token_classifier = pipeline("token-classification", aggregation_strategy="simple")
token_classifier("My name is Anri and I work at Twitter on Mars.")

No model was supplied, defaulted to dbmdz/bert-large-cased-finetuned-conll03-english and revision f2482bf (https://huggingface.co/dbmdz/bert-large-cased-finetuned-conll03-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


[{'entity_group': 'PER',
  'score': 0.99889445,
  'word': 'Anri',
  'start': 11,
  'end': 15},
 {'entity_group': 'ORG',
  'score': 0.98794556,
  'word': 'Twitter',
  'start': 30,
  'end': 37},
 {'entity_group': 'LOC',
  'score': 0.98824614,
  'word': 'Mars',
  'start': 41,
  'end': 45}]

In [24]:
from transformers import AutoTokenizer, AutoModelForTokenClassification

# Load the tokenizer and the token-classification model from one checkpoint.
# NOTE: this rebinds `tokenizer`, which an earlier cell loaded from "bert-base-cased".
model_checkpoint = "dbmdz/bert-large-cased-finetuned-conll03-english"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModelForTokenClassification.from_pretrained(model_checkpoint)

# Encode the example as tensors (return_tensors="pt") and run the model on it.
example = "My name is Anri and I work at Twitter on Mars."
inputs = tokenizer(example, return_tensors="pt")
outputs = model(**inputs)

In [25]:
# Compare the shape of the input ids with the shape of the logits. In the
# output below the logits have one extra trailing dimension of size 9.
print(inputs["input_ids"].shape)
print(outputs.logits.shape)

torch.Size([1, 15])
torch.Size([1, 15, 9])


In [26]:
import torch

# Softmax over the last dimension of the logits; [0] selects the first
# (and, per the printed shapes, only) sequence of the batch.
probabilities = torch.nn.functional.softmax(outputs.logits, dim=-1)[0].tolist()
# Argmax over the same dimension gives one predicted label id per token.
predictions = outputs.logits.argmax(dim=-1)[0].tolist()
print(predictions)

[0, 0, 0, 0, 4, 4, 0, 0, 0, 0, 6, 0, 8, 0, 0]


In [27]:
# Mapping from label ids (the values seen in `predictions`) to label names.
model.config.id2label

{0: 'O',
 1: 'B-MISC',
 2: 'I-MISC',
 3: 'B-PER',
 4: 'I-PER',
 5: 'B-ORG',
 6: 'I-ORG',
 7: 'B-LOC',
 8: 'I-LOC'}

In [28]:
# Report every token whose predicted label is not "O": the label name, the
# probability the model gave to that label, and the token text.
tokens = inputs.tokens()
label_names = model.config.id2label

results = [
    {
        "entity": label_names[label_id],
        "score": probabilities[position][label_id],
        "word": tokens[position],
    }
    for position, label_id in enumerate(predictions)
    if label_names[label_id] != "O"
]

print(results)

[{'entity': 'I-PER', 'score': 0.9988836646080017, 'word': 'An'}, {'entity': 'I-PER', 'score': 0.9989051818847656, 'word': '##ri'}, {'entity': 'I-ORG', 'score': 0.9879454374313354, 'word': 'Twitter'}, {'entity': 'I-LOC', 'score': 0.9882460236549377, 'word': 'Mars'}]


In [29]:
# Ask the tokenizer to also return, for every token, its (start, end) character
# span in the example. In the output the first and last entries are (0, 0).
inputs_with_offsets = tokenizer(example, return_offsets_mapping=True)
inputs_with_offsets["offset_mapping"]

[(0, 0),
 (0, 2),
 (3, 7),
 (8, 10),
 (11, 13),
 (13, 15),
 (16, 19),
 (20, 21),
 (22, 26),
 (27, 29),
 (30, 37),
 (38, 40),
 (41, 45),
 (45, 46),
 (0, 0)]

In [30]:
# Token 5 ("##ri") has offsets (13, 15): slicing the example with them gives
# the text of that token without the "##" prefix.
example[13:15]

'ri'

In [31]:
# Report every token whose predicted label is not "O", together with the
# character span of that token in the example.
inputs_with_offsets = tokenizer(example, return_offsets_mapping=True)
tokens = inputs_with_offsets.tokens()
offsets = inputs_with_offsets["offset_mapping"]
label_names = model.config.id2label

results = [
    {
        "entity": label_names[label_id],
        "score": probabilities[position][label_id],
        "word": tokens[position],
        "start": offsets[position][0],
        "end": offsets[position][1],
    }
    for position, label_id in enumerate(predictions)
    if label_names[label_id] != "O"
]

print(results)

[{'entity': 'I-PER', 'score': 0.9988836646080017, 'word': 'An', 'start': 11, 'end': 13}, {'entity': 'I-PER', 'score': 0.9989051818847656, 'word': '##ri', 'start': 13, 'end': 15}, {'entity': 'I-ORG', 'score': 0.9879454374313354, 'word': 'Twitter', 'start': 30, 'end': 37}, {'entity': 'I-LOC', 'score': 0.9882460236549377, 'word': 'Mars', 'start': 41, 'end': 45}]


In [None]:
# "An" starts at character 11 and "##ri" ends at character 15, so the span of
# the whole multi-token entity is example[11:15].
example[11:15]

'Anri'

In [32]:
import numpy as np

results = []
inputs_with_offsets = tokenizer(example, return_offsets_mapping=True)
tokens = inputs_with_offsets.tokens()
offsets = inputs_with_offsets["offset_mapping"]

# Group consecutive tokens of one entity into a single result. An entity starts
# at any token whose label is not "O" (B- or I- prefix) and extends over the
# following tokens labeled I-<same type>.
idx = 0
while idx < len(predictions):
    pred = predictions[idx]
    label = model.config.id2label[pred]
    if label == "O":
        idx += 1
        continue

    # Remove the B- or I-
    label = label[2:]
    start, end = offsets[idx]

    # Always consume the first token of the entity, whatever its prefix, so an
    # entity opened by a B- label still gets a score and an end offset.
    all_scores = [probabilities[idx][pred]]
    idx += 1

    # Grab all the following tokens labeled with I-label
    while (
        idx < len(predictions)
        and model.config.id2label[predictions[idx]] == f"I-{label}"
    ):
        # Score each token with its own predicted label id, which differs from
        # `pred` when the entity was opened by a B- label.
        all_scores.append(probabilities[idx][predictions[idx]])
        _, end = offsets[idx]
        idx += 1

    # idx now points at the first token after the entity. It is examined by the
    # next iteration of the outer loop instead of being skipped, so an entity
    # that directly follows this one keeps its first token.

    # The score is the mean of all the scores of the tokens in that grouped entity
    score = np.mean(all_scores).item()
    word = example[start:end]
    results.append(
        {
            "entity_group": label,
            "score": score,
            "word": word,
            "start": start,
            "end": end,
        }
    )

print(results)

[{'entity_group': 'PER', 'score': 0.9988944232463837, 'word': 'Anri', 'start': 11, 'end': 15}, {'entity_group': 'ORG', 'score': 0.9879454374313354, 'word': 'Twitter', 'start': 30, 'end': 37}, {'entity_group': 'LOC', 'score': 0.9882460236549377, 'word': 'Mars', 'start': 41, 'end': 45}]
