In [6]:
from transformers import pipeline

# Name the checkpoint explicitly instead of relying on the task default:
# transformers logs a warning when no model is supplied and advises against
# it. This is the checkpoint the default resolved to in the recorded run.
token_classifier = pipeline(
    "token-classification",
    model="dbmdz/bert-large-cased-finetuned-conll03-english",
    aggregation_strategy="simple",
)
token_classifier("My name is Sylvain and I work at Hugging Face in Brooklyn")

No model was supplied, defaulted to dbmdz/bert-large-cased-finetuned-conll03-english and revision 4c53496 (https://huggingface.co/dbmdz/bert-large-cased-finetuned-conll03-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
Some weights of the model checkpoint at dbmdz/bert-large-cased-finetuned-conll03-english were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cuda:0


[{'entity_group': 'PER',
  'score': np.float32(0.9986171),
  'word': 'Sylvain',
  'start': 11,
  'end': 18},
 {'entity_group': 'ORG',
  'score': np.float32(0.97779936),
  'word': 'Hugging Face',
  'start': 33,
  'end': 45},
 {'entity_group': 'LOC',
  'score': np.float32(0.9889683),
  'word': 'Brooklyn',
  'start': 49,
  'end': 57}]

# General Pipeline (example on Named Entity Recognition)

In [7]:
from transformers import AutoTokenizer, AutoModelForTokenClassification

import torch

# Load the tokenizer and the token-classification head from the same checkpoint.
model_checkpoint = "dbmdz/bert-large-cased-finetuned-conll03-english"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModelForTokenClassification.from_pretrained(model_checkpoint)

example = "My name is Mickelson and I am working at Home Depot in Manila"
inputs = tokenizer(example, return_tensors="pt")

# Inference only: no gradients are needed, so skip autograd bookkeeping.
with torch.no_grad():
    outputs = model(**inputs)

Some weights of the model checkpoint at dbmdz/bert-large-cased-finetuned-conll03-english were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [8]:
import torch

# The batch holds a single sentence, so work on its logits directly.
# probabilities: per-token distribution over labels (softmax on the last axis).
probabilities = torch.softmax(outputs.logits, dim=-1)[0]
# predictions: id of the highest-scoring label for every token, as a plain list.
predictions = torch.argmax(outputs.logits, dim=-1)[0].tolist()

print(predictions)

[0, 0, 0, 0, 4, 4, 4, 0, 0, 0, 0, 0, 6, 6, 0, 8, 0]


In [9]:
# Token-level entities: one record per token whose predicted label is not "O".
inputs_with_offsets = tokenizer(example, return_offsets_mapping=True)
# Take tokens and offsets from the same encoding so they cannot drift apart
# if `inputs` is rebound by another cell.
tokens = inputs_with_offsets.tokens()
offsets = inputs_with_offsets["offset_mapping"]

results = []
for idx, pred in enumerate(predictions):
    label = model.config.id2label[pred]
    if label == "O":
        continue
    start, end = offsets[idx]
    results.append(
        {
            "entity": label,
            "score": probabilities[idx][pred].item(),
            "index": idx,
            "word": tokens[idx],
            "start": start,
            "end": end,
        }
    )

results

In [12]:
import numpy as np

# Group consecutive tokens of the same entity type into one entity and report
# the mean token probability as the entity score.
results = []
inputs_with_offsets = tokenizer(example, return_offsets_mapping=True)
offsets = inputs_with_offsets["offset_mapping"]

idx = 0
while idx < len(predictions):
    label = model.config.id2label[predictions[idx]]
    if label == "O":
        idx += 1
        continue

    # Remove the B- or I-
    label = label[2:]
    start, end = offsets[idx]
    # The opening token always belongs to the entity, whether it is tagged
    # B- or I-; consuming it here guarantees at least one score and a valid end.
    all_scores = [probabilities[idx][predictions[idx]].item()]
    idx += 1

    # Grab all the following tokens labeled with I-label
    while (
        idx < len(predictions)
        and model.config.id2label[predictions[idx]] == f"I-{label}"
    ):
        all_scores.append(probabilities[idx][predictions[idx]].item())
        end = offsets[idx][1]
        idx += 1

    # The score is the mean of all the scores of the tokens in that grouped entity
    score = np.mean(all_scores).item()
    word = example[start:end]
    results.append(
        {
            "entity_group": label,
            "score": score,
            "word": word,
            "start": start,
            "end": end,
        }
    )
    # No extra increment here: idx already points at the first token after the
    # entity, which may itself open a new entity and must not be skipped.

print(results)

[{'entity_group': 'PER', 'score': 0.9989841977755228, 'word': 'Mickelson', 'start': 11, 'end': 20}, {'entity_group': 'ORG', 'score': 0.9978117644786835, 'word': 'Home Depot', 'start': 41, 'end': 51}, {'entity_group': 'LOC', 'score': 0.9988477230072021, 'word': 'Manila', 'start': 55, 'end': 61}]


## Grouping entities

In [17]:
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModelForTokenClassification

# Sample sentence passed to token_classification_pipline in the call at the end of this cell.
example = "My name is Mickelson and I am working at Home Depot in Manila"

def token_classification_pipline(
    example, aggregation_strategy="simple", tokenizer=None, model=None
):
    """Run token classification on ``example`` and group tokens into entities.

    Consecutive tokens of the same entity type are merged into one entity. An
    entity starts at any non-"O" token (tagged ``B-`` or ``I-``) and extends
    over the following tokens tagged ``I-<same type>``.

    Args:
        example: The text to analyse.
        aggregation_strategy: How the token probabilities of an entity are
            combined into its score: ``"simple"`` (mean), ``"first"`` (first
            token) or ``"max"`` (highest token probability).
        tokenizer: Optional tokenizer to reuse. When ``None`` it is loaded
            from the default checkpoint.
        model: Optional token-classification model to reuse. When ``None`` it
            is loaded from the default checkpoint. Passing both avoids
            reloading the weights on every call.

    Returns:
        A list of dicts with keys ``entity_group``, ``score``, ``index``
        (position of the first token after the entity), ``word``, ``start``
        and ``end`` (character offsets into ``example``).

    Raises:
        ValueError: If ``aggregation_strategy`` is not one of the supported values.
    """
    supported = ("simple", "first", "max")
    if aggregation_strategy not in supported:
        raise ValueError(
            f"Unknown aggregation_strategy {aggregation_strategy!r}; "
            f"expected one of {supported}"
        )

    if tokenizer is None or model is None:
        model_checkpoint = "dbmdz/bert-large-cased-finetuned-conll03-english"
        if tokenizer is None:
            tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
        if model is None:
            model = AutoModelForTokenClassification.from_pretrained(model_checkpoint)

    inputs = tokenizer(example, return_tensors="pt")
    # Inference only: no gradients are needed.
    with torch.no_grad():
        outputs = model(**inputs)

    probabilities = torch.nn.functional.softmax(outputs.logits, dim=-1)[0]
    predictions = outputs.logits.argmax(dim=-1)[0].tolist()
    offsets = tokenizer(example, return_offsets_mapping=True)["offset_mapping"]
    id2label = model.config.id2label

    results = []
    idx = 0
    while idx < len(predictions):
        label = id2label[predictions[idx]]
        if label == "O":
            idx += 1
            continue

        # Remove B- or I-
        entity_type = label[2:]
        start, end = offsets[idx]
        # The opening token always belongs to the entity, whatever its prefix,
        # so there is always at least one score and a valid end offset.
        all_scores = [probabilities[idx][predictions[idx]].item()]
        idx += 1

        while (
            idx < len(predictions)
            and id2label[predictions[idx]] == f"I-{entity_type}"
        ):
            all_scores.append(probabilities[idx][predictions[idx]].item())
            end = offsets[idx][1]
            idx += 1

        # all_scores holds plain Python floats at this point.
        if aggregation_strategy == "simple":
            score = np.mean(all_scores).item()
        elif aggregation_strategy == "first":
            score = all_scores[0]
        else:  # "max"
            score = max(all_scores)

        results.append(
            {
                "entity_group": entity_type,
                "score": score,
                "index": idx,
                "word": example[start:end],
                "start": start,
                "end": end,
            }
        )
        # idx already points at the first token after the entity; it is not
        # advanced again so that an adjacent entity is not skipped.

    return results

from pprint import pprint 
# Pretty-print the grouped entities for the sample sentence (default "simple" strategy).
pprint(token_classification_pipline(example))

Some weights of the model checkpoint at dbmdz/bert-large-cased-finetuned-conll03-english were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


[{'end': 20,
  'entity_group': 'PER',
  'index': 7,
  'score': 0.9989841977755228,
  'start': 11,
  'word': 'Mickelson'},
 {'end': 51,
  'entity_group': 'ORG',
  'index': 14,
  'score': 0.9978117644786835,
  'start': 41,
  'word': 'Home Depot'},
 {'end': 61,
  'entity_group': 'LOC',
  'index': 16,
  'score': 0.9988477230072021,
  'start': 55,
  'word': 'Manila'}]


# Question Answering Pipeline

In [44]:
from transformers import pipeline

# Name the checkpoint explicitly instead of relying on the task default:
# transformers logs a warning when no model is supplied and advises against
# it. This is the checkpoint the default resolved to in the recorded run.
question_answerer = pipeline(
    "question-answering",
    model="distilbert/distilbert-base-cased-distilled-squad",
    top_k=5,
)

# Short passage. Later cells slice `context` by character offsets to recover
# answer spans, so the exact text (including the leading newline) matters.
context = """
🤗 Transformers is backed by the three most popular deep learning libraries — Jax, PyTorch, and TensorFlow — with a seamless integration
between them. It's straightforward to train your models with one before loading them for inference with the other.
"""

# Longer passage used in the "Handling long contexts" section, where it is
# tokenized with max_length = 384 and split into overlapping chunks.
long_context = """
🤗 Transformers: State of the Art NLP

🤗 Transformers provides thousands of pretrained models to perform tasks on texts such as classification, information extraction,
question answering, summarization, translation, text generation and more in over 100 languages.
Its aim is to make cutting-edge NLP easier to use for everyone.

🤗 Transformers provides APIs to quickly download and use those pretrained models on a given text, fine-tune them on your own datasets and
then share them with the community on our model hub. At the same time, each python module defining an architecture is fully standalone and
can be modified to enable quick research experiments.

Why should I use transformers?

1. Easy-to-use state-of-the-art models:
  - High performance on NLU and NLG tasks.
  - Low barrier to entry for educators and practitioners.
  - Few user-facing abstractions with just three classes to learn.
  - A unified API for using all our pretrained models.
  - Lower compute costs, smaller carbon footprint:

2. Researchers can share trained models instead of always retraining.
  - Practitioners can reduce compute time and production costs.
  - Dozens of architectures with over 10,000 pretrained models, some in more than 100 languages.

3. Choose the right framework for every part of a model's lifetime:
  - Train state-of-the-art models in 3 lines of code.
  - Move a single model between TF2.0/PyTorch frameworks at will.
  - Seamlessly pick the right framework for training, evaluation and production.

4. Easily customize a model or an example to your needs:
  - We provide examples for each architecture to reproduce the results published by its original authors.
  - Model internals are exposed as consistently as possible.
  - Model files can be used independently of the library for quick experiments.

🤗 Transformers is backed by the three most popular deep learning libraries — Jax, PyTorch and TensorFlow — with a seamless integration
between them. It's straightforward to train your models with one before loading them for inference with the other.
"""

# NOTE(review): "librares" is a typo for "libraries". It is left untouched
# because the outputs recorded in this notebook were produced with this exact
# string; correcting it may shift the scores slightly.
question = "Which deep learning librares back Transformers"
# The pipeline was created with top_k = 5, so this displays the five best answer spans.
question_answerer(question = question, context = context)

No model was supplied, defaulted to distilbert/distilbert-base-cased-distilled-squad and revision 564e9b5 (https://huggingface.co/distilbert/distilbert-base-cased-distilled-squad).
Using a pipeline without specifying a model name and revision in production is not recommended.
Device set to use cuda:0


[{'score': 0.8079867959022522,
  'start': 78,
  'end': 106,
  'answer': 'Jax, PyTorch, and TensorFlow'},
 {'score': 0.028152480721473694,
  'start': 96,
  'end': 106,
  'answer': 'TensorFlow'},
 {'score': 0.016116440296173096,
  'start': 78,
  'end': 90,
  'answer': 'Jax, PyTorch'},
 {'score': 0.012342635542154312,
  'start': 83,
  'end': 106,
  'answer': 'PyTorch, and TensorFlow'},
 {'score': 0.01066360529512167,
  'start': 78,
  'end': 108,
  'answer': 'Jax, PyTorch, and TensorFlow —'}]

In [21]:
from transformers import AutoTokenizer, AutoModelForQuestionAnswering

# Same checkpoint the question-answering pipeline resolved to, written with
# its full repository id.
model_checkpoint = "distilbert/distilbert-base-cased-distilled-squad"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModelForQuestionAnswering.from_pretrained(model_checkpoint)

# Question first, context second: the context becomes sequence id 1.
inputs = tokenizer(question, context, return_tensors="pt")

# Inference only: no gradients are needed, so skip autograd bookkeeping.
with torch.no_grad():
    outputs = model(**inputs)

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


In [25]:
import torch

sequence_ids = inputs.sequence_ids()

# True marks positions that must not be chosen as an answer boundary: every
# token whose sequence id is not 1, i.e. everything outside the context
# (question tokens and special tokens such as [SEP]).
mask = [i != 1 for i in sequence_ids]
# Position 0 ([CLS]) is deliberately left selectable.
mask[0] = False
mask = torch.tensor(mask)[None]

# masked_fill returns new tensors, so the logits stored on `outputs` stay
# untouched and this cell can be re-run safely.
start_logits = outputs.start_logits.masked_fill(mask, -10000)
end_logits = outputs.end_logits.masked_fill(mask, -10000)

In [26]:
# Turn the masked logits into probabilities over token positions; the batch
# contains a single sequence, hence the [0].
start_probabilities = start_logits.softmax(dim=-1)[0]
end_probabilities = end_logits.softmax(dim=-1)[0]

In [None]:
# scores[i, j] = P(start = i) * P(end = j) for every pair of token positions.
scores = start_probabilities.unsqueeze(1) * end_probabilities.unsqueeze(0)
# Zero out the lower triangle: a span cannot end before it starts.
scores = scores.triu()

In [32]:
# argmax() on the 2-D score matrix returns the flat index of the best
# (start, end) pair as a single element, which is what .item() requires.
max_index = scores.argmax().item()
# Convert the flat index back into (row, column) = (start token, end token).
start_index, end_index = divmod(max_index, scores.shape[1])
print(scores[start_index, end_index].item())

0.8079869747161865


In [50]:
# Character offsets of every token of the (question, context) pair. They are
# computed here so the cell does not depend on an `offsets` variable left
# over from another section of the notebook.
offsets = tokenizer(question, context, return_offsets_mapping=True)["offset_mapping"]

# Five best (start token, end token) pairs, as plain Python ints.
max_indices = [
    divmod(flat_index, scores.shape[1])
    for flat_index in torch.topk(scores.flatten(), 5).indices.tolist()
]
for start_index, end_index in max_indices:
    start_char, _ = offsets[start_index]
    _, end_char = offsets[end_index]

    print("===========")
    print(context[start_char:end_char])
    print(scores[start_index, end_index].item())

Jax, PyTorch, and TensorFlow
0.8079869747161865
Transformers is backed by the three most popular deep learning libraries — Jax, PyTorch, and TensorFlow
0.05657912790775299
TensorFlow
0.028152529150247574

0.0236094668507576
Jax, PyTorch
0.016116417944431305


## Handling long contexts

In [52]:
# Tokenize the (question, long_context) pair into several fixed-size chunks.
inputs = tokenizer(
    question,
    long_context,
    stride = 128, # number of tokens shared by consecutive chunks (overlap)
    max_length = 384,
    padding = "longest",
    truncation = "only_second",
    return_overflowing_tokens = True,
    return_offsets_mapping = True
)

# These two entries are removed before the forward pass
# (presumably because the model does not accept them as inputs — TODO confirm).
# The offsets are kept: a later cell uses them to map tokens back to characters.
_ = inputs.pop("overflow_to_sample_mapping")
offsets = inputs.pop("offset_mapping")

inputs = inputs.convert_to_tensors("pt")
# First dimension = number of chunks, second = tokens per chunk.
print(inputs["input_ids"].shape)

torch.Size([2, 384])


In [73]:
# Inference only: no gradients are needed, so skip autograd bookkeeping.
with torch.no_grad():
    outputs = model(**inputs)

# One row of start/end logits per chunk.
start_logits = outputs.start_logits
end_logits = outputs.end_logits
print(start_logits.shape, end_logits.shape)

torch.Size([2, 384]) torch.Size([2, 384])


In [74]:
# Build the mask chunk by chunk: the context can end at a different position
# in each chunk, so the sequence ids of chunk 0 are not valid for the others.
sequence_ids = [
    inputs.sequence_ids(chunk) for chunk in range(inputs["input_ids"].shape[0])
]
# True marks positions outside the context (sequence id != 1).
mask = [[seq_id != 1 for seq_id in chunk_ids] for chunk_ids in sequence_ids]
for row in mask:
    # Position 0 ([CLS]) is deliberately left selectable.
    row[0] = False
# Padding positions are excluded as well.
mask = torch.logical_or(torch.tensor(mask), inputs["attention_mask"] == 0)

# masked_fill returns new tensors, so the logits stored on `outputs` stay
# untouched and this cell can be re-run safely.
start_logits = outputs.start_logits.masked_fill(mask, -10000)
end_logits = outputs.end_logits.masked_fill(mask, -10000)

In [71]:
# Per-chunk probabilities over token positions (one row per chunk).
start_probabilities = start_logits.softmax(dim=-1)
end_probabilities = end_logits.softmax(dim=-1)

In [72]:
# For every chunk keep the single best (start token, end token, score) triple.
candidates = []
for start_probs, end_probs in zip(start_probabilities, end_probabilities):
    # Pairwise products: scores[i, j] = P(start = i) * P(end = j).
    scores = start_probs[:, None] * end_probs[None, :]
    # Search only the upper triangle so that start <= end.
    idx = torch.triu(scores).argmax().item()

    # Flat index -> (row, column).
    start_idx, end_idx = divmod(idx, scores.shape[1])
    score = scores[start_idx, end_idx].item()
    candidates.append((start_idx, end_idx, score))

print(candidates)

[(0, 0, 0.8998146653175354), (175, 186, 0.9085207581520081)]


In [75]:
# Map each chunk's best token span back to characters of long_context.
for candidate, offset in zip(candidates, offsets):
    start_token, end_token, score = candidate
    start_char = offset[start_token][0]
    end_char = offset[end_token][1]
    result = {
        "answer": long_context[start_char:end_char],
        "start": start_char,
        "end": end_char,
        "score": score,
    }
    print(result)

{'answer': '', 'start': 0, 'end': 0, 'score': 0.8998146653175354}
{'answer': 'Jax, PyTorch and TensorFlow', 'start': 1892, 'end': 1919, 'score': 0.9085207581520081}
