# Experiments with NER on the MS MARCO Document dataset

This notebook is a sandbox for using 🤗 Transformers for NER in the indexing pipeline and at search time for MS MARCO Document ranking.

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import importlib
import os
import sys

from transformers import pipeline, AutoTokenizer, AutoModelForTokenClassification

In [3]:
# project library
sys.path.insert(0, os.path.abspath('..'))

import qopt
importlib.reload(qopt)

from qopt.trec import load_queries_as_tuple

In [4]:
# Candidate pretrained NER checkpoints; exactly one `model_name` assignment should be active.
# model_name = 'dbmdz/bert-large-cased-finetuned-conll03-english'
# model_name = 'dbmdz/bert-base-cased-finetuned-conll03-english'
# model_name = 'sshleifer/tiny-dbmdz-bert-large-cased-finetuned-conll03-english'
# model_name = 'mrm8488/mobilebert-finetuned-ner'
model_name = 'elastic/distilbert-base-cased-finetuned-conll03-english'
# NOTE(review): `padding=True` is passed to `from_pretrained` here rather than at call time;
# confirm that it has any effect with the installed transformers version.
tokenizer = AutoTokenizer.from_pretrained(model_name, model_max_length=512, padding=True)
model = AutoModelForTokenClassification.from_pretrained(model_name)
# Two pipelines over the same tokenizer/model: `ner` returns merged entity groups
# (`entity_group` entries), `ner_ungrouped` returns per-token predictions.
ner = pipeline('ner', tokenizer=tokenizer, model=model, grouped_entities=True, ignore_subwords=True)
ner_ungrouped = pipeline('ner', tokenizer=tokenizer, model=model, grouped_entities=False)

In [5]:
# ner([
#     "This is a sentence by John Smith",
#     "I smell chocolate maybe a Mars bar or something made by Nestle in Switzerland",
# ])

In [6]:
# From:
#  - https://pytorch.org/tutorials/intermediate/dynamic_quantization_bert_tutorial.html
#  - https://pytorch.org/docs/stable/quantization.html#dynamic-quantization

import os
import torch

def print_size_of_model(model):
    """Print the serialized size of ``model``'s ``state_dict`` in megabytes.

    The state dict is written with ``torch.save`` to a file named ``temp.p`` inside a
    private temporary directory, its on-disk size is printed as ``Size (MB): <bytes / 1e6>``,
    and the directory is removed again -- also when saving or measuring fails.

    Nothing is written to the current working directory, so an unrelated ``temp.p`` that
    happens to live there is neither overwritten nor deleted.

    Args:
        model: object exposing ``state_dict()`` (e.g. a ``torch.nn.Module``).
    """
    import tempfile  # local import: only this helper needs it

    with tempfile.TemporaryDirectory() as scratch_dir:
        # Keep the historical file name so the archive layout produced by torch.save
        # (and therefore the reported size) stays the same as before.
        path = os.path.join(scratch_dir, "temp.p")
        torch.save(model.state_dict(), path)
        print('Size (MB):', os.path.getsize(path)/1e6)

# Move the model to CPU, then build a dynamically quantized copy in which the
# torch.nn.Linear layers use qint8 (see the PyTorch links at the top of this cell).
model.to("cpu")
quantized_model = torch.quantization.quantize_dynamic(model, {torch.nn.Linear}, dtype=torch.qint8)

# Compare serialized sizes: original first, quantized second.
print_size_of_model(model)
print_size_of_model(quantized_model)

# Same two pipeline flavours as for the original model, backed by the quantized weights.
quantized_ner = pipeline('ner', tokenizer=tokenizer, model=quantized_model, grouped_entities=True, ignore_subwords=True)
# `ignore_labels=[]` keeps every token in the output, including those labelled 'O'.
quantized_ner_ungrouped = pipeline('ner', tokenizer=tokenizer, model=quantized_model, grouped_entities=False, ignore_labels=[])

Size (MB): 260.832555
Size (MB): 133.440079


# Experiments

## Transformers

In [36]:
%%time
ner("Elasticsearch and Kibana are products from Elastic which is based in Amsterdam. In the context of using Elasticsearch for US Bank, we see similarities with other observability use-cases. We spoke with John Smith and his collegaues from Accenture who confirmed to Max Mustermann at US Bank that their use-case would fit.")

CPU times: user 119 ms, sys: 6.76 ms, total: 126 ms
Wall time: 127 ms


[{'entity_group': 'ORG',
  'score': 0.9840184450149536,
  'word': 'Elasticsearch',
  'start': 0,
  'end': 13},
 {'entity_group': 'ORG',
  'score': 0.968928337097168,
  'word': 'Kibana',
  'start': 18,
  'end': 24},
 {'entity_group': 'ORG',
  'score': 0.9917063117027283,
  'word': 'Elastic',
  'start': 43,
  'end': 50},
 {'entity_group': 'LOC',
  'score': 0.9993715286254883,
  'word': 'Amsterdam',
  'start': 69,
  'end': 78},
 {'entity_group': 'ORG',
  'score': 0.7701449394226074,
  'word': 'Elastic',
  'start': 104,
  'end': 111},
 {'entity_group': 'ORG',
  'score': 0.9989482462406158,
  'word': 'US Bank',
  'start': 122,
  'end': 129},
 {'entity_group': 'PER',
  'score': 0.9997017085552216,
  'word': 'John Smith',
  'start': 201,
  'end': 211},
 {'entity_group': 'ORG',
  'score': 0.9980860352516174,
  'word': 'Accenture',
  'start': 236,
  'end': 245},
 {'entity_group': 'PER',
  'score': 0.9989999532699585,
  'word': 'Max Musterman',
  'start': 263,
  'end': 276},
 {'entity_group': 'O

In [40]:
%%time
quantized_ner_ungrouped("Elasticsearch and Kibana are products from Elastic which is based in Amsterdam. In the context of using Elasticsearch for US Bank, we see similarities with other observability use-cases. We spoke with John Smith and his collegaues from Accenture who confirmed to Max Mustermann at US Bank that their use-case would fit.")

CPU times: user 109 ms, sys: 5.51 ms, total: 114 ms
Wall time: 52.4 ms


[{'word': 'El',
  'score': 0.5598241090774536,
  'entity': 'O',
  'index': 1,
  'start': 0,
  'end': 2},
 {'word': '##astic',
  'score': 0.6717578768730164,
  'entity': 'O',
  'index': 2,
  'start': 2,
  'end': 7},
 {'word': '##sea',
  'score': 0.3697337210178375,
  'entity': 'I-MISC',
  'index': 3,
  'start': 7,
  'end': 10},
 {'word': '##rch',
  'score': 0.7620847821235657,
  'entity': 'O',
  'index': 4,
  'start': 10,
  'end': 13},
 {'word': 'and',
  'score': 0.9988463521003723,
  'entity': 'O',
  'index': 5,
  'start': 14,
  'end': 17},
 {'word': 'Ki',
  'score': 0.5805865526199341,
  'entity': 'B-ORG',
  'index': 6,
  'start': 18,
  'end': 20},
 {'word': '##bana',
  'score': 0.8712792992591858,
  'entity': 'I-ORG',
  'index': 7,
  'start': 20,
  'end': 24},
 {'word': 'are',
  'score': 0.9986699819564819,
  'entity': 'O',
  'index': 8,
  'start': 25,
  'end': 28},
 {'word': 'products',
  'score': 0.9995617270469666,
  'entity': 'O',
  'index': 9,
  'start': 29,
  'end': 37},
 {'wor

In [None]:
%%time
ner("Elasticsearch and Kibana are products from Elastic which is based in Amsterdam.")

In [None]:
%%time
ner("What were the key parts of the Manhattan Project? Does it take place in New York or is that just something that people read about on Wikipedia?")

In [None]:
%%time
ner_ungrouped("Elasticsearch and Kibana are products from Elastic which is based in Amsterdam. In the context of using Elasticsearch for US Bank, we see similarities with other observability use-cases. We spoke with John Smith and his collegaues from Accenture who confirmed to Max Mustermann at US Bank that their use-case would fit.")

In [None]:
%%time
quantized_ner_ungrouped("Elasticsearch and Kibana are products from Elastic which is based in Amsterdam. In the context of using Elasticsearch for US Bank, we see similarities with other observability use-cases. We spoke with John Smith and his collegaues from Accenture who confirmed to Max Mustermann at US Bank that their use-case would fit.")

## flair

In [None]:
from flair.models import SequenceTagger as FlairSequenceTagger
from flair.data import Sentence as FlairSentence
from syntok import segmenter

In [None]:
tagger = FlairSequenceTagger.load('ner-fast')

def flair_extract(text):
    """Return the set of distinct entity strings the flair tagger finds in ``text``.

    ``text`` is split with syntok's ``segmenter.process`` and iterated as
    paragraphs -> sentences -> tokens. Each sentence is handed to the notebook-level
    ``tagger`` as a pre-tokenized ``FlairSentence`` and the text of every ``'ner'`` span
    is collected.

    Args:
        text: raw input string.

    Returns:
        A ``set`` of entity surface strings (duplicates across sentences collapse).
    """
    entity_texts = set()
    for paragraph in segmenter.process(text):
        for sentence_tokens in paragraph:
            # Reuse syntok's tokenization instead of letting flair tokenize again.
            token_values = [tok.value for tok in sentence_tokens]
            sentence = FlairSentence(token_values, use_tokenizer=False)
            tagger.predict(sentence)
            entity_texts.update(span.text for span in sentence.get_spans('ner'))
    return entity_texts

In [None]:
%%time
flair_extract("In the context of using Elasticsearch for US Bank, we see similarities with other observability use-cases. We spoke with John Smith and his collegaues from Accenture who confirmed to Max Mustermann at US Bank that their use-case would fit.")

In [None]:
%%time
flair_extract("Elasticsearch and Kibana are products from Elastic which is based in Amsterdam.")

# Queries

In [41]:
# Uncased NER model used for the query analysis below. It gets its own variable names so
# that `model_name`, `tokenizer` and `model` keep pointing at the cased model loaded at
# the top of the notebook; later cells (e.g. the one building `my_ner`) read those names
# and would otherwise silently switch models depending on execution order.
model_name_uncased = 'dslim/bert-base-NER-uncased'
tokenizer_uncased = AutoTokenizer.from_pretrained(model_name_uncased, model_max_length=512, padding=True)
model_uncased = AutoModelForTokenClassification.from_pretrained(model_name_uncased)
ner_uncased = pipeline('ner', tokenizer=tokenizer_uncased, model=model_uncased, grouped_entities=True, ignore_subwords=True)

HBox(children=(IntProgress(value=0, description='Downloading', max=1257, style=ProgressStyle(description_width…




HBox(children=(IntProgress(value=0, description='Downloading', max=231508, style=ProgressStyle(description_wid…




HBox(children=(IntProgress(value=0, description='Downloading', max=112, style=ProgressStyle(description_width=…




HBox(children=(IntProgress(value=0, description='Downloading', max=39, style=ProgressStyle(description_width='…




HBox(children=(IntProgress(value=0, description='Downloading', max=438006919, style=ProgressStyle(description_…




In [46]:
# Relative paths to the query TSV files: a sampled set of training queries and the dev queries.
train_queries = '../data/msmarco-document-sampled-queries.1000.tsv'
dev_queries = '../data/msmarco/document/msmarco-docdev-queries.tsv'

## Queries with entities in query

In [51]:
# Materialize both query sets as lists; each element is unpacked as `(_, query)` by the
# tally loop in the next cell.
train_query_tuples = list(load_queries_as_tuple(train_queries))
dev_query_tuples = list(load_queries_as_tuple(dev_queries))
# Number of dev queries loaded.
print(len(dev_query_tuples))

5193


In [50]:
# Tally the dev queries by whether the uncased NER pipeline finds at least one entity.
# `yes` counts queries with entities, `no` counts queries without.
yes, no = 0, 0
for _, query in dev_query_tuples:
    found_words = [hit['word'] for hit in ner_uncased(query)]
    # print(f" - {query}: {' '.join(found_words)}")  # uncomment to inspect the matches
    if not found_words:
        no += 1
    else:
        yes += 1
# Report: with entities, without entities, total.
for count in (yes, no, yes + no):
    print(count)

2128
3065
5193


- ~39% of the 10,000-query train set contain at least one entity (≈ 3,900 queries)
- ~41% of the 1,000-query train set contain at least one entity (≈ 418 queries)
- ~41% of the 5,193-query dev set contain at least one entity (2,128 queries)

## Queries with entities in the top-result document

# Grouping

In [28]:
import torch
import numpy as np

from transformers.file_utils import add_end_docstrings, is_tf_available, is_torch_available
from transformers.modelcard import ModelCard
from transformers.models.auto.modeling_auto import MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING
from transformers.models.bert.tokenization_bert import BasicTokenizer
from transformers.modeling_utils import PreTrainedModel
from transformers.pipelines import *
from transformers.pipelines.base import PIPELINE_INIT_ARGS, ArgumentHandler, Pipeline
from transformers.tokenization_utils import PreTrainedTokenizer
from typing import TYPE_CHECKING, List, Optional, Union

class MyTokenClassificationPipeline(Pipeline):
    """
    Token-classification (NER) pipeline kept inside the notebook so that the entity grouping
    logic can be inspected and modified.

    NOTE(review): this looks like a local copy of the ``TokenClassificationPipeline`` shipped with
    the installed ``transformers`` version -- confirm before diverging further.

    Calling the pipeline tokenizes each input, runs the model, picks the highest-scoring label per
    token and returns one dict per kept token (``word``, ``score``, ``entity``, ``index``,
    ``start``, ``end``). With ``grouped_entities=True`` adjacent tokens of the same entity type are
    merged into dicts with the keys ``entity_group``, ``score``, ``word``, ``start`` and ``end``.

    Args:
        model, tokenizer, modelcard, framework, device, binary_output, task:
            Forwarded unchanged to ``Pipeline.__init__``.
        args_parser: Callable that turns the call arguments into ``(inputs, offset_mappings)``.
        ignore_labels: Label names whose tokens are dropped from the output. The pipeline keeps
            its own copy of this list.
        grouped_entities: Merge adjacent tokens of the same entity type into one entry.
        ignore_subwords: When grouping, attach sub-word tokens to the current group regardless of
            their own predicted label and leave their score out of the group's mean score.
            Requires a fast tokenizer.

    Raises:
        ValueError: If ``ignore_subwords`` is set but the tokenizer is not a fast tokenizer.
    """

    default_input_names = "sequences"

    def __init__(
        self,
        model: PreTrainedModel,
        tokenizer: PreTrainedTokenizer,
        modelcard: Optional[ModelCard] = None,
        framework: Optional[str] = None,
        args_parser: ArgumentHandler = TokenClassificationArgumentHandler(),
        device: int = -1,
        binary_output: bool = False,
        ignore_labels=["O"],
        task: str = "",
        grouped_entities: bool = False,
        ignore_subwords: bool = False,
    ):
        super().__init__(
            model=model,
            tokenizer=tokenizer,
            modelcard=modelcard,
            framework=framework,
            device=device,
            binary_output=binary_output,
            task=task,
        )

        self.check_model_type(MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING)

        self._basic_tokenizer = BasicTokenizer(do_lower_case=False)
        self._args_parser = args_parser
        # Defensive copy: the default argument is a single list object shared by every call of
        # __init__, so storing it directly would let a change on one pipeline leak into others.
        self.ignore_labels = list(ignore_labels)
        self.grouped_entities = grouped_entities
        self.ignore_subwords = ignore_subwords

        if self.ignore_subwords and not self.tokenizer.is_fast:
            # Note the trailing space: the two literals are concatenated into one sentence.
            raise ValueError(
                "Slow tokenizers cannot ignore subwords. Please set the `ignore_subwords` option "
                "to `False` or use a fast tokenizer."
            )

    def __call__(self, inputs: Union[str, List[str]], **kwargs):
        """
        Classify every token of the given text(s).

        Args:
            inputs: One string or a list of strings; parsed by ``self._args_parser`` together with
                ``kwargs``.

        Returns:
            For a single parsed input, its list of result dicts; otherwise a list with one such
            list per input. The dicts are per-token entries, or entity groups when
            ``grouped_entities`` is set.
        """
        _inputs, offset_mappings = self._args_parser(inputs, **kwargs)

        answers = []

        for i, sentence in enumerate(_inputs):

            # Manage correct placement of the tensors
            with self.device_placement():

                tokens = self.tokenizer(
                    sentence,
                    return_attention_mask=False,
                    return_tensors=self.framework,
                    truncation=True,
                    return_special_tokens_mask=True,
                    return_offsets_mapping=self.tokenizer.is_fast,
                )
                # Character offsets come from the fast tokenizer when available, otherwise from
                # whatever the argument parser supplied (possibly nothing).
                if self.tokenizer.is_fast:
                    offset_mapping = tokens.pop("offset_mapping").cpu().numpy()[0]
                elif offset_mappings:
                    offset_mapping = offset_mappings[i]
                else:
                    offset_mapping = None

                special_tokens_mask = tokens.pop("special_tokens_mask").cpu().numpy()[0]

                # Forward
                if self.framework == "tf":
                    logits = self.model(tokens.data)[0][0].numpy()
                    input_ids = tokens["input_ids"].numpy()[0]
                else:
                    with torch.no_grad():
                        tokens = self.ensure_tensor_on_device(**tokens)
                        logits = self.model(**tokens)[0][0].cpu().numpy()
                        input_ids = tokens["input_ids"].cpu().numpy()[0]

            # Softmax over the label axis. Subtracting the per-token maximum first does not change
            # the result mathematically but keeps np.exp from overflowing on large logits.
            shifted_exp = np.exp(logits - logits.max(axis=-1, keepdims=True))
            score = shifted_exp / shifted_exp.sum(axis=-1, keepdims=True)
            labels_idx = score.argmax(axis=-1)

            entities = []
            # Filter to labels not in `self.ignore_labels`
            # Filter special_tokens
            filtered_labels_idx = [
                (idx, label_idx)
                for idx, label_idx in enumerate(labels_idx)
                if (self.model.config.id2label[label_idx] not in self.ignore_labels) and not special_tokens_mask[idx]
            ]

            for idx, label_idx in filtered_labels_idx:
                if offset_mapping is not None:
                    start_ind, end_ind = offset_mapping[idx]
                    word_ref = sentence[start_ind:end_ind]
                    word = self.tokenizer.convert_ids_to_tokens([int(input_ids[idx])])[0]
                    # Heuristic: a token whose vocabulary string differs in length from the text
                    # span it covers (e.g. carries a "##" prefix) is treated as a sub-word.
                    is_subword = len(word_ref) != len(word)

                    if int(input_ids[idx]) == self.tokenizer.unk_token_id:
                        # Unknown tokens: report the original text rather than the UNK symbol.
                        word = word_ref
                        is_subword = False
                else:
                    word = self.tokenizer.convert_ids_to_tokens(int(input_ids[idx]))

                    start_ind = None
                    end_ind = None

                entity = {
                    "word": word,
                    "score": score[idx][label_idx].item(),
                    "entity": self.model.config.id2label[label_idx],
                    "index": idx,
                    "start": start_ind,
                    "end": end_ind,
                }

                # `ignore_subwords` implies a fast tokenizer (checked in __init__), so
                # `is_subword` has always been set by the offset branch above at this point.
                if self.grouped_entities and self.ignore_subwords:
                    entity["is_subword"] = is_subword

                entities += [entity]

            if self.grouped_entities:
                answers += [self.group_entities(entities)]
            # Append ungrouped entities
            else:
                answers += [entities]

        if len(answers) == 1:
            return answers[0]
        return answers

    def group_sub_entities(self, entities: List[dict]) -> dict:
        """
        Merge a run of adjacent tokens that belong to the same entity into one entry.

        Args:
            entities (:obj:`List[dict]`): Non-empty list of token entries forming one group.

        Returns:
            A dict with ``entity_group`` (label of the first token without its ``B-``/``I-``
            prefix), ``score`` (mean of the token scores, skipping NaN scores), ``word`` (tokens
            joined by the tokenizer), ``start`` of the first and ``end`` of the last token.
        """
        # The group label is taken from the first token of the group.
        group_label = entities[0]["entity"].split("-")[-1]
        # Ignored sub-words carry a NaN score (see `group_entities`); nanmean skips them.
        mean_score = np.nanmean([token["score"] for token in entities])
        tokens = [token["word"] for token in entities]

        entity_group = {
            "entity_group": group_label,
            "score": mean_score,
            "word": self.tokenizer.convert_tokens_to_string(tokens),
            "start": entities[0]["start"],
            "end": entities[-1]["end"],
        }
        return entity_group

    def group_entities(self, entities: List[dict]) -> List[dict]:
        """
        Find and group together the adjacent tokens with the same entity predicted.

        Args:
            entities (:obj:`List[dict]`): Token entries produced by ``__call__`` for one input,
                in token order.

        Returns:
            One merged entry (see ``group_sub_entities``) per run of grouped tokens; an empty
            list when ``entities`` is empty.
        """

        entity_groups = []
        entity_group_disagg = []

        # Only read inside the loop below, which does not run for empty input.
        if entities:
            last_idx = entities[-1]["index"]

        for entity in entities:

            is_last_idx = entity["index"] == last_idx
            is_subword = self.ignore_subwords and entity["is_subword"]
            if not entity_group_disagg:
                # First token overall: it opens the first group.
                entity_group_disagg += [entity]
                if is_last_idx:
                    entity_groups += [self.group_sub_entities(entity_group_disagg)]
                continue

            # If the current entity is similar and adjacent to the previous entity, append it to the disaggregated entity group
            # The split is meant to account for the "B" and "I" suffixes
            # Shouldn't merge if both entities are B-type
            if (
                (
                    entity["entity"].split("-")[-1] == entity_group_disagg[-1]["entity"].split("-")[-1]
                    and entity["entity"].split("-")[0] != "B"
                )
                and entity["index"] == entity_group_disagg[-1]["index"] + 1
            ) or is_subword:
                # Modify subword type to be previous_type
                if is_subword:
                    entity["entity"] = entity_group_disagg[-1]["entity"].split("-")[-1]
                    entity["score"] = np.nan  # set ignored scores to nan and use np.nanmean

                entity_group_disagg += [entity]
                # Group the entities at the last entity
                if is_last_idx:
                    entity_groups += [self.group_sub_entities(entity_group_disagg)]
            # If the current entity is different from the previous entity, aggregate the disaggregated entity group
            else:
                entity_groups += [self.group_sub_entities(entity_group_disagg)]
                entity_group_disagg = [entity]
                # If it's the last entity, add it to the entity groups
                if is_last_idx:
                    entity_groups += [self.group_sub_entities(entity_group_disagg)]

        return entity_groups

In [30]:
my_ner = MyTokenClassificationPipeline(tokenizer=tokenizer, model=model, grouped_entities=True, ignore_subwords=True)

In [33]:
ner_ungrouped("Elasticsearch is a thing made by Elastic from Amsterdam")

[{'word': '##astic',
  'score': 0.8498260378837585,
  'entity': 'I-MISC',
  'index': 2,
  'start': 2,
  'end': 7},
 {'word': '##sea',
  'score': 0.7780320644378662,
  'entity': 'I-MISC',
  'index': 3,
  'start': 7,
  'end': 10},
 {'word': 'El',
  'score': 0.9576388597488403,
  'entity': 'B-ORG',
  'index': 10,
  'start': 33,
  'end': 35},
 {'word': '##astic',
  'score': 0.8840276002883911,
  'entity': 'I-ORG',
  'index': 11,
  'start': 35,
  'end': 40},
 {'word': 'Amsterdam',
  'score': 0.998390257358551,
  'entity': 'B-LOC',
  'index': 13,
  'start': 46,
  'end': 55}]

In [35]:
ner("Elasticsearch and Kibana are products from Elastic which is based in Amsterdam.")

[{'entity_group': 'ORG',
  'score': 0.9990760087966919,
  'word': 'Elasticsearch',
  'start': 0,
  'end': 13},
 {'entity_group': 'ORG',
  'score': 0.996468722820282,
  'word': 'Kibana',
  'start': 18,
  'end': 24},
 {'entity_group': 'ORG',
  'score': 0.9992861747741699,
  'word': 'Elastic',
  'start': 43,
  'end': 50},
 {'entity_group': 'LOC',
  'score': 0.9995539784431458,
  'word': 'Amsterdam',
  'start': 69,
  'end': 78}]

# Appendix

In [None]:
# Build the URL of the `pytorch_model.bin` weights file for the given model name, resolve it
# through `cached_path`, and display the resulting path as the cell output.
# NOTE(review): presumably `cached_path` downloads the file when it is not cached yet -- confirm.
from transformers.file_utils import hf_bucket_url, cached_path
pretrained_model_name = 'elastic/distilbert-base-cased-finetuned-conll03-english'
archive_file = hf_bucket_url(
    pretrained_model_name,
    filename='pytorch_model.bin',
)
resolved_archive_file = cached_path(archive_file)
resolved_archive_file