# Experiments with NER on MS MARCO Document dataset

This notebook is a sandbox for using 🤗 Transformers for NER in the indexing pipeline and at search time for MS MARCO Document ranking.

In [44]:
from transformers import pipeline, AutoTokenizer, AutoModelForTokenClassification

# Alternative checkpoint, kept for comparison; swap the two assignments to try it.
# model_name = 'dbmdz/bert-base-cased-finetuned-conll03-english'
model_name = 'mrm8488/mobilebert-finetuned-ner'

# Tokenizer and model are loaded from the same checkpoint name so that the
# vocabulary and the label set match.
tokenizer = AutoTokenizer.from_pretrained(model_name, model_max_length=512)
model = AutoModelForTokenClassification.from_pretrained(model_name)

# `ignore_labels` is documented as a list of label names, so pass ["O"] rather
# than a bare "O" string. Grouping is enabled so that each result is a whole
# entity ('entity_group', 'score', 'word', 'start', 'end') instead of one entry
# per token.
ner = pipeline(
    'ner',
    tokenizer=tokenizer,
    model=model,
    ignore_subwords=True,
    grouped_entities=True,
    ignore_labels=["O"],
)

In [45]:
%%time
# Sample mixing organisation and person names. Keep the string verbatim
# (including the "collegaues" misspelling): the 'start'/'end' character offsets
# in the recorded output refer to this exact text. Note that in the recorded
# output only the trailing word piece '##rch' of "Elasticsearch" is tagged.
ner("In the context of using Elasticsearch for US Bank, we see similarities with other observability use-cases. We spoke with John Smith and his collegaues from Accenture who confirmed to Max Mustermann at US Bank that their use-case would fit.")

CPU times: user 91.1 ms, sys: 10 ms, total: 101 ms
Wall time: 101 ms


[{'entity_group': 'MISC',
  'score': 0.7413530349731445,
  'word': '##rch',
  'start': 34,
  'end': 37},
 {'entity_group': 'ORG',
  'score': 0.9833260774612427,
  'word': 'us bank',
  'start': 42,
  'end': 49},
 {'entity_group': 'PER',
  'score': 0.9997989535331726,
  'word': 'john smith',
  'start': 121,
  'end': 131},
 {'entity_group': 'ORG',
  'score': 0.9999570250511169,
  'word': 'accenture',
  'start': 156,
  'end': 165},
 {'entity_group': 'PER',
  'score': 0.999747097492218,
  'word': 'max mustermann',
  'start': 183,
  'end': 197},
 {'entity_group': 'ORG',
  'score': 0.9998007118701935,
  'word': 'us bank',
  'start': 201,
  'end': 208}]

In [46]:
%%time
# Short sample with product, company and location names; in the recorded
# output the two product names come back labelled ORG.
ner("Elasticsearch and Kibana are products from Elastic which is based in Amsterdam.")

CPU times: user 58.3 ms, sys: 5.76 ms, total: 64 ms
Wall time: 64.1 ms


[{'entity_group': 'ORG',
  'score': 0.9763613343238831,
  'word': 'elasticsearch',
  'start': 0,
  'end': 13},
 {'entity_group': 'ORG',
  'score': 0.9148470163345337,
  'word': 'kibana',
  'start': 18,
  'end': 24},
 {'entity_group': 'ORG',
  'score': 0.9994836449623108,
  'word': 'elastic',
  'start': 43,
  'end': 50},
 {'entity_group': 'LOC',
  'score': 0.9981455206871033,
  'word': 'amsterdam',
  'start': 69,
  'end': 78}]

In [47]:
%%time
# Question-style input, presumably standing in for a search-time query
# (the notebook intro mentions running NER at search time).
ner("What were the key parts of the Manhattan Project? Does it take place in New York or is that just something that people read about on Wikipedia?")

CPU times: user 74 ms, sys: 3.55 ms, total: 77.5 ms
Wall time: 78.9 ms


[{'entity_group': 'MISC',
  'score': 0.7912053167819977,
  'word': 'manhattan project',
  'start': 31,
  'end': 48},
 {'entity_group': 'LOC',
  'score': 0.9998374581336975,
  'word': 'new york',
  'start': 72,
  'end': 80},
 {'entity_group': 'MISC',
  'score': 0.967934250831604,
  'word': 'wikipedia',
  'start': 133,
  'end': 142}]

In [1]:
from flair.models import SequenceTagger as FlairSequenceTagger
from flair.data import Sentence as FlairSentence
from syntok import segmenter

In [22]:
# Load the 'ner-fast' Flair tagger once at module level; flair_extract()
# reads this global on every call instead of reloading the model.
tagger = FlairSequenceTagger.load('ner-fast')

def flair_extract(text):
    """Return the distinct entity strings that the Flair tagger finds in ``text``.

    The text is segmented into paragraphs, sentences and tokens with syntok.
    Each sentence is wrapped as a pre-tokenized Flair sentence and tagged with
    the module-level ``tagger``.

    Args:
        text: Raw input string. Empty input returns an empty set without
            touching the segmenter or the tagger.

    Returns:
        A ``set`` holding the ``.text`` of every 'ner' span. Duplicates
        collapse, and entity labels and positions are discarded.
    """
    if not text:
        return set()

    # Build every sentence up front so the tagger can process them in batches
    # rather than running one forward pass per sentence.
    flair_sentences = [
        FlairSentence([token.value for token in tokens], use_tokenizer=False)
        for sentences in segmenter.process(text)
        for tokens in sentences
    ]
    if not flair_sentences:
        return set()

    # predict() annotates the sentence objects in place, so the spans can be
    # read back from the same list afterwards.
    tagger.predict(flair_sentences)

    return {
        entity.text
        for flair_sentence in flair_sentences
        for entity in flair_sentence.get_spans('ner')
    }

2021-02-08 16:12:24,965 loading file /Users/josh/.flair/models/en-ner-fast-conll03-v0.4.pt


In [23]:
%%time
# Same sentence as the first Transformers `ner(...)` sample, so that the
# extracted entities and the %%time figures can be compared side by side.
flair_extract("In the context of using Elasticsearch for US Bank, we see similarities with other observability use-cases. We spoke with John Smith and his collegaues from Accenture who confirmed to Max Mustermann at US Bank that their use-case would fit.")

CPU times: user 401 ms, sys: 20.4 ms, total: 422 ms
Wall time: 433 ms


{'Accenture', 'Elasticsearch', 'John Smith', 'Max Mustermann', 'US Bank'}

In [24]:
%%time
# Same sentence as the second Transformers `ner(...)` sample, for comparison.
flair_extract("Elasticsearch and Kibana are products from Elastic which is based in Amsterdam.")

CPU times: user 152 ms, sys: 3.43 ms, total: 155 ms
Wall time: 157 ms


{'Amsterdam', 'Elastic', 'Elasticsearch', 'Kibana'}