# Experiments with NER on MS MARCO Document dataset

This notebook is a sandbox for using 🤗 Transformers for NER in the indexing pipeline and at search time for MS MARCO Document ranking.

In [171]:
from transformers import pipeline, AutoTokenizer, AutoModelForTokenClassification

# Checkpoint used throughout this notebook. Other checkpoints are kept here as
# alternatives; assign one of them to `model_name` to try it:
#   'dbmdz/bert-base-cased-finetuned-conll03-english'
#   'sshleifer/tiny-dbmdz-bert-large-cased-finetuned-conll03-english'
#   'mrm8488/mobilebert-finetuned-ner'
model_name = 'elastic/distilbert-base-cased-finetuned-conll03-english'

# One tokenizer and one model instance are shared by both pipelines below.
tokenizer = AutoTokenizer.from_pretrained(model_name, model_max_length=512)
model = AutoModelForTokenClassification.from_pretrained(model_name)

# `ner` returns grouped entities; `ner_ungrouped` returns one entry per token.
# NOTE(review): unlike `quantized_ner_ungrouped`, `ner_ungrouped` does not pass
# `ignore_labels=[]`, so the two ungrouped outputs are not directly comparable.
ner = pipeline(
    'ner',
    model=model,
    tokenizer=tokenizer,
    grouped_entities=True,
    ignore_subwords=True,
)
ner_ungrouped = pipeline(
    'ner',
    model=model,
    tokenizer=tokenizer,
    grouped_entities=False,
)

In [190]:
# From:
#  - https://pytorch.org/tutorials/intermediate/dynamic_quantization_bert_tutorial.html
#  - https://pytorch.org/docs/stable/quantization.html#dynamic-quantization

import os
import torch

def print_size_of_model(model):
    """Print the on-disk size, in megabytes, of the model's serialized state dict.

    The state dict is written with ``torch.save`` to a uniquely named temporary
    file, its size is printed as ``Size (MB): <value>`` (bytes / 1e6), and the
    file is removed again.

    Args:
        model: Object exposing ``state_dict()`` whose result ``torch.save`` can
            serialize (e.g. a ``torch.nn.Module``).

    Raises:
        Whatever ``model.state_dict()``, ``torch.save`` or the file system raise;
        the temporary file is removed in that case as well.
    """
    # Local import so this cell does not depend on an edit to the import lines.
    import tempfile

    # A unique temp file avoids clobbering (and then deleting) an unrelated
    # "temp.p" that may already exist in the current working directory.
    handle, tmp_path = tempfile.mkstemp(suffix=".p")
    os.close(handle)  # torch.save reopens the path itself
    try:
        torch.save(model.state_dict(), tmp_path)
        print('Size (MB):', os.path.getsize(tmp_path)/1e6)
    finally:
        # Always clean up, even when serialization fails half-way.
        os.remove(tmp_path)

# Move the model to CPU, then build a dynamically quantized copy in which the
# torch.nn.Linear modules use qint8.
model.to("cpu")
quantized_model = torch.quantization.quantize_dynamic(
    model,
    {torch.nn.Linear},
    dtype=torch.qint8,
)

# Serialized size of the original model first, the quantized one second.
print_size_of_model(model)
print_size_of_model(quantized_model)

# Quantized counterparts of `ner` / `ner_ungrouped`, sharing the same tokenizer.
# The ungrouped variant additionally passes `ignore_labels=[]`.
quantized_ner = pipeline(
    'ner',
    model=quantized_model,
    tokenizer=tokenizer,
    grouped_entities=True,
    ignore_subwords=True,
)
quantized_ner_ungrouped = pipeline(
    'ner',
    model=quantized_model,
    tokenizer=tokenizer,
    grouped_entities=False,
    ignore_labels=[],
)

Size (MB): 260.832555
Size (MB): 133.440079


# Experiments

## Transformers

In [179]:
%%time
# Grouped-entity pipeline (`ner`) on the three-sentence sample text.
ner("Elasticsearch and Kibana are products from Elastic which is based in Amsterdam. In the context of using Elasticsearch for US Bank, we see similarities with other observability use-cases. We spoke with John Smith and his collegaues from Accenture who confirmed to Max Mustermann at US Bank that their use-case would fit.")

CPU times: user 117 ms, sys: 7.39 ms, total: 125 ms
Wall time: 120 ms


[{'entity_group': 'ORG',
  'score': 0.9840184450149536,
  'word': 'Elasticsearch',
  'start': 0,
  'end': 13},
 {'entity_group': 'ORG',
  'score': 0.968928337097168,
  'word': 'Kibana',
  'start': 18,
  'end': 24},
 {'entity_group': 'ORG',
  'score': 0.9917063117027283,
  'word': 'Elastic',
  'start': 43,
  'end': 50},
 {'entity_group': 'LOC',
  'score': 0.9993715286254883,
  'word': 'Amsterdam',
  'start': 69,
  'end': 78},
 {'entity_group': 'ORG',
  'score': 0.7701449394226074,
  'word': 'Elastic',
  'start': 104,
  'end': 111},
 {'entity_group': 'ORG',
  'score': 0.9989482462406158,
  'word': 'US Bank',
  'start': 122,
  'end': 129},
 {'entity_group': 'PER',
  'score': 0.9997017085552216,
  'word': 'John Smith',
  'start': 201,
  'end': 211},
 {'entity_group': 'ORG',
  'score': 0.9980860352516174,
  'word': 'Accenture',
  'start': 236,
  'end': 245},
 {'entity_group': 'PER',
  'score': 0.9989999532699585,
  'word': 'Max Musterman',
  'start': 263,
  'end': 276},
 {'entity_group': 'O

In [180]:
%%time
# Same three-sentence text, this time through the pipeline built on the dynamically quantized model.
quantized_ner("Elasticsearch and Kibana are products from Elastic which is based in Amsterdam. In the context of using Elasticsearch for US Bank, we see similarities with other observability use-cases. We spoke with John Smith and his collegaues from Accenture who confirmed to Max Mustermann at US Bank that their use-case would fit.")

CPU times: user 107 ms, sys: 8.53 ms, total: 116 ms
Wall time: 55.7 ms


[{'entity_group': 'MISC',
  'score': 0.3697337210178375,
  'word': '##sea',
  'start': 7,
  'end': 10},
 {'entity_group': 'ORG',
  'score': 0.5805865526199341,
  'word': 'Kibana',
  'start': 18,
  'end': 24},
 {'entity_group': 'ORG',
  'score': 0.8589796423912048,
  'word': 'Elastic',
  'start': 43,
  'end': 50},
 {'entity_group': 'LOC',
  'score': 0.9893618822097778,
  'word': 'Amsterdam',
  'start': 69,
  'end': 78},
 {'entity_group': 'ORG',
  'score': 0.3766401708126068,
  'word': 'Elastic',
  'start': 104,
  'end': 111},
 {'entity_group': 'ORG',
  'score': 0.9852622449398041,
  'word': 'US Bank',
  'start': 122,
  'end': 129},
 {'entity_group': 'PER',
  'score': 0.9986754655838013,
  'word': 'John Smith',
  'start': 201,
  'end': 211},
 {'entity_group': 'ORG',
  'score': 0.6226997375488281,
  'word': 'A',
  'start': 236,
  'end': 237},
 {'entity_group': 'PER',
  'score': 0.9859289228916168,
  'word': 'Max Musterman',
  'start': 263,
  'end': 276},
 {'entity_group': 'ORG',
  'score'

In [181]:
%%time
# Grouped-entity pipeline on a single short sentence.
ner("Elasticsearch and Kibana are products from Elastic which is based in Amsterdam.")

CPU times: user 63.2 ms, sys: 3.91 ms, total: 67.1 ms
Wall time: 65.5 ms


[{'entity_group': 'ORG',
  'score': 0.9990760087966919,
  'word': 'Elasticsearch',
  'start': 0,
  'end': 13},
 {'entity_group': 'ORG',
  'score': 0.996468722820282,
  'word': 'Kibana',
  'start': 18,
  'end': 24},
 {'entity_group': 'ORG',
  'score': 0.9992861747741699,
  'word': 'Elastic',
  'start': 43,
  'end': 50},
 {'entity_group': 'LOC',
  'score': 0.9995539784431458,
  'word': 'Amsterdam',
  'start': 69,
  'end': 78}]

In [182]:
%%time
# Grouped-entity pipeline on question-style input (two questions in one string).
ner("What were the key parts of the Manhattan Project? Does it take place in New York or is that just something that people read about on Wikipedia?")

CPU times: user 69.4 ms, sys: 4.43 ms, total: 73.8 ms
Wall time: 71.4 ms


[{'entity_group': 'MISC',
  'score': 0.7096573710441589,
  'word': 'Manhattan Project',
  'start': 31,
  'end': 48},
 {'entity_group': 'LOC',
  'score': 0.9995931386947632,
  'word': 'New York',
  'start': 72,
  'end': 80},
 {'entity_group': 'ORG',
  'score': 0.9316956996917725,
  'word': 'Wikipedia',
  'start': 133,
  'end': 142}]

In [183]:
%%time
# Per-token view of the three-sentence text: `ner_ungrouped` was created with grouped_entities=False.
ner_ungrouped("Elasticsearch and Kibana are products from Elastic which is based in Amsterdam. In the context of using Elasticsearch for US Bank, we see similarities with other observability use-cases. We spoke with John Smith and his collegaues from Accenture who confirmed to Max Mustermann at US Bank that their use-case would fit.")

CPU times: user 122 ms, sys: 6.23 ms, total: 128 ms
Wall time: 124 ms


[{'word': 'El',
  'score': 0.9840184450149536,
  'entity': 'B-ORG',
  'index': 1,
  'start': 0,
  'end': 2},
 {'word': '##astic',
  'score': 0.8492672443389893,
  'entity': 'I-ORG',
  'index': 2,
  'start': 2,
  'end': 7},
 {'word': '##sea',
  'score': 0.7870254516601562,
  'entity': 'I-ORG',
  'index': 3,
  'start': 7,
  'end': 10},
 {'word': '##rch',
  'score': 0.5311601758003235,
  'entity': 'I-ORG',
  'index': 4,
  'start': 10,
  'end': 13},
 {'word': 'Ki',
  'score': 0.968928337097168,
  'entity': 'B-ORG',
  'index': 6,
  'start': 18,
  'end': 20},
 {'word': '##bana',
  'score': 0.9857085347175598,
  'entity': 'I-ORG',
  'index': 7,
  'start': 20,
  'end': 24},
 {'word': 'El',
  'score': 0.9917063117027283,
  'entity': 'B-ORG',
  'index': 11,
  'start': 43,
  'end': 45},
 {'word': '##astic',
  'score': 0.895086407661438,
  'entity': 'I-ORG',
  'index': 12,
  'start': 45,
  'end': 50},
 {'word': 'Amsterdam',
  'score': 0.9993715286254883,
  'entity': 'B-LOC',
  'index': 17,
  'star

In [191]:
%%time
# Per-token view from the quantized model. This pipeline was created with ignore_labels=[]
# (unlike `ner_ungrouped`), and its output below also lists tokens labelled 'O'.
quantized_ner_ungrouped("Elasticsearch and Kibana are products from Elastic which is based in Amsterdam. In the context of using Elasticsearch for US Bank, we see similarities with other observability use-cases. We spoke with John Smith and his collegaues from Accenture who confirmed to Max Mustermann at US Bank that their use-case would fit.")

CPU times: user 120 ms, sys: 12.7 ms, total: 133 ms
Wall time: 61.6 ms


[{'word': 'El',
  'score': 0.5598241090774536,
  'entity': 'O',
  'index': 1,
  'start': 0,
  'end': 2},
 {'word': '##astic',
  'score': 0.6717578768730164,
  'entity': 'O',
  'index': 2,
  'start': 2,
  'end': 7},
 {'word': '##sea',
  'score': 0.3697337210178375,
  'entity': 'I-MISC',
  'index': 3,
  'start': 7,
  'end': 10},
 {'word': '##rch',
  'score': 0.7620847821235657,
  'entity': 'O',
  'index': 4,
  'start': 10,
  'end': 13},
 {'word': 'and',
  'score': 0.9988463521003723,
  'entity': 'O',
  'index': 5,
  'start': 14,
  'end': 17},
 {'word': 'Ki',
  'score': 0.5805865526199341,
  'entity': 'B-ORG',
  'index': 6,
  'start': 18,
  'end': 20},
 {'word': '##bana',
  'score': 0.8712792992591858,
  'entity': 'I-ORG',
  'index': 7,
  'start': 20,
  'end': 24},
 {'word': 'are',
  'score': 0.9986699819564819,
  'entity': 'O',
  'index': 8,
  'start': 25,
  'end': 28},
 {'word': 'products',
  'score': 0.9995617270469666,
  'entity': 'O',
  'index': 9,
  'start': 29,
  'end': 37},
 {'wor

## flair

In [127]:
from flair.models import SequenceTagger as FlairSequenceTagger
from flair.data import Sentence as FlairSentence
from syntok import segmenter

In [128]:
# Load flair's 'ner-fast' tagger once; `flair_extract` reads this module-level `tagger`.
tagger = FlairSequenceTagger.load('ner-fast')

def flair_extract(text):
    """Return the set of distinct entity strings the flair tagger finds in ``text``.

    ``text`` is split with ``segmenter.process`` into paragraphs of sentences of
    tokens. Each sentence is handed to flair as an already-tokenized
    ``FlairSentence``, tagged with the module-level ``tagger``, and the text of
    every span returned by ``get_spans('ner')`` is collected.

    Args:
        text: Raw input string to analyse.

    Returns:
        A ``set`` of entity texts; duplicates across sentences collapse to one entry.
    """
    entity_texts = set()
    for paragraph in segmenter.process(text):
        for sentence_tokens in paragraph:
            token_values = [token.value for token in sentence_tokens]
            # The token list is passed as-is, with flair's own tokenizer disabled.
            flair_sentence = FlairSentence(token_values, use_tokenizer=False)
            tagger.predict(flair_sentence)
            entity_texts.update(span.text for span in flair_sentence.get_spans('ner'))
    return entity_texts

2021-02-10 11:42:03,726 loading file /Users/josh/.flair/models/en-ner-fast-conll03-v0.4.pt


In [133]:
%%time
# flair on a two-sentence input; the result is a set of distinct entity strings.
flair_extract("In the context of using Elasticsearch for US Bank, we see similarities with other observability use-cases. We spoke with John Smith and his collegaues from Accenture who confirmed to Max Mustermann at US Bank that their use-case would fit.")

CPU times: user 487 ms, sys: 5.36 ms, total: 492 ms
Wall time: 493 ms


{'Accenture', 'Elasticsearch', 'John Smith', 'Max Mustermann', 'US Bank'}

In [134]:
%%time
# flair on a single short sentence.
flair_extract("Elasticsearch and Kibana are products from Elastic which is based in Amsterdam.")

CPU times: user 161 ms, sys: 2.59 ms, total: 164 ms
Wall time: 164 ms


{'Amsterdam', 'Elastic', 'Elasticsearch', 'Kibana'}

# Appendix

In [163]:
from transformers.file_utils import hf_bucket_url, cached_path
# Look up where the checkpoint's 'pytorch_model.bin' lives locally: build its
# hub URL with hf_bucket_url(), then resolve that URL through cached_path().
pretrained_model_name = 'elastic/distilbert-base-cased-finetuned-conll03-english'
archive_file = hf_bucket_url(pretrained_model_name, filename='pytorch_model.bin')
resolved_archive_file = cached_path(archive_file)

# Display the resolved path as the cell result.
resolved_archive_file

'/Users/josh/.cache/huggingface/transformers/db0fa06199e910d8ccf857e250e52ca30e519e20c1ec975834a154a825ffb09a.31511ec4de414fb6d0efdd7619b137c45dc2382fe19be0adbc9a78f8b7941f6b'