In [3]:
import torch
from transformers import pipeline, set_seed
from transformers import AutoTokenizer, AutoModel
import transformers
from transformers import AutoTokenizer, AutoModelForTokenClassification

model_checkpoint = "xlm-roberta-large-finetuned-conll03-english"
set_seed(42)

tokenizer = AutoTokenizer.from_pretrained("dslim/bert-base-NER")
model = AutoModelForTokenClassification.from_pretrained("dslim/bert-base-NER")


nlp = pipeline("token-classification", model=model_checkpoint, aggregation_strategy="simple")
text = "Barack Obama was born in Hawaii, and likes Google. Urbana Champaugn is a good place"
entities = nlp(text)#[0]['entity_group']
print(entities)

[{'entity_group': 'PER', 'score': 0.999992, 'word': 'Barack Obama', 'start': 0, 'end': 12}, {'entity_group': 'LOC', 'score': 0.9999887, 'word': 'Hawaii', 'start': 25, 'end': 31}, {'entity_group': 'ORG', 'score': 0.999961, 'word': 'Google', 'start': 43, 'end': 49}, {'entity_group': 'LOC', 'score': 0.98065805, 'word': 'Urbana Champaugn', 'start': 51, 'end': 67}]


In [2]:
labels = set([entity["entity_group"] for entity in entities])
print({"text": text, "entities": entities, "labels": list(labels)})

{'text': 'Barack Obama was born in Hawaii, and likes Google. Urbana Champaugn is a good place', 'entities': [{'entity_group': 'PER', 'score': 0.999992, 'word': 'Barack Obama', 'start': 0, 'end': 12}, {'entity_group': 'LOC', 'score': 0.9999887, 'word': 'Hawaii', 'start': 25, 'end': 31}, {'entity_group': 'ORG', 'score': 0.999961, 'word': 'Google', 'start': 43, 'end': 49}, {'entity_group': 'LOC', 'score': 0.98065805, 'word': 'Urbana Champaugn', 'start': 51, 'end': 67}], 'labels': ['PER', 'LOC', 'ORG']}


In [None]:
# modified_html = ""
    # for word in words:
    #   if word == "he":
    #     modified_html += "<span style='background-color: purple;'>" + word + "</span> "
    #   else:
    #     modified_html += word + " "
    # return modified_html 


In [2]:
def highlight_entities(text, entities):
    color_map = {
        "PER": "blue",
        "LOC": "green",
        "ORG": "red"
    }
    new_text = ""
    last_end = 0
    for entity in entities:
        label = entity["entity_group"]
        start = entity["start"]
        end = entity["end"]
        new_text += text[last_end:start]
        color = color_map.get(label, "black")
        new_text += f'<span style="color: {color};">{text[start:end]}</span>'
        last_end = end
    new_text += text[last_end:]
    return new_text

In [3]:
entities = nlp(text)
new_text = highlight_entities(text, entities)
print(new_text)

<span style="color: blue;">Barack Obama</span> was born in <span style="color: green;">Hawaii</span>, and likes <span style="color: red;">Google</span>. <span style="color: green;">Urbana Champaugn</span> is a good place


In [4]:
from transformers import pipeline

ner = pipeline("ner", model="bert-base-cased", tokenizer="bert-base-cased")

text = "Apple is looking at buying U.K. startup for $1 billion"

entities = ner(text)
print(entities)
#for ent in entities:
 #   if ent["entity"] != "O":
  #      start = ent["start"]
   #     end = ent["end"]
    #    entity_type = ent["entity"]
     #   print(f"{entity_type}: {text[start:end]}")

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForTokenClassification: ['cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cas

[{'entity': 'LABEL_0', 'score': 0.57435954, 'index': 1, 'word': 'Apple', 'start': 0, 'end': 5}, {'entity': 'LABEL_0', 'score': 0.5643061, 'index': 2, 'word': 'is', 'start': 6, 'end': 8}, {'entity': 'LABEL_0', 'score': 0.5997141, 'index': 3, 'word': 'looking', 'start': 9, 'end': 16}, {'entity': 'LABEL_1', 'score': 0.5410624, 'index': 4, 'word': 'at', 'start': 17, 'end': 19}, {'entity': 'LABEL_0', 'score': 0.51501936, 'index': 5, 'word': 'buying', 'start': 20, 'end': 26}, {'entity': 'LABEL_0', 'score': 0.680543, 'index': 6, 'word': 'U', 'start': 27, 'end': 28}, {'entity': 'LABEL_0', 'score': 0.612117, 'index': 7, 'word': '.', 'start': 28, 'end': 29}, {'entity': 'LABEL_0', 'score': 0.57498014, 'index': 8, 'word': 'K', 'start': 29, 'end': 30}, {'entity': 'LABEL_0', 'score': 0.59091127, 'index': 9, 'word': '.', 'start': 30, 'end': 31}, {'entity': 'LABEL_0', 'score': 0.51981264, 'index': 10, 'word': 'start', 'start': 32, 'end': 37}, {'entity': 'LABEL_0', 'score': 0.6103272, 'index': 11, 'wor

In [4]:
from ipymarkup import show_span_ascii_markup, show_dep_ascii_markup
from ipymarkup import show_span_box_markup
from ipymarkup.palette import palette, BLUE, RED, GREEN
from ipymarkup import format_span_box_markup


text = 'Barack Obama was born in Hawaii, and likes Google. Urbana Champaugn is a good place'

spans = []
for entity in entities:
    #     label = entity["entity_group"]
    #     start = entity["start"]
    #     end = entity["end"]
    #     new_text += text[last_end:start]
    #     color = color_map.get(label, "black")
    #     new_text += f'<span style="color: {color};">{text[start:end]}</span>'
    #     last_end = end
    # new_text += text[last_end:
    spans.append((entity["start"], entity["end"], entity["entity_group"]))

#show_span_ascii_markup(text, spans)

show_span_box_markup(text, spans, palette=palette(PER=BLUE, ORG=RED, LOC=GREEN))

list(format_span_box_markup(text, spans))


['<div class="tex2jax_ignore" style="white-space: pre-wrap">',
 '',
 '<span style="padding: 2px; border-radius: 4px; border: 1px solid #bbdefb; background: #e3f2fd">',
 'Barack Obama',
 '<span style="vertical-align: middle; margin-left: 2px; font-size: 0.7em; color: #64b5f6;">',
 'PER',
 '</span>',
 '</span>',
 ' was born in ',
 '<span style="padding: 2px; border-radius: 4px; border: 1px solid #c8e6c9; background: #e8f5e9">',
 'Hawaii',
 '<span style="vertical-align: middle; margin-left: 2px; font-size: 0.7em; color: #66bb6a;">',
 'LOC',
 '</span>',
 '</span>',
 ', and likes ',
 '<span style="padding: 2px; border-radius: 4px; border: 1px solid #ffcdd2; background: #ffebee">',
 'Google',
 '<span style="vertical-align: middle; margin-left: 2px; font-size: 0.7em; color: #e57373;">',
 'ORG',
 '</span>',
 '</span>',
 '. ',
 '<span style="padding: 2px; border-radius: 4px; border: 1px solid #c8e6c9; background: #e8f5e9">',
 'Urbana Champaugn',
 '<span style="vertical-align: middle; margin-lef