In [1]:
import spacy

In [94]:
from spacy import displacy

In [2]:
nlp = spacy.load('en_core_web_sm')

## Entity annotations
`Doc.ents` are token spans with their own set of annotations.
<table>
<tr><td>`ent.text`</td><td>The original entity text</td></tr>
<tr><td>`ent.label`</td><td>The entity type's hash value</td></tr>
<tr><td>`ent.label_`</td><td>The entity type's string description</td></tr>
<tr><td>`ent.start`</td><td>The token span's *start* index position in the Doc</td></tr>
<tr><td>`ent.end`</td><td>The token span's *stop* index position in the Doc</td></tr>
<tr><td>`ent.start_char`</td><td>The entity text's *start* index position in the Doc</td></tr>
<tr><td>`ent.end_char`</td><td>The entity text's *stop* index position in the Doc</td></tr>
</table>



In [101]:
def show_ents(doc):
    if doc.ents:
        for ent in doc.ents:
            print(ent.text + ' - ' + ent.label_ + ' - ' + str(spacy.explain(ent.label_)))
        displacy.render(doc, style='ent')
    else:
        print('No NER')

In [102]:
doc = nlp(u'This is Times Square. Current time is 9:45 AM and this is Tuesday. Microsoft building is shining.')

In [103]:
show_ents(doc)

Times Square - FAC - Buildings, airports, highways, bridges, etc.
9:45 AM - TIME - Times smaller than a day
Tuesday - DATE - Absolute or relative dates or periods
Microsoft - ORG - Companies, agencies, institutions, etc.


In [49]:
doc = nlp(u'₹ 30,000 Please pay your bill.')
show_ents(doc)

₹ - ORG - Companies, agencies, institutions, etc.
30,000 - MONEY - Monetary values, including unit


## NER Tags
Tags are accessible through the `.label_` property of an entity.
<table>
<tr><th>TYPE</th><th>DESCRIPTION</th><th>EXAMPLE</th></tr>
<tr><td>`PERSON`</td><td>People, including fictional.</td><td>*Fred Flintstone*</td></tr>
<tr><td>`NORP`</td><td>Nationalities or religious or political groups.</td><td>*The Republican Party*</td></tr>
<tr><td>`FAC`</td><td>Buildings, airports, highways, bridges, etc.</td><td>*Logan International Airport, The Golden Gate*</td></tr>
<tr><td>`ORG`</td><td>Companies, agencies, institutions, etc.</td><td>*Microsoft, FBI, MIT*</td></tr>
<tr><td>`GPE`</td><td>Countries, cities, states.</td><td>*France, UAR, Chicago, Idaho*</td></tr>
<tr><td>`LOC`</td><td>Non-GPE locations, mountain ranges, bodies of water.</td><td>*Europe, Nile River, Midwest*</td></tr>
<tr><td>`PRODUCT`</td><td>Objects, vehicles, foods, etc. (Not services.)</td><td>*Formula 1*</td></tr>
<tr><td>`EVENT`</td><td>Named hurricanes, battles, wars, sports events, etc.</td><td>*Olympic Games*</td></tr>
<tr><td>`WORK_OF_ART`</td><td>Titles of books, songs, etc.</td><td>*The Mona Lisa*</td></tr>
<tr><td>`LAW`</td><td>Named documents made into laws.</td><td>*Roe v. Wade*</td></tr>
<tr><td>`LANGUAGE`</td><td>Any named language.</td><td>*English*</td></tr>
<tr><td>`DATE`</td><td>Absolute or relative dates or periods.</td><td>*20 July 1969*</td></tr>
<tr><td>`TIME`</td><td>Times smaller than a day.</td><td>*Four hours*</td></tr>
<tr><td>`PERCENT`</td><td>Percentage, including "%".</td><td>*Eighty percent*</td></tr>
<tr><td>`MONEY`</td><td>Monetary values, including unit.</td><td>*Twenty Cents*</td></tr>
<tr><td>`QUANTITY`</td><td>Measurements, as of weight or distance.</td><td>*Several kilometers, 55kg*</td></tr>
<tr><td>`ORDINAL`</td><td>"first", "second", etc.</td><td>*9th, Ninth*</td></tr>
<tr><td>`CARDINAL`</td><td>Numerals that do not fall under another type.</td><td>*2, Two, Fifty-two*</td></tr>
</table>

In [39]:
# adding Rs as money
from spacy.tokens import Span

In [54]:
MONEY = doc.vocab.strings[u'MONEY']

In [55]:
MONEY

394

In [61]:
new_ent = Span(doc, 0, 1, label=MONEY)  # Create a new Span for the new entity


In [62]:
doc.ents = [e for e in doc.ents if not (e.start < new_ent.end and e.end > new_ent.start)] + [new_ent]


In [63]:
doc = nlp(u'₹30,000 Please pay your bill.')
show_ents(doc)

₹ - ORG - Companies, agencies, institutions, etc.
30,000 - MONEY - Monetary values, including unit


In [64]:
doc = nlp(u'Our company created new vaccume cleaner.'u'This vaccume-cleaner is the best')

In [65]:
show_ents(doc)

No NER


In [66]:
from spacy.matcher import PhraseMatcher
from spacy.tokens import Span

In [67]:
matcher = PhraseMatcher(nlp.vocab)

In [91]:
def create_NER(phrase_list, entity):
    phrase_pattern = [nlp(text) for text in phrase_list]
    matcher.add('newproduct', None, *phrase_pattern)
    found_matches = matcher(doc)
    print('Match found: ',found_matches)
    
    PROD = doc.vocab.strings[entity]
    new_ents = [Span(doc, match[1], match[2], label=PROD) for match in found_matches]
    doc.ents = list(doc.ents) + new_ents
    
    if len(found_matches) > 0:
        print('Already added!')

In [77]:
found_matches

[(2689272359382549672, 4, 6), (2689272359382549672, 8, 11)]

In [81]:
show_ents(doc)

vaccume cleaner - PRODUCT - Objects, vehicles, foods, etc. (not services)
vaccume-cleaner - PRODUCT - Objects, vehicles, foods, etc. (not services)


In [83]:
doc = nlp(u'Our company created new solar power.'u'This solar-power is the best')

In [84]:
show_ents(doc)

No NER


In [92]:
phrase_list = ['vaccume cleaner', 'vaccume-cleaner']
entity = str(u"PRODUCT")
create_NER(phrase_list, entity)

Match found:  [(2689272359382549672, 4, 6), (2689272359382549672, 8, 11)]


ValueError: [E1010] Unable to set entity information for token 4 which is included in more than one span in entities, blocked, missing or outside.

In [88]:
phrase_list = ['solar power', 'solar-power']
entity = str(u"PRODUCT")
create_NER(phrase_list, entity)

Match found:  [(2689272359382549672, 4, 6), (2689272359382549672, 8, 11)]


ValueError: [E1010] Unable to set entity information for token 4 which is included in more than one span in entities, blocked, missing or outside.

In [93]:
show_ents(doc)

solar power - PRODUCT - Objects, vehicles, foods, etc. (not services)
solar-power - PRODUCT - Objects, vehicles, foods, etc. (not services)


In [104]:
doc = nlp(u'In the last quarter, Apple sold 1 million iPods worth $16 millions')

In [105]:
show_ents(doc)

the last quarter - DATE - Absolute or relative dates or periods
Apple - ORG - Companies, agencies, institutions, etc.
1 million - CARDINAL - Numerals that do not fall under another type
$16 millions - MONEY - Monetary values, including unit


In [112]:
# rendering only specific entity
options = {'ents':['DATE', 'ORG']}
displacy.render(doc, style='ent', options=options)

In [123]:
# rendering specific color to specific entity
colors = {'DATE':'linear-gradient(90deg,#aa9cfc, #fc9ce7)'}
options = {'ents':['DATE', 'ORG'],'colors':colors}

displacy.render(doc, style='ent', options=options)