In [1]:
import spacy

In [2]:
nlp = spacy.load('en_core_web_sm')

In [13]:
def show_ents(doc):
    """Print every named entity found in ``doc``, one per line.

    Each line has the form ``<text> - <label> - <explanation>``, where the
    explanation is whatever ``spacy.explain`` returns for the entity label.
    When ``doc.ents`` is empty, a single "No entities found." line is
    printed instead.
    """
    entities = doc.ents
    if not entities:
        print("No entities found.")
        return
    for entity in entities:
        label = entity.label_
        print(' - '.join((entity.text, label, str(spacy.explain(label)))))

In [7]:
doc = nlp(u'How are you')

In [10]:
show_ents(doc)

No entities found.


In [11]:
doc = nlp(u" May I go to Washington, DC next May to see the Washington Monument? ")

In [14]:
show_ents(doc)

Washington, DC - GPE - Countries, cities, states
next May - DATE - Absolute or relative dates or periods
the Washington Monument - ORG - Companies, agencies, institutions, etc.


In [15]:
doc =nlp(u"Can I please have 500 dollars of Microsoft stock?")

In [16]:
show_ents(doc)

500 dollars - MONEY - Monetary values, including unit
Microsoft - ORG - Companies, agencies, institutions, etc.


In [17]:
doc = nlp(u"Tesla to build a U.K. factory for $6 million.")

In [18]:
show_ents(doc) ## Tesla is not identified as a company

U.K. - GPE - Countries, cities, states
$6 million - MONEY - Monetary values, including unit


### Adding named entities to a Span

In [19]:
from spacy.tokens import Span

In [20]:
ORG = doc.vocab.strings[u"ORG"] ## Look up the integer ID stored in the vocab for the string "ORG"; it is used below as the entity label

In [21]:
ORG

381

In [25]:
new_ent = Span(doc, 0, 1, label=ORG) ## Span from token 0 up to (not including) token 1 of doc, i.e. the single token "Tesla", labelled with the ORG ID

In [27]:
## Append new_ent to the document's entities. Existing entities that overlap
## new_ent are dropped first, because doc.ents refuses overlapping spans: this
## keeps the cell safe to re-run and safe if the model already tagged that span.
doc.ents = [
    ent for ent in doc.ents
    if ent.end <= new_ent.start or ent.start >= new_ent.end
] + [new_ent]

In [28]:
show_ents(doc) ## Tesla is identified as a company

Tesla - ORG - Companies, agencies, institutions, etc.
U.K. - GPE - Countries, cities, states
$6 million - MONEY - Monetary values, including unit


### Adding named entities to all matching Spans

In [40]:
## Adjacent string literals are joined with nothing in between, so the first
## literal ends with a space to keep the two sentences apart
## ("cleaner. This" rather than "cleaner.This").
doc = nlp(u"Our company created a brand new vaccum cleaner. "
         u"This new vaccum-cleaner is the best in show")

In [41]:
from spacy.matcher import PhraseMatcher

In [42]:
matcher = PhraseMatcher(nlp.vocab)

In [43]:
phrase_list =['vaccum cleaner','vaccum-cleaner']

In [44]:
phrase_patterns = [nlp(text) for text in phrase_list]

In [45]:
## Register every phrase pattern under the match key 'newproduct'.
## The second positional argument (None here) is presumably the optional on-match callback slot; verify against the PhraseMatcher docs.
## NOTE(review): newer spaCy releases expect matcher.add('newproduct', phrase_patterns) and reject this
## positional form; confirm against the installed spaCy version before changing it.
matcher.add('newproduct',None,*phrase_patterns)

In [46]:
found_matches = matcher(doc)

In [47]:
found_matches

[(2689272359382549672, 6, 8), (2689272359382549672, 11, 14)]

In [48]:
from spacy.tokens import Span

In [50]:
PROD = doc.vocab.strings[u"PRODUCT"]

In [52]:
## Build one PRODUCT-labelled Span per match; each match is a (match_id, start, end) triple.
new_ents = [
    Span(doc, start, end, label=PROD)
    for _match_id, start, end in found_matches
]

In [53]:
## Append the matched spans to the document's entities. Existing entities that
## overlap any of the new spans are dropped first, because doc.ents refuses
## overlapping spans: this keeps the cell safe to re-run.
doc.ents = [
    ent for ent in doc.ents
    if all(ent.end <= new.start or ent.start >= new.end for new in new_ents)
] + new_ents

In [54]:
show_ents(doc)

vaccum cleaner - PRODUCT - Objects, vehicles, foods, etc. (not services)
vaccum-cleaner - PRODUCT - Objects, vehicles, foods, etc. (not services)


###  Frequency of entities

In [55]:
doc =nlp(u"Originally I paid $29.95 for this car toy, but now it is marked down by 10 dollars.")

In [56]:
## Count the entities labelled MONEY without building an intermediate list.
sum(1 for ent in doc.ents if ent.label_ == "MONEY")

2

### Visualizing named entity recognition

In [58]:
from spacy import displacy

In [66]:
## Adjacent string literals are joined with nothing in between, so the first
## literal ends with a space to keep the two sentences apart
## ("million. By" rather than "million.By").
## NOTE(review): "thousasnd" looks like a typo for "thousand"; it is left as-is
## because changing it alters the text the model sees. Confirm and fix if unintended.
doc = nlp(u"Over the last quarter Apple sold nearly 20 thousasnd iPods for a profit of $6 million. "
         u"By contrast, Sony only sold 8 thousand Walkman music players." )

In [67]:
displacy.render(doc,style ='ent',jupyter=True)

In [69]:
## Render the entity visualisation separately for each sentence of doc.
for sent in doc.sents:
    ## The sentence text is parsed again with nlp, so the entities shown come from
    ## that per-sentence parse rather than from the annotations already on doc.
    displacy.render(nlp(sent.text),style ='ent',jupyter=True)

In [77]:
## Custom colour for ORG entities, passed to displacy through the 'colors' option below.
## The value is a colour string: here a radial gradient; presumably a plain colour
## name/code or a linear-gradient(start, middle, end) would work as well — TODO confirm.
colors ={'ORG':'radial-gradient(yellow,green)'}

In [78]:
options = {'ents':['PRODUCT','ORG'],'colors':colors}

In [79]:
displacy.render(doc,style ='ent',jupyter=True,options=options)