## POS_count
count the pos in the text
coarse pos:noun,verb,adjective
fine-grained tags:plural noun,past-tense verb,superlative adjective

##### here we just want to know the order how the word is used, becasue the same word,when used in different places, can mean very differently

In [16]:
import spacy

In [17]:
nlp=spacy.load('en_core_web_sm')

In [18]:
doc=nlp(u"The quick brown fox jumped over the lazy dog's back")

In [19]:
print(doc.text)

The quick brown fox jumped over the lazy dog's back


In [20]:
print(doc[4])

jumped


In [21]:
print(doc[4].tag_) #print the fine grade tag, do not forget the underline _

VBD


In [22]:
print(doc[4].tag) # if no undersocre will just give us the numerical ID, sometimes ID is useful for looking information

17109001835818727656


In [23]:
print(doc[4].pos_)  #print the coarse tag

VERB


In [24]:
#run a for loop for the text
for token in doc:
    print(f'{token.text:{10}}   {token.pos_:{10}}   {token.tag_:{10}}   {spacy.explain(token.tag_)}')

The          DET          DT           determiner
quick        ADJ          JJ           adjective
brown        ADJ          JJ           adjective
fox          NOUN         NN           noun, singular or mass
jumped       VERB         VBD          verb, past tense
over         ADP          IN           conjunction, subordinating or preposition
the          DET          DT           determiner
lazy         ADJ          JJ           adjective
dog          NOUN         NN           noun, singular or mass
's           PART         POS          possessive ending
back         NOUN         NN           noun, singular or mass


In [25]:
doc1=nlp(u"I read books on NLP")

In [26]:
token=doc[1]
print(f'{token.text:{10}}   {token.pos_:{10}}   {token.tag_:{10}}   {spacy.explain(token.tag_)}')

quick        ADJ          JJ           adjective


In [27]:
doc2=nlp(u"I read a book on NLP")
token=doc1[1]
print(f'{token.text:{10}}   {token.pos_:{10}}   {token.tag_:{10}}   {spacy.explain(token.tag_)}')

read         VERB         VBD          verb, past tense


### POS_counts

In [28]:
POS_counts=doc.count_by(spacy.attrs.POS)

In [29]:
POS_counts

{90: 2, 84: 3, 92: 3, 100: 1, 85: 1, 94: 1}

In [30]:
doc.vocab[95].text,doc.vocab[96].text,doc.vocab[92].text

('PRON', 'PROPN', 'NOUN')

In [31]:
doc[2].pos

84

In [32]:
for k,v in sorted(POS_counts.items()):
    print(f"{k}. {doc.vocab[k].text:{5}} {v}")

84. ADJ   3
85. ADP   1
90. DET   2
92. NOUN  3
94. PART  1
100. VERB  1


In [33]:
#do the same for fine-grained tags

In [34]:
TAG_counts=doc1.count_by(spacy.attrs.TAG)
for k,v in sorted(TAG_counts.items()):
    print(f"{k}. {doc.vocab[k].text:{5}} {spacy.explain(doc.vocab[k].text)} {v}")

783433942507015291. NNS   noun, plural 1
1292078113972184607. IN    conjunction, subordinating or preposition 1
13656873538139661788. PRP   pronoun, personal 1
15794550382381185553. NNP   noun, proper singular 1
17109001835818727656. VBD   verb, past tense 1


In [35]:
len(doc.vocab)

506

In [36]:
len(doc1.vocab)

506

In [37]:
DEP_counts=doc1.count_by(spacy.attrs.DEP)
for k,v in sorted(DEP_counts.items()):
    print(f"{k}. {doc.vocab[k].text:{5}} {spacy.explain(doc.vocab[k].text)} {v}")

416. dobj  direct object 1
429. nsubj nominal subject 1
439. pobj  object of preposition 1
443. prep  prepositional modifier 1
8206900633647566924. ROOT  None 1


## vitualize part of the speech
displacy

In [38]:
from spacy import displacy

In [39]:
displacy.render(doc)

In [40]:
options={"distance":90,"compact":True,"color":'yellow','bg':"#09a3d5",'font':'Times'}

In [41]:
displacy.render(doc,style='dep',jupyter=True,options=options)

In [42]:
doc3=nlp(u"This is a sentence,. This is another sentence, possibly longer than the other.")

In [43]:
spans=list(doc3.sents) #.sents just separate your text into sentences ! this is useful

In [None]:
#visualize in the serve
displacy.serve(spans,style='dep',options={'distance':110})

  "__main__", mod_spec)



Using the 'dep' visualizer
Serving on http://0.0.0.0:5000 ...



# Named Entity Recognition(NER)
NER seeks to locate and classify named entity mentions in unstructured text into pre-defined categories!
such as person names,organizations,locations,medical codes,time expressions,quantities,monetary values,percentages

In [None]:
import spacy
nlp=spacy.load('en_core_web_sm')

In [None]:
def show_ents(doc):
    if doc.ents:
        for ent in doc.ents:
            print(ent.text+'-'+ent.label_+'-'+str(spacy.explain(ent.label_)))
    else:
        print("No Entities Found")

In [None]:
doc4=nlp(u'Hi how are you ?')

In [None]:
show_ents(doc4)

In [None]:
doc5=nlp(u'May I go to Washington,DC next May to see the Washington Monument')

In [None]:
show_ents(doc5)

In [None]:
doc6=nlp(u"Tesla to build a UK factory for $6 million")

In [None]:
show_ents(doc6)

In [None]:
from spacy.tokens import Span

In [None]:
ORG=doc6.vocab.strings[u"ORG"]  #the hash value 
ORG

In [None]:
new_ent=Span(doc6,0,1,label=ORG)
doc6.ents=list(doc6.ents)+[new_ent]

In [None]:
show_ents(doc6)

#### custormized the entities
-add a single term as our own NER
-have several terms to add as possible NERs

In [None]:
doc7=nlp(u"Our company created a brand new vacuum cleaner."
        u"This new vacuum-cleaner is the best in show.")

In [None]:
show_ents(doc7)

In [None]:
from spacy.matcher import PhraseMatcher 

In [None]:
matcher=PhraseMatcher(nlp.vocab)

In [None]:
phrase_list=['vacuum cleaner','vacuum-cleaner']

In [None]:
phrase_patterns=[nlp(text) for text in phrase_list]

In [None]:
matcher.add('newproduct',None,*phrase_patterns)

In [None]:
found_matches=matcher(doc7)
found_matches

In [None]:
from spacy.tokens import Span

In [None]:
PROD=doc7.vocab.strings[u"PRODUCT"]

In [None]:
new_ents=[Span(doc7,match[1],match[2],label=PROD) for match in found_matches]

In [None]:
doc7.ents=list(doc7.ents)+new_ents

In [None]:
show_ents(doc7)

In [None]:
doc8=nlp(u'Oringinally I paid $29.95 for this car toy, but now it is marked down by 10 dollars.')

In [None]:
[ent for ent in doc8.ents ]

In [None]:
[ent for ent in doc8.ents if ent.label_=='MONEY']

## Visualizing NER

In [34]:
import spacy
nlp=spacy.load('en_core_web_sm')

In [35]:
from spacy import displacy

In [36]:
doc9=nlp(u'Over the last quarter Apple sold nearly 20 thousand iPods for profit of $6 million.')

In [37]:
displacy.render(doc9,style='ent',jupyter=True)

In [38]:
doc10=nlp(u'Over the last quarter Apple sold nearly 20 thousand iPods for profit of $6 million.'
        u'By contrast, Sony only sold 8 thousand Walkman music players')

In [39]:
for sent in doc10.sents:
    displacy.render(nlp(sent.text),style='ent',jupyter=True)

##### options
such as only highlight a certain entity

In [40]:
options={'ents':['PRODUCT']}

In [41]:
displacy.render(doc9,style='ent',jupyter=True,options=options)

In [42]:
#choose colors
colors={'ORG':'#aa9cfc'}
options={'ents':['PRODUCT','ORG'],'colors':colors}

In [43]:
displacy.render(doc9,style='ent',jupyter=True,options=options)

In [44]:
colors={'ORG':'radial-gradient(yellow,green)'}
options={'ents':['PRODUCT','ORG'],'colors':colors}
displacy.render(doc9,style='ent',jupyter=True,options=options)

In [45]:
colors={'ORG':'linear-gradient(90deg,#aa9cfc,#fc9ce7)'}
options={'ents':['PRODUCT','ORG'],'colors':colors}
displacy.render(doc9,style='ent',jupyter=True,options=options)

# sentence segmentation
set our own segmentation rules to break up docs into sentences based on our own rules

In [46]:
doc11=nlp(u'This is the first sentence. This is another sentence. This is the last sentence.')

In [47]:
for sent in doc11.sents:
    print(sent)

This is the first sentence.
This is another sentence.
This is the last sentence.


In [48]:
list(doc11.sents)

[This is the first sentence.,
 This is another sentence.,
 This is the last sentence.]

In [49]:
list(doc11.sents)[0]

This is the first sentence.

In [50]:
type(list(doc11.sents)[0])

spacy.tokens.span.Span

In [51]:
doc12=nlp(u' "Management is doing the right things; Leadership is doing the right things." --Peter Drucker')

In [52]:
doc12.text

' "Management is doing the right things; Leadership is doing the right things." --Peter Drucker'

In [53]:
for sent in doc12.sents:
    print(sent)
    print('\n')

 "Management is doing the right things; Leadership is doing the right things."


--Peter Drucker




#ADD A SEGMENTATION RULE
def set_customer_boundaries(doc):
    for token in doc:
        print(token.i)

set_customer_boundaries(doc12)

In [54]:
def set_customer_boundaries(doc):
    for token in doc[:-1]:
        if token.text==';':
            doc[token.i+1].is_sent_start==True
    return doc

In [55]:
nlp.add_pipe(set_customer_boundaries,before='parser')
nlp.pipe_names

['tagger', 'set_customer_boundaries', 'parser', 'ner']

In [56]:
doc12[:-1]

 "Management is doing the right things; Leadership is doing the right things." --Peter

In [57]:
doc13=nlp(u' "Management is doing the right things; Leadership is doing the right things." --Peter Drucker')

In [58]:
for sent in doc13.sents:
    print(sent)

 "Management is doing the right things; Leadership is doing the right things."
--Peter Drucker


In [None]:
#CHANGE SEGMENTATION RULES

In [59]:
nlp=spacy.load('en_core_web_sm') #reload the library

In [60]:
mystring=u'This is a sentence. This is another . \n\nThis is a \nthird sentence'

In [61]:
print(mystring)

This is a sentence. This is another . 

This is a 
third sentence


In [62]:
doc14=nlp(mystring)
for sent in doc14.sents:
    print(sent)

This is a sentence.
This is another . 


This is a 
third sentence


In [63]:
from spacy.pipeline import SentenceSegmenter

In [64]:
def split_on_newlines(doc):
    start=0
    seen_newline=False
    for word in doc:
        if seen_newline:
            yield doc[start:word.i]
            start=word.i
            seen_newline=False
        elif word.text.startwith('\n'):
            seen_newline=True
    yield doc[start:]

In [66]:
sbd=SentenceSegmenter(nlp.vocab,strategy=split_on_newlines)
nlp.add_pipe(sbd)

In [67]:
for sent in doc14.sents:
    print(sent)

This is a sentence.
This is another . 


This is a 
third sentence
