In [1]:
import spacy

In [2]:
nlp = spacy.load("en_core_web_sm")

In [3]:
doc = nlp(u"The quick brown fox jumped over the lazy dog's back.")

In [4]:
print(doc.text)

The quick brown fox jumped over the lazy dog's back.


In [11]:
# Jumpes is a VBD is a past tense verb
print(doc[4].text)
print(doc[4].pos_) # coars-grain tag
print(doc[4].tag_) # fine-grain tag

jumped
VERB
VBD


## Coarse-grained Part-of-speech Tags
Every token is assigned a POS Tag from the following list:


<table><tr><th>POS</th><th>DESCRIPTION</th><th>EXAMPLES</th></tr>
    
<tr><td>ADJ</td><td>adjective</td><td>*big, old, green, incomprehensible, first*</td></tr>
<tr><td>ADP</td><td>adposition</td><td>*in, to, during*</td></tr>
<tr><td>ADV</td><td>adverb</td><td>*very, tomorrow, down, where, there*</td></tr>
<tr><td>AUX</td><td>auxiliary</td><td>*is, has (done), will (do), should (do)*</td></tr>
<tr><td>CONJ</td><td>conjunction</td><td>*and, or, but*</td></tr>
<tr><td>CCONJ</td><td>coordinating conjunction</td><td>*and, or, but*</td></tr>
<tr><td>DET</td><td>determiner</td><td>*a, an, the*</td></tr>
<tr><td>INTJ</td><td>interjection</td><td>*psst, ouch, bravo, hello*</td></tr>
<tr><td>NOUN</td><td>noun</td><td>*girl, cat, tree, air, beauty*</td></tr>
<tr><td>NUM</td><td>numeral</td><td>*1, 2017, one, seventy-seven, IV, MMXIV*</td></tr>
<tr><td>PART</td><td>particle</td><td>*'s, not,*</td></tr>
<tr><td>PRON</td><td>pronoun</td><td>*I, you, he, she, myself, themselves, somebody*</td></tr>
<tr><td>PROPN</td><td>proper noun</td><td>*Mary, John, London, NATO, HBO*</td></tr>
<tr><td>PUNCT</td><td>punctuation</td><td>*., (, ), ?*</td></tr>
<tr><td>SCONJ</td><td>subordinating conjunction</td><td>*if, while, that*</td></tr>
<tr><td>SYM</td><td>symbol</td><td>*$, %, §, ©, +, −, ×, ÷, =, :), 😝*</td></tr>
<tr><td>VERB</td><td>verb</td><td>*run, runs, running, eat, ate, eating*</td></tr>
<tr><td>X</td><td>other</td><td>*sfpksdpsxmsa*</td></tr>
<tr><td>SPACE</td><td>space</td></tr>

___
## Fine-grained Part-of-speech Tags
Tokens are subsequently given a fine-grained tag as determined by morphology:
<table>
<tr><th>POS</th><th>Description</th><th>Fine-grained Tag</th><th>Description</th><th>Morphology</th></tr>
<tr><td>ADJ</td><td>adjective</td><td>AFX</td><td>affix</td><td>Hyph=yes</td></tr>
<tr><td>ADJ</td><td></td><td>JJ</td><td>adjective</td><td>Degree=pos</td></tr>
<tr><td>ADJ</td><td></td><td>JJR</td><td>adjective, comparative</td><td>Degree=comp</td></tr>
<tr><td>ADJ</td><td></td><td>JJS</td><td>adjective, superlative</td><td>Degree=sup</td></tr>
<tr><td>ADJ</td><td></td><td>PDT</td><td>predeterminer</td><td>AdjType=pdt PronType=prn</td></tr>
<tr><td>ADJ</td><td></td><td>PRP\$</td><td>pronoun, possessive</td><td>PronType=prs Poss=yes</td></tr>
<tr><td>ADJ</td><td></td><td>WDT</td><td>wh-determiner</td><td>PronType=int rel</td></tr>
<tr><td>ADJ</td><td></td><td>WP\$</td><td>wh-pronoun, possessive</td><td>Poss=yes PronType=int rel</td></tr>
<tr><td>ADP</td><td>adposition</td><td>IN</td><td>conjunction, subordinating or preposition</td><td></td></tr>
<tr><td>ADV</td><td>adverb</td><td>EX</td><td>existential there</td><td>AdvType=ex</td></tr>
<tr><td>ADV</td><td></td><td>RB</td><td>adverb</td><td>Degree=pos</td></tr>
<tr><td>ADV</td><td></td><td>RBR</td><td>adverb, comparative</td><td>Degree=comp</td></tr>
<tr><td>ADV</td><td></td><td>RBS</td><td>adverb, superlative</td><td>Degree=sup</td></tr>
<tr><td>ADV</td><td></td><td>WRB</td><td>wh-adverb</td><td>PronType=int rel</td></tr>
<tr><td>CONJ</td><td>conjunction</td><td>CC</td><td>conjunction, coordinating</td><td>ConjType=coor</td></tr>
<tr><td>DET</td><td>determiner</td><td>DT</td><td>determiner</td><td></td></tr>
<tr><td>INTJ</td><td>interjection</td><td>UH</td><td>interjection</td><td></td></tr>
<tr><td>NOUN</td><td>noun</td><td>NN</td><td>noun, singular or mass</td><td>Number=sing</td></tr>
<tr><td>NOUN</td><td></td><td>NNS</td><td>noun, plural</td><td>Number=plur</td></tr>
<tr><td>NOUN</td><td></td><td>WP</td><td>wh-pronoun, personal</td><td>PronType=int rel</td></tr>
<tr><td>NUM</td><td>numeral</td><td>CD</td><td>cardinal number</td><td>NumType=card</td></tr>
<tr><td>PART</td><td>particle</td><td>POS</td><td>possessive ending</td><td>Poss=yes</td></tr>
<tr><td>PART</td><td></td><td>RP</td><td>adverb, particle</td><td></td></tr>
<tr><td>PART</td><td></td><td>TO</td><td>infinitival to</td><td>PartType=inf VerbForm=inf</td></tr>
<tr><td>PRON</td><td>pronoun</td><td>PRP</td><td>pronoun, personal</td><td>PronType=prs</td></tr>
<tr><td>PROPN</td><td>proper noun</td><td>NNP</td><td>noun, proper singular</td><td>NounType=prop Number=sign</td></tr>
<tr><td>PROPN</td><td></td><td>NNPS</td><td>noun, proper plural</td><td>NounType=prop Number=plur</td></tr>
<tr><td>PUNCT</td><td>punctuation</td><td>-LRB-</td><td>left round bracket</td><td>PunctType=brck PunctSide=ini</td></tr>
<tr><td>PUNCT</td><td></td><td>-RRB-</td><td>right round bracket</td><td>PunctType=brck PunctSide=fin</td></tr>
<tr><td>PUNCT</td><td></td><td>,</td><td>punctuation mark, comma</td><td>PunctType=comm</td></tr>
<tr><td>PUNCT</td><td></td><td>:</td><td>punctuation mark, colon or ellipsis</td><td></td></tr>
<tr><td>PUNCT</td><td></td><td>.</td><td>punctuation mark, sentence closer</td><td>PunctType=peri</td></tr>
<tr><td>PUNCT</td><td></td><td>''</td><td>closing quotation mark</td><td>PunctType=quot PunctSide=fin</td></tr>
<tr><td>PUNCT</td><td></td><td>""</td><td>closing quotation mark</td><td>PunctType=quot PunctSide=fin</td></tr>
<tr><td>PUNCT</td><td></td><td>``</td><td>opening quotation mark</td><td>PunctType=quot PunctSide=ini</td></tr>
<tr><td>PUNCT</td><td></td><td>HYPH</td><td>punctuation mark, hyphen</td><td>PunctType=dash</td></tr>
<tr><td>PUNCT</td><td></td><td>LS</td><td>list item marker</td><td>NumType=ord</td></tr>
<tr><td>PUNCT</td><td></td><td>NFP</td><td>superfluous punctuation</td><td></td></tr>
<tr><td>SYM</td><td>symbol</td><td>#</td><td>symbol, number sign</td><td>SymType=numbersign</td></tr>
<tr><td>SYM</td><td></td><td>\$</td><td>symbol, currency</td><td>SymType=currency</td></tr>
<tr><td>SYM</td><td></td><td>SYM</td><td>symbol</td><td></td></tr>
<tr><td>VERB</td><td>verb</td><td>BES</td><td>auxiliary "be"</td><td></td></tr>
<tr><td>VERB</td><td></td><td>HVS</td><td>forms of "have"</td><td></td></tr>
<tr><td>VERB</td><td></td><td>MD</td><td>verb, modal auxiliary</td><td>VerbType=mod</td></tr>
<tr><td>VERB</td><td></td><td>VB</td><td>verb, base form</td><td>VerbForm=inf</td></tr>
<tr><td>VERB</td><td></td><td>VBD</td><td>verb, past tense</td><td>VerbForm=fin Tense=past</td></tr>
<tr><td>VERB</td><td></td><td>VBG</td><td>verb, gerund or present participle</td><td>VerbForm=part Tense=pres Aspect=prog</td></tr>
<tr><td>VERB</td><td></td><td>VBN</td><td>verb, past participle</td><td>VerbForm=part Tense=past Aspect=perf</td></tr>
<tr><td>VERB</td><td></td><td>VBP</td><td>verb, non-3rd person singular present</td><td>VerbForm=fin Tense=pres</td></tr>
<tr><td>VERB</td><td></td><td>VBZ</td><td>verb, 3rd person singular present</td><td>VerbForm=fin Tense=pres Number=sing Person=3</td></tr>
<tr><td>X</td><td>other</td><td>ADD</td><td>email</td><td></td></tr>
<tr><td>X</td><td></td><td>FW</td><td>foreign word</td><td>Foreign=yes</td></tr>
<tr><td>X</td><td></td><td>GW</td><td>additional word in multi-word expression</td><td></td></tr>
<tr><td>X</td><td></td><td>XX</td><td>unknown</td><td></td></tr>
<tr><td>SPACE</td><td>space</td><td>_SP</td><td>space</td><td></td></tr>
<tr><td></td><td></td><td>NIL</td><td>missing tag</td><td></td></tr>
</table>

In [15]:
for token in doc:
    print(f"{token.text:{10}} {token.pos_:{10}} {token.tag_:{10}} {spacy.explain(token.tag_)}")

The        DET        DT         determiner
quick      ADJ        JJ         adjective
brown      ADJ        JJ         adjective
fox        NOUN       NN         noun, singular or mass
jumped     VERB       VBD        verb, past tense
over       ADP        IN         conjunction, subordinating or preposition
the        DET        DT         determiner
lazy       ADJ        JJ         adjective
dog        NOUN       NN         noun, singular or mass
's         PART       POS        possessive ending
back       NOUN       NN         noun, singular or mass
.          PUNCT      .          punctuation mark, sentence closer


In [16]:
doc = nlp(u"I read books on NLP.")

In [17]:
word = doc[1]
word.text

'read'

In [18]:
token = word
print(f"{token.text:{10}} {token.pos_:{10}} {token.tag_:{10}} {spacy.explain(token.tag_)}")

read       VERB       VBP        verb, non-3rd person singular present


In [19]:
doc = nlp(u"I read a book on NLP.")
word = doc[1]
word.text

'read'

In [20]:
token = word
print(f"{token.text:{10}} {token.pos_:{10}} {token.tag_:{10}} {spacy.explain(token.tag_)}")

read       VERB       VBD        verb, past tense


In [21]:
doc = nlp(u"The quick brown fox jumped over the lazy dog's back.")

In [27]:
POS_counts = doc.count_by(spacy.attrs.POS)

In [24]:
# The key values are the numerical identifier of the part of speech
POS_counts

{96: 1, 83: 3, 99: 1, 84: 1, 89: 2, 91: 3, 93: 1}

In [25]:
doc.vocab[83].text

'ADJ'

In [29]:
for k,v in sorted(POS_counts.items()):
    print(f"{k}: {doc.vocab[k].text:{5}} {v}")

83: ADJ   3
84: ADP   1
89: DET   2
91: NOUN  3
93: PART  1
96: PUNCT 1
99: VERB  1


In [31]:
# Fine-grain part of speech
TAG_counts = doc.count_by(spacy.attrs.TAG)
for k,v in sorted(TAG_counts.items()):
    print(f"{k}: {doc.vocab[k].text:{5}} {v}")

74: POS   1
1292078113972184607: IN    1
10554686591937588953: JJ    3
12646065887601541794: .     1
15267657372422890137: DT    2
15308085513773655218: NN    3
17109001835818727656: VBD   1


In [32]:
len(doc.vocab)

57863

In [33]:
# Syntactic dependencies
DEP_counts = doc.count_by(spacy.attrs.DEP)
for k,v in sorted(DEP_counts.items()):
    print(f"{k}: {doc.vocab[k].text:{5}} {v}")

399: amod  3
412: det   2
426: nsubj 1
436: pobj  1
437: poss  1
440: prep  1
442: punct 1
8110129090154140942: case  1
8206900633647566924: ROOT  1


# Visualizing Parts of Speech

In [34]:
import spacy

In [35]:
nlp = spacy.load('en_core_web_sm')

In [36]:
doc = nlp(u"The quick brown fox jumped over the lazy dog.")

In [37]:
from spacy import displacy

In [39]:
# Show syntactic dependencies
displacy.render(doc,style='dep',jupyter=True)

In [40]:
options = {'distance':110, 
           'compact': 'True', 'color': 'yellow',
           'bg':'#09a3d5',
           'font':'Times'}

In [42]:
# Show syntactic dependencies
displacy.render(doc,style='dep',jupyter=True, options=options)

In [43]:
doc2 = nlp(u"This is a sentence. This is another sentence. This is another sentence, possibly longer than the other.")

In [44]:
spans = list(doc2.sents)

In [46]:
# Display outside of jupyter
# displacy.serve(spans,style='dep',options = {'distance':110})
# 127.0.0.1:5000

# Named Entity Recognition

In [50]:
import spacy

In [51]:
nlp = spacy.load('en_core_web_sm')

In [53]:
def show_ents(doc):
    if doc.ents:
        for ent in doc.ents:
            print(ent.text + ' -'+ent.label_ + ' - ' + str(spacy.explain(ent.label_)))
    else:
        print('No entities found')

In [54]:
doc = nlp(u"Hi how are you?")

In [55]:
show_ents(doc)

No entities found


In [56]:
doc = nlp(u"May I go to Washington, DC next May to see the Washington Monument?")

In [57]:
show_ents(doc)

Washington, DC -GPE - Countries, cities, states
next May -DATE - Absolute or relative dates or periods
the Washington Monument -ORG - Companies, agencies, institutions, etc.


## Entity annotations
`Doc.ents` are token spans with their own set of annotations.
<table>
<tr><td>`ent.text`</td><td>The original entity text</td></tr>
<tr><td>`ent.label`</td><td>The entity type's hash value</td></tr>
<tr><td>`ent.label_`</td><td>The entity type's string description</td></tr>
<tr><td>`ent.start`</td><td>The token span's *start* index position in the Doc</td></tr>
<tr><td>`ent.end`</td><td>The token span's *stop* index position in the Doc</td></tr>
<tr><td>`ent.start_char`</td><td>The entity text's *start* index position in the Doc</td></tr>
<tr><td>`ent.end_char`</td><td>The entity text's *stop* index position in the Doc</td></tr>
</table>




## NER Tags
Tags are accessible through the `.label_` property of an entity.
<table>
<tr><th>TYPE</th><th>DESCRIPTION</th><th>EXAMPLE</th></tr>
<tr><td>`PERSON`</td><td>People, including fictional.</td><td>*Fred Flintstone*</td></tr>
<tr><td>`NORP`</td><td>Nationalities or religious or political groups.</td><td>*The Republican Party*</td></tr>
<tr><td>`FAC`</td><td>Buildings, airports, highways, bridges, etc.</td><td>*Logan International Airport, The Golden Gate*</td></tr>
<tr><td>`ORG`</td><td>Companies, agencies, institutions, etc.</td><td>*Microsoft, FBI, MIT*</td></tr>
<tr><td>`GPE`</td><td>Countries, cities, states.</td><td>*France, UAR, Chicago, Idaho*</td></tr>
<tr><td>`LOC`</td><td>Non-GPE locations, mountain ranges, bodies of water.</td><td>*Europe, Nile River, Midwest*</td></tr>
<tr><td>`PRODUCT`</td><td>Objects, vehicles, foods, etc. (Not services.)</td><td>*Formula 1*</td></tr>
<tr><td>`EVENT`</td><td>Named hurricanes, battles, wars, sports events, etc.</td><td>*Olympic Games*</td></tr>
<tr><td>`WORK_OF_ART`</td><td>Titles of books, songs, etc.</td><td>*The Mona Lisa*</td></tr>
<tr><td>`LAW`</td><td>Named documents made into laws.</td><td>*Roe v. Wade*</td></tr>
<tr><td>`LANGUAGE`</td><td>Any named language.</td><td>*English*</td></tr>
<tr><td>`DATE`</td><td>Absolute or relative dates or periods.</td><td>*20 July 1969*</td></tr>
<tr><td>`TIME`</td><td>Times smaller than a day.</td><td>*Four hours*</td></tr>
<tr><td>`PERCENT`</td><td>Percentage, including "%".</td><td>*Eighty percent*</td></tr>
<tr><td>`MONEY`</td><td>Monetary values, including unit.</td><td>*Twenty Cents*</td></tr>
<tr><td>`QUANTITY`</td><td>Measurements, as of weight or distance.</td><td>*Several kilometers, 55kg*</td></tr>
<tr><td>`ORDINAL`</td><td>"first", "second", etc.</td><td>*9th, Ninth*</td></tr>
<tr><td>`CARDINAL`</td><td>Numerals that do not fall under another type.</td><td>*2, Two, Fifty-two*</td></tr>
</table>

In [58]:
doc = nlp(u"Can I please have 500 dollars of Microsoft stock?")
show_ents(doc)

500 dollars -MONEY - Monetary values, including unit
Microsoft -ORG - Companies, agencies, institutions, etc.


## Add a named entity to a span

In [60]:
# Note it does not give Tesla as an entity, we want to add it to the ORG entity
doc = nlp(u"Tesla to build a U.K. factory for $6 million")
show_ents(doc)

U.K. -GPE - Countries, cities, states
$6 million -MONEY - Monetary values, including unit


In [61]:
from spacy.tokens import Span

In [62]:
# This is equal to the hash value of the ORG entity label
ORG = doc.vocab.strings[u"ORG"]
ORG

381

In [63]:
new_ent = Span(doc,0,1,label=ORG)

In [64]:
# Could use .append(new_ent) instead
doc.ents = list(doc.ents) + [new_ent]

In [65]:
show_ents(doc)

Tesla -ORG - Companies, agencies, institutions, etc.
U.K. -GPE - Countries, cities, states
$6 million -MONEY - Monetary values, including unit


# Named Entity Recognition - Part 2
## Addiing multiple entities to a span

In [72]:
doc = nlp(u"Our company created a brand new vacuum cleaner."
          u"This new vacuum-cleaner is the best in show.")

In [73]:
show_ents(doc)

No entities found


In [74]:
from spacy.matcher import PhraseMatcher

In [75]:
matcher = PhraseMatcher(nlp.vocab)
phrase_list = ['vacuum cleaner', 'vacuum-cleaner']
phrase_patterns = [nlp(text) for text in phrase_list]
# Add phrases as a matcher titled 'newproduct'
matcher.add('newproduct', None, *phrase_patterns)

In [76]:
found_matches = matcher(doc)
found_matches

[(2689272359382549672, 6, 8), (2689272359382549672, 11, 14)]

In [77]:
from spacy.tokens import Span

In [78]:
# Look at named-entity recognition list to determine what tags are relevant
# Pass in the TYPE
PROD = doc.vocab.strings[u'PRODUCT']

In [79]:
found_matches

[(2689272359382549672, 6, 8), (2689272359382549672, 11, 14)]

In [80]:
# label each of the instances of "vacuum cleaner" as PROD
new_ents = [Span(doc,match[1],match[2],label=PROD) for match in found_matches]

In [83]:
doc.ents = list(doc.ents) + new_ents

In [84]:
show_ents(doc)

vacuum cleaner -PRODUCT - Objects, vehicles, foods, etc. (not services)
vacuum-cleaner -PRODUCT - Objects, vehicles, foods, etc. (not services)


In [85]:
doc = nlp(u"Originally I paid $29.95 for this car toy, but now it is marked down by 10 dollars.")

In [88]:
# How many times was money or any type of named-entity mentioned
len([ent for ent in doc.ents if ent.label_ == 'MONEY'])

2

# Visualizing Named Entity Recognition

In [89]:
import spacy
nlp = spacy.load('en_core_web_sm')
from spacy import displacy

In [90]:
doc = nlp(u"Over the last quarter Apple sold nearly 20 thousand iPods for a profit of $6 million.")

In [91]:
displacy.render(doc,style='ent',jupyter=True)

In [92]:
doc = nlp(u"Over the last quarter Apple sold nearly 20 thousand iPods for a profit of $6 million."
         u"By contrast, Sony only sold 8 thousand Walkman music players.")

In [94]:
displacy.render(doc,style='ent',jupyter=True)

In [96]:
for sent in doc.sents:
    displacy.render(nlp(sent.text), style='ent', jupyter=True)

In [97]:
options = {'ents': ['PRODUCT']}

In [98]:
displacy.render(doc,style='ent',jupyter=True,options=options)

In [99]:
options['ents'].append('ORG')
options

{'ents': ['PRODUCT', 'ORG']}

In [100]:
displacy.render(doc,style='ent',jupyter=True,options=options)

In [104]:
colors = {'ORG':'red'}
options.update({'colors':colors})
options

{'ents': ['PRODUCT', 'ORG'], 'colors': {'ORG': 'red'}}

In [105]:
displacy.render(doc,style='ent',jupyter=True,options=options)

In [106]:
colors = {'ORG':'radial-gradient(yellow,green)'}
options.update({'colors':colors})
options

{'ents': ['PRODUCT', 'ORG'],
 'colors': {'ORG': 'radial-gradient(yellow,green)'}}

In [107]:
displacy.render(doc,style='ent',jupyter=True,options=options)

In [113]:
colors = {'ORG':'linear-gradient(45deg,orange,red)'}
options.update({'colors':colors})
options

{'ents': ['PRODUCT', 'ORG'],
 'colors': {'ORG': 'linear-gradient(45deg,orange,red)'}}

In [114]:
displacy.render(doc,style='ent',jupyter=True,options=options)

# Sentence Segmentation

In [115]:
import spacy
nlp = spacy.load('en_core_web_sm')

In [116]:
doc = nlp(u"This is the first sentence. This is another sentence. This is the last sentence.")

In [117]:
for sent in doc.sents:
    print(sent)

This is the first sentence.
This is another sentence.
This is the last sentence.


In [119]:
type(list(doc.sents)[0])

spacy.tokens.span.Span

In [120]:
doc = nlp(u'"Management is doing the right things; leadership is doing the right things." - Peter Drucker')

In [121]:
doc.text

'"Management is doing the right things; leadership is doing the right things." - Peter Drucker'

In [122]:
for sent in doc.sents:
    print(sent)
    print('\n')

"Management is doing the right things; leadership is doing the right things."


- Peter Drucker




In [146]:
# Add a segmentation rule

def set_custom_boundaries(doc):
    for token in doc[:-1]:
        if token.text == ';':
            doc[token.i+1].is_sent_start = True
    return doc
    

In [147]:
nlp.add_pipe(set_custom_boundaries, before='parser')

nlp.pipe_names

ValueError: [E007] 'set_custom_boundaries' already exists in pipeline. Existing names: ['tagger', 'set_custom_boundaries', 'parser', 'ner']

In [148]:
doc[:-1]

"Management is doing the right things; leadership is doing the right things." - Peter

In [149]:
doc4 = nlp(u'"Management is doing the right things; leadership is doing the right things." - Peter Drucker')

In [150]:
for sent in doc4.sents:
    print(sent)

"Management is doing the right things;
leadership is doing the right things.
" - Peter Drucker


In [152]:
# Change segmentation rules

In [153]:
nlp = spacy.load('en_core_web_sm')

In [157]:
my_string = u"This is a sentence. This is another sentence. \n\nThis is a \nthird sentence."

In [158]:
print(my_string)

This is a sentence. This is another sentence. 

This is a 
third sentence.


In [159]:
doc = nlp(my_string)

In [160]:
for sentence in doc.sents:
    print(sentence)

This is a sentence.
This is another sentence. 


This is a 
third sentence.


In [161]:
from spacy.pipeline import SentenceSegmenter

In [162]:
def split_on_newlines(doc):
    start = 0
    seen_newline = False
    
    for word in doc:
        if seen_newline:
            yield doc[start:word.i]
            start = word.i
            seen_newline = False
        elif word.text.startswith('\n'):
            seen_newline = True
    
    yield doc[start:]

In [163]:
sbd = SentenceSegmenter(nlp.vocab, strategy=split_on_newlines)

In [164]:
nlp.add_pipe(sbd)

In [165]:
doc = nlp(my_string)

In [166]:
for sentence in doc.sents:
    print(sentence)

This is a sentence. This is another sentence. 


This is a 

third sentence.


# Assessment

In [2]:
# Standard imports
import spacy
nlp = spacy.load('en_core_web_sm')
from spacy import displacy

**1. Create a Doc object from the file `peterrabbit.txt`**<br>
> HINT: Use `with open('../TextFiles/peterrabbit.txt') as f:`

In [3]:
with open('peterrabbit.txt') as f:
    doc = nlp(f.read())

**2. For every token in the third sentence, print the token text, the POS tag, the fine-grained TAG tag, and the description of the fine-grained tag.**

In [4]:
for token in list(doc.sents)[2]:
    print(f'{token.text:{15}} {token.pos_:{5}} {token.tag_:{10}} {spacy.explain(token.tag_)}')

They            PRON  PRP        pronoun, personal
lived           VERB  VBD        verb, past tense
with            ADP   IN         conjunction, subordinating or preposition
their           ADJ   PRP$       pronoun, possessive
Mother          PROPN NNP        noun, proper singular
in              ADP   IN         conjunction, subordinating or preposition
a               DET   DT         determiner
sand            NOUN  NN         noun, singular or mass
-               PUNCT HYPH       punctuation mark, hyphen
bank            NOUN  NN         noun, singular or mass
,               PUNCT ,          punctuation mark, comma
underneath      ADP   IN         conjunction, subordinating or preposition
the             DET   DT         determiner
root            NOUN  NN         noun, singular or mass
of              ADP   IN         conjunction, subordinating or preposition
a               DET   DT         determiner

               SPACE            None
very            ADV   RB         adver

**3. Provide a frequency list of POS tags from the entire document**

In [5]:
POS_counts = doc.count_by(spacy.attrs.POS)

for k,v in sorted(POS_counts.items()):
    print(f'{k}. {doc.vocab[k].text:{5}}: {v}')

83. ADJ  : 83
84. ADP  : 127
85. ADV  : 75
88. CCONJ: 61
89. DET  : 90
91. NOUN : 176
92. NUM  : 8
93. PART : 36
94. PRON : 72
95. PROPN: 75
96. PUNCT: 174
99. VERB : 182
102. SPACE: 99


**4. CHALLENGE: What percentage of tokens are nouns?**<br>
HINT: the attribute ID for 'NOUN' is 91

In [12]:
percent = 100*POS_counts[91]/len(doc)

print(f'{POS_counts[91]}/{len(doc)} = {round(percent,2)}%')

176/1258 = 13.99%


**5. Display the Dependency Parse for the third sentence**

In [13]:
from spacy import displacy

In [14]:
sent_three = list(doc.sents)[2]
displacy.render(sent_three, style='dep', jupyter=True, options={'distance': 110})

**6. Show the first two named entities from Beatrix Potter's *The Tale of Peter Rabbit***

In [16]:
for ent in doc.ents[:2]:
    print(ent.text+' - '+ent.label_+' - '+str(spacy.explain(ent.label_)))

The Tale of Peter Rabbit - WORK_OF_ART - Titles of books, songs, etc.
Beatrix Potter - PERSON - People, including fictional


In [17]:
len(list(doc.sents))

56

**8. CHALLENGE: How many sentences contain named entities?**

In [20]:
sentences = list(doc.sents)
entities = [doc for doc in sentences if doc.ents]
len(entities)

51

In [21]:
list_of_sents = [nlp(sent.text) for sent in doc.sents]
list_of_ners = [doc for doc in list_of_sents if doc.ents]
len(list_of_ners)

49

**9. CHALLENGE: Display the named entity visualization for `list_of_sents[0]` from the previous problem**

In [23]:
displacy.render(list_of_sents[0],style='ent',jupyter=True)