# Spacy & NLTK for NLP
* Spacy is generally faster than NLTK because it defaults tothe most efficient method whereas NLTK allows the user to choose algorithms

* NLTK is more preferred for certain tasks, like sentiment analysis

In [1]:
import spacy

In [2]:
# This is where the model is loaded
nlp = spacy.load('en_core_web_sm')

In [3]:
doc = nlp(u'Tesla is looking at buying U.S. startup for $6 million')

In [4]:
for token in doc:
    print(token.text, token.pos, token.pos_, token.dep_)

Tesla 95 PROPN nsubj
is 99 VERB aux
looking 99 VERB ROOT
at 84 ADP prep
buying 99 VERB pcomp
U.S. 95 PROPN compound
startup 91 NOUN dobj
for 84 ADP prep
$ 98 SYM quantmod
6 92 NUM compound
million 92 NUM pobj


In [5]:
# ner is short of "named entity recognizer"
nlp.pipeline

[('tagger', <spacy.pipeline.Tagger at 0x11c11d0d0>),
 ('parser', <spacy.pipeline.DependencyParser at 0x11c281ad0>),
 ('ner', <spacy.pipeline.EntityRecognizer at 0x11c2af0b0>)]

In [6]:
nlp.pipe_names

['tagger', 'parser', 'ner']

# Tokenization

In [7]:
doc2 = nlp(u"Tesla isn't looking into startups anymore.")

In [8]:
for token in doc2:
    print(token.text, token.pos, token.pos_, token.dep_)

Tesla 95 PROPN nsubj
is 99 VERB aux
n't 85 ADV neg
looking 99 VERB ROOT
into 84 ADP prep
startups 91 NOUN pobj
anymore 85 ADV advmod
. 96 PUNCT punct


In [9]:
doc2[0].pos_

'PROPN'

In [10]:
doc2[0].dep_

'nsubj'

In [11]:
doc3 = nlp(u'Although commmonly attributed to John Lennon from his song "Beautiful Boy", \
the phrase "Life is what happens to us while we are making other plans" was written by \
cartoonist Allen Saunders and published in Reader\'s Digest in 1957, when Lennon was 17.')

In [12]:
life_quote = doc3[16:30]
display(life_quote)
display(type(life_quote))
display(type(doc3))

"Life is what happens to us while we are making other plans"

spacy.tokens.span.Span

spacy.tokens.doc.Doc

In [13]:
doc4 = nlp(u"This is the first sentence. This is another sentence. This is the last sentence.")

In [14]:
for sentence in doc4.sents:
    print(sentence)

This is the first sentence.
This is another sentence.
This is the last sentence.


In [15]:
doc4[6].is_sent_start

True

In [16]:
# returns "none" because it's not
doc4[8].is_sent_start

<img src="Tokenization.png">

In [17]:
import spacy
nlp = spacy.load('en_core_web_sm')

In [18]:
my_string = '"We\'re moving to L.A!"'
display(my_string)
print(my_string)

'"We\'re moving to L.A!"'

"We're moving to L.A!"


In [19]:
doc = nlp(my_string)

In [20]:
for token in doc:
    print(token.text)

"
We
're
moving
to
L.A
!
"


In [21]:
doc2 = nlp(u"We're here to help! Send snail-mal, email support@oursite.com ot visit us as http://oursite.com")

In [22]:
for t in doc2:
    print(t)

We
're
here
to
help
!
Send
snail
-
mal
,
email
support@oursite.com
ot
visit
us
as
http://oursite.com


In [23]:
doc3 = nlp(u"A 5km NYC cab ride costs $10.30")

In [24]:
for t in doc3:
    print(t)

A
5
km
NYC
cab
ride
costs
$
10.30


In [25]:
doc4 = nlp(u"Let's visit St. Louis in the U.S. next year.")

In [26]:
for t in doc4:
    print(t)

Let
's
visit
St.
Louis
in
the
U.S.
next
year
.


In [27]:
# Number of tokens in a doc
len(doc4)

11

In [28]:
doc4.vocab

<spacy.vocab.Vocab at 0x11cbe9560>

In [29]:
len(doc4.vocab)

57853

In [30]:
doc5 = nlp(u"It is better to give than to receive.")

In [31]:
doc5[0]

It

In [32]:
doc5[2:5]

better to give

In [33]:
# Won't work, the index is not just a string
doc5[0] = 'test'

TypeError: 'spacy.tokens.doc.Doc' object does not support item assignment

In [34]:
doc8 = nlp(u"Apple to build a Hong Kong factory for $6 million")

In [35]:
for t in doc8:
    print(t.text, end=' | ')

Apple | to | build | a | Hong | Kong | factory | for | $ | 6 | million | 

In [36]:
for entity in doc8.ents:
    print(entity)
    print(entity.label_)
    print(str(spacy.explain(entity.label_)))
    print('\n')

Apple
ORG
Companies, agencies, institutions, etc.


Hong Kong
GPE
Countries, cities, states


$6 million
MONEY
Monetary values, including unit




In [37]:
doc9 = nlp(u"Autonomous cars shift insurance liability toward manufacturers.")

In [38]:
for chunk in doc9.noun_chunks:
    print(chunk)

Autonomous cars
insurance liability
manufacturers


In [39]:
from spacy import displacy

In [40]:
doc = nlp(u"Apple is going to build a U.K. factory for $6 million.")

In [41]:
displacy.render(doc,style='dep',jupyter=True,options={'distance':110})

In [42]:
doc = nlp(u"Over the last quarter, Apple sold nearly 20 thousand iPods for a profit of $6 million.")

In [43]:
displacy.render(doc,style='ent',jupyter=True)

In [46]:
doc = nlp(u"This is a sentence.")
displacy.serve(doc,style='dep')
# http://127.0.0.1:5000


[93m    Serving on port 5000...[0m
    Using the 'dep' visualizer



127.0.0.1 - - [22/Oct/2019 06:02:30] "GET / HTTP/1.1" 200 3057
127.0.0.1 - - [22/Oct/2019 06:02:31] "GET / HTTP/1.1" 200 3057



    Shutting down server on port 5000.



# Stemming
* Not included in Spacy, but is a common topic in NLP
> * Will use NLTK

In [44]:
# import nltk

In [45]:
from nltk.stem.porter import PorterStemmer

In [46]:
p_stemmer = PorterStemmer()

In [47]:
words = ['run', 'runner', 'ran', 'runs', 'easily', 'fairly', 'fairness']

In [48]:
for word in words:
    print(word + '---->' + p_stemmer.stem(word))

run---->run
runner---->runner
ran---->ran
runs---->run
easily---->easili
fairly---->fairli
fairness---->fair


In [49]:
from nltk.stem.snowball import SnowballStemmer

In [50]:
s_stemmer = SnowballStemmer(language='english')

In [51]:
for word in words:
    print(word + '---->' + s_stemmer.stem(word))

run---->run
runner---->runner
ran---->ran
runs---->run
easily---->easili
fairly---->fair
fairness---->fair


In [52]:
words = ['generous', 'generation', 'generously', 'generate']

In [53]:
for word in words:
    print(word + '--->' + s_stemmer.stem(word))

generous--->generous
generation--->generat
generously--->generous
generate--->generat


# Lemmatization
* Reduce words to their true roots

In [54]:
import spacy

In [55]:
nlp = spacy.load('en_core_web_sm')

In [56]:
doc1 = nlp(u"I am a runner running in a race because I love to run since I ran today")

In [57]:
for token in doc1:
    print(token.text, '\t', token.pos_, '\t', token.lemma, '\t', token.lemma_)

I 	 PRON 	 561228191312463089 	 -PRON-
am 	 VERB 	 10382539506755952630 	 be
a 	 DET 	 11901859001352538922 	 a
runner 	 NOUN 	 12640964157389618806 	 runner
running 	 VERB 	 12767647472892411841 	 run
in 	 ADP 	 3002984154512732771 	 in
a 	 DET 	 11901859001352538922 	 a
race 	 NOUN 	 8048469955494714898 	 race
because 	 ADP 	 16950148841647037698 	 because
I 	 PRON 	 561228191312463089 	 -PRON-
love 	 VERB 	 3702023516439754181 	 love
to 	 PART 	 3791531372978436496 	 to
run 	 VERB 	 12767647472892411841 	 run
since 	 ADP 	 10066841407251338481 	 since
I 	 PRON 	 561228191312463089 	 -PRON-
ran 	 VERB 	 12767647472892411841 	 run
today 	 NOUN 	 11042482332948150395 	 today


In [58]:
def show_lemmas(text):
    for token in text:
        print(f'{token.text:{12}} {token.pos_:{6}} {token.lemma:<{22}} {token.lemma_}')

In [59]:
doc2 = nlp(u"I saw ten mice today!")

In [60]:
show_lemmas(doc2)

I            PRON   561228191312463089     -PRON-
saw          VERB   11925638236994514241   see
ten          NUM    7970704286052693043    ten
mice         NOUN   1384165645700560590    mouse
today        NOUN   11042482332948150395   today
!            PUNCT  17494803046312582752   !


# Stop words

In [61]:
import spacy

In [62]:
nlp = spacy.load('en_core_web_sm')

In [63]:
print(nlp.Defaults.stop_words)

{'your', 'show', 'therein', 'twenty', 'do', 'get', 'such', 'you', 'herself', 'might', 'afterwards', 'side', 'his', 'are', 'becomes', 'another', 'bottom', 'meanwhile', 'everyone', 'full', 'themselves', 'done', 'least', 'seem', 'namely', 'their', 'hundred', 'an', 'others', 'twelve', 'off', 'on', 'eight', 'other', 'may', 'can', 'herein', 'with', 'none', 'move', 'more', 'say', 'often', 'she', 'these', 'thus', 'was', 'whereas', 'whenever', 'empty', 'whoever', 'above', 'always', 'be', 'beyond', 'under', 'neither', 'put', 'nor', 'towards', 'will', 'besides', 'fifteen', 'regarding', 'eleven', 'perhaps', 'nevertheless', 'yourselves', 'now', 'they', 'where', 'anything', 'used', 'from', 'both', 'nothing', 'it', 'still', 'we', 'somehow', 'unless', 'please', 'forty', 'name', 'last', 'thereupon', 'or', 'third', 'made', 'until', 'would', 'being', 'seeming', 'front', 'during', 'about', 'latterly', 'some', 'sometimes', 'very', 'through', 'anyone', 'fifty', 'make', 'whole', 'whether', 'whatever', 'after

In [64]:
len(nlp.Defaults.stop_words)

305

In [65]:
nlp.vocab['is'].is_stop

True

In [66]:
nlp.vocab['mystery'].is_stop

False

### Add a stop word

In [67]:
nlp.Defaults.stop_words.add('btw')

In [68]:
nlp.vocab['btw'].is_stop = True

In [69]:
len(nlp.Defaults.stop_words)

306

In [70]:
nlp.vocab['btw'].is_stop

True

In [71]:
nlp.Defaults.stop_words.remove('beyond')
nlp.vocab['beyond'].is_stop = False

In [72]:
nlp.vocab['beyond'].is_stop

False

# Vocabulary and Matching

In [73]:
import spacy

In [74]:
nlp = spacy.load('en_core_web_sm')

In [75]:
from spacy.matcher import Matcher

In [76]:
matcher = Matcher(nlp.vocab)

In [77]:
# Detect solarpower as
# Solarpower
# Solar-power
# Solar power

# Solarpower
pattern1 = [{'LOWER': 'solarpower'}]
# Solar-power
pattern2 = [{'LOWER': 'solar'}, {'IS_PUNCT': True}, {'LOWER': 'power'}]
# Solar power
pattern3 = [{'LOWER': 'solar'}, {'LOWER': 'power'}]

In [78]:
matcher.add('SolarPower', None, pattern1, pattern2, pattern3)

In [79]:
doc = nlp(u"The Solar Power industry continues to grow as solarpower increases. Solar-power is amazing.")

In [80]:
found_matches = matcher(doc)

In [81]:
print(found_matches)

[(8656102463236116519, 1, 3), (8656102463236116519, 8, 9), (8656102463236116519, 11, 14)]


In [82]:
for match_id, start, end in found_matches:
    string_id = nlp.vocab.strings[match_id]  # get string representation
    span = doc[start:end]                    # get the matched span
    print(match_id, string_id, start, end, span.text)

8656102463236116519 SolarPower 1 3 Solar Power
8656102463236116519 SolarPower 8 9 solarpower
8656102463236116519 SolarPower 11 14 Solar-power


In [83]:
matcher.remove('SolarPower')

In [84]:
# solarpower
pattern1 = [{'LOWER':'solarpower'}]
# solar*--_.etc_power
pattern2 = [{'LOWER':'solar'}, {'IS_PUNCT':True, 'OP':'*'}, {'LOWER':'power'}]

In [85]:
matcher.add('SolarPower', None, pattern1, pattern2)

In [86]:
doc2 = nlp(u"Solar--power is solarpower yay!")

In [87]:
found_matches = matcher(doc2)

In [88]:
for match_id, start, end in found_matches:
    string_id = nlp.vocab.strings[match_id]  # get string representation
    span = doc[start:end]                    # get the matched span
    print(match_id, string_id, start, end, span.text)

8656102463236116519 SolarPower 0 3 The Solar Power
8656102463236116519 SolarPower 4 5 continues


### Matching on a phrase list 

In [89]:
import spacy
nlp = spacy.load('en_core_web_sm')
from spacy.matcher import PhraseMatcher

In [90]:
matcher = PhraseMatcher(nlp.vocab)

In [91]:
# had to save file as utf-8
with open('reaganomics.txt') as f:
    doc3 = nlp(f.read())

In [92]:
phrase_list = ['voodoo economics', 'supply-side economics', 'trickle-down economics', 'free-market economics']

In [93]:
phrase_patterns = [nlp(text) for text in phrase_list]

In [94]:
matcher.add('EconMatcher', None, *phrase_patterns)

In [95]:
found_matches = matcher(doc3)

In [96]:
found_matches

[(3680293220734633682, 41, 45),
 (3680293220734633682, 49, 53),
 (3680293220734633682, 54, 56),
 (3680293220734633682, 61, 65),
 (3680293220734633682, 673, 677),
 (3680293220734633682, 2984, 2988)]

In [97]:
for match_id, start, end in found_matches:
    string_id = nlp.vocab.strings[match_id]  # get string representation
    span = doc3[start:end]                    # get the matched span
    print(match_id, string_id, start, end, span.text)

3680293220734633682 EconMatcher 41 45 supply-side economics
3680293220734633682 EconMatcher 49 53 trickle-down economics
3680293220734633682 EconMatcher 54 56 voodoo economics
3680293220734633682 EconMatcher 61 65 free-market economics
3680293220734633682 EconMatcher 673 677 supply-side economics
3680293220734633682 EconMatcher 2984 2988 trickle-down economics


In [98]:
for match_id, start, end in found_matches:
    string_id = nlp.vocab.strings[match_id]  # get string representation
    span = doc3[start-5:end+10]                    # get the matched span
    print(match_id, string_id, start, end, span.text)

3680293220734633682 EconMatcher 41 45 policies are commonly associated with supply-side economics, referred to as trickle-down economics or voodoo
3680293220734633682 EconMatcher 49 53 economics, referred to as trickle-down economics or voodoo economics by political opponents, and free-
3680293220734633682 EconMatcher 54 56 trickle-down economics or voodoo economics by political opponents, and free-market economics by
3680293220734633682 EconMatcher 61 65 by political opponents, and free-market economics by political advocates.

The four pillars of Reagan
3680293220734633682 EconMatcher 673 677 attracted a following from the supply-side economics movement, which formed in opposition to Keynesian demand-
3680293220734633682 EconMatcher 2984 2988 became widely known as "trickle-down economics", due to the significant cuts in the upper


# Assessment

In [99]:
import spacy
nlp = spacy.load('en_core_web_sm')

**1. Create a Doc object from the file `owlcreek.txt`**<br>
> HINT: Use `with open('../TextFiles/owlcreek.txt') as f:`

In [100]:
with open('owlcreek.txt') as f:
    doc = nlp(f.read())

In [101]:
doc[:36]

AN OCCURRENCE AT OWL CREEK BRIDGE

by Ambrose Bierce

I

A man stood upon a railroad bridge in northern Alabama, looking down
into the swift water twenty feet below.  

**2. How many tokens are contained in the file?**

In [102]:
len(doc)

4833

**3. How many sentences are contained in the file?**<br>HINT: You'll want to build a list first!

In [103]:
sentences = []
for sentence in doc.sents:
    sentences.append(sentence)
len(sentences)

211

**4. Print the second sentence in the document**<br> HINT: Indexing starts at zero, and the title counts as the first sentence.

In [104]:
sentences[1]

A man stood upon a railroad bridge in northern Alabama, looking down
into the swift water twenty feet below.  

**5. For each token in the sentence above, print its `text`, `POS` tag, `dep` tag and `lemma`**<br>
CHALLENGE: Have values line up in columns in the print output.**

In [105]:
for token in sentences[1]:
    print(f'{token.text:{15}} {token.pos_:{5}} {token.dep_:{10}} {token.lemma_:{15}}')

A               DET   det        a              
man             NOUN  nsubj      man            
stood           VERB  ROOT       stand          
upon            ADP   prep       upon           
a               DET   det        a              
railroad        NOUN  compound   railroad       
bridge          NOUN  pobj       bridge         
in              ADP   prep       in             
northern        ADJ   amod       northern       
Alabama         PROPN pobj       alabama        
,               PUNCT punct      ,              
looking         VERB  advcl      look           
down            PART  prt        down           

               SPACE            
              
into            ADP   prep       into           
the             DET   det        the            
swift           ADJ   amod       swift          
water           NOUN  pobj       water          
twenty          NUM   nummod     twenty         
feet            NOUN  npadvmod   foot           
below           ADV 

**6. Write a matcher called 'Swimming' that finds both occurrences of the phrase "swimming vigorously" in the text**<br>
HINT: You should include an `'IS_SPACE': True` pattern between the two words!

In [106]:
from spacy.matcher import Matcher
matcher = Matcher(nlp.vocab)
# Detect solarpower

# Swimming
pattern = [{'LOWER': 'swimming'}, {'IS_SPACE': True}, {'LOWER': 'vigorously'}]
matcher.add('Swimming', None, pattern)
found_matches = matcher(doc)
for match_id, start, end in found_matches:
    string_id = nlp.vocab.strings[match_id]  # get string representation
    span = doc[start:end]                    # get the matched span
    print(match_id, string_id, start, end, span.text)

12881893835109366681 Swimming 1274 1277 swimming
vigorously
12881893835109366681 Swimming 3607 3610 swimming
vigorously


**7. Print the text surrounding each found match**

In [108]:
for match_id, start, end in found_matches:
    string_id = nlp.vocab.strings[match_id]  # get string representation
    span = doc[start-9:end+5]                    # get the matched span
    print(span.text)

By diving I could evade the bullets and, swimming
vigorously, reach the bank,
all this over his shoulder; he was now swimming
vigorously with the current.  


**EXTRA CREDIT:<br>Print the *sentence* that contains each found match**

In [112]:
for i in range(2):
    for sent in sentences:
        if found_matches[i][1] < sent.end:
            print(sent)
            print()
            break

By diving I could evade the bullets and, swimming
vigorously, reach the bank, take to the woods and get away home.  

The hunted man saw all this over his shoulder; he was now swimming
vigorously with the current.  

