# Spacy & NLTK for NLP
* Spacy is generally faster than NLTK because it defaults to the most efficient method, whereas NLTK allows the user to choose algorithms

* NLTK is often preferred for certain tasks, like sentiment analysis

In [1]:
import spacy

In [2]:
# This is where the model is loaded
nlp = spacy.load('en_core_web_sm')

In [3]:
doc = nlp(u'Tesla is looking at buying U.S. startup for $6 million')

In [4]:
# One row per token: text, numeric POS id, POS label, dependency label.
for token in doc:
    print(f'{token.text} {token.pos} {token.pos_} {token.dep_}')

Tesla 95 PROPN nsubj
is 99 VERB aux
looking 99 VERB ROOT
at 84 ADP prep
buying 99 VERB pcomp
U.S. 95 PROPN compound
startup 91 NOUN dobj
for 84 ADP prep
$ 98 SYM quantmod
6 92 NUM compound
million 92 NUM pobj


In [5]:
# "ner" is short for "named entity recognizer"
nlp.pipeline

[('tagger', <spacy.pipeline.Tagger at 0x11c1bef10>),
 ('parser', <spacy.pipeline.DependencyParser at 0x11c5f2dd0>),
 ('ner', <spacy.pipeline.EntityRecognizer at 0x11c6203b0>)]

In [6]:
nlp.pipe_names

['tagger', 'parser', 'ner']

# Tokenization

In [7]:
doc2 = nlp(u"Tesla isn't looking into startups anymore.")

In [8]:
# One row per token: text, numeric POS id, POS label, dependency label.
# Note in the output that "isn't" is split into the tokens "is" and "n't".
for token in doc2:
    fields = (token.text, token.pos, token.pos_, token.dep_)
    print(' '.join(map(str, fields)))

Tesla 95 PROPN nsubj
is 99 VERB aux
n't 85 ADV neg
looking 99 VERB ROOT
into 84 ADP prep
startups 91 NOUN pobj
anymore 85 ADV advmod
. 96 PUNCT punct


In [9]:
doc2[0].pos_

'PROPN'

In [10]:
doc2[0].dep_

'nsubj'

In [11]:
# The text is assembled from adjacent string literals, so no backslash
# line-continuations sit inside the string; the resulting value is unchanged.
doc3 = nlp(
    'Although commmonly attributed to John Lennon from his song "Beautiful Boy", '
    'the phrase "Life is what happens to us while we are making other plans" was written by '
    "cartoonist Allen Saunders and published in Reader's Digest in 1957, when Lennon was 17."
)

In [12]:
# Tokens 16..29 cover the quoted sentence, including both quotation marks.
life_quote = doc3[16:30]
# Show the slice, then its type (a Span) next to the type of the full Doc.
display(life_quote, type(life_quote), type(doc3))

"Life is what happens to us while we are making other plans"

spacy.tokens.span.Span

spacy.tokens.doc.Doc

In [13]:
doc4 = nlp(u"This is the first sentence. This is another sentence. This is the last sentence.")

In [14]:
# doc4.sents yields one span per detected sentence; print each on its own line.
for sentence in doc4.sents:
    print(sentence.text)

This is the first sentence.
This is another sentence.
This is the last sentence.


In [15]:
doc4[6].is_sent_start

True

In [16]:
# Returns None (so no output is shown) because token 8 is not the start of a sentence
doc4[8].is_sent_start

<img src="Tokenization.png">

In [17]:
import spacy
nlp = spacy.load('en_core_web_sm')

In [18]:
# Triple-quoted so neither the apostrophe nor the double quotes need escaping.
my_string = '''"We're moving to L.A!"'''
display(my_string)  # repr form: outer quotes and the escape are visible
print(my_string)  # plain text form

'"We\'re moving to L.A!"'

"We're moving to L.A!"


In [19]:
doc = nlp(my_string)

In [20]:
# Print each token on its own line; quotes, "'re" and "!" come out as separate tokens.
for token in doc:
    print(token)

"
We
're
moving
to
L.A
!
"


In [21]:
doc2 = nlp(u"We're here to help! Send snail-mal, email support@oursite.com ot visit us as http://oursite.com")

In [22]:
# Print each token's text. In the output the e-mail address and the URL stay
# whole, while the hyphenated word is split around its hyphen.
for t in doc2:
    print(t.text)

We
're
here
to
help
!
Send
snail
-
mal
,
email
support@oursite.com
ot
visit
us
as
http://oursite.com


In [23]:
doc3 = nlp(u"A 5km NYC cab ride costs $10.30")

In [24]:
# Print each token's text. In the output "5km" is split into "5" and "km",
# and the "$" sign is separated from "10.30".
for t in doc3:
    print(t.text)

A
5
km
NYC
cab
ride
costs
$
10.30


In [25]:
doc4 = nlp(u"Let's visit St. Louis in the U.S. next year.")

In [26]:
# Print each token's text. In the output "St." and "U.S." keep their periods,
# while the sentence-final period is a token of its own.
for t in doc4:
    print(t.text)

Let
's
visit
St.
Louis
in
the
U.S.
next
year
.


In [27]:
# Number of tokens in a doc
len(doc4)

11

In [28]:
doc4.vocab

<spacy.vocab.Vocab at 0x11d020710>

In [29]:
len(doc4.vocab)

57853

In [30]:
doc5 = nlp(u"It is better to give than to receive.")

In [31]:
doc5[0]

It

In [32]:
doc5[2:5]

better to give

In [33]:
# Won't work: a Doc does not support item assignment, so a token cannot be replaced by index
doc5[0] = 'test'

TypeError: 'spacy.tokens.doc.Doc' object does not support item assignment

In [34]:
doc8 = nlp(u"Apple to build a Hong Kong factory for $6 million")

In [35]:
# Print all tokens on a single line, each followed by " | ".
for t in doc8:
    print(f'{t.text} | ', end='')

Apple | to | build | a | Hong | Kong | factory | for | $ | 6 | million | 

In [36]:
# For every named entity print its text, its label, and spaCy's explanation of
# that label, each on its own line. The trailing '\n' item plus print's own
# newline leave the same two blank lines between entities as before.
for entity in doc8.ents:
    print(entity, entity.label_, spacy.explain(entity.label_), '\n', sep='\n')

Apple
ORG
Companies, agencies, institutions, etc.


Hong Kong
GPE
Countries, cities, states


$6 million
MONEY
Monetary values, including unit




In [37]:
doc9 = nlp(u"Autonomous cars shift insurance liability toward manufacturers.")

In [38]:
# doc9.noun_chunks yields the noun phrases of the sentence; print each one.
for chunk in doc9.noun_chunks:
    print(chunk.text)

Autonomous cars
insurance liability
manufacturers


In [39]:
from spacy import displacy

In [40]:
doc = nlp(u"Apple is going to build a U.K. factory for $6 million.")

In [41]:
displacy.render(doc,style='dep',jupyter=True,options={'distance':110})

In [42]:
doc = nlp(u"Over the last quarter, Apple sold nearly 20 thousand iPods for a profit of $6 million.")

In [43]:
displacy.render(doc,style='ent',jupyter=True)

In [44]:
doc = nlp(u"This is a sentence.")
displacy.serve(doc,style='dep')
# http://127.0.0.1:5000


    Serving on port 5000...
    Using the 'dep' visualizer


    Shutting down server on port 5000.



# Stemming
* Stemming is not included in Spacy, but it is a common topic in NLP
> * Will use NLTK

In [48]:
# import nltk

In [49]:
from nltk.stem.porter import PorterStemmer

In [50]:
p_stemmer = PorterStemmer()

In [57]:
words = ['run', 'runner', 'ran', 'runs', 'easily', 'fairly', 'fairness']

In [58]:
# Show each word next to its Porter stem.
for word in words:
    print(f'{word}---->{p_stemmer.stem(word)}')

run---->run
runner---->runner
ran---->ran
runs---->run
easily---->easili
fairly---->fairli
fairness---->fair


In [54]:
from nltk.stem.snowball import SnowballStemmer

In [55]:
s_stemmer = SnowballStemmer(language='english')

In [59]:
# Show each word next to its Snowball (English) stem.
for word in words:
    print('---->'.join((word, s_stemmer.stem(word))))

run---->run
runner---->runner
ran---->ran
runs---->run
easily---->easili
fairly---->fair
fairness---->fair


In [60]:
words = ['generous', 'generation', 'generously', 'generate']

In [61]:
# Show each word next to its Snowball stem. In the output "generous" and
# "generation" end up with different stems.
for word in words:
    print(word, s_stemmer.stem(word), sep='--->')

generous--->generous
generation--->generat
generously--->generous
generate--->generat
