# spaCy & NLTK for NLP
* spaCy is generally faster than NLTK because it defaults to the most efficient method for each task, whereas NLTK lets the user choose among algorithms

* NLTK is often preferred for certain tasks, such as sentiment analysis

In [1]:
import spacy

In [2]:
# This is where the model is loaded: spacy.load returns the pipeline object for
# the 'en_core_web_sm' package. It is bound to `nlp` and called on raw text in
# the cells that follow.
nlp = spacy.load('en_core_web_sm')

In [3]:
# Run a sample sentence through the pipeline; the resulting `doc` is walked
# token by token in the next cell.
doc = nlp(u'Tesla is looking at buying U.S. startup for $6 million')

In [16]:
# One output row per token: surface text, numeric POS id, POS name, and
# dependency label, separated by single spaces.
for tok in doc:
    print(' '.join(str(field) for field in (tok.text, tok.pos, tok.pos_, tok.dep_)))

Tesla 95 PROPN nsubj
is 99 VERB aux
looking 99 VERB ROOT
at 84 ADP prep
buying 99 VERB pcomp
U.S. 95 PROPN compound
startup 91 NOUN dobj
for 84 ADP prep
$ 98 SYM quantmod
6 92 NUM compound
million 92 NUM pobj


In [17]:
# Show the pipeline components as (name, component) pairs.
# "ner" is short for "named entity recognizer".
nlp.pipeline

[('tagger', <spacy.pipeline.Tagger at 0x10d83c690>),
 ('parser', <spacy.pipeline.DependencyParser at 0x10dc6c830>),
 ('ner', <spacy.pipeline.EntityRecognizer at 0x10dc6cdd0>)]

In [18]:
# Just the component names, in the same order as nlp.pipeline.
nlp.pipe_names

['tagger', 'parser', 'ner']

# Tokenization

In [5]:
# A sentence containing a contraction ("isn't") and a trailing period, to see
# how each ends up as its own token in the output below.
doc2 = nlp(u"Tesla isn't looking into startups anymore.")

In [6]:
# Same per-token report as before, now for doc2: text, numeric POS id,
# POS name, dependency label.
for tok in doc2:
    row = (tok.text, tok.pos, tok.pos_, tok.dep_)
    print(*row)

Tesla 95 PROPN nsubj
is 99 VERB aux
n't 85 ADV neg
looking 99 VERB ROOT
into 84 ADP prep
startups 91 NOUN pobj
anymore 85 ADV advmod
. 96 PUNCT punct


In [7]:
# POS name of the first token of doc2 ("Tesla").
doc2[0].pos_

'PROPN'

In [8]:
# Dependency label of the first token of doc2 ("Tesla").
doc2[0].dep_

'nsubj'

In [9]:
# A longer passage with an embedded quotation. The trailing backslashes continue
# the string literal onto the next line without inserting a newline, and \' keeps
# the apostrophe in "Reader's" from ending the single-quoted string.
# NOTE(review): "commmonly" is misspelled in the sample text; it is runtime input,
# so it is left untouched here.
doc3 = nlp(u'Although commmonly attributed to John Lennon from his song "Beautiful Boy", \
the phrase "Life is what happens to us while we are making other plans" was written by \
cartoonist Allen Saunders and published in Reader\'s Digest in 1957, when Lennon was 17.')

In [12]:
# Slice tokens 16 through 29 out of doc3 (the quoted phrase, including its
# quotation marks), then show the slice followed by the type of the slice and
# the type of the document it came from.
life_quote = doc3[16:30]
display(life_quote, type(life_quote), type(doc3))

"Life is what happens to us while we are making other plans"

spacy.tokens.span.Span

spacy.tokens.doc.Doc

In [13]:
# Three short sentences in one string, used below to look at sentence boundaries.
doc4 = nlp(u"This is the first sentence. This is another sentence. This is the last sentence.")

In [14]:
# doc4.sents yields the document one sentence at a time; print each on its own line.
for sent in doc4.sents:
    print(sent)

This is the first sentence.
This is another sentence.
This is the last sentence.


In [16]:
# Token 6 is the first token of the second sentence, so the flag is True.
doc4[6].is_sent_start

True

In [18]:
# Token 8 sits in the middle of a sentence, so this evaluates to None (not False);
# that is why the notebook shows no output for this cell.
doc4[8].is_sent_start

<img src="Tokenization.png">

In [19]:
# NOTE(review): spacy is already imported and the same 'en_core_web_sm' model is
# already loaded into `nlp` near the top of the notebook, so this cell repeats
# that work and rebinds `nlp`. Presumably a restart point for the tokenization
# section; confirm whether it is still needed.
import spacy
nlp = spacy.load('en_core_web_sm')

In [22]:
# A string that itself contains double quotes and an apostrophe (escaped as \').
# display() shows the repr, escapes and outer quotes included; print() shows the
# actual characters.
my_string = '"We\'re moving to L.A!"'
display(my_string)
print(my_string)

'"We\'re moving to L.A!"'

"We're moving to L.A!"


In [23]:
# Tokenize my_string. This rebinds `doc`, replacing the document created earlier.
doc = nlp(my_string)

In [24]:
# Print the text of every token, one per line, to see where the quotes,
# contraction, and punctuation were split off.
for tok in doc:
    print(tok.text)

"
We
're
moving
to
L.A
!
"


In [25]:
# Sample with a hyphenated word, an e-mail address, and a URL. Rebinds `doc2`.
# NOTE(review): the sample text contains typos ("snail-mal", "ot", "as"); they are
# runtime input and appear in the recorded output, so they are left untouched.
doc2 = nlp(u"We're here to help! Send snail-mal, email support@oursite.com ot visit us as http://oursite.com")

In [26]:
# One token per line: the hyphenated word is split apart, while the e-mail
# address and the URL each stay whole.
for tok in doc2:
    print(tok)

We
're
here
to
help
!
Send
snail
-
mal
,
email
support@oursite.com
ot
visit
us
as
http://oursite.com


In [27]:
# Sample with a number fused to a unit ("5km") and a currency amount ("$10.30").
# Rebinds `doc3`.
doc3 = nlp(u"A 5km NYC cab ride costs $10.30")

In [28]:
# One token per line: the number is separated from its unit and the currency
# symbol from the amount, while the decimal amount stays whole.
for tok in doc3:
    print(tok)

A
5
km
NYC
cab
ride
costs
$
10.30


In [29]:
# Sample with abbreviations that contain periods ("St.", "U.S.") plus a
# sentence-final period. Rebinds `doc4`.
doc4 = nlp(u"Let's visit St. Louis in the U.S. next year.")

In [30]:
# One token per line: the abbreviations keep their periods, while the final
# period becomes its own token.
for tok in doc4:
    print(tok)

Let
's
visit
St.
Louis
in
the
U.S.
next
year
.


In [31]:
# len() on a Doc gives its number of tokens (punctuation tokens included).
len(doc4)

11

In [32]:
# The Doc exposes the Vocab object it is attached to.
doc4.vocab

<spacy.vocab.Vocab at 0x11468d320>

In [33]:
# Size of that vocabulary object. This is not the token count of doc4 (compare
# with len(doc4)).
len(doc4.vocab)

57853

In [34]:
# Short sentence used below to demonstrate indexing and slicing a Doc.
doc5 = nlp(u"It is better to give than to receive.")

In [35]:
# Integer indexing picks out a single token (here the first one).
doc5[0]

It

In [36]:
# Slicing picks out a run of consecutive tokens (indices 2, 3 and 4 here).
doc5[2:5]

better to give

In [37]:
# Won't work: a Doc does not support item assignment, so a token cannot be
# replaced by assigning to an index. The TypeError is the point of this cell, so
# it is caught and its message printed rather than left to abort a
# "Restart & Run All".
try:
    doc5[0] = 'test'
except TypeError as err:
    print('TypeError:', err)

TypeError: 'spacy.tokens.doc.Doc' object does not support item assignment

In [38]:
# Sample mentioning a company, a place, and a sum of money, used below to look
# at named entities.
doc8 = nlp(u"Apple to build a Hong Kong factory for $6 million")

In [39]:
# Print all token texts on a single line, each followed by ' | ' instead of a newline.
for tok in doc8:
    print(tok.text, end=' | ')

Apple | to | build | a | Hong | Kong | factory | for | $ | 6 | million | 

In [42]:
# For each named entity in doc8, print its text, its label, and spaCy's
# description of that label on separate lines, followed by two blank lines
# (the '\n' argument plus print's own line ending).
for ent in doc8.ents:
    print(ent, ent.label_, str(spacy.explain(ent.label_)), '\n', sep='\n')

Apple
ORG
Companies, agencies, institutions, etc.


Hong Kong
GPE
Countries, cities, states


$6 million
MONEY
Monetary values, including unit




In [43]:
# Sample sentence used below to look at noun chunks.
doc9 = nlp(u"Autonomous cars shift insurance liability toward manufacturers.")

In [44]:
# doc9.noun_chunks yields the noun phrases of the sentence; print one per line.
for phrase in doc9.noun_chunks:
    print(phrase)

Autonomous cars
insurance liability
manufacturers
