# Tokenization and Streaming with spaCy
- Modern spaCy is built around a pipeline object: text is passed in and flows through a sequence of processing steps, like a stream process.

In [1]:
import spacy

In [2]:
# Load the small English model; `nlp` is the pipeline object that every later cell calls.
nlp = spacy.load("en_core_web_sm")

In [3]:
# Run the pipeline on a short sentence; the result is kept in `doc`.
doc = nlp(u"Tesla is looking a buying U.S. startups for $6 million")

In [4]:
# Walk the tokens and show each one's text, part-of-speech ID and dependency label.
# Note: `token.pos` is the integer ID; the underscore form `token.pos_` gives the readable tag.
for token in doc:
    print(token.text, token.pos, token.dep_, sep=", ")

Tesla, 96, nsubj
is, 87, aux
looking, 100, ROOT
a, 90, det
buying, 100, amod
U.S., 96, compound
startups, 92, dobj
for, 85, prep
$, 99, quantmod
6, 93, compound
million, 93, pobj


In [5]:
# List the (name, component) pairs that make up the pipeline.
nlp.pipeline

[('tagger', <spacy.pipeline.pipes.Tagger at 0x110b4d0f0>),
 ('parser', <spacy.pipeline.pipes.DependencyParser at 0x1110be0a8>),
 ('ner', <spacy.pipeline.pipes.EntityRecognizer at 0x1110be108>)]

In [6]:
# Only the component names, without the component objects.
nlp.pipe_names

['tagger', 'parser', 'ner']

In [7]:
# A sentence with a contraction, to see how "isn't" is split into tokens.
doc_2 = nlp(u"Tesla isn't looking startups anymore.")

In [8]:
# Same walk as before, this time with the readable (string) part-of-speech tag.
for token in doc_2:
    print(", ".join([token.text, token.pos_, token.dep_]))

Tesla, PROPN, nsubj
is, AUX, aux
n't, PART, neg
looking, VERB, ROOT
startups, NOUN, dobj
anymore, ADV, advmod
., PUNCT, punct


In [9]:
# Tokens can be fetched by position; show the first one together with its POS tag.
print("Token: {0}, POS: {0.pos_}".format(doc_2[0]))

Token: Tesla, POS: PROPN


# Span from a Large Doc

In [10]:
# A longer text to slice a Span out of. Adjacent string literals are joined by the
# parser, so the text passed to the pipeline is one single line.
doc_3 = nlp(
    u'Although commmonly attributed to John Lennon from his song "Beautiful Boy", '
    u'the phrase "Life is what happens to us while we are making other plans" was written by '
    u'cartoonist Allen Saunders and published in Reader\'s Digest in 1957, when Lennon was 17.'
)

In [11]:
# Slice by token position: tokens 16 through 29 cover the quoted phrase, quotation marks included.
life_quote = doc_3[16:30]

In [12]:
# Printing the slice shows its text.
print(life_quote)

"Life is what happens to us while we are making other plans"


In [13]:
# A slice of a Doc is a Span, not a new Doc.
type(life_quote)

spacy.tokens.span.Span

# Separate Sentences in spaCy

In [14]:
# Three short sentences to demonstrate sentence segmentation.
doc_4 = nlp(u'This is the first sentence. This is another sentence. This is the last sentence.')

In [15]:
# `doc_4.sents` yields the sentences one at a time; print each on its own line.
for sentence in doc_4.sents:
    print(sentence)

This is the first sentence.
This is another sentence.
This is the last sentence.


# Tokenization

In [16]:
# The apostrophe is escaped because the literal uses single quotes;
# the double quotes are part of the text itself.
my_string = '"We\'re moving to L.A.!"'

In [17]:
# Check the raw text before tokenizing it.
print(my_string)

"We're moving to L.A.!"


In [18]:
# Rebinds `doc` (previously the Tesla sentence) to the new text.
doc = nlp(my_string)

In [19]:
# One token per line: the quotes, the "'re" clitic and the final "!" are separate tokens,
# while "L.A." stays in one piece.
for token in doc:
    print(token.text)

"
We
're
moving
to
L.A.
!
"


## Clever Spacy

In [20]:
# Text with a hyphenated word, an e-mail address and a URL. Rebinds `doc_2`.
doc_2 = nlp(u"We're here to help! Send snail-mail, email support@oursite.com or visit us at http://www.oursite.com!")

In [21]:
# The hyphen is split off ("snail", "-", "mail"), but the e-mail address and the URL stay whole.
for t in doc_2:
    print(t)

We
're
here
to
help
!
Send
snail
-
mail
,
email
support@oursite.com
or
visit
us
at
http://www.oursite.com
!


In [23]:
# Units and currency symbols are separated from their numbers ("5" / "km", "$" / "10.30").
# Rebinds `doc_3`.
doc_3 = nlp(u'A 5km NYC cab ride costs $10.30')

for t in doc_3:
    print(t)

A
5
km
NYC
cab
ride
costs
$
10.30


In [24]:
# Abbreviations keep their periods ("St.", "U.S."); only the sentence-final period is its own token.
# Rebinds `doc_4`.
doc_4 = nlp(u"Let's visit St. Louis in the U.S. next year.")

for t in doc_4:
    print(t)

Let
's
visit
St.
Louis
in
the
U.S.
next
year
.


In [25]:
# len() of a Doc is its number of tokens (11 for the sentence above).
len(doc_4)

11

In [26]:
# Number of entries in the vocabulary object attached to the Doc.
len(doc_4.vocab)

552

In [31]:
# NOTE(review): this cell was executed out of order (In [31]); the larger count is presumably
# because more text had been processed by then — confirm with a Restart & Run All.
len(doc.vocab)

557

In [27]:
# Sentence with an organisation, a place and a money amount, used for the entity demo below.
doc_8 = nlp(u"Apple to build a Hong Kong factory for $6 million")

In [29]:
# Print all tokens on one line; `end=" | "` replaces the newline with a separator.
for t in doc_8:
    print(t.text, end=" | ")

Apple | to | build | a | Hong | Kong | factory | for | $ | 6 | million | 

In [35]:
# Named entities: for each entity show its text, its label and spaCy's explanation of that label,
# followed by two blank lines.
for e in doc_8.ents:
    print(e, e.label_, str(spacy.explain(e.label_)), "\n", sep="\n")

Apple
ORG
Companies, agencies, institutions, etc.


Hong Kong
GPE
Countries, cities, states


$6 million
MONEY
Monetary values, including unit




In [36]:
# Noun chunks: `doc_9.noun_chunks` yields the noun phrases of the sentence
# (here "Autonomous cars", "insurance liability", "manufacturers").
doc_9 = nlp(u"Autonomous cars shift insurance liability toward manufacturers.")

for chunk in doc_9.noun_chunks:
    print(chunk.text)

Autonomous cars
insurance liability
manufacturers


# Visualising Spacy Elements

In [37]:
from spacy import displacy

In [38]:
# Sentence to visualise. Rebinds `doc`.
doc = nlp(u'Apple is going to build a U.K. factory for $6 million.')

In [43]:
# Draw the dependency parse inline in the notebook; `distance` is a displaCy layout option.
displacy.render(doc, style="dep", jupyter=True, options={"distance": 80})

In [44]:
# Named-entity view: style='ent' renders the entities of the text rather than the dependency parse.
# Rebinds `doc`.
doc = nlp(u'Over the last quarter Apple sold nearly 20 thousand iPods for a profit of $6 million.')
displacy.render(doc, style='ent', jupyter=True)

# Stemming
- Most of the time stemming is of little use, because it simply cuts off word endings; lemmatization (next section) is used instead.

# Lemmatization

In [45]:
# Several forms of "run" (runner, running, run, ran) to compare their lemmas.
doc1 = nlp(u"I am a runner running in a race because I love to run since I ran today")

In [48]:
# Columns: text, POS tag, lemma as an integer (`t.lemma`), lemma as a string (`t.lemma_`).
# "running", "run" and "ran" share one lemma; "runner" keeps its own.
for t in doc1:
    print("{:12}\t{:6}\t{:22}\t{}".format(t.text, t.pos_, t.lemma, t.lemma_))

I           	PRON  	    561228191312463089	-PRON-
am          	AUX   	  10382539506755952630	be
a           	DET   	  11901859001352538922	a
runner      	NOUN  	  12640964157389618806	runner
running     	VERB  	  12767647472892411841	run
in          	ADP   	   3002984154512732771	in
a           	DET   	  11901859001352538922	a
race        	NOUN  	   8048469955494714898	race
because     	SCONJ 	  16950148841647037698	because
I           	PRON  	    561228191312463089	-PRON-
love        	VERB  	   3702023516439754181	love
to          	PART  	   3791531372978436496	to
run         	VERB  	  12767647472892411841	run
since       	SCONJ 	  10066841407251338481	since
I           	PRON  	    561228191312463089	-PRON-
ran         	VERB  	  12767647472892411841	run
today       	NOUN  	  11042482332948150395	today


# Stop Words

In [49]:
# Default English stop words. They are stored as a set, so the printed order is arbitrary.
print(nlp.Defaults.stop_words)

{'done', 'them', 'thereupon', 'always', 'whoever', 'for', 'give', 'several', 'this', 'many', 'will', 're', 'upon', 'not', 'hers', 'thereafter', 'up', '‘d', 'even', 'along', 'why', 'above', '’m', 'via', 'the', 'another', 'being', 'must', 'empty', 'more', 'down', 'unless', 'regarding', 'whereupon', 'some', 'might', 'ever', 'full', 'beforehand', 'on', 'during', 'say', 'by', 'forty', 'three', 'except', 'those', 'around', 'whence', 'besides', 'have', 'would', 'such', 'call', 'indeed', 'third', 'its', 'nor', 'below', 'eight', 'per', 'whole', 'among', 'i', 'keep', 'doing', '‘m', 'someone', 'whereas', 'everything', 'anyone', 'somehow', 'least', 'each', 'else', 'is', 'there', 'thereby', 'used', 'get', 'sometime', 'was', 'within', 'thus', 'made', 'latter', 'be', 'without', 'until', 'that', 'every', 'your', 'when', 'after', 'latterly', 'towards', 'hence', 'still', 'through', 'due', 'former', 'become', '‘re', 'however', 'please', 'thence', 'mine', 'do', 'noone', 'very', 'few', 'formerly', 'amongst

In [50]:
# Check whether a single word is flagged as a stop word in the vocabulary.
nlp.vocab["is"].is_stop

True

In [51]:
# Remove "beyond" from the stop words.
# `discard` is used instead of `remove` so that re-running this cell does not raise KeyError
# once the word is already gone, and the lexeme flag is cleared as well so that
# `nlp.vocab["beyond"].is_stop` agrees with the default set.
nlp.Defaults.stop_words.discard("beyond")
nlp.vocab["beyond"].is_stop = False

# Vocabulary and Matching

In [52]:
from spacy.matcher import Matcher

In [53]:
# The matcher is built on the pipeline's vocabulary.
matcher = Matcher(nlp.vocab)

In [54]:
# Each pattern is a list with one dict per token to match.
# "solarpower" written as a single token:
pattern_1 = [{"LOWER": "solarpower"}]
# "solar", one punctuation token, "power" (e.g. "Solar-power"):
pattern_2 = [{"LOWER": "solar"}, {"IS_PUNCT": True}, {"LOWER": "power"}]
# "solar" directly followed by "power":
pattern_3 = [{"LOWER": "solar"}, {"LOWER": "power"}]

In [55]:
# Register all three patterns under one key. The second argument is the on-match callback (none here).
# NOTE(review): this is the spaCy v2 call form; spaCy v3 expects the patterns as a single list.
matcher.add('SolarPower', None, pattern_1, pattern_2, pattern_3)

In [56]:
# Test text containing all three spellings. The trailing backslash continues the string
# literal on the next line without inserting a newline. Rebinds `doc`.
doc = nlp(u'The Solar Power industry continues to grow as demand \
for solarpower increases. Solar-power cars are gaining popularity.')

In [57]:
# Calling the matcher on a Doc returns a list of (match_id, start, end) tuples.
found_matches = matcher(doc)

In [58]:
# All three matches carry the same id because they were registered under the same key.
print(found_matches)

[(8656102463236116519, 1, 3), (8656102463236116519, 10, 11), (8656102463236116519, 13, 16)]


In [70]:
# Second version of the patterns: one-token "solarpower", or "solar" followed by any run of
# punctuation tokens (OP '*') and a token whose lemma is "power".
pattern_1 = [{'LOWER': 'solarpower'}]
pattern_2 = [{'LOWER': 'solar'}, {'IS_PUNCT': True, 'OP': '*'}, {'LEMMA': 'power'}]

# Re-register under the same key: drop the old pattern set first so it is not
# duplicated, then add the new one.
matcher.remove('SolarPower')
matcher.add('SolarPower', None, pattern_1, pattern_2)

In [71]:
# The new patterns catch "Solar--power" (tokens 0-3) and, through the lemma, "solar-powered" (tokens 5-8).
# Rebinds `doc_2` and `found_matches`.
doc_2 = nlp(u'Solar--power energy runs solar-powered cars.')
found_matches = matcher(doc_2)
print(found_matches)

[(8656102463236116519, 0, 3), (8656102463236116519, 5, 8)]


# Manipulating a Large Document

In [3]:
# Setup repeated for this section: the execution counter restarts at In [3] here,
# which suggests these cells were run in a fresh kernel.
import spacy
nlp = spacy.load("en_core_web_sm")
from spacy.matcher import Matcher

In [4]:
# Read the whole text file and run it through the pipeline in one go.
# NOTE(review): no `encoding=` is given, so the platform default is used — confirm the
# file's encoding and pass it explicitly if the notebook must run on other machines.
with open("../raw_data/TextFiles/owlcreek.txt") as f:
    doc = nlp(f.read())

In [5]:
# The pipeline returns a Doc regardless of the text's length.
type(doc)

spacy.tokens.doc.Doc

In [6]:
# First 36 tokens: title, byline, section number and the opening sentence.
doc[:36]

AN OCCURRENCE AT OWL CREEK BRIDGE

by Ambrose Bierce

I

A man stood upon a railroad bridge in northern Alabama, looking down
into the swift water twenty feet below.  

In [7]:
# How many tokens does the document contain?
len(doc)

4835

In [8]:
# How many sentences? `doc.sents` is consumed into a list so the sentences can be
# counted and indexed in the cells below.
doc_sentences = list(doc.sents)
print(len(doc_sentences))

249


In [9]:
# The third sentence (index 2): the title and byline lines come before it.
print(doc_sentences[2].text)

A man stood upon a railroad bridge in northern Alabama, looking down
into the swift water twenty feet below.  


In [10]:
# Fixed-width table for the third sentence: text, POS tag, dependency label, lemma.
# The line break inside the sentence is a token too (tagged SPACE), which is why
# one row of the output appears broken across lines.
for t in doc_sentences[2]:
    print("{:15}{:10}{:12}{:15}".format(t.text, t.pos_, t.dep_, t.lemma_))

A              DET       det         a              
man            NOUN      nsubj       man            
stood          VERB      ROOT        stand          
upon           SCONJ     prep        upon           
a              DET       det         a              
railroad       NOUN      compound    railroad       
bridge         NOUN      pobj        bridge         
in             ADP       prep        in             
northern       ADJ       amod        northern       
Alabama        PROPN     pobj        Alabama        
,              PUNCT     punct       ,              
looking        VERB      advcl       look           
down           ADV       prt         down           

              SPACE                 
              
into           ADP       prep        into           
the            DET       det         the            
swift          ADJ       amod        swift          
water          NOUN      pobj        water          
twenty         NUM       nummod      twenty   

In [15]:
# Find occurrences of "swimming vigorously".
# The name `macher` (sic) is kept because the following cells refer to it.
macher = Matcher(nlp.vocab)

# Whitespace tokens (such as a line break) may sit between the two words;
# OP "*" allows any number of them, including none.
pattern = [
    {"LOWER": "swimming"},
    {"IS_SPACE": True, "OP": "*"},
    {"LOWER": "vigorously"}
]

In [16]:
# Register the pattern under the key "swimming"; None means no on-match callback.
macher.add("swimming", None, pattern)

In [17]:
# Run the matcher over the whole story; each entry is (match_id, start, end).
# The name `found_mathces` (sic) is kept because the following cells refer to it.
found_mathces = macher(doc)
print(found_mathces)

[(12526975369366237900, 1274, 1277), (12526975369366237900, 3609, 3612)]


In [18]:
def surrounding_text(doc_obj, start, end, window=5):
    """Print the part of ``doc_obj`` around ``[start, end)`` with some context.

    Args:
        doc_obj: Any object that supports slicing (a spaCy Doc in this notebook).
        start: Index of the first item of the match.
        end: Index one past the last item of the match.
        window: Number of extra items to show on each side (default 5).

    Returns:
        None; the slice is printed.
    """
    # Clamp at 0: a negative slice start would be counted from the end of the
    # sequence and produce an empty or unrelated slice for matches near the beginning.
    left = max(start - window, 0)
    print(doc_obj[left:end + window])

In [20]:
# Show both matches in context; indexes [1] and [2] of a match are its start and end offsets.
surrounding_text(doc, found_mathces[0][1], found_mathces[0][2])
surrounding_text(doc, found_mathces[1][1], found_mathces[1][2])

evade the bullets and, swimming
vigorously, reach the bank,
shoulder; he was now swimming
vigorously with the current.  


In [31]:
# Find the sentence that contains the first match. Two token ranges overlap when the
# match starts before the sentence ends and ends after the sentence starts.
for s in doc_sentences:
    if found_mathces[0][1] < s.end and found_mathces[0][2] > s.start:
        print("{}\n Sentence Start: {}, Occurance: {}, sentence End: {}".format(s, s.start, found_mathces[0], s.end))
        # Only the first overlapping sentence is reported.
        break

By diving I could evade the bullets and, swimming
vigorously, reach the bank, take to the woods and get away home.  
 Sentence Start: 1265, Occurance: (12526975369366237900, 1274, 1277), sentence End: 1292


# The End