# Tokenization.
The first step in creating a Doc object is to break down the incoming text into component pieces or "tokens".

In [3]:
# Import spaCy and load the small English model ('en_core_web_sm').
# `nlp` is the callable that every later cell uses to turn raw text into a Doc.
import spacy
nlp = spacy.load('en_core_web_sm')

In [4]:
# Sample text wrapped in literal double quotes; it also contains a
# contraction ("We're") and a dotted abbreviation ("L.A.").
mystring = "\"We're moving to L.A.!\""
print(mystring)

"We're moving to L.A.!"


In [5]:
# Run the pipeline on the string to obtain a Doc object (a sequence of tokens).
doc = nlp(mystring)

In [9]:
# Show the text of every token on one line, each followed by ' | '.
print(''.join(tok.text + ' | ' for tok in doc), end='')

" | We | 're | moving | to | L.A. | ! | " | 

In [13]:
# The exclamation marks, the comma, and the hyphen in 'snail-mail' each become
# separate tokens, while the email address and the URL are kept intact.
doc2 = nlp("We're here to help! Send snail-mail, email support@oursite.com or visit us at http://www.oursite.com!")

for token in doc2:
    print(token.text)


We
're
here
to
help
!
Send
snail
-
mail
,
email
support@oursite.com
or
visit
us
at
http://www.oursite.com
!


In [14]:
# The unit 'km' and the '$' sign are split off as their own tokens,
# but the amount '10.30' stays in one piece.
doc3 = nlp("A 5km NYC cab ride costs $10.30")

for token in doc3:
    print(token.text)

A
5
km
NYC
cab
ride
costs
$
10.30


In [15]:
# The abbreviations 'St.' and 'U.S.' keep their periods and remain single tokens,
# while the sentence-final period is split off.
doc4 = nlp("Let's visit St. Louis in the U.S. next year.")

for token in doc4:
    print(token.text)

Let
's
visit
St.
Louis
in
the
U.S.
next
year
.


In [16]:
# Count the tokens in doc4 (the "'s" clitic and the final period count as tokens too).
len(doc4)

11

In [17]:
# Size of the vocabulary object attached to the Doc. This counts vocab
# entries, not the tokens of doc4 itself.
# NOTE(review): the exact number presumably depends on the loaded model and
# on the text processed so far — confirm.
len(doc4.vocab)

510

In [22]:
# A short sentence used below to demonstrate indexing and slicing a Doc.
doc5 = nlp("this is complete understand of token.")

In [23]:
# Indexing a Doc by position returns the token at that position (here, the first one).
doc5[0]

this

In [38]:
# Slice from the third token up to, but not including, the last token (the final '.').
doc5[2:-1]

complete understand of token

In [54]:
# Sentence with a hyphenated name, a country, and a dollar amount for the entity examples.
doc8 = nlp("Ashish-Nayak can do Better then that in Canada with minimum $50000")

In [60]:
# First show how the sentence was tokenized, then list the named entities
# spaCy detected, each with its label and the label's human-readable explanation.
print(''.join(tok.text + ' | ' for tok in doc8), end='')

print('\n----')

for ent in doc8.ents:
    description = str(spacy.explain(ent.label_))
    print(' - '.join((ent.text, ent.label_, description)))

Ashish | - | Nayak | can | do | Better | then | that | in | Canada | with | minimum | $ | 50000 | 
----
Ashish-Nayak - PERSON - People, including fictional
Canada - GPE - Countries, cities, states
50000 - MONEY - Monetary values, including unit


In [62]:
# For each named entity, print its text, its label, and the label's explanation
# on separate lines, followed by blank lines as a visual separator.
for entity in doc8.ents:
    print(entity.text, entity.label_, spacy.explain(entity.label_), "\n", sep="\n")

Ashish-Nayak
PERSON
People, including fictional


Canada
GPE
Countries, cities, states


50000
MONEY
Monetary values, including unit




In [64]:
# List the noun chunks that spaCy finds in the sentence, one per line.
doc9 = nlp("Hy this is ashish nayak and who r you.")

for noun_chunk in doc9.noun_chunks:
    print(noun_chunk.text)

ashish nayak
who
you


### Tokenization visualizer.

In [65]:
from spacy import displacy

In [66]:
# Text for the dependency visualization (note: this rebinds `doc`).
doc = nlp("Hy this is an example of tokenization and break it into parts ")

In [72]:
# Render the dependency parse inline in the notebook; the 'distance' value
# is handed to displaCy through its options dict.
displacy.render(
    doc,
    style="dep",
    jupyter=True,
    options={"distance": 110},
)

In [86]:
# Sentence mentioning a person, a sum of money, two countries, and a duration
# for the entity visualization (rebinds `doc`).
doc = nlp("Hy this is to inform that Ashish_nayak has got a job of $1 million in Canada and come back to India after 5 years")

In [87]:
# Render the named entities of `doc` inline in the notebook.
displacy.render(doc, style="ent", jupyter=True)

In [82]:
# Another entity example: an organization, quantities, and a monetary value.
doc = nlp("Over the last quarter Apple sold nearly 20 thousand iPods for a profit of $6 million.")
displacy.render(doc, style="ent", jupyter=True)

In [None]:
# Creating visualizations outside of Jupyter: serve the dependency parse over HTTP.
# The recorded output shows the server listening on port 5000 and logging
# browser requests while this cell was running.
doc = nlp("This is a sentence.")
displacy.serve(doc, style="dep")

  "__main__", mod_spec)



Using the 'dep' visualizer
Serving on http://0.0.0.0:5000 ...



127.0.0.1 - - [31/May/2019 12:54:47] "GET / HTTP/1.1" 200 3395
127.0.0.1 - - [31/May/2019 12:54:47] "GET /favicon.ico HTTP/1.1" 200 3395


In [None]:
# After running the displacy.serve cell, open http://127.0.0.1:5000 in a browser to view the visualization.