#Tokenization through internal python libraries

In [74]:
#Splitting white space
import re
import string
from spacy import displacy

In [5]:
#Creating sentence
text = 'I\'m with you for the entire life in U.K.!'

In [6]:
#Splitting sentence by re function
re.split(r'\W+', text)

['I', 'm', 'with', 'you', 'for', 'the', 'entire', 'life', 'in', 'U', 'K', '']

In [7]:
#Splitting text by split function
text.split(' ')

["I'm", 'with', 'you', 'for', 'the', 'entire', 'life', 'in', 'U.K.!']

In [8]:
#Splitting the text
words = text.split()

#Extracting the punctuals
re_punc = re.compile('[%s]'% re.escape(string.punctuation))

#Replacing punctuals with 
stripped = [re_punc.sub('', w) for w in words]
stripped

['Im', 'with', 'you', 'for', 'the', 'entire', 'life', 'in', 'UK']

In [14]:
#Splitting the text
words = text.split()

#Extracting the punctuals
re_print = re.compile('[^%s]'% re.escape(string.printable))

#Replacing punctuals with 
stripped = [re_print.sub('', w) for w in words]
stripped

["I'm", 'with', 'you', 'for', 'the', 'entire', 'life', 'in', 'U.K.!']

In [15]:
#Splitting the text
words = text.split()

#Converting text into lower case
words = [word.lower() for word in words]
words

["i'm", 'with', 'you', 'for', 'the', 'entire', 'life', 'in', 'u.k.!']

# Tokenization through spacy library

In [18]:
#Importing spacy library
import spacy
nlp = spacy.load('en_core_web_sm')

In [45]:
#Creating text
text = 'I\'m with you for the entire life in U.K.!'
doc = nlp(text)
for i in doc:
  print(i.text, end = '  |  ')


I  |  'm  |  with  |  you  |  for  |  the  |  entire  |  life  |  in  |  U.K.  |  !  |  

In [32]:
#Creating text
text = u'We are here to help! Send snail-mail, email bhavik.0901@gmail.com or visit us at https://bhavik.blogsspot.com/!'
doc = nlp(text)
for i in doc:
  print(i.text, end = ' | ')

We | are | here | to | help | ! | Send | snail | - | mail | , | email | bhavik.0901@gmail.com | or | visit | us | at | https://bhavik.blogsspot.com/ | ! | 

In [33]:
#Creating text
text = u'A 5km NYC cab ride costs $10.30'
doc = nlp(text)
for i in doc:
  print(i.text, end = ' | ')

A | 5 | km | NYC | cab | ride | costs | $ | 10.30 | 

In [40]:
#Creating tokenization function
def tokenization(text):
  doc = nlp(text)
  for i in doc:
    print(i.text)

In [41]:
#Lets tokenize the text
tokenization(u"Let's visit St.Louis in the U.S. next year.")

Let
's
visit
St
.
Louis
in
the
U.S.
next
year
.


In [79]:
#Creating text
text = u'We are here to help! Send snail-mail, email bhavik.0901@gmail.com or visit us at https://bhavik.blogsspot.com/!'
doc = nlp(text)
for i,j in enumerate(doc):
  print(i+1,j.text)

1 We
2 are
3 here
4 to
5 help
6 !
7 Send
8 snail
9 -
10 mail
11 ,
12 email
13 bhavik.0901@gmail.com
14 or
15 visit
16 us
17 at
18 https://bhavik.blogsspot.com/
19 !


In [81]:
#Checking text length
print(len(text))
#Checking token length
print(len(doc))
#Checking how much vocabulary available
print(len(doc.vocab))

111
19
543


In [60]:
#Accessing the token
text1 = nlp(u'It is better to give than to receive.')
#Extracting third word
text1[2] 
#Extracting till third words
text1[0:3]
#Extracting last word
text1[-2:]

receive.

In [62]:
#Trying to change horrible to delicious
text2 = nlp(u'My dinner was horrible')
text3 = nlp(u'Your dinner was delicious')
text2[3]=text3[3]

TypeError: ignored

In [71]:
#Extracting token with label and label informatio
text4 = nlp(u'Apple to build a Hong Kong factory for $6 million')

for i in text4:
  print(i.text, end = ' | ')

print('\n-------------')

for j in text4.ents:
  print(j.text + ' ------>', j.label_ + ' ------>' + str(spacy.explain(j.label_)))

Apple | to | build | a | Hong | Kong | factory | for | $ | 6 | million | 
-------------
Apple ------> ORG ------>Companies, agencies, institutions, etc.
Hong Kong ------> GPE ------>Countries, cities, states
$6 million ------> MONEY ------>Monetary values, including unit


In [67]:
#Checking length of text ents
len(text4.ents)

3

In [72]:
#Extracting noun chunks
text5 = nlp(u'Autonomous cars shift insurance liability towards manufacturers')
for i in text5.noun_chunks:
  print(i.text)

Autonomous cars
insurance liability
manufacturers


In [73]:
#Extracting noun chunks
text5 = nlp(u'Red cars do not carry higher insurance rates')
for i in text5.noun_chunks:
  print(i.text)

Red cars
higher insurance rates


In [75]:
#Visualize part-of-speech tags and syntactic dependencies
text6 = nlp(u'Apple is going to build a U.K. factory for $6 million.')
displacy.render(text6, style='dep', jupyter=True, options = {'distance':110})

In [78]:
#Visualize part-of-speech tags
text6 = nlp(u'Apple is going to build a U.K. factory for $6 million.')
displacy.render(text6, style='ent', jupyter = True)

In [None]:
##Visualize part-of-speech tags on system server
text6 = nlp(u'Apple is going to build a U.K. factory for $6 million.')
displacy.render(text6, style='dep')