# Tokenization
Tokenization is the process of splitting text into meaningful segments (tokens)

Two forms -> Sentence tokenization and Word tokenization

Sentence tokenization is the splitting of a paragraph or passage into sentences.

Word tokenization, on the other hand, is the process of splitting sentences into words

In [40]:
import spacy

In [41]:
nlp = spacy.blank("en") # create English language object

doc = nlp("Dr. Strange loves pav bhaji of Mumbai as it costs only $2 per plate.")

for token in doc:
    print(token)

Dr.
Strange
loves
pav
bhaji
of
Mumbai
as
it
costs
only
$
2
per
plate
.


In [42]:
doc[-1]

.

In [43]:
docu = nlp('''"Let's go to N.Y.!"''')

for token in docu:
    print(token)

"
Let
's
go
to
N.Y.
!
"


In [44]:
type(nlp)

spacy.lang.en.English

In [45]:
type(doc)

spacy.tokens.doc.Doc

In [46]:
type(docu)

spacy.tokens.doc.Doc

In [47]:
type(token)

spacy.tokens.token.Token

In [48]:
# span -> substring
span = doc[1:5]
type(span)

spacy.tokens.span.Span

In [49]:
document = nlp("Tony gave two $ to Peter.")

In [50]:
token0 = document[0]
token0 

Tony

In [51]:
dir(token0)

['_',
 '__bytes__',
 '__class__',
 '__delattr__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__len__',
 '__lt__',
 '__ne__',
 '__new__',
 '__pyx_vtable__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__unicode__',
 'ancestors',
 'check_flag',
 'children',
 'cluster',
 'conjuncts',
 'dep',
 'dep_',
 'doc',
 'ent_id',
 'ent_id_',
 'ent_iob',
 'ent_iob_',
 'ent_kb_id',
 'ent_kb_id_',
 'ent_type',
 'ent_type_',
 'get_extension',
 'has_dep',
 'has_extension',
 'has_head',
 'has_morph',
 'has_vector',
 'head',
 'i',
 'idx',
 'iob_strings',
 'is_alpha',
 'is_ancestor',
 'is_ascii',
 'is_bracket',
 'is_currency',
 'is_digit',
 'is_left_punct',
 'is_lower',
 'is_oov',
 'is_punct',
 'is_quote',
 'is_right_punct',
 'is_sent_end',
 'is_sent_start',
 'is_space',
 'is_stop',
 'is_title',
 'is_upper',
 'lang',
 'lang_',
 'le

In [52]:
token0.is_alpha # checking alphabet

True

In [53]:
token.like_num

False

In [54]:
token2 = document[2]
token2.text


'two'

In [55]:
token2.like_num # checking number

True

In [56]:
token3 = document[3]
token3.text

'$'

In [57]:
token3.is_currency

True

In [58]:
for token in document:
    print(token, '==>', "index: ", token.i, 
          " is_alpha: ", token.is_alpha,
          " is_punct: ", token.is_punct, 
          " like_num: ", token.like_num,
          " is_currency: ", token.is_currency 
          )

Tony ==> index:  0  is_alpha:  True  is_punct:  False  like_num:  False  is_currency:  False
gave ==> index:  1  is_alpha:  True  is_punct:  False  like_num:  False  is_currency:  False
two ==> index:  2  is_alpha:  True  is_punct:  False  like_num:  True  is_currency:  False
$ ==> index:  3  is_alpha:  False  is_punct:  False  like_num:  False  is_currency:  True
to ==> index:  4  is_alpha:  True  is_punct:  False  like_num:  False  is_currency:  False
Peter ==> index:  5  is_alpha:  True  is_punct:  False  like_num:  False  is_currency:  False
. ==> index:  6  is_alpha:  False  is_punct:  True  like_num:  False  is_currency:  False


In [60]:
with open("students.txt") as f:
    text = f.readlines()

text

['Dayton high school, 8th grade students information\n',
 '\n',
 'Name\tbirth day   \temail\n',
 '-----\t------------\t------\n',
 'Virat   5 June, 1882    virat@kohli.com\n',
 'Maria\t12 April, 2001  maria@sharapova.com\n',
 'Serena  24 June, 1998   serena@williams.com \n',
 'Joe      1 May, 1997    joe@root.com\n',
 '\n',
 '\n',
 '\n']

In [61]:
text = ' '.join(text)
text



In [63]:
Doc = nlp(text)
emails = [] 
for token in Doc:
    if token.like_email:
        emails.append(token.text)
        
emails

['virat@kohli.com',
 'maria@sharapova.com',
 'serena@williams.com',
 'joe@root.com']

In [65]:
nlp = spacy.blank("hi")
hindi_doc = nlp("भैया जी! 5000 ₹ उधार थे वो वापस देदो")
for token in hindi_doc:
    print(token, token.is_currency, token.like_num)

भैया False False
जी False False
! False False
5000 False True
₹ True False
उधार False False
थे False False
वो False False
वापस False False
देदो False False


In [66]:
cdoc = nlp("gimme double cheese extra large healthy piza")

tokens = [token.text for token in cdoc]
tokens

['gimme', 'double', 'cheese', 'extra', 'large', 'healthy', 'piza']

In [76]:
from spacy.symbols import ORTH

nlp.tokenizer.add_special_case("gimme", [
    {ORTH: "gim"},
    {ORTH: "me"}  
])

cdoc = nlp("gimme double cheese extra large healthy piza")

tokens = [token.text for token in cdoc]
tokens

['gim', 'me', 'double', 'cheese', 'extra', 'large', 'healthy', 'piza']

In [88]:
# nlp.add_pipe('sentencizer')

In [89]:
nlp.pipe_names

['sentencizer']

In [90]:
doc = nlp("Dr. Strange loves pav bhaji of mumbai. Hulk loves chat of delhi")
for sentence in doc.sents:
    print(sentence)

Dr.
Strange loves pav bhaji of mumbai.
Hulk loves chat of delhi
