### Sentence Tokenization in Spacy
Tokenization is a process of splitting text into meaningful sehments

In [2]:
import spacy

In [3]:
nlp = spacy.load("en_core_web_sm")

In [4]:
doc = nlp("The strange case of Dr. Jekyll and Mr Hyde. The Book Thief")

for sentence in doc.sents:
    print(sentence)

The strange case of Dr. Jekyll and Mr Hyde.
The Book Thief


In [5]:
for sentence in doc.sents:
    for word in sentence:
        print(word)

The
strange
case
of
Dr.
Jekyll
and
Mr
Hyde
.
The
Book
Thief


In [6]:
nlp = spacy.blank('en')
doc = nlp('The strange case of Dr. Jekyll and Mr Hyde. The Book Thief')

for token in doc:
    print(token)

The
strange
case
of
Dr.
Jekyll
and
Mr
Hyde
.
The
Book
Thief


In [9]:
doc = nlp('"Let\'s go to N.Y.!"')

for token in doc:
    print(token)

"
Let
's
go
to
N.Y.
!
"


In [10]:
doc = nlp('James gave two $ to Peter.')

In [16]:
token0 = doc[0]
token2 = doc[2]
token3 = doc[3]
token0

James

In [12]:
type(token0)

spacy.tokens.token.Token

In [13]:
dir(token0)

['_',
 '__bytes__',
 '__class__',
 '__delattr__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__len__',
 '__lt__',
 '__ne__',
 '__new__',
 '__pyx_vtable__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__unicode__',
 'ancestors',
 'check_flag',
 'children',
 'cluster',
 'conjuncts',
 'dep',
 'dep_',
 'doc',
 'ent_id',
 'ent_id_',
 'ent_iob',
 'ent_iob_',
 'ent_kb_id',
 'ent_kb_id_',
 'ent_type',
 'ent_type_',
 'get_extension',
 'has_dep',
 'has_extension',
 'has_head',
 'has_morph',
 'has_vector',
 'head',
 'i',
 'idx',
 'iob_strings',
 'is_alpha',
 'is_ancestor',
 'is_ascii',
 'is_bracket',
 'is_currency',
 'is_digit',
 'is_left_punct',
 'is_lower',
 'is_oov',
 'is_punct',
 'is_quote',
 'is_right_punct',
 'is_sent_end',
 'is_sent_start',
 'is_space',
 'is_stop',
 'is_title',
 'is_upper',
 'lang',
 'lang_',
 'le

In [19]:
token3.is_currency

True

In [25]:
with open('students.txt') as f:
    text = f.readlines()
text

['Dayton high school, 8th grade students information\n',
 '\n',
 'Name\tbirth day   \temail\n',
 '-----\t------------\t------\n',
 'Virat   5 June, 1882    virat@kohli.com\n',
 'Maria\t12 April, 2001  maria@sharapova.com\n',
 'Serena  24 June, 1998   serena@williams.com \n',
 'Joe      1 May, 1997    joe@root.com\n',
 '\n',
 '\n',
 '\n']

In [26]:
text = ' '.join(text)
text



In [27]:
doc = nlp(text)
emails = []

for token in doc:
    if token.like_email:
        emails.append(token)
print(emails)

[virat@kohli.com, maria@sharapova.com, serena@williams.com, joe@root.com]


In [30]:
doc = nlp("gimme double cheese extra large pizza")

tokens = [token.text for token in doc]
tokens

['gimme', 'double', 'cheese', 'extra', 'large', 'pizza']

In [35]:
from spacy.symbols import ORTH

nlp.tokenizer.add_special_case("gimme", [
    {ORTH:'gim'},
    {ORTH:'me'}
])
doc = nlp("gimme double cheese extra large pizza")
tokens = [token.text for token in doc]
tokens

['gim', 'me', 'double', 'cheese', 'extra', 'large', 'pizza']

In [38]:
nlp.add_pipe('sentencizer')

doc = nlp("The strange case of Dr. Jekyll and Mr Hyde. The Book Thief")

for sentence in doc.sents:
    print(sentence)

The strange case of Dr. Jekyll and Mr Hyde.
The Book Thief


In [39]:
text='''
Look for data to help you address the question. Governments are good
sources because data from public research is often freely available. Good
places to start include http://www.data.gov/, and http://www.science.
gov/, and in the United Kingdom, http://data.gov.uk/.
Two of my favorite data sets are the General Social Survey at http://www3.norc.org/gss+website/, 
and the European Social Survey at http://www.europeansocialsurvey.org/.
'''

In [44]:
nlp = spacy.load("en_core_web_sm")

doc = nlp(text)
websites = []
for token in doc:
    if token.like_url:
        websites.append(token)
print(websites)

[http://www.data.gov/, http://www.science, http://data.gov.uk/., http://www3.norc.org/gss+website/, http://www.europeansocialsurvey.org/.]


In [48]:
transactions = "Tony gave two $ to Peter, Bruce gave 500 € to Steve"

doc = nlp(transactions)
for token in doc:
    if token.like_num and token.is_currency:
        
        print(token)

two
$
500
€


### Sentence tokenization with NLTK

In [10]:
import nltk
nltk.download()

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


True

In [11]:
from nltk.tokenize import sent_tokenize

sent_tokenize("The strange case of Dr. Jekyll and Mr Hyde. The Book Thief")

['The strange case of Dr. Jekyll and Mr Hyde.', 'The Book Thief']

In [12]:
from nltk.tokenize import word_tokenize

word_tokenize("The strange case of Dr. Jekyll and Mr Hyde. The Book Thief")

['The',
 'strange',
 'case',
 'of',
 'Dr.',
 'Jekyll',
 'and',
 'Mr',
 'Hyde',
 '.',
 'The',
 'Book',
 'Thief']