### Import Necessary Libraries

In [4]:
import nltk
# Download the 'punkt_tab' tokenizer resource into the local nltk_data directory.
# The call returns True once the package is available.
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/isham993/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [11]:
from nltk.tokenize import sent_tokenize, word_tokenize, wordpunct_tokenize, TreebankWordTokenizer

### Sentence Tokenization

In [6]:
# Sample text used by every tokenizer demo in this notebook: three paragraphs
# separated by blank lines. It contains curly quotes (“ ”) and a curly
# apostrophe (’), which the different tokenizers handle differently.
# Reference: Wizard of Oz - https://www.gutenberg.org/files/55/55-h/55-h.htm

corpus = """Folklore, legends, myths and fairy tales have followed childhood through the ages, for every healthy youngster has a wholesome and instinctive love for stories fantastic, marvelous and manifestly unreal. The winged fairies of Grimm and Andersen have brought more happiness to childish hearts than all other human creations.

Yet the old time fairy tale, having served for generations, may now be classed as “historical” in the children’s library; for the time has come for a series of newer “wonder tales” in which the stereotyped genie, dwarf and fairy are eliminated, together with all the horrible and blood-curdling incidents devised by their authors to point a fearsome moral to each tale. Modern education includes morality; therefore the modern child seeks only entertainment in its wonder tales and gladly dispenses with all disagreeable incident.

Having this thought in mind, the story of “The Wonderful Wizard of Oz” was written solely to please children of today. It aspires to being a modernized fairy tale, in which the wonderment and joy are retained and the heartaches and nightmares are left out.
"""

Split the corpus into individual sentences with `sent_tokenize`.

In [7]:
# Segment the raw corpus into sentences; the result is a list with one string per sentence.
documents = sent_tokenize(corpus)

In [8]:
# Check the container type returned by sent_tokenize.
type(documents)

list

In [10]:
# Display the sentence list to inspect where the tokenizer placed the sentence boundaries.
documents

['Folklore, legends, myths and fairy tales have followed childhood through the ages, for every healthy youngster has a wholesome and instinctive love for stories fantastic, marvelous and manifestly unreal.',
 'The winged fairies of Grimm and Andersen have brought more happiness to childish hearts than all other human creations.',
 'Yet the old time fairy tale, having served for generations, may now be classed as “historical” in the children’s library; for the time has come for a series of newer “wonder tales” in which the stereotyped genie, dwarf and fairy are eliminated, together with all the horrible and blood-curdling incidents devised by their authors to point a fearsome moral to each tale.',
 'Modern education includes morality; therefore the modern child seeks only entertainment in its wonder tales and gladly dispenses with all disagreeable incident.',
 'Having this thought in mind, the story of “The Wonderful Wizard of Oz” was written solely to please children of today.',
 'It a

### Word Tokenization

In [13]:
# Word-level tokenization: print one token list per sentence.
for sentence in documents:
    tokens = word_tokenize(sentence)
    print(tokens)

['Folklore', ',', 'legends', ',', 'myths', 'and', 'fairy', 'tales', 'have', 'followed', 'childhood', 'through', 'the', 'ages', ',', 'for', 'every', 'healthy', 'youngster', 'has', 'a', 'wholesome', 'and', 'instinctive', 'love', 'for', 'stories', 'fantastic', ',', 'marvelous', 'and', 'manifestly', 'unreal', '.']
['The', 'winged', 'fairies', 'of', 'Grimm', 'and', 'Andersen', 'have', 'brought', 'more', 'happiness', 'to', 'childish', 'hearts', 'than', 'all', 'other', 'human', 'creations', '.']
['Yet', 'the', 'old', 'time', 'fairy', 'tale', ',', 'having', 'served', 'for', 'generations', ',', 'may', 'now', 'be', 'classed', 'as', '“', 'historical', '”', 'in', 'the', 'children', '’', 's', 'library', ';', 'for', 'the', 'time', 'has', 'come', 'for', 'a', 'series', 'of', 'newer', '“', 'wonder', 'tales', '”', 'in', 'which', 'the', 'stereotyped', 'genie', ',', 'dwarf', 'and', 'fairy', 'are', 'eliminated', ',', 'together', 'with', 'all', 'the', 'horrible', 'and', 'blood-curdling', 'incidents', 'devis

In [14]:
# wordpunct_tokenize separates punctuation from alphabetic runs even inside a word:
# 'blood-curdling' becomes 'blood', '-', 'curdling', whereas word_tokenize keeps it whole.
for sentence in documents:
    tokens = wordpunct_tokenize(sentence)
    print(tokens)

['Folklore', ',', 'legends', ',', 'myths', 'and', 'fairy', 'tales', 'have', 'followed', 'childhood', 'through', 'the', 'ages', ',', 'for', 'every', 'healthy', 'youngster', 'has', 'a', 'wholesome', 'and', 'instinctive', 'love', 'for', 'stories', 'fantastic', ',', 'marvelous', 'and', 'manifestly', 'unreal', '.']
['The', 'winged', 'fairies', 'of', 'Grimm', 'and', 'Andersen', 'have', 'brought', 'more', 'happiness', 'to', 'childish', 'hearts', 'than', 'all', 'other', 'human', 'creations', '.']
['Yet', 'the', 'old', 'time', 'fairy', 'tale', ',', 'having', 'served', 'for', 'generations', ',', 'may', 'now', 'be', 'classed', 'as', '“', 'historical', '”', 'in', 'the', 'children', '’', 's', 'library', ';', 'for', 'the', 'time', 'has', 'come', 'for', 'a', 'series', 'of', 'newer', '“', 'wonder', 'tales', '”', 'in', 'which', 'the', 'stereotyped', 'genie', ',', 'dwarf', 'and', 'fairy', 'are', 'eliminated', ',', 'together', 'with', 'all', 'the', 'horrible', 'and', 'blood', '-', 'curdling', 'incidents'

In [15]:
# TreebankWordTokenizer is applied here to the whole multi-sentence corpus rather than to
# one sentence at a time. In the output, sentence-internal full stops stay attached to the
# preceding word ('unreal.', 'creations.') and curly quotes stay attached too ('“historical”').
# NOTE(review): NLTK describes this tokenizer as expecting pre-split sentences, so only the
# final full stop of the input should be split off as a separate token — confirm in the NLTK docs.
tokenizer = TreebankWordTokenizer()
tokenizer.tokenize(corpus)

['Folklore',
 ',',
 'legends',
 ',',
 'myths',
 'and',
 'fairy',
 'tales',
 'have',
 'followed',
 'childhood',
 'through',
 'the',
 'ages',
 ',',
 'for',
 'every',
 'healthy',
 'youngster',
 'has',
 'a',
 'wholesome',
 'and',
 'instinctive',
 'love',
 'for',
 'stories',
 'fantastic',
 ',',
 'marvelous',
 'and',
 'manifestly',
 'unreal.',
 'The',
 'winged',
 'fairies',
 'of',
 'Grimm',
 'and',
 'Andersen',
 'have',
 'brought',
 'more',
 'happiness',
 'to',
 'childish',
 'hearts',
 'than',
 'all',
 'other',
 'human',
 'creations.',
 'Yet',
 'the',
 'old',
 'time',
 'fairy',
 'tale',
 ',',
 'having',
 'served',
 'for',
 'generations',
 ',',
 'may',
 'now',
 'be',
 'classed',
 'as',
 '“historical”',
 'in',
 'the',
 'children’s',
 'library',
 ';',
 'for',
 'the',
 'time',
 'has',
 'come',
 'for',
 'a',
 'series',
 'of',
 'newer',
 '“wonder',
 'tales”',
 'in',
 'which',
 'the',
 'stereotyped',
 'genie',
 ',',
 'dwarf',
 'and',
 'fairy',
 'are',
 'eliminated',
 ',',
 'together',
 'with',
 'al