## Importing dataset and libraries

In [189]:
import nltk

In [190]:
import re
import string
nltk.download('stopwords')
stopwords = nltk.corpus.stopwords.words("english")
stemmer = nltk.SnowballStemmer("english")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [191]:
nltk.download(["names","stopwords","gutenberg","punkt"])

[nltk_data] Downloading package names to /root/nltk_data...
[nltk_data]   Package names is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package gutenberg to /root/nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [192]:
stopwords = nltk.corpus.stopwords.words("english")

In [193]:
#load the corpus
corpus = nltk.corpus.gutenberg
corpus

<PlaintextCorpusReader in '/root/nltk_data/corpora/gutenberg'>

In [194]:
nltk.corpus.gutenberg.fileids()

['austen-emma.txt',
 'austen-persuasion.txt',
 'austen-sense.txt',
 'bible-kjv.txt',
 'blake-poems.txt',
 'bryant-stories.txt',
 'burgess-busterbrown.txt',
 'carroll-alice.txt',
 'chesterton-ball.txt',
 'chesterton-brown.txt',
 'chesterton-thursday.txt',
 'edgeworth-parents.txt',
 'melville-moby_dick.txt',
 'milton-paradise.txt',
 'shakespeare-caesar.txt',
 'shakespeare-hamlet.txt',
 'shakespeare-macbeth.txt',
 'whitman-leaves.txt']

In [195]:
# Raw text from file
raw_text = corpus.raw('melville-moby_dick.txt')

In [196]:
#print the raw text first 10000 characters
print(raw_text[:1000])

[Moby Dick by Herman Melville 1851]


ETYMOLOGY.

(Supplied by a Late Consumptive Usher to a Grammar School)

The pale Usher--threadbare in coat, heart, body, and brain; I see him
now.  He was ever dusting his old lexicons and grammars, with a queer
handkerchief, mockingly embellished with all the gay flags of all the
known nations of the world.  He loved to dust his old grammars; it
somehow mildly reminded him of his mortality.

"While you take in hand to school others, and to teach them by what
name a whale-fish is to be called in our tongue leaving out, through
ignorance, the letter H, which almost alone maketh the signification
of the word, you deliver that which is not true." --HACKLUYT

"WHALE. ... Sw. and Dan. HVAL.  This animal is named from roundness
or rolling; for in Dan. HVALT is arched or vaulted." --WEBSTER'S
DICTIONARY

"WHALE. ... It is more immediately from the Dut. and Ger. WALLEN;
A.S. WALW-IAN, to roll, to wallow." --RICHARDSON'S DICTIONARY


## Data Pre-processing

In [197]:
def clean(text):

    translator = str.maketrans('', '', string.punctuation)
    text = raw_text.translate(translator)
    text = re.sub('\n', ' ', text)
    text = re.sub('\r', '', text)
    text = re.sub('   ', '', text)
    text = "".join(text)
    return text

In [198]:
text = clean(raw_text)

In [199]:
print(text[:1000])

Moby Dick by Herman Melville 1851ETYMOLOGY  Supplied by a Late Consumptive Usher to a Grammar School  The pale Usherthreadbare in coat heart body and brain I see him now  He was ever dusting his old lexicons and grammars with a queer handkerchief mockingly embellished with all the gay flags of all the known nations of the world  He loved to dust his old grammars it somehow mildly reminded him of his mortality  While you take in hand to school others and to teach them by what name a whalefish is to be called in our tongue leaving out through ignorance the letter H which almost alone maketh the signification of the word you deliver that which is not true HACKLUYT  WHALE  Sw and Dan HVAL  This animal is named from roundness or rolling for in Dan HVALT is arched or vaulted WEBSTERS DICTIONARY  WHALE  It is more immediately from the Dut and Ger WALLEN AS WALWIAN to roll to wallow RICHARDSONS DICTIONARY  KETOSGREEK CETUSLATIN WHOELANGLOSAXON HVALTDANISH WAL  DUTCH HWAL SWEDISH WHALEICELANDIC

In [200]:
text



## Word and Sentence Tokenization

#### Sentence Tokenization

In [201]:
# Tokenize raw text into sentences
sentences = nltk.sent_tokenize(raw_text)

In [202]:
sentences

['[Moby Dick by Herman Melville 1851]\r\n\r\n\r\nETYMOLOGY.',
 '(Supplied by a Late Consumptive Usher to a Grammar School)\r\n\r\nThe pale Usher--threadbare in coat, heart, body, and brain; I see him\r\nnow.',
 'He was ever dusting his old lexicons and grammars, with a queer\r\nhandkerchief, mockingly embellished with all the gay flags of all the\r\nknown nations of the world.',
 'He loved to dust his old grammars; it\r\nsomehow mildly reminded him of his mortality.',
 '"While you take in hand to school others, and to teach them by what\r\nname a whale-fish is to be called in our tongue leaving out, through\r\nignorance, the letter H, which almost alone maketh the signification\r\nof the word, you deliver that which is not true."',
 '--HACKLUYT\r\n\r\n"WHALE.',
 '... Sw. and Dan.',
 'HVAL.',
 'This animal is named from roundness\r\nor rolling; for in Dan.',
 'HVALT is arched or vaulted."',
 '--WEBSTER\'S\r\nDICTIONARY\r\n\r\n"WHALE.',
 '...',
 'It is more immediately from the Dut.',
 '

In [203]:
# Print 5 sentences
for i in range(5):
    print(sentences[i] + "\n")

[Moby Dick by Herman Melville 1851]


ETYMOLOGY.

(Supplied by a Late Consumptive Usher to a Grammar School)

The pale Usher--threadbare in coat, heart, body, and brain; I see him
now.

He was ever dusting his old lexicons and grammars, with a queer
handkerchief, mockingly embellished with all the gay flags of all the
known nations of the world.

He loved to dust his old grammars; it
somehow mildly reminded him of his mortality.

"While you take in hand to school others, and to teach them by what
name a whale-fish is to be called in our tongue leaving out, through
ignorance, the letter H, which almost alone maketh the signification
of the word, you deliver that which is not true."



#### Word Tokenization

In [204]:
words = nltk.word_tokenize(text)

In [205]:
words[:5]

['Moby', 'Dick', 'by', 'Herman', 'Melville']

## Stop Word Removal

In [207]:
text = [word for word in text.split(' ') if word not in stopwords]
text=" ".join(text)
text = [stemmer.stem(word) for word in text.split(' ')]
text=" ".join(text)

Corpus after removing Stopwords is given below

In [208]:
text

'mobi dick herman melvill 1851etymolog  suppli late consumpt usher grammar school  the pale usherthreadbar coat heart bodi brain i see  he ever dust old lexicon grammar queer handkerchief mock embellish gay flag known nation world  he love dust old grammar somehow mild remind mortal  while take hand school other teach name whalefish call tongu leav ignor letter h almost alon maketh signif word deliv true hackluyt  whale  sw dan hval  this anim name round roll dan hvalt arch vault webster dictionari  whale  it immedi dut ger wallen as walwian roll wallow richardson dictionari  ketosgreek cetuslatin whoelanglosaxon hvaltdanish wal  dutch hwal swedish whaleiceland whaleenglish balein french ballena spanish pekeenueenue  fege pekeenueenue  erromangoan  extract suppli subsublibrarian  it seen mere painstak burrow grubworm poor devil subsub appear gone long vatican streetstal earth pick whatev random allus whale could anyway find book whatsoev sacr profan  therefor must everi case least take