In [None]:
# Import the Gutenberg and stopwords databases from the nltk corpus 
from nltk.corpus import gutenberg, stopwords
# Import tokenizers
from nltk.tokenize import word_tokenize, sent_tokenize

# Import nltk and download  the sentence tokenizer.
import nltk
nltk.download('punkt')

In [2]:
# Get all the fileids 
print(gutenberg.fileids())

['austen-emma.txt', 'austen-persuasion.txt', 'austen-sense.txt', 'bible-kjv.txt', 'blake-poems.txt', 'bryant-stories.txt', 'burgess-busterbrown.txt', 'carroll-alice.txt', 'chesterton-ball.txt', 'chesterton-brown.txt', 'chesterton-thursday.txt', 'edgeworth-parents.txt', 'melville-moby_dick.txt', 'milton-paradise.txt', 'shakespeare-caesar.txt', 'shakespeare-hamlet.txt', 'shakespeare-macbeth.txt', 'whitman-leaves.txt']


In [3]:
# Get Jane Austen's book, Persuasion.
persuasion_book = gutenberg.raw(fileids=('austen-persuasion.txt'))
print(persuasion_book)

[Persuasion by Jane Austen 1818]


Chapter 1


Sir Walter Elliot, of Kellynch Hall, in Somersetshire, was a man who,
for his own amusement, never took up any book but the Baronetage;
there he found occupation for an idle hour, and consolation in a
distressed one; there his faculties were roused into admiration and
respect, by contemplating the limited remnant of the earliest patents;
there any unwelcome sensations, arising from domestic affairs
changed naturally into pity and contempt as he turned over
the almost endless creations of the last century; and there,
if every other leaf were powerless, he could read his own history
with an interest which never failed.  This was the page at which
the favourite volume always opened:

           "ELLIOT OF KELLYNCH HALL.

"Walter Elliot, born March 1, 1760, married, July 15, 1784, Elizabeth,
daughter of James Stevenson, Esq. of South Park, in the county of
Gloucester, by which lady (who died 1800) he has issue Elizabeth,
born June 1, 1785; Ann

In [4]:
# Use the sentence tokenizer on a random sentence in Persuasion.
one_sentence = sent_tokenize(persuasion_book)[8]
print(one_sentence)

He had been remarkably handsome
in his youth; and, at fifty-four, was still a very fine man.


In [5]:
# Get all the words in the sentence.
all_words = word_tokenize(one_sentence)
print(all_words)

['He', 'had', 'been', 'remarkably', 'handsome', 'in', 'his', 'youth', ';', 'and', ',', 'at', 'fifty-four', ',', 'was', 'still', 'a', 'very', 'fine', 'man', '.']


## NLTK Stopwords

In [6]:
# Get all the nltk stopwords
sw = set(stopwords.words('english'))
print(sw)

{'d', 'have', 'why', 'yourself', 'be', 'ourselves', "isn't", 'any', 'needn', 'hers', "hadn't", 'between', 'hadn', "you'll", 's', "you're", "don't", "doesn't", 'mustn', 'couldn', 'll', 'does', 'but', 'if', 'more', "she's", 'only', 'won', 'myself', 'just', 'against', "you've", 'don', 'hasn', "won't", 'below', 'so', 'or', 'an', "mustn't", 'isn', 'their', 'weren', 'himself', 'doing', 'her', 'doesn', 'through', 'and', 'because', 'above', "that'll", 'both', "wasn't", 'them', "shouldn't", 'these', 'can', 'for', 'should', 'each', 'been', 'is', 'of', 'aren', 've', 'to', 'some', 'own', 'now', 'your', 'again', 'where', "couldn't", 'him', 'what', 'ours', 'out', 'too', 'off', 'whom', 'were', 'as', 'do', 'when', 'who', 'the', 'having', 'up', 'with', 'you', "haven't", 'being', 'further', "didn't", 'all', 'than', 'i', 'herself', "it's", 'once', 'ain', 'yours', 'while', "wouldn't", "weren't", 'o', 'themselves', 'this', 'under', 'y', 'a', 'which', 'its', 'very', 'down', 'few', 'on', 're', 'how', "you'd"

In [7]:
# Filter out all the stopwords from the words in the sentence.
first_result = [word.lower() for word in all_words if word.lower() not in sw]
print(first_result)

['remarkably', 'handsome', 'youth', ';', ',', 'fifty-four', ',', 'still', 'fine', 'man', '.']


In [8]:
# We can define our own list of stopwords to add to the default nltk stopwords
sw_addon = {'still', 'fifty-four'}
second_result = [word.lower() for word in all_words if word.lower() not in sw.union(sw_addon)]
print(second_result)

['remarkably', 'handsome', 'youth', ';', ',', ',', 'fine', 'man', '.']


## Getting Rid of Non-Alpha Characters using Regular Expressions

In [9]:
# Import regular expressions library
import re

In [10]:
# Substitute everything that is not a letter with an empty string
regex = re.compile("[^a-zA-Z ]")
re_clean = regex.sub(' ', one_sentence)
print(re_clean)

He had been remarkably handsome in his youth  and  at fifty four  was still a very fine man 


In [11]:
# Retrieve everything that is not a letter with an empty string
re_clean_2 = re.findall("[^a-zA-Z ]", one_sentence)
print(re_clean_2)

['\n', ';', ',', '-', ',', '.']


In [12]:
# Remove all the stopwords from our cleaned regular expression.
re_words = word_tokenize(re_clean)
re_result = [word.lower() for word in re_words if word.lower() not in sw.union(sw_addon)]
print(re_result)

['remarkably', 'handsome', 'youth', 'fifty', 'four', 'fine', 'man']
