# Import dependencies

In [19]:
import nltk
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords

# To download the tokenizer
nltk.download('punkt')

# To download the corpus of stopwords
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /home/volt/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/volt/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

# Declare Text Data

Defining our text data.
> Aragorn, The Return of the King, Book 1, “Chapter II: The Passing of the Grey Company,” p. 794.

In [9]:
paragraph = "A time may come soon, when none will return. Then there will be need of valour without renown, for none shall remember the deeds that are done in the last defense of your homes. Yet the deeds will not be less valiant because they are unpraised."

Extracting the sentences from our paragraph.

In [11]:
sentences = nltk.sent_tokenize(paragraph)
sentences

['A time may come soon, when none will return.',
 'Then there will be need of valour without renown, for none shall remember the deeds that are done in the last defense of your homes.',
 'Yet the deeds will not be less valiant because they are unpraised.']

Extracting the words from our paragraph.

In [14]:
words = nltk.word_tokenize(paragraph)
words

['A',
 'time',
 'may',
 'come',
 'soon',
 ',',
 'when',
 'none',
 'will',
 'return',
 '.',
 'Then',
 'there',
 'will',
 'be',
 'need',
 'of',
 'valour',
 'without',
 'renown',
 ',',
 'for',
 'none',
 'shall',
 'remember',
 'the',
 'deeds',
 'that',
 'are',
 'done',
 'in',
 'the',
 'last',
 'defense',
 'of',
 'your',
 'homes',
 '.',
 'Yet',
 'the',
 'deeds',
 'will',
 'not',
 'be',
 'less',
 'valiant',
 'because',
 'they',
 'are',
 'unpraised',
 '.']

# Stop Words Removal

Removing them using list comprehension.

In [34]:
for i in range(len(sentences)):
    # From a single sentence, store all the words 
    wordsInSentence = nltk.word_tokenize(sentences[i])
    
    # Filter out all the stop words
    wordsInSentence = [word for word in words if word not in set(stopwords.words('english'))]
    
    # Remake the sentences after removing the stop words
    sentences[i] = ' '.join(wordsInSentence)

Here's how `sentences` looks like.

In [35]:
sentences

['A time may come soon , none return . Then need valour without renown , none shall remember deeds done last defense homes . Yet deeds less valiant unpraised .',
 'A time may come soon , none return . Then need valour without renown , none shall remember deeds done last defense homes . Yet deeds less valiant unpraised .',
 'A time may come soon , none return . Then need valour without renown , none shall remember deeds done last defense homes . Yet deeds less valiant unpraised .']

# Stemming

Initialising our stemmer object.

In [36]:
stemmer = PorterStemmer()

Stemming each word through each sentence.

In [37]:
for i in range(len(sentences)):
    wordsInSentence = nltk.word_tokenize(sentences[i])
    
    wordsInSentence = [stemmer.stem(word) for word in wordsInSentence]
    
    sentences[i] = ' '.join(wordsInSentence)

Here's what sentences looks like after stemming.

In [38]:
sentences

['a time may come soon , none return . then need valour without renown , none shall rememb deed done last defens home . yet deed less valiant unprais .',
 'a time may come soon , none return . then need valour without renown , none shall rememb deed done last defens home . yet deed less valiant unprais .',
 'a time may come soon , none return . then need valour without renown , none shall rememb deed done last defens home . yet deed less valiant unprais .']