# Stopwords
Stopwords are non-informative words that we want to take out of the text before performing analysis on it. Luckily, NLTK has a ready-made list of such words that we can use to preprocess text...but is this enough?

In [18]:
from nltk.corpus import reuters, stopwords
from nltk.tokenize import word_tokenize, sent_tokenize

# Download the NLTK data packages this notebook relies on
# (each call is a no-op when the package is already up to date).
import nltk
nltk.download('stopwords')  # word lists read via stopwords.words(...)
nltk.download('reuters')    # the Reuters news corpus read via reuters.raw(...)
# Tokenizer models for sent_tokenize / word_tokenize.
# NOTE(review): newer NLTK releases appear to look up 'punkt_tab' instead of
# 'punkt' — confirm against the installed NLTK version.
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/ddevii/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package reuters to /Users/ddevii/nltk_data...
[nltk_data]   Package reuters is already up-to-date!
[nltk_data] Downloading package punkt to /Users/ddevii/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [19]:
# Fetch the raw text of the first document filed under the "crude" category
crude_fileids = reuters.fileids(categories="crude")
article = reuters.raw(fileids=crude_fileids[0])

In [20]:
# Show the raw article text (headline followed by the body)
print(article)

JAPAN TO REVISE LONG-TERM ENERGY DEMAND DOWNWARDS
  The Ministry of International Trade and
  Industry (MITI) will revise its long-term energy supply/demand
  outlook by August to meet a forecast downtrend in Japanese
  energy demand, ministry officials said.
      MITI is expected to lower the projection for primary energy
  supplies in the year 2000 to 550 mln kilolitres (kl) from 600
  mln, they said.
      The decision follows the emergence of structural changes in
  Japanese industry following the rise in the value of the yen
  and a decline in domestic electric power demand.
      MITI is planning to work out a revised energy supply/demand
  outlook through deliberations of committee meetings of the
  Agency of Natural Resources and Energy, the officials said.
      They said MITI will also review the breakdown of energy
  supply sources, including oil, nuclear, coal and natural gas.
      Nuclear energy provided the bulk of Japan's electric power
  in the fiscal year ended March

In [21]:
# Sentence tokenize the article and store the SECOND sentence (index 1);
# index 0 would be the first sentence returned by the tokenizer.
sentence = sent_tokenize(article)[1]
print(sentence)

MITI is expected to lower the projection for primary energy
  supplies in the year 2000 to 550 mln kilolitres (kl) from 600
  mln, they said.


In [22]:
# Word tokenize the sentence (punctuation and numbers become separate tokens)
words = word_tokenize(sentence)
print(words)

['MITI', 'is', 'expected', 'to', 'lower', 'the', 'projection', 'for', 'primary', 'energy', 'supplies', 'in', 'the', 'year', '2000', 'to', '550', 'mln', 'kilolitres', '(', 'kl', ')', 'from', '600', 'mln', ',', 'they', 'said', '.']


## NLTK Stopwords

In [23]:
# Print NLTK's built-in list of English stopwords (all lowercase entries)
print(stopwords.words("english"))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [27]:
# Drop NLTK's English stop words; tokens are lowercased once, both for the
# membership check and for the value kept in the result.
stop_words = set(stopwords.words("english"))
first_result = [token for token in map(str.lower, words) if token not in stop_words]

In [28]:
# Tokens left after removing the default NLTK stop words
print(first_result)

['miti', 'expected', 'lower', 'projection', 'primary', 'energy', 'supplies', 'year', '2000', '550', 'mln', 'kilolitres', '(', 'kl', ')', '600', 'mln', ',', 'said', '.']


In [38]:
# We can define our own set of stopwords to add to the default nltk words
stop_words_added = {"said", "mln", "kilolitres", "kl"}
# Build the combined set once, up front: calling .union() inside the
# comprehension's condition would rebuild the whole set for every token.
all_stop_words = stop_words.union(stop_words_added)
# Lowercase each token once, then keep it only if it is not a stop word.
second_result = [token for token in map(str.lower, words) if token not in all_stop_words]

In [39]:
# Tokens left after removing both the NLTK and the custom stop words
print(second_result)

['miti', 'expected', 'lower', 'projection', 'primary', 'energy', 'supplies', 'year', '2000', '550', '(', ')', '600', ',', '.']


## Getting Rid of Non-Alpha Characters: Regex

In [40]:
# Import regular expressions library
import re

In [41]:
# Substitute every character that is NOT an ASCII letter (a-z, A-Z) with a
# single space — a space rather than an empty string, so the text on either
# side of a removed character is not glued together.
regex = re.compile("[^a-zA-Z]")
reclean = regex.sub(" ", sentence)
reclean

'MITI is expected to lower the projection for primary energy   supplies in the year      to     mln kilolitres  kl  from       mln  they said '

In [42]:
# Tokenize the letters-only text, then remove stop words
re_wording = word_tokenize(reclean)
# Merge the NLTK and custom stop words once, outside the comprehension,
# so the union is not recomputed for every token.
all_stop_words = stop_words.union(stop_words_added)
# Lowercase each token once, then keep it only if it is not a stop word.
result = [token for token in map(str.lower, re_wording) if token not in all_stop_words]

In [43]:
# Display the final tokens: letters only, lowercased, stop words removed
result

['miti',
 'expected',
 'lower',
 'projection',
 'primary',
 'energy',
 'supplies',
 'year']