In [None]:
# Import reuters database from the nltk corpus 
from nltk.corpus import reuters
# Import tokenizers
from nltk.tokenize import sent_tokenize, word_tokenize

# Fetch the data the rest of the notebook relies on: the Reuters corpus and
# the Punkt models used by `sent_tokenize` and `word_tokenize`.
import nltk

nltk.download("reuters")
nltk.download("punkt")
# Recent NLTK releases (3.9 and later) load the Punkt models from the
# "punkt_tab" resource instead of "punkt"; without it the tokenizers raise a
# LookupError. Downloading both keeps the notebook working across versions.
nltk.download("punkt_tab")

## The NLTK Reuters corpus

In [2]:
# The Reuters corpus holds more than 10,000 news articles, many of them about
# financial markets. Every article carries one or more topic labels
# ("categories"); list all of the labels available in the corpus.
topic_labels = reuters.categories()
print(topic_labels)

['acq', 'alum', 'barley', 'bop', 'carcass', 'castor-oil', 'cocoa', 'coconut', 'coconut-oil', 'coffee', 'copper', 'copra-cake', 'corn', 'cotton', 'cotton-oil', 'cpi', 'cpu', 'crude', 'dfl', 'dlr', 'dmk', 'earn', 'fuel', 'gas', 'gnp', 'gold', 'grain', 'groundnut', 'groundnut-oil', 'heat', 'hog', 'housing', 'income', 'instal-debt', 'interest', 'ipi', 'iron-steel', 'jet', 'jobs', 'l-cattle', 'lead', 'lei', 'lin-oil', 'livestock', 'lumber', 'meal-feed', 'money-fx', 'money-supply', 'naphtha', 'nat-gas', 'nickel', 'nkr', 'nzdlr', 'oat', 'oilseed', 'orange', 'palladium', 'palm-oil', 'palmkernel', 'pet-chem', 'platinum', 'potato', 'propane', 'rand', 'rape-oil', 'rapeseed', 'reserves', 'retail', 'rice', 'rubber', 'rye', 'ship', 'silver', 'sorghum', 'soy-meal', 'soy-oil', 'soybean', 'strategic-metal', 'sugar', 'sun-meal', 'sun-oil', 'sunseed', 'tea', 'tin', 'trade', 'veg-oil', 'wheat', 'wpi', 'yen', 'zinc']


In [3]:
# Look up every file id tagged with the 'cocoa' category and show the first one.
cocoa_file_ids = reuters.fileids(categories='cocoa')
cocoa_file_ids[0]

'test/15095'

In [4]:
# Load the unprocessed text of the first cocoa article (the file id found
# above) and display it.
first_cocoa_id = 'test/15095'
article = reuters.raw(first_cocoa_id)
print(article)

COCOA EXPORTERS EXPECTED TO LIMIT SALES
  Major cocoa exporters are likely to
  limit sales in the weeks ahead in an effort to boost world
  prices, sources close to a meeting of the Cocoa Producers
  Alliance (CPA) said.
      The sources said the depressed world market had been one of
  the main topics discussed in a closed door meeting of the
  11-member CPA which began on Monday.
      They said producers agreed that cutting sales would aid the
  buffer stock manager of a new international cocoa pact in his
  effort to support prices.
      Major cocoa producing and consuming nations agreed
  operation rules for the buffer stock at a meeting in London
  last month and the stock manager is expected to enter the
  market soon.
      Prices, under the weight of three successive cocoa
  surpluses, recently fell to the level at which the manager has
  to buy cocoa under stock rules.
      The buffer stock aims to keep prices within a pre-set range
  by buying when prices fall and sellin

## Tokenizing with Python `split()`

In [5]:
# We can mimic sentence tokenization first by using `split()` on the article,
# cutting the text at every period. The period itself is consumed by the split.
split_sentences = article.split('.')
split_sentences

['COCOA EXPORTERS EXPECTED TO LIMIT SALES\n  Major cocoa exporters are likely to\n  limit sales in the weeks ahead in an effort to boost world\n  prices, sources close to a meeting of the Cocoa Producers\n  Alliance (CPA) said',
 '\n      The sources said the depressed world market had been one of\n  the main topics discussed in a closed door meeting of the\n  11-member CPA which began on Monday',
 '\n      They said producers agreed that cutting sales would aid the\n  buffer stock manager of a new international cocoa pact in his\n  effort to support prices',
 '\n      Major cocoa producing and consuming nations agreed\n  operation rules for the buffer stock at a meeting in London\n  last month and the stock manager is expected to enter the\n  market soon',
 '\n      Prices, under the weight of three successive cocoa\n  surpluses, recently fell to the level at which the manager has\n  to buy cocoa under stock rules',
 '\n      The buffer stock aims to keep prices within a pre-set range

In [11]:
# Then we split the first sentence on the whitespace.
# Splitting on a literal space (rather than bare `split()`) leaves newline
# characters attached to words and yields empty strings for repeated spaces.
first_sentence_words = article.split('.')[0].split(' ')
print(first_sentence_words)

['COCOA', 'EXPORTERS', 'EXPECTED', 'TO', 'LIMIT', 'SALES\n', '', 'Major', 'cocoa', 'exporters', 'are', 'likely', 'to\n', '', 'limit', 'sales', 'in', 'the', 'weeks', 'ahead', 'in', 'an', 'effort', 'to', 'boost', 'world\n', '', 'prices,', 'sources', 'close', 'to', 'a', 'meeting', 'of', 'the', 'Cocoa', 'Producers\n', '', 'Alliance', '(CPA)', 'said']


## NLTK tokenization

In [7]:
# NLTK tokenizes in a similar way by using the `sent_tokenize` function,
# which returns the article as a list of sentence strings.
nltk_sentences = sent_tokenize(article)
nltk_sentences

['COCOA EXPORTERS EXPECTED TO LIMIT SALES\n  Major cocoa exporters are likely to\n  limit sales in the weeks ahead in an effort to boost world\n  prices, sources close to a meeting of the Cocoa Producers\n  Alliance (CPA) said.',
 'The sources said the depressed world market had been one of\n  the main topics discussed in a closed door meeting of the\n  11-member CPA which began on Monday.',
 'They said producers agreed that cutting sales would aid the\n  buffer stock manager of a new international cocoa pact in his\n  effort to support prices.',
 'Major cocoa producing and consuming nations agreed\n  operation rules for the buffer stock at a meeting in London\n  last month and the stock manager is expected to enter the\n  market soon.',
 'Prices, under the weight of three successive cocoa\n  surpluses, recently fell to the level at which the manager has\n  to buy cocoa under stock rules.',
 'The buffer stock aims to keep prices within a pre-set range\n  by buying when prices fall and 

In [8]:
# We can tokenize the first sentence with the `word_tokenize` function.
# The first sentence is taken from `sent_tokenize` so this cell only needs
# `article` to be defined.
first_sentence_tokens = word_tokenize(sent_tokenize(article)[0])
print(first_sentence_tokens)

['COCOA', 'EXPORTERS', 'EXPECTED', 'TO', 'LIMIT', 'SALES', 'Major', 'cocoa', 'exporters', 'are', 'likely', 'to', 'limit', 'sales', 'in', 'the', 'weeks', 'ahead', 'in', 'an', 'effort', 'to', 'boost', 'world', 'prices', ',', 'sources', 'close', 'to', 'a', 'meeting', 'of', 'the', 'Cocoa', 'Producers', 'Alliance', '(', 'CPA', ')', 'said', '.']


**Question: What differences are there between splitting the sentence with Python's `split()` and using the NLTK tokenizer functions?**
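- Python: `split()` only cuts on the exact separator it is given, so newline characters stay attached to words (`'SALES\n'`), repeated spaces produce empty strings (`''`), punctuation stays glued to words (`'prices,'`, `'(CPA)'`), and splitting on `'.'` discards the sentence-ending period.
- NLTK tokenizer: `sent_tokenize` keeps the closing period and strips the leading whitespace from each sentence; `word_tokenize` drops whitespace entirely and emits punctuation as separate tokens (`','`, `'('`, `')'`, `'.'`).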