# N-Gram Counter

In [None]:
# Import reuters and stopwords 
from nltk.corpus import reuters, stopwords
# Import ngrams
from nltk.util import ngrams
# Import the WordNetLemmatizer class 
from nltk.stem import WordNetLemmatizer 
# Import the word tokenizer
from nltk.tokenize import word_tokenize
# Import regular expressions
import re
# Download "punkt" sentence tokenizer and "wordnet" that the lemmatizer uses.
import nltk
nltk.download('punkt')
nltk.download('wordnet')

In [2]:
# Instantiate the lemmatizer
lemmatizer = WordNetLemmatizer()

In [3]:
# Get the categories
print(reuters.categories())

['acq', 'alum', 'barley', 'bop', 'carcass', 'castor-oil', 'cocoa', 'coconut', 'coconut-oil', 'coffee', 'copper', 'copra-cake', 'corn', 'cotton', 'cotton-oil', 'cpi', 'cpu', 'crude', 'dfl', 'dlr', 'dmk', 'earn', 'fuel', 'gas', 'gnp', 'gold', 'grain', 'groundnut', 'groundnut-oil', 'heat', 'hog', 'housing', 'income', 'instal-debt', 'interest', 'ipi', 'iron-steel', 'jet', 'jobs', 'l-cattle', 'lead', 'lei', 'lin-oil', 'livestock', 'lumber', 'meal-feed', 'money-fx', 'money-supply', 'naphtha', 'nat-gas', 'nickel', 'nkr', 'nzdlr', 'oat', 'oilseed', 'orange', 'palladium', 'palm-oil', 'palmkernel', 'pet-chem', 'platinum', 'potato', 'propane', 'rand', 'rape-oil', 'rapeseed', 'reserves', 'retail', 'rice', 'rubber', 'rye', 'ship', 'silver', 'sorghum', 'soy-meal', 'soy-oil', 'soybean', 'strategic-metal', 'sugar', 'sun-meal', 'sun-oil', 'sunseed', 'tea', 'tin', 'trade', 'veg-oil', 'wheat', 'wpi', 'yen', 'zinc']


In [4]:
# Get a random article from the consumer price index (cpi) category
cpi_article = reuters.raw(reuters.fileids(categories='cpi')[2])
print(cpi_article)

HUNGARY RAISES PRICES IN EFFORT TO CURB DEFICIT
  Hungary has announced sharp price
  increases for a range of food and consumer products as part of
  its efforts to curb a soaring budget deficit.
      The official MTI news agency said the government decided
  consumer price subsidies had to be cut to reduce state
  spending. From today the price of meat will rise by an average
  18 pct and that of beer and spirits by 10 pct, MTI said.
      MTI said consumer goods will also become more expensive,
  with the price of refrigerators rising some five pct. It also
  announced a number of measures to ease hardship, including
  higher pensions and family allowances.
      Statistics indicate the budget deficit tripled in 1986 to
  47 billion forints. Central banker Janos Fekete has said the
  Finance Ministry is trying to cut the 1987 shortfall to between
  30 and 35 billion from a planned 43.8 billion.
      A major tax reform, including the introduction of a
  Western-style valued added t

In [7]:
# Write a function that processes the words for the article and and lemmatizes the words to their root words.
def process_text(article):
    """
    Preprocesses a given text article by performing the following steps:
    
    1. Removes stopwords (common words in English language).
    2. Uses regular expressions to remove non-alphabet characters (e.g., punctuation).
    3. Tokenizes the cleaned text into words.
    4. Lemmatizes the words to their base form.
    5. Filters out words that are not stopwords.
    
    Parameters:
        article (str): The input text article to be processed.

    Returns:
        list of str: A list of preprocessed words from the input article.
    """
    # Get the stopwords
    sw = set(stopwords.words('english'))
    # Use regex to substitute everything that is not a letter with an empty string.
    regex = re.compile("[^a-zA-Z ]")
    re_clean = regex.sub('', article)
    # Tokenize the words 
    words = word_tokenize(re_clean)
    # Lemmatize the words
    lem = [lemmatizer.lemmatize(word) for word in words]
    # Retrieve only the words that aren't in the stopwords
    output = [word.lower() for word in lem if word.lower() not in sw]
    return output

In [8]:
# Pass the article to function and print the processed text.
processed_article = process_text(cpi_article)
print(processed_article)

['hungary', 'raises', 'prices', 'effort', 'curb', 'deficit', 'hungary', 'ha', 'announced', 'sharp', 'price', 'increase', 'range', 'food', 'consumer', 'product', 'part', 'effort', 'curb', 'soaring', 'budget', 'deficit', 'official', 'mti', 'news', 'agency', 'said', 'government', 'decided', 'consumer', 'price', 'subsidy', 'cut', 'reduce', 'state', 'spending', 'today', 'price', 'meat', 'rise', 'average', 'pct', 'beer', 'spirit', 'pct', 'mti', 'said', 'mti', 'said', 'consumer', 'good', 'also', 'become', 'expensive', 'price', 'refrigerator', 'rising', 'five', 'pct', 'also', 'announced', 'number', 'measure', 'ease', 'hardship', 'including', 'higher', 'pension', 'family', 'allowance', 'statistics', 'indicate', 'budget', 'deficit', 'tripled', 'billion', 'forint', 'central', 'banker', 'janos', 'fekete', 'ha', 'said', 'finance', 'ministry', 'trying', 'cut', 'shortfall', 'billion', 'planned', 'billion', 'major', 'tax', 'reform', 'including', 'introduction', 'westernstyle', 'valued', 'added', 'tax'

## Frequency Analysis: Word Counts

In [9]:
# Import the Counter class from the collections library.
from collections import Counter

In [10]:
# Get the word counts by passing in the processed article to the Counter class.
word_counts = Counter(processed_article)
# Print the dictionary of the word counts.
print(dict(word_counts))

{'hungary': 2, 'raises': 1, 'prices': 1, 'effort': 3, 'curb': 2, 'deficit': 4, 'ha': 2, 'announced': 2, 'sharp': 2, 'price': 5, 'increase': 1, 'range': 1, 'food': 1, 'consumer': 3, 'product': 1, 'part': 1, 'soaring': 1, 'budget': 2, 'official': 1, 'mti': 3, 'news': 1, 'agency': 1, 'said': 7, 'government': 1, 'decided': 1, 'subsidy': 1, 'cut': 2, 'reduce': 1, 'state': 3, 'spending': 2, 'today': 1, 'meat': 1, 'rise': 2, 'average': 1, 'pct': 3, 'beer': 1, 'spirit': 1, 'good': 2, 'also': 4, 'become': 1, 'expensive': 2, 'refrigerator': 1, 'rising': 1, 'five': 1, 'number': 1, 'measure': 2, 'ease': 1, 'hardship': 1, 'including': 2, 'higher': 1, 'pension': 1, 'family': 1, 'allowance': 1, 'statistics': 1, 'indicate': 1, 'tripled': 1, 'billion': 3, 'forint': 1, 'central': 1, 'banker': 1, 'janos': 1, 'fekete': 1, 'finance': 1, 'ministry': 1, 'trying': 1, 'shortfall': 1, 'planned': 2, 'major': 1, 'tax': 2, 'reform': 1, 'introduction': 1, 'westernstyle': 1, 'valued': 1, 'added': 1, 'january': 1, 'c

In [11]:
# Print the top 10 most common words.
print(dict(word_counts.most_common(10)))

{'said': 7, 'price': 5, 'deficit': 4, 'also': 4, 'effort': 3, 'consumer': 3, 'mti': 3, 'state': 3, 'pct': 3, 'billion': 3}


## Frequency Analysis: N-gram Counts

In [12]:
# Get the number of bigrams.
bigram_counts = Counter(ngrams(processed_article, n=2))
print(dict(bigram_counts))

{('hungary', 'raises'): 1, ('raises', 'prices'): 1, ('prices', 'effort'): 1, ('effort', 'curb'): 2, ('curb', 'deficit'): 1, ('deficit', 'hungary'): 1, ('hungary', 'ha'): 1, ('ha', 'announced'): 1, ('announced', 'sharp'): 1, ('sharp', 'price'): 2, ('price', 'increase'): 1, ('increase', 'range'): 1, ('range', 'food'): 1, ('food', 'consumer'): 1, ('consumer', 'product'): 1, ('product', 'part'): 1, ('part', 'effort'): 1, ('curb', 'soaring'): 1, ('soaring', 'budget'): 1, ('budget', 'deficit'): 2, ('deficit', 'official'): 1, ('official', 'mti'): 1, ('mti', 'news'): 1, ('news', 'agency'): 1, ('agency', 'said'): 1, ('said', 'government'): 1, ('government', 'decided'): 1, ('decided', 'consumer'): 1, ('consumer', 'price'): 1, ('price', 'subsidy'): 1, ('subsidy', 'cut'): 1, ('cut', 'reduce'): 1, ('reduce', 'state'): 1, ('state', 'spending'): 2, ('spending', 'today'): 1, ('today', 'price'): 1, ('price', 'meat'): 1, ('meat', 'rise'): 1, ('rise', 'average'): 1, ('average', 'pct'): 1, ('pct', 'beer')

In [13]:
# Print the top 5 most common bigrams
print(dict(bigram_counts.most_common(5)))

{('effort', 'curb'): 2, ('sharp', 'price'): 2, ('budget', 'deficit'): 2, ('state', 'spending'): 2, ('mti', 'said'): 2}
