# N-Gram Counter

In [None]:
from nltk.corpus import reuters, stopwords
from nltk.util import ngrams
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import re

# Code to download corpora
# Each resource below backs one of the NLTK names imported at the top of
# this notebook; the downloads only need to succeed once per environment.
import nltk
# Reuters news corpus, read through `reuters.fileids` / `reuters.raw`.
nltk.download('reuters')
# Stopword lists, read through `stopwords.words('english')`.
nltk.download('stopwords')
# Tokenizer models used by `word_tokenize`.
# NOTE(review): newer NLTK releases may look for 'punkt_tab' instead of
# 'punkt' -- confirm against the installed NLTK version.
nltk.download('punkt')
# WordNet data used by `WordNetLemmatizer`.
nltk.download('wordnet')

In [2]:
# Single module-level lemmatizer instance; `process_text` below reads this
# global rather than constructing its own.
lemmatizer = WordNetLemmatizer()

In [None]:
# Pick the third document filed under the 'cpi' category and show its raw text.
cpi_file_ids = reuters.fileids(categories='cpi')
article = reuters.raw(cpi_file_ids[2])
print(article)

In [4]:
def process_text(doc):
    """Clean a raw text document into a list of lowercase, lemmatized tokens.

    The text is stripped of every character that is neither an ASCII letter
    nor whitespace, tokenized with ``word_tokenize``, lemmatized with the
    module-level ``lemmatizer``, lowercased, and filtered against the
    English stopword list.

    Args:
        doc: Raw text to process.

    Returns:
        A list of lowercase tokens in document order, with English
        stopwords removed.
    """
    sw = set(stopwords.words('english'))
    # Keep all whitespace (not just the space character) so that words
    # separated only by a newline or tab are not fused into one token.
    regex = re.compile(r"[^a-zA-Z\s]")
    # Clean the text that was passed in, not a notebook global, so the
    # function gives the right answer for any document.
    re_clean = regex.sub('', doc)
    words = word_tokenize(re_clean)
    lem = [lemmatizer.lemmatize(word) for word in words]
    # Lowercase once per token; the stopword list is lowercase.
    lowered = (word.lower() for word in lem)
    return [word for word in lowered if word not in sw]

In [None]:
# Run the cleaning function on the sample article and show the resulting
# token list; `processed` feeds the frequency counts in the cells below.
processed = process_text(article)
print(processed)

## Frequency Analysis: Word Counts

In [6]:
from collections import Counter

In [None]:
# Tally how many times each processed token occurs
word_counts = Counter()
word_counts.update(processed)
print(dict(word_counts))

In [None]:
# Show the 20 most frequent words together with their counts
top_words = word_counts.most_common(20)
print(dict(top_words))

## Frequency Analysis: N-gram Counts

In [None]:
# Slide a window of two tokens over the processed text and tally each pair
bigrams = ngrams(processed, n=2)
bigram_counts = Counter(bigrams)
print(dict(bigram_counts))

In [10]:
# Show the 10 most frequent bigrams together with their counts
top_bigrams = bigram_counts.most_common(10)
print(dict(top_bigrams))

{('effort', 'curb'): 2, ('sharp', 'price'): 2, ('budget', 'deficit'): 2, ('state', 'spending'): 2, ('mti', 'said'): 2, ('diplomat', 'said'): 2, ('hungary', 'raises'): 1, ('raises', 'prices'): 1, ('prices', 'effort'): 1, ('curb', 'deficit'): 1}
