**Author:** Ahmadreza Attarpour  
**Email:** [a.attarpour@mail.utoronto.ca](mailto:a.attarpour@mail.utoronto.ca)  

This is my answer notebook (#1) for the NLP course assignment at AI4PH. The tasks are:

- Load the Brown Corpus from NLTK using `paras()`.

- Remove punctuation and stopwords.

- Apply the Lancaster stemmer.

- Print to the screen the top 10 words in terms of TF. Show the TF values as well.

- Print to the screen the top 10 words in terms of TF-IDF. Use the paragraphs as documents for calculating TF-IDF. Show the TF-IDF values as well.

- Use `pos_tag()` to tag each token.

- Print to the screen the 10 most common trigrams of word-tag pairs. Show their frequencies as well. Use `nltk.trigrams()`.

In [1]:
import nltk

# Fetch every NLTK resource this notebook relies on, so that a fresh
# environment survives Restart & Run All (downloads are skipped when the
# package is already up to date):
#   'stopwords'                  -> stopwords.words('english')
#   'averaged_perceptron_tagger' -> nltk.pos_tag()
#   'brown'                      -> brown.paras()
# NOTE(review): newer NLTK releases may look the tagger up under the name
# 'averaged_perceptron_tagger_eng' -- confirm against the installed version.
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
# Kept last so the cell's displayed value is still the result of this call.
nltk.download('brown')

[nltk_data] Downloading package brown to /Users/ahmadreza/nltk_data...
[nltk_data]   Package brown is already up-to-date!


True

In [4]:
# import all packages used in this file
from nltk.corpus import brown
import string
from nltk.corpus import stopwords
import math
import operator
from collections import Counter
import re


# Load the Brown Corpus from NLTK using paras(). 

In [5]:
# Load the Brown Corpus using paras().
# As the printed sample shows, a paragraph is a list of sentences and each
# sentence is a list of token strings (punctuation tokens are still present
# at this stage).
brown_paras = brown.paras()
# Print the first paragraph as a sanity check on that nesting
print(brown_paras[0])

[['The', 'Fulton', 'County', 'Grand', 'Jury', 'said', 'Friday', 'an', 'investigation', 'of', "Atlanta's", 'recent', 'primary', 'election', 'produced', '``', 'no', 'evidence', "''", 'that', 'any', 'irregularities', 'took', 'place', '.']]


# Remove punctuation and stopwords. 

In [7]:
# Function to remove punctuation and stopwords
_ENGLISH_STOPWORDS = None  # filled lazily by _english_stopwords()
_WORD_ONLY = re.compile(r'^\w+$')  # tokens made up solely of word characters


def _english_stopwords():
    """Return NLTK's English stopwords as a frozenset, loading them only once."""
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        _ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOPWORDS


def clean_paragraph(paragraph, stop_words=None):
    """Lowercase a paragraph's words and drop stopwords and punctuation.

    Args:
        paragraph: iterable of sentences, each an iterable of token strings.
        stop_words: optional container of lowercase words to discard
            (anything supporting ``in``). Defaults to NLTK's English
            stopword list, which is loaded once and then reused.

    Returns:
        A list with one list per input sentence (same order) holding the
        lowercased tokens that survived filtering. A sentence whose tokens
        are all removed yields an empty list.
    """
    if stop_words is None:
        # Previously the stopword set was rebuilt for every single token;
        # look it up once per call instead.
        stop_words = _english_stopwords()

    cleaned_paragraph = []
    for sentence in paragraph:
        cleaned_sentence = [
            word.lower()
            for word in sentence
            if word.lower() not in stop_words
            # substring test against the punctuation string; this is what
            # rejects a lone "_" (which the \w+ pattern would accept)
            and word not in string.punctuation
            # rejects tokens containing any non-word character, e.g. "Atlanta's"
            and _WORD_ONLY.match(word)
        ]
        cleaned_paragraph.append(cleaned_sentence)
    return cleaned_paragraph


In [8]:
# Run the cleaner over every Brown paragraph; the paragraph -> sentence
# nesting of the corpus is kept as-is.
cleaned_brown_paras = list(map(clean_paragraph, brown_paras))

# Sanity check: show the first paragraph after cleaning
print(cleaned_brown_paras[0])

[['fulton', 'county', 'grand', 'jury', 'said', 'friday', 'investigation', 'recent', 'primary', 'election', 'produced', 'evidence', 'irregularities', 'took', 'place']]


# Apply the lancaster stemmer. 

In [9]:
# One shared Lancaster stemmer instance, reused by every call below
lancaster = nltk.LancasterStemmer()


def lancaster_stemmer(paragraph):
    """Stem every word of a paragraph with the shared Lancaster stemmer.

    Args:
        paragraph: iterable of sentences, each an iterable of word strings.

    Returns:
        A list containing one list of stems per sentence, in input order.
    """
    return [[lancaster.stem(token) for token in sent] for sent in paragraph]

In [10]:
# Apply the stemmer paragraph by paragraph to the cleaned corpus
cleaned_brown_paras_stems = list(map(lancaster_stemmer, cleaned_brown_paras))

# Sanity check: show the first paragraph after stemming
print(cleaned_brown_paras_stems[0])

[['fulton', 'county', 'grand', 'jury', 'said', 'friday', 'investig', 'rec', 'prim', 'elect', 'produc', 'evid', 'irregul', 'took', 'plac']]


# Print to the screen the top 10 words in terms of TF. Show the TF values as well. 

In [11]:
# Flatten paragraphs -> sentences -> stems into one corpus-wide list of stems
tokens_clean = [
    stem
    for para in cleaned_brown_paras_stems
    for sent in para
    for stem in sent
]

print(tokens_clean[:10])

['fulton', 'county', 'grand', 'jury', 'said', 'friday', 'investig', 'rec', 'prim', 'elect']


In [12]:
# Calculate term frequency
# Here TF is the raw number of occurrences of each stem across the whole
# flattened corpus (no normalisation by corpus or document length).
tf_clean = nltk.FreqDist(tokens_clean)

# Print the top 10 words along with their TF values
top_10_words = tf_clean.most_common(10)
for word, freq in top_10_words:
    print(f'{word}: {freq}')

on: 3431
would: 2715
us: 2490
stat: 2095
said: 1961
tim: 1957
ev: 1944
new: 1785
man: 1700
year: 1620


# Print to the screen the top 10 words in terms of TF-IDF. Use the paragraphs as documents for calculating TF-IDF. Show the TF-IDF values as well. 

In [13]:

# Calculate TF-IDF
# Score per stem = (corpus-wide count) * log(ndocs / (1 + document frequency)),
# where one "document" is one paragraph of the cleaned, stemmed corpus.
doc_size = len(brown_paras)  # not used in this cell; kept in case other cells read it
ndocs = len(cleaned_brown_paras_stems)

# Document frequency in a single pass: every paragraph contributes its set of
# distinct stems exactly once. This yields the same counts as rescanning all
# paragraphs for each vocabulary entry, without the
# (vocabulary size x number of paragraphs) cost of doing so.
doc_freq = Counter()
for paragraph in cleaned_brown_paras_stems:
    doc_freq.update({token for sentence in paragraph for token in sentence})

# Iterating tf_clean keeps the original insertion order, so ties in the
# (stable) sort below resolve exactly as before.
tf_idf = {
    token: tf_clean[token] * math.log(ndocs / (1 + doc_freq[token]))
    for token in tf_clean
}

# Sort and print the top 10 words along with their TF-IDF values
sorted_tf_idf = sorted(tf_idf.items(), key=operator.itemgetter(1), reverse=True)
for word, value in sorted_tf_idf[:10]:
    print(f'{word}: {value}')

on: 6096.876533436755
would: 5713.5908380628825
us: 5275.538945943473
stat: 5012.2550712287175
tim: 4423.8485854492155
ev: 4364.9440280267145
new: 4321.154824085933
said: 4259.562959566486
man: 4164.951740043983
af: 4083.3627652471705


# Use pos_tag() to tag each token. 

In [16]:
# use NLTK to tag
# NOTE(review): the input is the flattened list of lowercased, stopword-free
# stems, so the tagger sees neither the original word forms nor sentence
# boundaries. Treat the tags as approximate (e.g. 'friday' is tagged JJ in
# this cell's output) -- confirm this is the intended input for the task.
tags = nltk.pos_tag(tokens_clean)
# Preview the first 20 (token, tag) pairs
tags[:20]

[('fulton', 'NN'),
 ('county', 'NN'),
 ('grand', 'JJ'),
 ('jury', 'NN'),
 ('said', 'VBD'),
 ('friday', 'JJ'),
 ('investig', 'JJ'),
 ('rec', 'NN'),
 ('prim', 'NN'),
 ('elect', 'VBP'),
 ('produc', 'NN'),
 ('evid', 'NN'),
 ('irregul', 'NN'),
 ('took', 'VBD'),
 ('plac', 'JJ'),
 ('jury', 'NN'),
 ('said', 'VBD'),
 ('pres', 'NNS'),
 ('city', 'NN'),
 ('execut', 'VBP')]

# Print to the screen the 10 most common trigrams of word-tag pairs. Show their frequencies as well. Use nltk.trigrams(). 

In [18]:
# Slide a window of three consecutive (word, tag) pairs over the tagged tokens
trigrams_tags = list(nltk.trigrams(tags))

# Tally how often each distinct trigram occurs
trigrams_freq = Counter()
trigrams_freq.update(trigrams_tags)

# Report the ten most frequent trigrams together with their counts
top_10_trigrams = trigrams_freq.most_common(10)
for trigram, freq in top_10_trigrams:
    print(f'{trigram}: {freq}')

(('world', 'NN'), ('war', 'NN'), ('2', 'CD')): 35
(('new', 'JJ'), ('york', 'NN'), ('city', 'NN')): 27
(('new', 'JJ'), ('york', 'NN'), ('tim', 'NN')): 22
(('index', 'NN'), ('word', 'NN'), ('electron', 'NN')): 21
(('govern', 'JJ'), ('unit', 'NN'), ('stat', 'NN')): 18
(('word', 'NN'), ('electron', 'NN'), ('switch', 'NN')): 18
(('unit', 'NN'), ('stat', 'NN'), ('americ', 'JJ')): 16
(('new', 'JJ'), ('york', 'NN'), ('cent', 'NN')): 15
(('af', 'NN'), ('af', 'NN'), ('af', 'NN')): 15
(('world', 'NN'), ('war', 'NN'), ('1', 'CD')): 14
