### Name: Sai Anish Garapati
### UIN: 650208577

## Importing Libraries

In [19]:
import os, string
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

## Functions used for processing

In [20]:
def preprocessing(path):
	file_list = os.listdir(path)

	file_string = ''
	for file_name in file_list:
		file = open(path + file_name, 'r')
		file_string += file.read()
	
	words_list = word_tokenize(file_string)
	words_list = [word.translate(str.maketrans('', '', string.punctuation)) for word in words_list]
	words_list = [word.lower() for word in words_list if word != '']
	return words_list

def list_to_word_freq(words_list):
	word_freq = {}
	for word in words_list:
		if word in word_freq:
			word_freq[word] += 1
		else:
			word_freq[word] = 1
	return word_freq

def remove_stop_words_from_dict(words_freq):
	return {word: freq for (word, freq) in word_freq.items() if word not in stop_words}

def stemmer_on_dict(words_freq):
	ps = PorterStemmer()
	words_freq_stemmed = {}

	for word, freq in words_freq.items():
		word = ps.stem(word)
		if (word in words_freq_stemmed):
			words_freq_stemmed[word] += freq
		else:
			words_freq_stemmed[word] = freq
	return dict(sorted(words_freq_stemmed.items(), key=lambda item: item[1], reverse=True))


## 1) Preprocessing the collection

In [21]:
path = 'citeseer/'
words_list = preprocessing(path)

## 2) Frequency of occurrence for all words 

In [22]:
word_freq = list_to_word_freq(words_list)

## 2.a) Total number of words in the collection

In [23]:
print(len(words_list))

477989


## 2.b) Vocabulary size

In [24]:
print(len(word_freq))

19630


## 2.c) Top 20 words in the ranking

In [25]:
word_freq = dict(sorted(word_freq.items(), key=lambda item: item[1], reverse = True))
word_freq_top_20 = dict(list(word_freq.items())[:20])
print(word_freq_top_20)

{'the': 25667, 'of': 18643, 'and': 14134, 'a': 13372, 'to': 11539, 'in': 10069, 'for': 7382, 'is': 6580, 'we': 5147, 'that': 4821, 'this': 4447, 'are': 3738, 'on': 3653, 'an': 3281, 'with': 3200, 'as': 3060, 'by': 2767, 'data': 2694, 'be': 2500, 'information': 2326}


## 2.d) Stop words from top 20

In [26]:
stop_words = set(stopwords.words('english'))

print([word for word in word_freq_top_20 if word in stop_words])

['the', 'of', 'and', 'a', 'to', 'in', 'for', 'is', 'we', 'that', 'this', 'are', 'on', 'an', 'with', 'as', 'by', 'be']


## 2.e) Unique words accounting for 15% of total words

In [27]:
print(len([word for word in word_freq if word_freq[word] >= 0.15 * len(words_list)]))

0


## 3) Integrating stemmer and stopword eliminator

In [28]:
# Removing stop words from the vocabulary
words_freq_new = remove_stop_words_from_dict(word_freq)

# Applying stemmer on the new vocabulary
words_freq_new_stemmed = stemmer_on_dict(words_freq_new)

## 3.a) Total number of words in the new collection

In [29]:
print(sum(words_freq_new_stemmed.values()))

294927


## 3.b) Vocabulary size

In [30]:
print(len(words_freq_new_stemmed))

13625


## 3.c) Top 20 words in the ranking

In [31]:
print(dict(list(words_freq_new_stemmed.items())[:20]))

{'system': 3745, 'use': 3741, 'agent': 2695, 'data': 2694, 'inform': 2402, 'model': 2314, 'paper': 2247, 'queri': 1905, 'user': 1758, 'learn': 1742, 'algorithm': 1584, '1': 1569, 'problem': 1545, 'approach': 1544, 'applic': 1524, 'present': 1507, 'base': 1499, 'web': 1440, 'databas': 1425, 'comput': 1414}


## 3.d) Stop words from top 20

In [32]:
print([word for word in dict(list(words_freq_new_stemmed.items())[:20]) if word in stop_words])

[]


## 3.e) Unique words accounting for 15% of total words

In [33]:
print(len([word for word in words_freq_new_stemmed if words_freq_new_stemmed[word]
                           >= 0.15 * sum(words_freq_new_stemmed.values())]))

0
