In [1]:
import os,sys
import re, string, unicodedata
import nltk
from bs4 import BeautifulSoup
from nltk import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import LancasterStemmer, WordNetLemmatizer
#nltk.download('stopwords')
#nltk.download('punkt')

In [2]:
def strip_html(text):
    """Return the text content of ``text`` with the HTML markup removed.

    The input is parsed with BeautifulSoup's "html.parser" backend and the
    result of ``get_text()`` on the parsed tree is returned.
    """
    return BeautifulSoup(text, "html.parser").get_text()

In [3]:
def remove_non_ascii(words):
    """Strip non-ASCII characters from every token in ``words``.

    Each token is NFKD-normalized before being encoded to ASCII with errors
    ignored, so anything that has no ASCII form after decomposition is
    dropped. The returned list has the same length as the input; a token that
    ends up empty is kept as ``''``.
    """
    def _ascii_only(token):
        decomposed = unicodedata.normalize('NFKD', token)
        ascii_bytes = decomposed.encode('ascii', 'ignore')
        return ascii_bytes.decode('utf-8', 'ignore')

    return [_ascii_only(token) for token in words]

def to_lowercase(words):
    """Return a new list holding the lowercase form of each token in ``words``."""
    return [token.lower() for token in words]

def remove_punctuation(words):
    """Remove punctuation from list of tokenized words.

    Every character listed in ``string.punctuation`` is deleted from each
    token. Tokens that are empty after the deletion (including tokens that
    were empty to begin with) are left out of the result.

    Args:
        words: iterable of string tokens.

    Returns:
        A new list of the non-empty, punctuation-free tokens, in input order.
    """
    # The deletion table does not depend on the token, so build it once
    # instead of once per word.
    strip_table = str.maketrans("", "", string.punctuation)
    stripped = (word.translate(strip_table) for word in words)
    return [word for word in stripped if word != '']

def remove_numbers(words):
    """Delete digit characters from every token in a list of tokenized words.

    Each run of digits inside a token is removed (not replaced by a textual
    form). Tokens that are empty after the removal, such as purely numeric
    tokens, are left out of the result.

    Args:
        words: iterable of string tokens.

    Returns:
        A new list of the non-empty, digit-free tokens, in input order.
    """
    # Compile once; the pattern is the same for every token.
    digit_run = re.compile(r'\d+')
    cleaned = (digit_run.sub('', word) for word in words)
    return [word for word in cleaned if word != '']

def remove_stopwords(words):
    """Return the tokens of ``words`` that are not NLTK English stop words.

    The comparison is an exact membership test against
    ``stopwords.words("english")``; token order is preserved.
    """
    english_stops = frozenset(stopwords.words("english"))
    return [token for token in words if token not in english_stops]

def stem_words(words):
    """Return the Lancaster stem of each token in ``words``, in input order."""
    lancaster = LancasterStemmer()
    return [lancaster.stem(token) for token in words]

In [4]:
def lexical_analysis(words):
    """Run the lexical clean-up stages over a list of tokens.

    The stages are applied in this order, each one receiving the previous
    stage's output: non-ASCII removal, lowercasing, punctuation removal,
    digit removal.
    """
    stages = (remove_non_ascii, to_lowercase, remove_punctuation, remove_numbers)
    tokens = words
    for stage in stages:
        tokens = stage(tokens)
    return tokens

In [5]:
# Accumulator for the processed tokens of every document (duplicates are
# still present until the list is deduplicated later in the notebook).
vocabulary = list()

In [6]:
# Input and output directories, resolved once against the working directory.
docs_dir = os.path.join(os.getcwd(), "Docs")
processed_dir = os.path.join(os.getcwd(), "processed_docs")
# open(..., "w") does not create missing directories, so make sure the output
# directory exists before the first write.
os.makedirs(processed_dir, exist_ok=True)


def _write_stage(path, tokens, label):
    """Overwrite ``path`` with the space-joined ``tokens`` and print its size after ``label``."""
    with open(path, "w") as wf:
        wf.write(" ".join(tokens))
    print(label, os.stat(path).st_size, "bytes")


# os.listdir returns entries in arbitrary order; sorting keeps the report
# reproducible from run to run.
for filename in sorted(os.listdir(docs_dir)):
    doc_path = os.path.join(docs_dir, filename)
    if not os.path.isfile(doc_path):
        # Sub-directories (e.g. notebook checkpoint folders) cannot be read as documents.
        continue

    print("size of", filename, ":", os.stat(doc_path).st_size, "bytes")

    # Every stage overwrites the same output file; only the last stage's
    # content remains on disk, the intermediate writes exist to measure size.
    processed_path = os.path.join(processed_dir, "processed_" + filename)

    # Read the whole document and release the handle before the heavy processing.
    # NOTE(review): files are opened with the platform default encoding, as in
    # the original; consider passing encoding= explicitly if inputs are UTF-8.
    with open(doc_path, "r") as rf:
        sample = rf.read()

    sample = strip_html(sample)
    words = word_tokenize(sample)

    words = lexical_analysis(words)
    _write_stage(processed_path, words, "size after lexical analysis:")

    words = remove_stopwords(words)
    _write_stage(processed_path, words, "size after removing stopwords:")

    words = stem_words(words)
    _write_stage(processed_path, words, "size after stemming:")

    # Stemming can turn a token into a stop word, so filter a second time.
    words = remove_stopwords(words)
    _write_stage(processed_path, words, "size after removing stop words once again after stemming:")

    # extend() appends in place instead of copying the accumulated list per document.
    vocabulary.extend(words)
    print("\n\n")

size of LargeText.txt : 674425 bytes
size after lexical analysis: 623529 bytes
size after removing stopwords: 419041 bytes
size after stemming: 320857 bytes
size after removing stop words once again after stemming: 318005 bytes



size of Text1.txt : 29387 bytes
size after lexical analysis: 18774 bytes
size after removing stopwords: 14452 bytes
size after stemming: 11467 bytes
size after removing stop words once again after stemming: 11405 bytes



size of Text2.txt : 11588 bytes
size after lexical analysis: 10989 bytes
size after removing stopwords: 7032 bytes
size after stemming: 5498 bytes
size after removing stop words once again after stemming: 5430 bytes



size of Text3.txt : 27201 bytes
size after lexical analysis: 25852 bytes
size after removing stopwords: 17057 bytes
size after stemming: 12652 bytes
size after removing stop words once again after stemming: 12569 bytes





In [7]:
# Collapse duplicate tokens. Set iteration order is arbitrary, so the
# resulting list has no meaningful order at this point.
unique_tokens = set(vocabulary)
vocabulary = list(unique_tokens)

In [8]:
# Put the vocabulary into ascending order, updating the existing list in place.
vocabulary[:] = sorted(vocabulary)

In [9]:
# Save the vocabulary as a single space-separated string in the working directory.
vocabulary_path = os.path.join(os.getcwd(), "vocabulary.txt")
with open(vocabulary_path, "w") as wf:
    vocabulary_text = " ".join(vocabulary)
    wf.write(vocabulary_text)