In [40]:
import os
import regex as re

In [41]:
# Directory whose entries are read as corpus files (relative to the notebook's working directory).
data_path = '../data'
# Name of the tag whose enclosed text is used as the document identifier.
doc_id_tag = 'DOCNO'
# Name of the tag whose enclosed text is tokenized as the document body.
doc_text_tag = 'TEXT'

# Loading stopwords

In [42]:
# Path to the stopword file; every line of the file is treated as one stopword.
stopwords_path = '../stopwordlist.txt'
# Explicit encoding so the file is decoded the same way on every platform.
with open(stopwords_path, 'r', encoding='utf-8') as file:
    # raw lines of the file, kept under the original name
    stopwords = file.read().split('\n')

# Normalize (strip + lower-case) each line *before* testing for emptiness so
# that whitespace-only lines do not end up in the list as '' entries.
stopwords_list = [
    word for word in (line.strip().lower() for line in stopwords) if word
]

print(stopwords_list)


['a', 'able', 'about', 'above', 'according', 'accordingly', 'across', 'actually', 'after', 'afterwards', 'again', 'against', 'all', 'allow', 'allows', 'almost', 'alone', 'along', 'already', 'also', 'although', 'always', 'am', 'among', 'amongst', 'an', 'and', 'another', 'any', 'anybody', 'anyhow', 'anyone', 'anything', 'anyway', 'anyways', 'anywhere', 'apart', 'appear', 'appreciate', 'appropriate', 'are', 'around', 'as', 'aside', 'ask', 'asking', 'associated', 'at', 'available', 'away', 'awfully', 'b', 'be', 'became', 'because', 'become', 'becomes', 'becoming', 'been', 'before', 'beforehand', 'behind', 'being', 'believe', 'below', 'beside', 'besides', 'best', 'better', 'between', 'beyond', 'both', 'brief', 'but', 'by', 'c', 'came', 'can', 'cannot', 'cant', 'cause', 'causes', 'certain', 'certainly', 'changes', 'clearly', 'co', 'com', 'come', 'comes', 'concerning', 'consequently', 'consider', 'considering', 'contain', 'containing', 'contains', 'corresponding', 'could', 'course', 'currentl

In [54]:
def extract_text_between_tags(doc, tag) -> str:
    '''
    Extract the text enclosed by every complete `<tag>...</tag>` pair in a document.

    inputs:
        - doc: string of individual html doc
        - tag: name of the tag between which the text should be extracted

    output:
        - string of extracted text: each enclosed section is stripped of
          surrounding whitespace and, when the tag occurs several times, the
          non-empty sections are joined with a single space.
          Returns '' when the document has no complete `<tag>...</tag>` pair.
    '''
    start_tag = f'<{tag}>'  # opening tag
    end_tag = f'</{tag}>'   # closing tag
    sections = []           # text of every tag pair found so far
    position = 0            # where the next search starts

    while True:
        start_index = doc.find(start_tag, position)
        if start_index == -1:
            break
        start_index += len(start_tag)
        end_index = doc.find(end_tag, start_index)
        if end_index == -1:
            # opening tag without a closing tag: ignore it and stop scanning
            break
        sections.append(doc[start_index:end_index].strip())
        # continue searching after the closing tag
        position = end_index + len(end_tag)

    return ' '.join(section for section in sections if section)

def tokenizer(text, stopwords=None) -> list:
    '''
    Turn raw text into a sorted list of unique, lower-case, non-stopword tokens.

    inputs:
        - text: string to tokenize
        - stopwords: optional iterable of lower-case words to drop; when
          omitted, the notebook-level `stopwords_list` is used

    output:
        - alphabetically sorted list of distinct tokens
    '''
    if stopwords is None:
        stopwords = stopwords_list
    # a set gives constant-time membership tests instead of scanning a list per word
    excluded = set(stopwords)

    # excluding numbers and any words that contain numbers in the provided text
    cleaned_text = re.sub(r'\b\w*\d\w*\b', '', text)
    # split on runs of non-word characters (whitespace, punctuation, ...);
    # note that `\W` does not match '_', so underscores stay inside tokens.
    # Empty strings created by the split are dropped and every token is
    # lower-cased; building a set eliminates duplicates at the same time.
    tokens = {piece.lower() for piece in re.split(r'\W+', cleaned_text) if piece}
    # remove stop words, then sort last so the ordering survives de-duplication
    return sorted(tokens - excluded)

In [55]:
# to extract text from all folders and files
# maps corpus file name -> {document id -> list of tokens}
extracted_text = {}

# sorted() makes the processing order reproducible; os.listdir returns the
# entries in arbitrary order
for folderID in sorted(os.listdir(data_path)):
    corpus_path = os.path.join(data_path, folderID)
    # hidden entries and sub-directories (e.g. .ipynb_checkpoints) are not corpus files
    if folderID.startswith('.') or not os.path.isfile(corpus_path):
        continue

    # read the whole file, then release the handle before the heavier processing
    with open(corpus_path, 'r') as corpus_file:
        content = corpus_file.read()

    extracted_text[folderID] = {}
    for raw_doc in content.split('<DOC>'):
        # pieces without a document-id tag (text before the first <DOC>,
        # empty pieces) are not documents
        if f'<{doc_id_tag}>' not in raw_doc:
            continue
        docID = extract_text_between_tags(raw_doc, doc_id_tag)
        if not docID:
            # an empty identifier cannot be used as a meaningful key
            continue
        text = extract_text_between_tags(raw_doc, doc_text_tag)
        words = tokenizer(text)
        extracted_text[folderID][docID] = words

In [57]:
# number of documents collected for each entry of `extracted_text`
for docs_by_id in extracted_text.values():
    print(len(docs_by_id))

375
371
322
350
383
380
240
346
377
380
353
380
409
329
373
