In [1]:
import os
import sys
import regex as re
from nltk.stem import PorterStemmer
sys.path.append("../")
from data_extraction.utils import data_extraction

ps = PorterStemmer()

In [2]:
# data path
data_path = '../data'
doc_id_tag = 'DOCNO'
doc_text_tag = 'TEXT'

# Loading stopwords

In [3]:
stopwords_path = '../stopwordlist.txt'
with open(stopwords_path, 'r') as file:
    stopwords = file.read()
    stopwords = stopwords.split('\n')
    stopwords_list = [word.strip().lower() for word in stopwords if word]

print(stopwords_list[:10])


['a', 'able', 'about', 'above', 'according', 'accordingly', 'across', 'actually', 'after', 'afterwards']


# Forward index

In [4]:
# extract text from all folders and files
extracted_text = {}
document_count = 0
forward_index = {}
all_tokens = []
for folderID in os.listdir(data_path):
    extracted_text[folderID] = {}
    with open(os.path.join(data_path, folderID), 'r') as file:
        content = file.read()
        sub_files = content.split('<DOC>')
        sub_files = [file for file in sub_files if len(file) > 0]
        for file in sub_files:
            docID = data_extraction.extract_text_between_tags(file, doc_id_tag)
            forward_index[docID] = {}
            text = data_extraction.extract_text_between_tags(file, doc_text_tag)
            words, unique_words = data_extraction.tokenizer(text, stopwords_list)
            # stemming with Porter Stemmer
            stemmed_words = [ps.stem(word) for word in words]
            stemmed_unique_words = [ps.stem(unique_word) for unique_word in unique_words]
            stemmed_unique_words.sort()
            all_tokens.extend(stemmed_unique_words)
            for unique_word in stemmed_unique_words:
                count = stemmed_words.count(unique_word)
                forward_index[docID][unique_word] = count

all_tokens = sorted(list(set(all_tokens)))

In [5]:
print("Total tokens : ", len(all_tokens))

Total tokens :  32606


In [6]:
print("Length of forward index : ", len(forward_index))
forward_index

Length of forward index :  5368


{'FT911-1': {'ago': 1,
  'aircraft': 2,
  'airport': 1,
  'append': 1,
  'articl': 1,
  'attend': 1,
  'august': 1,
  'bloodi': 1,
  'bone': 1,
  'bournemouth': 1,
  'british': 3,
  'brown': 1,
  'caption': 1,
  'celebr': 1,
  'charl': 1,
  'correct': 2,
  'cranwel': 3,
  'design': 1,
  'engin': 1,
  'eric': 1,
  'event': 1,
  'fli': 2,
  'flight': 3,
  'frank': 4,
  'front': 1,
  'ga': 1,
  'geoffrey': 1,
  'gloster': 3,
  'health': 1,
  'heinkel': 1,
  'ill': 1,
  'issu': 1,
  'jet': 5,
  'lincolnshir': 1,
  'made': 1,
  'maiden': 2,
  'mcclure': 1,
  'meteor': 1,
  'minut': 1,
  'month': 1,
  'mr': 3,
  'nf': 1,
  'page': 1,
  'patent': 1,
  'pictur': 1,
  'pilot': 2,
  'prototyp': 1,
  'publish': 2,
  'raf': 3,
  'repli': 1,
  'restor': 1,
  'return': 1,
  'shout': 1,
  'sir': 3,
  'state': 1,
  'test': 1,
  'tuesday': 1,
  'turbin': 1,
  'unabl': 1,
  'wasn': 1,
  'whittl': 4,
  'winkl': 1,
  'year': 4,
  'yesterday': 1},
 'FT911-2': {'agreement': 1,
  'ahead': 1,
  'announc': 1,


# Inverted index

In [7]:
inverted_index = {}
keys = forward_index.keys()
for token in all_tokens:
    inverted_index[token] = {}
    for key in keys:
        if token in forward_index[f'{key}'].keys():
            inverted_index[token][f'{key}'] = forward_index[f'{key}'][token]

In [8]:
print("Length of inverted index : ", len(inverted_index))
inverted_index

Length of inverted index :  32606


{'aa': {'FT911-245': 1,
  'FT911-4288': 1,
  'FT911-5120': 2,
  'FT911-710': 2,
  'FT911-1456': 2},
 'aaa': {'FT911-710': 1, 'FT911-2220': 1},
 'aachen': {'FT911-4943': 1, 'FT911-5032': 1, 'FT911-780': 2},
 'aaf': {'FT911-351': 2, 'FT911-4225': 1},
 'aah': {'FT911-1751': 1},
 'aakvaag': {'FT911-4970': 2},
 'aalborg': {'FT911-4308': 1, 'FT911-4454': 1},
 'aaron': {'FT911-1402': 1, 'FT911-2986': 2},
 'ab': {'FT911-19': 1,
  'FT911-4889': 1,
  'FT911-644': 1,
  'FT911-2334': 1,
  'FT911-2486': 4,
  'FT911-2802': 4,
  'FT911-2803': 1},
 'ababa': {'FT911-2326': 1,
  'FT911-2678': 2,
  'FT911-2899': 5,
  'FT911-3180': 1,
  'FT911-3286': 1},
 'aback': {'FT911-5034': 1,
  'FT911-659': 1,
  'FT911-669': 1,
  'FT911-1442': 1,
  'FT911-3109': 1},
 'abalkin': {'FT911-112': 1},
 'abandon': {'FT911-12': 1,
  'FT911-31': 1,
  'FT911-48': 1,
  'FT911-140': 1,
  'FT911-151': 1,
  'FT911-223': 1,
  'FT911-307': 1,
  'FT911-323': 1,
  'FT911-354': 1,
  'FT911-3363': 1,
  'FT911-3381': 1,
  'FT911-3407': 