In [1]:
import os
import sys
import heapq
from datetime import datetime

import nltk
from nltk.corpus import stopwords
from nltk import word_tokenize
from nltk import PorterStemmer

# nltk.download("stopwords")
# nltk.download("punkt")

# Rebinds the imported `stopwords` corpus reader to a plain set of English stop
# words, so the membership test in preprocess() is a set lookup. The import at
# the top of this cell binds the reader again whenever the cell is re-run.
stopwords = set(stopwords.words('english'))
# Single Porter stemmer instance shared by every preprocess() call.
stemmer = PorterStemmer()

In [2]:
# Directory holding the index_<n>.txt, title_<n>.txt and secondary index files.
INDEX_FOLDER_PATH = os.path.abspath("INDEX_FOLDER")
# Directory where processQuery() writes one result file per query.
QUERY_OUTPUT_DIR = os.path.abspath("out")
# File inside INDEX_FOLDER_PATH that load_secondary_index() reads.
SECONDARY_INDEX_FN = "secondary_index.txt"
# Filled by load_secondary_index(); get_index_file() scans it to pick an index file.
SECONDARY_INDEXES = []

# Per-field score multipliers, in the slot order produced by get_fields_count()
# for the tags t, i, b, c, l, r (presumably title, infobox, body, category,
# links, references -- TODO confirm against the indexer).
SECTION_WEIGHT = [1000.0, 650.0, 50.0, 150.0, 200.0, 175.0]
# Divisor used by get_title_from_id() to map a document id to a title file and
# a line offset inside that file.
TITLE_PER_FILE = 2000
K = 10 # number of top-scoring documents reported per query

# Assigned by searcher() with the query string it is about to process.
query = ""
# Counter used to name query_<n>_out.txt; processQuery() increments it before writing.
query_output_file_index = -1
# NOTE(review): not referenced by any code visible in this notebook.
query_out_folder = ""

In [3]:
def preprocess(query) :
    """Turn a raw query string into a list of normalized search terms.

    The query is lower-cased and tokenized with ``word_tokenize``; tokens that
    appear in the module-level ``stopwords`` set are dropped, and every
    remaining token is passed through the module-level ``stemmer``.

    Returns the stemmed tokens in their original order.
    """
    tokens = word_tokenize(query.lower())
    # The stop-word test runs on the unstemmed token, then the survivor is stemmed.
    return [stemmer.stem(token) for token in tokens if token not in stopwords]

In [4]:
def get_index_file(word):
    """Return the number of the index file that should be searched for ``word``.

    Scans ``SECONDARY_INDEXES`` from the start for the first entry that is
    strictly greater than ``word`` and returns the position just before it.
    When no entry is greater, the last position is returned.

    The result is -1 when ``word`` is smaller than the first entry or when
    ``SECONDARY_INDEXES`` is empty.
    """
    first_greater = next(
        (position for position, entry in enumerate(SECONDARY_INDEXES) if word < entry),
        len(SECONDARY_INDEXES),
    )
    return first_greater - 1

In [5]:
# Slot in the returned count list for each single-letter field tag.
_FIELD_SLOTS = {"t": 0, "i": 1, "b": 2, "c": 3, "l": 4, "r": 5}


def get_fields_count(fields) :
    """Parse a compact field string such as ``"t1b12c3"`` into six counts.

    ``fields`` is a sequence of single-letter tags (t, i, b, c, l, r), each
    followed by a run of ASCII digits. The returned list always has six
    entries, in the tag order above; tags that do not occur keep a count of 0,
    and when a tag occurs more than once the last occurrence wins. Characters
    that are not one of the six tags are skipped.

    Raises:
        ValueError: if a tag is not followed by at least one digit.
    """
    field_count = [0] * len(_FIELD_SLOTS)
    pos = 0
    length = len(fields)
    while pos < length:
        slot = _FIELD_SLOTS.get(fields[pos])
        pos += 1
        if slot is None:
            continue

        # Consume the digits that follow the tag.
        digits_end = pos
        while digits_end < length and "0" <= fields[digits_end] <= "9":
            digits_end += 1
        if digits_end == pos:
            raise ValueError(
                "field tag %r has no count in %r" % (fields[pos - 1], fields)
            )

        field_count[slot] = int(fields[pos:digits_end])
        # Digits are never tags, so scanning can resume right after them.
        pos = digits_end

    return field_count

In [6]:
def get_score(fields, IDF):
    """Compute the weighted score of one posting.

    Each entry of ``fields`` is multiplied by the ``SECTION_WEIGHT`` entry at
    the same position and by ``IDF``; the products are added up. ``IDF`` and
    the counts may be numbers or numeric strings, since each is converted with
    ``float``.

    Returns the integer 0 when ``fields`` is empty, otherwise a float.
    """
    return sum(
        float(fields[slot]) * float(SECTION_WEIGHT[slot]) * float(IDF)
        for slot in range(len(fields))
    )

In [7]:
def get_word_info(word) :
    """Look up ``word`` in the on-disk index and score each of its postings.

    The index file is chosen with ``get_index_file`` and read line by line.
    A matching line has the form ``word=IDF=doc_count=freq=postings``, where
    ``postings`` is a ``|``-separated list of ``"<docID> <fields>"`` entries.

    Returns a dict with the keys:
        "IDF", "doc_count", "freq": the raw strings from the index line
            (the integer 0 when the word is not found);
        "posting_list": docID -> field counts from ``get_fields_count``;
        "score": docID -> score from ``get_score``.

    When ``get_index_file`` returns a negative number, or the word is absent
    from the chosen file, the record is returned with its empty defaults.
    """
    word_info = {"IDF": 0, "doc_count": 0, "freq": 0, "posting_list" : {}, "score" : {}}

    ind = get_index_file(word)
    if ind < 0:
        # The word sorts before every secondary-index entry (or the secondary
        # index is empty), so no index file can contain it.
        return word_info

    index_fn = "index_" + str(ind) + ".txt"
    entry = None
    # The context manager closes the file on every path, including the early break.
    with open(os.path.join(INDEX_FOLDER_PATH, index_fn), "r", encoding="utf-8") as index_fp:
        for raw_line in index_fp:
            raw_line = raw_line.strip("\n")
            if raw_line == "":
                continue
            parts = raw_line.split("=")
            if parts[0] == word:
                entry = parts
                break

    if entry is None:
        return word_info

    word_info["IDF"] = entry[1]
    word_info["doc_count"] = entry[2]
    word_info["freq"] = entry[3]
    for posting in entry[4].split("|"):
        docID, fields = posting.split(" ")
        fields = get_fields_count(fields)
        word_info["posting_list"][docID] = fields
        word_info["score"][docID] = get_score(fields, word_info["IDF"])

    return word_info

In [8]:
def get_title_from_id(docId) :
    """Return the title stored for the integer document id ``docId``.

    Titles are read from ``title_<n>.txt`` files in ``INDEX_FOLDER_PATH``:
    the file number is ``docId // TITLE_PER_FILE`` and the zero-based line
    inside that file is ``docId % TITLE_PER_FILE``. The trailing newline is
    removed from the returned title.

    Raises:
        IndexError: if the file has fewer lines than the required offset.
    """
    title_file_ind = docId // TITLE_PER_FILE
    title_file_offset = docId % TITLE_PER_FILE
    title_fn = "title_" + str(title_file_ind) + ".txt"
    # Stop reading at the wanted line; the context manager closes the file.
    with open(os.path.join(INDEX_FOLDER_PATH, title_fn), "r", encoding="utf-8") as title_fp:
        for line_no, line in enumerate(title_fp):
            if line_no == title_file_offset:
                return line.strip("\n")
    raise IndexError("no title at offset %d in %s" % (title_file_offset, title_fn))

In [9]:
def processQuery(query) :
    """Rank documents for ``query`` and write the best ``K`` to a result file.

    Steps:
      1. ``preprocess`` turns the query into terms.
      2. ``get_word_info`` is called once per distinct term.
      3. Each document's score is the sum of its per-term scores.
      4. Up to ``K`` documents with the highest scores are written to
         ``query_<n>_out.txt`` in ``QUERY_OUTPUT_DIR`` (created if missing),
         one ``Doc Id : ... Title : ...`` line each, followed by the elapsed
         processing time.

    An empty ``query`` string is ignored. When fewer than ``K`` documents
    match (including none at all), only those documents are written.

    Side effect: increments the module-level ``query_output_file_index``,
    which supplies ``<n>`` in the output file name.
    """
    global query_output_file_index
    # Local import keeps the block self-contained; perf_counter is a monotonic
    # clock intended for measuring elapsed time.
    import time

    if query == "" :
        return

    query_start_time = time.perf_counter()

    terms = preprocess(query)

    # Look each distinct term up once; dict order follows first occurrence.
    query_word_info = {}
    for word in terms :
        if word not in query_word_info :
            query_word_info[word] = get_word_info(word)

    # Accumulate every document's total score across all query terms.
    doc_scores = {}
    for info in query_word_info.values() :
        for docID, score in info["score"].items() :
            doc_scores[docID] = doc_scores.get(docID, 0) + score

    # Negated scores make the smallest tuples the best documents; nsmallest
    # simply returns fewer items when there are fewer than K candidates.
    best = heapq.nsmallest(K, ((-score, docID) for docID, score in doc_scores.items()))

    topK_doc_info = [[docID, get_title_from_id(int(docID))] for _, docID in best]

    query_output_file_index += 1
    os.makedirs(QUERY_OUTPUT_DIR, exist_ok=True)
    query_out_fn = "query_" + str(query_output_file_index) + "_out.txt"

    with open(os.path.join(QUERY_OUTPUT_DIR, query_out_fn), "w", encoding="utf-8") as query_out_fp:
        for doc_info in topK_doc_info :
            query_out_fp.write("Doc Id : " + doc_info[0] + " Title : " + doc_info[1] + "\n")
        query_out_fp.write("\nQuery Processing Time :  %.2f seconds\n" % (time.perf_counter() - query_start_time))
            

In [10]:
def load_secondary_index():
    """Load the secondary index file into the module-level ``SECONDARY_INDEXES``.

    Reads ``SECONDARY_INDEX_FN`` from ``INDEX_FOLDER_PATH`` and rebinds
    ``SECONDARY_INDEXES`` to a new list holding every non-empty line, in file
    order, with the trailing newline removed. If the file cannot be opened or
    read, the exception propagates and the previous list is left in place.
    """
    global SECONDARY_INDEXES
    secondary_path = os.path.join(INDEX_FOLDER_PATH, SECONDARY_INDEX_FN)
    # The context manager closes the file even when reading raises.
    with open(secondary_path, "r", encoding="utf-8") as secondary_fp:
        entries = [line.strip("\n") for line in secondary_fp]
    SECONDARY_INDEXES = [entry for entry in entries if entry != ""]

In [11]:
def searcher(query_or_file="q", query_in_or_file="prim algorithm minimum spanning tree") :
    """Load the secondary index and process one query or a file of queries.

    Args:
        query_or_file: ``"q"`` to treat ``query_in_or_file`` as the query
            text; any other value treats it as the path of a text file with
            one query per line.
        query_in_or_file: the query text or the query file path.

    Called without arguments, this runs the single query given as the default.

    Side effect: the module-level ``query`` is set to the query text, or, in
    file mode, to each raw line of the file in turn.
    """
    global query
    load_secondary_index()
    if(query_or_file == "q") :
        query = query_in_or_file
        processQuery(query)
    else :
        # Read every query first so the file is closed before processing starts.
        with open(os.path.abspath(query_in_or_file), "r", encoding="utf-8") as query_in_fp:
            queries = query_in_fp.readlines()
        for query in queries :
            processQuery(query.strip("\n"))

In [12]:
# Run the search with the query configured in searcher(); processQuery() writes
# the results to a file under QUERY_OUTPUT_DIR.
searcher()

(-4140.483900795409, '3685')
prim {'8231': [0, 0, 2, 0, 0, 0]}
algorithm {'810': [0, 0, 2, 0, 0, 0], '2285': [0, 0, 1, 0, 0, 0], '3064': [0, 0, 1, 0, 0, 0], '3105': [0, 0, 1, 0, 0, 0], '3325': [0, 0, 1, 0, 0, 0], '4204': [0, 0, 9, 1, 0, 0], '4601': [0, 0, 1, 0, 0, 0], '7766': [0, 0, 4, 1, 0, 0], '8571': [0, 0, 3, 0, 0, 0], '8580': [0, 0, 0, 1, 0, 0], '9640': [0, 1, 4, 0, 0, 0], '10101': [0, 0, 1, 0, 0, 0], '10266': [0, 0, 1, 0, 0, 0], '11083': [0, 0, 1, 0, 0, 0], '11533': [0, 0, 13, 0, 0, 0]}
minimum {'44': [0, 0, 2, 0, 0, 0], '284': [0, 0, 2, 0, 0, 0], '1116': [0, 0, 1, 0, 0, 0], '1374': [0, 0, 1, 0, 0, 0], '1607': [0, 0, 1, 0, 0, 0], '2025': [0, 0, 1, 0, 0, 0], '2584': [0, 0, 1, 0, 0, 0], '2945': [0, 0, 2, 0, 0, 0], '2947': [0, 0, 2, 0, 0, 0], '3306': [0, 0, 1, 0, 0, 0], '3311': [0, 0, 1, 0, 0, 0], '3362': [0, 0, 1, 0, 0, 0], '3658': [0, 0, 1, 0, 0, 0], '3855': [0, 0, 1, 0, 0, 0], '4142': [0, 0, 2, 0, 0, 0], '5024': [0, 0, 1, 0, 0, 0], '5105': [0, 0, 1, 0, 0, 0], '5151': [0, 0, 1, 0,