In [1]:
import re
import pandas as pd
from collections import defaultdict
import nltk, string
import pickle, math
from tqdm import tqdm

var_dict = {}
var_dict["D_SID"] = '.I '
var_dict["D_UI"] = '.U'
var_dict["D_MESH"] = '.M'
var_dict["D_TITLE"] = '.T'
var_dict["D_PUB"] = '.P'
var_dict["D_ABS"] = '.W'
var_dict["D_SOURCE"] = '.S'
var_dict["D_AUTHOR"] = '.A'

nltk.download('stopwords')
stpwrd = nltk.corpus.stopwords.words('english')
stpwrd.extend(['.U', '.S','.M','.T','.P','.W','.M','.I'])

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Anirudh\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [18]:
def get_id(doc):
    # Obtain string that contains the Document ID
    id_string = doc[0]
    # Split the string and make it a list
    # the ID is the last element of that list
    id_string = list(id_string.split("\n"))
    return int(id_string[2])

def remove_stopwords(text, stpwrd):
    # Create a list of words without the stop words
    word_list = [word for word in text.split() if word not in stpwrd]
    # Create a string of the created list
    text =  ' '.join(word_list) 
    return text

def clean_text(input_text):
    # Remove . Headers
    input_text = re.sub(r'\.[A-Z]', '', input_text)
    # Remove punctuation
    input_text = input_text.translate(str.maketrans("", "", string.punctuation))
    # Remove \n, \r and \t
    input_text = re.sub(r'[\n\t\r]', ' ', input_text)
    # Remove digits
    input_text = re.sub(r'[\d]', '', input_text)
    # Make text lower case
    input_text = input_text.lower()
    # Remove leading and trailing spaces
    input_text = input_text.strip()
    # Convert the text to a list of words
    final_text = input_text.split()

    return final_text

def get_text(doc):
    # Obtain string that contains the text 
    text_string = doc[1]
    # Remove the stop words from the list
    text_string = remove_stopwords(text_string, stpwrd)
    # Process the string and convert it into a list of words
    text_string = clean_text(text_string) 
    return text_string
    
def read_doc_file(path):
    # Read the file and for each document, split the ID string and the text string
    with open(path, 'r') as f:
        documents = f.read().split(var_dict["D_SID"]) 
    # Make a list for each file such that the ID and the text are separate elements
    documents = [doc.split("\n"+var_dict["D_SOURCE"], maxsplit=1) for doc in documents[1:]]
    return documents

def process_data(documents):
    text_id_dict = {}
    for doc in tqdm(documents):
        try:
            doc_id = get_id(doc)
            doc_text = get_text(doc)
#             print('----------')
#             print(doc_text)
#             print('----------')
            text_id_dict[doc_id] = doc_text
        except:
            continue
    return text_id_dict

def index_generator(termlists):
    # Empty dictionary to store the resulting indexes.
    generated_index_list = {}
    for filename, termlist in tqdm(termlists.items()):
        # Empty dictionary to store the index for a file.
        fileIndex = {}
        for index, term in enumerate(termlist):
            # checks whether it already exists as a key.
            if term in fileIndex:
                # Index is appended to existing list of positions.
                fileIndex[term].append(index)
            else:
                # New key created for the term.
                fileIndex[term] = [index]
        # Add to index dictionary with the filename as the key.
        generated_index_list[filename] = fileIndex
    return generated_index_list

def inverted_index_generator(generated_index_list):
    # Empty dictionary to store the resulting inverted indexes.
    inv_index_dict = {}
    # Iterate through each file's index list.
    for filename, file_index in tqdm(generated_index_list.items()):
        # Iterate through each term in a file's index list
        for term, pos in file_index.items():
            # If the term is not present in the inverted index dict,
            # add an empty key field to it.
            if term not in inv_index_dict:
                inv_index_dict[term] = {}
            # If it is already present, add the filename and positions
            # to the existing sub-dictionary.
            inv_index_dict[term][filename] = pos
    return inv_index_dict

def generate_FTQ(curr_query, inverted_index):
    # create a set to store the unique documents
    set_of_docs = set()
    # Iterate through each term in a query
    for term in curr_query:
        # Check if the term is present in the inverted index dictionary
        if term in inverted_index_dict:
            # If it is present, ger the unique docs
            unique_docs = set(inverted_index_dict[term].keys())
            # Update the set of documents
            set_of_docs.update(unique_docs)
    return list(set_of_docs)

def parse_queries(file):
    queries = []
    current_query = None
    for line in file:
        line = line[:-1]
        if '<top>' in line:
            current_query = {}
        elif '</top>' in line:
            queries.append(current_query)
            current_query = {}
        elif '<num>' in line:
            current_query['num'] = line.split(':')[1].strip()
        elif '<title>' in line:
            current_query['title'] = line.split('>')[1].strip()
        elif (not '<desc>' in line and len(line) > 2):
            current_query['description'] = line
    return queries

def extract_queries(queries):
    # Remove the stopwords and clean the text
    extracted_queries = {query['num']: clean_text(remove_stopwords(query['description'], stpwrd)) for query in queries}
    return extracted_queries

def query_reader(filename):
    # Open the file and parse through the quaries
    with open(filename, 'r+') as file:
        queries = parse_queries(file)
    # Extract and clean/process the queries
    extracted_queries = extract_queries(queries)        
    return extracted_queries

def get_k_top(document_score, top_k):
    # Convert the directory items to a list
    docs_score_list = list(document_score.items())
    # Sort the list according to the scores
    sorted_docs_score_list = sorted(docs_score_list, key=lambda item: item[1], reverse=True)
    # Pick the top scoring document
    top_docs_score_list = sorted_docs_score_list[:top_k]
    return top_docs_score_list

def get_intersection_score(document, query):
    doc_terms = index_dict[document].keys()
    intersection = set(list(doc_terms)) & set(query)
    boolean_score = len(list(intersection))
    return boolean_score

def get_boolean_rank(document_list, query, top_k):
    document_score = {}
    for document in document_list:
        boolean_score = get_intersection_score(document, query)
        document_score[document] = boolean_score
    top_k_documents = get_k_top(document_score, top_k)
    return top_k_documents

def get_term_frequency(document, query):
    curr_doc_len = len(index_dict[document])
    frequency = 0
    for term in query:
        term_frequency = 0
        if term in index_dict[document]:
            term_frequency = len(index_dict[document][term])
        frequency += term_frequency
    return frequency/curr_doc_len

def get_inv_doc_frequency(document, query):
    curr_doc_len = len(index_dict[document])
    cumulative_inv_doc_frequency = 0
    for term in query:
        if term in inverted_index_dict:
            document_frequency = len(inverted_index_dict[term])
            inv_doc_frequency = len(inverted_index_dict) / document_frequency
            inv_doc_frequency = math.log(inv_doc_frequency)
            cumulative_inv_doc_frequency += inv_doc_frequency
    return cumulative_inv_doc_frequency

def get_tf_rank(document_list, query, top_k):
    document_score = {}
    for document in document_list:
        term_frequency = get_term_frequency(document, query)
        document_score[document] = term_frequency
    top_k_documents = get_k_top(document_score, top_k)
    return top_k_documents

def get_tf_idf_rank(document_list, query, top_k):
    document_score = {}
    for document in document_list:
        term_frequency = get_term_frequency(document, query)
        inv_doc_frequency = get_inv_doc_frequency(document, query)
        tf_idf_score = term_frequency * inv_doc_frequency
        document_score[document] = tf_idf_score
    top_k_documents = get_k_top(document_score, top_k)
    return top_k_documents

def get_custom_rank(document_list, query, top_k):
    discount_factor = 0.75
    relevant_scores = get_tf_idf_rank(document_list, query, top_k)
    relevant_documents = [doc for doc, score in relevant_scores]
    document_score = {}
    for count, document in enumerate(relevant_documents):
        term_frequency = get_term_frequency(document, query)
        inv_doc_frequency = get_inv_doc_frequency(document, query)
        tf_idf_score = term_frequency * inv_doc_frequency
        d_val = ((discount_factor)**count)
        custom_score = (d_val + (tf_idf_score)) - math.log(tf_idf_score)
        document_score[document] = custom_score
    top_k_documents = get_k_top(document_score, top_k)
    return top_k_documents

In [11]:
print('Reading documents...')
documents = read_doc_file('ohsumed.88-91')

print('Preprocessing data...')
cleaned_data = process_data(documents)

print('Generating indices...')
index_dict = index_generator(cleaned_data)

print('Generating inverted indices...')
inv_index_dict = inverted_index_generator(index_dict)

print('Saving the index file...')
index_pickle = open("./pickle_files/index.pkl", "wb")
pickle.dump(index_dict, index_pickle)
index_pickle.close()

print('Saving the inverted index file...')
inverted_index_pickle = open("./pickle_files/inverted_index.pkl", "wb")
pickle.dump(inv_index_dict, inverted_index_pickle)
inverted_index_pickle.close()

Reading documents...
Preprocessing data...


100%|████████████████████████████████████████████████████████████████████████| 293867/293867 [01:35<00:00, 3072.97it/s]


Generating indices...


100%|███████████████████████████████████████████████████████████████████████| 293856/293856 [00:24<00:00, 12047.84it/s]


Generating inverted indices...


100%|███████████████████████████████████████████████████████████████████████| 293856/293856 [00:11<00:00, 26314.81it/s]


Saving the index file...
Saving the inverted index file...


In [3]:
queries = query_reader('query.ohsu.1-63')

In [4]:
a_file = open("./pickle_files/inverted_index.pkl", "rb")
inverted_index_dict = pickle.load(a_file)

b_file = open("./pickle_files/index.pkl", "rb")
index_dict = pickle.load(b_file)

In [33]:
def get_ranking_scores(method, inverted_index_dict, index_dict):
    print('Generating Scores...')
    print('Method:', method)
    file_name = method + '.txt'
    f = open('./output/'+file_name, 'w')
    top_k = 50
    for qid, query in tqdm(queries.items()):
        document_list = generate_FTQ(query, inverted_index_dict)
        if method == 'Boolean':
            generated_scores = get_boolean_rank(document_list, query, top_k)
        elif method == 'TF':
            generated_scores = get_tf_rank(document_list, query, top_k)
        elif method == 'TF-IDF':
            generated_scores = get_tf_idf_rank(document_list, query, top_k)
        elif method == 'Custom':
            generated_scores = get_custom_rank(document_list, query, top_k)
        total_scores = len(generated_scores)
        for i in range(total_scores):
            f.write(qid + "\tQ0\t" + str(generated_scores[i][0]) + "\t" 
                    + str(i+1) + "\t" + str(generated_scores[i][1]) + "\t" + method +"\n")
            
    print('\n')
    print('Score generation complete...')
    print('Please check the file in the output sub-directory')

            
print('Choose the ranking method: \n 1) Boolean\n 2) TF\n 3) TF-IDF\n 4) Custom\n')
method = input('Enter the method to be used: ')
get_ranking_scores(method, inverted_index_dict, index_dict)

Choose the ranking method: 
 1) Boolean
 2) TF
 3) TF-IDF
 4) Custom

Enter the method to be used: TF
Generating Scores...
Method: TF


100%|██████████████████████████████████████████████████████████████████████████████████| 63/63 [00:14<00:00,  4.45it/s]



Score generation complete...
Please check the file in the output sub-directory





In [32]:
def get_term_frequency(document):
    curr_doc_len = len(index_dict[document])
    frequency = 0
    for term in query:
        term_frequency = 0
        if term in index_dict[document]:
            term_frequency = len(index_dict[document][term])
        frequency += term_frequency
    return frequency/curr_doc_len

def get_k_top(document_score, top_k):
    # Convert the directory items to a list
    docs_score_list = list(document_score.items())
    # Sort the list according to the scores
    sorted_docs_score_list = sorted(docs_score_list, key=lambda item: item[1], reverse=True)
    # Pick the top scoring document
    top_docs_score_list = sorted_docs_score_list[:top_k]
    return top_docs_score_list

def get_tf_rank(document_list, query, top_k):
    document_score = {}
    for document in document_list:
        term_frequency = get_term_frequency(document)
        document_score[document] = term_frequency
    top_k_documents = get_k_top(document_score, top_k)
    return top_k_documents

In [54]:
def get_k_top(document_score, top_k):
    docs_score_list = list(document_score.items())
    sorted_docs_score_list = sorted(docs_score_list, key=lambda item: item[1], reverse=True)
    top_docs_score_list = sorted_docs_score_list[:top_k]
    return top_docs_score_list

def get_intersection_score(document, query):
    doc_terms = index_dict[document].keys()
    intersection = set(list(doc_terms)) & set(query)
    boolean_score = len(list(intersection))
    return boolean_score

def get_boolean_rank(document_list, query, top_k):
    document_score = {}
    for document in document_list:
        boolean_score = get_intersection_score(document, query)
        document_score[document] = boolean_score
    top_k_documents = get_k_top(document_score, top_k)
    return top_k_documents

In [41]:
def get_term_frequency(document):
    curr_doc_len = len(index_dict[document])
    frequency = 0
    for term in query:
        term_frequency = 0
        if term in index_dict[document]:
            term_frequency = len(index_dict[document][term])
        frequency += term_frequency
    return frequency/curr_doc_len

def get_inv_doc_frequency(document):
    curr_doc_len = len(index_dict[document])
    cumulative_inv_doc_frequency = 0
    for term in query:
        if term in inverted_index_dict:
            document_frequency = len(inverted_index_dict[term])
            inv_doc_frequency = len(inverted_index_dict) / document_frequency
            inv_doc_frequency = math.log(inv_doc_frequency)
            cumulative_inv_doc_frequency += inv_doc_frequency
    return cumulative_inv_doc_frequency

def get_k_top(document_score, top_k):
    docs_score_list = list(document_score.items())
    sorted_docs_score_list = sorted(docs_score_list, key=lambda item: item[1], reverse=True)
    top_docs_score_list = sorted_docs_score_list[:top_k]
    return top_docs_score_list

def get_tf_idf_rank(document_list, query, top_k):
    document_score = {}
    for document in document_list:
        term_frequency = get_term_frequency(document)
        inv_doc_frequency = get_inv_doc_frequency(document)
        tf_idf_score = term_frequency * inv_doc_frequency
        document_score[document] = tf_idf_score
    top_k_documents = get_k_top(document_score, top_k)
    return top_k_documents

In [88]:
def get_custom_rank(document_list, query, top_k):
    discount_factor = 0.75
    relevant_scores = get_tf_idf_rank(docs, query, 50)
    relevant_documents = [doc for doc, score in relevant_scores]
    document_score = {}
    for count, document in enumerate(relevant_documents):
        term_frequency = get_term_frequency(document)
        inv_doc_frequency = get_inv_doc_frequency(document)
        tf_idf_score = term_frequency * inv_doc_frequency
        d_val = ((discount_factor)**count)
        custom_score = (d_val + (tf_idf_score)) - math.log(tf_idf_score)
        document_score[document] = custom_score
    top_k_documents = get_k_top(document_score, top_k)
    return top_k_documents

In [89]:
for query_code, query in queries.items():
    docs = generate_FTQ(query, inverted_index_dict)
    docs_score = get_custom_rank(docs, query, 50)

In [90]:
docs_score

[(91255752, 6.62219744337424),
 (91295700, 6.070629253065326),
 (89363736, 5.883129253065325),
 (90319778, 5.742504253065325),
 (91333684, 5.4970671961436075),
 (88310191, 5.035790957153294),
 (88287652, 4.976464785278294),
 (91215324, 4.931970156372044),
 (91218591, 4.8985991846923564),
 (89002042, 4.438175358650875),
 (91296148, 4.419404187081051),
 (91167805, 4.405325808403683),
 (88063114, 4.203423661287281),
 (91124570, 4.019151647153143),
 (91269932, 3.9301495328432434),
 (90074399, 3.8773882933876944),
 (89001955, 3.691512823841468),
 (90382798, 3.689007174902063),
 (90023136, 3.6871279381975097),
 (90355638, 3.6857185106690946),
 (90066683, 3.6846614400227833),
 (88073722, 3.68386863703805),
 (91369527, 3.412777657716157),
 (91213849, 3.4123317060372442),
 (89162735, 3.31131855524717),
 (91325316, 3.29022946302501),
 (90313621, 3.2900413271604685),
 (89135238, 3.289900225262063),
 (91046708, 3.2897943988382585),
 (90135678, 3.2897150290204054),
 (89218414, 3.289655501657015),
 

In [23]:
def tf_ranking(docs, query):
    docs_score = {}
    for doc in docs:
        doc_length = len(index_dict[doc])
        absolute_freq = 0
        normalized_freq = 0
        for term in query:
            try:
                term_freq = len(index_dict[doc][term])
            except:
                term_freq = 0
            absolute_freq += term_freq
        normalized_freq = absolute_freq / doc_length
        docs_score[doc] = normalized_freq
    docs_score = list( sorted(docs_score.items(),
                           key=lambda item: item[1],
                           reverse=True))
    return docs_score[:50]

for query_code, query in queries.items():
    docs = generate_FTQ(query, inverted_index_dict)
    docs_score_1 = tf_ranking(docs, query)
    docs_score_2 = get_tf_rank(docs, query, 50)

In [26]:
docs_score_

[(91255752, 0.2857142857142857),
 (91295700, 0.2727272727272727),
 (89363736, 0.2727272727272727),
 (90319778, 0.2727272727272727),
 (91333684, 0.26666666666666666),
 (88310191, 0.25),
 (88287652, 0.25),
 (91215324, 0.25),
 (91218591, 0.25),
 (89002042, 0.23076923076923078),
 (91296148, 0.23076923076923078),
 (91167805, 0.23076923076923078),
 (88063114, 0.2222222222222222),
 (91124570, 0.21428571428571427),
 (91269932, 0.21052631578947367),
 (90074399, 0.20833333333333334),
 (89001955, 0.2),
 (90382798, 0.2),
 (90023136, 0.2),
 (90355638, 0.2),
 (90066683, 0.2),
 (88073722, 0.2),
 (91369527, 0.1875),
 (91213849, 0.1875),
 (89162735, 0.1827956989247312),
 (91325316, 0.18181818181818182),
 (90313621, 0.18181818181818182),
 (89135238, 0.18181818181818182),
 (91046708, 0.18181818181818182),
 (90135678, 0.18181818181818182),
 (89218414, 0.18181818181818182),
 (89353008, 0.18181818181818182),
 (89156406, 0.18181818181818182),
 (89026234, 0.17894736842105263),
 (91369519, 0.17777777777777778)

In [58]:
for query_code, query in queries.items():
    docs = generate_FTQ(query, inverted_index_dict)
    docs_score_1 = tf_idf_ranking(docs, query)
    docs_score_2 = get_tf_idf_rank(docs, query, 50)

In [59]:
docs_score_1 == docs_score_2

True

In [52]:
for query_code, query in queries.items():
    docs = generate_FTQ(query, inverted_index_dict)
    docs_score_bool = boolean_ranking(docs, query)
    docs_score_bool2 = get_boolean_rank(docs, query, 50)

In [53]:
docs_score_bool == docs_score_bool2

True

In [67]:
docs_score_1

[(91255752, 7.657940530763741),
 (91295700, 7.309852324819935),
 (89363736, 7.309852324819935),
 (90319778, 7.309852324819935),
 (91333684, 7.147411162046159),
 (88310191, 6.700697964418274),
 (88287652, 6.700697964418274),
 (91215324, 6.700697964418274),
 (91218591, 6.700697964418274),
 (89002042, 6.185259659463022),
 (91296148, 6.185259659463022),
 (91167805, 6.185259659463022),
 (88063114, 5.956175968371799),
 (91124570, 5.743455398072806),
 (91269932, 5.64269302266802),
 (90074399, 5.5839149703485615),
 (89001955, 5.360558371534619),
 (90382798, 5.360558371534619),
 (90023136, 5.360558371534619),
 (90355638, 5.360558371534619),
 (90066683, 5.360558371534619),
 (88073722, 5.360558371534619),
 (91369527, 5.025523473313705),
 (91213849, 5.025523473313705),
 (89162735, 4.899435070757447),
 (91325316, 4.87323488321329),
 (90313621, 4.87323488321329),
 (89135238, 4.87323488321329),
 (91046708, 4.87323488321329),
 (90135678, 4.87323488321329),
 (89218414, 4.87323488321329),
 (89353008, 4.