In [1]:
import re
import pandas as pd
from collections import defaultdict
import nltk, string
import pickle, math
from tqdm import tqdm

# Field markers that appear in the document collection file. The keys name
# the field each marker introduces; only D_SID and D_SOURCE are used for
# splitting records (see read_doc_file).
var_dict = {
    "D_SID": '.I ',
    "D_UI": '.U',
    "D_MESH": '.M',
    "D_TITLE": '.T',
    "D_PUB": '.P',
    "D_ABS": '.W',
    "D_SOURCE": '.S',
    "D_AUTHOR": '.A',
}

nltk.download('stopwords')
stpwrd = nltk.corpus.stopwords.words('english')
# Treat the bare field markers as stop words as well. Every marker declared
# in var_dict is listed exactly once (the author marker '.A' was previously
# missing while '.M' was listed twice).
stpwrd.extend(['.U', '.S', '.M', '.T', '.P', '.W', '.A', '.I'])

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Anirudh\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
#############################################################################################################################
############################### FUNCTIONS TO PROCESS AND CLEAN THE DATA #####################################################
#############################################################################################################################

def get_id(doc):
    """Return the numeric identifier stored in a document's header chunk.

    Args:
        doc: Sequence whose first element is the header string of a record.

    Returns:
        The third newline-separated line of ``doc[0]`` converted to ``int``.

    Raises:
        IndexError: If the header has fewer than three lines.
        ValueError: If the third line is not an integer.
    """
    header_lines = doc[0].split("\n")
    # Line index 2 carries the identifier value.
    return int(header_lines[2])

def remove_stopwords(text, stpwrd):
    """Drop every whitespace-separated token of ``text`` found in ``stpwrd``.

    The comparison is exact and case-sensitive: a token is removed only when
    it is equal to one of the stop words.

    Args:
        text: Input string.
        stpwrd: Iterable of stop-word strings.

    Returns:
        The surviving tokens joined by single spaces.
    """
    # A set gives constant-time membership tests instead of scanning the
    # whole stop-word list once per token.
    stop_set = set(stpwrd)
    return ' '.join(word for word in text.split() if word not in stop_set)

def clean_text(input_text):
    """Normalise raw text and return it as a list of lower-case words.

    Steps, in order: remove every dot followed by a capital letter (the field
    markers), remove ASCII punctuation, turn newlines/tabs/carriage returns
    into spaces, remove digits, lower-case, then split on whitespace.

    Args:
        input_text: The string to normalise.

    Returns:
        List of cleaned tokens (empty when nothing is left).
    """
    # Field markers must go before the punctuation pass removes their dot.
    without_markers = re.sub(r'\.[A-Z]', '', input_text)
    punctuation_table = str.maketrans("", "", string.punctuation)
    without_punct = without_markers.translate(punctuation_table)
    # Line breaks and tabs become plain spaces; digits are dropped entirely.
    spaced = re.sub(r'[\n\t\r]', ' ', without_punct)
    letters_only = re.sub(r'[\d]', '', spaced)
    return letters_only.lower().strip().split()

def get_text(doc):
    """Return the cleaned token list for a document's text chunk.

    The text is ``doc[1]``. Stop words are filtered first (on the raw,
    not-yet-lower-cased text, using the module-level ``stpwrd`` list) and the
    result is then normalised with ``clean_text``.

    Args:
        doc: Sequence whose second element is the text of a record.

    Returns:
        List of cleaned tokens.
    """
    body = doc[1]
    return clean_text(remove_stopwords(body, stpwrd))
    
def read_doc_file(path):
    """Read the collection file and split it into per-document chunks.

    The file content is split on the sequential-ID marker; whatever precedes
    the first marker is discarded. Each record is then split once at the
    first source marker that starts a line.

    Args:
        path: Path of the document collection file.

    Returns:
        A list with one list per record: ``[header, text]`` when the source
        marker is present, otherwise a single-element list.
    """
    with open(path, 'r') as handle:
        raw = handle.read()
    records = raw.split(var_dict["D_SID"])[1:]
    source_marker = "\n" + var_dict["D_SOURCE"]
    return [record.split(source_marker, maxsplit=1) for record in records]

def process_data(documents):
    """Map each document ID to its cleaned token list.

    Records that cannot be parsed are skipped so one malformed entry does not
    abort the whole run. If two records share an ID the later one wins.

    Args:
        documents: Iterable of records as produced by ``read_doc_file``.

    Returns:
        Dict of ``doc_id -> list of tokens``.
    """
    text_id_dict = {}
    for doc in tqdm(documents):
        try:
            doc_id = get_id(doc)
            doc_text = get_text(doc)
        except (IndexError, ValueError):
            # Malformed record: missing header lines, a non-numeric ID, or no
            # text chunk. Only these parse failures are skipped; a bare
            # `except` would also swallow KeyboardInterrupt and make the
            # long-running loop impossible to stop.
            continue
        text_id_dict[doc_id] = doc_text
    return text_id_dict

#############################################################################################################################
############################### FUNCTIONS TO GENERATE DOCUMENT INDICES AND INVERTED INDICES #################################
#############################################################################################################################

def index_generator(termlists):
    """Build a positional index for every document.

    Args:
        termlists: Dict of ``document key -> list of terms``.

    Returns:
        Dict of ``document key -> {term: [positions]}`` where the positions
        are the indices at which the term occurs in that document's list.
    """
    positional_index = {}
    for doc_key, tokens in tqdm(termlists.items()):
        positions_by_term = {}
        for position, token in enumerate(tokens):
            # First sighting creates the list; later ones extend it.
            positions_by_term.setdefault(token, []).append(position)
        positional_index[doc_key] = positions_by_term
    return positional_index

def inverted_index_generator(generated_index_list):
    """Invert a per-document positional index into a per-term index.

    Args:
        generated_index_list: Dict of ``document key -> {term: [positions]}``.

    Returns:
        Dict of ``term -> {document key: [positions]}``. The position lists
        are shared with the input, not copied.
    """
    postings = {}
    for doc_key, doc_index in tqdm(generated_index_list.items()):
        for token, positions in doc_index.items():
            # Create the term's sub-dictionary on first use, then record
            # this document's positions under it.
            postings.setdefault(token, {})[doc_key] = positions
    return postings

#############################################################################################################################
############################### FUNCTIONS TO PARSE AND PROCESS THE QUERIES ##################################################
#############################################################################################################################

def parse_queries(file):
    """Parse ``<top>``-delimited query records from an iterable of lines.

    Inside a ``<top>`` ... ``</top>`` block:
      * a ``<num>`` line supplies ``'num'`` (the text after the first colon),
      * a ``<title>`` line supplies ``'title'`` (the text after the tag),
      * the ``<desc>`` header line itself is ignored,
      * every other line longer than two characters is description text;
        consecutive description lines are joined with a single space.

    Lines outside a ``<top>`` block are ignored.

    Args:
        file: Iterable yielding lines (for example an open text file).

    Returns:
        List of dicts, one per completed ``<top>`` block.
    """
    queries = []
    current_query = None
    for raw_line in file:
        # Strip only the line terminator. Slicing off the last character
        # would corrupt a final line that has no trailing newline and make
        # the closing tag of the last query unrecognisable.
        line = raw_line.rstrip('\r\n')
        if '<top>' in line:
            current_query = {}
        elif '</top>' in line:
            if current_query is not None:
                queries.append(current_query)
            current_query = None
        elif current_query is None:
            # Stray text before the first or between two query blocks.
            continue
        elif '<num>' in line:
            current_query['num'] = line.split(':')[1].strip()
        elif '<title>' in line:
            current_query['title'] = line.split('>')[1].strip()
        elif '<desc>' not in line and len(line) > 2 and line.strip():
            if 'description' in current_query:
                # Continuation of a multi-line description.
                current_query['description'] += ' ' + line
            else:
                current_query['description'] = line
    return queries

def generate_FTQ(curr_query, inverted_index_dict):
    """Collect every document that contains at least one query term.

    Args:
        curr_query: Iterable of query terms.
        inverted_index_dict: Dict of ``term -> {document: positions}``.

    Returns:
        List of the distinct documents found (in set iteration order).
    """
    matching_docs = set()
    for term in curr_query:
        if term not in inverted_index_dict:
            # Terms unknown to the index contribute nothing.
            continue
        matching_docs |= set(inverted_index_dict[term])
    return list(matching_docs)

def extract_queries(queries):
    """Turn parsed query records into cleaned token lists keyed by number.

    Each query's ``'description'`` has stop words removed (module-level
    ``stpwrd``) and is then normalised with ``clean_text``.

    Args:
        queries: Iterable of dicts with ``'num'`` and ``'description'`` keys.

    Returns:
        Dict of ``query number -> list of tokens``.
    """
    cleaned_by_number = {}
    for query in queries:
        number = query['num']
        without_stopwords = remove_stopwords(query['description'], stpwrd)
        cleaned_by_number[number] = clean_text(without_stopwords)
    return cleaned_by_number

def query_reader(filename):
    """Read a query file and return its cleaned queries.

    Args:
        filename: Path of the query file.

    Returns:
        Dict of ``query number -> list of tokens`` as produced by
        ``extract_queries``.
    """
    # The file is only read, so open it read-only; 'r+' would demand write
    # permission for no reason and fail on read-only files.
    with open(filename, 'r') as file:
        queries = parse_queries(file)
    # Clean and tokenise the parsed queries.
    return extract_queries(queries)

#############################################################################################################################
############################### FUNCTIONS TO OBTAIN THE RANKING SCORES ######################################################
#############################################################################################################################

def get_k_top(document_score, top_k):
    """Return the ``top_k`` highest-scoring ``(document, score)`` pairs.

    Args:
        document_score: Dict of ``document -> score``.
        top_k: Maximum number of pairs to return.

    Returns:
        List of ``(document, score)`` tuples in descending score order; ties
        keep the order in which they appear in ``document_score``.
    """
    ranked = list(document_score.items())
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    return ranked[:top_k]

def get_intersection_score(document, query):
    """Count the distinct query terms that occur in a document.

    Reads the module-level ``index_dict``.

    Args:
        document: Key of the document in ``index_dict``.
        query: Iterable of query terms.

    Returns:
        Number of distinct terms shared by the query and the document.
    """
    doc_terms = index_dict[document]
    shared_terms = set(doc_terms).intersection(set(query))
    return len(shared_terms)

def get_boolean_rank(document_list, query, top_k):
    """Rank documents by how many distinct query terms they contain.

    Args:
        document_list: Iterable of candidate document keys.
        query: Iterable of query terms.
        top_k: Number of results to keep.

    Returns:
        List of up to ``top_k`` ``(document, score)`` tuples, best first.
    """
    overlap_by_document = {
        document: get_intersection_score(document, query)
        for document in document_list
    }
    return get_k_top(overlap_by_document, top_k)

def get_term_frequency(document, query):
    """Return the normalised number of query-term occurrences in a document.

    Reads the module-level ``index_dict``. The total occurrence count of all
    query terms is divided by the number of entries in the document's index,
    i.e. by its count of distinct terms.

    Args:
        document: Key of the document in ``index_dict``.
        query: Iterable of query terms.

    Returns:
        Occurrence count divided by the document's distinct-term count.

    Raises:
        ZeroDivisionError: If the document has no indexed terms.
    """
    term_positions = index_dict[document]
    distinct_term_count = len(term_positions)
    # Each position list holds one entry per occurrence of the term.
    occurrences = sum(
        len(term_positions[term]) for term in query if term in term_positions
    )
    return occurrences / distinct_term_count

def get_inv_doc_frequency(document, query):
    """Return the summed inverse document frequency of the query terms.

    For every query term present in the module-level ``inverted_index_dict``
    this adds ``log(N / df)``, where ``N`` is the number of documents in the
    module-level ``index_dict`` and ``df`` is the number of documents whose
    postings contain the term. Using the document count for ``N`` (rather
    than the vocabulary size) keeps every contribution non-negative.

    Args:
        document: Unused; kept so existing callers keep working.
        query: Iterable of query terms.

    Returns:
        Sum of the per-term IDF values (``0`` when no term is indexed).
    """
    total_documents = len(index_dict)
    cumulative_inv_doc_frequency = 0
    for term in query:
        postings = inverted_index_dict.get(term)
        # Unknown terms, or terms with an empty postings dict, add nothing.
        if postings:
            cumulative_inv_doc_frequency += math.log(total_documents / len(postings))
    return cumulative_inv_doc_frequency

def get_tf_rank(document_list, query, top_k):
    """Rank documents by their normalised query-term frequency.

    Args:
        document_list: Iterable of candidate document keys.
        query: Iterable of query terms.
        top_k: Number of results to keep.

    Returns:
        List of up to ``top_k`` ``(document, score)`` tuples, best first.
    """
    frequency_by_document = {
        document: get_term_frequency(document, query)
        for document in document_list
    }
    return get_k_top(frequency_by_document, top_k)

def get_tf_idf_rank(document_list, query, top_k):
    """Rank documents by TF-IDF: the sum over query terms of ``tf * idf``.

    Each term's frequency in the document is weighted by that term's own
    inverse document frequency. Multiplying the total term frequency by the
    total IDF of the query would scale every document by the same per-query
    constant and therefore reproduce the plain TF ranking.

    Args:
        document_list: Iterable of candidate document keys.
        query: Iterable of query terms.
        top_k: Number of results to keep.

    Returns:
        List of up to ``top_k`` ``(document, score)`` tuples, best first.
    """
    document_score = {}
    # get_inv_doc_frequency only looks at the term's postings, so its value
    # for a given term is the same for every document and can be cached.
    idf_by_term = {}
    for document in document_list:
        tf_idf_score = 0
        for term in query:
            if term not in idf_by_term:
                idf_by_term[term] = get_inv_doc_frequency(document, [term])
            idf = idf_by_term[term]
            if idf:
                tf_idf_score += get_term_frequency(document, [term]) * idf
        document_score[document] = tf_idf_score
    # Keep the top K relevant documents.
    return get_k_top(document_score, top_k)

def get_custom_rank(document_list, query, top_k, discount_factor=0.75):
    """Re-rank the TF-IDF top ``top_k`` documents with a custom score.

    The document at TF-IDF rank ``r`` (0-based) with score ``s`` receives
    ``discount_factor ** r + s - log(s)``.

    Args:
        document_list: Iterable of candidate document keys.
        query: Iterable of query terms.
        top_k: Number of results to keep.
        discount_factor: Base of the rank discount; defaults to 0.75.

    Returns:
        List of up to ``top_k`` ``(document, custom score)`` tuples, best
        first.
    """
    # The initial ranking already carries each document's TF-IDF score, so
    # it is reused instead of being recomputed per document.
    relevant_scores = get_tf_idf_rank(document_list, query, top_k)
    document_score = {}
    for count, (document, tf_idf_score) in enumerate(relevant_scores):
        # The discount shrinks as the TF-IDF rank of the document drops.
        d_val = discount_factor ** count
        custom_score = d_val + tf_idf_score
        if tf_idf_score > 0:
            custom_score -= math.log(tf_idf_score)
        # else: log is undefined for non-positive scores; the log term is
        # omitted rather than aborting the whole run with a ValueError.
        document_score[document] = custom_score
    # Keep the top K relevant documents.
    return get_k_top(document_score, top_k)

#############################################################################################################################
#############################################################################################################################
#############################################################################################################################

In [3]:
# Build the positional and inverted indices from the raw collection and
# persist both so later cells can reload them without re-processing.
print('Reading documents...')
documents = read_doc_file('ohsumed.88-91')

print('Preprocessing data...')
cleaned_data = process_data(documents)

print('Generating indices...')
index_dict = index_generator(cleaned_data)

print('Generating inverted indices...')
inv_index_dict = inverted_index_generator(index_dict)

# Context managers close the files even if pickling fails part-way.
print('Saving the index file...')
with open("./pickle_files/index.pkl", "wb") as index_pickle:
    pickle.dump(index_dict, index_pickle)

print('Saving the inverted index file...')
with open("./pickle_files/inverted_index.pkl", "wb") as inverted_index_pickle:
    pickle.dump(inv_index_dict, inverted_index_pickle)

Reading documents...
Preprocessing data...


100%|████████████████████████████████████████████████████████████████████████| 293867/293867 [01:37<00:00, 2999.85it/s]


Generating indices...


100%|███████████████████████████████████████████████████████████████████████| 293856/293856 [00:23<00:00, 12736.77it/s]


Generating inverted indices...


100%|███████████████████████████████████████████████████████████████████████| 293856/293856 [00:11<00:00, 25497.58it/s]


Saving the index file...
Saving the inverted index file...


In [3]:
# Load the queries and the previously saved indices.
queries = query_reader('query.ohsu.1-63')
# NOTE: pickle.load can execute arbitrary code; only load index files that
# were produced by this notebook. The context managers make sure the file
# handles are closed once loading finishes.
with open("./pickle_files/inverted_index.pkl", "rb") as inv_index_file:
    inverted_index_dict = pickle.load(inv_index_file)
with open("./pickle_files/index.pkl", "rb") as index_file:
    index_dict = pickle.load(index_file)

In [7]:
def get_ranking_scores(method, inverted_index_dict, index_dict):
    """Score every query with the chosen method and write the result file.

    For each entry of the module-level ``queries`` dict the candidate
    documents are gathered with ``generate_FTQ``, ranked, and the top 50 are
    written to ``./output/<method>.txt``, one tab-separated line per result:
    query id, ``Q0``, document, rank (1-based), score, method.

    Args:
        method: One of ``'Boolean'``, ``'TF'``, ``'TF-IDF'`` or ``'Custom'``.
        inverted_index_dict: Dict of ``term -> {document: positions}`` used
            to find candidate documents.
        index_dict: Not used directly here; the ranking functions read the
            module-level ``index_dict``.

    Raises:
        ValueError: If ``method`` is not one of the supported names.
    """
    rankers = {
        'Boolean': get_boolean_rank,
        'TF': get_tf_rank,
        'TF-IDF': get_tf_idf_rank,
        'Custom': get_custom_rank,
    }
    # Reject unknown methods up front instead of failing later on an
    # unassigned result variable after the output file was already created.
    if method not in rankers:
        raise ValueError(
            'Unknown ranking method: ' + repr(method)
            + '. Expected one of: ' + ', '.join(rankers)
        )
    rank_documents = rankers[method]

    print('Generating Scores...')
    print('Method:', method)
    file_name = method + '.txt'
    top_k = 50
    # The context manager flushes and closes the output file even when a
    # ranking step raises.
    with open('./output/' + file_name, 'w') as f:
        for qid, query in tqdm(queries.items()):
            document_list = generate_FTQ(query, inverted_index_dict)
            generated_scores = rank_documents(document_list, query, top_k)
            for rank, (document, score) in enumerate(generated_scores, start=1):
                f.write(qid + "\tQ0\t" + str(document) + "\t" + str(rank) + "\t" + str(score) + "\t" + method + "\n")
    print('\n')
    print('Score generation complete...')
    print('Please check the file in the output sub-directory')

# Show the available ranking methods, read the user's choice and run it.
menu_text = 'Choose the ranking method: \n 1) Boolean\n 2) TF\n 3) TF-IDF\n 4) Custom\n'
print(menu_text)
method = input('Enter the method to be used: ')
get_ranking_scores(method, inverted_index_dict, index_dict)

Choose the ranking method: 
 1) Boolean
 2) TF
 3) TF-IDF
 4) Custom

Enter the method to be used: Custom
Generating Scores...
Method: Custom


100%|██████████████████████████████████████████████████████████████████████████████████| 63/63 [00:15<00:00,  4.00it/s]



Score generation complete...
Please check the file in the output sub-directory



