In [12]:
#Pre-Processing the Research Papers and Stop Words.
import glob as gb
import re
from collections import defaultdict


document_list = defaultdict(str) #Document ID -> lowercased raw text of that research paper
directory = 'ResearchPapers/' #Directory Name
originals = set() #Set of all Document IDs to be used in Query Processing
total_docs = 0
#Reading all text files (research papers) in the directory
for filename in gb.glob(directory + '*.txt'):
    #The document ID is the first run of digits in the file's own name.
    #The pattern refuses to match across a path separator, so digits that
    #appear in a parent directory name are never mistaken for the ID.
    filenum = re.search(r'(\d+)[^\\/]*$', filename)
    if filenum is None:
        continue #Files without a number in their name are skipped
    doc_id = int(filenum.group(1)) #group(1) is the matched digit string
    with open(filename, 'r', encoding='cp1252') as f:
        document_list[doc_id] = f.read().lower()
    originals.add(doc_id)

#Derive the collection size from the unique IDs so that two files mapping to
#the same ID cannot make total_docs (used for IDF) larger than the number of
#documents actually stored.
total_docs = len(originals)

#Reading Stopwords Text File
with open("Stopword-List.txt", 'r') as f:
    stopwords = f.read().strip() #Cleaning the String of any leading or trailing new line character
stopwords = stopwords.replace(" ", "") #Removing any extra whitespace characters
stopwords = stopwords.replace('\n', " ") #New lines become single spaces: the result is ONE space-separated string of stop words

In [13]:
#Cleaning data 
def clean_data():
    """Return a cleaned copy of every text in the global ``document_list``.

    Each character that is not a lowercase letter a-z (digits, punctuation,
    whitespace, ...) is replaced by a single space, so the result can be
    split on whitespace. The input mapping is not modified.

    Returns:
        defaultdict(str) mapping document ID -> cleaned text.
    """
    cleaned_data = defaultdict(str)
    cleaned_data.update(
        (docID, re.sub(r'[^a-z]', " ", text))
        for docID, text in document_list.items()
    )
    return cleaned_data

document_list = clean_data() #Rebind document_list to the cleaned texts returned by clean_data()

In [27]:
#Tokenization
def tokenize(doc):
    """Split a cleaned document into words and drop the stop words.

    Args:
        doc: text whose words are separated by whitespace.

    Returns:
        List of the words of ``doc`` (in order, duplicates kept) that are
        not stop words.
    """
    #The global `stopwords` is a single space-separated string. Testing
    #`word in stopwords` on a string is a SUBSTRING test, which would also
    #discard ordinary words such as "he" (found inside "the"). Build a set
    #of whole words so that only exact stop words are removed.
    if isinstance(stopwords, str):
        stop_set = set(stopwords.split())
    else:
        stop_set = set(stopwords)
    return [word for word in doc.split() if word not in stop_set]

#Tokenize every cleaned document: document ID -> list of its non-stop-word tokens
token_list = defaultdict(list)
token_list.update((docID, tokenize(doc)) for docID, doc in document_list.items())


In [28]:
#Stemming
from nltk import PorterStemmer

stemmer = PorterStemmer() #Using Built-in Stemmer

#Stemmed tokens WITHOUT stop words: document ID -> list of stems
stemmed_list = defaultdict(list)
for docID, doc in token_list.items():
    stemmed_list[docID] = [stemmer.stem(word) for word in doc] #Creates a Stemmed List and Assigns it to its corresponding Document

#Stemmed tokens INCLUDING stop words: document ID -> list of stems.
#PositionalIndex.buildIndex reads this name, so it must be defined for the
#notebook to run from a fresh kernel. It is built from every word of the
#text in document_list so that stop words still occupy a position.
#NOTE(review): the earlier commented-out version iterated over an undefined
#`tokens_with_stopword`; splitting document_list is presumed to be the intent.
stemmed_sw_list = defaultdict(list)
for docID, doc in document_list.items():
    stemmed_sw_list[docID] = [stemmer.stem(word) for word in doc.split()]

In [29]:
#Term Frequency & Document Frequency
from math import log10
term_occur = defaultdict(set) #term -> set of document IDs that contain it
term_freq = defaultdict(lambda: defaultdict(int)) #document ID -> term -> number of occurrences
for docID, terms in stemmed_list.items():
    for term in terms:
        term_freq[docID][term] += 1 #Term Frequency of each term in its corresponding document
        term_occur[term].add(docID)

#Document Frequency: in how many documents each term occurs
doc_freq = {term: len(doc_ids) for term, doc_ids in term_occur.items()}

#Inverse Document Frequency: log10(total number of documents / document frequency)
idf = {term: log10(total_docs / freq) for term, freq in doc_freq.items()}

In [30]:
#Per-document term lists (kept under the name inv_index): document ID -> list of the distinct terms it contains
inv_index = defaultdict(list)

#TF-IDF Weighting: tf_idf[term][document ID] = term count in that document * idf of the term
tf_idf = defaultdict(lambda: defaultdict(float))
for docID, counts in term_freq.items():
    for term in counts:
        tf_idf[term][docID] = counts[term] * idf[term]
        inv_index[docID].append(term)

# print(inv_index)

In [35]:
#Building Vector Space Model
#vsm: document ID -> {term: tf-idf weight}, terms ordered from heaviest to lightest
vsm = defaultdict(lambda: defaultdict(float))
for docID, terms in inv_index.items():
    #Pair every term of the document with its tf-idf weight
    weighted_terms = [(term, tf_idf[term][docID]) for term in terms]
    #Stable sort, highest weight first (ties keep their original order)
    weighted_terms.sort(key=lambda pair: pair[1], reverse=True)
    vsm[docID] = dict(weighted_terms) #Adding document vector to its document number.
# print(vsm[16])

In [38]:
#Cosine similarity (length-normalized dot product) between a query vector and a document vector
from math import sqrt

def cosine_similarity(query, document):
    """Return the cosine similarity of two sparse term-weight vectors.

    Args:
        query: mapping term -> weight.
        document: mapping term -> weight.

    Returns:
        dot(query, document) / (|query| * |document|), or 0 when either
        vector has zero length (which includes an empty mapping).
    """
    #Only terms present in BOTH vectors contribute to the dot product.
    #`set(a) and set(b)` is a boolean expression that evaluates to set(b),
    #not an intersection, so the key views are intersected explicitly.
    shared_terms = query.keys() & document.keys()
    dot_prod = sum(query[term] * document[term] for term in shared_terms)
    query_mag = sqrt(sum(v**2 for v in query.values()))
    doc_mag = sqrt(sum(v**2 for v in document.values()))
    if query_mag == 0 or doc_mag == 0:
        return 0
    return dot_prod / (query_mag * doc_mag)


In [37]:
#Function to preprocess raw query to the algorithm specific format.
from collections import Counter
def preproccess_query(raw_query: str):
    """Turn a free-text query into a bag of stemmed terms.

    The query is lowercased, every character outside a-z is replaced by a
    space, the text is passed through ``tokenize`` and each resulting token
    is stemmed with the global ``stemmer``.

    Returns:
        Counter mapping stemmed term -> number of occurrences in the query.
    """
    letters_only = re.sub(r'[^a-z]', " ", raw_query.lower())
    stems = map(stemmer.stem, tokenize(letters_only))
    return Counter(stems)

In [52]:
def run_vsm_queries(query, alpha=0.03):
    """Rank the documents of the global ``vsm`` against a free-text query.

    Args:
        query: raw query string; it is converted with ``preproccess_query``.
        alpha: minimum cosine similarity a document needs to be returned.

    Returns:
        List of document IDs ordered from most to least similar, or None
        when no document reaches ``alpha``.
    """
    query_vector = preproccess_query(query)
    scored = [
        (docID, cosine_similarity(query_vector, doc_vector))
        for docID, doc_vector in vsm.items()
    ]
    #sorted() is stable, so documents with equal scores keep their vsm order
    hits = sorted(
        (pair for pair in scored if pair[1] >= alpha),
        key=lambda pair: pair[1],
        reverse=True,
    )
    return [docID for docID, _ in hits] or None
        
# if __name__ == '__main__':
#     query = input("Enter your Query: ")
#     ranks = run_vsm_queries(query, 0.03)
#     if ranks is None:
#         print("No relevant documents found.")
#     else: 
#         for id, sim in ranks:
#             print(f"Document ID: {id} with Similarity: {sim}")


In [48]:
#Inverted Index
class InvertedIndex:
    """Boolean retrieval over an inverted index (term -> set of document IDs).

    Attributes:
        originals: collection of all document IDs; used to evaluate NOT.
        document: mapping document ID -> iterable of (already stemmed) terms.
        index: defaultdict(set) mapping term -> set of document IDs.

    Query terms are stemmed with the module-level ``stemmer`` before lookup.
    """

    def __init__(self, document: dict, totalDocuments: set):
        self.originals = totalDocuments
        self.document = document
        self.index = defaultdict(set) #Dictionary of Set to get Document ID in the Set of each term

    def buildIndex(self):
        """Fill ``self.index`` from ``self.document``."""
        for docID, doc in self.document.items():
            for words in doc:
                self.index[words].add(docID)
        self.sortDoc()

    #Helper Function to Sort Postings List
    def sortDoc(self):
        """Rebuild every postings set from its sorted members.

        Sets carry no order guarantee, so query results are additionally
        sorted at the point where they are returned.
        """
        for key in list(self.index):
            self.index[key] = set(sorted(self.index[key]))

    #Testing Function to Test Inverted Index
    def displayIndex(self):
        """Print every term with its postings, followed by the term count."""
        count = 0
        for key, value in self.index.items():
            print(f"Term: {key} -> Posting List: {value}")
            count+=1
        print(f"Number of Terms: {count}")

    #Helper Function To Retrieve Postings List
    def getPostingList(self, term):
        """Return the set of document IDs for ``term`` (stemmed first).

        An unknown term yields an empty set. ``.get`` is used instead of
        ``[]`` so that a lookup never inserts a new empty entry into the
        defaultdict.
        """
        return self.index.get(stemmer.stem(term), set())

    #AND Function Helper Function
    def Intersection(self, list1, list2):
        """Return the sorted list of IDs present in both inputs."""
        return sorted(set(list1) & set(list2))

    #NOT Operation Helper Function
    def inversePostingList(self, w1):
        """Return the sorted IDs of all documents NOT containing ``w1``.

        Returns None when every document contains the term.
        """
        excluded = self.getPostingList(w1)
        inverseList = sorted(i for i in self.originals if i not in excluded)
        return inverseList if inverseList else None

    #OR Operation Helper Function
    def union(self, list1, list2):
        """Return the sorted list of IDs present in either input."""
        return sorted(set(list1) | set(list2))

    @staticmethod
    def _is_set(flag):
        """True when an operator / NOT slot was actually filled in.

        None, the empty string and the string "None" all mean "absent".
        """
        return flag not in (None, "", "None")

    def _operand(self, term, negated):
        """Return the set of documents matched by one (optionally negated) term."""
        if not term:
            return set() #Blank or missing term matches nothing
        if negated:
            #inversePostingList returns None for "no documents"
            return set(self.inversePostingList(term) or ())
        return set(self.getPostingList(term))

    def _combine(self, left, right, op):
        """Apply AND / OR to two ID collections; None for any other operator."""
        if left is None or right is None:
            return None
        if op == "AND":
            return self.Intersection(left, right)
        if op == "OR":
            return self.union(left, right)
        return None

    #Processing Queries
    def processQueries(self, words: list, op1_not = None, op2_not = None, op3_not = None, op1 = None, op2 = None):
        """Evaluate a Boolean query of one, two or three terms.

        Args:
            words: the query terms, in order; only as many as the operators
                require are read.
            op1_not, op2_not, op3_not: NOT flag for term 1, 2 and 3. None,
                "" and "None" mean "no NOT"; any other value negates the term.
            op1: operator between term 1 and term 2 ("AND" / "OR"), or
                None / "" / "None" when absent.
            op2: operator between term 2 and term 3, same convention.

        Returns:
            * one term (no operators): sorted list of matching IDs, or None
              when nothing matches.
            * two / three terms: sorted list of matching IDs (possibly empty).
            * None when the operator combination is not usable (an unknown
              operator, or op2 given without op1).

        A term that does not occur in the index has an empty postings list,
        so "x OR unknown" still returns the documents of x. With two
        different operators the query is evaluated strictly left to right,
        ((t1 op1 t2) op2 t3); regrouping is only done when both operators
        are the same, because only then is the result unaffected.
        """
        has_op1 = self._is_set(op1)
        has_op2 = self._is_set(op2)

        if not has_op1 and not has_op2: #One Term Query Processing
            matches = sorted(self._operand(words[0], self._is_set(op1_not)))
            return matches or None

        if has_op1 and not has_op2: #Two Term Query Processing
            list1 = self._operand(words[0], self._is_set(op1_not))
            list2 = self._operand(words[1], self._is_set(op2_not))
            return self._combine(list1, list2, op1)

        if not has_op1: #op2 without op1 leaves term 1 and term 2 unconnected
            return None

        #Three Term Query Processing
        list1 = self._operand(words[0], self._is_set(op1_not))
        list2 = self._operand(words[1], self._is_set(op2_not))
        list3 = self._operand(words[2], self._is_set(op3_not))

        if op1 == op2 and len(list3) < len(list1):
            #Same operator twice is associative: combine the smaller pair first
            return self._combine(list1, self._combine(list2, list3, op2), op1)
        return self._combine(self._combine(list1, list2, op1), list3, op2)

In [49]:
#Positional Index
class PositionalIndex:
    """Positional index used for proximity queries.

    ``posIndex`` maps term -> {document ID -> set of 1-based positions of
    the term in that document}.
    """

    def __init__(self):
        self.posIndex = {}

    #Helper Function for Retrieving Term Index List
    def getTermIndexList(self, w1):
        """Return {document ID -> positions} for ``w1``; KeyError if unknown."""
        return self.posIndex[w1]

    #Building Positional Index Here
    def buildIndex(self, documents=None):
        """Add every token position of ``documents`` to the index.

        Args:
            documents: mapping document ID -> sequence of tokens. When
                omitted, the module-level ``stemmed_sw_list`` is used.
        """
        if documents is None:
            documents = stemmed_sw_list
        for docID, tokens in documents.items():
            #Positions are 1-based
            for position, token in enumerate(tokens, start=1):
                self.posIndex.setdefault(token, {}).setdefault(docID, set()).add(position)

    #Processing Proximity Queries Here
    def processQuery(self, words, distance):
        """Find documents in which two terms occur close to each other.

        Args:
            words: sequence whose first two items are the query terms; both
                are stemmed with the module-level ``stemmer``.
            distance: maximum allowed absolute difference between a position
                of the first term and a position of the second term.

        Returns:
            Sorted list of matching document IDs (possibly empty), or None
            when either term is not in the index at all.
        """
        w1 = stemmer.stem(words[0])
        w2 = stemmer.stem(words[1])

        #Validating if words exist or not
        if w1 not in self.posIndex or w2 not in self.posIndex:
            return None

        #Getting Term Index List of Both Words
        dict1 = self.getTermIndexList(w1)
        dict2 = self.getTermIndexList(w2)

        result = []
        for docID, positions1 in dict1.items():
            positions2 = dict2.get(docID) #Direct dictionary lookup instead of scanning a list of keys
            if positions2 is None:
                continue
            #any() stops at the first pair of positions that is close enough
            if any(abs(i - j) <= distance for i in positions1 for j in positions2):
                result.append(docID)

        result.sort()
        return result
                
                

# if __name__ == "__main__":
#     pos_index = PositionalIndex() 
#     pos_index.buildIndex()   
#     print(pos_index.processQuery(["past", "research"], 3))

In [63]:
#Driver Code including GUI
import tkinter as tk
from tkinter import ttk

class InformationRetrievalSearch(tk.Tk):
    """Tkinter window with three tabs: Boolean, proximity and vector-space queries.

    Constructing the window builds an ``InvertedIndex`` from the module-level
    ``stemmed_list`` / ``originals`` and a ``PositionalIndex``; the tabs then
    query those objects and ``run_vsm_queries``.
    """

    #Texts that are pre-filled into the entry boxes as hints
    TERM_PLACEHOLDERS = ("Enter term 1", "Enter term 2", "Enter term 3")
    PROXIMITY_PLACEHOLDER = "Enter proximity value"

    def __init__(self):
        super().__init__()
        self.title("Rohan's Retrieval")

        # Set window size
        self.geometry("800x600")

        # Variables to store input values (each starts as an empty string)
        self.term1_prox_value = tk.StringVar()
        self.term2_prox_value = tk.StringVar()
        self.term1_value = tk.StringVar()
        self.term2_value = tk.StringVar()
        self.term3_value = tk.StringVar()
        self.not1_value = tk.StringVar()
        self.not2_value = tk.StringVar()
        self.not3_value = tk.StringVar()
        self.operator1_value = tk.StringVar()
        self.operator2_value = tk.StringVar()
        self.proximity_value = tk.StringVar()
        self.result = set()
        self.pos_result = set()

        # Create notebook to hold multiple pages
        self.notebook = ttk.Notebook(self)
        self.notebook.pack(expand=True, fill=tk.BOTH)

        #Building Inverted Index as soon as the GUI is Launched
        self.inv_index = InvertedIndex(stemmed_list, originals)
        self.inv_index.buildIndex()

        #Building Positional Index
        self.pos_index = PositionalIndex()
        self.pos_index.buildIndex()

        # Add inverted index page
        self.add_inverted_index_page()

        # Add positional index page
        self.add_positional_index_page()

        #Add Vector Space Model Page
        self.add_vsm_page()

    #---------------------------------------------------------------- helpers
    @classmethod
    def _term_text(cls, variable):
        """Return the lowercased, stripped term typed into an entry.

        An untouched placeholder counts as "nothing entered" and yields "".
        The placeholder check is done BEFORE lowercasing; comparing the
        lowercased text with the capitalised placeholder can never match.
        """
        text = variable.get().strip()
        if text in cls.TERM_PLACEHOLDERS:
            return ""
        return text.lower()

    @staticmethod
    def _choice(variable):
        """Return a dropdown's selection, or the string "None" when it is empty.

        The value is computed locally instead of being written back into the
        variable, so the dropdown on screen never shows the text "None".
        """
        return variable.get() or "None"

    def _add_dropdown(self, frame, values, variable):
        """Create and pack one narrow combobox bound to ``variable``."""
        dropdown = ttk.Combobox(frame, values=values, width=5, textvariable=variable)
        dropdown.pack(pady=5)
        return dropdown

    def _add_placeholder_entry(self, frame, variable, placeholder, width):
        """Create and pack an entry that shows ``placeholder`` until it gets focus."""
        entry = tk.Entry(frame, width=width, textvariable=variable)
        entry.insert(0, placeholder)
        entry.bind("<FocusIn>", lambda event: self.clear_placeholder(event, entry))
        entry.pack(pady=5)
        return entry

    def _add_output_box(self, frame, width, pady):
        """Create the "Retrieved Documents" heading and its read-only text box."""
        output_label = tk.Label(frame, text="Retrieved Documents")
        output_label.pack(pady=5)

        output_text = tk.Text(frame, height=10, width=width)
        output_text.pack(pady=pady)
        output_text.insert(tk.END, "Enter a Query.")
        output_text.config(state="disabled")
        return output_text

    def _show_text(self, output_text, text):
        """Replace the content of a read-only output box with ``text``."""
        output_text.config(state="normal")
        output_text.delete("1.0", tk.END)  #Clear previous content
        output_text.insert(tk.END, text)
        output_text.config(state="disabled")

    #------------------------------------------------------- Boolean queries
    #Processing Inverted Index Queries
    def processInvertedIndexQuery(self, output_text):
        """Read the Boolean-query form, run the query and show the result."""
        words = [
            self._term_text(self.term1_value),
            self._term_text(self.term2_value),
            self._term_text(self.term3_value),
        ]

        #Query Processing; empty dropdowns are passed as the string "None"
        self.result = self.inv_index.processQueries(
            words,
            self._choice(self.not1_value),
            self._choice(self.not2_value),
            self._choice(self.not3_value),
            self._choice(self.operator1_value),
            self._choice(self.operator2_value),
        )
        self.display_result(self.result, output_text)

    #Inverted Index Page
    def add_inverted_index_page(self):
        """Build the "Boolean Queries" tab: [NOT] term1 op [NOT] term2 op [NOT] term3."""
        inverted_index_frame = tk.Frame(self.notebook)

        #Dropdown for NOT operator before term 1
        self._add_dropdown(inverted_index_frame, ["", "NOT"], self.not1_value)

        #Term 1 input
        self._add_placeholder_entry(inverted_index_frame, self.term1_value, "Enter term 1", 30)

        #Dropdown between term 1 and term 2
        self._add_dropdown(inverted_index_frame, ["", "AND", "OR"], self.operator1_value)

        #Dropdown for NOT operator before term 2
        self._add_dropdown(inverted_index_frame, ["", "NOT"], self.not2_value)

        # Term 2 input
        self._add_placeholder_entry(inverted_index_frame, self.term2_value, "Enter term 2", 30)

        #Dropdown between term 2 and term 3
        self._add_dropdown(inverted_index_frame, ["", "AND", "OR"], self.operator2_value)

        #Dropdown for NOT operator before term 3
        self._add_dropdown(inverted_index_frame, ["", "NOT"], self.not3_value)

        #Term 3 input
        self._add_placeholder_entry(inverted_index_frame, self.term3_value, "Enter term 3", 30)

        # Search button (output_text is looked up when the button is pressed)
        inverted_search_button = tk.Button(inverted_index_frame, text="Search", width=20, command=lambda: self.processInvertedIndexQuery(output_text))
        inverted_search_button.pack(pady=10)

        #Heading and output box
        output_text = self._add_output_box(inverted_index_frame, width=70, pady=15)

        self.notebook.add(inverted_index_frame, text="Boolean Queries")

    #----------------------------------------------------- proximity queries
    def processProximityQueries(self, output_text):
        """Read the proximity form, run the query and show the result."""
        words = [self._term_text(self.term1_prox_value), self._term_text(self.term2_prox_value)]
        try:
            distance = int(self.proximity_value.get())
        except ValueError:
            #Placeholder text or any other non-integer input
            self._show_text(output_text, "Please enter a whole number for the proximity value.")
            return
        self.pos_result = self.pos_index.processQuery(words, distance)

        self.display_result(self.pos_result, output_text)

    #Positional Index Page
    def add_positional_index_page(self):
        """Build the "Proximity Queries" tab: two terms and a distance."""
        positional_index_frame = tk.Frame(self.notebook)

        #Search bars for term 1 and term 2
        self._add_placeholder_entry(positional_index_frame, self.term1_prox_value, "Enter term 1", 50)
        self._add_placeholder_entry(positional_index_frame, self.term2_prox_value, "Enter term 2", 50)

        #Entry for proximity integer value
        self._add_placeholder_entry(positional_index_frame, self.proximity_value, "Enter proximity value", 50)

        #Search button (output_text is looked up when the button is pressed)
        positional_search_button = tk.Button(positional_index_frame, text="Search", width=20, command=lambda: self.processProximityQueries(output_text))
        positional_search_button.pack(pady=10)

        #Heading and output box
        output_text = self._add_output_box(positional_index_frame, width=60, pady=10)

        self.notebook.add(positional_index_frame, text="Proximity Queries")

    #---------------------------------------------------- vector space model
    #Vector Space Model
    def add_vsm_page(self):
        """Build the "Vector Space Model" tab: a free-text query box."""
        vsm_frame = tk.Frame(self.notebook)

        # Textbox for entering terms for VSM query
        query_textbox = tk.Text(vsm_frame, height=2, width=60)
        query_textbox.pack(pady=10)

        # Add a search button for VSM queries (output_text is looked up when the button is pressed)
        vsm_search_button = tk.Button(vsm_frame, text="Search", width=20, command=lambda: self.process_vsm_query(output_text, query_textbox))
        vsm_search_button.pack(pady=10)

        # Heading and output box for displaying results
        output_text = self._add_output_box(vsm_frame, width=60, pady=10)

        # Add the VSM frame to the notebook
        self.notebook.add(vsm_frame, text="Vector Space Model")

    def process_vsm_query(self, output_text, query_textbox):
        """Run the text of ``query_textbox`` through the vector space model."""
        # Get the query string from the textbox
        query_string = query_textbox.get("1.0", tk.END).strip()

        # Pass the text itself. Converting a list of words with str() would
        # hand over its printed representation, in which escape sequences
        # for special characters would show up as extra letters.
        vsm_result = run_vsm_queries(query_string)

        # Display VSM query results in the output text box
        self.display_result(vsm_result, output_text)

    #---------------------------------------------------------------- shared
    def clear_placeholder(self, event, entry_widget):
        """Empty an entry on focus if it still shows one of the placeholders."""
        current = entry_widget.get()
        if current in self.TERM_PLACEHOLDERS or current == self.PROXIMITY_PLACEHOLDER:
            entry_widget.delete(0, tk.END)

    def display_result(self, result, output_text):
        """Show ``result`` one item per line, or a "not found" message if it is empty/None."""
        if result:
            result_str = "\n".join(map(str, result))
        else:
            result_str = "No Documents Found...."

        #Display result in the output box
        self._show_text(output_text, result_str)

#Main Driver Code
#Launch the GUI only when this code is executed directly (not when it is imported).
if __name__ == "__main__":
    app = InformationRetrievalSearch() #Constructing the window also builds the inverted and positional indexes
    app.mainloop() #Run the Tk event loop