In [12]:
#Pre-Processing the Research Papers and Stop Words.
import glob as gb
import re
from collections import defaultdict


document_list = defaultdict(str) #docID -> lower-cased raw text of that research paper
directory = 'ResearchPapers/' #Directory Name
originals = set() #Set of all Document IDs to be used in Query Processing
#Reading all text files (research papers) in the directory
for filename in gb.glob(directory + '*.txt'):
    #Only look at the file's own name, so digits that appear in a parent
    #directory can never be mistaken for the document number.
    basename = filename.replace('\\', '/').rsplit('/', 1)[-1]
    filenum = re.search(r'\d+', basename) #First run of digits in the file name is the document ID
    if not filenum:
        continue #Files without a number in their name are not part of the corpus
    doc_id = int(filenum.group()) #group() returns the matched digits as a string
    with open(filename, 'r', encoding='cp1252') as f:
        document_list[doc_id] = f.read().lower()
    originals.add(doc_id)

#Count distinct document IDs rather than files read: if two file names carry the
#same number the later one overwrites the earlier, and N must not be inflated
#when it is used for the inverse document frequency.
total_docs = len(document_list)

stopwords = ""
#Reading Stopwords Text File
with open("Stopword-List.txt", 'r') as f:
    stopwords += f.read().strip() #Cleaning the String of any leading or trailing new line character
stopwords = stopwords.replace(" ", "") #Removing any extra space characters around the words
stopwords = stopwords.replace('\n', " ") #One stop word per line becomes one space-separated string

In [13]:
#Cleaning data 
def clean_data(documents=None):
    """Replace every character that is not a lower-case ASCII letter with a space.

    Digits, punctuation, whitespace and upper-case letters are all replaced,
    one space per character, so the cleaned text has the same length as the
    input and words stay separated by whitespace.

    Args:
        documents: Mapping of docID -> text to clean. When omitted, the
            module-level ``document_list`` is used.

    Returns:
        A ``defaultdict(str)`` mapping each docID to its cleaned text.
    """
    if documents is None:
        documents = document_list
    cleaned_data = defaultdict(str)
    for docID, text in documents.items():
        cleaned_data[docID] = re.sub(r'[^a-z]', " ", text) #Returns space separated text.
    return cleaned_data

#Replace the raw texts with their cleaned versions; the tokenization step reads document_list.
document_list = clean_data() #Getting the cleaned data

In [14]:
#Tokenization
#stopwords is a single space-separated string, so "word in stopwords" is a
#substring test and would also discard any token that merely occurs inside a
#stop word. Splitting it into a set makes the test match whole stop words only.
stopword_set = set(stopwords.split())

token_list = defaultdict(list) #docID -> tokens with stop words removed
tokens_with_stopword = defaultdict(list) #docID -> every token, stop words included
for docID, doc in document_list.items():
    normal_doc = doc.split() #Split on any run of whitespace
    tokens_with_stopword[docID] = normal_doc
    #Only including words that are not in our Stop Words File.
    token_list[docID] = [word for word in normal_doc if word not in stopword_set]



In [15]:
#Stemming
from nltk import PorterStemmer

stemmer = PorterStemmer() #Using Built-in Stemmer

#The same words occur many times within and across documents, and both token
#collections below share most of their vocabulary, so each distinct word is
#stemmed once and then looked up.
_stem_cache = {}

def _stem(word):
    """Return the Porter stem of ``word``, remembering results already computed."""
    try:
        return _stem_cache[word]
    except KeyError:
        stem = _stem_cache[word] = stemmer.stem(word)
        return stem

#docID -> stemmed tokens without stop words. The values are lists, so the default
#factory is list as well (matching stemmed_sw_list below).
stemmed_list = defaultdict(list)
for docID, doc in token_list.items():
    stemmed_list[docID] = [_stem(word) for word in doc]

#docID -> stemmed tokens including stop words.
stemmed_sw_list = defaultdict(list)
for docID, doc in tokens_with_stopword.items():
    stemmed_sw_list[docID] = [_stem(word) for word in doc]

In [16]:
#Term Frequency & Document Frequency
from math import log10
#term_freq[docID][term] -> how often the term occurs in that document
#term_occur[term]       -> set of docIDs in which the term occurs
term_freq = defaultdict(lambda: defaultdict(int))
term_occur = defaultdict(set)
for doc_id, stems in stemmed_list.items():
    for stem in stems:
        term_occur[stem].add(doc_id)
        term_freq[doc_id][stem] += 1

#Document Frequency: number of documents that contain each term
doc_freq = {term: len(doc_ids) for term, doc_ids in term_occur.items()}

#Inverse Document Frequency: log10(N / df), with N = total_docs
idf = {term: log10(total_docs / df) for term, df in doc_freq.items()}

In [17]:
#Inverted Index
#Note: inv_index is keyed by docID and lists the distinct terms of that document.
inv_index = defaultdict(list)

#TF-IDF Weighting: tf_idf[term][docID] = term frequency in the document * idf of the term
tf_idf = defaultdict(lambda: defaultdict(float))
for doc_id, counts in term_freq.items():
    for term in counts:
        inv_index[doc_id].append(term)
        tf_idf[term][doc_id] = counts[term] * idf[term]

# print(inv_index)

defaultdict(<class 'list'>, {15: ['ieee', 'transact', 'neural', 'network', 'learn', 'system', 'vol', 'juli', 'toward', 'automat', 'time', 'seri', 'forecast', 'use', 'weizhong', 'yan', 'senior', 'member', 'abstract', 'over', 'past', 'few', 'decad', 'applic', 'arti', 'cial', 'ann', 'tsf', 'beengrow', 'rapidli', 'due', 'sever', 'uniqu', 'featur', 'model', 'howev', 'date', 'consist', 'perform', 'differentstudi', 'not', 'been', 'achiev', 'mani', 'factor', 'contribut', 'theinconsist', 'one', 'such', 'that', 'involv', 'determininga', 'larg', 'number', 'design', 'paramet', 'current', 'designpractic', 'essenti', 'heurist', 'hoc', 'thi', 'doe', 'exploitth', 'full', 'potenti', 'systemat', 'modelingprocess', 'strategi', 'therefor', 'greatli', 'need', 'motiv', 'by', 'paper', 'attempt', 'develop', 'anautomat', 'scheme', 'it', 'base', 'generalizedregress', 'grnn', 'special', 'type', 'neuralnetwork', 'take', 'advantag', 'properti', 'asingl', 'fast', 'incorporatingsever', 'g', 'fuse', 'multipl', 'haveb

In [22]:
#Building Vector Space Model
#vsm[docID] -> {term: tf-idf weight}, ordered from the lowest weight to the highest
vsm = defaultdict(lambda: defaultdict(float))
for doc_id, terms in inv_index.items():
    #Document vector: look up the tf-idf weight of every term of this document.
    weights = {term: tf_idf[term][doc_id] for term in terms}
    #sorted() is stable, so terms with equal weight keep their original order.
    ranked = sorted(weights.items(), key=lambda pair: pair[1])
    vsm[doc_id] = dict(ranked) #Accessing document vector using its document number.
# print(vsm)

defaultdict(<function <lambda> at 0x1575ab1c0>, {15: {'system': 0.0, 'time': 0.0, 'use': 0.0, 'sever': 0.0, 'not': 0.0, 'been': 0.0, 'mani': 0.0, 'one': 0.0, 'such': 0.0, 'that': 0.0, 'larg': 0.0, 'thi': 0.0, 'by': 0.0, 'it': 0.0, 'base': 0.0, 'make': 0.0, 'gener': 0.0, 'exampl': 0.0, 'which': 0.0, 'data': 0.0, 'more': 0.0, 'than': 0.0, 'with': 0.0, 'their': 0.0, 'result': 0.0, 'also': 0.0, 'show': 0.0, 'onli': 0.0, 'other': 0.0, 'from': 0.0, 'most': 0.0, 'x': 0.0, 'univers': 0.0, 'follow': 0.0, 'there': 0.0, 'refer': 0.0, 'but': 0.0, 'when': 0.0, 'between': 0.0, 'increas': 0.0, 'j': 0.0, 'interest': 0.0, 'inform': 0.022276394711152208, 'report': 0.022276394711152208, 'due': 0.04575749056067514, 'http': 0.04575749056067514, 'di': 0.04575749056067514, 'higher': 0.04575749056067514, 'among': 0.06682918413345662, 'correspond': 0.06682918413345662, 'chang': 0.07058107428570728, 'develop': 0.08910557884460883, 'provid': 0.08910557884460883, 'state': 0.08910557884460883, 'form': 0.0915149811

In [None]:
#Normalization
#The loop previously had no body, which is a syntax error and stops the notebook
#from running top to bottom.
#NOTE(review): assumes l_norm is meant to hold the Euclidean (L2) length of each
#document vector, as needed for length normalisation - confirm the intent.
l_norm = {}
for key, values in vsm.items():
    #values is the {term: tf-idf weight} vector of document `key`
    l_norm[key] = sum(weight * weight for weight in values.values()) ** 0.5

In [7]:
#Driver Code including GUI
import tkinter as tk
from tkinter import ttk

class InformationRetrievalSearch(tk.Tk):
    """Tkinter window with one tab for Boolean queries and one for proximity queries.

    The Boolean tab collects up to three terms, an optional NOT in front of
    each term and an AND/OR operator between neighbouring terms, and passes
    them to ``InvertedIndex.processQueries``. The proximity tab collects two
    terms and an integer distance and passes them to
    ``PositionalIndex.processQuery``. ``InvertedIndex``, ``PositionalIndex``,
    ``stemmed_list`` and ``originals`` are expected to exist at module level
    when the window is created.
    """

    #Placeholder texts shown in the entry boxes until the user focuses them
    TERM_PLACEHOLDERS = ("Enter term 1", "Enter term 2", "Enter term 3")
    PROXIMITY_PLACEHOLDER = "Enter proximity value"
    #Choices offered by the dropdowns; the empty string means "nothing selected"
    NOT_CHOICES = ["", "NOT"]
    OPERATOR_CHOICES = ["", "AND", "OR"]

    def __init__(self):
        """Create the window, build both indexes and add the two query tabs."""
        super().__init__()
        self.title("Rohan's Retrieval")

        # Set window size
        self.geometry("800x600")

        # Variables to store input values
        self.term1_prox_value = tk.StringVar(value=None)
        self.term2_prox_value = tk.StringVar(value=None)
        self.term1_value = tk.StringVar(value=None)
        self.term2_value = tk.StringVar(value=None)
        self.term3_value = tk.StringVar(value=None)
        self.not1_value = tk.StringVar(value=None)
        self.not2_value = tk.StringVar(value=None)
        self.not3_value = tk.StringVar(value=None)
        self.operator1_value = tk.StringVar(value=None)
        self.operator2_value = tk.StringVar(value=None)
        self.proximity_value = tk.StringVar(value=None)
        self.result = set()
        self.pos_result = set()

        # Create notebook to hold multiple pages
        self.notebook = ttk.Notebook(self)
        self.notebook.pack(expand=True, fill=tk.BOTH)

        #Building Inverted Index as soon as the GUI is Launched
        self.inv_index = InvertedIndex(stemmed_list, originals)
        self.inv_index.buildIndex()

        #Building Positional Index
        self.pos_index = PositionalIndex()
        self.pos_index.buildIndex()

        # Add inverted index page
        self.add_inverted_index_page()

        # Add positional index page
        self.add_positional_index_page()

    #Processing Inverted Index Queries
    def processInvertedIndexQuery(self, output_text):
        """Run the Boolean query described by the input widgets and show its result.

        Args:
            output_text: Text widget that receives the retrieved document IDs.
        """
        index = self.inv_index

        #A term box that still shows its placeholder counts as "no term" (None).
        #The comparison is made before lower-casing: comparing the lower-cased
        #text against the mixed-case placeholder can never match, which would
        #send the placeholder itself to the index as a query term.
        words = []
        term_vars = (self.term1_value, self.term2_value, self.term3_value)
        for term_var, placeholder in zip(term_vars, self.TERM_PLACEHOLDERS):
            raw = term_var.get()
            words.append(None if raw == placeholder else raw.lower())

        #Validating Inputs: an empty dropdown is re-initialised with None.
        #NOTE(review): Tk stores values as strings, so get() presumably returns
        #"None" afterwards - confirm against InvertedIndex.processQueries.
        for choice_var in (self.not1_value, self.not2_value, self.not3_value,
                           self.operator1_value, self.operator2_value):
            if choice_var.get() == "":
                choice_var.initialize(None)

        #Query Processing
        self.result = index.processQueries(words, self.not1_value.get(), self.not2_value.get(), self.not3_value.get(), self.operator1_value.get(), self.operator2_value.get())
        self.display_result(self.result, output_text)

    def _add_dropdown(self, parent, values, variable):
        """Create, pack and return a narrow combobox bound to ``variable``."""
        dropdown = ttk.Combobox(parent, values=values, width=5, textvariable=variable)
        dropdown.pack(pady=5)
        return dropdown

    def _add_placeholder_entry(self, parent, variable, placeholder, width):
        """Create, pack and return an entry whose placeholder is cleared on focus."""
        entry = tk.Entry(parent, width=width, textvariable=variable)
        entry.insert(0, placeholder)
        entry.bind("<FocusIn>", lambda event: self.clear_placeholder(event, entry))
        entry.pack(pady=5)
        return entry

    def _add_output_box(self, parent, width, pady):
        """Create the "Retrieved Documents" heading and the read-only output box below it."""
        #Heading for Output Box
        output_label = tk.Label(parent, text="Retrieved Documents")
        output_label.pack(pady=5)

        #Output box
        output_text = tk.Text(parent, height=10, width=width)
        output_text.pack(pady=pady)
        output_text.insert(tk.END, "This is a sample result.")
        output_text.config(state="disabled")
        return output_text

    #Inverted Index Page
    def add_inverted_index_page(self):
        """Build the "Boolean Queries" tab: [NOT] term1 <op> [NOT] term2 <op> [NOT] term3."""
        inverted_index_frame = tk.Frame(self.notebook)

        #Widgets are packed top to bottom in the order they are created here.
        self._add_dropdown(inverted_index_frame, self.NOT_CHOICES, self.not1_value)
        self._add_placeholder_entry(inverted_index_frame, self.term1_value, self.TERM_PLACEHOLDERS[0], 30)

        #Dropdown between term 1 and term 2
        self._add_dropdown(inverted_index_frame, self.OPERATOR_CHOICES, self.operator1_value)

        self._add_dropdown(inverted_index_frame, self.NOT_CHOICES, self.not2_value)
        self._add_placeholder_entry(inverted_index_frame, self.term2_value, self.TERM_PLACEHOLDERS[1], 30)

        #Dropdown between term 2 and term 3
        self._add_dropdown(inverted_index_frame, self.OPERATOR_CHOICES, self.operator2_value)

        self._add_dropdown(inverted_index_frame, self.NOT_CHOICES, self.not3_value)
        self._add_placeholder_entry(inverted_index_frame, self.term3_value, self.TERM_PLACEHOLDERS[2], 30)

        # Search button. The lambda looks output_text up when the button is
        # clicked, so it may be created after the button.
        inverted_search_button = tk.Button(inverted_index_frame, text="Search", width=20, command=lambda: self.processInvertedIndexQuery(output_text))
        inverted_search_button.pack(pady=10)

        output_text = self._add_output_box(inverted_index_frame, width=70, pady=15)

        self.notebook.add(inverted_index_frame, text="Boolean Queries")

    def processProximityQueries(self, output_text):
        """Run the proximity query described by the input widgets and show its result.

        Args:
            output_text: Text widget that receives the retrieved document IDs,
                or a hint when the proximity value is not an integer.
        """
        words = [self.term1_prox_value.get().lower(), self.term2_prox_value.get().lower()]
        try:
            distance = int(self.proximity_value.get())
        except ValueError:
            #The box still holds its placeholder or other non-numeric text;
            #tell the user instead of raising inside the Tk callback.
            self._show_text(output_text, "Please enter a whole number for the proximity value.")
            return
        pos_index = self.pos_index
        self.pos_result = pos_index.processQuery(words, distance)

        self.display_result(self.pos_result, output_text)

    #Positional Index Page
    def add_positional_index_page(self):
        """Build the "Proximity Queries" tab: two terms plus an integer distance."""
        positional_index_frame = tk.Frame(self.notebook)

        #Search bars for term 1 and term 2
        self._add_placeholder_entry(positional_index_frame, self.term1_prox_value, self.TERM_PLACEHOLDERS[0], 50)
        self._add_placeholder_entry(positional_index_frame, self.term2_prox_value, self.TERM_PLACEHOLDERS[1], 50)

        #Entry for proximity integer value
        self._add_placeholder_entry(positional_index_frame, self.proximity_value, self.PROXIMITY_PLACEHOLDER, 50)

        #Search button (output_text is looked up when the button is clicked)
        positional_search_button = tk.Button(positional_index_frame, text="Search", width=20, command=lambda: self.processProximityQueries(output_text))
        positional_search_button.pack(pady=10)

        output_text = self._add_output_box(positional_index_frame, width=60, pady=10)

        self.notebook.add(positional_index_frame, text="Proximity Queries")

    def clear_placeholder(self, event, entry_widget):
        """Empty ``entry_widget`` if it still shows one of the placeholder texts."""
        if entry_widget.get() in self.TERM_PLACEHOLDERS + (self.PROXIMITY_PLACEHOLDER,):
            entry_widget.delete(0, tk.END)

    def _show_text(self, output_text, text):
        """Replace the content of the read-only ``output_text`` widget with ``text``."""
        output_text.config(state="normal") #The box is kept disabled, so enable it for writing
        output_text.delete("1.0", tk.END)  #Clear previous content
        output_text.insert(tk.END, text)
        output_text.config(state="disabled")

    def display_result(self, result, output_text):
        """Show ``result`` one item per line, or a "not found" message when it is empty.

        Args:
            result: Iterable of retrieved document IDs (may be empty or None).
            output_text: Text widget in which the result is displayed.
        """
        if result:
            result_str = "\n".join(map(str, result))
        else:
            result_str = "No Documents Found...."

        #Display result in the output box
        self._show_text(output_text, result_str)

#Main Driver Code
#Build the window (its constructor also builds both indexes) and enter the Tk
#event loop, but only when this code is run as the main program.
if __name__ == "__main__":
    app = InformationRetrievalSearch()
    app.mainloop()



here2
here2
here3
