In [3]:
import nltk
import os
import glob
import re
import string
from nltk.stem import PorterStemmer
#nltk.download('punkt')

stopwords = []
dictionary = {}


def read_stopwords(filename):
    file_read = open(filename,'r')
    content = file_read.read();
    file_read.close()
    stopwords = content.split('\n')
    return stopwords    

def tokenizer(fileContent):
    fileContent = fileContent.lower()
    fileContent = fileContent.replace("-", " ")  # replace '-' with simple space
    fileContent = fileContent.replace("•", " ")  # replace '.' with simple space
    fileContent = re.sub(r'https?://(?:www\.)?(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', '', fileContent)  # remove urls
    fileContent = re.sub(r'\S+\.com\b', '', fileContent)  # remove .com
    fileContent = re.sub(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b', '', fileContent)  # remove emails
    fileContent = re.sub(r'\b\d+\b', '', fileContent)  # remove numbers
    fileContent = re.sub(r'[^\w\s]', '', fileContent)  # remove other useless punctuation
    words = re.split(r'\s+|\n+', fileContent)
    stemmer = PorterStemmer()
    words = [stemmer.stem(word) for word in words]
    return words

docs_directory = "ResearchPapers"

def clean_and_generate_positions(docs_directory): #cleaning the terms and creating the inverted indexes
    # Build inverted index
    inverted_index = {}

    file_pattern = '*.txt'

    # Use glob to find all files that match the pattern in the folder
    file_list = glob.glob(os.path.join(docs_directory, file_pattern))
    
    # Loop over each file in the list and read its contents
    for file_path in file_list:
        with open(file_path, 'r') as file:
            document_content = file.read()
            file_name = os.path.basename(file_path).split('.')[0]
            # Preprocess the document
            processed_words = tokenizer(document_content)
            # Update inverted index
            for position, term in enumerate(processed_words):
                if term not in stopwords:
                    if term not in inverted_index:
                        inverted_index[term] = {int(file_name): [position]}
                    else:
                        if int(file_name) not in inverted_index[term]:
                            inverted_index[term][int(file_name)] = [position]
                        else:
                            inverted_index[term][int(file_name)].append(position)
    return inverted_index



stemmer = PorterStemmer()

def boolean_and(query_terms, inverted_index):
    print("\nThe terms obtained by spliting th user query is",query_terms)
    result = None
    common_docs = set()
    for term in query_terms:
        stemmed_term = stemmer.stem(term)
        postings = set(inverted_index.get(stemmed_term, {}).keys())
        if result is None:
            result = postings
        else:
            # Take the intersection of postings based on document name only
            common_docs = common_docs.union(postings)
    # Convert the result to a sorted list by docID
    sorted_result = sorted(list(common_docs), key=lambda x: (int(x.split('.')[0]) if '.' in x else int(x)) if isinstance(x, str) else x)

    return sorted_result


def boolean_or(query_terms, inverted_index):
    result = set()
    print("\nThe terms obtained by spliting th user query is",query_terms)
    for term in query_terms:
        stemmed_term = stemmer.stem(term)
        postings = set(inverted_index.get(stemmed_term, []))
        result.update(postings)

    result = list(result)

    sorted_result = sorted(result, key=lambda x: (int(x.split('.')[0]) if '.' in x else int(x)) if isinstance(x, str) else x)

    return sorted_result

def boolean_not(not_query_terms, inverted_index):
    result = set()
    print("The terms obtained by spliting th user query is",not_query_terms)
    # Find all documents
    all_docs = set()
    for postings in inverted_index.values():
        all_docs.update(postings.keys())
    
    # Find documents containing any of the terms from the NOT query
    not_docs = set()
    for term in not_query_terms:
        stemmed_term = stemmer.stem(term)
        postings = set(inverted_index.get(stemmed_term, {}).keys())
        not_docs.update(postings)
    
    # Find documents not containing any of the terms from the NOT query
    for doc_id in all_docs:
        if str(doc_id) + '.txt' not in not_docs and str(doc_id) + '.txt' != "Stopword-List.txt":
            result.add(str(doc_id) + '.txt')
    
    # Sort the result by docID
    sorted_result = sorted(result, key=lambda x: int(x.split('.')[0]) if '.' in x else x)

    return sorted_result

def process_boolean_query(user_query, inverted_index):
    # Tokenize the user input into a list of terms
    query_terms = user_query.split()

    # Define lists to store subqueries and operators
    subqueries = []
    operators = []

    # Split the query into subqueries and operators
    for term in query_terms:
        if term.upper() in {'AND', 'OR', 'NOT'}:
            operators.append(term.upper())
        else:
            subqueries.append(term)

    # Evaluate the subqueries based on operators
    result = []

    if not operators:
        for term in subqueries:
            result.extend(inverted_index.get(term, []))
        return result

    # Iterate through operators and apply boolean operations
    for i, operator in enumerate(operators):
        if operator == 'AND':
            i = i + 2
            result = boolean_and(subqueries[:i], inverted_index)
        elif operator == 'OR':
            result = boolean_or(subqueries, inverted_index)
        elif operator == 'NOT':
            result = boolean_not(subqueries, inverted_index)
            result = [doc.replace('.txt', '') for doc in result]

    return result


if __name__ == "__main__":
    #reading stopwords from stopwords file
    #read_stopwords function takes filename as an input and return the list of stopwords
    stopwords = read_stopwords('Stopword-List.txt')

    #read_all_files takes the stopwords as a parameter and reads all the files in the folder with a .txt extension and call tokenize method
    
    
    #dictionary is sorted first on keys and then on document id's
    #sorted_dictionary = sort_dictionary(dictionary)
    inverted_index = clean_and_generate_positions(docs_directory)
    # Printing the first 10 terms from the sorted dictionary
    """
 print("Inverted index : ")
    for term in inverted_index.items():
        print(term, ":", postings)
        count += 1
        if count == 1000:
            break 
            
    # Example AND query
    and_query = ['feature', 'selection','redundancy']
    result_and = boolean_and(and_query, inverted_index)

    # Print the sorted result
    print(f" AND Query Result: {result_and}")
    
    
    # Example OR query
    or_query = ['transformer', 'model']
    result_or = boolean_or(or_query, inverted_index)

    print(f"OR Query Result: {result_or}")
    
    
    #Example NOT query
    not_query = ['cancer', 'feature']
    result_not = boolean_not(not_query, inverted_index)

    result_not = [doc.replace('.txt', '') for doc in result_not]
    # Print the result without "txt" extension
    print(f"NOT Query Result: {result_not}")
    
    
     #taking the user query
    user_query = input("Enter any query : ")
    result = process_boolean_query(user_query, inverted_index)
    print(f"The inverted index if the :{user_query} is",result)
    
"""

    user_query = input("Enter any query : ")
    result = process_boolean_query(user_query, inverted_index)
    print(f"The inverted index if the :{user_query} is",result)
    

FileNotFoundError: [Errno 2] No such file or directory: 'Stopword-List.txt'