In [7]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import stopwords
import os

# Fetch the NLTK resources required for tokenising, lemmatising and
# stop-word filtering (same packages, same order as before).
for _nltk_package in ('punkt', 'wordnet', 'stopwords'):
    nltk.download(_nltk_package)

# Shared normalisation objects used by preprocess_text.
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

def is_valid_word(word):
    """Return True when `word` is non-empty and made only of alphabetic characters."""
    purely_alphabetic = word.isalpha()
    return purely_alphabetic

def preprocess_text(text):
    """Turn raw text into a list of normalised terms.

    The text is lower-cased and tokenised with `word_tokenize`. A token is kept
    when it is purely alphabetic or contains "/", and it is not in
    `stop_words`. Every kept token is lemmatised and then stemmed.

    Returns:
        list of normalised terms, in their original order (duplicates kept).
    """
    terms = []
    for token in word_tokenize(text.lower()):
        has_accepted_shape = is_valid_word(token) or "/" in token
        if not has_accepted_shape or token in stop_words:
            continue
        terms.append(stemmer.stem(lemmatizer.lemmatize(token)))
    return terms

def build_inverted_index(dir_path):
    """Build a term -> set-of-document-ids index from the files in `dir_path`.

    Only regular files directly inside `dir_path` are read. The document id is
    the file name with every ".txt" occurrence removed, converted to int.

    Returns:
        dict mapping each preprocessed term to the set of int document ids
        whose text contains it.
    """
    index = {}
    for entry in os.listdir(dir_path):
        entry_path = os.path.join(dir_path, entry)
        if not os.path.isfile(entry_path):
            continue
        with open(entry_path, "r") as handle:
            terms = preprocess_text(handle.read())
        # Each distinct term is recorded once per document.
        for term in set(terms):
            index.setdefault(term, set()).add(int(entry.replace(".txt", "")))
    return index

# Module-level positional index: term -> {document id (str) -> [positions]}.
positional_index = {}
def build_positional_index(dir_path):
    """(Re)build the module-level positional index from the files in `dir_path`.

    Only regular files directly inside `dir_path` are read. Positions are the
    0-based offsets of each term in the output of `preprocess_text`, i.e. they
    are counted after stop words and non-alphabetic tokens have been removed.
    Document ids are the file name with ".txt" removed and are kept as strings.

    The index is built in a fresh dict and then swapped into the module-level
    `positional_index` in place. Calling this function again therefore
    replaces the previous contents instead of appending duplicate positions.

    Returns:
        the module-level `positional_index` dict (same object on every call).
    """
    fresh_index = {}
    for file_name in os.listdir(dir_path):
        file_path = os.path.join(dir_path, file_name)
        if not os.path.isfile(file_path):
            continue
        with open(file_path, "r") as file:
            words = preprocess_text(file.read())
        doc_id = file_name.replace(".txt", "")
        for position, word in enumerate(words):
            fresh_index.setdefault(word, {}).setdefault(doc_id, []).append(position)

    # Mutate the shared dict in place so existing references observe the new
    # data, and so a failure part-way through leaves the old index untouched.
    positional_index.clear()
    positional_index.update(fresh_index)
    return positional_index

def complement(x, dir_path="ResearchPapers"):
    """Return the ids of every document in the collection that is not in `x`.

    The universal set is derived from the regular files in `dir_path`: each
    file name has ".txt" removed and is converted to int. Sub-directories
    (for example Jupyter's `.ipynb_checkpoints`) are skipped, matching the
    `os.path.isfile` filter used when the inverted index is built.

    Args:
        x: iterable of int document ids to exclude.
        dir_path: directory holding the document collection.

    Returns:
        a new set of int document ids.

    Raises:
        ValueError: if a regular file's name is not numeric once ".txt" is removed.
    """
    universal_set = {
        int(file_name.replace(".txt", ""))
        for file_name in os.listdir(dir_path)
        if os.path.isfile(os.path.join(dir_path, file_name))
    }
    return universal_set.difference(x)

def process_simple_query(query, inverted_index):
    """Answer a free-text query with the union of the postings of its terms.

    The query goes through `preprocess_text`; terms missing from
    `inverted_index` contribute nothing.

    Returns:
        a new set of document ids containing at least one query term.
    """
    postings = [inverted_index.get(term, set()) for term in preprocess_text(query)]
    return set().union(*postings)

def infix_to_postfix(infix_tokens):
    """Convert a tokenised boolean query from infix to postfix order.

    Recognised tokens are the lower-case operators 'not', 'and', 'or'
    (highest to lowest precedence) and the parentheses '(' and ')'. Every
    other token is treated as an operand and copied to the output as is.

    Args:
        infix_tokens: iterable of string tokens in infix order.

    Returns:
        list of tokens in postfix order.

    Raises:
        IndexError: if a ')' has no matching '('.

    Note:
        An unmatched '(' is not rejected; it ends up in the returned list.
    """
    precedence = {'not': 3, 'and': 2, 'or': 1}
    postfix = []
    operator_stack = []
    for token in infix_tokens:
        if token == 'not':
            # 'not' is a prefix unary operator: it has no left operand, so it
            # must never flush operators already on the stack. Popping on equal
            # precedence turned "not not a" into ['not', 'a', 'not'].
            operator_stack.append(token)
        elif token in precedence:
            # Binary operators are left-associative: flush everything of equal
            # or higher precedence before pushing.
            while (operator_stack and operator_stack[-1] != '('
                   and precedence[operator_stack[-1]] >= precedence[token]):
                postfix.append(operator_stack.pop())
            operator_stack.append(token)
        elif token == '(':
            operator_stack.append(token)
        elif token == ')':
            while operator_stack[-1] != '(':
                postfix.append(operator_stack.pop())
            operator_stack.pop()  # discard the matching '('
        else:
            postfix.append(token)
    while operator_stack:
        postfix.append(operator_stack.pop())
    return postfix

def process_complex_query(query, inverted_index):
    """Evaluate a boolean query (and / or / not, with parentheses).

    Args:
        query: either the raw query string or an already-split list of tokens.
            In a string, parentheses do not need surrounding spaces.
        inverted_index: dict mapping preprocessed terms to sets of document ids.

    Returns:
        set of document ids satisfying the expression; an empty set for an
        empty query.

    Raises:
        IndexError: if an operator is missing an operand.
    """
    operators = {'and', 'or', 'not'}

    if isinstance(query, str):
        # Pad parentheses so "(a and b)" splits into separate tokens.
        raw_tokens = query.replace('(', ' ( ').replace(')', ' ) ').split()
    else:
        raw_tokens = list(query)

    # Operators are matched case-insensitively; operands are left untouched
    # here and normalised when they are looked up.
    tokens = [tok.lower() if tok.lower() in operators else tok for tok in raw_tokens]

    stack = []
    for token in infix_to_postfix(tokens):
        if token == 'not':
            stack.append(complement(stack.pop()))
        elif token in operators:
            operand2 = stack.pop()
            operand1 = stack.pop()
            if token == 'and':
                stack.append(operand1.intersection(operand2))
            else:
                stack.append(operand1.union(operand2))
        else:
            # Index keys are lower-cased, lemmatised and stemmed, so the operand
            # must go through the same preprocessing before the lookup; a raw
            # lookup would miss any term whose stem differs from its surface form.
            stack.append(process_simple_query(token, inverted_index))
    return stack.pop() if stack else set()

def process_proximity_query(query, positional_index):
    """Find documents in which two terms occur within `k` positions of each other.

    The query is run through `preprocess_text`; the first two surviving tokens
    are the terms and the third is the distance, parsed as an int after its "/"
    characters are removed. A document matches when some position of the first
    term and some position of the second differ by at most `k`.

    Args:
        query: raw proximity query string.
        positional_index: dict term -> {document id -> list of positions}.

    Returns:
        set of matching document ids (as stored in `positional_index`).
    """
    tokens = preprocess_text(query)
    first_term = tokens[0]
    second_term = tokens[1]
    max_distance = int(tokens[2].replace("/", ""))

    first_postings = positional_index.get(first_term, {})
    second_postings = positional_index.get(second_term, {})

    matches = set()
    for doc_id, first_positions in first_postings.items():
        second_positions = second_postings.get(doc_id, [])
        for first_pos in first_positions:
            if any(abs(second_pos - first_pos) <= max_distance
                   for second_pos in second_positions):
                matches.add(doc_id)
                break
    return matches

# Directory that holds the document collection.
dir_path = "ResearchPapers"
inverted_index = build_inverted_index(dir_path)
# Proximity queries read positional_index, so it has to be populated too;
# otherwise it stays an empty dict and every proximity query returns nothing.
positional_index = build_positional_index(dir_path)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\jupit\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\jupit\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\jupit\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [12]:
query = input("Enter the query: ")

# Detect boolean operators on whole tokens. A substring test would misroute
# ordinary words such as "transformer" (contains "or") or "standard" ("and").
query_tokens = query.lower().replace("(", " ").replace(")", " ").split()
is_boolean_query = any(token in {"and", "or", "not"} for token in query_tokens)

if "/" in query:
    result = process_proximity_query(query, positional_index)
elif is_boolean_query:
    # process_complex_query splits the query itself, so it must receive the
    # string; passing query.split() fails on list.split(). Lower-casing lets
    # upper-case operators be recognised and matches the lower-cased index terms.
    result = process_complex_query(query.lower(), inverted_index)
else:
    result = process_simple_query(query, inverted_index)

print(sorted(result))

[1, 2, 3, 7, 8, 9, 11, 16, 26]
