In [65]:
import nltk
nltk.download ("stopwords")
nltk.download ("wordnet")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Dell\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Dell\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [67]:
import nltk
nltk.download('punkt', download_dir='C:/nltk_data')

[nltk_data] Downloading package punkt to C:/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [82]:
import os
import re
import logging
from collections import defaultdict, Counter
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()


In [84]:
class DocumentProcessor:
    
    def __init__(self, directory_path):
        self.directory_path = directory_path
        self.documents = {}
        self.doc_mapping = {}
        
    def load_documents(self):
        print(f"Loading the documents from: {self.directory_path}")
        doc_counter = 0
        
        for file in os.listdir(self.directory_path):
            print(f"Processing: {file}")
            if file.endswith(".txt"):
                file_path = os.path.join(self.directory_path, file)
                with open(file_path, 'r', encoding='utf-8') as f:
                    text_content = f.read()
                    self.documents[doc_counter] = text_content
                    self.doc_mapping[doc_counter] = file
                    print(f"Document ID {doc_counter} mapped to {file}")
                    doc_counter += 1
        
        print(f" {len(self.documents)} Successfully loaded")
        return self.documents, self.doc_mapping
    
    def preprocess_text(self, text):
        text = text.lower()
        text = re.sub(r"[^a-zA-Z0-9\s]", "", text)
        tokens = word_tokenize(text)
        processed_tokens = [lemmatizer.lemmatize(token) for token in tokens if len(token) > 1]
        return processed_tokens

In [86]:
class InvertedIndexBuilder:
    
    def __init__(self):
        self.index = defaultdict(set)
        self.term_stats = Counter()
    
    def create_index(self, documents, processor):
        for doc_id, text in documents.items():
            tokens = processor.preprocess_text(text)
            for token in tokens:
                self.index[token].add(doc_id)
                self.term_stats[token] += 1
        
        return self.index, self.term_stats

In [88]:
class BooleanSearchEngine:
    
    def __init__(self, inverted_index, doc_mapping):
        self.inverted_index = inverted_index
        self.doc_mapping = doc_mapping
    
    def execute_query(self, query_string):
        query_string = query_string.lower()
        query_tokens = query_string.split()
        
        matching_docs = set()

        operators = ['and', 'or', 'not']
        search_terms = [token for token in query_tokens if token not in operators]
        
        if 'and' in query_tokens:
            if all(term in self.inverted_index for term in search_terms):
                matching_docs = self.inverted_index[search_terms[0]].copy()
                for term in search_terms[1:]:
                    matching_docs &= self.inverted_index[term]
    
        elif 'or' in query_tokens:
            for term in search_terms:
                if term in self.inverted_index:
                    matching_docs |= self.inverted_index[term]
        
        elif 'not' in query_tokens:
            excluded_term = query_tokens[1]
            all_document_ids = set(self.doc_mapping.keys())
            if excluded_term in self.inverted_index:
                matching_docs = all_document_ids - self.inverted_index[excluded_term]
            else:
                matching_docs = all_document_ids
        
    
        else:
            if query_string in self.inverted_index:
                matching_docs = self.inverted_index[query_string]
    
        result_files = [self.doc_mapping[doc_id] for doc_id in matching_docs 
                       if doc_id in self.doc_mapping]
        
        logging.info(f"Query: '{query_string}' | Results: {result_files}")
        return result_files
        

In [90]:
def create_query_file(term_statistics, output_path="queries.txt", num_queries=5):
    sample_queries = [
        "update AND feature",
        "android OR window",
        "NOT support"
    ]
    
    with open(output_path, "w", encoding="utf-8") as query_file:
        for q in sample_queries:
            query_file.write(q + "\n")
    
    print(f"Query file created: {output_path}")

In [94]:
def run_search_system():
    """Main execution function"""
    docs_folder = r"C:\nltk_data"

    doc_processor = DocumentProcessor(docs_folder)
    documents, doc_mapping = doc_processor.load_documents()
    
    for doc_id, content in documents.items():
        tokens = doc_processor.preprocess_text(content)
        print(f"Document {doc_id} preview:", tokens[:20])
    
    index_builder = InvertedIndexBuilder()
    inverted_index, term_stats = index_builder.create_index(documents, doc_processor)
    print("Index preview:", list(inverted_index.keys())[:20])
   
    create_query_file(term_stats)
    
  
    search_engine = BooleanSearchEngine(inverted_index, doc_mapping)
    test_queries = [
        "update AND feature",
        "android OR window",
        "NOT support"
    ]
    

    with open("search_results.txt", 'w', encoding='utf-8') as output:
        for query in test_queries:
            results = search_engine.execute_query(query)
            output_line = f"Query: '{query}' => Results: {results}\n"
            print(output_line)
            output.write(output_line)


if __name__ == "__main__":
    run_search_system()

Loading the documents from: C:\nltk_data
Processing: tokenizers
 0 Successfully loaded
Index preview: []
Query file created: queries.txt
Query: 'update AND feature' => Results: []

Query: 'android OR window' => Results: []

Query: 'NOT support' => Results: []

