In [1]:
from src.preprocessing import preprocess_documents
from src.indexing import InvertedIndex
from src.querying import QueryProcessor



In [2]:
# Toy corpus: three short English sentences plus one line of near-duplicate
# tokens (automat*/n*-words) used by the wildcard queries later on.
documents = [
    "This is a simple example document. It contains several words. The words should be processed and indexed.",
    "Another example document with different content. Document indexing is important for retrieval.",
    "Another example document to test Boolean search capabilities. This document contains relevant content.",
    "Automat Automata Automation Automatic nobody nood need nid nobody nearby nekoray neyshabour nooobbbbboooy",
]

# Run the raw strings through the project's preprocessing step.
preprocessed_documents = preprocess_documents(documents)

# Construct the inverted index over the preprocessed corpus.
inverted_index = InvertedIndex(preprocessed_documents)


In [3]:
# Create the posting list on the index built in the previous cell
inverted_index.create_posting_list()

# Bare last expression: the notebook renders the posting list as this cell's output
inverted_index.posting_list


[another,
 automat,
 automata,
 automatic,
 automation,
 boolean,
 capabilities,
 contains,
 content,
 different,
 document,
 example,
 important,
 indexed,
 indexing,
 nearby,
 need,
 nekoray,
 neyshabour,
 nid,
 nobody,
 nood,
 nooobbbbboooy,
 processed,
 relevant,
 retrieval,
 search,
 several,
 simple,
 test,
 words]

In [4]:
# Wrap the index in a query processor and build its prefix trie before searching
# (presumably the trie backs the wildcard queries below; confirm in src.querying).
ir_system = QueryProcessor(inverted_index)
ir_system.create_prefix_trie()

# Demo queries, executed in exactly this order so that the i-th printed result
# corresponds to the i-th entry of the list.
demo_queries = [
    'example and content',   # Boolean AND
    'not example',           # Boolean NOT (note: this is not a proximity query)
    'example or content',    # Boolean OR
    'not example',           # Boolean NOT, repeated; kept so the number of printed results is unchanged
    'not t*',                # NOT applied to a trailing-wildcard term
    'exa*le and contrnt',    # infix wildcard AND a misspelled term (presumably spell-corrected to 'content')
    'n*d and Automation',    # infix wildcard AND a mixed-case term
    'n*b*y and Automation',  # two wildcards inside a single term
]

# One result per line, same as eight individual print calls.
for query in demo_queries:
    print(ir_system.search(query))

{1, 2}
{3}
{0, 1, 2}
{3}
{0, 1, 2}
{1, 2}
{3}
{3}
