In [1]:
import string

In [2]:
# Toy corpus for the demo. A document's id is its 0-based position in this
# list; the retrieval cells use that id to look the text back up.
documents = [
    "This is the first document",
    "This document is the second document",
    "And this is the third one",
    "Is this the first document"
]

In [3]:
def preprocess(text):
    """Normalise text for indexing: lowercase it and drop ASCII punctuation.

    Every character listed in ``string.punctuation`` is removed outright (it
    is not replaced by a space); whitespace and all other characters are kept
    as they are.
    """
    lowered = text.lower()
    kept_chars = [ch for ch in lowered if ch not in string.punctuation]
    return "".join(kept_chars)

In [4]:
def create_inverted_index(docs):
    """Build an inverted index mapping each word to the documents containing it.

    Args:
        docs: Iterable of document strings. A document's id is its 0-based
            position in the iteration order.

    Returns:
        dict mapping each preprocessed word to a list of document ids in
        ascending order, with each id listed at most once per word. Keys are
        inserted in order of first appearance across the corpus.
    """
    inverted_index = {}
    for doc_id, doc in enumerate(docs):
        words = preprocess(doc).split()
        # dict.fromkeys de-duplicates while keeping first-seen order. Iterating
        # a set here would make the index's key order depend on string hash
        # randomisation, so it could differ between kernel restarts.
        for word in dict.fromkeys(words):
            inverted_index.setdefault(word, []).append(doc_id)
    return inverted_index

In [6]:
def retrieve_documents(query, inverted_index):
    """Return the sorted ids of documents that contain any word of the query.

    The query is passed through ``preprocess`` and split on whitespace; words
    that are not keys of ``inverted_index`` are ignored. An empty list is
    returned when no word matches.
    """
    terms = preprocess(query).split()
    # One posting list per query word that is actually present in the index.
    postings = [inverted_index[term] for term in terms if term in inverted_index]
    matching_ids = set().union(*postings)
    return sorted(matching_ids)

In [7]:
# Build the word -> document-ids index once; the query cell reuses it.
inverted_index = create_inverted_index(documents)

In [13]:
# Example query. Matching is a union over the query words, so a document that
# contains only one of them (for instance just "document") is still returned.
query = "first document"
retrieved_docs = retrieve_documents(query, inverted_index)

In [14]:
# Echo the query and the ids of the documents it matched.
print(f"Query: '{query}'")
print("Retrieved documents:", retrieved_docs)

Query: 'first document'
Retrieved documents: [0, 1, 3]


In [15]:
# Resolve each matching id back to its text in the corpus.
for doc_id in retrieved_docs:
    print(f"Document {doc_id}: {documents[doc_id]}")

Document 0: This is the first document
Document 1: This document is the second document
Document 3: Is this the first document
