Implement a program that retrieves documents using an inverted file
(inverted index): a mapping from each word to the documents that contain it.

In [3]:
from collections import defaultdict
import re

In [5]:
def tokenize(text):
    """Split ``text`` into lowercase word tokens.

    The text is lowercased, then each maximal run of regex word characters
    (letters, digits and the underscore) becomes one token. Punctuation and
    whitespace only separate tokens. Tokens are returned in order of
    appearance and repeated words are kept.
    """
    lowered = text.lower()
    return [match.group(0) for match in re.finditer(r'\b\w+\b', lowered)]

In [7]:
def build_inverted_index(documents):
    """Build an inverted index mapping each token to the documents containing it.

    Args:
        documents: Iterable of document strings. A document's ID is its
            zero-based position in the iteration order.

    Returns:
        A ``defaultdict(list)`` mapping each token (as produced by
        ``tokenize``) to the ascending list of IDs of the documents in which
        it occurs; each ID appears at most once per token. Because the result
        is a ``defaultdict``, subscripting it with an unknown token inserts an
        empty posting list, so use ``in`` or ``.get()`` for read-only lookups.
    """
    inverted_index = defaultdict(list)

    for doc_id, text in enumerate(documents):
        # dict.fromkeys drops repeated tokens within one document while keeping
        # first-occurrence order, so each (token, doc_id) pair is appended
        # exactly once without rescanning the posting list for duplicates.
        for token in dict.fromkeys(tokenize(text)):
            inverted_index[token].append(doc_id)

    return inverted_index

In [9]:
def retrieve_documents(query, inverted_index):
    """Return the IDs of the documents that contain every token of ``query``.

    The query is split with ``tokenize`` and treated as a conjunction: a
    document matches only if each query token is listed for it in the index.
    Every token counts, including words such as "and".

    Args:
        query: Query string.
        inverted_index: Mapping from token to an iterable of document IDs,
            e.g. the result of ``build_inverted_index``.

    Returns:
        A list of matching document IDs in ascending order. The list is empty
        when the query yields no tokens or when any query token is absent from
        the index.
    """
    doc_sets = []

    for token in tokenize(query):
        # Membership test instead of subscripting, so a defaultdict index does
        # not get an empty posting list inserted for an unknown token.
        if token not in inverted_index:
            return []  # A missing token means no document can satisfy the query
        doc_sets.append(set(inverted_index[token]))

    if not doc_sets:
        return []  # Query contained no tokens

    # Sets iterate in an unspecified order; sort so the result is deterministic.
    return sorted(set.intersection(*doc_sets))

In [11]:
# Example corpus: each document's ID is its index in this list
# (build_inverted_index assigns IDs with enumerate).
documents = [
    "Natural language processing with Python.",
    "Deep learning models for AI and NLP tasks.",
    "Python programming for machine learning.",
    "AI models in healthcare and NLP."
]

# Build the inverted index; it is converted to a plain dict only for printing
inverted_index = build_inverted_index(documents)
print("Inverted Index:", dict(inverted_index))

# Query the inverted index. retrieve_documents requires every query token to
# appear in a document, and "and" is tokenized as an ordinary word, so this
# query needs "nlp", "and" and "python" together. No example document contains
# all three, hence the empty result.
query = "NLP and Python"
matching_docs = retrieve_documents(query, inverted_index)

# Display the matching IDs, then each matching document's text
print("Documents matching query:", matching_docs)
for doc_id in matching_docs:
    print(f"Doc {doc_id}: {documents[doc_id]}")

Inverted Index: {'natural': [0], 'language': [0], 'processing': [0], 'with': [0], 'python': [0, 2], 'deep': [1], 'learning': [1, 2], 'models': [1, 3], 'for': [1, 2], 'ai': [1, 3], 'and': [1, 3], 'nlp': [1, 3], 'tasks': [1], 'programming': [2], 'machine': [2], 'in': [3], 'healthcare': [3]}
Documents matching query: []
