<a href="https://colab.research.google.com/github/DeepakKumar2005fg/AIML-/blob/main/Untitled43.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [6]:
import re

class InformationRetrievalSystem:
    """A minimal in-memory keyword search engine backed by an inverted index.

    Documents are stored verbatim by ID. Every distinct lowercase word of a
    document is mapped to the list of document IDs that contain it. Queries
    use AND semantics: a document matches only if it contains every query
    word.
    """

    # Compiled once and shared by indexing and searching so both sides
    # always tokenize text the same way.
    _TOKEN_PATTERN = re.compile(r'\b\w+\b')

    def __init__(self):
        self.documents = {}  # doc_id -> original content
        self.inverted_index = {}  # word -> list of doc_ids containing the word

    @classmethod
    def _tokenize(cls, text):
        """Return the lowercase word tokens of ``text`` in order of appearance."""
        return cls._TOKEN_PATTERN.findall(text.lower())

    def add_document(self, doc_id, content):
        """Store a document and index its words.

        If ``doc_id`` is already present, the document is replaced: postings
        created for the previous content are removed first, so searches never
        return the document for words it no longer contains.

        Args:
            doc_id: Identifier of the document; used as a dictionary key.
            content: Text of the document.
        """
        if doc_id in self.documents:
            # Without this step the old words would keep pointing at doc_id.
            self._remove_from_index(doc_id, self.documents[doc_id])
        self.documents[doc_id] = content
        self._build_inverted_index(doc_id, content)
        print(f"Document '{doc_id}' added and indexed.")

    def _remove_from_index(self, doc_id, content):
        """Drop ``doc_id`` from the postings of every word found in ``content``."""
        for word in set(self._tokenize(content)):
            postings = self.inverted_index.get(word)
            # Tolerate an index that was edited externally: skip anything
            # that is already gone instead of raising.
            if postings is None or doc_id not in postings:
                continue
            postings.remove(doc_id)
            if not postings:
                # Keep the invariant that every indexed word has a document.
                del self.inverted_index[word]

    def _build_inverted_index(self, doc_id, content):
        """Add ``doc_id`` to the postings list of each distinct word in ``content``."""
        # dict.fromkeys de-duplicates while keeping first-occurrence order, so
        # the index layout does not depend on string hash randomization.
        for word in dict.fromkeys(self._tokenize(content)):
            postings = self.inverted_index.setdefault(word, [])
            if doc_id not in postings:  # never list a document twice per word
                postings.append(doc_id)

    def search(self, query):
        """Find the documents that contain every word of ``query`` (AND logic).

        Args:
            query: Free text; it is tokenized the same way as documents.

        Returns:
            A sorted list of matching doc_ids. The list is empty when the
            query has no word tokens or when no document contains all words.
        """
        query_words = self._tokenize(query)
        if not query_words:
            return []

        # Seed with the postings of the first word, then narrow the set down.
        results = set(self.inverted_index.get(query_words[0], ()))
        for word in query_words[1:]:
            if not results:
                break  # already empty: further intersections cannot add hits
            results.intersection_update(self.inverted_index.get(word, ()))

        return sorted(results)  # sorted for stable, reproducible output

    def get_document_content(self, doc_id):
        """Return the stored content for ``doc_id``.

        Unknown IDs yield the string "Document not found." rather than raising.
        """
        return self.documents.get(doc_id, "Document not found.")

# --- Demonstration ---
if __name__ == "__main__":
    ir_system = InformationRetrievalSystem()

    # Sample corpus; documents are added in this order, one confirmation
    # line being printed per document.
    corpus = (
        ("doc1", "The quick brown fox jumps over the lazy dog."),
        ("doc2", "A brown cat sleeps on the mat."),
        ("doc3", "The dog barks loudly at the cat."),
        ("doc4", "Quickly, the fox ran away."),
        ("doc5", "The fox and the dog are friends."),
    )
    for name, text in corpus:
        ir_system.add_document(name, text)

    print("--- Question 9: Basic Information Retrieval System ---")

    # Queries are run in order; each one prints its matching IDs followed by
    # the content of every matching document.
    demo_queries = (
        "fox",               # single keyword
        "brown dog",         # multiple keywords (AND logic)
        "cat sleeps",
        "nonexistent word",  # neither word occurs in the corpus
        "quick cat",         # should find nothing: the two words never share a document
        "fox dog",
    )
    for query in demo_queries:
        hits = ir_system.search(query)
        print(f"\nSearching for '{query}': Found in documents {hits}")
        for doc_id in hits:
            print(f"  Content of {doc_id}: {ir_system.get_document_content(doc_id)}")

Document 'doc1' added and indexed.
Document 'doc2' added and indexed.
Document 'doc3' added and indexed.
Document 'doc4' added and indexed.
Document 'doc5' added and indexed.
--- Question 9: Basic Information Retrieval System ---

Searching for 'fox': Found in documents ['doc1', 'doc4', 'doc5']
  Content of doc1: The quick brown fox jumps over the lazy dog.
  Content of doc4: Quickly, the fox ran away.
  Content of doc5: The fox and the dog are friends.

Searching for 'brown dog': Found in documents ['doc1']
  Content of doc1: The quick brown fox jumps over the lazy dog.

Searching for 'cat sleeps': Found in documents ['doc2']
  Content of doc2: A brown cat sleeps on the mat.

Searching for 'nonexistent word': Found in documents []

Searching for 'quick cat': Found in documents []

Searching for 'fox dog': Found in documents ['doc1', 'doc5']
  Content of doc1: The quick brown fox jumps over the lazy dog.
  Content of doc5: The fox and the dog are friends.
