# **1. Import Documents**

In [None]:
from google.colab import files
# Ask Colab for a file upload and keep whatever files.upload() returns in
# `uploaded`; the cell output reports the name each file was saved under.
uploaded = files.upload()

def load_documents(file_path):
    """Read a text file and split it into lower-cased documents.

    Documents are separated by one or more blank lines (lines that are empty
    or contain only whitespace). Each non-empty document is stripped of
    surrounding whitespace, lower-cased, and stored under a sequential id
    ``doc1``, ``doc2``, ...

    Args:
        file_path: Path of the UTF-8 text file to read.

    Returns:
        dict mapping document id to lower-cased document text. An empty or
        whitespace-only file yields an empty dict.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    import re  # local import keeps this cell self-contained

    # Explicit encoding so the result does not depend on the platform default.
    with open(file_path, 'r', encoding='utf-8') as file:
        content = file.read()

    documents = {}
    # A newline, optional whitespace (including further newlines), then a
    # newline: this treats any run of blank lines as a single separator.
    for chunk in re.split(r'\n\s*\n', content):
        text = chunk.strip()
        if not text:
            # Skip empty pieces so no phantom documents are created.
            continue
        documents[f"doc{len(documents) + 1}"] = text.lower()
    return documents

# NOTE(review): the upload cell's output reports the file being saved as
# 'sample_documents (3).txt', so this fixed path may point at an earlier
# upload rather than the one just made -- confirm which copy is intended.
file_path = '/content/sample_documents.txt'
# Parse the file into a {doc_id: lower-cased text} dict.
documents = load_documents(file_path)


Saving sample_documents.txt to sample_documents (3).txt


# **2. Apply Tokenization and Stemming**

In [None]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
import string

# Resources needed by preprocess_text: sentence/word tokenizer models and the
# English stop-word list. Recent NLTK releases load the pickle-free
# 'punkt_tab' models in word_tokenize, while older releases load 'punkt';
# fetching both keeps the notebook runnable on either.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

# Shared by every call to preprocess_text so index and queries stem alike.
stemmer = PorterStemmer()
# A set gives O(1) membership checks during stop-word filtering.
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    """Turn raw text into a list of stemmed index terms.

    The text is tokenized with ``word_tokenize``; tokens that are not purely
    alphabetic are dropped, the rest are lower-cased, stop words are removed,
    and each surviving token is reduced to its stem.

    Args:
        text: The string to process.

    Returns:
        list of stems, in the order their tokens appear in ``text``.
    """
    stems = []
    for raw_token in word_tokenize(text):
        # Discards punctuation, numbers and any token with a non-letter.
        if not raw_token.isalpha():
            continue
        lowered = raw_token.lower()
        if lowered in stop_words:
            continue
        stems.append(stemmer.stem(lowered))
    return stems


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# **3. Create Dictionary and Inverted Index**

In [None]:
from collections import defaultdict

def build_dictionary_and_index(documents):
    """Build the vocabulary and the inverted index for a document collection.

    Args:
        documents: Mapping of document id to document text.

    Returns:
        A ``(dictionary, inverted_index)`` tuple: ``dictionary`` is the set of
        all terms produced by ``preprocess_text`` over every document, and
        ``inverted_index`` is a ``defaultdict(set)`` mapping each term to the
        ids of the documents that contain it.
    """
    postings = defaultdict(set)
    vocabulary = set()

    for doc_id in documents:
        for term in preprocess_text(documents[doc_id]):
            vocabulary.add(term)
            postings[term].add(doc_id)

    return vocabulary, postings

# Index the loaded documents: `dictionary` is the set of stemmed terms and
# `inverted_index` maps each term to the ids of the documents containing it.
dictionary, inverted_index = build_dictionary_and_index(documents)


# **4. Implement Boolean Retrieval**

In [None]:
def boolean_retrieval(query, inverted_index, normalize=None):
    """Evaluate a flat, left-to-right boolean query against an inverted index.

    The query is a whitespace-separated sequence of terms and the operators
    ``and``, ``or`` and ``not`` (case-insensitive). There is no precedence or
    parenthesisation: terms are folded into the running result from left to
    right. ``not`` negates the term that follows it, i.e. that term is
    replaced by every indexed document that does NOT contain it. Two terms
    with no ``and``/``or`` between them are combined with AND, and each
    operator applies only to the next term.

    Args:
        query: The query string, e.g. ``'cloud and computing'``.
        inverted_index: Mapping of index term to the set of document ids
            containing it.
        normalize: Optional callable applied to every query term before the
            lookup. Defaults to the notebook's ``stemmer.stem`` so that query
            terms are reduced to the same stems the index was built from.

    Returns:
        A new set of matching document ids (never a reference to a set stored
        in ``inverted_index``); empty when nothing matches or the query has
        no terms.
    """
    if normalize is None:
        # The index stores stems, so raw words such as 'computing' would
        # never match without this step.
        normalize = stemmer.stem

    binary_operators = {'and': set.intersection, 'or': set.union}

    all_docs = None      # computed lazily, only needed for NOT
    result_set = None
    pending = None       # binary operator waiting for its right operand
    negate = False

    for term in query.lower().split():
        if term in binary_operators:
            pending = binary_operators[term]
        elif term == 'not':
            negate = not negate
        else:
            # Copy so callers can never mutate the index through the result.
            postings = set(inverted_index.get(normalize(term), ()))
            if negate:
                if all_docs is None:
                    # Only documents that contributed at least one term are
                    # known to the index, so they form the universe for NOT.
                    all_docs = set().union(*inverted_index.values())
                postings = all_docs - postings
                negate = False

            if result_set is None:
                result_set = postings
            else:
                combine = pending or set.intersection
                result_set = combine(result_set, postings)
            pending = None

    return result_set if result_set is not None else set()

# Example usage: run a few sample queries and print each result set.
for example_query in ('cloud and computing', 'machine or learning', 'not quantum'):
    print(boolean_retrieval(example_query, inverted_index))


set()
set()
{'doc9'}
