In [11]:
import os
import pickle

# Function to preprocess a document and create unigram inverted index
def construct_inverted_index(input_directory):
    inverted_index = {}
    for filename in os.listdir(input_directory):
        with open(os.path.join(input_directory, filename), 'r') as file:
            terms = set(file.read().lower().split())
            for term in terms:
                if term not in inverted_index:
                    inverted_index[term] = set()
                inverted_index[term].add(filename)
    return inverted_index

# Function to save inverted index to a file using pickle
def save_inverted_index(inverted_index, filename):
    with open(filename, 'wb') as file:
        pickle.dump(inverted_index, file)

# Function to load inverted index from a file using pickle
def load_inverted_index(filename):
    with open(filename, 'rb') as file:
        inverted_index = pickle.load(file)
    return inverted_index

inverted_index1 = load_inverted_index('/kaggle/working/unigram_inverted_index.pickle')

# # # Print terms present in the inverted index
# print("Terms in Inverted Index:")
# for term in inverted_index1.keys():
#     print(term)




In [16]:
# # Function to perform AND operation between two sets
def AND(term1_docs, term2_docs):
    return term1_docs.intersection(term2_docs)

# Function to perform OR operation between two sets
def OR(term1_docs, term2_docs):
    return term1_docs.union(term2_docs)

# Function to perform AND NOT operation between two sets
def AND_NOT(term1_docs, term2_docs):
    return term1_docs.difference(term2_docs)

# Function to perform OR NOT operation between two sets
def OR_NOT(term1_docs, term2_docs, all_docs):
    return term1_docs.union(all_docs.difference(term2_docs))

# # Function to perform query
# # Function to perform query
# def perform_query(query, inverted_index, all_docs):
#     terms = query.split()
#     operation = None
#     result_docs = None
    
#     for i in range(1, len(terms), 2):
#         if terms[i] == 'AND':
#             operation = AND if operation is None else operation
#         elif terms[i] == 'OR':
#             operation = OR if operation is None else operation
#         elif terms[i] == 'AND_NOT':
#             operation = AND_NOT if operation is None else operation
#         elif terms[i] == 'OR_NOT':
#             operation = lambda term1_docs, term2_docs: OR_NOT(term1_docs, term2_docs, all_docs)
#         else:
#             term_docs = inverted_index.get(terms[i], set())
#             if result_docs is None:
#                 result_docs = term_docs
#             else:
#                 if operation is None:
#                     operation = OR
#                 result_docs = operation(result_docs, term_docs)
    
#     return result_docs


# # Function to process queries and generate output
# def process_queries(queries, inverted_index, all_docs):
#     results = []
#     for i in range(0, len(queries), 2):
#         query = queries[i]
#         operations = queries[i + 1].split(',')
#         result_docs = perform_query(query, inverted_index, all_docs)
#         results.append((query, operations, len(result_docs) if result_docs else 0, list(result_docs) if result_docs else []))
#     return results

# # Function to print the results in the required format
# def print_results(results):
#     for i, (query, operations, count, docs) in enumerate(results, start=1):
#         print(f"Query {i}: {query}")
#         print(f"Number of documents retrieved for query {i}: {count}")
#         print(f"Names of the documents retrieved for query {i}: {', '.join(docs)}")
#         print()

# # Sample Test Case
# input_directory = '/kaggle/input/ir-text/text_files'
# output_file = '/kaggle/working/unigram_inverted_index.pickle'
# queries = [
#     "Car bag in a canister",
#     "OR, AND NOT",
#     "Coffee brewing techniques in cookbook",
#     "AND, OR NOT, OR"
# ]

# # Construct unigram inverted index
# inverted_index = construct_inverted_index(input_directory)

# # Save inverted index to file
# save_inverted_index(inverted_index, output_file)

# # Load inverted index from file
# inverted_index = load_inverted_index(output_file)

# # Get all document IDs
# all_docs = set(os.listdir(input_directory))

# # Process queries
# results = process_queries(queries, inverted_index, all_docs)

# # Print results
# print_results(results)


# Function to perform query
# Function to perform the query operation
def perform_query(search_term, index, all_files):
    word_list = search_term.split()  # Split the search term into words
    operation = None  # Initialize the operation variable
    result_files = None  # Initialize the result files variable
    
    # Loop through the words in the search term
    for i in range(1, len(word_list), 2):
        if word_list[i] == 'AND':  # Check if it's an AND operation
            operation = AND if operation is None else operation
        elif word_list[i] == 'OR':  # Check if it's an OR operation
            operation = OR if operation is None else operation
        elif word_list[i] == 'AND_NOT':  # Check if it's an AND NOT operation
            operation = AND_NOT if operation is None else operation
        elif word_list[i] == 'OR_NOT':  # Check if it's an OR NOT operation
            operation = lambda files1, files2: OR_NOT(files1, files2, all_files)
        else:
            term_files = index.get(word_list[i], set())  # Retrieve files for the term
            if result_files is None:  # If result files is not initialized
                result_files = term_files  # Initialize it with term files
            else:
                if operation is None:  # If operation is not specified
                    operation = OR  # Set default operation to OR
                result_files = operation(result_files, term_files)  # Perform the operation
    
    return result_files  # Return the result files

# Function to process the queries and generate output
def process_queries(search_queries, index, all_files):
    results = []  # Initialize an empty list to store results
    for i in range(0, len(search_queries), 2):  # Iterate through search queries
        search_term = search_queries[i]  # Get the search term
        operations = search_queries[i + 1].split(',')  # Get the operations
        result_files = perform_query(search_term, index, all_files)  # Perform the query
        result_files = result_files if result_files else set()  # Ensure result_files is not None
        results.append((search_term, operations, len(result_files), result_files))  # Store the results
    return results  # Return the results

# Function to print the results in a formatted manner
def print_results(results):
    for i, (search_term, operations, count, files) in enumerate(results, start=1):
        print(f"Query {i}: {search_term}")  # Print the query
        print(f"Number of documents retrieved for query {i}: {count}")  # Print the number of documents
        print(f"Names of the documents retrieved for query {i}: {', '.join(files)}" if count > 0 else "No documents retrieved")  # Print the documents
        print()

# Manually input queries
num_queries = int(input("Enter the number of queries to execute: "))  # Ask for the number of queries
search_queries = []  # Initialize an empty list for search queries
for _ in range(num_queries):  # Iterate for each query
    search_term = input("Enter the search term: ")  # Get the search term from the user
    operations = input("Enter the operations separated by comma: ")  # Get the operations
    search_queries.extend([search_term, operations])  # Append the search term and operations

# Sample Test Case
input_directory = '/kaggle/input/ir-text/text_files'  # Input directory for text files
output_file = '/kaggle/working/unigram_inverted_index.pickle'  # Output file for inverted index

# Construct unigram inverted index
inverted_index = construct_inverted_index(input_directory)

# Save inverted index to file
save_inverted_index(inverted_index, output_file)

# Load inverted index from file
inverted_index = load_inverted_index(output_file)

# Get all document IDs
all_files = set(os.listdir(input_directory))

# Process queries
results = process_queries(search_queries, inverted_index, all_files)

# Print results
print_results(results)



# Enter the number of queries to execute:  1
# Enter the query:  car bag in a canister
# Enter the operations separated by comma:  OR,AND_NOT
# Query 1: car bag in a canister
# Number of documents retrieved for query 1: 760


Enter the number of queries to execute:  1
Enter the search term:  car bag in a canister
Enter the operations separated by comma:  OR,AND_NOT


Query 1: car bag in a canister
Number of documents retrieved for query 1: 760
Names of the documents retrieved for query 1: file633.txt, file382.txt, file865.txt, file877.txt, file734.txt, file30.txt, file912.txt, file185.txt, file826.txt, file482.txt, file72.txt, file616.txt, file619.txt, file824.txt, file458.txt, file329.txt, file170.txt, file788.txt, file852.txt, file125.txt, file137.txt, file968.txt, file505.txt, file375.txt, file401.txt, file307.txt, file742.txt, file346.txt, file422.txt, file690.txt, file599.txt, file959.txt, file162.txt, file811.txt, file898.txt, file710.txt, file228.txt, file150.txt, file351.txt, file260.txt, file454.txt, file247.txt, file306.txt, file10.txt, file890.txt, file621.txt, file177.txt, file638.txt, file440.txt, file719.txt, file214.txt, file934.txt, file420.txt, file744.txt, file138.txt, file230.txt, file1.txt, file863.txt, file620.txt, file990.txt, file449.txt, file108.txt, file727.txt, file38.txt, file988.txt, file757.txt, file940.txt, file746.txt