In [7]:
import os  # Importing the os module for operating system related functionalities
import nltk  # Importing the Natural Language Toolkit for NLP operations
import string  # Importing the string module for string manipulation
import pickle  # Importing pickle module for serializing and deserializing Python objects

from nltk.tokenize import word_tokenize  # Importing word_tokenize function for tokenization
from nltk.corpus import stopwords  # Importing stopwords from NLTK corpus for removing common words

# Download NLTK resources
# nltk.download('punkt')  # Download the Punkt tokenizer models
# nltk.download('stopwords')  # Download the stopwords corpus

# Define function for preprocessing
def preprocess_text(text):
    # Lowercase the text
    text = text.lower()
    
    # Tokenization using NLTK's word_tokenize
    tokens = word_tokenize(text)
    
    # Remove stopwords
    stop_words = set(stopwords.words('english'))  # Retrieve English stopwords
    tokens = [word for word in tokens if word not in stop_words]  # Filter out stopwords
    
    # Remove punctuations
    tokens = [word for word in tokens if word not in string.punctuation]  # Filter out punctuation symbols
    
    # Remove blank space tokens
    tokens = [word for word in tokens if word.strip()]  # Filter out empty tokens
    
    return tokens  # Return the preprocessed tokens

# Function to create positional index and various steps
def create_positional_index(folder_path):
    positional_index = {}  # Initialize an empty positional index dictionary
    for file_name in os.listdir(folder_path):  # Iterate through each file in the folder
        file_path = os.path.join(folder_path, file_name)  # Construct the full file path
        with open(file_path, 'r') as file:  # Open the file in read mode
            text = file.read()  # Read the contents of the file
        tokens = preprocess_text(text)  # Preprocess the text of the file
        for position, token in enumerate(tokens):  # Iterate through each token in the file
            if token not in positional_index:  # If token is not in positional index
                positional_index[token] = {}  # Initialize an empty dictionary for the token
            if file_name not in positional_index[token]:  # If file not in positional index for token
                positional_index[token][file_name] = []  # Initialize an empty list for the file
            positional_index[token][file_name].append(position)  # Append the position to the list
    return positional_index  # Return the positional index dictionary

# Function to save positional index using pickle
def save_positional_index(positional_index, file_path):
    with open(file_path, 'wb') as file:  # Open the file in write-binary mode
        pickle.dump(positional_index, file)  # Serialize and write the positional index to the file

# Function to load positional index using pickle
def load_positional_index(file_path):
    with open(file_path, 'rb') as file:  # Open the file in read-binary mode
        positional_index = pickle.load(file)  # Deserialize and load the positional index from the file
    return positional_index  # Return the loaded positional index

# Function to process queries using positional index
def process_queries(positional_index, queries):
    results = []  # Initialize an empty list to store query results
    for query in queries:  # Iterate through each query
        query_terms = preprocess_text(query)  # Preprocess the query text
        retrieved_docs = None  # Initialize variable to store retrieved documents
        for term in query_terms:  # Iterate through each term in the query
            if term in positional_index:  # Check if term exists in the positional index
                if retrieved_docs is None:  # If retrieved_docs is None
                    retrieved_docs = set(positional_index[term].keys())  # Initialize with document keys
                else:  # If retrieved_docs is not None
                    retrieved_docs &= set(positional_index[term].keys())  # Intersect with document keys
        results.append(retrieved_docs)  # Append retrieved documents to results list
    return results  # Return the list of query results

# Function to get user input
def get_user_input():
    num_queries = int(input("Enter the number of queries to execute: "))  # Prompt user for number of queries
    queries = []  # Initialize an empty list to store queries
    for _ in range(num_queries):  # Iterate for each query
        query = input("Enter phrase query: ")  # Prompt user to enter query
        queries.append(query)  # Append the query to the list of queries
    return queries  # Return the list of queries

# Function to display results
def display_results(results):
    for i, docs in enumerate(results, 1):  # Iterate through each query result
        print(f"Number of documents retrieved for query {i} using positional index: {len(docs)}")
        print(f"Names of documents retrieved for query {i} using positional index: {', '.join(docs)}")

# Main function
def main():
    # Path to the folder containing your text files
    folder_path = '/kaggle/input/ir-text/text_files'  # Set the path to the folder containing text files
    
    # Create positional index
    positional_index = create_positional_index(folder_path)  # Create the positional index
    
    # Save positional index
    save_positional_index(positional_index, '/kaggle/working/positional_index.pickle')  # Save the positional index
    
    # Load positional index
    loaded_positional_index = load_positional_index('/kaggle/working/positional_index.pickle')  # Load the positional index
    
    # Get user input
    queries = get_user_input()  # Get queries from user
    
    # Process queries
    results = process_queries(loaded_positional_index, queries)  # Process queries using positional index
    
    # Display results
    display_results(results)  # Display the query results

if __name__ == "__main__":
    main()  # Execute the main function


Enter the number of queries to execute:  1
Enter phrase query:  car


Number of documents retrieved for query 1 using positional index: 6
Names of documents retrieved for query 1 using positional index: file174.txt, file542.txt, file886.txt, file166.txt, file264.txt, file746.txt


Enter the number of queries to execute:  1
Enter phrase query:  car bag in a canister


Number of documents retrieved for query 1 using positional index: 2
Names of documents retrieved for query 1 using positional index: file174.txt, file886.txt
