Q. Write a program that builds an inverted index from a collection of n text documents.

In [1]:
import os
import json
import re
from collections import defaultdict

In [2]:
# Directory containing the text documents.
# NOTE(review): the main cell later rebinds this name to "./docs" before the
# index is built, so this absolute path is not the one used there - confirm
# which location is intended.
docs_directory = "/docs"

In [3]:
# Function to tokenize and normalize the text
def tokenize(text):
    """Split ``text`` into lowercase word tokens.

    The input is lowercased, then every run of word characters (letters,
    digits and underscores) is collected in order of appearance.
    Punctuation and whitespace only act as separators and are dropped.

    Args:
        text: The string to tokenize.

    Returns:
        A list of lowercase tokens; empty when ``text`` has no word
        characters.
    """
    lowered = text.lower()
    word_pattern = re.compile(r'\b\w+\b')
    return [match.group(0) for match in word_pattern.finditer(lowered)]

In [5]:

# Building the inverted index
def build_inverted_index(directory):
    """Build an inverted index over the ``.txt`` files in ``directory``.

    Every regular file whose name ends in ``.txt`` is read as UTF-8,
    tokenized with ``tokenize`` and recorded under each distinct token it
    contains.

    Args:
        directory: Path of the folder holding the documents. Only its
            direct entries are examined; sub-folders are not descended.

    Returns:
        A ``defaultdict(list)`` mapping each token to the list of file
        names that contain it. Each file name appears at most once per
        token, and the names are in sorted order.

    Raises:
        OSError: If ``directory`` cannot be listed or a document cannot
            be opened.
        UnicodeDecodeError: If a document is not valid UTF-8.
    """
    inverted_index = defaultdict(list)

    # os.listdir() returns entries in arbitrary order; sorting keeps the
    # posting lists (and anything serialized from them) reproducible.
    for filename in sorted(os.listdir(directory)):
        if not filename.endswith(".txt"):
            continue

        file_path = os.path.join(directory, filename)
        # A directory may also be named "*.txt"; it cannot be read as text.
        if not os.path.isfile(file_path):
            continue

        with open(file_path, 'r', encoding='utf-8') as file:
            content = file.read()

        # Each file is visited exactly once, so de-duplicating its own
        # tokens is enough to keep posting lists free of repeats, without
        # rescanning the list for every token occurrence. dict.fromkeys
        # keeps first-appearance order, so index keys stay deterministic.
        for token in dict.fromkeys(tokenize(content)):
            inverted_index[token].append(filename)

    return inverted_index

In [7]:
# Save the inverted index to a JSON file
def save_inverted_index(index, output_file):
    """Serialize ``index`` as indented JSON into ``output_file``.

    The target is opened for writing as UTF-8, replacing any existing
    content, and the mapping is dumped with a four-space indent.

    Args:
        index: JSON-serializable mapping to write.
        output_file: Path of the file to create or overwrite.
    """
    with open(output_file, mode='w', encoding='utf-8') as handle:
        json.dump(index, fp=handle, indent=4)

In [9]:
# Main process: build the index from the documents folder, write it to a
# JSON file, and report where it was saved.
if __name__ == "__main__":
    # Rebinds the module-level docs_directory assigned in the earlier config cell.
    docs_directory = "./docs"  # Update this to the correct path if needed
    # Relative path, so the file is created in the current working directory.
    output_file = "inverted_index.json"
    
    # Build the index (token -> list of file names) from the .txt documents
    index = build_inverted_index(docs_directory)
    
    # Save the index to a JSON file, overwriting any previous output
    save_inverted_index(index, output_file)
    print(f"Inverted index saved to {output_file}")

Inverted index saved to inverted_index.json
