<a href="https://colab.research.google.com/github/AbinayaVina1/Scalable-Indexing/blob/main/_IRT_EX7_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Scalable indexing: build a Whoosh full-text index over a directory of text files and query it.

!pip install whoosh

import ast
import os
import re
from collections import Counter

from whoosh.fields import Schema, TEXT, ID
from whoosh.index import create_in, exists_in, open_dir
from whoosh.qparser import QueryParser

# Custom stopword set (replace with your own); tokens found here are dropped
# before stemming.
STOPWORDS = {
    "a", "an", "and", "for", "in", "is", "of", "on", "the", "to", "with",
}

# Simple stemming function (without NLTK)
def simple_stem(word):
    """Strip the first matching common suffix from *word*.

    Suffixes are tried in the fixed order ``ing``, ``ly``, ``ed``, ``es``,
    ``s`` and at most one is removed. A suffix is only stripped when at least
    one character would remain, so a token that consists solely of a suffix
    (for example a lone ``"s"``) is never reduced to an empty string.

    Args:
        word: A single token.

    Returns:
        The token with one suffix removed, or the token unchanged when no
        suffix applies.
    """
    suffixes = ('ing', 'ly', 'ed', 'es', 's')
    for suffix in suffixes:
        # Require a non-empty stem; otherwise "s" or "ing" would become "".
        if len(word) > len(suffix) and word.endswith(suffix):
            return word[:-len(suffix)]
    return word

# Define a simple text processing function
def preprocess_text(text):
    """Lowercase *text*, tokenize it, drop stopwords and stem what is left.

    Returns the stemmed tokens as a list, in order of appearance.
    """
    stemmed_tokens = []
    # \w+ splits on anything that is not a word character.
    for token in re.findall(r'\w+', text.lower()):
        if token in STOPWORDS:
            continue
        stemmed_tokens.append(simple_stem(token))
    return stemmed_tokens

# Whoosh schema for one indexed file. The term-frequency table is kept as a
# stored TEXT value (a stringified dict), not as a numeric field.
schema = Schema(
    doc_id=ID(stored=True, unique=True),  # unique key for each document
    title=TEXT(stored=True),              # returned with search hits
    content=TEXT,                         # field the queries are parsed against
    term_freq=TEXT(stored=True),          # stringified {term: count} mapping
)

# Create the index directory if needed, then open the index stored in it, or
# create a fresh one when the directory does not contain an index yet.
index_dir = "indexdir"  # Use a relative path for portability
os.makedirs(index_dir, exist_ok=True)

# Check explicitly rather than wrapping open_dir() in a bare `except:`, which
# would also swallow KeyboardInterrupt and genuine index errors and then
# silently start a new index.
if exists_in(index_dir):
    index = open_dir(index_dir)
else:
    index = create_in(index_dir, schema)

# Indexing Process
# corpus_dir is the directory whose .txt files get indexed (not a single file).
corpus_dir = "/content/doc2"

# Make sure the corpus directory exists, creating intermediate directories
# as needed; exist_ok avoids a separate check-then-create step.
os.makedirs(corpus_dir, exist_ok=True)

# Seed the corpus with a small sample file when it is missing. It is written
# as UTF-8 because the corpus files are read back with encoding="utf-8".
dummy_file_path = os.path.join(corpus_dir, "ty.txt")
if not os.path.exists(dummy_file_path):
    with open(dummy_file_path, "w", encoding="utf-8") as f:
        f.write("This is a sample document for demonstration.")

# Index every .txt file in the corpus together with its term-frequency table.
with index.writer() as writer:
    for filename in os.listdir(corpus_dir):
        if not filename.endswith(".txt"):
            continue

        # Read the whole file, then release the handle before processing.
        with open(os.path.join(corpus_dir, filename), "r", encoding="utf-8") as file:
            content = file.read()
        processed_content = preprocess_text(content)

        # Count occurrences of each processed term. dict() keeps the stored
        # string a plain dict literal rather than "Counter({...})".
        term_freq = dict(Counter(processed_content))

        # update_document replaces any document that already has this unique
        # doc_id, so re-running against an existing index does not add
        # duplicate copies of the same file.
        writer.update_document(
            title=filename,
            content=" ".join(processed_content),
            doc_id=filename,
            term_freq=str(term_freq),  # stored as a string, parsed back at search time
        )

print("Indexing Complete!")

# Search Function with unique results (by doc_id) and term frequency of the query
def search_index(query_text):
    """Search the index and print each matching document once.

    The query goes through the same preprocessing as the indexed content and
    is parsed against the ``content`` field. For every hit with a doc_id not
    seen before, the title, the doc_id and the summed stored term frequency
    of all query terms in that document are printed.

    Args:
        query_text: Raw query string.

    Returns:
        None. Results are written to stdout.
    """
    query_terms = preprocess_text(query_text)  # Apply the same preprocessing
    query_label = " ".join(query_terms)
    query = QueryParser("content", index.schema).parse(query_label)

    # Track doc_ids already printed so each document appears only once.
    seen_doc_ids = set()
    print("\nSearch Results:")

    # The context manager closes the searcher when the loop is done.
    with index.searcher() as searcher:
        for result in searcher.search(query):
            doc_id = result['doc_id']
            if doc_id in seen_doc_ids:
                continue
            seen_doc_ids.add(doc_id)

            # The table was stored as str(dict); literal_eval parses it back
            # without executing arbitrary code the way eval() would.
            term_freq = ast.literal_eval(result['term_freq'])

            # Sum the occurrences of every query term in this document.
            query_term_freq = sum(term_freq.get(word, 0) for word in query_terms)

            print(f"\nTitle: {result['title']} (Doc ID: {doc_id})")
            # Label with all query terms, since the count covers all of them.
            print(f"{query_label} : Term occurrances in this document: {query_term_freq}")


# Demo run: look up one term (swap in any query you like).
search_query = "story"
search_index(query_text=search_query)

Indexing Complete!

Search Results:

Title: doc19.txt (Doc ID: doc19.txt)
story : Term occurrances in this document: 2

Title: doc17.txt (Doc ID: doc17.txt)
story : Term occurrances in this document: 2

Title: doc18.txt (Doc ID: doc18.txt)
story : Term occurrances in this document: 2

Title: doc20.txt (Doc ID: doc20.txt)
story : Term occurrances in this document: 2

Title: doc15.txt (Doc ID: doc15.txt)
story : Term occurrances in this document: 2

Title: doc14.txt (Doc ID: doc14.txt)
story : Term occurrances in this document: 2

Title: doc13.txt (Doc ID: doc13.txt)
story : Term occurrances in this document: 2

Title: doc12.txt (Doc ID: doc12.txt)
story : Term occurrances in this document: 2

Title: doc8.txt (Doc ID: doc8.txt)
story : Term occurrances in this document: 2

Title: doc5.txt (Doc ID: doc5.txt)
story : Term occurrances in this document: 2
