# In [23]:
# ----------------------------------------------------------------------
## 1. Setup and Imports (Following Lab Solution Structure)
## ----------------------------------------------------------------------
import nltk
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
# Note: imports used by other cells (math, numpy, json, time) are assumed to be present elsewhere.
from collections import defaultdict
from array import array 
import re 

# --- NLTK Setup (Necessary for Point 1) ---
# build_terms() needs the NLTK stop-word corpus. nltk.data.find() signals a
# missing resource with LookupError, so that is the exception handled here;
# when the corpus is absent it is downloaded once.
try:
    # Check if stopwords are already downloaded
    nltk.data.find('corpora/stopwords')
except LookupError:
    print("Downloading nltk stopwords...")
    nltk.download('stopwords')

# Placeholder for DOCS variable after successful loading (28080 docs)
# DOCS = [...] 

# ----------------------------------------------------------------------
## 2. Enhanced Pre-processing Function (Point 1)
## ----------------------------------------------------------------------

# Punctuation is anything that is neither a word character nor whitespace,
# plus the underscore: the regex class \w matches "_", so it has to be listed
# explicitly or it would survive punctuation removal.
_PUNCTUATION_RE = re.compile(r'[^\w\s]|_')

# Lazily created (stemmer, stop-word set) pair shared by all build_terms()
# calls, so the stop-word corpus is read only once instead of once per call.
_TEXT_TOOLS = None


def _get_text_tools():
    """Return the shared ``(stemmer, stop_words)`` pair, creating it on first use."""
    global _TEXT_TOOLS
    if _TEXT_TOOLS is None:
        _TEXT_TOOLS = (PorterStemmer(), frozenset(stopwords.words("english")))
    return _TEXT_TOOLS


def build_terms(line):
    """
    Preprocess a text string into a list of index terms.

    Steps, in order: lowercasing, punctuation removal (underscore included),
    whitespace tokenization, English stop word removal, Porter stemming, and
    a final filter that drops stems of length 1 and stems made only of digits.

    Args:
        line: The raw text to process. ``None`` or an empty string yields
            an empty list.

    Returns:
        The list of stemmed terms, in their original order (duplicates kept,
        so list positions can be used as term positions).
    """
    if not line:
        return []

    # Uses PorterStemmer for consistency with the provided lab solution.
    stemmer, stop_words = _get_text_tools()

    # 1-3. Lowercase, replace punctuation with spaces, tokenize on whitespace.
    tokens = _PUNCTUATION_RE.sub(' ', line.lower()).split()

    terms = []
    for token in tokens:
        # 4. Stop words are matched on the raw token, i.e. before stemming.
        if token in stop_words:
            continue
        # 5. Stemming
        stem = stemmer.stem(token)
        # BONUS: drop single-character and pure-digit terms.
        if len(stem) > 1 and not stem.isdigit():
            terms.append(stem)
    return terms

# ----------------------------------------------------------------------
## 3. Helper Function for Complex Fields (product_details)
## ----------------------------------------------------------------------

def get_product_details_string(product_details_list):
    """
    Flatten a list of product-detail dictionaries into one space-separated string.

    Only the dictionary *values* are kept (e.g. 'Cotton Blend'); the keys are
    ignored. Each value is converted with ``str()``. Entries of the list that
    are not dictionaries are skipped, and any argument that is not a ``list``
    produces an empty string.
    """
    if not isinstance(product_details_list, list):
        return ""

    pieces = []
    for entry in product_details_list:
        if not isinstance(entry, dict):
            continue
        pieces.extend(str(field_value) for field_value in entry.values())
    return " ".join(pieces)


# ----------------------------------------------------------------------
## 4. Integrated Index Creation Function (Points 1, 2, 3, & 4 Combined)
## ----------------------------------------------------------------------

def _field_text(doc, key):
    """Return ``doc[key]`` as a string; a missing key or a ``None`` value gives ''."""
    value = doc.get(key)
    return "" if value is None else str(value)


def _add_postings(index, df, pid, terms):
    """Add one positional posting per distinct term of ``terms`` to ``index`` and bump ``df``."""
    postings = {}
    for position, term in enumerate(terms):
        if term not in postings:
            postings[term] = [pid, array('I', [])]
        postings[term][1].append(position)

    for term, posting in postings.items():
        index[term].append(posting)
        df[term] += 1  # one more document contains this term


def create_multi_field_index(DOCS):
    """
    Build the multi-field positional inverted index for a document collection.

    Two separate indexes are built per document:

    * the text index, from ``title``, ``description`` and the values of
      ``product_details``;
    * the category index, from ``brand``, ``category`` and ``sub_category``.

    Numeric/status fields (e.g. ``out_of_stock``, ``selling_price``,
    ``average_rating``) are not indexed as terms; they remain available
    through the returned document store.

    Documents without a truthy ``pid`` are skipped. If a ``pid`` occurs more
    than once, only its first document is stored and indexed, so the document
    store, the postings and the document frequencies stay consistent.
    Field values that are ``None`` are treated as empty text and non-string
    values are converted with ``str()``.

    Args:
        DOCS: Iterable of document dictionaries.

    Returns:
        A tuple ``(text_index, category_index, doc_store, df_text, df_category)``:

        * ``text_index`` / ``category_index``: ``defaultdict(list)`` mapping
          term -> list of ``[pid, array('I', positions)]`` postings.
        * ``doc_store``: dict mapping pid -> original document.
        * ``df_text`` / ``df_category``: ``defaultdict(int)`` mapping
          term -> number of documents containing it in that field group.
    """
    # Core Inverted Index structures (Point 3: Multi-Field Indexing)
    text_index = defaultdict(list)
    category_index = defaultdict(list)

    # Document Store for Full Retrieval (Point 2 and Hint 3: PID preservation)
    doc_store = {}

    # Document frequencies, needed later for the TF-IDF computation.
    df_text = defaultdict(int)
    df_category = defaultdict(int)

    for doc in DOCS:
        pid = doc.get('pid')
        # Skip documents without a PID, and repeated PIDs: indexing a PID
        # twice would add a second posting and inflate DF while doc_store
        # can only hold one document for it.
        if not pid or pid in doc_store:
            continue

        # --- Point 2: Document Store (PID Mapping) ---
        doc_store[pid] = doc

        # --- Point 3: Strategic Field Combination for Indexing ---
        # 1. Main Text for TEXT_INDEX (Content & Description)
        main_text = " ".join((
            _field_text(doc, 'title'),
            _field_text(doc, 'description'),
            get_product_details_string(doc.get('product_details', [])),
        ))

        # 2. Categorical Text for CATEGORY_INDEX (Attributes & Brand)
        # 'seller' is deliberately left out of the indexed fields.
        category_text = " ".join((
            _field_text(doc, 'brand'),
            _field_text(doc, 'category'),
            _field_text(doc, 'sub_category'),
        ))

        # --- Point 1/3: Process and Build Indexes ---
        _add_postings(text_index, df_text, pid, build_terms(main_text))
        _add_postings(category_index, df_category, pid, build_terms(category_text))

    print("Indexing complete. Created Indexing Structures:")
    print(f"- Documents stored for full retrieval: {len(doc_store)}")
    print(f"- Main Text Index unique terms: {len(text_index)}")
    print(f"- Category Index unique terms: {len(category_index)}")

    # Return the indexes and DF (required for subsequent TF/IDF calculation)
    return text_index, category_index, doc_store, df_text, df_category