# 03 - Indexing: Build Multi-Modal Search Indices

This notebook builds searchable indices over text chunks and table row-sentences, and sets up hierarchical, section-based retrieval.

**Objectives:**
- Create dense embeddings for text and table content
- Build BM25 keyword index for exact matching
- Implement hierarchical indexing (sections → content)
- Store metadata for filtering (company, year, section)
- Use FAISS for efficient vector search

In [12]:
import os
os.environ['KMP_DUPLICATE_LIB_OK'] = 'TRUE'  # Fix OpenMP conflict

import sys
import json
import pickle
import numpy as np
import pandas as pd
from pathlib import Path
from tqdm import tqdm
import faiss
from sentence_transformers import SentenceTransformer
from rank_bm25 import BM25Okapi

sys.path.append(str(Path.cwd().parent / 'src'))

from utils.config import PARSED_DATA_DIR, INDICES_DIR, MODEL_DIR
from retrieval.text_chunker import TextChunker
from retrieval.embedding_generator import EmbeddingGenerator
from retrieval.index_builder import IndexBuilder


## 1. Load Parsed Documents

In [13]:
# Read the parsing log and keep only the rows whose status is 'success'.
parsing_log = pd.read_csv(PARSED_DATA_DIR / 'parsing_log.csv')
is_success = parsing_log['status'] == 'success'
successful_parses = parsing_log[is_success]

print(f"Total parsed documents: {len(successful_parses)}")

# Unpickle every successfully parsed document, in log order.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only point this at pickles you trust.
all_documents = []
for parsed_path in successful_parses['parsed_file']:
    with open(parsed_path, 'rb') as fh:
        all_documents.append(pickle.load(fh))

print(f"Loaded {len(all_documents)} documents")

Total parsed documents: 1481
Loaded 1481 documents


## 2. Text Chunking Strategy

In [14]:
# Chunker configuration. Sizes are expressed in tokens; chunks are asked to
# respect sentence boundaries.
chunker = TextChunker(
    chunk_size=512,
    chunk_overlap=50,
    respect_sentence_boundaries=True,
)

# Parallel lists: chunk_metadata[i] describes text_chunks[i].
text_chunks = []
chunk_metadata = []

for doc in tqdm(all_documents, desc="Chunking text"):
    ticker = doc['metadata']['ticker']
    fiscal_year = doc['metadata']['fiscal_year']

    # Every section of the document is chunked independently.
    for section in doc['sections']:
        section_text = section['content']
        # NOTE(review): 'section_id' is read with .get(), so it is None for
        # sections that do not carry that key - confirm downstream code copes.
        section_meta = {
            'ticker': ticker,
            'fiscal_year': fiscal_year,
            'section_title': section['title'],
            'section_id': section.get('section_id'),
            'content_type': 'text',
        }

        # Each returned item is indexed by 'text' and 'metadata'.
        for piece in chunker.chunk_text(text=section_text, metadata=section_meta):
            text_chunks.append(piece['text'])
            chunk_metadata.append(piece['metadata'])

print(f"Total text chunks: {len(text_chunks)}")

Chunking text: 100%|██████████| 1481/1481 [00:00<00:00, 3365.47it/s]

Total text chunks: 17706





## 3. Prepare Table Row-Sentences

In [15]:
# Flatten the per-document table row-sentences into two parallel lists:
# table_metadata[i] describes table_sentences[i].
table_sentences = []
table_metadata = []

for doc in tqdm(all_documents, desc="Processing table sentences"):
    ticker = doc['metadata']['ticker']
    fiscal_year = doc['metadata']['fiscal_year']

    for row_sentence in doc['table_sentences']:
        table_sentences.append(row_sentence['sentence'])

        # 'section' is optional on the row-sentence object (None when absent);
        # 'table_id' and 'row_idx' are required.
        row_meta = dict(
            ticker=ticker,
            fiscal_year=fiscal_year,
            table_id=row_sentence['table_id'],
            section=row_sentence.get('section'),
            row_idx=row_sentence['row_idx'],
            content_type='table',
        )
        table_metadata.append(row_meta)

print(f"Total table sentences: {len(table_sentences)}")

Processing table sentences: 100%|██████████| 1481/1481 [00:00<00:00, 123029.59it/s]

Total table sentences: 15838





## 4. Create Section Abstracts for Hierarchical Retrieval

In [16]:
# One abstract per section: the section title followed by the first 200
# whitespace-separated words of its content.
# section_abstract_metadata[i] describes section_abstracts[i].
section_abstracts = []
section_abstract_metadata = []

for doc in tqdm(all_documents, desc="Creating section abstracts"):
    ticker = doc['metadata']['ticker']
    fiscal_year = doc['metadata']['fiscal_year']

    for section in doc['sections']:
        leading_words = section['content'].split()[:200]
        lead_text = " ".join(leading_words)

        # "<title>. <first words>"
        section_abstracts.append(". ".join((section['title'], lead_text)))
        section_abstract_metadata.append(dict(
            ticker=ticker,
            fiscal_year=fiscal_year,
            section_title=section['title'],
            section_id=section.get('section_id'),
            content_type='section_abstract',
        ))

print(f"Total section abstracts: {len(section_abstracts)}")

Creating section abstracts: 100%|██████████| 1481/1481 [00:00<00:00, 5046.17it/s]

Total section abstracts: 14810





## 5. Generate Embeddings

In [17]:
# DEBUG: inspect the first parsed document, then report the sizes of the
# collections built by the earlier cells.
print("=== Debugging Parsed Documents ===")
if not all_documents:
    print("No documents loaded!")
else:
    sample_doc = all_documents[0]
    print("\nSample document structure:")
    print(f"Keys: {sample_doc.keys()}")
    print(f"\nMetadata: {sample_doc.get('metadata', {})}")

    # Each of these keys is expected to hold a list; a missing key counts as 0.
    for key in ('sections', 'tables', 'table_sentences'):
        print(f"Number of {key}: {len(sample_doc.get(key, []))}")

    sections = sample_doc.get('sections')
    if sections:
        first_section = sections[0]
        print(f"\nFirst section keys: {first_section.keys()}")
        print(f"First section title: {first_section.get('title', 'N/A')}")
        print(f"First section content length: {len(first_section.get('content', ''))}")

print("\nAfter processing:")
for label, collection in (
    ("text chunks", text_chunks),
    ("section abstracts", section_abstracts),
    ("table sentences", table_sentences),
):
    print(f"Total {label}: {len(collection)}")


=== Debugging Parsed Documents ===

Sample document structure:
Keys: dict_keys(['metadata', 'sections', 'tables', 'table_sentences', 'figures'])

Metadata: {'ticker': 'MMM', 'fiscal_year': 2024, 'file_format': 'html', 'file_path': 'c:\\Users\\anand\\Desktop\\SEM 3\\CS 582\\Proj\\data\\raw\\MMM\\MMM_2024_10K.html', 'num_pages': None, 'num_sections': 10, 'num_tables': 10}
Number of sections: 10
Number of tables: 10
Number of table_sentences: 0

First section keys: dict_keys(['title', 'content'])
First section title: Section 1
First section content length: 5000

After processing:
Total text chunks: 17706
Total section abstracts: 14810
Total table sentences: 15838


In [18]:
# Embedding model shared by all three corpora.
# NOTE(review): the original comment describes this model as chosen "for
# financial text"; nothing in this notebook verifies that - confirm the choice.
embedding_model_name = "sentence-transformers/all-mpnet-base-v2"
print(f"Loading embedding model: {embedding_model_name}")

# NOTE(review): the device is hard-coded; switch to 'cpu' when no GPU is
# available.
embedding_generator = EmbeddingGenerator(
    model_name=embedding_model_name,
    batch_size=32,
    device='cuda',
)

print("\nGenerating embeddings...")


def _embed_corpus(banner, texts):
    """Print the progress banner, then encode `texts` with the shared generator."""
    print(banner)
    return embedding_generator.encode(texts, show_progress=True)


# Stage A retrieval: section abstracts.
section_embeddings = _embed_corpus("1. Section abstracts...", section_abstracts)

# Stage B retrieval: text chunks.
text_embeddings = _embed_corpus("2. Text chunks...", text_chunks)

# Stage B retrieval: table sentences.
table_embeddings = _embed_corpus("3. Table sentences...", table_sentences)

print(f"\nEmbedding dimensions: {text_embeddings.shape[1]}")

Loading embedding model: sentence-transformers/all-mpnet-base-v2
Loading embedding model: sentence-transformers/all-mpnet-base-v2
Model loaded on cuda

Generating embeddings...
1. Section abstracts...


Batches:   0%|          | 0/463 [00:00<?, ?it/s]

2. Text chunks...


Batches:   0%|          | 0/554 [00:00<?, ?it/s]

3. Table sentences...


Batches:   0%|          | 0/495 [00:00<?, ?it/s]


Embedding dimensions: 768


## 6. Build FAISS Indices

In [19]:
# Build one exact (flat, L2) FAISS index per corpus.
print("Building FAISS indices...")


def build_flat_l2_index(embeddings, dimension, label, empty_warning):
    """Create a `faiss.IndexFlatL2` and fill it with `embeddings` when possible.

    Args:
        embeddings: Array-like of shape (n, dimension). Anything that is not a
            non-empty 2-D matrix (e.g. an empty encode result) is treated as
            "nothing to index".
        dimension: Vector dimensionality the index is created with.
        label: Prefix for the success message, e.g. "Section index".
        empty_warning: Message printed when there is nothing to index.

    Returns:
        The index; it is empty (ntotal == 0) when no vectors were added.
    """
    # FAISS expects a C-contiguous float32 matrix; this also accepts plain
    # lists, which have no `.shape` attribute.
    matrix = np.ascontiguousarray(embeddings, dtype='float32')
    index = faiss.IndexFlatL2(dimension)
    if matrix.ndim == 2 and matrix.shape[0] > 0:
        index.add(matrix)
        print(f"{label}: {index.ntotal} vectors")
    else:
        print(empty_warning)
    return index


# Take the dimensionality from the first embedding matrix that is 2-D (text
# first, as before); 768 is only the last-resort default.
_embedding_shapes = [
    np.shape(matrix)
    for matrix in (text_embeddings, section_embeddings, table_embeddings)
]
dimension = next(
    (int(shape[1]) for shape in _embedding_shapes if len(shape) == 2),
    768,
)

# Section abstract index (Stage A)
section_index = build_flat_l2_index(
    section_embeddings,
    dimension,
    "Section index",
    "Warning: No section embeddings to index",
)

# Text chunk index (Stage B)
text_index = build_flat_l2_index(
    text_embeddings,
    dimension,
    "Text chunk index",
    "Warning: No text embeddings to index",
)

# Table sentence index (Stage B)
table_index = build_flat_l2_index(
    table_embeddings,
    dimension,
    "Table sentence index",
    "Warning: No table embeddings to index (no table sentences extracted)",
)


Building FAISS indices...
Section index: 14810 vectors
Text chunk index: 17706 vectors
Table sentence index: 15838 vectors


## 7. Build BM25 Keyword Indices

In [20]:
def simple_tokenize(text):
    """Tokenize `text` for BM25: lowercase it, then split on whitespace.

    Punctuation is not stripped, so it stays attached to the neighbouring
    token. Returns a list of strings (empty for empty or blank input).
    """
    lowered = text.lower()
    return lowered.split()

print("Building BM25 indices...")

# Keyword index over the text chunks. BM25Okapi is only constructed when
# there is at least one document; otherwise the handle is set to None.
if not text_chunks:
    print("Warning: No text chunks to index for BM25")
    text_bm25 = None
else:
    text_tokens = list(map(simple_tokenize, text_chunks))
    text_bm25 = BM25Okapi(text_tokens)
    print(f"Text BM25 index: {len(text_tokens)} documents")

# Keyword index over the table row-sentences, with the same empty-corpus guard.
if not table_sentences:
    print("Warning: No table sentences to index for BM25")
    table_bm25 = None
else:
    table_tokens = list(map(simple_tokenize, table_sentences))
    table_bm25 = BM25Okapi(table_tokens)
    print(f"Table BM25 index: {len(table_tokens)} documents")


Building BM25 indices...
Text BM25 index: 17706 documents
Table BM25 index: 15838 documents


## 8. Save Indices and Metadata

In [21]:
# Make sure the output directory exists before anything is written to it.
INDICES_DIR.mkdir(parents=True, exist_ok=True)

print("Saving indices...")

# FAISS indices: one file per index.
for faiss_index, filename in (
    (section_index, "section_index.faiss"),
    (text_index, "text_index.faiss"),
    (table_index, "table_index.faiss"),
):
    faiss.write_index(faiss_index, str(INDICES_DIR / filename))


def _pickle_to_indices_dir(filename, payload):
    """Pickle `payload` into INDICES_DIR / `filename`."""
    with open(INDICES_DIR / filename, 'wb') as fh:
        pickle.dump(payload, fh)


# Raw content plus metadata for each corpus. The BM25 objects (or None when a
# corpus was empty) are stored alongside the texts they index.
_pickle_to_indices_dir("section_data.pkl", {
    'abstracts': section_abstracts,
    'metadata': section_abstract_metadata,
})
_pickle_to_indices_dir("text_data.pkl", {
    'chunks': text_chunks,
    'metadata': chunk_metadata,
    'bm25': text_bm25,
})
_pickle_to_indices_dir("table_data.pkl", {
    'sentences': table_sentences,
    'metadata': table_metadata,
    'bm25': table_bm25,
})

# Human-readable summary of how the indices were built.
# NOTE(review): chunk_size / chunk_overlap are literals here; keep them in
# sync with the values passed to TextChunker.
index_config = dict(
    embedding_model=embedding_model_name,
    embedding_dimension=dimension,
    num_sections=len(section_abstracts),
    num_text_chunks=len(text_chunks),
    num_table_sentences=len(table_sentences),
    chunk_size=512,
    chunk_overlap=50,
)

with open(INDICES_DIR / "index_config.json", 'w') as fh:
    json.dump(index_config, fh, indent=2)

print("\n=== Indexing Complete ===")
for label, collection in (
    ("Section abstracts", section_abstracts),
    ("Text chunks", text_chunks),
    ("Table sentences", table_sentences),
):
    print(f"{label} indexed: {len(collection)}")
print(f"\nIndices saved to: {INDICES_DIR}")

Saving indices...

=== Indexing Complete ===
Section abstracts indexed: 14810
Text chunks indexed: 17706
Table sentences indexed: 15838

Indices saved to: c:\Users\anand\Desktop\SEM 3\CS 582\Proj\indices


## 9. Test Retrieval

In [22]:
# Smoke-test the dense indices with a few sample queries.
test_queries = [
    "R&D expense in 2024",
    "long-term debt to equity ratio",
    "operating segment revenue growth"
]

TOP_K = 3  # number of hits shown per index


def search_valid_ids(index, query_vector, k):
    """Return the ids of the nearest neighbours that actually exist.

    FAISS pads the result with label -1 when the index holds fewer than `k`
    vectors (including an empty index). Used as a Python list index, -1 would
    silently select the *last* element, so those placeholders are dropped.

    Args:
        index: FAISS index to query.
        query_vector: float32 array of shape (1, dimension).
        k: Maximum number of neighbours to return.

    Returns:
        List of int ids, best match first; may be shorter than `k`.
    """
    _, labels = index.search(query_vector, k=k)
    return [int(label) for label in labels[0] if label >= 0]


print("=== Testing Retrieval ===")

for query in test_queries:
    print(f"\nQuery: '{query}'")

    # Encode once and reuse the (1, dimension) float32 vector for every index.
    query_embedding = embedding_generator.encode([query])[0]
    query_vector = query_embedding.reshape(1, -1).astype('float32')

    # Stage A: section abstracts.
    print("  Top sections:")
    section_hits = search_valid_ids(section_index, query_vector, TOP_K)
    for rank, idx in enumerate(section_hits, start=1):
        meta = section_abstract_metadata[idx]
        print(f"    {rank}. {meta['ticker']} {meta['fiscal_year']} - {meta['section_title'][:60]}...")

    # Stage B: table sentences - only if any were extracted.
    print("  Top table sentences:")
    if len(table_sentences) > 0:
        table_hits = search_valid_ids(table_index, query_vector, TOP_K)
        for rank, idx in enumerate(table_hits, start=1):
            print(f"    {rank}. {table_sentences[idx][:80]}...")
    else:
        print("    (No table sentences available - table parsing not implemented)")

    # Stage B: text chunks.
    print("  Top text chunks:")
    text_hits = search_valid_ids(text_index, query_vector, TOP_K)
    for rank, idx in enumerate(text_hits, start=1):
        meta = chunk_metadata[idx]
        # Newlines are flattened so each preview stays on one line.
        chunk_preview = text_chunks[idx][:100].replace('\n', ' ')
        print(f"    {rank}. [{meta['ticker']} {meta['fiscal_year']} - {meta['section_title']}]")
        print(f"       {chunk_preview}...")

=== Testing Retrieval ===

Query: 'R&D expense in 2024'
  Top sections:
    1. STX 2023 - Section 2...
    2. HPE 2023 - Section 10...
    3. STE 2023 - Section 10...
  Top table sentences:
    1. 2024:  $ 3,598    4.1 %...
    2. March 1 to March 31, 2024:   —   $  —     —   $  400.0 ...
    3. Research and development expense:   109,181    87,581    219,112    82,310    10...
  Top text chunks:
    1. [SWKS 2023 - Section 10]
       new or emerging technologies and changes in customer requirements, or may be able to devote greater ...
    2. [NVDA 2023 - Section 2]
       2023-01-29 0001045810 us-gaap:CostOfSalesMember 2022-01-31 2023-01-29 0001045810 us-gaap:CostOfSales...
    3. [BALL 2024 - Section 2]
       2022-12-31 0000009389 us-gaap:NoncontrollingInterestMember 2022-12-31 0000009389 us-gaap:RetainedEar...

Query: 'long-term debt to equity ratio'
  Top sections:
    1. CRM 2024 - Section 3...
    2. APA 2022 - Section 1...
    3. CRM 2023 - Section 3...
  Top table sentences:


## Next Steps

Proceed to **04_retrieval.ipynb** to implement the full hierarchical retrieval pipeline with hybrid search.