# Document Chunking and Indexing

Learn how to split documents into chunks, embed them, and build a searchable vector index for retrieval-augmented generation (RAG).

In [None]:
from sentence_transformers import SentenceTransformer
from langchain_text_splitters import RecursiveCharacterTextSplitter
import faiss
import numpy as np
import json

## Load Documents

In [None]:
# Load sample documents.
# JSON is UTF-8 by specification; pass the encoding explicitly so the read does
# not depend on the platform's locale default (e.g. cp1252 on Windows).
with open('../fixtures/input/documents.json', 'r', encoding='utf-8') as f:
    documents = json.load(f)

# Quick sanity check: how many documents were loaded, plus a preview of the
# first one (each document is read here via its 'source' and 'content' keys).
print(f"Loaded {len(documents)} documents")
print(f"\nFirst document:")
print(f"  Source: {documents[0]['source']}")
print(f"  Length: {len(documents[0]['content'])} chars")
print(f"  Content preview: {documents[0]['content'][:200]}...")

## Chunking Strategies

### 1. Fixed-Size Chunks (Naive)

In [None]:
# Simple fixed-size chunking
def fixed_size_chunk(text: str, size: int = 500) -> list:
    """Split text into consecutive chunks of at most ``size`` characters.

    Chunks are cut purely by character position, so a cut can land in the
    middle of a word or sentence. The last chunk may be shorter than ``size``.

    Args:
        text: The string to split.
        size: Maximum number of characters per chunk; must be at least 1.

    Returns:
        The chunks in order. Joining them reproduces ``text``. An empty
        ``text`` yields an empty list.

    Raises:
        ValueError: If ``size`` is less than 1.
    """
    # A negative step would make range() empty and silently drop all the text;
    # fail loudly instead (a zero step already raised ValueError).
    if size < 1:
        raise ValueError(f"size must be a positive integer, got {size}")
    return [text[start:start + size] for start in range(0, len(text), size)]

# Run the naive splitter on the first document's content
text = documents[0]['content']
chunks = fixed_size_chunk(text, size=300)

print(f"Created {len(chunks)} chunks\n")
# Number chunks from 1 and show only the first 100 characters of each
for number, piece in enumerate(chunks, start=1):
    preview = piece[:100]
    print(f"Chunk {number} ({len(piece)} chars):")
    print(f"  {preview}...\n")

print("⚠️  Problem: Can split mid-sentence!")

### 2. Recursive Character Splitting (Better)

In [None]:
# Boundary-aware chunking: the splitter tries each separator in order,
# falling back to the next one only when a piece is still too long.
splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", ". ", " ", ""],  # blank line, newline, sentence end, space, character
    chunk_overlap=100,  # characters shared by neighbouring chunks to preserve context
    chunk_size=500,
)

chunks = splitter.split_text(text)

print(f"Created {len(chunks)} chunks\n")
# Number chunks from 1 and show only the first 100 characters of each
for number, piece in enumerate(chunks, start=1):
    preview = piece[:100]
    print(f"Chunk {number} ({len(piece)} chars):")
    print(f"  {preview}...\n")

print("✓ Splits at natural boundaries", "✓ Overlap preserves context", sep="\n")

## Chunk Overlap Visualization

In [None]:
# Show overlap between consecutive chunks.
# chunk_size must be smaller than the sample text (under 140 characters);
# otherwise the splitter returns a single chunk and there is no overlap to show.
splitter = RecursiveCharacterTextSplitter(
    chunk_size=80,
    chunk_overlap=20
)

sample_text = "Python is a programming language. It was created by Guido van Rossum. Python emphasizes code readability. It has a large standard library."

chunks = splitter.split_text(sample_text)

print("Chunks with overlap:\n")
for i, chunk in enumerate(chunks):
    print(f"Chunk {i+1}: '{chunk}'")

    if i < len(chunks) - 1:
        next_chunk = chunks[i+1]
        # Find the longest suffix of this chunk that is also a prefix of the
        # next one. Scan from the longest candidate (the full length of the
        # shorter chunk, inclusive) downwards and stop at the first match.
        overlap = ""
        for j in range(min(len(chunk), len(next_chunk)), 0, -1):
            if chunk.endswith(next_chunk[:j]):
                overlap = next_chunk[:j]
                break

        if overlap:
            # Add an ellipsis only when the preview is actually truncated
            ellipsis = "..." if len(overlap) > 50 else ""
            print(f"  Overlap with next: '{overlap[:50]}{ellipsis}'")
    print()

## Chunk All Documents

In [None]:
from collections import Counter

# Chunk all documents and track metadata so every chunk can be traced back
# to the document it came from.
splitter = RecursiveCharacterTextSplitter(
    chunk_size=600,
    chunk_overlap=120
)

all_chunks = []

for doc in documents:
    doc_chunks = splitter.split_text(doc['content'])

    # chunk_id is the chunk's position within its own document (0-based)
    all_chunks.extend(
        {
            'text': chunk_text,
            'source': doc['source'],
            'doc_id': doc['doc_id'],
            'chunk_id': i,
            'metadata': doc['metadata']
        }
        for i, chunk_text in enumerate(doc_chunks)
    )

print(f"Total chunks: {len(all_chunks)}")
print(f"\nChunks per document:")
# Tally once instead of rescanning all_chunks for every document.
# Assumes doc_id values are hashable (e.g. strings or ints from the JSON).
chunks_per_doc = Counter(c['doc_id'] for c in all_chunks)
for doc in documents:
    # Counter returns 0 for a doc_id with no chunks, matching a manual count
    print(f"  {doc['source']}: {chunks_per_doc[doc['doc_id']]} chunks")

## Generate Embeddings

In [None]:
# Load the sentence-embedding model
embed_model = SentenceTransformer('all-MiniLM-L6-v2')

# Encode every chunk's text in a single call; vectors are normalized by the
# model and then cast to float32.
chunk_texts = [entry['text'] for entry in all_chunks]
embeddings = embed_model.encode(
    chunk_texts,
    show_progress_bar=True,
    normalize_embeddings=True,
).astype(np.float32)

print(
    f"\nEmbeddings shape: {embeddings.shape}",
    f"Dimension: {embeddings.shape[1]}",
    f"dtype: {embeddings.dtype}",
    sep="\n",
)

## Build FAISS Index

In [None]:
# Build a flat inner-product index whose width matches the embedding vectors.
# The embeddings were encoded with normalize_embeddings=True, which is why
# inner product is the metric used here.
dimension = embeddings.shape[1]
index = faiss.IndexFlatIP(dimension)

# Store all chunk vectors; row order matches the order of the embeddings array
index.add(embeddings)

print(
    "✓ Index created",
    f"  Total vectors: {index.ntotal}",
    f"  Dimension: {index.d}",
    sep="\n",
)

## Test Retrieval

In [None]:
def search(query: str, k: int = 5):
    """Return up to ``k`` chunks most similar to ``query``.

    The query is embedded with the notebook-level ``embed_model`` (normalized,
    cast to float32, reshaped to a single row) and looked up in the
    notebook-level FAISS ``index``. Hits are mapped back to ``all_chunks`` by
    row position.

    Args:
        query: Search text to embed.
        k: Maximum number of results to return.

    Returns:
        A list of dicts with 'text', 'source', 'score' (float) and 'metadata'
        keys, in the order FAISS returns them. The list is shorter than ``k``
        when the index holds fewer than ``k`` vectors.
    """
    # Embed query as a (1, dim) float32 row, the layout index.search expects
    query_emb = embed_model.encode(
        query,
        normalize_embeddings=True
    ).astype('float32').reshape(1, -1)

    # Search
    scores, indices = index.search(query_emb, k)

    # Format results
    results = []
    for score, idx in zip(scores[0], indices[0]):
        # FAISS pads the result with label -1 when it finds fewer than k
        # neighbours; all_chunks[-1] would silently return the last chunk,
        # so skip the padding instead.
        if idx < 0:
            continue
        chunk = all_chunks[int(idx)]
        results.append({
            'text': chunk['text'],
            'source': chunk['source'],
            'score': float(score),
            'metadata': chunk['metadata']
        })

    return results

# Sample questions to run against the index
queries = [
    "What is Python?",
    "How much does the Professional plan cost?",
    "What was Q4 revenue?",
]

for query in queries:
    # Header: the query followed by a 60-character divider
    print(f"\nQuery: {query}", "=" * 60, sep="\n")

    results = search(query, k=3)

    # Ranked hits: score to 3 decimals, source, then a 150-character preview
    for rank, hit in enumerate(results, start=1):
        snippet = hit['text'][:150]
        print(f"\n{rank}. [{hit['score']:.3f}] {hit['source']}")
        print(f"   {snippet}...")

## Analyze Chunk Sizes

In [None]:
import matplotlib.pyplot as plt

# Character length of every chunk
chunk_lengths = [len(entry['text']) for entry in all_chunks]

# Summary statistics
print(
    "Chunk length statistics:",
    f"  Min: {min(chunk_lengths)} chars",
    f"  Max: {max(chunk_lengths)} chars",
    f"  Mean: {np.mean(chunk_lengths):.0f} chars",
    f"  Median: {np.median(chunk_lengths):.0f} chars",
    sep="\n",
)

# Histogram of chunk lengths, with a dashed red line at 600 characters
# (the chunk_size the documents were split with)
fig, ax = plt.subplots(figsize=(10, 4))
ax.hist(chunk_lengths, bins=20, edgecolor='black')
ax.set_xlabel('Chunk Length (characters)')
ax.set_ylabel('Frequency')
ax.set_title('Distribution of Chunk Lengths')
ax.axvline(600, color='r', linestyle='--', label='Target size')
ax.legend()
plt.show()

## Save Index

In [None]:
import os

# Output locations
output_dir = '../output'
index_path = '../output/documents.index'
chunks_path = '../output/chunks.json'

# Make sure the output directory exists (no error if it already does)
os.makedirs(output_dir, exist_ok=True)

# Persist the FAISS index
faiss.write_index(index, index_path)

# Persist the chunk records as JSON next to the index; row i of the index
# was added from all_chunks[i], so the two files are used together
with open(chunks_path, 'w') as chunks_file:
    json.dump(all_chunks, chunks_file, indent=2)

print(
    "✓ Saved index and chunks",
    f"  Index: {index_path}",
    f"  Chunks: {chunks_path}",
    sep="\n",
)

## Summary

✅ Learned chunking strategies  
✅ Understood chunk overlap importance  
✅ Generated embeddings for chunks  
✅ Built FAISS index  
✅ Tested basic retrieval  
✅ Saved index for reuse

**Key takeaways:**
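- Use RecursiveCharacterTextSplitter to split at natural text boundaries (paragraph and line breaks first, then smaller separators) instead of at arbitrary character positions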
- An overlap of about 20% of the chunk size (e.g. 120 of 600 characters) reduces context loss at chunk boundaries
- Track metadata (source, chunk_id) for citations
- Normalize embeddings so that inner-product search (`IndexFlatIP`) ranks results by cosine similarity

**Next:** Learn RAG generation with LLMs!