# RAG pipeline (Cloudflare Workers AI Embeddings + Local ChromaDB)

This notebook ingests PDF documents, chunks them, generates embeddings using Cloudflare Workers AI, and stores vectors in local ChromaDB.

- Embedding model: `@cf/baai/bge-base-en-v1.5` (768-dim) via Cloudflare Workers AI
- Vector DB: ChromaDB (local)
- Input: PDFs under `../data/pdf_files`


In [1]:
# Imports and setup
import os
import time
import uuid
import requests
import numpy as np
from pathlib import Path
from typing import List, Any
from dotenv import load_dotenv
import chromadb

from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Optional progress bar
try:
    from tqdm import tqdm
except Exception:
    def tqdm(x, **kwargs):
        return x

load_dotenv()

# Environment configuration for Cloudflare embeddings
CLOUDFLARE_ACCOUNT_ID = os.getenv("CLOUDFLARE_ACCOUNT_ID", "<YOUR_ACCOUNT_ID>")
CLOUDFLARE_API_TOKEN = os.getenv("CLOUDFLARE_API_TOKEN", "<YOUR_API_TOKEN>")

# Embedding model per Cloudflare docs
CF_EMBEDDINGS_MODEL = "@cf/baai/bge-base-en-v1.5"  # 768-dim

if any(v.startswith("<YOUR_") for v in [CLOUDFLARE_ACCOUNT_ID, CLOUDFLARE_API_TOKEN]):
    print("WARNING: Set CLOUDFLARE_ACCOUNT_ID and CLOUDFLARE_API_TOKEN in your environment or edit this cell.")

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# PDF discovery and loading

def process_all_pdfs(pdf_directory: str) -> List[Any]:
    """Load all PDFs found under a directory (recursive), returning LangChain Documents."""
    all_documents = []
    pdf_dir = Path(pdf_directory)
    pdf_files = list(pdf_dir.glob("**/*.pdf"))
    print(f"Found {len(pdf_files)} PDF files to process")

    for pdf_file in pdf_files:
        print(f"\nProcessing: {pdf_file.name}")
        try:
            loader = PyPDFLoader(str(pdf_file))
            documents = loader.load()
            for doc in documents:
                doc.metadata['source_file'] = pdf_file.name
                doc.metadata['file_type'] = 'pdf'
            all_documents.extend(documents)
            print(f"  ✓ Loaded {len(documents)} pages")
        except Exception as e:
            print(f"  ✗ Error: {e}")

    print(f"\nTotal documents loaded: {len(all_documents)}")
    return all_documents

all_pdf_documents = process_all_pdfs("../data/pdf_files")
len(all_pdf_documents)

Found 6 PDF files to process

Processing: Bill's_Windsurf_Shop_Invoice.pdf
  ✓ Loaded 1 pages

Processing: Amy's_Bird_Sanctuary_Invoice.pdf
  ✓ Loaded 1 pages

Processing: et-cod.pdf
  ✓ Loaded 334 pages

Processing: Cool_Cars_Invoice.pdf
  ✓ Loaded 1 pages

Processing: Dukes_Basketball_Camp_Invoice.pdf
  ✓ Loaded 1 pages

Processing: Diego_Rodriguez_Invoice.pdf
  ✓ Loaded 1 pages

Total documents loaded: 339


339

In [3]:
# Chunking

def split_documents(documents: List[Any], chunk_size=1000, chunk_overlap=200):
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
        separators=["\n\n", "\n", " ", ""],
    )
    split_docs = text_splitter.split_documents(documents)
    print(f"Split {len(documents)} documents into {len(split_docs)} chunks")
    if split_docs:
        print("Example chunk preview:")
        print(split_docs[0].page_content[:200], "...")
        print(split_docs[0].metadata)
    return split_docs

chunks = split_documents(all_pdf_documents)
len(chunks)

Split 339 documents into 1087 chunks
Example chunk preview:
Invoice for Bill's Windsurf Shop
Email: Surf@Intuit.com
Invoice Details:
Description Qty Unit Price Amount
Design Service 1 $500.00 $500.00
Consulting 2 $200.00 $400.00
Installation 1 $300.00 $300.00
 ...
{'producer': 'PyPDF2', 'creator': 'PyPDF', 'creationdate': '', 'source': "../data/pdf_files/Bill's_Windsurf_Shop_Invoice.pdf", 'total_pages': 1, 'page': 0, 'page_label': '1', 'source_file': "Bill's_Windsurf_Shop_Invoice.pdf", 'file_type': 'pdf'}


1087

In [4]:
# Cloudflare Workers AI Embeddings (REST client)

class CFWorkersAIEmbeddings:
    """
    Minimal client for Workers AI embeddings endpoint.
    POST https://api.cloudflare.com/client/v4/accounts/{account_id}/ai/run/{model}
    Body: {"text": "<string>"}
    Returns: {"result": {"data": [[...]]}} or {"data": [...]}
    """

    def __init__(self, account_id: str, api_token: str, model: str):
        self.base = f"https://api.cloudflare.com/client/v4/accounts/{account_id}/ai/run/{model}"
        self.headers = {
            "Authorization": f"Bearer {api_token}",
            "Content-Type": "application/json",
        }

    def embed_one(self, text: str, retries: int = 3, backoff: float = 1.5) -> np.ndarray:
        payload = {"text": text}
        last_err = None
        for attempt in range(1, retries + 1):
            try:
                r = requests.post(self.base, headers=self.headers, json=payload, timeout=60)
                if r.status_code == 200:
                    data = r.json()
                    vec = None
                    if isinstance(data, dict) and "result" in data:
                        result = data["result"]
                        if "data" in result and result["data"]:
                            first = result["data"][0]
                            vec = first if isinstance(first, list) else result["data"]
                    elif "data" in data:
                        first = data["data"][0] if isinstance(data["data"], list) and data["data"] and isinstance(data["data"][0], list) else data["data"]
                        vec = first
                    if vec is None:
                        raise ValueError(f"Unexpected response structure: {data}")
                    return np.array(vec, dtype=np.float32)
                else:
                    last_err = RuntimeError(f"HTTP {r.status_code}: {r.text[:300]}")
            except Exception as e:
                last_err = e
            time.sleep(backoff ** (attempt - 1))
        raise last_err

    def embed_batch(self, texts: List[str]) -> np.ndarray:
        vectors = []
        for t in tqdm(texts, desc="Embedding with Cloudflare Workers AI"):
            vectors.append(self.embed_one(t))
        return np.vstack(vectors)

cf_embedder = CFWorkersAIEmbeddings(
    account_id=CLOUDFLARE_ACCOUNT_ID,
    api_token=CLOUDFLARE_API_TOKEN,
    model=CF_EMBEDDINGS_MODEL,
)

In [5]:
# Local ChromaDB Vector Store (adapted from pdf_loader.ipynb)

class VectorStore:
    """Manages document embeddings in a ChromaDB vector store"""
    
    def __init__(self, collection_name: str = "cloudflare_embeddings", persist_directory: str = "../data/vector_store_cf"):
        """
        Initialize the vector store
        
        Args:
            collection_name: Name of the ChromaDB collection
            persist_directory: Directory to persist the vector store
        """
        self.collection_name = collection_name
        self.persist_directory = persist_directory
        
        try:
            # Initialize ChromaDB client
            self.client = chromadb.PersistentClient(path=persist_directory)
            
            # Create or get collection
            self.collection = self.client.get_or_create_collection(
                name=self.collection_name,
                metadata={"description": "PDF document embeddings using Cloudflare Workers AI"}
            )
            print(f"Vector store initialized. Collection: {self.collection_name}")
            print(f"Existing documents in collection: {self.collection.count()}")
            
        except Exception as e:
            print(f"Error initializing vector store: {e}")
            raise

    def add_documents(self, documents: List[Any], embeddings: np.ndarray):
        """
        Add documents and their embeddings to the vector store
        
        Args:
            documents: List of LangChain documents
            embeddings: Corresponding embeddings for the documents
        """
        if len(documents) != len(embeddings):
            raise ValueError("Number of documents must match number of embeddings")
        
        print(f"Adding {len(documents)} documents to vector store...")
        
        # Prepare data for ChromaDB
        ids = []
        metadatas = []
        documents_text = []
        embeddings_list = []
        
        for i, (doc, embedding) in enumerate(zip(documents, embeddings)):
            # Generate unique ID
            doc_id = f"doc_{uuid.uuid4().hex[:8]}_{i}"
            ids.append(doc_id)
            
            # Metadata
            metadata = dict(doc.metadata)
            metadata["doc_index"] = i
            metadata["content_length"] = len(doc.page_content)
            metadatas.append(metadata)
            
            # Document text
            documents_text.append(doc.page_content)
            
            # Embedding
            embeddings_list.append(embedding.tolist())
        
        # Add to collection
        try:
            self.collection.add(
                ids=ids,
                embeddings=embeddings_list,
                metadatas=metadatas,
                documents=documents_text
            )
            print(f"Successfully added {len(documents)} documents to vector store")
            print(f"Total documents in collection: {self.collection.count()}")
            
        except Exception as e:
            print(f"Error adding documents to vector store: {e}")
            raise

    def query_similar(self, query_text: str, top_k: int = 5):
        """
        Query for similar documents using Cloudflare embeddings
        
        Args:
            query_text: Text to search for
            top_k: Number of results to return
        """
        # Generate embedding for query using Cloudflare
        query_embedding = cf_embedder.embed_one(query_text)
        
        # Query the collection
        results = self.collection.query(
            query_embeddings=[query_embedding.tolist()],
            n_results=top_k
        )
        
        return results

vectorStore = VectorStore()
vectorStore

Vector store initialized. Collection: cloudflare_embeddings
Existing documents in collection: 5523


<__main__.VectorStore at 0x306ce5fd0>

In [6]:
# ChromaDB Compaction Error Fix - Alternative approach with batching

def add_documents_safely(vector_store, documents, embeddings, batch_size=20):
    """
    Safely add documents to ChromaDB with batching to avoid compaction errors
    """
    import time
    
    print(f"Adding {len(documents)} documents in batches of {batch_size} to avoid compaction errors...")
    
    total_added = 0
    
    for i in range(0, len(documents), batch_size):
        batch_docs = documents[i:i + batch_size]
        batch_embeddings = embeddings[i:i + batch_size]
        
        # Prepare batch data
        ids = []
        metadatas = []
        documents_text = []
        embeddings_list = []
        
        for j, (doc, embedding) in enumerate(zip(batch_docs, batch_embeddings)):
            doc_id = f"safe_batch_{i//batch_size}_doc_{uuid.uuid4().hex[:8]}_{j}"
            ids.append(doc_id)
            
            metadata = dict(doc.metadata)
            metadata["doc_index"] = i + j
            metadata["batch_num"] = i // batch_size
            metadata["content_length"] = len(doc.page_content)
            metadatas.append(metadata)
            
            documents_text.append(doc.page_content)
            embeddings_list.append(embedding.tolist())
        
        # Add batch with retry logic
        max_retries = 3
        for attempt in range(max_retries):
            try:
                vector_store.collection.add(
                    ids=ids,
                    embeddings=embeddings_list,
                    metadatas=metadatas,
                    documents=documents_text
                )
                total_added += len(batch_docs)
                print(f"✓ Batch {i//batch_size + 1}: Added {len(batch_docs)} documents. Total: {total_added}/{len(documents)}")
                break
                
            except Exception as e:
                if "compaction" in str(e).lower() or "hnsw" in str(e).lower():
                    print(f"⚠️  Compaction error in batch {i//batch_size + 1}, attempt {attempt + 1}. Retrying...")
                    time.sleep(2 ** attempt)  # Exponential backoff
                    
                    if attempt == max_retries - 1:
                        print(f"❌ Failed to add batch {i//batch_size + 1} after {max_retries} attempts")
                        print(f"Error: {e}")
                        return total_added
                else:
                    print(f"❌ Unexpected error in batch {i//batch_size + 1}: {e}")
                    return total_added
        
        # Small delay between batches to reduce load
        time.sleep(0.5)
    
    print(f"\n✅ Successfully added {total_added} documents to vector store!")
    print(f"Total documents in collection: {vector_store.collection.count()}")
    return total_added

In [7]:
# Generate embeddings using Cloudflare Workers AI and store safely

texts = [c.page_content for c in chunks]
print(f"Generating embeddings for {len(texts)} chunks using Cloudflare Workers AI model: {CF_EMBEDDINGS_MODEL}")
embeddings = cf_embedder.embed_batch(texts)
print("Embeddings shape:", embeddings.shape)

# Store in the local ChromaDB vector database using safe batching
added_count = add_documents_safely(vectorStore, chunks, embeddings, batch_size=15)
print(f"\n✅ Successfully stored {added_count} documents with Cloudflare embeddings in local ChromaDB!")

Generating embeddings for 1087 chunks using Cloudflare Workers AI model: @cf/baai/bge-base-en-v1.5


Embedding with Cloudflare Workers AI: 100%|██████████| 1087/1087 [04:09<00:00,  4.35it/s]


Embeddings shape: (1087, 768)
Adding 1087 documents in batches of 15 to avoid compaction errors...
✓ Batch 1: Added 15 documents. Total: 15/1087
✓ Batch 2: Added 15 documents. Total: 30/1087
✓ Batch 3: Added 15 documents. Total: 45/1087
✓ Batch 4: Added 15 documents. Total: 60/1087
✓ Batch 5: Added 15 documents. Total: 75/1087
✓ Batch 6: Added 15 documents. Total: 90/1087
✓ Batch 7: Added 15 documents. Total: 105/1087
✓ Batch 8: Added 15 documents. Total: 120/1087
✓ Batch 9: Added 15 documents. Total: 135/1087
✓ Batch 10: Added 15 documents. Total: 150/1087
✓ Batch 11: Added 15 documents. Total: 165/1087
✓ Batch 12: Added 15 documents. Total: 180/1087
✓ Batch 13: Added 15 documents. Total: 195/1087
✓ Batch 14: Added 15 documents. Total: 210/1087
✓ Batch 15: Added 15 documents. Total: 225/1087
✓ Batch 16: Added 15 documents. Total: 240/1087
✓ Batch 17: Added 15 documents. Total: 255/1087
✓ Batch 18: Added 15 documents. Total: 270/1087
✓ Batch 19: Added 15 documents. Total: 285/1087
✓ Ba

In [8]:
# Test similarity query to validate ingestion

def query_similar(text: str, top_k: int = 3):
    results = vectorStore.query_similar(text, top_k=top_k)
    return results

# Test query
result = query_similar("What is in the invoice for Bill's Windsurf Shop?", top_k=3)
print("Query results using Cloudflare embeddings:")
for i, (doc, distance) in enumerate(zip(result['documents'][0], result['distances'][0])):
    print(f"\nResult {i+1} (distance: {distance:.4f}):")
    print(doc[:200] + "...")

Query results using Cloudflare embeddings:

Result 1 (distance: 0.2168):
Invoice for Bill's Windsurf Shop
Email: Surf@Intuit.com
Invoice Details:
Description Qty Unit Price Amount
Design Service 1 $500.00 $500.00
Consulting 2 $200.00 $400.00
Installation 1 $300.00 $300.00
...

Result 2 (distance: 0.2168):
Invoice for Bill's Windsurf Shop
Email: Surf@Intuit.com
Invoice Details:
Description Qty Unit Price Amount
Design Service 1 $500.00 $500.00
Consulting 2 $200.00 $400.00
Installation 1 $300.00 $300.00
...

Result 3 (distance: 0.2168):
Invoice for Bill's Windsurf Shop
Email: Surf@Intuit.com
Invoice Details:
Description Qty Unit Price Amount
Design Service 1 $500.00 $500.00
Consulting 2 $200.00 $400.00
Installation 1 $300.00 $300.00
...


In [9]:
# RAG Retriever - Integrated with Cloudflare Embeddings and ChromaDB

class RAGRetriever:
    """
    RAG Retriever that integrates Cloudflare Workers AI embeddings with ChromaDB vector store
    for semantic document retrieval and context generation.
    """
    
    def __init__(self, vector_store: VectorStore, embedder: CFWorkersAIEmbeddings, 
                 similarity_threshold: float = 0.7, max_context_length: int = 4000):
        """
        Initialize RAG Retriever
        
        Args:
            vector_store: ChromaDB VectorStore instance
            embedder: Cloudflare Workers AI embeddings client
            similarity_threshold: Minimum similarity score for relevant results
            max_context_length: Maximum characters in combined context
        """
        self.vector_store = vector_store
        self.embedder = embedder
        self.similarity_threshold = similarity_threshold
        self.max_context_length = max_context_length
    
    def retrieve(self, query: str, top_k: int = 5, include_metadata: bool = True) -> dict:
        """
        Retrieve relevant documents for a given query
        
        Args:
            query: Search query text
            top_k: Number of top results to retrieve
            include_metadata: Whether to include document metadata
            
        Returns:
            Dictionary with retrieved documents, scores, and metadata
        """
        try:
            # Get similar documents from vector store
            results = self.vector_store.query_similar(query, top_k=top_k)
            
            # Process and filter results
            processed_results = self._process_results(results, include_metadata)
            
            return processed_results
            
        except Exception as e:
            print(f"Error during retrieval: {e}")
            return {"documents": [], "scores": [], "metadata": [], "context": ""}
    
    def _process_results(self, raw_results: dict, include_metadata: bool = True) -> dict:
        """
        Process raw ChromaDB results into structured format
        
        Args:
            raw_results: Raw results from ChromaDB query
            include_metadata: Whether to include metadata
            
        Returns:
            Processed results dictionary
        """
        documents = raw_results.get('documents', [[]])[0]
        distances = raw_results.get('distances', [[]])[0]
        metadatas = raw_results.get('metadatas', [[]])[0] if include_metadata else []
        ids = raw_results.get('ids', [[]])[0]
        
        # Convert distances to similarity scores (1 - distance for cosine similarity)
        similarity_scores = [1 - dist for dist in distances]
        
        # Filter by similarity threshold
        filtered_results = []
        for i, score in enumerate(similarity_scores):
            if score >= self.similarity_threshold:
                result_item = {
                    'document': documents[i],
                    'score': score,
                    'distance': distances[i],
                    'id': ids[i]
                }
                if include_metadata and i < len(metadatas):
                    result_item['metadata'] = metadatas[i]
                filtered_results.append(result_item)
        
        # Sort by similarity score (highest first)
        filtered_results.sort(key=lambda x: x['score'], reverse=True)
        
        # Generate combined context
        context = self._generate_context([item['document'] for item in filtered_results])
        
        return {
            'results': filtered_results,
            'documents': [item['document'] for item in filtered_results],
            'scores': [item['score'] for item in filtered_results],
            'metadata': [item.get('metadata', {}) for item in filtered_results],
            'context': context,
            'num_results': len(filtered_results)
        }
    
    def _generate_context(self, documents: list) -> str:
        """
        Generate combined context from retrieved documents
        
        Args:
            documents: List of document texts
            
        Returns:
            Combined context string
        """
        if not documents:
            return ""
        
        context_parts = []
        current_length = 0
        
        for i, doc in enumerate(documents):
            # Add document with separator
            doc_text = f"[Document {i+1}]\n{doc}\n"
            
            # Check if adding this document exceeds max length
            if current_length + len(doc_text) > self.max_context_length:
                # Truncate the document to fit
                remaining_space = self.max_context_length - current_length - 20  # Leave space for truncation indicator
                if remaining_space > 100:  # Only add if there's meaningful space
                    truncated_doc = f"[Document {i+1}]\n{doc[:remaining_space]}...\n"
                    context_parts.append(truncated_doc)
                break
            
            context_parts.append(doc_text)
            current_length += len(doc_text)
        
        return "\n".join(context_parts)
    
    def retrieve_with_reranking(self, query: str, top_k: int = 10, final_k: int = 5) -> dict:
        """
        Retrieve documents with simple reranking based on query term overlap
        
        Args:
            query: Search query text
            top_k: Initial number of results to retrieve
            final_k: Final number of results after reranking
            
        Returns:
            Reranked results dictionary
        """
        # Get initial results
        initial_results = self.retrieve(query, top_k=top_k)
        
        if not initial_results['results']:
            return initial_results
        
        # Simple reranking based on query term overlap
        query_terms = set(query.lower().split())
        
        for result in initial_results['results']:
            doc_terms = set(result['document'].lower().split())
            term_overlap = len(query_terms.intersection(doc_terms)) / len(query_terms)
            
            # Combine semantic similarity with term overlap
            result['rerank_score'] = 0.7 * result['score'] + 0.3 * term_overlap
        
        # Sort by rerank score and take top final_k
        reranked = sorted(initial_results['results'], key=lambda x: x['rerank_score'], reverse=True)[:final_k]
        
        # Update the results
        context = self._generate_context([item['document'] for item in reranked])
        
        return {
            'results': reranked,
            'documents': [item['document'] for item in reranked],
            'scores': [item['rerank_score'] for item in reranked],
            'metadata': [item.get('metadata', {}) for item in reranked],
            'context': context,
            'num_results': len(reranked)
        }
    
    def get_source_info(self, results: dict) -> list:
        """
        Extract source information from retrieval results
        
        Args:
            results: Results dictionary from retrieve method
            
        Returns:
            List of source information dictionaries
        """
        sources = []
        for metadata in results.get('metadata', []):
            if metadata:
                source_info = {
                    'file': metadata.get('source_file', 'Unknown'),
                    'page': metadata.get('page', 'Unknown'),
                    'content_length': metadata.get('content_length', 0)
                }
                sources.append(source_info)
        return sources

# Initialize RAG Retriever with existing components
rag_retriever = RAGRetriever(
    vector_store=vectorStore,
    embedder=cf_embedder,
    similarity_threshold=0.6,  # Adjust based on your needs
    max_context_length=3000
)

print("✅ RAG Retriever initialized with Cloudflare embeddings and ChromaDB!")

✅ RAG Retriever initialized with Cloudflare embeddings and ChromaDB!


In [10]:
# Test the RAG Retriever

def test_rag_retriever():
    """Test the RAG Retriever with sample queries"""
    
    test_queries = [
        "What is in the invoice for Bill's Windsurf Shop?",
        "Tell me about Amy's Bird Sanctuary invoice details",
        "What are the building automation system requirements?",
        "Schneider Electric EcoStruxure features"
    ]
    
    for i, query in enumerate(test_queries, 1):
        print(f"\n{'='*60}")
        print(f"Test Query {i}: {query}")
        print(f"{'='*60}")
        
        # Basic retrieval
        results = rag_retriever.retrieve(query, top_k=3)
        
        print(f"\nFound {results['num_results']} relevant documents:")
        
        for j, (doc, score) in enumerate(zip(results['documents'], results['scores'])):
            print(f"\n--- Result {j+1} (Similarity: {score:.3f}) ---")
            print(doc[:200] + "..." if len(doc) > 200 else doc)
            
            # Show source info if available
            if j < len(results['metadata']) and results['metadata'][j]:
                metadata = results['metadata'][j]
                source_file = metadata.get('source_file', 'Unknown')
                page = metadata.get('page', 'Unknown')
                print(f"Source: {source_file}, Page: {page}")
        
        # Show combined context (truncated)
        if results['context']:
            print(f"\n--- Combined Context (first 300 chars) ---")
            print(results['context'][:300] + "..." if len(results['context']) > 300 else results['context'])

# Run the test
test_rag_retriever()



Test Query 1: What is in the invoice for Bill's Windsurf Shop?

Found 3 relevant documents:

--- Result 1 (Similarity: 0.783) ---
Invoice for Bill's Windsurf Shop
Email: Surf@Intuit.com
Invoice Details:
Description Qty Unit Price Amount
Design Service 1 $500.00 $500.00
Consulting 2 $200.00 $400.00
Installation 1 $300.00 $300.00
...
Source: Bill's_Windsurf_Shop_Invoice.pdf, Page: 0

--- Result 2 (Similarity: 0.783) ---
Invoice for Bill's Windsurf Shop
Email: Surf@Intuit.com
Invoice Details:
Description Qty Unit Price Amount
Design Service 1 $500.00 $500.00
Consulting 2 $200.00 $400.00
Installation 1 $300.00 $300.00
...
Source: Bill's_Windsurf_Shop_Invoice.pdf, Page: 0

--- Result 3 (Similarity: 0.783) ---
Invoice for Bill's Windsurf Shop
Email: Surf@Intuit.com
Invoice Details:
Description Qty Unit Price Amount
Design Service 1 $500.00 $500.00
Consulting 2 $200.00 $400.00
Installation 1 $300.00 $300.00
...
Source: Bill's_Windsurf_Shop_Invoice.pdf, Page: 0

--- Combined Context (first 30

In [11]:
# Advanced RAG Retrieval with Reranking

def test_reranking():
    """Test the reranking functionality"""
    
    query = "What are the costs and pricing details in the invoices?"
    
    print(f"Query: {query}")
    print("\n" + "="*50)
    
    # Standard retrieval
    standard_results = rag_retriever.retrieve(query, top_k=5)
    print("\nSTANDARD RETRIEVAL:")
    for i, (doc, score) in enumerate(zip(standard_results['documents'][:3], standard_results['scores'][:3])):
        print(f"\n{i+1}. Score: {score:.3f}")
        print(doc[:150] + "...")
    
    # Reranked retrieval
    reranked_results = rag_retriever.retrieve_with_reranking(query, top_k=8, final_k=3)
    print("\n\nRERANKED RETRIEVAL:")
    for i, result in enumerate(reranked_results['results']):
        print(f"\n{i+1}. Rerank Score: {result['rerank_score']:.3f} (Semantic: {result['score']:.3f})")
        print(result['document'][:150] + "...")
    
    # Source information
    sources = rag_retriever.get_source_info(reranked_results)
    print("\n\nSOURCE INFORMATION:")
    for i, source in enumerate(sources):
        print(f"{i+1}. File: {source['file']}, Page: {source['page']}")

# Test reranking
test_reranking()

Query: What are the costs and pricing details in the invoices?


STANDARD RETRIEVAL:


RERANKED RETRIEVAL:


SOURCE INFORMATION:
