## RAG Pipelines = Data Ingestion to Vector DB Pipeline

In [1]:
import os
from langchain_community.document_loaders import PyPDFLoader, PyMuPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from pathlib import Path


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
## read all the pdf inside the directory
### Read all the pdf's inside the directory
def process_all_pdfs(pdf_directory):
    """Process all PDF files in a directory"""
    all_documents = []
    pdf_dir = Path(pdf_directory)
    
    # Find all PDF files recursively
    pdf_files = list(pdf_dir.glob("**/*.pdf"))
    
    print(f"Found {len(pdf_files)} PDF files to process")
    
    for pdf_file in pdf_files:
        print(f"\nProcessing: {pdf_file.name}")
        try:
            loader = PyPDFLoader(str(pdf_file))
            documents = loader.load()
            
            # Add source information to metadata
            for doc in documents:
                doc.metadata['source_file'] = pdf_file.name
                doc.metadata['file_type'] = 'pdf'
            
            all_documents.extend(documents)
            print(f"  ✓ Loaded {len(documents)} pages")
            
        except Exception as e:
            print(f"  ✗ Error: {e}")
    
    print(f"\nTotal documents loaded: {len(all_documents)}")
    return all_documents

# Process all PDFs in the data directory
all_pdf_documents = process_all_pdfs("../data")

Found 2 PDF files to process

Processing: Practical Python and OpenCV.pdf


Ignoring wrong pointing object 2328 0 (offset 0)
Ignoring wrong pointing object 359 0 (offset 0)
Ignoring wrong pointing object 360 0 (offset 0)


  ✓ Loaded 166 pages

Processing: SQL Queries .pdf
  ✓ Loaded 54 pages

Total documents loaded: 220


In [3]:
all_pdf_documents

[Document(metadata={'producer': 'pdfTeX-1.40.16', 'creator': 'LaTeX with hyperref package', 'creationdate': '2019-01-06T06:31:50-05:00', 'author': '', 'title': '', 'subject': '', 'keywords': '', 'moddate': '2025-10-09T20:13:54+06:30', 'trapped': '/False', 'ptex.fullbanner': 'This is pdfTeX, Version 3.14159265-2.6-1.40.16 (TeX Live 2015) kpathsea version 6.2.1', 'source': '..\\data\\pdf\\Practical Python and OpenCV.pdf', 'total_pages': 166, 'page': 0, 'page_label': '1', 'source_file': 'Practical Python and OpenCV.pdf', 'file_type': 'pdf'}, page_content=''),
 Document(metadata={'producer': 'pdfTeX-1.40.16', 'creator': 'LaTeX with hyperref package', 'creationdate': '2019-01-06T06:31:50-05:00', 'author': '', 'title': '', 'subject': '', 'keywords': '', 'moddate': '2025-10-09T20:13:54+06:30', 'trapped': '/False', 'ptex.fullbanner': 'This is pdfTeX, Version 3.14159265-2.6-1.40.16 (TeX Live 2015) kpathsea version 6.2.1', 'source': '..\\data\\pdf\\Practical Python and OpenCV.pdf', 'total_pages'

In [4]:
### Text splitting get into chunks

def split_documents(documents,chunk_size=1000,chunk_overlap=200):
    """Split documents into smaller chunks for better RAG performance"""
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
        separators=["\n\n", "\n", " ", ""]
    )
    split_docs = text_splitter.split_documents(documents)
    print(f"Split {len(documents)} documents into {len(split_docs)} chunks")
    
    # Show example of a chunk
    if split_docs:
        print(f"\nExample chunk:")
        print(f"Content: {split_docs[0].page_content[:200]}...")
        print(f"Metadata: {split_docs[0].metadata}")
    
    return split_docs


In [5]:
chunks=split_documents(all_pdf_documents)
chunks

Split 220 documents into 335 chunks

Example chunk:
Content: Practical Python and
OpenCV: An Introductory,
Example Driven Guide to
Image Processing and
Computer Vision
4th Edition
Dr. Adrian Rosebrock...
Metadata: {'producer': 'pdfTeX-1.40.16', 'creator': 'LaTeX with hyperref package', 'creationdate': '2019-01-06T06:31:50-05:00', 'author': '', 'title': '', 'subject': '', 'keywords': '', 'moddate': '2025-10-09T20:13:54+06:30', 'trapped': '/False', 'ptex.fullbanner': 'This is pdfTeX, Version 3.14159265-2.6-1.40.16 (TeX Live 2015) kpathsea version 6.2.1', 'source': '..\\data\\pdf\\Practical Python and OpenCV.pdf', 'total_pages': 166, 'page': 1, 'page_label': 'i', 'source_file': 'Practical Python and OpenCV.pdf', 'file_type': 'pdf'}


[Document(metadata={'producer': 'pdfTeX-1.40.16', 'creator': 'LaTeX with hyperref package', 'creationdate': '2019-01-06T06:31:50-05:00', 'author': '', 'title': '', 'subject': '', 'keywords': '', 'moddate': '2025-10-09T20:13:54+06:30', 'trapped': '/False', 'ptex.fullbanner': 'This is pdfTeX, Version 3.14159265-2.6-1.40.16 (TeX Live 2015) kpathsea version 6.2.1', 'source': '..\\data\\pdf\\Practical Python and OpenCV.pdf', 'total_pages': 166, 'page': 1, 'page_label': 'i', 'source_file': 'Practical Python and OpenCV.pdf', 'file_type': 'pdf'}, page_content='Practical Python and\nOpenCV: An Introductory,\nExample Driven Guide to\nImage Processing and\nComputer Vision\n4th Edition\nDr. Adrian Rosebrock'),
 Document(metadata={'producer': 'pdfTeX-1.40.16', 'creator': 'LaTeX with hyperref package', 'creationdate': '2019-01-06T06:31:50-05:00', 'author': '', 'title': '', 'subject': '', 'keywords': '', 'moddate': '2025-10-09T20:13:54+06:30', 'trapped': '/False', 'ptex.fullbanner': 'This is pdfTeX, 

## embedding and vectorStoreDB

In [9]:
### embedding and vectorStoreDB
import numpy as np
from sentence_transformers import SentenceTransformer
import chromadb
from chromadb.config import Settings
import uuid
from typing import List, Dict,Any,Tuple
from sklearn.metrics.pairwise import cosine_similarity

In [13]:
class EmbeddingManager:
    """Handles document embedding generation using SentenceTransformer"""
    
    def __init__(self, model_name: str = "all-MiniLM-L6-v2"):
        """
        Initialize the embedding manager
        
        Args:
            model_name: HuggingFace model name for sentence embeddings
        """
        self.model_name = model_name
        self.model = None
        self._load_model()

    def _load_model(self):
        """Load the SentenceTransformer model"""
        try:
            print(f"Loading embedding model: {self.model_name}")
            self.model = SentenceTransformer(self.model_name)
            print(f"Model loaded successfully. Embedding dimension: {self.model.get_sentence_embedding_dimension()}")
        except Exception as e:
            print(f"Error loading model {self.model_name}: {e}")
            raise

    def generate_embeddings(self, texts: List[str]) -> np.ndarray:
        """
        Generate embeddings for a list of texts
        
        Args:
            texts: List of text strings to embed
            
        Returns:
            numpy array of embeddings with shape (len(texts), embedding_dim)
        """
        if not self.model:
            raise ValueError("Model not loaded")
        
        print(f"Generating embeddings for {len(texts)} texts...")
        embeddings = self.model.encode(texts, show_progress_bar=True)
        print(f"Generated embeddings with shape: {embeddings.shape}")
        return embeddings


## initialize the embedding manager

embedding_manager=EmbeddingManager()
embedding_manager

Loading embedding model: all-MiniLM-L6-v2
Model loaded successfully. Embedding dimension: 384


<__main__.EmbeddingManager at 0x18bc191dbe0>

## VectorStore

In [14]:
class VectorStore:
    """Manages document embeddings in a ChromaDB vector store"""
    
    def __init__(self, collection_name: str = "pdf_documents", persist_directory: str = "../data/vector_store"):
        """
        Initialize the vector store
        
        Args:
            collection_name: Name of the ChromaDB collection
            persist_directory: Directory to persist the vector store
        """
        self.collection_name = collection_name
        self.persist_directory = persist_directory
        self.client = None
        self.collection = None
        self._initialize_store()

    def _initialize_store(self):
        """Initialize ChromaDB client and collection"""
        try:
            # Create persistent ChromaDB client
            os.makedirs(self.persist_directory, exist_ok=True)
            self.client = chromadb.PersistentClient(path=self.persist_directory)
            
            # Get or create collection
            self.collection = self.client.get_or_create_collection(
                name=self.collection_name,
                metadata={"description": "PDF document embeddings for RAG"}
            )
            print(f"Vector store initialized. Collection: {self.collection_name}")
            print(f"Existing documents in collection: {self.collection.count()}")
            
        except Exception as e:
            print(f"Error initializing vector store: {e}")
            raise

    def add_documents(self, documents: List[Any], embeddings: np.ndarray):
        """
        Add documents and their embeddings to the vector store
        
        Args:
            documents: List of LangChain documents
            embeddings: Corresponding embeddings for the documents
        """
        if len(documents) != len(embeddings):
            raise ValueError("Number of documents must match number of embeddings")
        
        print(f"Adding {len(documents)} documents to vector store...")
        
        # Prepare data for ChromaDB
        ids = []
        metadatas = []
        documents_text = []
        embeddings_list = []
        
        for i, (doc, embedding) in enumerate(zip(documents, embeddings)):
            # Generate unique ID
            doc_id = f"doc_{uuid.uuid4().hex[:8]}_{i}"
            ids.append(doc_id)
            
            # Prepare metadata
            metadata = dict(doc.metadata)
            metadata['doc_index'] = i
            metadata['content_length'] = len(doc.page_content)
            metadatas.append(metadata)
            
            # Document content
            documents_text.append(doc.page_content)
            
            # Embedding
            embeddings_list.append(embedding.tolist())
        
        # Add to collection
        try:
            self.collection.add(
                ids=ids,
                embeddings=embeddings_list,
                metadatas=metadatas,
                documents=documents_text
            )
            print(f"Successfully added {len(documents)} documents to vector store")
            print(f"Total documents in collection: {self.collection.count()}")
            
        except Exception as e:
            print(f"Error adding documents to vector store: {e}")
            raise

vectorstore=VectorStore()
vectorstore

Vector store initialized. Collection: pdf_documents
Existing documents in collection: 0


<__main__.VectorStore at 0x18bc191dd30>

In [15]:
chunks

[Document(metadata={'producer': 'pdfTeX-1.40.16', 'creator': 'LaTeX with hyperref package', 'creationdate': '2019-01-06T06:31:50-05:00', 'author': '', 'title': '', 'subject': '', 'keywords': '', 'moddate': '2025-10-09T20:13:54+06:30', 'trapped': '/False', 'ptex.fullbanner': 'This is pdfTeX, Version 3.14159265-2.6-1.40.16 (TeX Live 2015) kpathsea version 6.2.1', 'source': '..\\data\\pdf\\Practical Python and OpenCV.pdf', 'total_pages': 166, 'page': 1, 'page_label': 'i', 'source_file': 'Practical Python and OpenCV.pdf', 'file_type': 'pdf'}, page_content='Practical Python and\nOpenCV: An Introductory,\nExample Driven Guide to\nImage Processing and\nComputer Vision\n4th Edition\nDr. Adrian Rosebrock'),
 Document(metadata={'producer': 'pdfTeX-1.40.16', 'creator': 'LaTeX with hyperref package', 'creationdate': '2019-01-06T06:31:50-05:00', 'author': '', 'title': '', 'subject': '', 'keywords': '', 'moddate': '2025-10-09T20:13:54+06:30', 'trapped': '/False', 'ptex.fullbanner': 'This is pdfTeX, 

In [16]:
## convert the text to embeddings
texts=[doc.page_content for doc in chunks]
texts

['Practical Python and\nOpenCV: An Introductory,\nExample Driven Guide to\nImage Processing and\nComputer Vision\n4th Edition\nDr. Adrian Rosebrock',
 'C O P Y R I G H T\nThe contents of this book, unless otherwise indicated, are\nCopyright c⃝2018 Adrian Rosebrock, PyImageSearch.com.\nAll rights reserved.\nThis version of the book was published on 14 December\n2018.\nBooks like this are made possible by the time invested by\nthe authors. If you received this book and did not purchase\nit, please consider making future books possible by buy-\ning a copy at https://www.pyimagesearch.com/practical-\npython-opencv/ today.\nii',
 'C O N T E N T S\n1 introduction 1\n2 python and required packages 5\n2.1 A note on Python & OpenCV Versions . . . . 6\n2.2 NumPy and SciPy . . . . . . . . . . . . . . . . 7\n2.2.1 Windows . . . . . . . . . . . . . . . . . 7\n2.2.2 OSX . . . . . . . . . . . . . . . . . . . 7\n2.2.3 Linux . . . . . . . . . . . . . . . . . . . 8\n2.3 Matplotlib . . . . . . . . . . . 

In [18]:
### Convert the text to embeddings
texts=[doc.page_content for doc in chunks]

## Generate the Embeddings

embeddings=embedding_manager.generate_embeddings(texts)

##store int he vector dtaabase
vectorstore.add_documents(chunks,embeddings)

Generating embeddings for 335 texts...


Batches: 100%|██████████| 11/11 [00:26<00:00,  2.44s/it]


Generated embeddings with shape: (335, 384)
Adding 335 documents to vector store...
Successfully added 335 documents to vector store
Total documents in collection: 335


## Retriever Pipeline From VectorStore

In [19]:
class RAGRetriever:
    """Handles query-based retrieval from the vector store"""
    
    def __init__(self, vector_store: VectorStore, embedding_manager: EmbeddingManager):
        """
        Initialize the retriever
        
        Args:
            vector_store: Vector store containing document embeddings
            embedding_manager: Manager for generating query embeddings
        """
        self.vector_store = vector_store
        self.embedding_manager = embedding_manager

    def retrieve(self, query: str, top_k: int = 5, score_threshold: float = 0.0) -> List[Dict[str, Any]]:
        """
        Retrieve relevant documents for a query
        
        Args:
            query: The search query
            top_k: Number of top results to return
            score_threshold: Minimum similarity score threshold
            
        Returns:
            List of dictionaries containing retrieved documents and metadata
        """
        print(f"Retrieving documents for query: '{query}'")
        print(f"Top K: {top_k}, Score threshold: {score_threshold}")
        
        # Generate query embedding
        query_embedding = self.embedding_manager.generate_embeddings([query])[0]
        
        # Search in vector store
        try:
            results = self.vector_store.collection.query(
                query_embeddings=[query_embedding.tolist()],
                n_results=top_k
            )
            
            # Process results
            retrieved_docs = []
            
            if results['documents'] and results['documents'][0]:
                documents = results['documents'][0]
                metadatas = results['metadatas'][0]
                distances = results['distances'][0]
                ids = results['ids'][0]
                
                for i, (doc_id, document, metadata, distance) in enumerate(zip(ids, documents, metadatas, distances)):
                    # Convert distance to similarity score (ChromaDB uses cosine distance)
                    similarity_score = 1 - distance
                    
                    if similarity_score >= score_threshold:
                        retrieved_docs.append({
                            'id': doc_id,
                            'content': document,
                            'metadata': metadata,
                            'similarity_score': similarity_score,
                            'distance': distance,
                            'rank': i + 1
                        })
                
                print(f"Retrieved {len(retrieved_docs)} documents (after filtering)")
            else:
                print("No documents found")
            
            return retrieved_docs
            
        except Exception as e:
            print(f"Error during retrieval: {e}")
            return []

rag_retriever=RAGRetriever(vectorstore,embedding_manager)

In [20]:
rag_retriever

<__main__.RAGRetriever at 0x18ba9d20830>

In [23]:
rag_retriever.retrieve("What is practical python and openCV")

Retrieving documents for query: 'What is practical python and openCV'
Top K: 5, Score threshold: 0.0
Generating embeddings for 1 texts...


Batches: 100%|██████████| 1/1 [00:00<00:00, 34.34it/s]

Generated embeddings with shape: (1, 384)
Retrieved 5 documents (after filtering)





[{'id': 'doc_6e86b48c_1',
  'content': 'C O P Y R I G H T\nThe contents of this book, unless otherwise indicated, are\nCopyright c⃝2018 Adrian Rosebrock, PyImageSearch.com.\nAll rights reserved.\nThis version of the book was published on 14 December\n2018.\nBooks like this are made possible by the time invested by\nthe authors. If you received this book and did not purchase\nit, please consider making future books possible by buy-\ning a copy at https://www.pyimagesearch.com/practical-\npython-opencv/ today.\nii',
  'metadata': {'creationdate': '2019-01-06T06:31:50-05:00',
   'page_label': 'ii',
   'author': '',
   'moddate': '2025-10-09T20:13:54+06:30',
   'ptex.fullbanner': 'This is pdfTeX, Version 3.14159265-2.6-1.40.16 (TeX Live 2015) kpathsea version 6.2.1',
   'creator': 'LaTeX with hyperref package',
   'page': 2,
   'total_pages': 166,
   'producer': 'pdfTeX-1.40.16',
   'file_type': 'pdf',
   'trapped': '/False',
   'subject': '',
   'source_file': 'Practical Python and OpenCV

In [24]:
rag_retriever.retrieve("using opencv to compute histograms")

Retrieving documents for query: 'using opencv to compute histograms'
Top K: 5, Score threshold: 0.0
Generating embeddings for 1 texts...


Batches: 100%|██████████| 1/1 [00:00<00:00, 24.22it/s]

Generated embeddings with shape: (1, 384)
Retrieved 5 documents (after filtering)





[{'id': 'doc_624b227f_171',
  'content': 'The most important takeaway from this code can be seen\nby inspecting the ﬁrst arguments to the cv2.calcHist func-\ntion. Here we see that we are passing in a list of two chan-\nnels: the Green and Blue channels. And that’s all there is\nto it.\nSo, how is a 2D histogram stored in OpenCV? It’s actually\na 2D NumPy array. Since I used 32 bins for each channel, I\nnow have a 32 × 32 histogram.\nHow do we visualize a 2D histogram? Let’s take a look\nat Figure 7.3 where we see three graphs. The ﬁrst is a 2D\ncolor histogram for the Green and Blue channels, the sec-\nond for Green and Red, and the third for Blue and Red.\nShades of blue represent low pixel counts, whereas shades\n97',
  'metadata': {'creationdate': '2019-01-06T06:31:50-05:00',
   'subject': '',
   'total_pages': 166,
   'content_length': 673,
   'title': '',
   'producer': 'pdfTeX-1.40.16',
   'trapped': '/False',
   'keywords': '',
   'source_file': 'Practical Python and OpenCV.pdf

In [26]:
rag_retriever.retrieve("contours and opencv version caveats")

Retrieving documents for query: 'contours and opencv version caveats'
Top K: 5, Score threshold: 0.0
Generating embeddings for 1 texts...


Batches: 100%|██████████| 1/1 [00:00<00:00, 23.70it/s]

Generated embeddings with shape: (1, 384)
Retrieved 5 documents (after filtering)





[{'id': 'doc_8ee0344a_255',
  'content': '11.2 contours and opencv version caveats\ntract the actual contours list.\nFinally,Line 3 takes the parsed contours fromgrab_contours\nand draws them on our image. By using grab_contours we\ncan be sure our script will work across all OpenCV versions.\nIt is entirely up to you whether or not you want to use\nthe grab_contours function or simply make the assump-\ntion that your end user is utilizing a speciﬁc version of\nOpenCV and hard-code the return tuple. I have provided\nyou with examples of both inside the text and source code of\nthis book so you can see both in action (and make whatever\ndecision you feel is best based on your particular situation).\nFurther Reading\nWhenever you are working on a new problem, consider\nhow contours and the associated properties of contours\ncan help you solve the problem. More often than not,\na clever use of contours can save you a lot of time and\navoid more advanced (and tedious) techniques.\nOf cours

### RAG Pipeline- VectorDB To LLM Output Generation

In [None]:
### Simple RAG pipeline with Groq LLM
from langchain_groq import ChatGroq
import os
from dotenv import load_dotenv
load_dotenv()

### Initialize the Groq LLM (set your GROQ_API_KEY in environment)
groq_api_key = os.getenv("GROQ_API_KEY")
# Use a supported model - gemma2-9b-it was decommissioned.
# Recommended: llama3-8b-8192 (or another current model from your provider).
# The code falls back to a safe default if the env var MODEL_NAME is set.
model_name = os.getenv("MODEL_NAME", "llama3-8b-8192")

llm = ChatGroq(groq_api_key=groq_api_key, model_name=model_name, temperature=0.1, max_tokens=1024)

## 2. Simple RAG function: retrieve context + generate response
def rag_simple(query, retriever, llm, top_k=3):
    """Retrieve top_k contexts and ask llm to answer using only that context."""
    # retrieve the context
    results = retriever.retrieve(query, top_k=top_k)
    context = "\n\n".join([doc['content'] for doc in results]) if results else ""

    if not context:
        return "No relevant context found to answer the question."

    # build a clear prompt
    prompt = (
        "You are a helpful assistant. Use ONLY the information in the Context to answer the Question.\n\n"
        "Context:\n{context}\n\n"
        "Question: {query}\n\n"
        "Answer:"
    )

    # Use the llm.invoke API and handle different response shapes robustly
    try:
        raw = llm.invoke([prompt.format(context=context, query=query)])
        # Some LLM wrappers return an object with .content, others return a list or string
        if hasattr(raw, "content"):
            return raw.content
        elif isinstance(raw, list) and raw:
            first = raw[0]
            if hasattr(first, "content"):
                return first.content
            else:
                return str(raw)
        else:
            return str(raw)
    except Exception as e:
        return f"Error invoking LLM: {e}"


In [None]:
answer = rag_simple("What is attention mechanism?", rag_retriever, llm)
print(answer)

Retrieving documents for query: 'What is attention mechanism?'
Top K: 3, Score threshold: 0.0
Generating embeddings for 1 texts...


Batches: 100%|██████████| 1/1 [00:00<00:00, 22.66it/s]

Generated embeddings with shape: (1, 384)
Retrieved 0 documents (after filtering)
No relevant context found to answer the question.





In [None]:
# Re-initialize LLM using same environment-driven model_name variable
groq_api_key = os.getenv("GROQ_API_KEY")
model_name = os.getenv("MODEL_NAME", "llama3-8b-8192")
llm = ChatGroq(groq_api_key=groq_api_key, model_name=model_name, temperature=0.1, max_tokens=1024)
print(f"LLM initialized with model: {model_name}")

No relevant context found to answer the question.


In [42]:
# Prompt templates helper (copy-ready strings)\nPROMPT_RAG_QA = (\n    "You are a helpful assistant. Use ONLY the information in the Context to answer the Question. "\n    "If the answer is not present, reply 'I don't know from these documents.' and offer to search more.\n\n"\n    "Context:\n{context}\n\n"\n    "Question: {question}\n\n"\n    "Answer:"\n)\n\nPROMPT_SUMMARY_SHORT = (\n    "Summarize the following content in {sentences} sentences. Keep it factual and concise.\n\nContent:\n{context}\n"\n)\n\nPROMPT_EXTRACTION_JSON = (\n    "Extract the following fields and return valid JSON that matches the schema: {schema}. "\n    "If a field is missing, use null. Return ONLY JSON with these keys.\n\nContent:\n{context}\n"\n)\n\n# Example usage:\n# llm.invoke([PROMPT_RAG_QA.format(context=context, question=user_q)])

Retrieving documents for query: 'What is the main topic introduced in the first chapters of the uploaded PDF documents?'
Top K: 3, Score threshold: 0.0
Generating embeddings for 1 texts...


Batches: 100%|██████████| 1/1 [00:00<00:00, 18.65it/s]

Generated embeddings with shape: (1, 384)
Retrieved 0 documents (after filtering)
Prompt Question: What is the main topic introduced in the first chapters of the uploaded PDF documents?

Answer: No relevant context found to answer the question.





In [45]:
from langchain_groq import ChatGroq

# Initialize with a currently available Groq model
llm = ChatGroq(
    model="llama-3.1-8b-instant",  # Use this currently available model
    # model="llama-3.1-70b-versatile",  # Or this more powerful model
    temperature=0,
    groq_api_key=groq_api_key
)

In [None]:
# Your question about Canny edge detector is perfect for this context
answer = rag_simple(
    "What is the Canny edge detector? Explain its steps and provide examples.", 
    "Explain image thresholding methods in OpenCV",
    rag_retriever, 
    llm
)
print(answer)

Retrieving documents for query: 'What is the Canny edge detector? Explain its steps and provide examples.'
Top K: 3, Score threshold: 0.0
Generating embeddings for 1 texts...


Batches: 100%|██████████| 1/1 [00:00<00:00, 30.30it/s]

Generated embeddings with shape: (1, 384)
Retrieved 3 documents (after filtering)





The Canny edge detector is a multi-step process used to detect edges in an image. Its steps include:

1. **Blurring the image**: Removing noise by blurring the image.
2. **Computing Sobel gradient images**: Calculating the gradient of the image in the x and y directions.
3. **Suppressing edges**: Reducing the number of edges detected to reduce noise.
4. **Hysteresis thresholding**: Determining if a pixel is an edge or not based on its intensity value.

The Canny edge detector is useful for detecting the outline of objects in an image, such as the outline of coins in a grayscale image. It produces more "crisp" edges compared to other edge detection methods like the Laplacian or Sobel gradient images.

For example, in the given context, the Canny edge detector is applied to a grayscale image of coins, resulting in a more "crisp" edge detection with less noise. The lower and upper edge thresholds are typically set to determine if a pixel is an edge or not, with values below 30 considered 

In [48]:
# Your question about Canny edge detector is perfect for this context
answer = rag_simple(
    #"What is the Canny edge detector? Explain its steps and provide examples.", 
    "Explain image thresholding methods in OpenCV",
    rag_retriever, 
    llm
)
print(answer)

Retrieving documents for query: 'Explain image thresholding methods in OpenCV'
Top K: 3, Score threshold: 0.0
Generating embeddings for 1 texts...


Batches: 100%|██████████| 1/1 [00:00<00:00, 28.30it/s]

Generated embeddings with shape: (1, 384)
Retrieved 3 documents (after filtering)





Image thresholding in OpenCV is a process of converting an image into a binary image, where pixels are either black (0) or white (255). There are two main types of thresholding methods:

1. **Simple Thresholding**: This method involves setting a fixed threshold value (T) and comparing each pixel intensity with it. If the pixel intensity is greater than T, it is set to the maximum value (255), otherwise it is set to 0.

   Example: `cv2.threshold(image, T, 255, cv2.THRESH_BINARY)`

2. **Adaptive Thresholding**: This method involves dividing the image into small neighborhoods and applying the thresholding process to each neighborhood separately. The threshold value is calculated based on the mean intensity of the neighborhood.

   Example: `cv2.adaptiveThreshold(image, 255, cv2.ADAPTIVE_THRESH_MEAN_C, cv2.THRESH_BINARY, 11, 4)`

   In the example, `11` is the neighborhood size and `4` is the constant subtracted from the mean intensity.

There are also two types of adaptive thresholding m

In [50]:
# Your question about Canny edge detector is perfect for this context
answer = rag_simple(
   # "What is the Canny edge detector? Explain its steps and provide examples.", 
   # "Explain image thresholding methods in OpenCV",
   "Explain image transformations like rotation and translation",
    rag_retriever, 
    llm
)
print(answer)

Retrieving documents for query: 'Explain image transformations like rotation and translation'
Top K: 3, Score threshold: 0.0
Generating embeddings for 1 texts...


Batches: 100%|██████████| 1/1 [00:00<00:00, 35.94it/s]

Generated embeddings with shape: (1, 384)
Retrieved 3 documents (after filtering)





Image transformations are operations that modify the position, orientation, or size of an image. Two common types of image transformations are rotation and translation.

**Rotation:**
Rotation is the process of rotating an image by a specified angle around a certain point. This can be useful for various applications such as:

* Changing the orientation of an image
* Creating a sense of movement or action
* Enhancing the visual appeal of an image

In the context of the provided code, rotation is achieved using the `cv2.getRotationMatrix2D` function, which generates a rotation matrix based on the specified angle and center point. The `cv2.warpAffine` function is then used to apply the rotation to the image.

**Translation:**
Translation is the process of shifting an image by a specified amount in the x and y directions. This can be useful for various applications such as:

* Aligning an image with another image or object
* Creating a sense of movement or action
* Enhancing the visual app

## Enhanced RAG Pipeline Features

In [53]:
# --- Enhanced RAG Pipeline Features ---
def rag_advanced(query, retriever, llm, top_k=5, min_score=0.2, return_context=False):
    """
    RAG pipeline with extra features:
    - Returns answer, sources, confidence score, and optionally full context.
    """
    results = retriever.retrieve(query, top_k=top_k, score_threshold=min_score)
    if not results:
        return {'answer': 'No relevant context found.', 'sources': [], 'confidence': 0.0, 'context': ''}
    
    # Prepare context and sources
    context = "\n\n".join([doc['content'] for doc in results])
    sources = [{
        'source': doc['metadata'].get('source_file', doc['metadata'].get('source', 'unknown')),
        'page': doc['metadata'].get('page', 'unknown'),
        'score': doc['similarity_score'],
        'preview': doc['content'][:300] + '...'
    } for doc in results]
    confidence = max([doc['similarity_score'] for doc in results])
    
    # Generate answer
    prompt = f"""     "How does histogram equalization improve image contrast?"\nContext:\n{context}\n\nQuestion: {query}\n\nAnswer:"""
    response = llm.invoke([prompt.format(context=context, query=query)])
    
    output = {
        'answer': response.content,
        'sources': sources,
        'confidence': confidence
    }
    if return_context:
        output['context'] = context
    return output

# Example usage:
result = rag_advanced(   "How does histogram equalization improve image contrast?", rag_retriever, llm, top_k=3, min_score=0.1, return_context=True)
print("Answer:", result['answer'])
print("Sources:", result['sources'])
print("Confidence:", result['confidence'])
print("Context Preview:", result['context'][:300])

Retrieving documents for query: 'How does histogram equalization improve image contrast?'
Top K: 3, Score threshold: 0.1
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches: 100%|██████████| 1/1 [00:00<00:00, 34.08it/s]

Generated embeddings with shape: (1, 384)
Retrieved 3 documents (after filtering)





Answer: Histogram equalization improves image contrast by "stretching" the distribution of pixels. This is done by taking a histogram with a large peak at the center and stretching it out towards the corners of the image. This process helps to improve the global contrast of the image by making the darker areas darker and the lighter areas lighter.

In other words, histogram equalization redistributes the pixel values in the image so that the entire range of possible values is used, rather than having a large peak in the middle of the histogram. This results in a more even distribution of pixel values, which can make the image appear more visually appealing and easier to interpret.

For example, if an image has a large peak in the middle of the histogram, it means that most of the pixels in the image have similar values, resulting in a lack of contrast. By applying histogram equalization, the peak is stretched out towards the corners of the histogram, resulting in a more even distributi

In [54]:
# --- Enhanced RAG Pipeline Features ---
def rag_advanced(query, retriever, llm, top_k=5, min_score=0.2, return_context=False):
    """
    RAG pipeline with extra features:
    - Returns answer, sources, confidence score, and optionally full context.
    """
    results = retriever.retrieve(query, top_k=top_k, score_threshold=min_score)
    if not results:
        return {'answer': 'No relevant context found.', 'sources': [], 'confidence': 0.0, 'context': ''}
    
    # Prepare context and sources
    context = "\n\n".join([doc['content'] for doc in results])
    sources = [{
        'source': doc['metadata'].get('source_file', doc['metadata'].get('source', 'unknown')),
        'page': doc['metadata'].get('page', 'unknown'),
        'score': doc['similarity_score'],
        'preview': doc['content'][:300] + '...'
    } for doc in results]
    confidence = max([doc['similarity_score'] for doc in results])
    
    # Generate answer
    prompt = f"""Use the following context to answer the question concisely.\nContext:\n{context}\n\nQuestion: {query}\n\nAnswer:"""
    response = llm.invoke([prompt.format(context=context, query=query)])
    
    output = {
        'answer': response.content,
        'sources': sources,
        'confidence': confidence
    }
    if return_context:
        output['context'] = context
    return output

# Example usage:
result = rag_advanced("How do contours work in computer vision?", rag_retriever, llm, top_k=3, min_score=0.1, return_context=True)
print("Answer:", result['answer'])
print("Sources:", result['sources'])
print("Confidence:", result['confidence'])
print("Context Preview:", result['context'][:300])

Retrieving documents for query: 'How do contours work in computer vision?'
Top K: 3, Score threshold: 0.1
Generating embeddings for 1 texts...


Batches: 100%|██████████| 1/1 [00:00<00:00, 29.52it/s]

Generated embeddings with shape: (1, 384)
Retrieved 3 documents (after filtering)





Answer: Contours in computer vision are curves of points with no gaps in the curve, used for shape approximation and analysis. They are found in an image by first obtaining a binarization of the image, typically using edge detection methods or thresholding.
Sources: [{'source': 'Practical Python and OpenCV.pdf', 'page': 163, 'score': 0.2251429557800293, 'preview': 'a clever use of contours can save you a lot of time and\navoid more advanced (and tedious) techniques.\nOf course, contours can’t help you detect objects in im-\nages in all situations. But in certain circumstances, con-\ntours are all you need. I’ve included examples of such\nsituations in the supplemen...'}, {'source': 'Practical Python and OpenCV.pdf', 'page': 163, 'score': 0.12646496295928955, 'preview': '11.2 contours and opencv version caveats\ntract the actual contours list.\nFinally,Line 3 takes the parsed contours fromgrab_contours\nand draws them on our image. By using grab_contours we\ncan be sure our script will 

In [55]:
# --- Enhanced RAG Pipeline Features ---
def rag_advanced(query, retriever, llm, top_k=5, min_score=0.2, return_context=False):
    """
    RAG pipeline with extra features:
    - Returns answer, sources, confidence score, and optionally full context.
    """
    results = retriever.retrieve(query, top_k=top_k, score_threshold=min_score)
    if not results:
        return {'answer': 'No relevant context found.', 'sources': [], 'confidence': 0.0, 'context': ''}
    
    # Prepare context and sources
    context = "\n\n".join([doc['content'] for doc in results])
    sources = [{
        'source': doc['metadata'].get('source_file', doc['metadata'].get('source', 'unknown')),
        'page': doc['metadata'].get('page', 'unknown'),
        'score': doc['similarity_score'],
        'preview': doc['content'][:300] + '...'
    } for doc in results]
    confidence = max([doc['similarity_score'] for doc in results])
    
    # Generate answer
    prompt = f"""     "How does histogram equalization improve image contrast?"\nContext:\n{context}\n\nQuestion: {query}\n\nAnswer:"""
    response = llm.invoke([prompt.format(context=context, query=query)])
    
    output = {
        'answer': response.content,
        'sources': sources,
        'confidence': confidence
    }
    if return_context:
        output['context'] = context
    return output

# Example usage:
result = rag_advanced( "Find employees earning more than their managers?", rag_retriever, llm, top_k=3, min_score=0.1, return_context=True)
print("Answer:", result['answer'])
print("Sources:", result['sources'])
print("Confidence:", result['confidence'])
print("Context Preview:", result['context'][:300])

Retrieving documents for query: 'Find employees earning more than their managers?'
Top K: 3, Score threshold: 0.1
Generating embeddings for 1 texts...


Batches: 100%|██████████| 1/1 [00:00<00:00, 56.72it/s]

Generated embeddings with shape: (1, 384)
Retrieved 2 documents (after filtering)





Answer: To find employees earning more than their managers, you can use the following SQL query:

```sql
select w.ename, w.sal, m.ename, m.sal
from emp w, emp m
where w.mgr = m.empno and w.sal > m.sal;
```

This query joins the `emp` table with itself, where the first instance (`w`) represents the employee and the second instance (`m`) represents the manager. The `where` clause filters the results to include only rows where the employee's salary is greater than their manager's salary.
Sources: [{'source': 'SQL Queries .pdf', 'page': 22, 'score': 0.1497424840927124, 'preview': 'greater\tthan\tor\tequal\tto\tany\tother\temployee\tsalary\tof\tthe\tcompany.\nA)\t\t\t\tselect\te.ename,e.sal,e.comm\tfrom\temp\te\t\twhere\nnvl2(e.comm.,e.sal+e.comm.,e.sal)\t>=\tany\t(select\tsal\tfrom\temp);\t\t\t(OR)\nB)\t\t\t\tselect\tename,sal,comm.\tfrom\temp\twhere\tsal+nvl(comm.,0)\t>=\tany\t(select\nsal\tfrom\temp);/\n14.\t\t\t\t\t\t\t\tList\tth...'}, {'source': 'SQL Queries .pdf', 'page': 37, 'score':

In [56]:
# --- Enhanced RAG Pipeline Features ---
def rag_advanced(query, retriever, llm, top_k=5, min_score=0.2, return_context=False):
    """
    RAG pipeline with extra features:
    - Returns answer, sources, confidence score, and optionally full context.
    """
    results = retriever.retrieve(query, top_k=top_k, score_threshold=min_score)
    if not results:
        return {'answer': 'No relevant context found.', 'sources': [], 'confidence': 0.0, 'context': ''}
    
    # Prepare context and sources
    context = "\n\n".join([doc['content'] for doc in results])
    sources = [{
        'source': doc['metadata'].get('source_file', doc['metadata'].get('source', 'unknown')),
        'page': doc['metadata'].get('page', 'unknown'),
        'score': doc['similarity_score'],
        'preview': doc['content'][:300] + '...'
    } for doc in results]
    confidence = max([doc['similarity_score'] for doc in results])
    
    # Generate answer
    prompt = f"""Use the following context to answer the question concisely.\nContext:\n{context}\n\nQuestion: {query}\n\nAnswer:"""
    response = llm.invoke([prompt.format(context=context, query=query)])
    
    output = {
        'answer': response.content,
        'sources': sources,
        'confidence': confidence
    }
    if return_context:
        output['context'] = context
    return output

# Example usage:
result = rag_advanced("List department with highest number of employees", rag_retriever, llm, top_k=3, min_score=0.1, return_context=True)
print("Answer:", result['answer'])
print("Sources:", result['sources'])
print("Confidence:", result['confidence'])
print("Context Preview:", result['context'][:300])

Retrieving documents for query: 'List department with highest number of employees'
Top K: 3, Score threshold: 0.1
Generating embeddings for 1 texts...


Batches: 100%|██████████| 1/1 [00:00<00:00, 32.01it/s]

Generated embeddings with shape: (1, 384)
Retrieved 2 documents (after filtering)





Answer: select * from emp group by deptno having count(*) = (select max(count(*)) from emp group by deptno);
Sources: [{'source': 'SQL Queries .pdf', 'page': 18, 'score': 0.1564648151397705, 'preview': 'where\te.deptno\t=\td.deptno\tgroup\tby\td.deptno,d.dname,d..loc\nhaving\tcount(*)\t=\t(select\tmax(count(*)\t)\tfrom\temp\tgroup\tby\tdeptno);\n2.82.\t\t\t\t\tDisplay\tthe\temps\twhose\tmanager\tname\tis\tjones.\nK)\t\t\t\tselect\t*\tfrom\temp\twhere\tmgr\tin...'}, {'source': 'SQL Queries .pdf', 'page': 52, 'score': 0.10592901706695557, 'preview': '234)\t\t\tAny\temp\tSal\tof\temp5\ttable.\nA)\tselect\t*\tfrom\temp5;\n235)\t\t\tList\tthe\thighest\tpaid\temp.\nA)\t\t\t\tselect\t*\tfrom\temp\twhere\tsal\tin\t(select\tmax(sal)\tfrom\temp);\n236)\t\t\tList\tthe\tdetails\tof\tmost\trecently\thired\temp\tof\tdept\t30.\nA)\t\t\t\tselect\t*\tfrom\temp\twhere\thiredate\tin\n(select\tmax(hiredate)\tfrom\temp\twhere\tde...'}]
Confidence: 0.1564648151397705
Context Preview: where	e.deptno	=	d.dept

In [58]:
## # --- Advanced RAG Pipeline: Streaming, Citations, History, Summarization ---
from typing import List, Dict, Any
import time

class AdvancedRAGPipeline:
    def __init__(self, retriever, llm):
        self.retriever = retriever
        self.llm = llm
        self.history = []  # Store query history

    def query(self, question: str, top_k: int = 5, min_score: float = 0.2, stream: bool = False, summarize: bool = False) -> Dict[str, Any]:
        # Retrieve relevant documents
        results = self.retriever.retrieve(question, top_k=top_k, score_threshold=min_score)
        if not results:
            answer = "No relevant context found."
            sources = []
            context = ""
        else:
            context = "\n\n".join([doc['content'] for doc in results])
            sources = [{
                'source': doc['metadata'].get('source_file', doc['metadata'].get('source', 'unknown')),
                'page': doc['metadata'].get('page', 'unknown'),
                'score': doc['similarity_score'],
                'preview': doc['content'][:120] + '...'
            } for doc in results]
            # Streaming answer simulation
            prompt = f"""Use the following context to answer the question concisely.\nContext:\n{context}\n\nQuestion: {question}\n\nAnswer:"""
            if stream:
                print("Streaming answer:")
                for i in range(0, len(prompt), 80):
                    print(prompt[i:i+80], end='', flush=True)
                    time.sleep(0.05)
                print()
            response = self.llm.invoke([prompt.format(context=context, question=question)])
            answer = response.content

        # Add citations to answer
        citations = [f"[{i+1}] {src['source']} (page {src['page']})" for i, src in enumerate(sources)]
        answer_with_citations = answer + "\n\nCitations:\n" + "\n".join(citations) if citations else answer

        # Optionally summarize answer
        summary = None
        if summarize and answer:
            summary_prompt = f"Summarize the following answer in 2 sentences:\n{answer}"
            summary_resp = self.llm.invoke([summary_prompt])
            summary = summary_resp.content

        # Store query history
        self.history.append({
            'question': question,
            'answer': answer,
            'sources': sources,
            'summary': summary
        })

        return {
            'question': question,
            'answer': answer_with_citations,
            'sources': sources,
            'summary': summary,
            'history': self.history
        }

# Example usage:
adv_rag = AdvancedRAGPipeline(rag_retriever, llm)
result = adv_rag.query("How does histogram equalization improve image contrast?", top_k=3, min_score=0.1, stream=True, summarize=True)
print("\nFinal Answer:", result['answer'])
print("Summary:", result['summary'])
print("History:", result['history'][-1])


Retrieving documents for query: 'How does histogram equalization improve image contrast?'
Top K: 3, Score threshold: 0.1
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches: 100%|██████████| 1/1 [00:00<00:00, 31.20it/s]

Generated embeddings with shape: (1, 384)
Retrieved 3 documents (after filtering)
Streaming answer:
Use the following context to answer the question concisely.
Context:
7.4 histogram equalization
The code here is very simple – it’s just an extension of the
code above. We are now computing an 8 × 8 × 8 histogram
for each of the RGB channel




s. We can’t visualize this his-
togram, but we can see that the shape is indeed (8,8,8)
with 512 values.
7.4 histogram equalization
Histogram equalization improves the contrast of an image
by “stretching” the distribution of pixels. Consider a his-
togram with a large peak at the center of it. Applying his-
togram equalization will stretch the peak out towards the
corner of the image, thus improving the global contrast of
the image. Histogram equalization is applied to grayscale
images.
This method is useful when an image contains foregroun-
ds and backgrounds that are both dark or both light. It
tends to produce unrealistic effects in photographs; how-
ever, it is normally useful when enhancing the contrast of
medical or satellite images.
Regardless whether you are applying histogram equaliza-

ever, it is normally useful when enhancing the contrast of
medical or satellite images.
Regardless whether you are applying histogram equaliza-
tion to a photograph, a satellite image, or an X-