### 1. Data Ingestion to Vector database 

1.1. Read documents 

In [4]:
import os
from pathlib import Path

from langchain_core.documents import Document
from langchain_community.document_loaders import PyMuPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [19]:
def process_all_documents(pdf_dir):
    all_documents = []
    pdf_dir = Path(pdf_dir)

    pdf_files = list(pdf_dir.glob('*.pdf'))
    print(f"Found {len(pdf_files)} PDF files in {pdf_dir}")

    for pdf_file in pdf_files:
        print(f'Processing file {pdf_file}')
        try:
            loader = PyMuPDFLoader(str(pdf_file))
            documents = loader.load()

            for document in documents:
                metadata = document.metadata
                metadata['source'] = str(pdf_file)
                document.metadata = metadata
                all_documents.append(document)
            print(f'Loaded {len(documents)} pages')
        except Exception as e:
            print(f"Error processing file {pdf_file}: {e}")
    
    print(f'Total documents: {len(all_documents)}')

    return all_documents

In [20]:
pdf_dir = '../data/pdf/'
all_documents = process_all_documents(pdf_dir)

Found 3 PDF files in ..\data\pdf
Processing file ..\data\pdf\attention-is-all-you-need.pdf
Loaded 11 pages
Processing file ..\data\pdf\Deep_Residual_Learning_CVPR_2016_paper.pdf
Loaded 9 pages
Processing file ..\data\pdf\Generic Algorithm.pdf
Loaded 4 pages
Total documents: 24


In [21]:
print(type(all_documents[0]))

<class 'langchain_core.documents.base.Document'>


1.2 Chunking

In [22]:
def split_documents(documents, chunk_size = 1000, chunk_overlap = 100):
    splitter = RecursiveCharacterTextSplitter(
        chunk_size = chunk_size,
        chunk_overlap = chunk_overlap,
        length_function = len,
        separators=["\n\n", "\n", " ", ""]
    )

    splitted_docs = splitter.split_documents(documents)
    if splitted_docs:
        print(f'Total splitted documents splitted from {len(documents)} documents: {len(splitted_docs)} chunks')
        print(f'First splitted document content:\n{splitted_docs[0].page_content[:50]}\n')
    else:
        print('No documents were splitted. Please check the input documents.')
    return splitted_docs

chunks = split_documents(all_documents)

Total splitted documents splitted from 24 documents: 104 chunks
First splitted document content:
Attention Is All You Need
Ashish Vaswani∗
Google B



1.3. Embedding all documents

In [2]:
import numpy as np
from sentence_transformers import SentenceTransformer
import chromadb
from chromadb.config import Settings
import uuid
from typing import List, Dict, Any, Tuple
from sklearn.metrics.pairwise import cosine_similarity

  from .autonotebook import tqdm as notebook_tqdm


In [51]:
class EmbeddingManager:
    def __init__(self, model_name: str = 'all-MiniLM-L6-v2', collection_name = 'pdf_documents', persist_directory: str = '../data/vector_store/chroma_db'):
        self.model_name = model_name
        self.collection_name = collection_name
        self.persist_directory = persist_directory
        self.model = self.load_model()
        self.client = self.load_db_client()
        self.collection = self.client.get_or_create_collection(
            name = self.collection_name,
            metadata = {"description": "PDF document embeddings for RAG"}
        )

    def load_model(self):
        try:
            print(f'Loading model {self.model_name}')
            model = SentenceTransformer(self.model_name)
            print(f'Loading successfully \nEmbedding dimension: {model.get_sentence_embedding_dimension()}')
            return model
        except Exception as e:
            print(f"Error loading model {self.model_name}: {e}")
            raise e
        
    def load_db_client(self):
        try:
            print(f'Loading ChromaDB client with persist directory')
            os.makedirs(self.persist_directory, exist_ok = True)
            client = chromadb.PersistentClient(path=self.persist_directory)
            print('ChromaDB client loaded successfully')
            return client
        except Exception as e:
            print(f"Error loading ChromaDB client: {e}")
            raise e
        
    def embed_texts(self, texts: List[str]):
        if not self.model:
            raise ValueError("Model is not loaded.")
        print(f'Embedding for {len(texts)} texts')
        embeddings = self.model.encode(texts, show_progress_bar=True)
        print(f'Embeddings shape: {embeddings.shape}')
        return embeddings

    def add_documents(self, documents: List[Any], embeddings: np.ndarray):
        """
        Add documents and their embeddings to the vector store
        
        Args:
            documents: List of LangChain documents
            embeddings: Corresponding embeddings for the documents
        """
        if len(documents) != len(embeddings):
            raise ValueError("Number of documents must match number of embeddings")
        
        print(f"Adding {len(documents)} documents to vector store...")
        

        ids = []
        metadatas = []
        documents_text = []
        embeddings_list = []
        
        for i, (doc, embedding) in enumerate(zip(documents, embeddings)):
            # Generate unique ID
            doc_id = f"doc_{uuid.uuid4().hex[:8]}_{i}"
            ids.append(doc_id)
            
            # Prepare metadata
            metadata = dict(doc.metadata)
            metadata['doc_index'] = i
            metadata['content_length'] = len(doc.page_content)
            metadatas.append(metadata)
            
            # Document content
            documents_text.append(doc.page_content)
            
            # Embedding
            embeddings_list.append(embedding.tolist())
        
        try:
            self.collection.add(
                ids=ids,
                embeddings=embeddings_list,
                metadatas=metadatas,
                documents=documents_text
            )
            print(f"Successfully added {len(documents)} documents to vector store")
            print(f"Total documents in collection: {self.collection.count()}")
            
        except Exception as e:
            print(f"Error adding documents to vector store: {e}")
            raise

    def query(self, query_text: str, top_k: int = 5) -> List[Dict[str, Any]]:
        query_embedding = self.embed_texts([query_text])[0].reshape(1, -1)
        results = self.collection.query(
            query_embeddings=query_embedding.tolist(),
            n_results=top_k
        )
        return results['documents'][0], results['metadatas'][0]

In [52]:
embedding_manager = EmbeddingManager()


Loading model all-MiniLM-L6-v2
Loading successfully 
Embedding dimension: 384
Loading ChromaDB client with persist directory
ChromaDB client loaded successfully


In [54]:
texts = [doc.page_content for doc in chunks]
embeddings = embedding_manager.embed_texts(texts)
embedding_manager.add_documents(chunks, embeddings)

Embedding for 104 texts


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches: 100%|██████████| 4/4 [00:06<00:00,  1.65s/it]


Embeddings shape: (104, 384)
Adding 104 documents to vector store...
Successfully added 104 documents to vector store
Total documents in collection: 104


### 2. Retriver

In [63]:
class Retriever:
    def __init__(self, embedding_manager: EmbeddingManager):
        self.embedding_manager = embedding_manager
    
    def retrieve(self, query: str, top_k: int = 5, score_threshold: float = 0.2) -> List[Tuple[str, Dict[str, Any]]]:
        print(f"Retrieving documents for query: '{query}'")
        print(f"Top K: {top_k}, Score threshold: {score_threshold}")
        try:
            query_embedding = self.embedding_manager.embed_texts([query])[0].reshape(1, -1)
            results = self.embedding_manager.collection.query(
                query_embeddings=query_embedding.tolist(),
                n_results=top_k
            )
            retrieved_docs = []

            if(results['documents'] and results['metadatas']):
                for doc, metadata in zip(results['documents'][0], results['metadatas'][0]):
                    doc_embedding = self.embedding_manager.model.encode([doc])[0].reshape(1, -1)
                    similarity = cosine_similarity(query_embedding, doc_embedding)[0][0]
                    if similarity >= score_threshold:
                        retrieved_docs.append((doc, metadata))
                        similarity_score = 1 - similarity
                        print(f"Retrieved document with similarity {similarity_score:.4f}")
                    else:
                        print(f"Document similarity {similarity:.4f} below threshold")
            else:
                print("No documents retrieved from the collection.")
            
            return retrieved_docs
        
        except Exception as e:
            print(f"Error during retrieval: {e}")
            return []

In [64]:
retriever = Retriever(embedding_manager)

In [69]:
results = retriever.retrieve(query = "What is generic algorithm?")

for i, result in enumerate(results):
    print(f"Result {i+1}:\n")
    print(f"Content: {result[0][:1000]}...\nMetadata: {result[1]}\n")

Retrieving documents for query: 'What is generic algorithm?'
Top K: 5, Score threshold: 0.2
Embedding for 1 texts


Batches: 100%|██████████| 1/1 [00:00<00:00, 11.32it/s]

Embeddings shape: (1, 384)





Retrieved document with similarity 0.5619
Retrieved document with similarity 0.5819
Retrieved document with similarity 0.5826
Retrieved document with similarity 0.6011
Retrieved document with similarity 0.6113
Result 1:

Content: Genetic Algorithms
STEPHANIE FORREST
Department of Computer Science, University of New Mexico, Albuquerque ^forrest@cs.unm.edu&
A genetic algorithm is a computa-
tional model of biological evolution. Ge-
netic
algorithms
are
useful
both
as
search methods for solving problems
and for modeling evolutionary systems.
In genetic algorithms, binary strings
are stored in a computer’s memory and
over time are modified in much the
same way that populations of individu-
als evolve under natural selection. Al-
though
the
computational
setting
is
highly simplified when compared with
the natural world, genetic algorithms
are
capable
of
evolving
surprisingly
complex
and
interesting
structures.
These
structures,
called
individuals,
can represent solutions to problems,
strate

### 3. Enhancing LLM Prompt by RAG

In [98]:
import os
from dotenv import load_dotenv
load_dotenv()

groq_api_key = os.getenv("GROQ_API_KEY")
gemini_api_key = os.getenv("GEMINI_API_KEY")
# print(os.getenv("GROQ_API_KEY"))

In [95]:
from langchain_groq import ChatGroq
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_core.prompts import PromptTemplate
from langchain.schema import HumanMessage, SystemMessage

In [91]:
class GroqLLM:
    def __init__(self, model_name: str = "llama3-8b-8192", api_key: str = None):
        self.model_name = model_name
        self.api_key = api_key
        self.model = self.load_model()

    
    def load_model(self):
        if not self.api_key:
            raise ValueError("GROQ_API_KEY is not set. Please set it in your environment variables.")
        try:
            print(f'Loading GROQ model {self.model_name}')
            model = ChatGroq(
                model_name=self.model_name, 
                api_key=self.api_key,
                temperature = 0.1,
                max_tokens = 1024)
            print('GROQ model loaded successfully')
            return model
        except Exception as e:
            print(f"Error loading GROQ model {self.model_name}: {e}")
            raise e
        
    def generate_response(self, question: str, context: str) -> str:
        prompt_template = """You are an AI assistant helping users find information. Use the following context to answer the question accurately and concisely.
Context:
{context}

Question: {question}

Answer: Provide a clear and informative answer based on the context above. If the context doesn't contain enough information to answer the question, say so."""
        try: 
            prompt = PromptTemplate(
                input_variables = ['context', 'question'],
                template = prompt_template
            )

            formatted_prompt = prompt.format(context = context, question = question)
            messages = [
                HumanMessage(content = formatted_prompt)
            ]
            response = self.model.invoke(messages)
            return response.content
        except Exception as e:
            print(f"Error generating response: {e}")
            raise e
        
groq_llm = GroqLLM(api_key = groq_api_key)
        

Loading GROQ model llama3-8b-8192
GROQ model loaded successfully


In [99]:
class GeminiLLM:
    def __init__(self, model_name: str = "gemini-2.5-flash", api_key: str = None):
        self.model_name = model_name
        self.api_key = api_key
        self.model = self.load_model()

    def load_model(self):
        if not self.api_key:
            raise ValueError("GOOGLE_API_KEY is not set. Please set it in your environment variables.")
        try:
            print(f'Loading Gemini model: {self.model_name}')
            model = ChatGoogleGenerativeAI(
                model=self.model_name,
                google_api_key=self.api_key,
                temperature=0.1,
                max_output_tokens=1024,
            )
            print('Gemini model loaded successfully')
            return model
        except Exception as e:
            print(f"Error loading Gemini model {self.model_name}: {e}")
            raise e
            
    def generate_response(self, question: str, context: str) -> str:
        prompt_template = """You are an AI assistant helping users find information. Use the following context to answer the question accurately and concisely.
Context:
{context}

Question: {question}

Answer: Provide a clear and informative answer based on the context above. If the context doesn't contain enough information to answer the question, say so."""
        try: 
            prompt = PromptTemplate(
                input_variables=['context', 'question'],
                template=prompt_template
            )

            formatted_prompt = prompt.format(context=context, question=question)
            messages = [
                HumanMessage(content=formatted_prompt)
            ]
            response = self.model.invoke(messages)
            return response.content
        except Exception as e:
            print(f"Error generating response: {e}")
            raise e
        
gemini_llm = GeminiLLM(api_key = gemini_api_key)

Loading Gemini model: gemini-2.5-flash
Gemini model loaded successfully


In [100]:
class RAG:
    def __init__(self, retriever: Retriever = None, llm: GeminiLLM = None):
        self.retriever = retriever
        self.llm = llm

    def get_input_and_response(self, query: str, top_k: int = 5, score_threshold: float = 0.2) -> str:
        retrieved_docs = self.retriever.retrieve(query, top_k, score_threshold)
        if not retrieved_docs:
            return "No relevant documents found to answer the query."

        context = "\n\n".join([doc[0] for doc in retrieved_docs])
        response = self.llm.generate_response(query, context)
        return response


In [101]:
rag = RAG(retriever = retriever, llm = gemini_llm)

In [104]:
result = rag.get_input_and_response(query = "How does generic algorithm work?")
print("=" * 80)
print(f'Chatbot response:\n {result}\n')

Retrieving documents for query: 'How does generic algorithm work?'
Top K: 5, Score threshold: 0.2
Embedding for 1 texts


Batches: 100%|██████████| 1/1 [00:00<00:00, 40.27it/s]

Embeddings shape: (1, 384)
Retrieved document with similarity 0.6009
Retrieved document with similarity 0.6181





Retrieved document with similarity 0.6299
Retrieved document with similarity 0.6390
Retrieved document with similarity 0.6578
Chatbot response:
 A genetic algorithm works by:
1.  **Creating a random population:** Initially, a population of individuals (often represented as bit strings) is created randomly. Each individual is a candidate solution to a problem.
2.  **Evaluating fitness:** Variations among individuals lead to some being more "fit" (i.e., better problem solutions) than others.
3.  **Selection:** These differences in fitness are used to bias the selection of a new set of candidate solutions. A new population is created by making copies of more successful individuals and deleting less successful ones.
4.  **Genetic operators:** The copies are not exact. During the copy operation, genetic operators are applied probabilistically:
    *   **Mutation:** Random bit flips occur.
    *   **Crossover:** Corresponding substrings are exchanged between two individuals.
    These operat