# Data ingestion Pipeline

In [12]:
from langchain_community.document_loaders import PyPDFLoader, PyMuPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from pathlib import Path

In [13]:
### Read all the pdf's inside the directory
def process_all_pdfs(pdf_directory):
    """Process all PDF files in a directory"""
    all_documents = []
    pdf_dir = Path(pdf_directory)
    
    # Find all PDF files recursively
    pdf_files = list(pdf_dir.glob("**/*.pdf"))
    
    print(f"Found {len(pdf_files)} PDF files to process")
    
    for pdf_file in pdf_files:
        print(f"\nProcessing: {pdf_file.name}")
        try:
            loader = PyPDFLoader(str(pdf_file))
            documents = loader.load()
            
            # Add source information to metadata
            for doc in documents:
                doc.metadata['source_file'] = pdf_file.name
                doc.metadata['file_type'] = 'pdf'
            
            all_documents.extend(documents)
            print(f"  ✓ Loaded {len(documents)} pages")
            
        except Exception as e:
            print(f"  ✗ Error: {e}")
    
    print(f"\nTotal documents loaded: {len(all_documents)}")
    return all_documents
# Process all PDFs in the data directory
all_pdf_documents = process_all_pdfs("../data")

Found 2 PDF files to process

Processing: DUK_Admission_Prospectus-2025.pdf
  ✓ Loaded 31 pages

Processing: PG_Regulatuions rev-2023 .pdf
  ✓ Loaded 18 pages

Total documents loaded: 49


In [4]:

all_pdf_documents

[Document(metadata={'producer': 'iLovePDF', 'creator': 'PyPDF', 'creationdate': '', 'moddate': '2025-09-29T06:51:30+00:00', 'source': '..\\data\\DUK_Admission_Prospectus-2025.pdf', 'total_pages': 31, 'page': 0, 'page_label': '1', 'source_file': 'DUK_Admission_Prospectus-2025.pdf', 'file_type': 'pdf'}, page_content='Kerala University of Digital Sciences, \nInnovation and Technology \n(Digital University Kerala) \n- DIGITAL \n?.f UNIVERSITY \n- KERALA \nPROSPECTUS 5 \nwE'),
 Document(metadata={'producer': 'iLovePDF', 'creator': 'PyPDF', 'creationdate': '', 'moddate': '2025-09-29T06:51:30+00:00', 'source': '..\\data\\DUK_Admission_Prospectus-2025.pdf', 'total_pages': 31, 'page': 1, 'page_label': '2', 'source_file': 'DUK_Admission_Prospectus-2025.pdf', 'file_type': 'pdf'}, page_content='Contents \nAbout the University .. \nDA ENIC PO OGS o s s s iS85 Ay A s s e A SRR X SRR 4 \nAICTE Approved M.Tech. & MBA Programmes ...........ccceeuriuemiussnsssnsinsnssissnsnsesnnn 4 \nM.Tech. Computer Sc

In [14]:
### Text splitting get into chunks

def split_documents(documents,chunk_size=1000,chunk_overlap=200):
    """Split documents into smaller chunks for better RAG performance"""
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
        separators=["\n\n", "\n", " ", ""]
    )
    split_docs = text_splitter.split_documents(documents)
    print(f"Split {len(documents)} documents into {len(split_docs)} chunks")
    
    # Show example of a chunk
    if split_docs:
        print(f"\nExample chunk:")
        print(f"Content: {split_docs[0].page_content[:200]}...")
        print(f"Metadata: {split_docs[0].metadata}")
    
    return split_docs

In [15]:
chunks=split_documents(all_pdf_documents)
chunks

Split 49 documents into 122 chunks

Example chunk:
Content: Kerala University of Digital Sciences, 
Innovation and Technology 
(Digital University Kerala) 
- DIGITAL 
?.f UNIVERSITY 
- KERALA 
PROSPECTUS 5 
wE...
Metadata: {'producer': 'iLovePDF', 'creator': 'PyPDF', 'creationdate': '', 'moddate': '2025-09-29T06:51:30+00:00', 'source': '..\\data\\DUK_Admission_Prospectus-2025.pdf', 'total_pages': 31, 'page': 0, 'page_label': '1', 'source_file': 'DUK_Admission_Prospectus-2025.pdf', 'file_type': 'pdf'}


[Document(metadata={'producer': 'iLovePDF', 'creator': 'PyPDF', 'creationdate': '', 'moddate': '2025-09-29T06:51:30+00:00', 'source': '..\\data\\DUK_Admission_Prospectus-2025.pdf', 'total_pages': 31, 'page': 0, 'page_label': '1', 'source_file': 'DUK_Admission_Prospectus-2025.pdf', 'file_type': 'pdf'}, page_content='Kerala University of Digital Sciences, \nInnovation and Technology \n(Digital University Kerala) \n- DIGITAL \n?.f UNIVERSITY \n- KERALA \nPROSPECTUS 5 \nwE'),
 Document(metadata={'producer': 'iLovePDF', 'creator': 'PyPDF', 'creationdate': '', 'moddate': '2025-09-29T06:51:30+00:00', 'source': '..\\data\\DUK_Admission_Prospectus-2025.pdf', 'total_pages': 31, 'page': 1, 'page_label': '2', 'source_file': 'DUK_Admission_Prospectus-2025.pdf', 'file_type': 'pdf'}, page_content='Contents \nAbout the University .. \nDA ENIC PO OGS o s s s iS85 Ay A s s e A SRR X SRR 4 \nAICTE Approved M.Tech. & MBA Programmes ...........ccceeuriuemiussnsssnsinsnssissnsnsesnnn 4 \nM.Tech. Computer Sc

Embedding

In [16]:
import numpy as np
from sentence_transformers import SentenceTransformer
import chromadb
from chromadb.config import Settings
import uuid
from typing import List, Dict, Any, Tuple
from sklearn.metrics.pairwise import cosine_similarity
import os

In [17]:
class EmbeddingManager:
    """Handles document embedding generation using SentenceTransformer"""
    
    def __init__(self, model_name: str = "all-MiniLM-L6-v2"):
        """
        Initialize the embedding manager
        
        Args:
            model_name: HuggingFace model name for sentence embeddings
        """
        self.model_name = model_name
        self.model = None
        self._load_model()

    def _load_model(self):
        """Load the SentenceTransformer model"""
        try:
            print(f"Loading embedding model: {self.model_name}")
            self.model = SentenceTransformer(self.model_name)
            print(f"Model loaded successfully. Embedding dimension: {self.model.get_sentence_embedding_dimension()}")
        except Exception as e:
            print(f"Error loading model {self.model_name}: {e}")
            raise

    def generate_embeddings(self, texts: List[str]) -> np.ndarray:
        """
        Generate embeddings for a list of texts
        
        Args:
            texts: List of text strings to embed
            
        Returns:
            numpy array of embeddings with shape (len(texts), embedding_dim)
        """
        if not self.model:
            raise ValueError("Model not loaded")
        
        print(f"Generating embeddings for {len(texts)} texts...")
        embeddings = self.model.encode(texts, show_progress_bar=True)
        print(f"Generated embeddings with shape: {embeddings.shape}")
        return embeddings


## initialize the embedding manager

embedding_manager=EmbeddingManager()
embedding_manager

Loading embedding model: all-MiniLM-L6-v2
Model loaded successfully. Embedding dimension: 384


<__main__.EmbeddingManager at 0x27cc0c2ba10>

Vector Store

In [18]:
class VectorStore:
    """Manages document embeddings in a ChromaDB vector store"""
    
    def __init__(self, collection_name: str = "pdf_documents", persist_directory: str = "../data/vector_store"):
        """
        Initialize the vector store
        
        Args:
            collection_name: Name of the ChromaDB collection
            persist_directory: Directory to persist the vector store
        """
        self.collection_name = collection_name
        self.persist_directory = persist_directory
        self.client = None
        self.collection = None
        self._initialize_store()

    def _initialize_store(self):
        """Initialize ChromaDB client and collection"""
        try:
            # Create persistent ChromaDB client
            os.makedirs(self.persist_directory, exist_ok=True)
            self.client = chromadb.PersistentClient(path=self.persist_directory)
            
            # Get or create collection
            self.collection = self.client.get_or_create_collection(
                name=self.collection_name,
                metadata={"description": "PDF document embeddings for RAG"}
            )
            print(f"Vector store initialized. Collection: {self.collection_name}")
            print(f"Existing documents in collection: {self.collection.count()}")
            
        except Exception as e:
            print(f"Error initializing vector store: {e}")
            raise

    def add_documents(self, documents: List[Any], embeddings: np.ndarray):
        """
        Add documents and their embeddings to the vector store
        
        Args:
            documents: List of LangChain documents
            embeddings: Corresponding embeddings for the documents
        """
        if len(documents) != len(embeddings):
            raise ValueError("Number of documents must match number of embeddings")
        
        print(f"Adding {len(documents)} documents to vector store...")
        
        # Prepare data for ChromaDB
        ids = []
        metadatas = []
        documents_text = []
        embeddings_list = []
        
        for i, (doc, embedding) in enumerate(zip(documents, embeddings)):
            # Generate unique ID
            doc_id = f"doc_{uuid.uuid4().hex[:8]}_{i}"
            ids.append(doc_id)
            
            # Prepare metadata
            metadata = dict(doc.metadata)
            metadata['doc_index'] = i
            metadata['content_length'] = len(doc.page_content)
            metadatas.append(metadata)
            
            # Document content
            documents_text.append(doc.page_content)
            
            # Embedding
            embeddings_list.append(embedding.tolist())
        
        # Add to collection
        try:
            self.collection.add(
                ids=ids,
                embeddings=embeddings_list,
                metadatas=metadatas,
                documents=documents_text
            )
            print(f"Successfully added {len(documents)} documents to vector store")
            print(f"Total documents in collection: {self.collection.count()}")
            
        except Exception as e:
            print(f"Error adding documents to vector store: {e}")
            raise

vectorstore=VectorStore()
vectorstore

Vector store initialized. Collection: pdf_documents
Existing documents in collection: 488


<__main__.VectorStore at 0x27cc0c2bb60>

In [10]:
chunks

[Document(metadata={'producer': 'iLovePDF', 'creator': 'PyPDF', 'creationdate': '', 'moddate': '2025-09-29T06:51:30+00:00', 'source': '..\\data\\DUK_Admission_Prospectus-2025.pdf', 'total_pages': 31, 'page': 0, 'page_label': '1', 'source_file': 'DUK_Admission_Prospectus-2025.pdf', 'file_type': 'pdf'}, page_content='Kerala University of Digital Sciences, \nInnovation and Technology \n(Digital University Kerala) \n- DIGITAL \n?.f UNIVERSITY \n- KERALA \nPROSPECTUS 5 \nwE'),
 Document(metadata={'producer': 'iLovePDF', 'creator': 'PyPDF', 'creationdate': '', 'moddate': '2025-09-29T06:51:30+00:00', 'source': '..\\data\\DUK_Admission_Prospectus-2025.pdf', 'total_pages': 31, 'page': 1, 'page_label': '2', 'source_file': 'DUK_Admission_Prospectus-2025.pdf', 'file_type': 'pdf'}, page_content='Contents \nAbout the University .. \nDA ENIC PO OGS o s s s iS85 Ay A s s e A SRR X SRR 4 \nAICTE Approved M.Tech. & MBA Programmes ...........ccceeuriuemiussnsssnsinsnssissnsnsesnnn 4 \nM.Tech. Computer Sc

In [19]:
### Convert the text to embeddings
texts=[doc.page_content for doc in chunks]

## Generate the Embeddings

embeddings=embedding_manager.generate_embeddings(texts)

##store in the vector dtaabase
vectorstore.add_documents(chunks,embeddings)

Generating embeddings for 122 texts...


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches: 100%|██████████| 4/4 [00:18<00:00,  4.57s/it]


Generated embeddings with shape: (122, 384)
Adding 122 documents to vector store...
Successfully added 122 documents to vector store
Total documents in collection: 610


**RAG Retriever Pipeline**

In [20]:
class RAGRetriever:
    """Handles query-based retrieval from the vector store"""
    
    def __init__(self, vector_store: VectorStore, embedding_manager: EmbeddingManager):
        """
        Initialize the retriever
        
        Args:
            vector_store: Vector store containing document embeddings
            embedding_manager: Manager for generating query embeddings
        """
        self.vector_store = vector_store
        self.embedding_manager = embedding_manager

    def retrieve(self, query: str, top_k: int = 5, score_threshold: float = 0.0) -> List[Dict[str, Any]]:
        """
        Retrieve relevant documents for a query
        
        Args:
            query: The search query
            top_k: Number of top results to return
            score_threshold: Minimum similarity score threshold
            
        Returns:
            List of dictionaries containing retrieved documents and metadata
        """
        print(f"Retrieving documents for query: '{query}'")
        print(f"Top K: {top_k}, Score threshold: {score_threshold}")
        
        # Generate query embedding
        query_embedding = self.embedding_manager.generate_embeddings([query])[0]
        
        # Search in vector store
        try:
            results = self.vector_store.collection.query(
                query_embeddings=[query_embedding.tolist()],
                n_results=top_k
            )
            
            # Process results
            retrieved_docs = []
            
            if results['documents'] and results['documents'][0]:
                documents = results['documents'][0]
                metadatas = results['metadatas'][0]
                distances = results['distances'][0]
                ids = results['ids'][0]
                
                for i, (doc_id, document, metadata, distance) in enumerate(zip(ids, documents, metadatas, distances)):
                    # Convert distance to similarity score (ChromaDB uses cosine distance)
                    similarity_score = 1 - distance
                    
                    if similarity_score >= score_threshold:
                        retrieved_docs.append({
                            'id': doc_id,
                            'content': document,
                            'metadata': metadata,
                            'similarity_score': similarity_score,
                            'distance': distance,
                            'rank': i + 1
                        })
                
                print(f"Retrieved {len(retrieved_docs)} documents (after filtering)")
            else:
                print("No documents found")
            
            return retrieved_docs
            
        except Exception as e:
            print(f"Error during retrieval: {e}")
            return []

rag_retriever=RAGRetriever(vectorstore,embedding_manager)

In [13]:

rag_retriever

<__main__.RAGRetriever at 0x214f06e6a50>

In [14]:

rag_retriever.retrieve("What is the system of teaching in The Kerala Agricultural University")

Retrieving documents for query: 'What is the system of teaching in The Kerala Agricultural University'
Top K: 5, Score threshold: 0.0
Generating embeddings for 1 texts...


Batches: 100%|██████████| 1/1 [00:00<00:00, 13.42it/s]

Generated embeddings with shape: (1, 384)
Retrieved 5 documents (after filtering)





[{'id': 'doc_8798392b_0',
  'content': 'Kerala University of Digital Sciences, \nInnovation and Technology \n(Digital University Kerala) \n- DIGITAL \n?.f UNIVERSITY \n- KERALA \nPROSPECTUS 5 \nwE',
  'metadata': {'source': '..\\data\\DUK_Admission_Prospectus-2025.pdf',
   'creator': 'PyPDF',
   'moddate': '2025-09-29T06:51:30+00:00',
   'page_label': '1',
   'producer': 'iLovePDF',
   'total_pages': 31,
   'file_type': 'pdf',
   'source_file': 'DUK_Admission_Prospectus-2025.pdf',
   'page': 0,
   'doc_index': 0,
   'content_length': 149,
   'creationdate': ''},
  'similarity_score': 0.14767634868621826,
  'distance': 0.8523236513137817,
  'rank': 1},
 {'id': 'doc_00e95fb4_0',
  'content': 'Kerala University of Digital Sciences, \nInnovation and Technology \n(Digital University Kerala) \n- DIGITAL \n?.f UNIVERSITY \n- KERALA \nPROSPECTUS 5 \nwE',
  'metadata': {'page': 0,
   'content_length': 149,
   'creator': 'PyPDF',
   'file_type': 'pdf',
   'creationdate': '',
   'page_label': '1'

In [15]:
rag_retriever.retrieve("What is Digital University")

Retrieving documents for query: 'What is Digital University'
Top K: 5, Score threshold: 0.0
Generating embeddings for 1 texts...


Batches: 100%|██████████| 1/1 [00:00<00:00, 25.55it/s]

Generated embeddings with shape: (1, 384)
Retrieved 5 documents (after filtering)





[{'id': 'doc_8798392b_0',
  'content': 'Kerala University of Digital Sciences, \nInnovation and Technology \n(Digital University Kerala) \n- DIGITAL \n?.f UNIVERSITY \n- KERALA \nPROSPECTUS 5 \nwE',
  'metadata': {'creator': 'PyPDF',
   'file_type': 'pdf',
   'page': 0,
   'page_label': '1',
   'producer': 'iLovePDF',
   'content_length': 149,
   'source_file': 'DUK_Admission_Prospectus-2025.pdf',
   'moddate': '2025-09-29T06:51:30+00:00',
   'doc_index': 0,
   'creationdate': '',
   'total_pages': 31,
   'source': '..\\data\\DUK_Admission_Prospectus-2025.pdf'},
  'similarity_score': 0.40543949604034424,
  'distance': 0.5945605039596558,
  'rank': 1},
 {'id': 'doc_00e95fb4_0',
  'content': 'Kerala University of Digital Sciences, \nInnovation and Technology \n(Digital University Kerala) \n- DIGITAL \n?.f UNIVERSITY \n- KERALA \nPROSPECTUS 5 \nwE',
  'metadata': {'file_type': 'pdf',
   'source_file': 'DUK_Admission_Prospectus-2025.pdf',
   'source': '..\\data\\DUK_Admission_Prospectus-20


Integration Vectordb Context pipeline With LLM output

In [28]:
### Simple RAG pipeline with Groq LLM
from langchain_groq import ChatGroq
import os
from dotenv import load_dotenv
load_dotenv()

### Initialize the Groq LLM ( GROQ_API_KEY in environment)
groq_api_key = os.getenv("GROQ_API_KEY")

llm=ChatGroq(groq_api_key=groq_api_key,model_name="llama-3.1-8b-instant",temperature=0.1,max_tokens=1024)

## 2. Simple RAG function: retrieve context + generate response
def rag_simple(query,retriever,llm,top_k=3):
    ## retriever the context
    results=retriever.retrieve(query,top_k=top_k)
    context="\n\n".join([doc['content'] for doc in results]) if results else ""
    if not context:
        return "No relevant context found to answer the question."
    
    ## generate the answwer using GROQ LLM
    prompt=f"""Use the following context to answer the question concisely.
        Context:
        {context}

        Question: {query}

        Answer:"""
    
    response=llm.invoke([prompt])
    return response.content

In [17]:

answer=rag_simple("what is the eligibility to secure admission for MSc Computer Science with specialization in Data Analytics",rag_retriever,llm)
print(answer)

Retrieving documents for query: 'what is the eligibility to secure admission for MSc Computer Science with specialization in Data Analytics'
Top K: 3, Score threshold: 0.0
Generating embeddings for 1 texts...


Batches: 100%|██████████| 1/1 [00:00<00:00, 19.88it/s]

Generated embeddings with shape: (1, 384)
Retrieved 3 documents (after filtering)





Eligibility for MSc Computer Science (Data Analytics):

- Hold a 3‑year or 4‑year Bachelor’s degree in Science, Engineering or Mathematics.  
- Mathematics or Statistics must be one of the subjects.  
- Minimum aggregate of 60 % (or equivalent), with relaxed marks for reserved categories.


In [18]:
answer=rag_simple("How to get admission in to digital University",rag_retriever,llm)
print(answer)

Retrieving documents for query: 'How to get admission in to digital University'
Top K: 3, Score threshold: 0.0
Generating embeddings for 1 texts...


Batches: 100%|██████████| 1/1 [00:00<00:00, 26.57it/s]

Generated embeddings with shape: (1, 384)
Retrieved 3 documents (after filtering)





**How to get admission into Digital University Kerala**

1. **Check Eligibility**  
   - Minimum academic qualifications (e.g., 10+2, bachelor's, etc.) as per the specific course.  
   - Age, nationality, and any other prerequisites listed on the university’s website.

2. **Visit the Official Portal**  
   - Go to the Digital University Kerala website (usually `www.digitaluniversitykerala.ac.in`).  
   - Navigate to the “Admissions” or “Prospectus” section.

3. **Download & Fill the Application Form**  
   - Download the online application form for the desired program.  
   - Fill in personal, academic, and contact details accurately.

4. **Upload Required Documents**  
   - Scan and upload documents such as:  
     - Academic certificates & transcripts  
     - Identity proof (Aadhaar, passport, etc.)  
     - Photograph (as per specifications)  
     - Any additional documents requested (e.g., recommendation letters, statement of purpose).

5. **Pay the Application Fee**  
   - Pay t

In [19]:
answer=rag_simple("what are academic programmes at Digital University",rag_retriever,llm)
print(answer)

Retrieving documents for query: 'what are academic programmes at Digital University'
Top K: 3, Score threshold: 0.0
Generating embeddings for 1 texts...


Batches: 100%|██████████| 1/1 [00:00<00:00, 24.42it/s]

Generated embeddings with shape: (1, 384)
Retrieved 3 documents (after filtering)





**Academic programmes offered by the Digital University**

| Level | Core focus areas (themes) | Typical programmes |
|-------|---------------------------|--------------------|
| **Undergraduate** | Intelligence & NLP, IoT, Electronic Systems & Automation, Imaging Technologies, Data Analytics & Big Data, Cybersecurity, Blockchain, Ecological Informatics, Geospatial Analytics, Applied Materials | B.Sc. in Artificial Intelligence, B.Sc. in Internet of Things, B.Sc. in Electronic & Automation Engineering, B.Sc. in Imaging & Vision, B.Sc. in Data Science, B.Sc. in Cybersecurity, B.Sc. in Blockchain Technology, B.Sc. in Ecological Informatics, B.Sc. in Geospatial Analytics, B.Sc. in Applied Materials |
| **Graduate (M.Sc.)** | Same themes, with deeper specialization | M.Sc. in AI & NLP, M.Sc. in IoT & Smart Systems, M.Sc. in Electronic Systems & Automation, M.Sc. in Imaging & Computational Vision, M.Sc. in Big Data Analytics, M.Sc. in Cybersecurity & Digital Forensics, M.Sc. in Blockchain &

In [31]:
def rag_with_optimizer(query, retriever, llm, optimizer):
    """
    RAG pipeline using a trained RetrievalOptimizer.

    Args:
        query (str): User question.
        retriever: Your vector/document retriever.
        llm: Groq LLM (ChatGroq).
        optimizer: RetrievalOptimizer instance with trained policy.

    Returns:
        str: Generated answer.
    """
    # 1. Determine optimal k
    k = optimizer.get_optimal_k(query)
    
    # 2. Retrieve top-k documents
    retrieved_docs = retriever.retrieve(query, top_k=k)
    context = "\n\n".join([doc['content'] for doc in retrieved_docs]) if retrieved_docs else ""
    
    if not context:
        return "No relevant context found to answer the question."
    
    # 3. Build prompt
    prompt = f"""Use the following context to answer the question concisely.
Context:
{context}

Question: {query}

Answer:"""
    
    # 4. Generate answer
    response = llm.invoke([prompt])
    return response.content


Enhanced RAG Pipeline Features

In [None]:
# --- Enhanced RAG Pipeline Features ---
def rag_advanced(query, retriever, llm, top_k=5, min_score=0.2, return_context=False):
    """
    RAG pipeline with extra features:
    - Returns answer, sources, confidence score, and optionally full context.
    """
    results = retriever.retrieve(query, top_k=top_k, score_threshold=min_score)
    if not results:
        return {'answer': 'No relevant context found.', 'sources': [], 'confidence': 0.0, 'context': ''}
    
    # Prepare context and sources
    context = "\n\n".join([doc['content'] for doc in results])
    sources = [{
        'source': doc['metadata'].get('source_file', doc['metadata'].get('source', 'unknown')),
        'page': doc['metadata'].get('page', 'unknown'),
        'score': doc['similarity_score'],
        'preview': doc['content'][:300] + '...'
    } for doc in results]
    confidence = max([doc['similarity_score'] for doc in results])
    
    # Generate answer
    prompt = f"""Use the following context to answer the question concisely.\nContext:\n{context}\n\nQuestion: {query}\n\nAnswer:"""
    response = llm.invoke([prompt.format(context=context, query=query)])
    
    output = {
        'answer': response.content,
        'sources': sources,
        'confidence': confidence
    }
    if return_context:
        output['context'] = context
    return output

# Example usage:
result = rag_advanced("what are academic programmes at Digital University", rag_retriever, llm, top_k=3, min_score=0.1, return_context=True)
print("Answer:", result['answer'])
print("Sources:", result['sources'])
print("Confidence:", result['confidence'])
print("Context Preview:", result['context'][:300])

Retrieving documents for query: 'what are academic programmes at Digital University'
Top K: 3, Score threshold: 0.1
Generating embeddings for 1 texts...


Batches: 100%|██████████| 1/1 [00:00<00:00, 26.32it/s]

Generated embeddings with shape: (1, 384)
Retrieved 3 documents (after filtering)





Answer: **Academic programmes offered by the Digital University**

| Level | Focus areas (themes) |
|-------|----------------------|
| **Undergraduate (B.Sc./B.Tech.)** | Intelligence & Natural Language Processing, Internet of Things, Electronic Systems & Automation, Imaging Technologies, Data Analytics & Big Data, Cybersecurity, Blockchain, Ecological Informatics, Geospatial Analytics, Applied Materials |
| **Postgraduate (M.Sc./M.Tech.)** | Same thematic tracks as above, with deeper specialization and research components |
| **Doctoral (Ph.D.)** | Advanced research in any of the core themes, leading to innovative, sustainable solutions for industry, government, and society |
| **Professional & Executive Training** | Short‑term, industry‑aligned courses and certifications in the above domains, designed for skill up‑skilling and consultancy roles |

These programmes are delivered through state‑of‑the‑art labs, industry‑partnered projects, and consultancy‑driven learning, all aimed at c

In [None]:
# --- Advanced RAG Pipeline: Streaming, Citations, History, Summarization ---
from typing import List, Dict, Any
import time

class AdvancedRAGPipeline:
    def __init__(self, retriever, llm):
        self.retriever = retriever
        self.llm = llm
        self.history = []  # Store query history

    def query(self, question: str, top_k: int = 5, min_score: float = 0.2, stream: bool = False, summarize: bool = False) -> Dict[str, Any]:
        # Retrieve relevant documents
        results = self.retriever.retrieve(question, top_k=top_k, score_threshold=min_score)
        if not results:
            answer = "No relevant context found."
            sources = []
            context = ""
        else:
            context = "\n\n".join([doc['content'] for doc in results])
            sources = [{
                'source': doc['metadata'].get('source_file', doc['metadata'].get('source', 'unknown')),
                'page': doc['metadata'].get('page', 'unknown'),
                'score': doc['similarity_score'],
                'preview': doc['content'][:120] + '...'
            } for doc in results]
            # Streaming answer simulation
            prompt = f"""Use the following context to answer the question concisely.\nContext:\n{context}\n\nQuestion: {question}\n\nAnswer:"""
            if stream:
                print("Streaming answer:")
                for i in range(0, len(prompt), 80):
                    print(prompt[i:i+80], end='', flush=True)
                    time.sleep(0.05)
                print()
            response = self.llm.invoke([prompt.format(context=context, question=question)])
            answer = response.content

        # Add citations to answer
        citations = [f"[{i+1}] {src['source']} (page {src['page']})" for i, src in enumerate(sources)]
        answer_with_citations = answer + "\n\nCitations:\n" + "\n".join(citations) if citations else answer

        # Optionally summarize answer
        summary = None
        if summarize and answer:
            summary_prompt = f"Summarize the following answer in 2 sentences:\n{answer}"
            summary_resp = self.llm.invoke([summary_prompt])
            summary = summary_resp.content

        # Store query history
        self.history.append({
            'question': question,
            'answer': answer,
            'sources': sources,
            'summary': summary
        })

        return {
            'question': question,
            'answer': answer_with_citations,
            'sources': sources,
            'summary': summary,
            'history': self.history
        }

# Example usage:
adv_rag = AdvancedRAGPipeline(rag_retriever, llm)
result = adv_rag.query("what is Digital Access for Community Empowerment ", top_k=3, min_score=0.1, stream=True, summarize=True)
print("\nFinal Answer:", result['answer'])
print("Summary:", result['summary'])
print("History:", result['history'][-1])

Retrieving documents for query: 'what is Digital Access for Community Empowerment '
Top K: 3, Score threshold: 0.1
Generating embeddings for 1 texts...


Batches: 100%|██████████| 1/1 [00:00<00:00, 28.02it/s]

Generated embeddings with shape: (1, 384)
Retrieved 3 documents (after filtering)
Streaming answer:
Use the following context to answer the question concisely.
Context:
activities and certifications that qualify for credits are subjective to the specific school. 
 
9.5 Every master program will have a university core that will have a single course called - 
Digital Access for Community Empowerment - DACE (Level 300) 




that covers 4 
components: 
a. 2  credit module called Community Empowerment (DE). This is a 5 days outbound 
program where students get exposed to problems facing society and explore ways to 
use digital technologies to find solutions.  At the end of the program, the students are 
expected to work and report their finding through a  short dissertation. 
 
b. 1 credit module called Digital Experience Laboratory  (DEL), where they get 
exposed to various digital technologies through a set of hands -on lab projects.  Each 
school may have their curriculum for this course. 
 
c. 1 credit module called Design Thinking and Innovation (DTI), where students will 
be exposed to the idea of applying innovative thinking in digital sciences.

activities and certifications that qualify for credits are subjective to the specific school. 
 
9.5 Every master program will have a university core that will have a single course called - 
Digital Access for Community Empowerment - DACE (Level 300) that co

# (c) Reinforcement Learning for Retrieval Optimization

In [None]:
import random
import numpy as np

class RetrievalOptimizer:
    """
    Simulated RL policy to optimize number of retrieved documents (k)
    based on reward = accuracy - token cost.
    """
    def __init__(self, retriever, llm, max_k=10):
        self.retriever = retriever
        self.llm = llm
        self.max_k = max_k
        self.policy = {}  # store learned optimal k per topic

    def reward_function(self, response_quality: float, tokens_used: int) -> float:
        alpha, beta = 1.0, 0.001  # tune weights
        return alpha * response_quality - beta * tokens_used

    def evaluate_response(self, answer: str, question: str) -> float:
        """
        Approximate correctness score using cosine similarity between
        embeddings of question and answer (as a weak signal).
        """
        emb_q = self.retriever.embedding_manager.generate_embeddings([question])[0]
        emb_a = self.retriever.embedding_manager.generate_embeddings([answer])[0]
        return float(np.dot(emb_q, emb_a) / (np.linalg.norm(emb_q) * np.linalg.norm(emb_a)))

    def train(self, training_queries: list):
        """
        Learn optimal k for each query by simulating multiple k values.
        """
        print("Training RL retrieval policy...")
        for query in training_queries:
            best_k, best_reward = 3, -np.inf
            for k in range(1, self.max_k + 1):
                results = self.retriever.retrieve(query, top_k=k)
                context = "\n\n".join([r['content'] for r in results])
                if not context:
                    continue
                prompt = f"Use the context below to answer:\n{context}\n\nQuestion: {query}\nAnswer:"
                response = self.llm.invoke([prompt])
                quality = self.evaluate_response(response.content, query)
                tokens_used = len(prompt.split())
                reward = self.reward_function(quality, tokens_used)
                if reward > best_reward:
                    best_reward, best_k = reward, k
            self.policy[query] = best_k
            print(f"Learned k={best_k} for query '{query}' (reward={best_reward:.3f})")
        print("Policy training complete.\n")

    def get_optimal_k(self, query: str) -> int:
        """Return optimal k (fallback to average if unseen)"""
        if query in self.policy:
            return self.policy[query]
        return int(np.mean(list(self.policy.values()))) if self.policy else 3


In [None]:
import json
import pickle
import os

# Paths
json_path = "../data/duk_training_data.json"
optimizer_path = "../data/reward_policy.pkl"

# Load training queries from JSON
with open(json_path, "r", encoding="utf-8") as f:
    data = json.load(f)

training_queries = [item["query"] for item in data]

# Initialize optimizer
optimizer = RetrievalOptimizer(retriever=rag_retriever, llm=llm, max_k=3)

# Check if a trained policy already exists
if os.path.exists(optimizer_path):
    print("✅ Found existing trained policy. Loading instead of retraining...")
    with open(optimizer_path, "rb") as f:
        optimizer.policy = pickle.load(f)
    print(f"Loaded policy from {optimizer_path}")
else:
    print("🚀 No existing policy found. Starting training...")
    optimizer.train(training_queries)

    # Save the newly trained policy
    with open(optimizer_path, "wb") as f:
        pickle.dump(optimizer.policy, f)
    print(f"Policy saved to {optimizer_path}")

✅ Found existing trained policy. Loading instead of retraining...
Loaded policy from ../data/reward_policy.pkl


In [None]:
results = []

for query in test_queries:
    # 1. Get the optimal number of documents from the trained policy
    k = optimizer.get_optimal_k(query)
    
    # 2. Retrieve top-k relevant documents
    retrieved_docs = rag_retriever.retrieve(query, top_k=k)
    context = "\n\n".join([doc['content'] for doc in retrieved_docs]) if retrieved_docs else ""
    
    if not context:
        # No documents retrieved
        answer = "No relevant context found to answer the question."
        reward = 0.0
    else:
        # 3. Generate answer using Groq LLM
        prompt = f"""Use the following context to answer the question concisely.
Context:
{context}

Question: {query}

Answer:"""

        response = llm.invoke([prompt])  # invoke returns an object with .content
        answer = response.content

        # 4. Compute reward based on your optimizer
        reward = optimizer.reward_function(
            optimizer.evaluate_response(answer, query),
            tokens_used=len(prompt.split())
        )
    
    # 5. Store results
    results.append({
        "query": query,
        "k": k,
        "answer": answer,
        "reward": reward
    })

# Optional: view results
for r in results:
    print(f"Query: {r['query']}\nOptimal k: {r['k']}\nAnswer: {r['answer']}\nReward: {r['reward']}\n{'-'*50}")


Retrieving documents for query: 'What are the MTech specializations offered at DUK?'
Top K: 1, Score threshold: 0.0
Generating embeddings for 1 texts...


Batches: 100%|██████████| 1/1 [00:00<00:00, 22.61it/s]

Generated embeddings with shape: (1, 384)
Retrieved 1 documents (after filtering)





Generating embeddings for 1 texts...


Batches: 100%|██████████| 1/1 [00:00<00:00, 34.85it/s]


Generated embeddings with shape: (1, 384)
Generating embeddings for 1 texts...


Batches: 100%|██████████| 1/1 [00:00<00:00,  9.57it/s]


Generated embeddings with shape: (1, 384)
Retrieving documents for query: 'What is the eligibility requirement for MBA admission?'
Top K: 1, Score threshold: 0.0
Generating embeddings for 1 texts...


Batches: 100%|██████████| 1/1 [00:00<00:00, 23.02it/s]


Generated embeddings with shape: (1, 384)
Retrieved 1 documents (after filtering)
Generating embeddings for 1 texts...


Batches: 100%|██████████| 1/1 [00:00<00:00, 39.27it/s]


Generated embeddings with shape: (1, 384)
Generating embeddings for 1 texts...


Batches: 100%|██████████| 1/1 [00:00<00:00, 12.04it/s]


Generated embeddings with shape: (1, 384)
Retrieving documents for query: 'What is DUAT and how is it conducted?'
Top K: 1, Score threshold: 0.0
Generating embeddings for 1 texts...


Batches: 100%|██████████| 1/1 [00:00<00:00, 29.00it/s]


Generated embeddings with shape: (1, 384)
Retrieved 0 documents (after filtering)
Retrieving documents for query: 'What is the application fee for DUAT?'
Top K: 1, Score threshold: 0.0
Generating embeddings for 1 texts...


Batches: 100%|██████████| 1/1 [00:00<00:00, 23.84it/s]


Generated embeddings with shape: (1, 384)
Retrieved 1 documents (after filtering)
Generating embeddings for 1 texts...


Batches: 100%|██████████| 1/1 [00:00<00:00, 43.00it/s]


Generated embeddings with shape: (1, 384)
Generating embeddings for 1 texts...


Batches: 100%|██████████| 1/1 [00:00<00:00, 17.91it/s]


Generated embeddings with shape: (1, 384)
Retrieving documents for query: 'Is hostel accommodation available for male students?'
Top K: 1, Score threshold: 0.0
Generating embeddings for 1 texts...


Batches: 100%|██████████| 1/1 [00:00<00:00, 24.07it/s]


Generated embeddings with shape: (1, 384)
Retrieved 0 documents (after filtering)
Retrieving documents for query: 'What is the Earn While You Learn scheme?'
Top K: 3, Score threshold: 0.0
Generating embeddings for 1 texts...


Batches: 100%|██████████| 1/1 [00:00<00:00, 19.64it/s]


Generated embeddings with shape: (1, 384)
Retrieved 0 documents (after filtering)
Retrieving documents for query: 'Explain the DUAT syllabus and marking scheme for MSc and MTech programmes.'
Top K: 1, Score threshold: 0.0
Generating embeddings for 1 texts...


Batches: 100%|██████████| 1/1 [00:00<00:00, 16.58it/s]


Generated embeddings with shape: (1, 384)
Retrieved 1 documents (after filtering)
Generating embeddings for 1 texts...


Batches: 100%|██████████| 1/1 [00:00<00:00, 20.64it/s]


Generated embeddings with shape: (1, 384)
Generating embeddings for 1 texts...


Batches: 100%|██████████| 1/1 [00:00<00:00,  7.37it/s]

Generated embeddings with shape: (1, 384)
Retrieving documents for query: 'Compare the focus areas of School of Digital Sciences and School of Digital Humanities.'
Top K: 1, Score threshold: 0.0
Generating embeddings for 1 texts...



Batches: 100%|██████████| 1/1 [00:00<00:00, 23.24it/s]


Generated embeddings with shape: (1, 384)
Retrieved 1 documents (after filtering)
Generating embeddings for 1 texts...


Batches: 100%|██████████| 1/1 [00:00<00:00, 46.20it/s]


Generated embeddings with shape: (1, 384)
Generating embeddings for 1 texts...


Batches: 100%|██████████| 1/1 [00:00<00:00,  7.70it/s]


Generated embeddings with shape: (1, 384)
Retrieving documents for query: 'Explain the steps to apply for DUK scholarships and financial aid.'
Top K: 1, Score threshold: 0.0
Generating embeddings for 1 texts...


Batches: 100%|██████████| 1/1 [00:00<00:00, 24.53it/s]


Generated embeddings with shape: (1, 384)
Retrieved 1 documents (after filtering)
Generating embeddings for 1 texts...


Batches: 100%|██████████| 1/1 [00:00<00:00, 26.67it/s]


Generated embeddings with shape: (1, 384)
Generating embeddings for 1 texts...


Batches: 100%|██████████| 1/1 [00:00<00:00,  6.11it/s]


Generated embeddings with shape: (1, 384)
Retrieving documents for query: 'Summarize DUK’s vision, mission, and core values in your own words.'
Top K: 1, Score threshold: 0.0
Generating embeddings for 1 texts...


Batches: 100%|██████████| 1/1 [00:00<00:00, 18.18it/s]

Generated embeddings with shape: (1, 384)
Retrieved 0 documents (after filtering)
Query: What are the MTech specializations offered at DUK?
Optimal k: 1
Answer: The MTech specializations offered at DUK are:

1. Computer Science and Engineering (specializations: 
   - connected systems and intelligence
   - artificial intelligence
   - cybersecurity engineering)

2. Electronics Engineering (specializations: 
   - Analog Hardware
   - VLSI
   - IoT and Robotics)
Reward: 0.6664936141967773
--------------------------------------------------
Query: What is the eligibility requirement for MBA admission?
Optimal k: 1
Answer: A valid entrance examination score approved by the University (CUET(PG), CAT, CMAT, KMAT, XAT, NMAT, GRE, DUAT) and a Bachelor's degree with a minimum aggregate of 60% marks (or equivalent, with relaxation for reserved categories as per University norms).
Reward: 0.39599870538711546
--------------------------------------------------
Query: What is DUAT and how is it condu




In [47]:
query = "What are the MTech specializations offered at DUK?"
answer = rag_with_optimizer(query, rag_retriever, llm, optimizer)
print(f"Answer: {answer}")

Retrieving documents for query: 'What are the MTech specializations offered at DUK?'
Top K: 1, Score threshold: 0.0
Generating embeddings for 1 texts...


Batches: 100%|██████████| 1/1 [00:00<00:00, 12.13it/s]

Generated embeddings with shape: (1, 384)
Retrieved 1 documents (after filtering)





Answer: The MTech specializations offered at DUK are:

1. Computer Science and Engineering (specializations: 
   - Connected systems and intelligence
   - Artificial intelligence
   - Cybersecurity engineering)

2. Electronics Engineering (specializations: 
   - Analog Hardware
   - VLSI
   - IoT and Robotics)


# Hallucination detection module

In [48]:
# hallucination_detector.py
from sentence_transformers import SentenceTransformer, util

class HallucinationDetector:
    """
    Detect hallucinations in LLM responses by comparing them
    with retrieved evidence using semantic similarity.
    """

    def __init__(self, model_name="all-MiniLM-L6-v2", threshold=0.75):
        """
        :param model_name: Embedding model for similarity comparison.
        :param threshold: Minimum cosine similarity for the response
                          to be considered grounded.
        """
        self.model = SentenceTransformer(model_name)
        self.threshold = threshold

    def compute_similarity(self, response: str, retrieved_chunks: list) -> float:
        """
        Compute similarity between generated response and retrieved context.
        """
        context_text = " ".join(retrieved_chunks)
        resp_emb = self.model.encode(response, convert_to_tensor=True)
        ctx_emb = self.model.encode(context_text, convert_to_tensor=True)
        sim = util.cos_sim(resp_emb, ctx_emb).item()
        return sim

    def detect(self, response: str, retrieved_chunks: list) -> dict:
        """
        Detect hallucination based on similarity threshold.
        Returns a dictionary with similarity score and hallucination flag.
        """
        similarity = self.compute_similarity(response, retrieved_chunks)
        is_grounded = similarity >= self.threshold

        return {
            "similarity": round(similarity, 3),
            "is_grounded": is_grounded,
            "status": "Grounded ✅" if is_grounded else "Hallucination ⚠️"
        }


In [58]:
# After getting response from LLM


hallucination_detector = HallucinationDetector(threshold=0.50)

def rag_with_hallucination(query, retriever, llm, top_k=5):
    results = retriever.retrieve(query, top_k=top_k)
    context = "\n\n".join([doc['content'] for doc in results])
    response_prompt = f"""Use the following context to answer the question concisely.
    Context:
    {context}

    Question: {query}

    Answer:"""
    response = llm.invoke([response_prompt])

    # Run hallucination check
    retrieved_chunks = [doc['content'] for doc in results]
    hallucination_result = hallucination_detector.detect(response.content, retrieved_chunks)

    # Combine results
    print("Answer:", response.content)
    print("Hallucination Status:", hallucination_result['status'])
    print("Similarity Score:", hallucination_result['similarity'])
    
    return {
        "query": query,
        "answer": response.content,
        "hallucination_result": hallucination_result
    }

# Example run
result = rag_with_hallucination(
    "what is the eligibility to secure admission for MSc Computer Science with specialization in Data Analytics",
    rag_retriever,
    llm
)


Retrieving documents for query: 'what is the eligibility to secure admission for MSc Computer Science with specialization in Data Analytics'
Top K: 5, Score threshold: 0.0
Generating embeddings for 1 texts...


Batches: 100%|██████████| 1/1 [00:00<00:00,  2.89it/s]


Generated embeddings with shape: (1, 384)
Retrieved 5 documents (after filtering)
Answer: Candidates must possess either a 3-year or a 4-year Bachelor's degree in Science/Engineering/Mathematics, with Mathematics/Statistics as one of the subjects, and a minimum aggregate of 60% marks (or equivalent). Relaxation in marks will be provided for reserved categories as per University norms.
Hallucination Status: Grounded ✅
Similarity Score: 0.63


In [59]:
# Access the nested hallucination dict correctly
hallucination_data = result["hallucination_result"]

if not hallucination_data["is_grounded"]:
    print("⚠️ Regenerating grounded response...")
    grounded_prompt = f"""Answer ONLY using the context below.
    If the answer is not present, reply: 'Information not found in the context.'

    Context:
    {context}

    Question: {query}

    Answer:"""
    response = llm.invoke([grounded_prompt])

    print("Regenerated (Grounded) Answer:", response.content)


In [60]:
import json
from datetime import datetime

def rag_with_hallucination_control(query, retriever, llm, hallucination_detector, top_k=5, similarity_threshold=0.75, log_path="../data/hallucination_log.json"):
    """
    Complete RAG pipeline with hallucination detection and control.
    Steps:
    1. Retrieve top-k relevant chunks.
    2. Generate LLM answer.
    3. Detect hallucination using semantic similarity.
    4. If hallucinated, regenerate grounded response.
    5. Log results for analysis.
    """

    print(f"\n🔍 Query: {query}")
    print(f"Retrieving top {top_k} documents...")

    # Step 1: Retrieve relevant context
    results = retriever.retrieve(query, top_k=top_k)
    if not results:
        print("⚠️ No relevant context found.")
        return {"query": query, "answer": "No relevant context found."}

    context = "\n\n".join([doc['content'] for doc in results])
    retrieved_chunks = [doc['content'] for doc in results]

    # Step 2: Generate initial response
    prompt = f"""Use ONLY the context below to answer concisely.
    Context:
    {context}

    Question: {query}

    Answer:"""
    response = llm.invoke([prompt])
    answer = response.content.strip()

    print("\n🧾 Initial Answer:\n", answer)

    # Step 3: Run hallucination detection
    hallucination_result = hallucination_detector.detect(answer, retrieved_chunks)
    print(f"\nHallucination Check → {hallucination_result['status']} (Similarity: {hallucination_result['similarity']})")

    # Step 4: If hallucinated, regenerate grounded response
    if not hallucination_result["is_grounded"]:
        print("⚠️ Regenerating grounded response...")

        grounded_prompt = f"""Answer ONLY using the context below.
        If the answer is not present, reply exactly as: 'Information not found in the context.'

        Context:
        {context}

        Question: {query}

        Answer:"""

        try:
            grounded_response = llm.invoke([grounded_prompt])
            grounded_answer = grounded_response.content.strip()
            print("\n✅ Regenerated (Grounded) Answer:\n", grounded_answer)
        except Exception as e:
            grounded_answer = "Error during regeneration"
            print("❌ Error regenerating grounded answer:", e)
    else:
        grounded_answer = answer  # No hallucination → keep original

    # Step 5: Log the result
    log_entry = {
        "timestamp": datetime.now().isoformat(),
        "query": query,
        "initial_answer": answer,
        "final_answer": grounded_answer,
        "similarity": hallucination_result["similarity"],
        "is_grounded": hallucination_result["is_grounded"],
        "status": hallucination_result["status"]
    }

    try:
        with open(log_path, "a", encoding="utf-8") as f:
            json.dump(log_entry, f, ensure_ascii=False)
            f.write("\n")
        print(f"📝 Logged to {log_path}")
    except Exception as e:
        print(f"⚠️ Logging failed: {e}")

    # Step 6: Return final structured output
    return {
        "query": query,
        "initial_answer": answer,
        "final_answer": grounded_answer,
        "hallucination_result": hallucination_result
    }


In [62]:
# Initialize once
hallucination_detector = HallucinationDetector(threshold=0.75)

# Run a query through the full pipeline
result = rag_with_hallucination_control(
    query="what is the eligibility to secure admission for MBA",
    retriever=rag_retriever,
    llm=llm,
    hallucination_detector=hallucination_detector,
    top_k=5
)



🔍 Query: what is the eligibility to secure admission for MBA
Retrieving top 5 documents...
Retrieving documents for query: 'what is the eligibility to secure admission for MBA'
Top K: 5, Score threshold: 0.0
Generating embeddings for 1 texts...


Batches: 100%|██████████| 1/1 [00:00<00:00, 11.00it/s]

Generated embeddings with shape: (1, 384)
Retrieved 5 documents (after filtering)






🧾 Initial Answer:
 3-year/4-year Graduation in the relevant stream(s) fixed by the School.

Hallucination Check → Hallucination ⚠️ (Similarity: 0.371)
⚠️ Regenerating grounded response...

✅ Regenerated (Grounded) Answer:
 3-year/4-year Graduation in the relevant stream(s) fixed by the School.
📝 Logged to ../data/hallucination_log.json
