In [1]:
# ========================================================================
# PART 0: ENVIRONMENT SETUP AND LIBRARY VERSION CHECK
# ========================================================================
# LEARNING OBJECTIVE: Verify environment setup and library compatibility

def check_library_versions():
    """
    WORKSHOP FUNCTION: Environment Verification

    PURPOSE: Check installed library versions for compatibility
    This helps ensure all students have the same environment setup

    WHAT IT DOES:
    1. Imports every third-party library and prints its reported version
    2. Collects libraries whose installed version differs from the
       workshop-tested one and prints them as a warning
    3. Runs `ollama list` to check the CLI and the phi3:mini model
    4. Prints pinned `pip install` commands when a library is missing

    RETURNS:
    - True when every third-party library could be imported, else False.
      Version mismatches and Ollama problems are reported but do not
      change the return value.
    """
    # Local imports keep this cell runnable before the main import cell.
    import importlib
    import subprocess

    print("="*60)
    print("üîß WORKSHOP ENVIRONMENT CHECK")
    print("="*60)

    # import name -> workshop-tested version ('built-in' = standard library).
    # numpy was previously listed as 6.0.0, which is not an installable
    # release; 2.3.4 is the version reported by the recorded run of this cell.
    required_libraries = {
        'langchain': '0.3.27',
        'langchain_community': '0.3.29',
        'chromadb': '1.0.20',
        'pypdf': '6.0.0',
        'numpy': '2.3.4',
        'pathlib': 'built-in',
        'os': 'built-in',
        'sys': 'built-in'
    }
    # Only needed where the pip distribution name differs from the import name.
    pip_names = {'langchain_community': 'langchain-community'}

    print("üìã Checking required libraries and versions:")
    print("-" * 50)

    missing_libraries = []
    version_mismatches = []  # (library, installed version, tested version)

    for library, tested_version in required_libraries.items():
        if tested_version == 'built-in':
            print(f"‚úÖ {library}: {tested_version}")
            continue

        try:
            module = importlib.import_module(library)
            # Public attribute only; falls back to 'unknown' instead of
            # reaching into private modules such as pypdf._version.
            version = getattr(module, '__version__', 'unknown')
            print(f"‚úÖ {library}: {version}")
        except ImportError:
            print(f"‚ùå {library}: NOT INSTALLED")
            missing_libraries.append(library)
            continue
        except Exception as e:
            # Some packages fail during import for non-import reasons;
            # report and keep checking the remaining libraries.
            print(f"‚ö†Ô∏è  {library}: Error checking version - {e}")
            continue

        if version != 'unknown' and version != tested_version:
            version_mismatches.append((library, version, tested_version))

    # Check Ollama availability (external dependency)
    print("\nü§ñ Checking Ollama setup:")
    print("-" * 30)
    try:
        result = subprocess.run(['ollama', 'list'], capture_output=True, text=True, timeout=10)
        if result.returncode == 0:
            if 'phi3:mini' in result.stdout:
                print("‚úÖ Ollama: Installed and phi3:mini model available")
            else:
                print("‚ö†Ô∏è  Ollama: Installed but phi3:mini model missing")
                print("   Run: ollama pull phi3:mini")
        else:
            print("‚ùå Ollama: Not properly configured")
    except FileNotFoundError:
        print("‚ùå Ollama: Not installed")
        print("   Install from: https://ollama.ai/")
    except subprocess.TimeoutExpired:
        print("‚ö†Ô∏è  Ollama: Connection timeout - check if service is running")
    except Exception as e:
        print(f"‚ö†Ô∏è  Ollama: Error checking - {e}")

    # Informational only: a different version may still work.
    if version_mismatches:
        print("\nWARNING: installed versions differ from the workshop-tested versions:")
        for library, installed, tested in version_mismatches:
            print(f"   {library}: installed {installed}, tested {tested}")

    # Summary and installation commands
    if missing_libraries:
        print(f"\n‚ùå MISSING LIBRARIES: {', '.join(missing_libraries)}")
        print("\nüì¶ EXACT INSTALLATION COMMANDS (Workshop Tested Versions):")
        # Generated from the table above so the commands cannot drift from it.
        for library, tested_version in required_libraries.items():
            if tested_version != 'built-in':
                print(f"pip install {pip_names.get(library, library)}=={tested_version}")
        print("\nRun these commands and restart the workshop.")
        return False
    else:
        print("\n‚úÖ ALL LIBRARIES INSTALLED!")
        print("üöÄ Ready to proceed with the workshop!")
        return True

# Execute the verification once; later cells can consult `environment_ready`.
environment_ready = check_library_versions()

# A failed check only prints advice - the notebook keeps running unless the
# student enables the exit line below.
if not environment_ready:
    print(
        "\n‚ö†Ô∏è  PLEASE INSTALL MISSING LIBRARIES BEFORE CONTINUING",
        "Uncomment the sys.exit() line below if you want to stop here",
        sep="\n",
    )
    # NOTE: `sys` is imported in the Part 1 cell, so add `import sys` here
    # as well when enabling this line.
    # sys.exit(1)  # Students can uncomment this to stop execution

üîß WORKSHOP ENVIRONMENT CHECK
üìã Checking required libraries and versions:
--------------------------------------------------
‚úÖ langchain: 0.3.27
‚úÖ langchain_community: 0.3.29
‚úÖ chromadb: 1.0.20
‚úÖ pypdf: 6.0.0
‚úÖ numpy: 2.3.4
‚úÖ pathlib: built-in
‚úÖ os: built-in
‚úÖ sys: built-in

ü§ñ Checking Ollama setup:
------------------------------
‚úÖ Ollama: Installed and phi3:mini model available

‚úÖ ALL LIBRARIES INSTALLED!
üöÄ Ready to proceed with the workshop!


HANDS-ON RAG (Retrieval-Augmented Generation) WORKSHOP

13 Oct 2025
Ramaiah University of Applied Sciences
Instructor: Naganathan Muthuramalingam, PhD Scholar - School of Social Sciences

This script demonstrates a complete end-to-end RAG system implementation.

WHAT YOU'LL LEARN:
1. Document Loading and Processing
2. Text Chunking Strategies
3. Vector Embeddings and Storage
4. Retrieval Mechanisms
5. LLM Integration
6. Answer Validation and Grounding

WORKSHOP STRUCTURE:
- Part 0: Environment Setup and Library Version Check
- Part 1: Imports and Document Discovery
- Part 2: Document Loading and Text Chunking
- Part 3: Vector Embeddings & Knowledge Base Creation
- Part 4: Retrieval Configuration
- Part 5: Language Model Setup
- Part 6: Prompt Engineering for Grounding
- Part 7: RAG Chain Assembly
- Part 8: Answer Validation System
- Part 9: Hands-on Testing

SYSTEM REQUIREMENTS:
- Minimum 8GB RAM (16GB recommended for better performance)
- At least 20GB free disk space for models and vector databases
- Python 3.8+ installed
- Stable internet connection for initial model downloads
- Ollama installed (https://ollama.ai/)
- phi3:mini model downloaded via: ollama pull phi3:mini

INSTALLATION STEPS:
1. Install Python 3.8+
2. Install Ollama from https://ollama.ai/
3. Run: ollama pull phi3:mini
4. Install required Python packages (see Part 0 below)
5. Create 'data' folder and add PDF documents

PREREQUISITES:
- Basic Python knowledge
- Understanding of machine learning concepts
- Familiarity with NLP basics

In [2]:
# ========================================================================
# PART 1: IMPORTS AND SETUP
# ========================================================================
# Standard library imports - Python's built-in modules
import os
import sys
from pathlib import Path

# LangChain Document Loaders & Processing - For handling different document types
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Vector Store and Embeddings - For semantic search capabilities
from langchain_community.vectorstores import Chroma

# Local LLM via Ollama - For running language models locally
from langchain_community.llms import Ollama

# RAG Chain - For combining retrieval and generation
from langchain.chains.retrieval_qa.base import RetrievalQA
from langchain.prompts import PromptTemplate


  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# ========================================================================
# WORKSHOP ACTIVITY 1: DOCUMENT DISCOVERY
# ========================================================================
# LEARNING OBJECTIVE: Understand how to locate and validate data sources

# Define the path to your PDF directory
# TODO for students: Create a 'data' folder and add your PDF documents

data_dir = "./data"

# Walk the folder tree below `data_dir` and keep every regular file whose
# name matches *.pdf, stored as plain strings for the loaders used later.
pdf_files = []
for candidate in Path(data_dir).rglob("*.pdf"):
    if candidate.is_file():
        pdf_files.append(str(candidate))

# Report what was found; an empty result is explained rather than raised.
if pdf_files:
    print(f"‚úÖ Found {len(pdf_files)} PDF(s):")
    for pdf_path in pdf_files:
        print(f" - {pdf_path}")
else:
    print(f"No PDFs found in {data_dir}. Please add your PDFs and update the `data_dir` variable.")
    print("WORKSHOP TIP: Create the './data' folder and add at least one PDF document")



‚úÖ Found 5 PDF(s):
 - data\10-Tips-Healthy-Lifestyle.pdf
 - data\Adult-Guide-to-an-Active-Healthy-Lifestyle.pdf
 - data\how-can-i-make-lifestyle-healthier.pdf
 - data\nnm_tipsheet.pdf
 - data\PAG_ExecutiveSummary.pdf


In [7]:
# ========================================================================
# WORKSHOP ACTIVITY 2: DOCUMENT LOADING AND PREPROCESSING
# ========================================================================
# LEARNING OBJECTIVE: Transform unstructured documents into structured data


print(f"\n{'=' * 50}")
print("PART 2: DOCUMENT LOADING & TEXT CHUNKING")
print("=" * 50)

# Accumulates one Document per PDF page across all discovered files.
documents = []

for file_path in pdf_files:
    # A failure in one file is reported and skipped so the rest still load.
    try:
        print(f"\nüìÑ Processing: {os.path.basename(file_path)}")

        # PyPDFLoader yields one Document per page of the PDF.
        # WORKSHOP NOTE: other loaders exist for other formats
        # (TextLoader, CSVLoader, JSONLoader, etc.)
        pages = PyPDFLoader(file_path).load()

        # Replace the stored source with the bare file name so it can be
        # shown as a citation later.
        for page in pages:
            page.metadata["source"] = os.path.basename(file_path)

        documents += pages
        print(f"‚úÖ Loaded {len(pages)} pages from {os.path.basename(file_path)}")

    except Exception as e:
        print(f"‚ùå Error loading {file_path}: {e}")
        print("WORKSHOP TIP: Check file permissions and format compatibility")

print(f"\nüìä SUMMARY: Total pages loaded: {len(documents)}")



PART 2: DOCUMENT LOADING & TEXT CHUNKING

üìÑ Processing: 10-Tips-Healthy-Lifestyle.pdf
‚úÖ Loaded 2 pages from 10-Tips-Healthy-Lifestyle.pdf

üìÑ Processing: Adult-Guide-to-an-Active-Healthy-Lifestyle.pdf
‚úÖ Loaded 18 pages from Adult-Guide-to-an-Active-Healthy-Lifestyle.pdf

üìÑ Processing: how-can-i-make-lifestyle-healthier.pdf
‚úÖ Loaded 2 pages from how-can-i-make-lifestyle-healthier.pdf

üìÑ Processing: nnm_tipsheet.pdf
‚úÖ Loaded 1 pages from nnm_tipsheet.pdf

üìÑ Processing: PAG_ExecutiveSummary.pdf
‚úÖ Loaded 7 pages from PAG_ExecutiveSummary.pdf

üìä SUMMARY: Total pages loaded: 30


### Part 1: Document Selection

**Main PDF:** `Adult-Guide-to-an-Active-Healthy-Lifestyle.pdf`

This guide, published by the Healthy Weight Commitment Foundation, provides comprehensive, research-based strategies for maintaining a balanced and active lifestyle. It includes calorie balance, activity guidelines, and health practices across age groups.

**Topic Chosen:** Healthy and Active Lifestyle for Adults  
I selected this topic because it connects directly to health, fitness, and wellness — promoting physical activity, nutrition awareness, and long-term well-being.


In [4]:
#CHUNKING 
from langchain.text_splitter import RecursiveCharacterTextSplitter

# The two splitter setups being compared (sizes are in characters).
configs = [
    dict(name="Small chunks", chunk_size=400, chunk_overlap=200),
    dict(name="Large chunks", chunk_size=1200, chunk_overlap=50),
]

# Split the loaded pages once per setup and print comparable statistics.
# After the loop, `chunks` holds the result of the LAST setup in `configs`.
for cfg in configs:
    print(f"\n{'=' * 50}")
    print(f"Testing config: {cfg['name']}")
    print("=" * 50)

    splitter = RecursiveCharacterTextSplitter(
        chunk_size=cfg["chunk_size"],
        chunk_overlap=cfg["chunk_overlap"],
        # Tried in order: paragraph, line, sentence end, word, character.
        separators=["\n\n", "\n", ". ", "! ", "? ", " ", ""],
    )

    chunks = splitter.split_documents(documents)

    # Tag every chunk with its position, size and a one-line preview.
    for position, chunk in enumerate(chunks):
        body = chunk.page_content
        chunk.metadata.update(
            chunk_id=position,
            chunk_length=len(body),
            preview=body[:50].replace("\n", " "),
        )

    total_chunks = len(chunks)
    if chunks:
        avg_len = sum(len(c.page_content) for c in chunks) / total_chunks
        sample_preview = chunks[0].page_content[:100].replace("\n", " ")
    else:
        # Nothing was loaded: avoid dividing by zero and show a placeholder.
        avg_len = 0
        sample_preview = "N/A"

    print(f"üîß Chunking Configuration:")
    print(f"   - Chunk size: {cfg['chunk_size']}")
    print(f"   - Chunk overlap: {cfg['chunk_overlap']}")
    print(f"Total chunks created: {total_chunks}")
    print(f"Average chunk length: {avg_len:.0f} characters")
    print(f"Sample preview: {sample_preview}...")

# Recommendation:
print("\n‚úÖ Based on total chunks, average length, and preview readability, choose the configuration that balances context with chunk count. For your Health PDFs, large chunks (1200/50) usually work best.")


Testing config: Small chunks
üîß Chunking Configuration:
   - Chunk size: 400
   - Chunk overlap: 200
Total chunks created: 234
Average chunk length: 360 characters
Sample preview: 10 TIPS FOR MAINTAINING A HEALTHY  LIFESTYLE AND BODY WEIGHT  Yiqing Song, Professor of Epidemiology...

Testing config: Large chunks
üîß Chunking Configuration:
   - Chunk size: 1200
   - Chunk overlap: 50
Total chunks created: 58
Average chunk length: 871 characters
Sample preview: 10 TIPS FOR MAINTAINING A HEALTHY  LIFESTYLE AND BODY WEIGHT  Yiqing Song, Professor of Epidemiology...

‚úÖ Based on total chunks, average length, and preview readability, choose the configuration that balances context with chunk count. For your Health PDFs, large chunks (1200/50) usually work best.


### Which chunking settings worked better and why?

 Two configurations were tested: small chunks (400/200) and large chunks (1200/50). Small chunks created 234 short, overlapping chunks that fragmented context, while large chunks produced 58 well-structured chunks with sufficient context and minimal redundancy. Therefore, the large chunk settings were chosen for better readability and more effective LLM processing.

In [5]:
# ========================================================================
# WORKSHOP ACTIVITY 4: EMBEDDINGS AND VECTOR STORE
# ========================================================================
# üéØ LEARNING OBJECTIVE: Convert text to vectors for semantic search
# and enable knowledge retrieval from health and wellness documents.

print("\n" + "="*60)
print("PART 4: VECTOR EMBEDDINGS & KNOWLEDGE BASE")
print("="*60)

# ------------------------------------------------------------------------
# 1. Import required modules
# ------------------------------------------------------------------------
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma
from langchain.docstore.document import Document

# ------------------------------------------------------------------------
# 2. Initialize embedding model
# ------------------------------------------------------------------------
print("üß† Initializing embedding model...")
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
print("‚úÖ Embedding model loaded successfully")

# ------------------------------------------------------------------------
# 3. Create domain-specific health documents
# (based on Adult-Guide-to-an-Active-Healthy-Lifestyle.pdf)
# NOTE(review): the knowledge base is built from these five hand-written
# statements, not from the `chunks` produced by the chunking cell - confirm
# this is intended before relying on the answers as PDF-grounded.
# ------------------------------------------------------------------------
texts = [
    Document(
        page_content="Life's Essential 8: Eat better, be active, quit tobacco, and get healthy sleep.",
        metadata={"source": "Adult-Guide-to-an-Active-Healthy-Lifestyle.pdf"}
    ),
    Document(
        page_content="Adults should aim for 150 minutes of moderate activity or 75 minutes of vigorous exercise weekly.",
        metadata={"source": "Adult-Guide-to-an-Active-Healthy-Lifestyle.pdf"}
    ),
    Document(
        page_content="Adults generally need 7‚Äì9 hours of quality sleep every night to maintain good health.",
        metadata={"source": "Adult-Guide-to-an-Active-Healthy-Lifestyle.pdf"}
    ),
    Document(
        page_content="A balanced diet includes more vegetables, fruits, whole grains, and fewer processed foods.",
        metadata={"source": "Adult-Guide-to-an-Active-Healthy-Lifestyle.pdf"}
    ),
    Document(
        page_content="Maintaining calorie balance helps prevent weight gain and supports overall well-being.",
        metadata={"source": "Adult-Guide-to-an-Active-Healthy-Lifestyle.pdf"}
    )
]

# ------------------------------------------------------------------------
# 4. Create vector database using Chroma
# ------------------------------------------------------------------------
# The store is persisted on disk, so without explicit ids every re-run of
# this cell would add another copy of the same documents and retrieval would
# start returning duplicates. Stable ids let a re-run overwrite the same
# records instead. If the folder already holds duplicates from earlier runs,
# delete './chroma_health_db' once to start clean.
text_ids = [f"health-fact-{position}" for position in range(len(texts))]

print("\nüóÑÔ∏è Creating vector database...")
vectorstore = Chroma.from_documents(
    documents=texts,
    embedding=embeddings,
    ids=text_ids,
    persist_directory="./chroma_health_db"
)
print("‚úÖ Vector database created and persisted at './chroma_health_db'")

# ------------------------------------------------------------------------
# 5. Test retrieval (semantic search)
# ------------------------------------------------------------------------
query = "How much physical activity should adults get each week?"
results = vectorstore.similarity_search(query, k=2)

print("\nüîç SAMPLE QUERY RESULT:")
print(f"Query: {query}\n")
for i, r in enumerate(results, 1):
    print(f"{i}. {r.page_content} (Source: {r.metadata['source']})")

# ------------------------------------------------------------------------
# 6. Save and confirm
# ------------------------------------------------------------------------
print("\nüíæ Knowledge base saved and ready for retrieval tasks.")
print("You can now use this vectorstore in your RAG pipeline for question answering.")



PART 4: VECTOR EMBEDDINGS & KNOWLEDGE BASE
üß† Initializing embedding model...


  embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")


‚úÖ Embedding model loaded successfully

üóÑÔ∏è Creating vector database...
‚úÖ Vector database created and persisted at './chroma_health_db'

üîç SAMPLE QUERY RESULT:
Query: How much physical activity should adults get each week?

1. Adults should aim for 150 minutes of moderate activity or 75 minutes of vigorous exercise weekly. (Source: Adult-Guide-to-an-Active-Healthy-Lifestyle.pdf)
2. Adults generally need 7‚Äì9 hours of quality sleep every night to maintain good health. (Source: Adult-Guide-to-an-Active-Healthy-Lifestyle.pdf)

üíæ Knowledge base saved and ready for retrieval tasks.
You can now use this vectorstore in your RAG pipeline for question answering.


In [6]:
# ========================================================================
# WORKSHOP ACTIVITY 5: RETRIEVAL CONFIGURATION
# ========================================================================
# üéØ LEARNING OBJECTIVE: Configure and test optimized document retrieval
# using the Adult-Guide-to-an-Active-Healthy-Lifestyle.pdf embeddings.

print("\n" + "="*60)
print("PART 5: RETRIEVAL CONFIGURATION")
print("="*60)

# ------------------------------------------------------------------------
# 1. Create retriever with optimized MMR settings
# ------------------------------------------------------------------------
# Single source of truth: the same dict configures the retriever and feeds
# the printed summary, so the two cannot drift apart.
mmr_settings = {
    "k": 5,              # Return top 5 relevant chunks per query
    "fetch_k": 10,       # Consider top 10 before applying MMR
    "lambda_mult": 0.7   # Balance between relevance (0.7) & diversity (0.3)
}

retriever = vectorstore.as_retriever(
    search_type="mmr",       # Use Maximum Marginal Relevance
    search_kwargs=mmr_settings
)

print("üîç Retrieval Configuration:")
print(f"   - Strategy: Maximum Marginal Relevance (MMR)")
print(f"   - Documents returned: {mmr_settings['k']}")
print(f"   - Initial candidates: {mmr_settings['fetch_k']}")
print(f"   - Relevance vs Diversity balance: {mmr_settings['lambda_mult']}")

# ------------------------------------------------------------------------
# 2. Test retrieval with 2 sample questions from the PDF
# ------------------------------------------------------------------------
questions = [
    "According to the Adult Guide to an Active Healthy Lifestyle, what are effective ways adults can stay healthy daily?",
    "Based on the Adult Guide, how much physical activity should adults get per week?"
]

for q in questions:
    print("\n" + "-"*60)
    print(f"üß† Question: {q}")
    # `invoke` replaces the deprecated `get_relevant_documents`, which
    # emitted a deprecation warning in the recorded output of this cell.
    results = retriever.invoke(q)

    if results:
        for i, r in enumerate(results, 1):
            preview = r.page_content[:120].replace("\n", " ")
            print(f"\nResult {i}:")
            print(f"  üìÑ Source: {r.metadata.get('source', 'Unknown')}")
            print(f"  üîπ Preview: {preview}...")
    else:
        print("‚ö†Ô∏è No relevant content found for this query.")

# ------------------------------------------------------------------------
# 3. Observation summary
# ------------------------------------------------------------------------
print("\n‚úÖ Retrieval test complete.")
print("Results show that the retriever effectively identifies exercise, nutrition, and lifestyle recommendations")
print("from 'Adult-Guide-to-an-Active-Healthy-Lifestyle.pdf' using balanced MMR settings.")



PART 5: RETRIEVAL CONFIGURATION
üîç Retrieval Configuration:
   - Strategy: Maximum Marginal Relevance (MMR)
   - Documents returned: 5
   - Initial candidates: 10
   - Relevance vs Diversity balance: 0.7

------------------------------------------------------------
üß† Question: According to the Adult Guide to an Active Healthy Lifestyle, what are effective ways adults can stay healthy daily?


  results = retriever.get_relevant_documents(q)



Result 1:
  üìÑ Source: Adult-Guide-to-an-Active-Healthy-Lifestyle.pdf
  üîπ Preview: Life's Essential 8: Eat better, be active, quit tobacco, and get healthy sleep....

Result 2:
  üìÑ Source: Adult-Guide-to-an-Active-Healthy-Lifestyle.pdf
  üîπ Preview: Maintaining calorie balance helps prevent weight gain and supports overall well-being....

Result 3:
  üìÑ Source: Adult-Guide-to-an-Active-Healthy-Lifestyle.pdf
  üîπ Preview: Adults generally need 7‚Äì9 hours of quality sleep every night to maintain good health....

Result 4:
  üìÑ Source: Adult-Guide-to-an-Active-Healthy-Lifestyle.pdf
  üîπ Preview: Adults should aim for 150 minutes of moderate activity or 75 minutes of vigorous exercise weekly....

Result 5:
  üìÑ Source: Adult-Guide-to-an-Active-Healthy-Lifestyle.pdf
  üîπ Preview: A balanced diet includes more vegetables, fruits, whole grains, and fewer processed foods....

------------------------------------------------------------
üß† Question: Based on the Adult 

In [7]:
# ========================================================================
# WORKSHOP ACTIVITY 6: LLM INTEGRATION
# ========================================================================
# üéØ LEARNING OBJECTIVE: Connect a local language model (Ollama + Phi-3 Mini)
# and enable question-answering from the health lifestyle RAG system.

print(f"\n{'=' * 60}")
print("PART 6: LANGUAGE MODEL SETUP")
print("=" * 60)

# ------------------------------------------------------------------------
# 1. PREREQUISITES - reminder only, nothing is verified here
# ------------------------------------------------------------------------
print(
    "üìã PREREQUISITE CHECK:",
    "   1. Install Ollama ‚Üí https://ollama.ai/",
    "   2. Run: ollama pull phi3:mini",
    "   3. Start Ollama server before running this notebook.",
    sep="\n",
)

# ------------------------------------------------------------------------
# 2. Import and initialize the LLM
# ------------------------------------------------------------------------
from langchain_community.llms import Ollama

# Construction and the first request share one try-block: any failure is
# reported with setup hints instead of a traceback. On failure `llm` is left
# unassigned by this cell.
try:
    # phi3:mini is a small local model; the low temperature keeps answers
    # focused; num_thread should follow the number of available CPU cores.
    llm = Ollama(model="phi3:mini", temperature=0.2, num_thread=2)

    # One trivial request proves the server answers.
    print("\nüß™ Testing LLM connection...")
    test_response = llm.invoke("What is 2 + 2?")
    print(f"‚úÖ LLM Response: {test_response}")
    print("‚úÖ Language model initialized successfully!")

except Exception as e:
    print(
        f"‚ùå LLM Connection Failed: {e}",
        "üí° TIP: Ensure Ollama service is running and 'phi3:mini' is installed.",
        "      To install: ollama pull phi3:mini",
        sep="\n",
    )



PART 6: LANGUAGE MODEL SETUP
üìã PREREQUISITE CHECK:
   1. Install Ollama ‚Üí https://ollama.ai/
   2. Run: ollama pull phi3:mini
   3. Start Ollama server before running this notebook.


  llm = Ollama(



üß™ Testing LLM connection...
‚úÖ LLM Response: The sum of 2 and 2 is 4. This simple arithmetic problem has a fixed answer, which can be determined through basic addition principles where combining two quantities each equal to the number 2 results in a total quantity of four units. Therefore, when we add these numbers together (2 +  end - beginning), it yields an outcome or result that represents this combined value:

end = start + difference
4 = 2 + 2
‚úÖ Language model initialized successfully!


In [8]:
# ========================================================================
# WORKSHOP ACTIVITY 7: PROMPT ENGINEERING
# ========================================================================
# LEARNING OBJECTIVE: Design prompts that enforce factual grounding
# Domain: Adult-Guide-to-an-Active-Healthy-Lifestyle (Health & Wellness)

from langchain.prompts import PromptTemplate

print(f"\n{'=' * 60}")
print("PART 7: PROMPT ENGINEERING FOR GROUNDING")
print("=" * 60)

# ------------------------------------------------------------------------
# CONCEPT:
# The prompt is what keeps the model tied to the retrieved context: it
# restricts answers to that context, fixes the refusal sentence, and asks
# for citations and direct quotes.
# ------------------------------------------------------------------------

# Template text; {context} and {question} are the two placeholders that get
# filled in for every query.
prompt_template = """
You are a knowledgeable health and wellness assistant.
Your job is to answer questions STRICTLY based on the provided context from
'Adult Guide to an Active Healthy Lifestyle' and other health-related PDFs.

CRITICAL INSTRUCTIONS:
1. ONLY use information explicitly stated in the context below.
2. If the context doesn't contain the answer, respond exactly with:
   "The provided documents do not contain information to answer this question."
3. Always cite which document/source your answer comes from.
4. Do NOT add personal opinions or external information.
5. If multiple sources mention similar facts, summarize clearly.
6. Use direct quotes when available, enclosed in quotation marks.
7. Focus on health-related keywords such as:
   exercise, sleep, diet, physical activity, stress, wellness, nutrition.

Context Documents:
{context}

Question: {question}

RESPONSE REQUIREMENTS:
- Begin with the most relevant source and cite it.
- Use direct quotes or clear references to the document text.
- Avoid filler words or generic advice.
- If uncertain, explicitly say that the context does not provide enough data.

Answer:
"""

# Wrap the text in a template object that declares both placeholders.
PROMPT = PromptTemplate(
    input_variables=["context", "question"],
    template=prompt_template,
)

print("‚úÖ Prompt template created with domain-specific grounding instructions")



PART 7: PROMPT ENGINEERING FOR GROUNDING
‚úÖ Prompt template created with domain-specific grounding instructions


In [9]:
# ========================================================================
# WORKSHOP ACTIVITY 8: RAG CHAIN ASSEMBLY
# ========================================================================
# LEARNING OBJECTIVE: Combine all components into a working RAG system
# DOMAIN: Adult Guide to an Active Healthy Lifestyle
# ------------------------------------------------------------------------

print("\n" + "="*60)
print("PART 8: RAG CHAIN ASSEMBLY")
print("="*60)

from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate

# The grounded prompt tells the model to cite its sources, but by default
# only the chunk text is placed into the context, so the model never sees a
# source name it could cite. This per-document template puts the `source`
# metadata in front of every chunk.
# NOTE: every retrieved document must carry a `source` metadata entry,
# otherwise formatting the context fails.
DOCUMENT_PROMPT = PromptTemplate(
    input_variables=["page_content", "source"],
    template="Source: {source}\n{page_content}"
)

# Build the Retrieval-Augmented Generation (RAG) pipeline
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",  # "stuff" = combine all retrieved text into a single prompt
    retriever=retriever,
    chain_type_kwargs={
        "prompt": PROMPT,  # the domain-specific grounded prompt
        "document_prompt": DOCUMENT_PROMPT,  # how each chunk is rendered
        "document_separator": "\n\n--- SOURCE DOCUMENT ---\n\n"
    },
    return_source_documents=True,  # enables citation and validation
    verbose=False
)

print("‚úÖ RAG chain assembled successfully!")
print("   Components connected: Retriever ‚Üí LLM ‚Üí Grounded Prompt ‚Üí Response")
print("   Domain: Adult Guide to an Active Healthy Lifestyle")



PART 8: RAG CHAIN ASSEMBLY
‚úÖ RAG chain assembled successfully!
   Components connected: Retriever ‚Üí LLM ‚Üí Grounded Prompt ‚Üí Response
   Domain: Adult Guide to an Active Healthy Lifestyle


In [11]:
# ========================================================================
# WORKSHOP ACTIVITY 9: ANSWER VALIDATION SYSTEM
# ========================================================================
# LEARNING OBJECTIVE: Implement quality control for RAG responses
# DOMAIN: Adult Guide to an Active Healthy Lifestyle
# ------------------------------------------------------------------------

def validate_answer(answer, source_docs):
    """
    WORKSHOP FUNCTION: Answer Quality Assessment (Health Domain)

    PURPOSE:
    Score a generated answer with two simple text heuristics: hedging
    phrases in the answer and whether any retrieved source is named in it.
    This is a heuristic signal, not a factual verification.

    PARAMETERS:
    - answer: Generated response from RAG system (None is treated as "")
    - source_docs: Retrieved documents used for context; each needs a
      `metadata` dict, a missing or empty 'source' entry is skipped

    RETURNS:
    - confidence_score: Float (0.0-1.0), rounded to 2 decimals.
      Starts at 1.0, minus 0.2 per hedging phrase found, minus 0.3 when no
      source name appears in the answer; never below 0.0.
    - warnings: List of detected quality issues
    """
    import re

    answer_lower = (answer or "").lower()

    # Indicators of uncertainty or hallucination
    hallucination_phrases = [
        "i think", "probably", "likely", "it seems", "perhaps",
        "generally speaking", "typically", "usually", "in most cases",
        "some experts believe", "may suggest", "it could be"
    ]

    confidence_score = 1.0
    warnings = []

    # Step 1: Detect vague or uncertain language.
    # Whole-word matching, so "unlikely" is not reported as "likely".
    for phrase in hallucination_phrases:
        if re.search(rf"\b{re.escape(phrase)}\b", answer_lower):
            confidence_score -= 0.2
            warnings.append(f"Uncertain or non-factual phrase detected: '{phrase}'")

    # Step 2: Check for missing citations.
    # Documents without a usable source name are skipped: a missing key must
    # not raise, and an empty name must not count (the empty string is
    # contained in every answer).
    has_citations = False
    for doc in source_docs or []:
        source_name = str(doc.metadata.get('source') or '').strip().lower()
        if source_name and source_name in answer_lower:
            has_citations = True
            break

    if not has_citations:
        confidence_score -= 0.3
        warnings.append("Answer does not reference any source document.")

    # Rounding removes float drift such as 0.6000000000000001 after
    # repeated subtraction, keeping threshold comparisons predictable.
    return round(max(0.0, confidence_score), 2), warnings


def ask_question_with_validation(question):
    """
    WORKSHOP FUNCTION: Execute the full RAG process with validation

    STEPS:
    1. Invoke `qa_chain` with the question (retrieval + generation)
    2. Score the answer with `validate_answer`
    3. Print the answer, the confidence band, and any warnings
    4. Print each retrieved source with its keyword overlap for transparency
    5. Print troubleshooting tips when the answer says the documents
       "do not contain information"

    PARAMETERS:
    - question: The user's question as a string

    RETURNS:
    - result: The raw mapping returned by `qa_chain.invoke`
    - confidence: Float score from `validate_answer`
    - warnings: List of warning strings from `validate_answer`

    NOTE: relies on the module-level names `qa_chain` and `validate_answer`
    being defined by earlier cells.
    """
    import re  # function-scope import so this cell runs on its own

    def _keywords(text):
        # Word tokens only: plain str.split() keeps punctuation attached, so
        # "week?" in the question could never match "week." in a document.
        return set(re.findall(r"\w+", text.lower()))

    print(f"ü§î Question: {question}")
    print("\nüîç Retrieving relevant information from health documents...")

    # Run RAG chain
    result = qa_chain.invoke({"query": question})
    answer = result["result"]
    source_docs = result["source_documents"]

    # Validate the generated answer
    confidence, warnings = validate_answer(answer, source_docs)

    # Display the answer
    print("\nüìù Generated Answer:")
    print("="*60)
    print(answer)

    # Quality assessment: >= 0.8 high, >= 0.6 medium, otherwise low
    print("\nüìä Answer Quality Evaluation:")
    print(f"   Confidence Score: {confidence:.2f}/1.0")

    if confidence >= 0.8:
        print("   ‚úÖ HIGH QUALITY: Well-grounded response based on the document.")
    elif confidence >= 0.6:
        print("   ‚ö†Ô∏è  MEDIUM QUALITY: Review recommended for partial grounding.")
    else:
        print("   ‚ùå LOW QUALITY: Possible hallucination or missing evidence.")

    if warnings:
        print("\n‚ö†Ô∏è  Quality Warnings:")
        for warning in warnings:
            print(f"   ‚Ä¢ {warning}")

    # Detailed source inspection
    print(f"\nüìö Retrieved Sources ({len(source_docs)} documents):")
    print("-" * 70)

    question_keywords = _keywords(question)

    for i, doc in enumerate(source_docs):
        keyword_overlap = question_keywords & _keywords(doc.page_content)

        print(f"{i+1}. Source: {doc.metadata.get('source', 'Unknown')}")
        print(f"   Page: {doc.metadata.get('page', 'Unknown')}")
        # sorted() keeps the printed list stable between runs (set order is arbitrary)
        print(f"   Keyword Overlap: {sorted(keyword_overlap)}")
        print(f"   Content Preview: {doc.page_content[:200]}...")
        print()

    # If the answer says the information is not available, suggest next steps
    if "do not contain information" in answer.lower():
        print("\nüí° TROUBLESHOOTING SUGGESTIONS:")
        print("1. Rephrase the question (e.g., 'How much exercise per week is recommended?').")
        print("2. Verify that the PDF text extraction worked properly.")
        print("3. Check if your question spans multiple chunks.")
        print("4. Add more health-related PDFs for context enrichment.")

    return result, confidence, warnings


In [12]:
# ========================================================================
# WORKSHOP ACTIVITY 10: HANDS-ON TESTING
# ========================================================================
# LEARNING OBJECTIVE: Test the complete RAG system with real health queries
# DOMAIN: Adult Guide to an Active Healthy Lifestyle
# ------------------------------------------------------------------------

# Opening banner: blank line, rule, title, rule.
print("", "=" * 80, "WORKSHOP DEMONSTRATION: TESTING THE RAG SYSTEM", "=" * 80, sep="\n")

# Sample health questions used to demonstrate retrieval and grounding.
questions = [
    "How much physical activity should adults get each week?",
    "How many hours of sleep do adults need daily?",
]

# Run every question through the validated RAG pipeline, framing each run
# with a start banner and a finish banner. After the loop, `result`,
# `confidence` and `warnings` hold the values for the last question only.
for q in questions:
    print("", "=" * 80, f"ü§î Testing Question: {q}", "=" * 80, sep="\n")

    print("üß™ Running RAG pipeline...\n")
    result, confidence, warnings = ask_question_with_validation(q)

    print("", "=" * 80, f"‚úÖ Finished testing question: {q}", "=" * 80, sep="\n")



WORKSHOP DEMONSTRATION: TESTING THE RAG SYSTEM

🤔 Testing Question: How much physical activity should adults get each week?
🧪 Running RAG pipeline...

🤔 Question: How much physical activity should adults get each week?

🔍 Retrieving relevant information from health documents...

📝 Generated Answer:
According to 'Adult Guide to an Active Healthy Lifestyle' (Source Document), adults should aim for "150 minutes of moderate activity or 75 minutes of vigorous exercise weekly" (Paragraph on Physical Activity). This information directly answers the question regarding physical activity recommendations.

📊 Answer Quality Evaluation:
   Confidence Score: 0.70/1.0
   ⚠️  MEDIUM QUALITY: Review recommended for partial grounding.

   • Answer does not reference any source document.

📚 Retrieved Sources (5 documents):
----------------------------------------------------------------------
1. Source: Adult-Guide-to-an-Active-Healthy-Lifestyle.pdf
   Page: Unknown
   Keyword

In [16]:
# ========================================================================
# WORKSHOP ACTIVITY 10: HANDS-ON TESTING
# ========================================================================
# LEARNING OBJECTIVE: Test the complete RAG system

# NOTE(review): this cell repeats Workshop Activity 10 with a single question;
# the output recorded beneath it uses different wording from the current
# ask_question_with_validation, so it may be left over from an older run - confirm.

# Banner: blank line, rule, title, rule (emitted with a single print call).
print("\n" + "=" * 80 + "\nWORKSHOP DEMONSTRATION: TESTING THE RAG SYSTEM\n" + "=" * 80)

# Sample question for demonstration
# WORKSHOP INSTRUCTION: Students should modify this question
question = "How much physical activity should adults get per week?"

# Announce the run, then send the question through the validated RAG pipeline.
print("üß™ RUNNING SAMPLE QUERY...")
result, confidence, warnings = ask_question_with_validation(question)


WORKSHOP DEMONSTRATION: TESTING THE RAG SYSTEM
🧪 RUNNING SAMPLE QUERY...
🤔 Question: How much physical activity should adults get per week?

🔍 Retrieving relevant information...

📝 Answer:
According to "Source Document" which states that adults need "7-9 hours of sleep daily," there is no information provided about physical activity requirements. Therefore, based on the given documents, we cannot determine how much physical activity adults should get per week as this specific detail was not mentioned in any source document (Sources: Source Document).

📊 Quality Assessment:
   Confidence Score: 0.70/1.0
   ⚠️  MEDIUM QUALITY: Review recommended

   • Answer does not reference source documents

📚 Retrieved Sources (3 documents):
------------------------------------------------------------
1. Source: health_guide.pdf
   Page: Unknown
   Keyword overlap: ['per', 'activity']
   Content: 150 minutes moderate or 75 minutes vigorous activity per week....

2. Source: he