In [1]:
# ========================================================================
# PART 0: ENVIRONMENT SETUP AND LIBRARY VERSION CHECK
# ========================================================================
# LEARNING OBJECTIVE: Verify environment setup and library compatibility

def check_library_versions():
    """
    WORKSHOP FUNCTION: Environment Verification
    
    PURPOSE: Check installed library versions for compatibility
    This helps ensure all students have the same environment setup
    """
    print("="*60)
    print("🔧 WORKSHOP ENVIRONMENT CHECK")
    print("="*60)
    
    required_libraries = {
        'langchain': '0.3.27',
        'langchain_community': '0.3.29',
        'chromadb': '1.0.20',
        'pypdf': '6.0.0',
        'numpy': '6.0.0',
        'pathlib': 'built-in',
        'os': 'built-in',
        'sys': 'built-in'
    }
    
    print("📋 Checking required libraries and versions:")
    print("-" * 50)
    
    missing_libraries = []
    version_mismatches = []
    
    for library, min_version in required_libraries.items():
        try:
            if library in ['pathlib', 'os', 'sys']:
                print(f"✅ {library}: {min_version}")
                continue
                
            if library == 'langchain':
                import langchain
                version = langchain.__version__
            elif library == 'langchain_community':
                import langchain_community
                version = getattr(langchain_community, '__version__', 'unknown')
            elif library == 'chromadb':
                import chromadb
                version = chromadb.__version__
            elif library == 'pypdf':
                import pypdf
                version = pypdf._version.__version__
            elif library == 'numpy':
                import numpy
                version = numpy.__version__
            
            print(f"✅ {library}: {version}")
            
        except ImportError:
            print(f"❌ {library}: NOT INSTALLED")
            missing_libraries.append(library)
        except Exception as e:
            print(f"⚠️  {library}: Error checking version - {e}")
    
    # Check Ollama availability (external dependency)
    print("\n🤖 Checking Ollama setup:")
    print("-" * 30)
    try:
        import subprocess
        result = subprocess.run(['ollama', 'list'], capture_output=True, text=True, timeout=10)
        if result.returncode == 0:
            if 'phi3:mini' in result.stdout:
                print("✅ Ollama: Installed and phi3:mini model available")
            else:
                print("⚠️  Ollama: Installed but phi3:mini model missing")
                print("   Run: ollama pull phi3:mini")
        else:
            print("❌ Ollama: Not properly configured")
    except FileNotFoundError:
        print("❌ Ollama: Not installed")
        print("   Install from: https://ollama.ai/")
    except subprocess.TimeoutExpired:
        print("⚠️  Ollama: Connection timeout - check if service is running")
    except Exception as e:
        print(f"⚠️  Ollama: Error checking - {e}")
    
    # Summary and installation commands
    if missing_libraries:
        print(f"\n❌ MISSING LIBRARIES: {', '.join(missing_libraries)}")
        print("\n📦 EXACT INSTALLATION COMMANDS (Workshop Tested Versions):")
        print("pip install langchain==0.3.27")
        print("pip install langchain-community==0.3.29")
        print("pip install chromadb==1.0.20")
        print("pip install pypdf==6.0.0")
        print("pip install numpy==6.0.0")
        print("\nRun these commands and restart the workshop.")
        return False
    else:
        print("\n✅ ALL LIBRARIES INSTALLED!")
        print("🚀 Ready to proceed with the workshop!")
        return True

# Run environment check
environment_ready = check_library_versions()

if not environment_ready:
    print("\n⚠️  PLEASE INSTALL MISSING LIBRARIES BEFORE CONTINUING")
    print("Uncomment the sys.exit() line below if you want to stop here")
    # sys.exit(1)  # Students can uncomment this to stop execution

🔧 WORKSHOP ENVIRONMENT CHECK
📋 Checking required libraries and versions:
--------------------------------------------------
✅ langchain: 0.3.27
✅ langchain_community: 0.3.29
✅ chromadb: 1.0.20
✅ pypdf: 6.0.0
✅ numpy: 2.3.3
✅ pathlib: built-in
✅ os: built-in
✅ sys: built-in

🤖 Checking Ollama setup:
------------------------------
✅ Ollama: Installed and phi3:mini model available

✅ ALL LIBRARIES INSTALLED!
🚀 Ready to proceed with the workshop!


HANDS-ON RAG (Retrieval-Augmented Generation) WORKSHOP

13 Oct 2025
Ramaih University of Applied Sciences
Instructor: Naganathan Muthuramalingam., PhD Scholar - School of Social Sciences

This script demonstrates a complete end-to-end RAG system implementation.

WHAT YOU'LL LEARN:
1. Document Loading and Processing
2. Text Chunking Strategies
3. Vector Embeddings and Storage
4. Retrieval Mechanisms
5. LLM Integration
6. Answer Validation and Grounding

WORKSHOP STRUCTURE:
- Part 0: Environment Setup and Library Version Check
- Part 1: Imports and Document Discovery
- Part 2: Document Loading and Text Chunking
- Part 3: Vector Embeddings & Knowledge Base Creation
- Part 4: Retrieval Configuration
- Part 5: Language Model Setup
- Part 6: Prompt Engineering for Grounding
- Part 7: RAG Chain Assembly
- Part 8: Answer Validation System
- Part 9: Hands-on Testing

SYSTEM REQUIREMENTS:
- Minimum 8GB RAM (16GB recommended for better performance)
- At least 20GB free disk space for models and vector databases
- Python 3.8+ installed
- Stable internet connection for initial model downloads
- Ollama installed (https://ollama.ai/)
- phi3:mini model downloaded via: ollama pull phi3:mini

INSTALLATION STEPS:
1. Install Python 3.8+
2. Install Ollama from https://ollama.ai/
3. Run: ollama pull phi3:mini
4. Install required Python packages (see Part 0 below)
5. Create 'data' folder and add PDF documents

PREREQUISITES:
- Basic Python knowledge
- Understanding of machine learning concepts
- Familiarity with NLP basics

In [2]:
# ========================================================================
# PART 1: IMPORTS AND SETUP
# ========================================================================
# Standard library imports - Python's built-in modules
import os
import sys
from pathlib import Path

# LangChain Document Loaders & Processing - For handling different document types
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Vector Store and Embeddings - For semantic search capabilities
from langchain_community.vectorstores import Chroma

# Local LLM via Ollama - For running language models locally
from langchain_community.llms import Ollama

# RAG Chain - For combining retrieval and generation
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate

In [3]:
# ========================================================================
# WORKSHOP ACTIVITY 1: DOCUMENT DISCOVERY
# ========================================================================
# LEARNING OBJECTIVE: Understand how to locate and validate data sources

# Define the path to your PDF directory
# TODO for students: Create a 'data' folder and add your PDF documents

data_dir = "./data"

# Find all PDF files in the directory recursively
# This uses Path.rglob() to search through all subdirectories

pdf_files = [str(p) for p in Path(data_dir).rglob("*.pdf") if p.is_file()]

# Validation: Always check if your data exists before processing
if not pdf_files:
    print(f"No PDFs found in {data_dir}. Please add your PDFs and update the `data_dir` variable.")
    print("WORKSHOP TIP: Create the './data' folder and add at least one PDF document")
else:
    print(f"✅ Found {len(pdf_files)} PDF(s):")
    for f in pdf_files:
        print(f" - {f}")

# 📚 Domain Topics for My RAG System:
# 1. Data Engineering
# 2. Artificial Intelligence
# 3. Cybersecurity
# 4. Healthcare Analytics
# 5. Climate Change
print("✅ Using 5 custom topic PDFs from my interest areas.")


✅ Found 5 PDF(s):
 - data\Chapter1-Econometrics-IntroductionToEconometrics.pdf
 - data\cloud computing nist.pdf
 - data\healthcare.pdf
 - data\siemens-healthineers_ES_case_study_Southampton.pdf
 - data\understanding-machine-learning-theory-algorithms.pdf
✅ Using 5 custom topic PDFs from my interest areas.


In [4]:
# ========================================================================
# WORKSHOP ACTIVITY 2: DOCUMENT LOADING AND PREPROCESSING
# ========================================================================
# LEARNING OBJECTIVE: Transform unstructured documents into structured data


print("\n" + "="*50)
print("PART 2: DOCUMENT LOADING & TEXT CHUNKING")
print("="*50)

# Initialize document storage
documents = []

# Process each PDF file
for file_path in pdf_files:
    try:
        print(f"\n📄 Processing: {os.path.basename(file_path)}")
        
        # PyPDFLoader: Specialized for PDF documents
        # WORKSHOP NOTE: Different loaders exist for different file types
        # (TextLoader, CSVLoader, JSONLoader, etc.)
        loader = PyPDFLoader(file_path)
        
        # Load documents - each page becomes a separate document
        docs = loader.load()
        
        # Add source metadata for traceability
        # WORKSHOP TIP: Metadata is crucial for citation and verification
        for doc in docs:
            doc.metadata["source"] = os.path.basename(file_path)
            
        documents.extend(docs)
        print(f"✅ Loaded {len(docs)} pages from {os.path.basename(file_path)}")
        
    except Exception as e:
        print(f"❌ Error loading {file_path}: {e}")
        print("WORKSHOP TIP: Check file permissions and format compatibility")

print(f"\n📊 SUMMARY: Total pages loaded: {len(documents)}")



PART 2: DOCUMENT LOADING & TEXT CHUNKING

📄 Processing: Chapter1-Econometrics-IntroductionToEconometrics.pdf
✅ Loaded 11 pages from Chapter1-Econometrics-IntroductionToEconometrics.pdf

📄 Processing: cloud computing nist.pdf
✅ Loaded 7 pages from cloud computing nist.pdf

📄 Processing: healthcare.pdf
✅ Loaded 2 pages from healthcare.pdf

📄 Processing: siemens-healthineers_ES_case_study_Southampton.pdf
✅ Loaded 8 pages from siemens-healthineers_ES_case_study_Southampton.pdf

📄 Processing: understanding-machine-learning-theory-algorithms.pdf
✅ Loaded 449 pages from understanding-machine-learning-theory-algorithms.pdf

📊 SUMMARY: Total pages loaded: 477


In [5]:
# ========================================================================
# WORKSHOP ACTIVITY 3: TEXT CHUNKING STRATEGY
# ========================================================================
# LEARNING OBJECTIVE: Understand why and how to split text optimally

print("\n" + "="*50)
print("PART 3: TEXT CHUNKING")
print("="*50)

# CONCEPT: Why do we chunk text?
# 1. LLMs have context length limitations
# 2. Smaller chunks = more precise retrieval
# 3. Better semantic matching
# 4. Improved processing speed

# Larger, context-rich chunks
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1200,
    chunk_overlap=200,
    separators=["\n\n", "\n", ". ", "! ", "? ", " ", ""]
)



print("🔧 Chunking Configuration:")
print(f"   - Chunk size: {text_splitter._chunk_size} characters")
print(f"   - Overlap: {text_splitter._chunk_overlap} characters")
print(f"   - Separators: {text_splitter._separators}")

# Split documents into chunks
texts = text_splitter.split_documents(documents)


# Add better metadata to each chunk
for i, text in enumerate(texts):
    text.metadata["chunk_id"] = i
    text.metadata["chunk_length"] = len(text.page_content)
    # Add first few words as preview
    text.metadata["preview"] = text.page_content[:50].replace("\n", " ")

# Validation
if not texts:
    print("❌ No text chunks created. Check your documents.")
    sys.exit(0)

print(f"✅ Successfully split into {len(texts)} text chunks")

# WORKSHOP ACTIVITY: Examine chunk examples
print(f"\n📝 SAMPLE CHUNK (ID: 0):")
if texts:
    sample_chunk = texts[0]
    print(f"   Source: {sample_chunk.metadata.get('source', 'Unknown')}")
    print(f"   Length: {sample_chunk.metadata.get('chunk_length', 0)} characters")
    print(f"   Preview: {sample_chunk.metadata.get('preview', 'N/A')}...")


PART 3: TEXT CHUNKING
🔧 Chunking Configuration:
   - Chunk size: 1200 characters
   - Overlap: 200 characters
   - Separators: ['\n\n', '\n', '. ', '! ', '? ', ' ', '']
✅ Successfully split into 1073 text chunks

📝 SAMPLE CHUNK (ID: 0):
   Source: Chapter1-Econometrics-IntroductionToEconometrics.pdf
   Length: 1176 characters
   Preview: Econometrics | Chapter 1 | Introduction to Econome...


In [6]:
pip install sentence-transformers

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 25.2 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [7]:

# ========================================================================
# WORKSHOP ACTIVITY 4: EMBEDDINGS AND VECTOR STORE
# ========================================================================
# LEARNING OBJECTIVE: Convert text to vectors for semantic search

print("\n" + "="*50)
print("PART 4: VECTOR EMBEDDINGS & KNOWLEDGE BASE")
print("="*50)

from langchain_community.embeddings import HuggingFaceEmbeddings


# CONCEPT: What are embeddings?
# - Mathematical representations of text meaning
# - Similar texts have similar vectors
# - Enable semantic search (not just keyword matching)

print("🧠 Initializing embedding model...")

embeddings = HuggingFaceEmbeddings(
model_name="all-MiniLM-L6-v2", # WORKSHOP NOTE: Lightweight but effective
model_kwargs={'device': 'cpu'}, # Use CPU for compatibility
encode_kwargs={'normalize_embeddings': False}
)


print("✅ Embedding model loaded: all-MiniLM-L6-v2")
print("   - Dimensions: 384")
print("   - Model size: ~90MB")
print("   - Performance: Good balance of speed vs accuracy")

# Create vector database
print("\n🗄️  Creating vector database...")

vectorstore = Chroma.from_documents(
    documents=texts,
    embedding=embeddings,
    persist_directory="./chroma_clinicaltrial_db"  # Persistent storage
)

print("✅ Vector database created and saved to disk")
print("   WORKSHOP TIP: Database persists between runs for efficiency")


PART 4: VECTOR EMBEDDINGS & KNOWLEDGE BASE
🧠 Initializing embedding model...


  embeddings = HuggingFaceEmbeddings(


✅ Embedding model loaded: all-MiniLM-L6-v2
   - Dimensions: 384
   - Model size: ~90MB
   - Performance: Good balance of speed vs accuracy

🗄️  Creating vector database...
✅ Vector database created and saved to disk
   WORKSHOP TIP: Database persists between runs for efficiency


In [8]:
# ========================================================================
# WORKSHOP ACTIVITY 5: RETRIEVAL CONFIGURATION
# ========================================================================
# LEARNING OBJECTIVE: Configure optimal document retrieval

print("\n" + "="*50)
print("PART 5: RETRIEVAL CONFIGURATION")
print("="*50)

# CONCEPT: Retrieval strategies
# - Similarity: Find most similar documents
# - MMR (Maximum Marginal Relevance): Balance relevance and diversity
# - Similarity + threshold: Filter low-relevance results

# Improved retriever parameters for deeper and more diverse search
retriever = vectorstore.as_retriever(
    search_type="mmr",
    search_kwargs={
        "k": 5,           # number of top chunks to use
        "fetch_k": 10,    # number of candidates to consider
        "lambda_mult": 0.3  # diversity vs relevance
    }
)


print("🔍 Retrieval Configuration:")
print(f"   - Strategy: MMR (Maximum Marginal Relevance)")
print(f"   - Documents returned: 5")
print(f"   - Initial candidates: 10")
print(f"   - Relevance vs Diversity balance: 0.7")


PART 5: RETRIEVAL CONFIGURATION
🔍 Retrieval Configuration:
   - Strategy: MMR (Maximum Marginal Relevance)
   - Documents returned: 5
   - Initial candidates: 10
   - Relevance vs Diversity balance: 0.7


In [9]:
# ========================================================================
# WORKSHOP ACTIVITY 6: LLM INTEGRATION
# ========================================================================
# LEARNING OBJECTIVE: Connect local language model for generation

print("\n" + "="*50)
print("PART 6: LANGUAGE MODEL SETUP")
print("="*50)

# PREREQUISITE: Install Ollama and pull a model
print("📋 PREREQUISITE CHECK:")
print("   1. Install Ollama: https://ollama.ai/")
print("   2. Run: ollama pull phi3:mini")
print("   3. Verify: ollama list")


try:
    llm = Ollama(
        model="phi3:mini",    # WORKSHOP NOTE: Lightweight model for laptops
        temperature=0.2,      # Low temperature = more deterministic responses
        num_thread=2,         # Adjust based on your CPU cores
    )
    
    # Test LLM connection
    print("\n🧪 Testing LLM connection...")
    test_response = llm.invoke("What is 2+2?")
    print(f"✅ LLM Response: {test_response}")
    print("✅ Language model initialized successfully!")
    
except Exception as e:
    print(f"❌ LLM Connection Failed: {e}")
    print("WORKSHOP TIP: Ensure Ollama is running and phi3:mini is installed")
    # TODO: Add fallback or alternative model suggestion



PART 6: LANGUAGE MODEL SETUP
📋 PREREQUISITE CHECK:
   1. Install Ollama: https://ollama.ai/
   2. Run: ollama pull phi3:mini
   3. Verify: ollama list

🧪 Testing LLM connection...


  llm = Ollama(


✅ LLM Response: The sum of 2 and 2 is 4.
✅ Language model initialized successfully!


In [10]:
# ========================================================================
# WORKSHOP ACTIVITY 7: PROMPT ENGINEERING
# ========================================================================
# LEARNING OBJECTIVE: Design prompts that enforce grounding

print("\n" + "="*50)
print("PART 7: PROMPT ENGINEERING FOR GROUNDING")
print("="*50)

# CONCEPT: Prompt engineering for RAG
# - Explicit instructions prevent hallucination
# - Structure ensures consistent output format
# - Citations enable verification

# Enhanced prompt template for better factual retrieval

prompt_template = """
You are a precise document analyst. Your task is to answer questions STRICTLY based on the provided context.

CRITICAL INSTRUCTIONS:
1. ONLY use information explicitly stated in the context below
2. If the context doesn't contain the answer, respond: "The provided documents do not contain information to answer this question."
3. Always cite which document/source your answer comes from
4. Do not make inferences beyond what is directly stated
5. If multiple sources contradict each other, mention the contradiction
6. Use exact quotes when possible, enclosed in quotation marks
7. For factual questions (like currency, population, etc.), scan ALL context carefully


Context Documents:
{context}

Question: {question}
Requirements for your answer:
- Start with the most relevant source
- Use direct quotes where applicable
- Clearly separate facts from different sources
- Look for keywords related to the question (currency, money, dollar, etc.)
- End with source citations

Answer:
"""


PROMPT = PromptTemplate(
    template=prompt_template, 
    input_variables=["context", "question"]
)
print("✅ Prompt template created with grounding instructions")


PART 7: PROMPT ENGINEERING FOR GROUNDING
✅ Prompt template created with grounding instructions


In [11]:
# ========================================================================
# WORKSHOP ACTIVITY 8: RAG CHAIN ASSEMBLY
# ========================================================================
# LEARNING OBJECTIVE: Combine all components into a working system

print("\n" + "="*50)
print("PART 8: RAG CHAIN ASSEMBLY")
print("="*50)

qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",    # WORKSHOP NOTE: "stuff" = include all context in prompt
    retriever=retriever,
    chain_type_kwargs={
        "prompt": PROMPT,
        "document_separator": "\n\n--- SOURCE DOCUMENT ---\n\n"
    },
    return_source_documents=True,  # Essential for verification
    verbose=False  # WORKSHOP TIP: Set to True for debugging
)

print("✅ RAG chain assembled successfully!")
print("   Components connected: Retriever → LLM → Response")


PART 8: RAG CHAIN ASSEMBLY
✅ RAG chain assembled successfully!
   Components connected: Retriever → LLM → Response


In [12]:
# ========================================================================
# WORKSHOP ACTIVITY 9: ANSWER VALIDATION SYSTEM
# ========================================================================
# LEARNING OBJECTIVE: Implement quality control for RAG responses

def validate_answer(answer, source_docs):
    """
    WORKSHOP FUNCTION: Answer Quality Assessment
    
    PURPOSE: Detect potential hallucinations and assess grounding quality
    
    PARAMETERS:
    - answer: Generated response from RAG system
    - source_docs: Retrieved documents used for context
    
    RETURNS:
    - confidence_score: Float between 0.0 and 1.0
    - warnings: List of quality issues detected
    """
    answer_lower = answer.lower()
    
    # Define hallucination indicators
    # WORKSHOP EXERCISE: Add more phrases students might identify
    hallucination_phrases = [
        "i think", "probably", "likely", "it seems", "perhaps", 
        "generally speaking", "typically", "usually", "in most cases"
    ]
    
    confidence_score = 1.0
    warnings = []
    
    # Check for uncertain language
    for phrase in hallucination_phrases:
        if phrase in answer_lower:
            confidence_score -= 0.2
            warnings.append(f"Uncertain language detected: '{phrase}'")
    
    # Verify source citation
    has_citations = any(doc.metadata['source'].lower() in answer_lower for doc in source_docs)
    if not has_citations:
        confidence_score -= 0.3
        warnings.append("Answer does not reference source documents")
    
    return max(0.0, confidence_score), warnings

def ask_question_with_validation(question):
    """
    WORKSHOP FUNCTION: Complete RAG Query with Validation
    
    This function demonstrates the full RAG pipeline:
    1. Question input
    2. Document retrieval
    3. Answer generation
    4. Quality validation
    5. Source verification
    """
    print(f"🤔 Question: {question}")
    print("\n🔍 Retrieving relevant information...")
    
    # Execute RAG pipeline
    result = qa_chain.invoke({"query": question})
    answer = result["result"]
    source_docs = result["source_documents"]
    
    # Validate response quality
    confidence, warnings = validate_answer(answer, source_docs)
    
    # Display results with educational annotations
    print("\n📝 Answer:")
    print("="*50)
    print(answer)
    
    # Quality assessment
    print(f"\n📊 Quality Assessment:")
    print(f"   Confidence Score: {confidence:.2f}/1.0")
    
    if confidence >= 0.8:
        print("   ✅ HIGH QUALITY: Well-grounded response")
    elif confidence >= 0.6:
        print("   ⚠️  MEDIUM QUALITY: Review recommended")
    else:
        print("   ❌ LOW QUALITY: Potential hallucination detected")
    
    if warnings:
        print("\n⚠️  Quality Warnings:")
        for warning in warnings:
            print(f"   • {warning}")
    
    # Enhanced source verification with keyword analysis
    print(f"\n📚 Retrieved Sources ({len(source_docs)} documents):")
    print("-" * 60)
    
    question_keywords = set(question.lower().split())
    
    for i, doc in enumerate(source_docs):
        content_keywords = set(doc.page_content.lower().split())
        keyword_overlap = question_keywords.intersection(content_keywords)
        
        print(f"{i+1}. Source: {doc.metadata['source']}")
        print(f"   Page: {doc.metadata.get('page', 'Unknown')}")
        print(f"   Keyword overlap: {list(keyword_overlap)}")
        print(f"   Content: {doc.page_content[:200]}...")
        print()
    
    # Suggest improvements if answer is not found
    if "do not contain information" in answer.lower():
        print("\n💡 TROUBLESHOOTING SUGGESTIONS:")
        print("1. Check if your question keywords appear in the documents")
        print("2. Try rephrasing the question with different terms")
        print("3. Verify the PDF content was properly extracted")
        print("4. Consider if the information spans multiple chunks")
        
        # Try alternative search terms
        if "currency" in question.lower():
            alt_terms = ["money", "dollar", "economic", "financial", "payment"]
            print(f"\n🔄 Trying alternative search terms: {alt_terms}")
            for term in alt_terms:
                alt_docs = vectorstore.similarity_search(term, k=3)
                if alt_docs:
                    print(f"\n   Found content for '{term}':")
                    for doc in alt_docs[:1]:  # Show first match
                        print(f"   {doc.page_content[:100]}...")
    
    return result, confidence, warnings

In [16]:
# ========================================================================
# WORKSHOP ACTIVITY 10: HANDS-ON TESTING
# ========================================================================
# LEARNING OBJECTIVE: Test the complete RAG system

print("\n" + "="*80)
print("WORKSHOP DEMONSTRATION: TESTING THE RAG SYSTEM")
print("="*80)

# Sample question for demonstration
# WORKSHOP INSTRUCTION: Students should modify this question
questions = [
    "What is the Difference between Saas, Paas and Iaas?"
    "What is Econometrics?"
    "What are different types of data in econometrics?"
]

for q in questions:
    print(f"\nQ: {q}")
    response = qa_chain.invoke({"query": q})
    print("A:", response["result"])



print("🧪 RUNNING SAMPLE QUERY...")
result, confidence, warnings = ask_question_with_validation(questions)


WORKSHOP DEMONSTRATION: TESTING THE RAG SYSTEM

Q: What is the Difference between Saas, Paas and Iaas?What is Econometrics?What are different types of data in econometrics?
A: The provided documents do not contain information to answer this question. However, based on general knowledge in the field of cloud computing and econometrics, I can provide an explanation that aligns closely with common understanding within these domains. Please note that for precise definitions or specific details from your source materials, direct quotes would be necessary which are absent herein:

- SaaS (Software as a Service) is typically understood to involve software applications delivered over the internet on a subscription basis, allowing users to connect with and use the application through a web browser without installing it. The provided documents mention Google App Engine but do not elaborate further about its relation to IaaS or Paas in this context (Source: IV).

- PAAS (Platform as a Service) p

AttributeError: 'list' object has no attribute 'replace'

In [None]:
# ========================================================================
# WORKSHOP CONCLUSION: INTERACTIVE SESSION
# ========================================================================

print("\n" + "="*80)
print("🎓 WORKSHOP COMPLETE! RAG SYSTEM READY FOR EXPERIMENTATION")
print("="*80)
print("\nEXPERIMENT IDEAS FOR STUDENTS:")
print("1. Try different chunk sizes (400, 800, 1200)")
print("2. Compare similarity vs MMR retrieval")
print("3. Adjust retrieval parameters (k, fetch_k, lambda_mult)")
print("4. Modify the prompt template")
print("5. Test with different types of questions")
print("6. Add your own validation criteria")
print("\n🔧 DEBUGGING TOOLS:")
print("- Use debug_retrieval(question, vectorstore) to see what's retrieved")
print("- Use manual_search('currency', vectorstore) to find specific terms")
print("- Check similarity scores to understand retrieval quality")
print("\nHAPPY LEARNING! 🚀")