# RAG Vaccine Information System

This notebook implements a Retrieval-Augmented Generation (RAG) system for vaccine information using Google's Gemini AI. The system:
- Ingests documents from PDFs and websites
- Creates a searchable vector database using embeddings
- Provides an AI agent that answers questions using retrieved context

## 1. Setup and Configuration

Import required libraries and configure API access.

In [None]:
from pathlib import Path
import sys
import numpy as np

from google.adk.agents import Agent
from google.adk.models.google_llm import Gemini
from google.adk.tools import FunctionTool
from google.genai import types
from google.adk.apps.app import App
from google.adk.sessions import InMemorySessionService
from google.adk.runners import Runner

print("✅ Imports loaded")

### Project Path Configuration

Add the project root to Python's import path to access custom modules.

In [None]:
# Make the "src" package importable: its parent directory (the project root,
# one level above the current working directory) has to be on sys.path.
project_root = Path.cwd().parent
src_dir = project_root.joinpath("src")

project_root_path = str(project_root.resolve())
if sys.path.count(project_root_path) == 0:
    # Prepend so this entry is searched before the existing ones.
    sys.path[:0] = [project_root_path]

from src.model import Intensity, SentimentOutput
from src.model.rag_output import RagOutput
from src.model.document_chunk import DocumentChunk
from src.config import load_env_variables, get_env_variable

# Import handlers for document processing, web scraping, and embeddings
from src.rag.pdf_handler import PdfHandler
from src.rag.web_handler import WebHandler
from src.rag.embedding_handler import EmbeddingHandler


In [None]:
# Populate the environment through the project's config helper, then read the
# API key that the embedding handler is constructed with.
load_env_variables()

GOOGLE_API_KEY = get_env_variable("GOOGLE_API_KEY")
# Fail here, with a clear message, rather than reporting success for a missing
# key and failing later inside an API call.
if not GOOGLE_API_KEY:
    raise RuntimeError(
        "GOOGLE_API_KEY is not set; add it to your environment configuration."
    )
# Plain literal: the message has no placeholders, so no f-string is needed.
print("✅ API key loaded")

### API Configuration

Configure the retry policy for API calls. (Environment variables and the API key were loaded in the previous cell.)

In [None]:
# HTTP statuses that trigger a retry: 429 (too many requests) and the
# server-side errors 500, 503 and 504.
retryable_status_codes = [429, 500, 503, 504]

# Retry options handed to the Gemini model wrapper.
retry_config = types.HttpRetryOptions(
    http_status_codes=retryable_status_codes,
    initial_delay=1,
    attempts=3,
)

## 4. Initialize Embedding Handler

Create the embedding handler instance for managing embeddings and retrieval.


In [None]:
# Create the EmbeddingHandler instance that the later cells use to load, build,
# cache and query the vector index.
# NOTE(review): cache_dir is a relative path — presumably resolved against the
# current working directory — whereas CONFIG["pdf_folder"] is anchored to
# project_root. Confirm how EmbeddingHandler resolves it before running the
# notebook from a different directory.
embedding_handler = EmbeddingHandler(
    api_key=GOOGLE_API_KEY,
    cache_dir="../cache"
)

print("✅ Embedding handler initialized")


## 5. Knowledge Base Initialization

Build or load the vector index from document sources.

In [None]:
# Knowledge-base settings: where the documents come from and how far to crawl.
CONFIG = {
    "pdf_folder": str((project_root / "src" / "Doc_vaccini").resolve()),
    "root_url": "https://www.serviziterritoriali-asstmilano.it/servizi/vaccinazioni/",
    "max_pages": 10,
    "max_depth": 2,
    # Only gates *reading* the cache; a freshly built index is always saved.
    "use_cache": True,
}

# Module-level index shared with retrieve_vaccine_info(); empty until it is
# loaded from cache or built below.
global_embeddings = np.array([])
global_chunks = []

# Try the cached index first when caching is enabled.
if CONFIG["use_cache"]:
    global_embeddings, global_chunks = embedding_handler.load_index_from_cache()

if global_embeddings.size == 0:
    # Nothing usable was loaded: ingest the sources and build the index.
    print("Building knowledge base...")

    pdf_chunks = PdfHandler.load_pdfs_from_folder(CONFIG["pdf_folder"])

    # Web chunk ids continue after the highest PDF chunk id. Using max() rather
    # than the last element keeps ids unique even if the PDF chunks are not
    # returned in ascending id order.
    web_start_id = max(chunk.id for chunk in pdf_chunks) + 1 if pdf_chunks else 0
    web_chunks = WebHandler.crawl_website(
        CONFIG["root_url"],
        max_pages=CONFIG["max_pages"],
        max_depth=CONFIG["max_depth"],
        start_id=web_start_id,
    )

    # Embed everything in one index and persist it for later runs.
    all_chunks = pdf_chunks + web_chunks
    if all_chunks:
        global_embeddings, global_chunks = embedding_handler.build_vector_index(all_chunks)
        embedding_handler.save_index_to_cache(global_embeddings, global_chunks)
        print(f"✅ Index built: {len(global_chunks)} chunks")
    else:
        print("⚠️ No content found")
else:
    print(f"✅ Loaded from cache: {len(global_chunks)} chunks")


In [None]:
def retrieve_vaccine_info(query: str) -> str:
    """Retrieve vaccine information from the knowledge base.

    This function is exposed to the AI agent as a tool, so every outcome,
    including failures, is reported as a plain string instead of being raised.

    Args:
        query: User's question about vaccines.

    Returns:
        The top matching chunks, each preceded by a ``[SOURCE: ...]`` line and
        separated by ``---`` rules. If nothing can be returned, either
        ``"No relevant information found."`` or a message starting with
        ``"Error:"``.
    """
    if not global_chunks:
        return "Error: Knowledge base not initialized. Run ingestion cell first."

    # A blank query has nothing to embed; answer directly instead of calling
    # the embedding backend with it.
    if not query or not query.strip():
        return "Error: Empty query."

    try:
        top_chunks = embedding_handler.retrieve_top_k(
            query, global_embeddings, global_chunks, k=5
        )
        if not top_chunks:
            return "No relevant information found."

        # PDFs are cited by file name only; any other source type keeps its
        # full reference as stored on the chunk.
        results = []
        for chunk in top_chunks:
            if chunk.doc_type == "pdf":
                source = Path(chunk.source).name
            else:
                source = chunk.source
            results.append(f"[SOURCE: {source}]\n{chunk.content}")

        return "\n\n---\n\n".join(results)
    except Exception as e:
        # Tool boundary: hand the failure back to the agent as text.
        return f"Error: {e}"

# Wrap the retrieval function so it can be passed to the agent as a tool.
rag_tool = FunctionTool(retrieve_vaccine_info)

# Report readiness: the tool only returns results once the index holds chunks.
if global_chunks:
    print("✅ Tool ready")
else:
    print("⚠️ Run ingestion first")


## 6. RAG Tool

The cell above defines the retrieval function that the agent uses to access the knowledge base and wraps it as a tool.

## 7. Agent Configuration

Configure the RAG agent with the retrieval tool and structured output schema.

In [None]:
# System instruction for the agent. Step 3 states what to do when retrieval
# yields nothing useful; it previously pointed to a "format specified below"
# that the prompt never defined.
prompt = """You are a helpful assistant for vaccine information.
You have access to a knowledge base containing official documents and web pages about vaccinations.
    
When the user asks a question:
1. Use the `retrieve_vaccine_info` tool to find relevant information.
2. Answer the question based ONLY on the information returned by the tool.
3. If the tool returns no information, an error, or information that is not pertinent, reply that the knowledge base does not contain the answer. Do not answer from your own knowledge.
4. Always cite the sources provided in the tool output.
5. Be concise but thorough in your responses.
"""

# RAG agent: a Gemini model with the retry policy, the retrieval tool and the
# instruction above.
rag_agent = Agent(
    name="RAG_Vaccine_Informer",
    model=Gemini(
        model="gemini-2.5-flash-lite",
        retry_options=retry_config,
    ),
    instruction=prompt,
    tools=[rag_tool],
    output_key="rag_output",
    # NOTE(review): output_schema=RagOutput is left disabled, presumably
    # because ADK restricts tool use when an output schema is set. Confirm
    # against the ADK version in use before enabling it.
)

print("✅ RAG Agent configured")

## 8. Application Setup

Create the application and runner instances.

In [None]:
# In-memory session service that holds the conversation state.
session_service = InMemorySessionService()

# The application wraps the RAG agent as its root agent.
application = App(name="VaccineInfoRAG", root_agent=rag_agent)

# The runner executes queries against the application using that session service.
runner = Runner(app=application, session_service=session_service)

print("✅ Application ready")

## 9. Testing

Test the RAG system with sample queries.

### Single Query Test

In [None]:
# Send one question through the full agent pipeline (agent -> tool -> answer).
# Top-level `await` is valid in a notebook/IPython cell but not in a plain script.
response = await runner.run_debug("What is the policy for vaccinating pregnant women?")

In [None]:
# Call the retrieval tool directly, bypassing the agent, to inspect the raw
# context it would hand to the model.
test_result = retrieve_vaccine_info("What is the policy for vaccinating pregnant women?")
print("Tool output:", test_result, sep="\n")
