# Building an Agentic System to enhance RAG with Self-Grading and Web Search Capabilities


- [medium](https://medium.com/the-ai-forum/building-an-agentic-system-to-enhance-rag-with-self-grading-and-web-search-capabilities-using-3f9a1d885730)


## 2. Set up the API keys in google colab

In [None]:
import os

os.environ["OPENAI_API_KEY"] = os.getenv("OPENAI_API_KEY")
os.environ["GROQ_API_KEY"] = os.getenv("GROQ_API_KEY")
os.environ["TAVILY_API_KEY"] = os.getenv("TAVILY_API_KEY")
os.environ["LOGFIRE_IGNORE_NO_CONFIG"] = os.getenv("LOGFIRE_IGNORE_NO_CONFIG")

## 3. Instantiate LLM

In [1]:
from pydantic_ai import Agent
from pydantic_ai.models.openai import OpenAIModel
from pydantic_ai.models.groq import GroqModel

openai_model = OpenAIModel("gpt-4o-mini")
# groq_model = GroqModel("mixtral-8x7b-32768")  # or another valid Groq model
groq_model = GroqModel("llama-3.3-70b-versatile")

## 4. Load required documents and build index

In [2]:
# from langchain_community.vectorstores import chroma
from langchain_chroma.vectorstores import Chroma
from langchain_community.embeddings import HuggingFaceBgeEmbeddings
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

loader = PyPDFLoader("../data/RAG.pdf")
documents = loader.load()

split_docs = RecursiveCharacterTextSplitter(
    chunk_size=500, chunk_overlap=50
).split_documents(documents)

embedding = HuggingFaceBgeEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2"
)

persist_directory = "../data/chroma_langchain_db"

vectorstore = Chroma.from_documents(
    documents=split_docs,
    embedding=embedding,
    persist_directory=persist_directory,
    collection_name="RAG_vectorstore",
)

  embedding = HuggingFaceBgeEmbeddings(
  from .autonotebook import tqdm as notebook_tqdm


## 5. Define Dependencies

In [3]:
# from dataclasses import dataclass


# @dataclass
# class Deps:
#     question: str | None
#     context: str | None
#     query: str | None  # Added as it's used in system prompt
#     response: str | None  # Added as it's used in system prompt

In [4]:
from pydantic import BaseModel, Field
from typing import Optional


# 1. Define our Deps class correctly
class Deps(BaseModel):
    question: str = Field(default="")
    query: str = Field(default="")
    context: Optional[str] = Field(default="")
    response: Optional[str] = Field(default="")
    content: str = Field(default="")
    message: str = Field(default="")

## 6. Instantiate pydantic.ai Agent

In [5]:
system_prompt = """You are a Helpful Assistant proficient in providing concise, factual, and to-the-point answers for questions about RAG based on provided context.

When answering questions:
1. First use the `retriever_tool` to get relevant context from the knowledge base
2. If the retrieved context is insufficient, use the `websearch_tool` to get additional information
3. Generate a comprehensive response based on all available context

Your response should be in JSON format with the following structure:
{
    "Relevancy": <score 0-1>,
    "Faithfulness": <score 0-1>,
    "Context Quality": <score 0-1>,
    "Needs Web Search": True,  # Set to True when websearch_tool was used, False if not
    "Explanation": "Explanation of the grading and search decision",
    "Answer": "Your detailed answer based on the available context"
}
"""

In [6]:
import nest_asyncio

nest_asyncio.apply()

groq_agent = Agent(
    groq_model,
    deps_type=Deps,
    retries=2,
    result_type=str,
    system_prompt=system_prompt,
    end_strategy="early",  # This is the default but being explicit
)

## 7,8.Function Tools - Web search

In [7]:
from tavily import TavilyClient


@groq_agent.tool_plain
async def websearch_tool(question: str) -> str:
    """Search the web for information"""
    try:
        tavily_client = TavilyClient()
        answer = tavily_client.qna_search(query=question)
        print(
            f"Web search result: {str(answer)[:200]}..."
        )  # Print first 200 chars for debugging
        return str(answer)
    except Exception as e:
        print(f"Error in websearch_tool: {e}")
        return ""

In [8]:
# question = "what time is the match between intermiami, new york. is messi playing"

# tavily_client = TavilyClient()

# # Step 2. Executing a Q&A search query
# answer = tavily_client.qna_search(query=question)

# # Step 3. That's it! Your question has been answered!
# print(f"WEB SEARCH:{answer}")

## 9. Create Retriever Tool

In [9]:
persist_directory

'../data/chroma_langchain_db'

In [10]:
from pydantic_ai import RunContext
from typing import List


@groq_agent.tool
async def retriever_tool(ctx: RunContext[Deps], question: str) -> str:
    """Retrieve relevant documents from the vector store"""
    try:
        load_vectorstore = Chroma(
            persist_directory=persist_directory,
            embedding_function=embedding,
            collection_name="RAG_vectorstore",
        )
        retrieved_docs = load_vectorstore.similarity_search(query=question, k=3)
        context = "\n".join(doc.page_content for doc in retrieved_docs)
        print(
            f"Retrieved context: {context[:200]}..."
        )  # Print first 200 chars for debugging
        return context
    except Exception as e:
        print(f"Error in retriever_tool: {e}")
        return ""

## 10. Results — Invoke the agent

In [11]:
# # question: what is a RAG
# query = "What is RAG?"
# response = groq_agent.run_sync(
#     # Deps(question=query, query=query)
#     # Deps(question=query, query=query, context=None, response=None)
# )
# print(response)

In [12]:
query = "how many types of RAG are available in retrieval augmented generation? how can we use Agentic RAG in production?"
try:
    # The run_sync method expects a string prompt, not a Deps object
    response = groq_agent.run_sync(
        user_prompt=query,  # Pass the query string directly
        deps=Deps(  # Pass Deps as a separate argument
            question=query,
            query=query,
            context="",
            response="",
            content=query,
            message=query,
        ),
    )
    print("Response:", response)
except Exception as e:
    print(f"Error during agent execution: {e}")
    import traceback

    traceback.print_exc()

Retrieved context: The design also incorporates the feedback from a diverse group of partici-
pants during a workshop session, which focused on the practical aspects of imple-
menting RAG systems. Their input highlighte...
Web search result: Retrieval Augmented Generation (RAG) encompasses various techniques that enhance language models by integrating external knowledge sources. Traditional RAG models retrieve relevant information from a ...
Response: RunResult(_all_messages=[ModelRequest(parts=[SystemPromptPart(content='You are a Helpful Assistant proficient in providing concise, factual, and to-the-point answers for questions about RAG based on provided context.\n\nWhen answering questions:\n1. First use the `retriever_tool` to get relevant context from the knowledge base\n2. If the retrieved context is insufficient, use the `websearch_tool` to get additional information\n3. Generate a comprehensive response based on all available context\n\nYour response should be in JSON format wit

In [13]:
from IPython.display import Markdown

Markdown(str(response))

RunResult(_all_messages=[ModelRequest(parts=[SystemPromptPart(content='You are a Helpful Assistant proficient in providing concise, factual, and to-the-point answers for questions about RAG based on provided context.\n\nWhen answering questions:\n1. First use the `retriever_tool` to get relevant context from the knowledge base\n2. If the retrieved context is insufficient, use the `websearch_tool` to get additional information\n3. Generate a comprehensive response based on all available context\n\nYour response should be in JSON format with the following structure:\n{\n    "Relevancy": <score 0-1>,\n    "Faithfulness": <score 0-1>,\n    "Context Quality": <score 0-1>,\n    "Needs Web Search": True,  # Set to True when websearch_tool was used, False if not\n    "Explanation": "Explanation of the grading and search decision",\n    "Answer": "Your detailed answer based on the available context"\n}\n', dynamic_ref=None, part_kind='system-prompt'), UserPromptPart(content='how many types of RAG are available in retrieval augmented generation? how can we use Agentic RAG in production?', timestamp=datetime.datetime(2025, 2, 22, 22, 27, 15, 970896, tzinfo=datetime.timezone.utc), part_kind='user-prompt')], kind='request'), ModelResponse(parts=[ToolCallPart(tool_name='retriever_tool', args='{"question": "types of RAG in retrieval augmented generation and Agentic RAG in production"}', tool_call_id='call_vccx', part_kind='tool-call')], model_name='llama-3.3-70b-versatile', timestamp=datetime.datetime(2025, 2, 22, 22, 27, 16, tzinfo=datetime.timezone.utc), kind='response'), ModelRequest(parts=[ToolReturnPart(tool_name='retriever_tool', content='The design also incorporates the feedback from a diverse group of partici-\npants during a workshop session, which focused on the practical aspects of imple-\nmenting RAG systems. Their input highlighted the effectiveness of the system’s\nreal-time retrieval capabilities, particularly in knowledge-intensive domains, and\nunderscored the importance of refining the integration between retrieval and\ngeneration to enhance the transparency and reliability of the system’s outputs.\nThe design also incorporates the feedback from a diverse group of partici-\npants during a workshop session, which focused on the practical aspects of imple-\nmenting RAG systems. Their input highlighted the effectiveness of the system’s\nreal-time retrieval capabilities, particularly in knowledge-intensive domains, and\nunderscored the importance of refining the integration between retrieval and\ngeneration to enhance the transparency and reliability of the system’s outputs.\nThe design also incorporates the feedback from a diverse group of partici-\npants during a workshop session, which focused on the practical aspects of imple-\nmenting RAG systems. Their input highlighted the effectiveness of the system’s\nreal-time retrieval capabilities, particularly in knowledge-intensive domains, and\nunderscored the importance of refining the integration between retrieval and\ngeneration to enhance the transparency and reliability of the system’s outputs.', tool_call_id='call_vccx', timestamp=datetime.datetime(2025, 2, 22, 22, 27, 16, 614098, tzinfo=datetime.timezone.utc), part_kind='tool-return')], kind='request'), ModelResponse(parts=[ToolCallPart(tool_name='websearch_tool', args='{"question": "types of RAG in retrieval augmented generation and Agentic RAG in production"}', tool_call_id='call_ks5k', part_kind='tool-call')], model_name='llama-3.3-70b-versatile', timestamp=datetime.datetime(2025, 2, 22, 22, 27, 17, tzinfo=datetime.timezone.utc), kind='response'), ModelRequest(parts=[ToolReturnPart(tool_name='websearch_tool', content='Retrieval Augmented Generation (RAG) encompasses various techniques that enhance language models by integrating external knowledge sources. Traditional RAG models retrieve relevant information from a dataset and generate responses using a language model. Agentic RAG, however, includes a decision-making component that allows the system to identify additional sources, prioritize information, or initiate new queries based on user input. Types of RAG include Simple RAG, where the model retrieves information and generates responses, and Speculative RAG, which generates multiple responses and combines them for a unified view. Agentic RAG is particularly useful in production for personalized interactions and autonomous AI agents that can deliver context-aware responses.', tool_call_id='call_ks5k', timestamp=datetime.datetime(2025, 2, 22, 22, 27, 19, 592618, tzinfo=datetime.timezone.utc), part_kind='tool-return')], kind='request'), ModelResponse(parts=[TextPart(content='{\n    "Relevancy": 0.8,\n    "Faithfulness": 0.7,\n    "Context Quality": 0.9,\n    "Needs Web Search": true,\n    "Explanation": "The initial search using retriever_tool did not provide sufficient information, requiring a web search to gather more context on types of RAG and Agentic RAG in production.",\n    "Answer": "There are several types of RAG, including Simple RAG and Speculative RAG. Agentic RAG can be used in production for personalized interactions and autonomous AI agents. It enhances the language model\'s ability to identify additional sources, prioritize information, or initiate new queries based on user input, making it particularly useful for delivering context-aware responses."\n}', part_kind='text')], model_name='llama-3.3-70b-versatile', timestamp=datetime.datetime(2025, 2, 22, 22, 27, 19, tzinfo=datetime.timezone.utc), kind='response')], _new_message_index=0, data='{\n    "Relevancy": 0.8,\n    "Faithfulness": 0.7,\n    "Context Quality": 0.9,\n    "Needs Web Search": true,\n    "Explanation": "The initial search using retriever_tool did not provide sufficient information, requiring a web search to gather more context on types of RAG and Agentic RAG in production.",\n    "Answer": "There are several types of RAG, including Simple RAG and Speculative RAG. Agentic RAG can be used in production for personalized interactions and autonomous AI agents. It enhances the language model\'s ability to identify additional sources, prioritize information, or initiate new queries based on user input, making it particularly useful for delivering context-aware responses."\n}', _result_tool_name=None, _usage=Usage(requests=3, request_tokens=2251, response_tokens=216, total_tokens=2467, details=None))

## 11. langchain JSON output parser

In [14]:
print(response.data)

{
    "Relevancy": 0.8,
    "Faithfulness": 0.7,
    "Context Quality": 0.9,
    "Needs Web Search": true,
    "Explanation": "The initial search using retriever_tool did not provide sufficient information, requiring a web search to gather more context on types of RAG and Agentic RAG in production.",
    "Answer": "There are several types of RAG, including Simple RAG and Speculative RAG. Agentic RAG can be used in production for personalized interactions and autonomous AI agents. It enhances the language model's ability to identify additional sources, prioritize information, or initiate new queries based on user input, making it particularly useful for delivering context-aware responses."
}


In [15]:
from langchain_core.output_parsers import JsonOutputParser

parser = JsonOutputParser()
print(parser.parse(response.data), end="\n\n")
print(parser.parse(response.data)["Answer"])

{'Relevancy': 0.8, 'Faithfulness': 0.7, 'Context Quality': 0.9, 'Needs Web Search': True, 'Explanation': 'The initial search using retriever_tool did not provide sufficient information, requiring a web search to gather more context on types of RAG and Agentic RAG in production.', 'Answer': "There are several types of RAG, including Simple RAG and Speculative RAG. Agentic RAG can be used in production for personalized interactions and autonomous AI agents. It enhances the language model's ability to identify additional sources, prioritize information, or initiate new queries based on user input, making it particularly useful for delivering context-aware responses."}

There are several types of RAG, including Simple RAG and Speculative RAG. Agentic RAG can be used in production for personalized interactions and autonomous AI agents. It enhances the language model's ability to identify additional sources, prioritize information, or initiate new queries based on user input, making it parti

In [20]:
query = "What is RAG in AI/ML?"
response = groq_agent.run_sync(query)
print(response.data)
print(parser.parse(response.data))
print(parser.parse(response.data)["Answer"])

Retrieved context: mentioned. This contribution is practical, as it helps practitioners implement
RAG models to address real world challenges with dynamic data and improved
accuracy. The guide provides users clear, acti...
{"Relevancy": 0.8, "Faithfulness": 0.7, "Context Quality": 0.6, "Needs Web Search": false, "Explanation": "The retrieved context mentions RAG models and their application in real-world challenges, but does not provide a clear definition or explanation of RAG in AI/ML. The context is somewhat relevant but lacks clarity and detail.", "Answer": "RAG in AI/ML refers to Retrieval-Augmented Generation models, which are a type of artificial intelligence model that combines retrieval and generation capabilities to improve the accuracy and effectiveness of natural language processing tasks. These models are designed to retrieve relevant information from a knowledge base or database and use that information to generate more accurate and informative responses to user queries. R

In [29]:
query = "Where is Louisville city Located?"
response = groq_agent.run_sync(query)
print(response.data)
# print(parser.parse(response.data))
# print(parser.parse(response.data)["Answer"])

Retrieved context: You are an expert assistant with access to the
following context extracted from documents . Your
job is to answer the user ’s question as accurately
as possible , using the context below .
Context :
{...
Retrieved context: You are an expert assistant with access to the
following context extracted from documents . Your
job is to answer the user ’s question as accurately
as possible , using the context below .
Context :
{...
Web search result: Louisville is a city in Jefferson County, Kentucky, situated on the banks of the Ohio River. It is located at the Falls of the Ohio River and serves as the largest city in Kentucky and the seat of Jef...
{
    "Relevancy": 0.8,
    "Faithfulness": 0.9,
    "Context Quality": 0.7,
    "Needs Web Search": true,
    "Explanation": "The information about Louisville city location was not available in the provided context, so a web search was performed to find the answer.",
    "Answer": "Louisville city is located in Jefferson County,