<a href="https://colab.research.google.com/github/Anirudho747/Edrk_Google-Collab/blob/main/Agentic_Rag_%2B_Crew_AI_%2B_SerpAPI.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# ---------------------------------------------------------
# CELL 1: Install libraries for CrewAI + Gemini + SerpAPI +
#         Hugging Face Embeddings + FAISS + PDF loader
# ---------------------------------------------------------
# Notes:
# - We pin requests==2.32.5 for compatibility with crewai-tools (warnings about Colab's 2.32.4 are fine to ignore).
# - We use langchain-huggingface to avoid deprecation warnings for HF embeddings in langchain_community.
# - We install google-search-results (SerpAPI official client).
# - We add readability + bs4 in case you later want a custom scraper fallback.

# IPython magic: installs the packages below into the environment of the running
# kernel; -q keeps pip's output quiet.
# NOTE(review): chromadb is installed here but no cell shown in this notebook
# imports it — confirm whether it is still needed.
%pip install -q \
  requests==2.32.5 \
  "crewai>=0.65.0" "crewai-tools>=1.1.0" \
  "langchain==0.3.27" "langchain-core==0.3.79" "langchain-community==0.3.27" \
  langchain-huggingface \
  google-search-results \
  faiss-cpu chromadb pypdf python-dotenv sentence-transformers \
  beautifulsoup4 readability-lxml lxml-html-clean

# ---------------------------------------------------------
# CELL 2: Imports + Key loading + LLM + Embeddings + Search
# ---------------------------------------------------------
import os, re, textwrap
from dotenv import load_dotenv

# CrewAI core
from crewai import Agent, Task, Crew, LLM

# Tool base class (for custom tools) and built-in scraper
from crewai.tools import BaseTool                  # Base class for your own tools
from crewai_tools import ScrapeWebsiteTool        # Built-in website scraper (your choice C1)

# LangChain utilities for RAG
from langchain_huggingface import HuggingFaceEmbeddings  # Current, non-deprecated HF embeddings in LC
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.vectorstores import FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter

# SerpAPI (official client library)
from serpapi import GoogleSearch

# --- Keys ---
# Preferred: upload a .env file (with GEMINI and SERPAPI_API_KEY); load_dotenv()
# reads it so the values become visible through os.environ.
load_dotenv()

# SECURITY: keys come from the environment ONLY. Never pass a real key as the
# fallback default of os.environ.get(): it ends up in version control and it
# makes the "key is missing" checks below impossible to trigger.
# Any key that was ever committed in this notebook must be considered leaked:
# revoke/rotate it with the provider.
GEMINI = os.environ.get("GEMINI")
SERPAPI_API_KEY = os.environ.get("SERPAPI_API_KEY")
# Explicit raises instead of assert: asserts are stripped under `python -O`.
if not GEMINI:
    raise RuntimeError("Set GEMINI key in env or Cell 0")
if not SERPAPI_API_KEY:
    raise RuntimeError("Set SERPAPI_API_KEY in env or Cell 0")

# --- LLM ---
# A1: Gemini 1.5 Flash (fast & cheap). Swap to "gemini/gemini-1.5-pro" for more reasoning depth.
# NOTE(review): confirm this model id is still served by the Gemini API before relying on it.
llm = LLM(
    model="gemini/gemini-1.5-flash",
    api_key=GEMINI,
    temperature=0.2,   # low temp = more deterministic, better for tools & summaries
    max_tokens=800     # increase to 1200–1600 for longer summaries
)

# --- Embeddings ---
# D1: SBERT (MiniLM) is a great default for semantic search (speed + quality).
# Alternatives:
#  - Google "text-embedding-004" (API cost, good quality)
#  - OpenAI "text-embedding-3-small/large" (API cost, good quality)
emb = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# --- SerpAPI helper ---
# Simple function to call SerpAPI and return organic result objects
def serp_search_results(query: str, num: int = 5):
    """Run a Google search through SerpAPI and return its organic results.

    Args:
        query: Search string sent as the ``q`` parameter.
        num: Requested number of results; coerced to int and clamped to 1..10.

    Returns:
        The ``organic_results`` list from the response, or ``[]`` when the
        response is empty or has no (or an empty) ``organic_results`` entry.
    """
    # Clamp into [1, 10]: upper bound first, then lower bound.
    capped = min(int(num), 10)
    if capped < 1:
        capped = 1

    request = dict(
        engine="google",
        q=query,
        num=capped,
        api_key=SERPAPI_API_KEY,
        hl="en",
    )
    payload = GoogleSearch(request).get_dict()
    if not payload:
        return []

    organic = payload.get("organic_results", [])
    return organic if organic else []

# ---------------------------------------------------------
# CELL 3: SerpAPI Tool (CrewAI BaseTool)
# ---------------------------------------------------------
# Why: CrewAI's Task(tools=[...]) expects subclasses of BaseTool with:
#  - a Pydantic args_schema describing inputs
#  - an implementation of _run(...)
# This gives validation, logging, and consistent tool calling.

from pydantic import BaseModel, Field

class SerpInput(BaseModel):
    # Argument schema for the SerpAPI search tool.
    # (Documented with comments on purpose: no docstring is added so the
    # generated model schema stays exactly as before.)

    # Required free-text query.
    query: str = Field(default=..., description="Google search query")
    # Optional result count, validated to the inclusive range 1..10.
    num: int = Field(
        default=5,
        ge=1,
        le=10,
        description="Number of results to return (1-10)",
    )

class SerpAPISearchTool(BaseTool):
    # CrewAI tool that wraps serp_search_results() and renders the hits as a
    # bullet list followed by a BEST_URL line.
    name: str = "SerpAPI Search"
    description: str = (
        "Search Google via SerpAPI and return 3–5 bullets with title, link, and snippet, "
        "plus a final line BEST_URL: <url>."
    )
    args_schema: type[BaseModel] = SerpInput

    def _run(self, query: str, num: int = 5) -> str:
        """Search for *query* and format at most *num* hits as text.

        Returns "No results." when the search yields nothing; otherwise one
        three-line bullet per hit (title, link, snippet) and a closing
        ``BEST_URL:`` line carrying the link of the first hit.
        """
        hits = serp_search_results(query, num=num)
        if not hits:
            return "No results."

        # Missing fields fall back to an empty string.
        fields = (
            (hit.get("title", ""), hit.get("link", ""), hit.get("snippet", ""))
            for hit in hits[:num]
        )
        rendered = [
            f"- {title}\n  {link}\n  {snippet}"
            for title, link, snippet in fields
        ]

        # Heuristic: the top-ranked hit is reported as the best URL.
        top_link = hits[0].get('link', '')
        rendered.append(f"BEST_URL: {top_link}")
        return "\n".join(rendered)

# Instantiate tools (your choice C1)
# Built-in scraper; when a site blocks it, the scrape task prompt instructs the
# agent to answer with SCRAPE_SKIPPED instead of failing.
scrape_tool = ScrapeWebsiteTool()
# Custom SerpAPI-backed search tool defined above.
search_tool = SerpAPISearchTool()

# ---------------------------------------------------------
# CELL 4: Agents, Tasks, and Crew + PDF→FAISS RAG helpers
# ---------------------------------------------------------

# Agents: keep roles/goals simple and specific
web_search_agent = Agent(
    llm=llm,
    role="Web Researcher",
    goal="Find high-signal sources for the user's topic with links",
    backstory="Fast, precise, source-focused researcher.",
    allow_delegation=False,
    verbose=True,
)

web_scraper_agent = Agent(
    llm=llm,
    role="Web Scraper",
    goal="Scrape and summarize the selected page plainly and accurately",
    backstory="Detail-oriented content extractor.",
    allow_delegation=False,
    verbose=True,
)

# Search step. The output format is pinned (bullets, then a BEST_URL line) so
# the scrape step has a predictable place to read its URL from.
search_task = Task(
    agent=web_search_agent,
    tools=[search_tool],
    description=(
        "Search the web for '{topic}'. Return exactly 3–5 bullet points with title, link, and snippet.\n"
        "Then on a final line, output: BEST_URL: <the single best URL>."
    ),
    expected_output="3–5 bullets + final line 'BEST_URL: <url>'",
)

# Scrape step. A blocked or empty page must produce the literal sentinel
# SCRAPE_SKIPPED, which the routing code further down checks for.
scrape_task = Task(
    agent=web_scraper_agent,
    tools=[scrape_tool],
    description=(
        "From the previous output, extract the 'BEST_URL'. "
        "Use ScrapeWebsiteTool on that URL and summarize the main points in plain language for a beginner.\n"
        "If the site blocks scraping or returns empty text, DO NOT fail — instead output exactly: 'SCRAPE_SKIPPED'."
    ),
    expected_output="A concise summary of the page, or 'SCRAPE_SKIPPED' if blocked.",
)

# Two-step pipeline: search first, then scrape. Crew memory is switched off.
crew = Crew(
    tasks=[search_task, scrape_task],
    agents=[web_search_agent, web_scraper_agent],
    memory=False,
    verbose=1,
)

# -------------------------
# PDF → FAISS RAG helpers
# -------------------------
# Why FAISS: fast in-memory vector index; great for Colab.
# Alternatives: Chroma (local/simple), Qdrant/Milvus (server), LanceDB (local with file backing).

def build_vector_db(pdf_path: str, chunk_size: int = 1000, chunk_overlap: int = 80):
    """Load a PDF, split it into overlapping chunks and index them in FAISS.

    Args:
        pdf_path: Path to the PDF file on disk.
        chunk_size: Maximum characters per chunk (500–1500 is typical).
        chunk_overlap: Characters shared between neighbouring chunks so that
            context is preserved across split boundaries.

    Returns:
        A FAISS vector store built from the chunks with the module-level
        ``emb`` embedding model.

    Raises:
        FileNotFoundError: If *pdf_path* does not exist.
        ValueError: If the PDF yields no text chunks (e.g. a scanned,
            image-only document), since an index cannot be built from nothing.
    """
    if not os.path.exists(pdf_path):
        raise FileNotFoundError(f"PDF not found: {pdf_path}")

    pages = PyPDFLoader(pdf_path).load()  # one Document per page
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    chunks = splitter.split_documents(pages)

    # Fail with a clear message instead of letting the vector store crash on
    # an empty embedding list.
    if not chunks:
        raise ValueError(f"No extractable text found in PDF: {pdf_path}")

    return FAISS.from_documents(chunks, emb)

def retrieve_context(vdb, query: str, k: int = 5) -> str:
    """Return the text of the *k* chunks most similar to *query*.

    The chunks come from ``vdb.similarity_search`` and are joined with a blank
    line between them. A larger *k* trades extra noise for better recall.
    """
    hits = vdb.similarity_search(query, k=k)
    passages = [hit.page_content for hit in hits]
    return "\n\n".join(passages)

def can_answer_locally(query: str, context: str) -> bool:
    """Ask the LLM whether *context* is sufficient to answer *query*.

    Lightweight router for the local-vs-web decision. Only the first 4000
    characters of *context* are sent. Returns True when the model's reply
    starts with "y" (case-insensitive), False otherwise.

    Alternatives to an LLM router:
      - Heuristic: require a minimum token length or keyword match.
      - Confidence scoring via retriever distances and a threshold.
    """
    prompt = (
        "Answer strictly 'Yes' or 'No'.\n"
        "Does the following text contain enough information to answer the question?\n\n"
        f"Question: {query}\n\nText:\n{context[:4000]}\n"
    )
    # A crewai LLM instance is not callable; completions go through .call().
    reply = str(llm.call(prompt) or "")
    # The prompt shows the options in quotes, so tolerate a quoted/emphasised
    # reply such as "'Yes'" or "**Yes**".
    verdict = reply.strip().strip("'\"`*").lower()
    return verdict.startswith("y")

def answer_from_context(query: str, context: str) -> str:
    """Answer *query* using only *context* (first 16000 characters).

    The prompt forces a grounded answer and tells the model to say so plainly
    when the context is insufficient.

    Returns:
        The model's reply as a string.
    """
    prompt = (
        "Use ONLY the provided context to answer accurately. If insufficient, say so plainly.\n\n"
        f"Context:\n{context[:16000]}\n\nQuestion: {query}\n"
    )
    # A crewai LLM instance is not callable; completions go through .call().
    return str(llm.call(prompt))

# ---------------------------------------------------------
# CELL 5: Route: local PDF RAG first, web fallback second
# ---------------------------------------------------------
# Strategy:
# 1) Retrieve from your PDF (cheap, fast, private). If enough, answer and stop.
# 2) Otherwise, run Crew pipeline (search → scrape) and summarize.

def answer_query(query: str, vdb=None, web_fallback: bool = True):
    """Answer *query* from the local PDF index first, then from the web.

    Args:
        query: The user's question.
        vdb: Optional vector store; when falsy the local step is skipped.
        web_fallback: When False, never run the web pipeline.

    Returns:
        A dict with keys ``source`` ("local", "none" or "web"), ``context``
        (the text the answer was grounded on) and ``answer``.
    """
    # 1) Try local RAG
    local_ctx = retrieve_context(vdb, query, k=5) if vdb else ""
    if local_ctx and can_answer_locally(query, local_ctx):
        return {"source": "local", "context": local_ctx, "answer": answer_from_context(query, local_ctx)}

    if not web_fallback:
        return {"source": "none", "context": local_ctx, "answer": "Insufficient local context."}

    # 2) Web route (Crew): search + scrape
    try:
        result = crew.kickoff(inputs={"topic": query})
        # result.raw is only the FINAL task's output; collect every task's
        # output so the search bullets survive next to the scrape summary.
        task_texts = [
            str(task_out.raw)
            for task_out in (getattr(result, "tasks_output", None) or [])
            if getattr(task_out, "raw", None)
        ]
        if not task_texts:
            task_texts = [str(result.raw)]
    except Exception as exc:
        # Deliberate best-effort: if the crew run fails (e.g. the scraping
        # tool throws), degrade to the plain search bullets — but say so.
        print(f"[answer_query] Crew pipeline failed ({exc!r}); using search results only.")
        task_texts = [str(search_tool.run(query=query, num=5))]

    # If the scraper was blocked, drop that task's sentinel output and keep the
    # complete search output (bullets with their link/snippet lines + BEST_URL).
    usable = [text for text in task_texts if "SCRAPE_SKIPPED" not in text]
    web_ctx = "\n\n".join(usable)

    ans = answer_from_context(query, web_ctx)
    return {"source": "web", "context": web_ctx[:16000], "answer": ans}

# ---------------------------------------------------------
# CELL 6: Quick validation
# ---------------------------------------------------------
# Smoke test 1: embed a short string and report the vector length.
probe_vector = emb.embed_query("hello world")
print("Embedding length:", len(probe_vector))

# Smoke test 2: call the SerpAPI tool directly (tools take named arguments)
# and show the first 500 characters of its output.
search_preview = search_tool.run(query="Automation Testers Fired", num=7)
print(search_preview[:500], "...")

# Optional: Build FAISS from your PDF, then ask
# vdb = build_vector_db("/content/your.pdf")
# out = answer_query("What is Agentic RAG?", vdb=vdb, web_fallback=True)
# print("Source:", out["source"])
# print(out["answer"][:1200])

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Embedding length: 384
[96mUsing Tool: SerpAPI Search[0m
- What advice would you give to your colleagues who are ...
  https://www.reddit.com/r/webdev/comments/1ip6txq/what_advice_would_you_give_to_your_colleagues_who/
  I noticed that a lot of Test Automation Specialists are getting laid off. I know a lot of these folks have a bad reputation for being ...
- Why We Fired All Testers and What Happened Next
  https://www.linkedin.com/posts/shaikshoaibrehman_testers-dev-qa-activity-7376829052793442305-bs5x
  The Day Came, We Fired All Testers Management  ...
