<a href="https://colab.research.google.com/github/Anirudho747/Edrk_Google-Collab/blob/main/13_Agentic_RAG_(PDF_%2B_Web_fallback).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:

# Single Colab cell: Final Agentic RAG (PDF + Web fallback) — copy-paste & run
# -------------------------------------------------------------------------
# (If you haven't installed dependencies yet, this will install them. If already installed, it's fine.)
!pip install -q chromadb langchain pypdf gradio langchain-community
!pip install -q google-generativeai langchain-google-genai
!pip install -q sentence-transformers
!pip install -q google-search-results serpapi

# -------------------------------------------------------------------------
# Imports & robust SerpApi client selection
# -------------------------------------------------------------------------
import os, json, uuid, traceback, time
from google.colab import userdata, files

# Bind a `GoogleSearch` client class to the single name `SerpGoogleSearch`.
# Three import paths are tried in order; the first that imports wins and is
# announced with a print. If all three fail, an ImportError carrying the message
# of the last failure is raised, which stops the rest of the cell.
# NOTE(review): the `google-search-results` distribution presumably exposes its
# client as the `serpapi` module, not `google_search_results` — confirm whether
# the first attempt can ever succeed.
try:
    from google_search_results import GoogleSearch as SerpGoogleSearch
    print("Using google_search_results.GoogleSearch")
except Exception:
    # Broad catch: any failure while importing simply moves on to the next candidate.
    try:
        from serpapi import GoogleSearch as SerpGoogleSearch
        print("Using serpapi.GoogleSearch")
    except Exception:
        try:
            from serpapi.google_search_results import GoogleSearch as SerpGoogleSearch
            print("Using serpapi.google_search_results.GoogleSearch")
        except Exception as e:
            # Only the innermost error text is included in the raised message.
            raise ImportError("Could not import GoogleSearch client. Install google-search-results or serpapi. Error: " + str(e))

# LangChain / LLM / helper imports
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma
from langchain.chains import RetrievalQA
from langchain.schema import Document, HumanMessage
import gradio as gr

# -------------------------------------------------------------------------
# API keys: optionally use Colab userdata or set manually before running
# -------------------------------------------------------------------------
def _read_colab_secret(name):
    """Return the Colab secret `name`, or None when it cannot be read.

    `userdata.get` is wrapped so that a missing secret (or one this notebook has
    not been granted access to) does not abort the cell; the caller then falls
    back to whatever is already in `os.environ`.
    """
    try:
        return userdata.get(name)
    except Exception as e:
        # Broad on purpose: any lookup failure just means "not provided via Colab".
        print(f"Note: Colab secret {name} not available ({type(e).__name__}); using os.environ if set.")
        return None


# Copy each secret into the environment once (a single lookup per key).
for _secret_name in ("GOOGLE_API_KEY", "SERPAPI_API_KEY"):
    _secret_value = _read_colab_secret(_secret_name)
    if _secret_value:
        os.environ[_secret_name] = _secret_value

if not os.environ.get("GOOGLE_API_KEY"):
    print("Warning: GOOGLE_API_KEY not set. Set it before running if the Gemini LLM calls fail.")
if not os.environ.get("SERPAPI_API_KEY"):
    print("Warning: SERPAPI_API_KEY not set. Web fallback will be disabled until you set it.")

# -------------------------------------------------------------------------
# 1) Upload & load PDF (single upload)
# -------------------------------------------------------------------------
print("Upload your PDF now (Colab will prompt you).")
uploaded = files.upload()
if not uploaded:
    raise FileNotFoundError("No file uploaded. Re-run cell and upload the PDF.")
# Prefer an upload whose name ends in .pdf; otherwise keep the old behaviour of
# taking the first uploaded file and letting the loader decide.
pdf_filename = next(
    (name for name in uploaded if name.lower().endswith(".pdf")),
    next(iter(uploaded)),
)
print("Uploaded PDF:", pdf_filename)

loader = PyPDFLoader(pdf_filename)
pages = loader.load()
print(f"Loaded {len(pages)} pages from PDF.")

# Split into overlapping chunks (1000 characters, 200 overlap)
splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
docs = splitter.split_documents(pages)
print(f"Split into {len(docs)} chunks.")
if not docs:
    # Fail here with a clear message instead of passing an empty list on to indexing.
    raise ValueError(
        f"No text chunks could be produced from {pdf_filename!r}. "
        "The PDF may contain no extractable text (e.g. scanned images)."
    )

# -------------------------------------------------------------------------
# 2) Embeddings + vectorstore (built once per run and reused for every query; no persist_directory is set)
# -------------------------------------------------------------------------
# Embed all chunks once and index them in a Chroma collection whose name gets a
# random 8-hex-character suffix, so each run uses its own collection name.
embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
)
_collection_suffix = uuid.uuid4().hex[:8]
vectorstore = Chroma.from_documents(
    docs,
    embeddings,
    collection_name="pdf_collection_" + _collection_suffix,
)
# The retriever is configured to return the 3 best-matching chunks per query.
retriever = vectorstore.as_retriever(search_kwargs=dict(k=3))
print("Persistent vectorstore created.")

# -------------------------------------------------------------------------
# 3) LLM + QA chain
# -------------------------------------------------------------------------
# temperature=0 is passed to the chat model; the QA chain pairs it with the retriever above.
llm = ChatGoogleGenerativeAI(temperature=0, model="gemini-2.5-pro")
qa_chain = RetrievalQA.from_chain_type(retriever=retriever, llm=llm)

# -------------------------------------------------------------------------
# 4) Robust extract & LLM-call helpers (avoid .content on str, use HumanMessage)
# -------------------------------------------------------------------------
def _content_part_to_text(part):
    """Return the text of one element of a multi-part message content, or None if it has none."""
    if isinstance(part, str):
        return part
    if isinstance(part, dict):
        text = part.get("text")
        return text if isinstance(text, str) else None
    text = getattr(part, "text", None)
    return text if isinstance(text, str) else None


def extract_text_from_llm_response(raw):
    """Return a plain string for many common LLM return shapes.

    Handled shapes, in order:
      * None -> "".
      * str -> returned unchanged.
      * dict -> the first string found under "output_text", "text", "answer" or
        "content"; otherwise the first string value; otherwise a JSON dump.
      * object with `.content` -> the string itself, or, for a list/tuple of
        parts, ALL text parts concatenated (plain strings, dicts with a string
        "text" entry, or objects with a string `.text`).
      * object with `.generations` -> `generations[0][0].text`.
      * object with `.text` -> that text.
      * anything else -> `str(raw)`.

    Never raises: on an unexpected error a "<unextractable: ...>" marker string
    is returned instead.
    """
    try:
        if raw is None:
            return ""
        if isinstance(raw, str):
            return raw
        if isinstance(raw, dict):
            for k in ("output_text", "text", "answer", "content"):
                if k in raw and isinstance(raw[k], str):
                    return raw[k]
            # pick first string value if any
            for v in raw.values():
                if isinstance(v, str):
                    return v
            # default=str keeps non-JSON-serialisable values from failing the dump
            return json.dumps(raw, default=str)
        # object with .content
        if hasattr(raw, "content"):
            content = getattr(raw, "content")
            if isinstance(content, str):
                return content
            if isinstance(content, (list, tuple)) and content:
                # Join every text part instead of returning only the first one.
                pieces = [t for t in map(_content_part_to_text, content) if t is not None]
                if pieces:
                    return "".join(pieces)
                # No text parts at all: keep the previous behaviour (repr of first part).
                return str(content[0])
            return str(content)
        # langchain-like .generations
        if hasattr(raw, "generations"):
            try:
                return raw.generations[0][0].text
            except Exception:
                return str(raw)
        # .text fallback
        if hasattr(raw, "text"):
            t = getattr(raw, "text")
            return t if isinstance(t, str) else str(t)
        return str(raw)
    except Exception as e:
        return f"<unextractable: {type(raw).__name__}: {e}>"

def call_llm_with_prompt(prompt):
    """
    Send `prompt` to the global `llm` and return the reply as a plain string.

    The call style is chosen by what the object offers: `generate` (given a
    list-of-lists holding one HumanMessage) is tried first, then `invoke`, and
    finally the object is called directly. Whatever comes back is reduced to
    text by `extract_text_from_llm_response`.

    This function does not raise: if anything fails, a string starting with
    "LLM call failed:" followed by the exception and its traceback is returned.
    """
    try:
        if hasattr(llm, "generate"):
            # generate expects a batch of message lists, hence the double brackets
            response = llm.generate([[HumanMessage(content=prompt)]])
        elif hasattr(llm, "invoke"):
            response = llm.invoke(prompt)
        else:
            response = llm(prompt)
        return extract_text_from_llm_response(response)
    except Exception as err:
        details = traceback.format_exc()
        return f"LLM call failed: {type(err).__name__}: {err}\n{details}"

# -------------------------------------------------------------------------
# 5) In-doc classifier (LLM decides if question is answerable from PDF snippets)
# -------------------------------------------------------------------------
def llm_is_in_doc(query, top_snippets):
    """Ask the LLM whether `query` can be answered from the given PDF snippets.

    Args:
        query: the user's question; it is JSON-encoded before being placed in the prompt.
        top_snippets: iterable of snippet strings (each truncated to 1200
            characters in the prompt); None or empty yields "No snippets found.".

    Returns:
        dict with keys "in_doc" (bool), "explanation" (str) and "raw" (the LLM
        reply text).

    Decision rules:
        1. A reply starting with "LLM call failed:" is an error report from
           `call_llm_with_prompt`, not a verdict, so it counts as not in-doc.
        2. If the text between the first "{" and the last "}" parses as a JSON
           object, its "in_doc" field decides.
        3. Otherwise an explicit `in_doc: true/false` fragment decides, and
           only if none is present do whole-word cues apply. Any negative cue
           wins, so ambiguous replies count as not in-doc.
    """
    import re  # local import: `re` is not among the notebook's top-level imports

    snippet_text = "\n\n---\n\n".join(
        f"SNIPPET {i+1}:\n{txt[:1200]}" for i, txt in enumerate(top_snippets or [])
    ) or "No snippets found."
    safe_query = json.dumps(query)
    prompt = f"""
You are an assistant whose job is to decide whether a user's question can be answered using only the provided PDF snippets.
User question: {safe_query}

Below are the top PDF snippets retrieved by a semantic retriever. Answer ONLY in strict JSON format with two fields:
- in_doc: true or false
- explanation: one-sentence explanation why.

Do NOT include any other text.

PDF SNIPPETS:
{snippet_text}
"""
    text = call_llm_with_prompt(prompt)

    # An error report may contain words like "true"; never read it as a verdict.
    if text.startswith("LLM call failed:"):
        return {"in_doc": False, "explanation": text, "raw": text}

    # try parse JSON
    start = text.find("{")
    end = text.rfind("}")
    if start != -1 and end > start:
        try:
            parsed = json.loads(text[start:end + 1])
        except ValueError:
            parsed = None
        if isinstance(parsed, dict):
            verdict = parsed.get("in_doc")
            in_doc = verdict is True or str(verdict).strip().lower() in ("true", "yes")
            return {"in_doc": in_doc, "explanation": parsed.get("explanation", ""), "raw": text}

    # fallback heuristics for replies that are not valid JSON
    low = text.lower()
    explicit = re.search(r'in_doc"?\s*[:=]\s*"?(true|false|yes|no)\b', low)
    if explicit:
        # The field name alone proves nothing; only the value next to it counts.
        in_doc = explicit.group(1) in ("true", "yes")
    else:
        affirmative = re.search(r"\b(?:true|yes|answerable|can be answered)\b", low)
        negative = re.search(r"\b(?:false|no|not|cannot|unanswerable)\b|n't", low)
        in_doc = bool(affirmative) and not negative
    return {"in_doc": in_doc, "explanation": text, "raw": text}

# -------------------------------------------------------------------------
# 6) SerpApi search helper (defensive)
# -------------------------------------------------------------------------
def serpapi_search(query, num=3):
    """Run a Google search through SerpApi and return up to `num` hits.

    Args:
        query: search string.
        num: maximum number of hits requested and returned.

    Returns:
        list of (title, snippet, link) string tuples. An empty list is returned
        when SERPAPI_API_KEY is unset, when the request fails, or when the
        response holds no usable results. Problems are printed, never raised,
        so the chatbot can answer gracefully without web results.
    """
    api_key = os.environ.get("SERPAPI_API_KEY")
    if not api_key:
        # web disabled — return empty so chatbot can reply gracefully
        print("serpapi_search: SERPAPI_API_KEY not set; skipping web search.")
        return []

    params = {"engine": "google", "q": query, "num": num, "api_key": api_key}
    try:
        result = SerpGoogleSearch(params).get_dict()
    except Exception as e:
        print(f"serpapi_search: query failed: {type(e).__name__}: {e}")
        return []

    if not isinstance(result, dict):
        print(f"serpapi_search: unexpected response type {type(result).__name__}; no results.")
        return []
    if result.get("error"):
        # Surface API-side failures (e.g. a rejected key) instead of hiding them.
        print(f"serpapi_search: API reported an error: {result['error']}")

    def _collect(key):
        """Extract (title, snippet, link) tuples from result[key], skipping malformed entries."""
        entries = result.get(key) or []
        if not isinstance(entries, (list, tuple)):
            print(f"serpapi_search: ignoring non-list '{key}' in response.")
            return []
        collected = []
        for entry in entries[:num]:
            if not isinstance(entry, dict):
                continue  # one bad entry should not discard the remaining ones
            title = entry.get("title") or ""
            snippet = entry.get("snippet") or entry.get("snippet_html") or ""
            link = entry.get("link") or entry.get("displayed_link") or ""
            collected.append((title, snippet, link))
        return collected

    # "top_results" is consulted only when "organic_results" yields nothing.
    return _collect("organic_results") or _collect("top_results")

# -------------------------------------------------------------------------
# 7) Prompt-based answer synthesis (no per-query Chroma rebuild)
# -------------------------------------------------------------------------
def answer_from_context(query, pdf_snips, web_hits):
    """Build one grounded prompt from PDF and web snippets and return the LLM's answer.

    Args:
        query: the user's question, inserted verbatim between triple quotes.
        pdf_snips: iterable of PDF snippet strings; each is cut to 1200 characters
            and labelled PDF_SNIPPET 1, 2, ...
        web_hits: iterable of (title, snippet, url) tuples, labelled WEB_1, WEB_2, ...

    Returns:
        The string produced by `call_llm_with_prompt` for the assembled prompt.
    """
    pdf_entries = []
    for number, snippet in enumerate(pdf_snips, start=1):
        pdf_entries.append(f"PDF_SNIPPET {number}:\n{snippet[:1200]}")

    web_entries = []
    for number, (title, snippet, url) in enumerate(web_hits, start=1):
        web_entries.append(f"WEB_{number}: Title: {title}\nSnippet: {snippet}\nURL: {url}")

    pdf_context = "\n\n".join(pdf_entries)
    web_context = "\n\n".join(web_entries)
    # Blank sections are dropped so no stray separators end up in the prompt.
    sources = "\n\n".join(section for section in (pdf_context, web_context) if section.strip())

    prompt = f"""
You are an assistant that must answer the user's question using ONLY the information in the provided sources (PDF snippets and web snippets).
Cite sources inline using [PDF_SNIPPET #] or [WEB_#] when you reference them. If the answer cannot be found in the sources, say 'I don't know based on the given sources.'

User question:
\"\"\"{query}\"\"\"

SOURCES:
{sources}

Now produce a clear, concise answer and include the source references used.
"""
    return call_llm_with_prompt(prompt)

# -------------------------------------------------------------------------
# 8) Utility: get top pdf snippets from persistent retriever
# -------------------------------------------------------------------------
def get_top_pdf_snippets(query, k=3):
    """Return the text of at most `k` PDF chunks retrieved for `query`.

    Uses the global `retriever`. `invoke` is preferred because
    `get_relevant_documents` emits a deprecation warning on current LangChain;
    the old method is kept as a fallback for retrievers that lack `invoke`.

    Returns:
        list of `page_content` strings; empty when nothing is retrieved.
    """
    if hasattr(retriever, "invoke"):
        docs_local = retriever.invoke(query)
    else:
        docs_local = retriever.get_relevant_documents(query)
    # `or []` guards against a retriever that returns None instead of a list.
    return [d.page_content for d in (docs_local or [])[:k]]

# -------------------------------------------------------------------------
# 9) Chatbot handler (robust)
# -------------------------------------------------------------------------
def chatbot(query):
    """Answer one user question; this is the function wired into the Gradio UI.

    Pipeline:
        1. Reject empty or non-string input with a short hint.
        2. Retrieve the top 3 PDF snippets.
        3. Ask `llm_is_in_doc` whether those snippets can answer the question.
           If so, answer with the retrieval QA chain; if the chain fails, fall
           back to a prompt built from the snippets alone.
        4. Otherwise search the web; with no hits, return an "I don't know"
           message that includes the classifier's explanation.
        5. With hits, synthesize an answer from PDF and web snippets together.

    Returns:
        The answer text. Any unexpected exception is reported as an
        "Error in chatbot pipeline: ..." string rather than raised.
    """
    try:
        if not isinstance(query, str) or not query.strip():
            return "Please type a question and press Enter."

        query = query.strip()

        # 1) Get top pdf snippets
        top_snips = get_top_pdf_snippets(query, k=3)

        # 2) LLM decides whether in-doc
        check = llm_is_in_doc(query, top_snips)
        if check.get("in_doc"):
            try:
                if hasattr(qa_chain, "invoke"):
                    # `run` is deprecated; `invoke` takes the "query" input and
                    # returns a mapping with the answer under "result".
                    output = qa_chain.invoke({"query": query})
                    return output["result"] if isinstance(output, dict) else output
                return qa_chain.run(query)
            except Exception as e:
                # Report the failure, then fall back to prompt-based synthesis from pdf snippets.
                print(f"chatbot: QA chain failed ({type(e).__name__}: {e}); using snippet-based answer.")
                return answer_from_context(query, top_snips, [])

        # 3) Out-of-doc: web fallback
        web_hits = serpapi_search(query, num=3)
        if not web_hits:
            return f"I don't know based on the given sources. (LLM decision: {check.get('explanation')})"

        # 4) Synthesize answer from pdf + web snippets
        return answer_from_context(query, top_snips, web_hits)

    except Exception as e:
        return f"Error in chatbot pipeline: {type(e).__name__}: {e}"

# -------------------------------------------------------------------------
# 10) Gradio UI: queued and disable flagging
# -------------------------------------------------------------------------
# Input and output widgets are built first so the Interface call stays short.
question_box = gr.Textbox(
    label="Ask HR Assistant a question",
    lines=3,
    placeholder="Type your HR question here...",
)
answer_box = gr.Textbox(label="Answer", lines=12)

# Single-function UI around `chatbot`, with flagging switched off.
demo = gr.Interface(
    fn=chatbot,
    inputs=question_box,
    outputs=answer_box,
    title="AI-Powered HR Assistant (PDF + Web fallback)",
    allow_flagging="never",
)

print("Launching Gradio app. Upload complete; the UI will appear below with a public link while the Colab session is active.")
# share=True asks Gradio for a public link in addition to the inline UI.
demo.launch(share=True)
# -------------------------------------------------------------------------




In [2]:
!pip install -q chromadb langchain pypdf gradio langchain-community
!pip install -q google-generativeai langchain-google-genai
!pip install -q sentence-transformers
!pip install -q serpapi


  docs_local = retriever.get_relevant_documents(query)
  return qa_chain.run(query)


In [3]:
import traceback, sys

# Environment check: try importing each package the app depends on and report
# OK or the error (with traceback), then show which interpreter is running.
modules = [
    "serpapi",
    "langchain_community",
    "langchain_google_genai",
    "langchain",
    "chromadb",
    "sentence_transformers",
]
for module_name in modules:
    try:
        __import__(module_name)
    except Exception as err:
        print("ERROR:", module_name, type(err).__name__, err)
        traceback.print_exc()
    else:
        print("OK:", module_name)
print("Python:", sys.executable)

OK: serpapi
OK: langchain_community
OK: langchain_google_genai
OK: langchain
OK: chromadb
OK: sentence_transformers
Python: /usr/bin/python3


In [None]:
# Install the two SerpApi-related client distributions via the shell.
# NOTE(review): both presumably provide a top-level `serpapi` module — confirm
# that installing them side by side is intended.
!pip install -q google-search-results
# The trailing `|| true` makes the shell line succeed even if this pinned install fails.
!pip install -q 'serpapi==2.1.0' || true


In [None]:
import os, sys
# Deliberately terminates the current Python process so that, per the message
# below, newly installed packages are picked up after the runtime comes back.
# Everything held in memory by this process is lost.
print("Restarting runtime to pick up newly installed packages...")
# Flush before the kill so the message above is not left sitting in a buffer.
sys.stdout.flush()
# Signal 9 (SIGKILL on Linux) is sent to this very process; nothing after this line runs.
os.kill(os.getpid(), 9)


Restarting runtime to pick up newly installed packages...


In [None]:
# Diagnostic: probe the three known import paths for a GoogleSearch client, in
# order of preference, stopping at the first that imports. Each failure is
# printed; `worked` records whether any path succeeded.
worked = False

try:
    from google_search_results import GoogleSearch as GSR_GoogleSearch
except Exception as e:
    print("google_search_results import failed:", type(e).__name__, e)
else:
    print("OK: google_search_results.GoogleSearch (preferred).")
    worked = True

if worked is False:
    try:
        from serpapi import GoogleSearch as SerpGoogleSearch
    except Exception as e:
        print("serpapi.GoogleSearch import failed:", type(e).__name__, e)
    else:
        print("OK: serpapi.GoogleSearch")
        worked = True

if worked is False:
    try:
        from serpapi.google_search_results import GoogleSearch as SerpGoogleSearchAlt
    except Exception as e:
        print("serpapi.google_search_results import failed:", type(e).__name__, e)
    else:
        print("OK: serpapi.google_search_results.GoogleSearch (alternate).")
        worked = True

final_status = "OK" if worked else "FAILED"
print("Final status:", final_status)
