In [28]:
!pip install gradio langchain langchain-community langchain-google-genai faiss-cpu unstructured

Collecting unstructured
  Downloading unstructured-0.18.20-py3-none-any.whl.metadata (25 kB)
Collecting python-magic (from unstructured)
  Downloading python_magic-0.4.27-py2.py3-none-any.whl.metadata (5.8 kB)
Collecting emoji (from unstructured)
  Downloading emoji-2.15.0-py3-none-any.whl.metadata (5.7 kB)
Collecting python-iso639 (from unstructured)
  Downloading python_iso639-2025.11.16-py3-none-any.whl.metadata (15 kB)
Collecting langdetect (from unstructured)
  Downloading langdetect-1.0.9.tar.gz (981 kB)
[2K     [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m981.5/981.5 kB[0m [31m27.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting rapidfuzz (from unstructured)
[0m  Downloading rapidfuzz-3.14.3-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (12 kB)
Collecting backoff (from unstructured)
  Downloading backoff-2.2.1-py3-none

In [29]:
# Imports
import os
from getpass import getpass

import gradio as gr
from bs4 import BeautifulSoup
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader, WebBaseLoader
from langchain_community.vectorstores import FAISS
from langchain_google_genai import ChatGoogleGenerativeAI, GoogleGenerativeAIEmbeddings

In [30]:
# LLM and Embeddings
# The API key must never be written into the notebook. It is read from the
# GOOGLE_API_KEY environment variable; when that is unset or empty, the user
# is asked for it through a hidden (non-echoing) prompt.
API_KEY = os.environ.get("GOOGLE_API_KEY") or getpass("Google API key: ")

# Chat model used to generate the answers.
llm = ChatGoogleGenerativeAI(
    model="gemini-2.5-flash",
    google_api_key=API_KEY,
    temperature=0.2
)

# Embedding model used to build and query the FAISS index.
embeddings = GoogleGenerativeAIEmbeddings(
    model="models/text-embedding-004",
    google_api_key=API_KEY
)

In [31]:
# Globals
# Module-level state shared by the processing and query functions. Both names
# stay None until process_pdf / process_url has loaded some content.
vectorstore = retrievalQA = None

In [32]:
# PromptTemplate
# Prompt handed to the QA chain. It has two placeholders, {context} and
# {question}, and instructs the model to answer only from the supplied content.
# The apostrophe in the fallback phrase was mis-encoded and is restored here.
PROMPT_TEMPLATE = PromptTemplate(
    template="""You are an expert assistant.
Answer questions clearly and concisely using ONLY the provided content.

Content:
{context}

Question: {question}

If the answer is not in the content, say "I don’t know.".""",
    input_variables=["context", "question"]
)

In [33]:
# Text Cleaning
def clean_url_text(content):
    """Return the visible text of ``content`` with whitespace normalised.

    ``content`` is parsed with BeautifulSoup's ``html.parser``. All
    ``script``, ``style``, ``header``, ``footer`` and ``nav`` elements are
    removed, the remaining text is extracted with a space between fragments,
    and every run of whitespace is collapsed to a single space.
    """
    parsed = BeautifulSoup(content, "html.parser")

    # Drop elements that carry code or page chrome rather than content.
    unwanted = parsed.find_all(["script", "style", "header", "footer", "nav"])
    for element in unwanted:
        element.decompose()

    words = parsed.get_text(separator=" ").split()
    return " ".join(words)

In [34]:
# Processing PDF
def process_pdf(file):
    """Index an uploaded PDF and (re)build the retrieval QA chain.

    The PDF is loaded page by page, split into 400-character chunks with a
    30-character overlap, and added to the shared FAISS ``vectorstore``
    (created on first use). ``retrievalQA`` is then rebuilt on top of the
    updated store.

    Args:
        file: The uploaded file. May be ``None``, a path string, an object
            with a ``name`` attribute holding the path, or a mapping with a
            ``"name"`` key.

    Returns:
        A human-readable status string. Failures are reported through the
        returned string as well; no exception is propagated.
    """
    global vectorstore, retrievalQA
    try:
        if file is None:
            return "⚠️ Please upload a PDF first."

        # Resolve the on-disk path. A plain string is used as-is so that a
        # bare path does not fall through to the mapping lookup and fail.
        if isinstance(file, str):
            file_path = file
        elif hasattr(file, "name"):
            file_path = file.name
        else:
            file_path = file["name"]

        pages = PyPDFLoader(file_path).load()
        if not pages:
            return "❌ PDF is empty or could not be read."

        splitter = RecursiveCharacterTextSplitter(chunk_size=400, chunk_overlap=30)
        docs = splitter.split_documents(pages)
        if not docs:
            # Pages were found but produced no text chunks. Indexing an empty
            # list would either fail with an unhelpful error or report a
            # success that added nothing.
            return "❌ PDF is empty or could not be read."

        # Compare against None explicitly: the store's truthiness is not a
        # documented way to tell "not created yet" from "created".
        if vectorstore is not None:
            vectorstore.add_documents(docs)
        else:
            vectorstore = FAISS.from_documents(docs, embeddings)

        retriever = vectorstore.as_retriever(search_type="similarity", search_kwargs={"k": 3})

        retrievalQA = RetrievalQA.from_chain_type(
            llm=llm,
            chain_type="stuff",
            retriever=retriever,
            return_source_documents=True,
            chain_type_kwargs={"prompt": PROMPT_TEMPLATE},
        )
        return "✅ PDF processed successfully! You can now ask questions."
    except Exception as e:
        # Surface any failure in the UI status box instead of raising.
        return f"❌ Error processing PDF: {e}"

In [35]:
# Processing URL
def process_url(url):
    """Index a web page and (re)build the retrieval QA chain.

    The page is loaded, its text is cleaned with ``clean_url_text``, split
    into 4000-character chunks with a 200-character overlap, and added to the
    shared FAISS ``vectorstore`` (created on first use). ``retrievalQA`` is
    then rebuilt on top of the updated store.

    Args:
        url: Address of the page to load. ``None``, an empty string, or a
            whitespace-only string is treated as "no URL given".

    Returns:
        A human-readable status string. Failures are reported through the
        returned string as well; no exception is propagated.
    """
    global vectorstore, retrievalQA
    try:
        # Normalise once so None is handled and stray whitespace around a
        # pasted URL does not reach the loader.
        url = (url or "").strip()
        if not url:
            return "⚠️ Please enter a URL first."

        loaded = WebBaseLoader(url).load()
        if not loaded:
            return "❌ Failed to load URL or empty content."

        cleaned = clean_url_text(loaded[0].page_content)
        doc = Document(page_content=cleaned, metadata={"source": url})

        splitter = RecursiveCharacterTextSplitter(chunk_size=4000, chunk_overlap=200)
        chunks = splitter.split_documents([doc])
        if not chunks:
            # Nothing left after cleaning. Indexing an empty list would
            # either fail with an unhelpful error or report a success that
            # added nothing.
            return "❌ Failed to load URL or empty content."

        # Compare against None explicitly: the store's truthiness is not a
        # documented way to tell "not created yet" from "created".
        if vectorstore is not None:
            vectorstore.add_documents(chunks)
        else:
            vectorstore = FAISS.from_documents(chunks, embeddings)

        retriever = vectorstore.as_retriever(search_type="similarity", search_kwargs={"k": 3})

        retrievalQA = RetrievalQA.from_chain_type(
            llm=llm,
            chain_type="stuff",
            retriever=retriever,
            return_source_documents=True,
            chain_type_kwargs={"prompt": PROMPT_TEMPLATE},
        )
        return "✅ URL processed successfully! You can now ask questions."
    except Exception as e:
        # Surface any failure in the UI status box instead of raising.
        return f"❌ Error processing URL: {e}"

In [36]:
# Query Handling
def ask_question(query):
    """Answer ``query`` using the currently loaded content.

    Args:
        query: The user's question. ``None``, an empty string, or a
            whitespace-only string is rejected without calling the chain.

    Returns:
        The chain's ``"result"`` text with surrounding whitespace removed, or
        a status string when no content has been loaded, the question is
        blank, or the chain raised an exception.
    """
    try:
        if retrievalQA is None:
            return "⚠️ Please upload a PDF or enter a URL first."

        # Do not spend a retrieval + LLM round trip on a blank question.
        question = (query or "").strip()
        if not question:
            return "⚠️ Please enter a question first."

        result = retrievalQA.invoke({"query": question})
        return result["result"].strip()
    except Exception as e:
        # Surface any failure in the answer box instead of raising.
        return f"❌ Error: {e}"

In [37]:
# Gradio UI
# Two-column layout: the left column loads content (PDF upload or URL) and
# shows a status line; the right column takes a question and shows the answer.
# The emoji in the heading was mis-encoded and is restored here.
with gr.Blocks() as demo:
    gr.Markdown("## 🌐 Unified PDF + Webpage RAG Assistant")
    gr.Markdown("Upload PDFs or enter webpage URLs. Then ask questions about all loaded content.")

    with gr.Row():
        with gr.Column():
            pdf_input = gr.File(label="Upload PDF")
            pdf_btn = gr.Button("Process PDF")
            url_input = gr.Textbox(label="Enter Webpage URL")
            url_btn = gr.Button("Process URL")
            status = gr.Textbox(label="Status")

        with gr.Column():
            query_input = gr.Textbox(label="Ask a Question")
            ask_btn = gr.Button("Ask")
            answer_output = gr.Textbox(label="Answer", lines=8)

    # Both loaders report into the same status box.
    pdf_btn.click(process_pdf, inputs=pdf_input, outputs=status)
    url_btn.click(process_url, inputs=url_input, outputs=status)
    ask_btn.click(ask_question, inputs=query_input, outputs=answer_output)

demo.launch()

It looks like you are running Gradio on a hosted Jupyter notebook, which requires `share=True`. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://5290aea5c26b026935.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


