In [20]:
import gradio as gr
import os
from langchain_community.document_loaders import PyMuPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain_community.embeddings import OllamaEmbeddings
import ollama
import re

In [21]:
def process_pdfs_from_folder(folder_path):
    if not os.path.exists(folder_path) or not os.path.isdir(folder_path):
        return None, None

    all_chunks = []
    
    # Iterate over all PDF files in the folder
    for file_name in os.listdir(folder_path):
        if file_name.endswith(".pdf"):
            pdf_path = os.path.join(folder_path, file_name)

            loader = PyMuPDFLoader(pdf_path)
            data = loader.load()

            text_splitter = RecursiveCharacterTextSplitter(
                chunk_size=500, chunk_overlap=100
            )
            chunks = text_splitter.split_documents(data)
            all_chunks.extend(chunks)  # Store all chunks

    if not all_chunks:
        return None, None  # No valid PDFs found

    # Create vector database with all PDFs' text
    embeddings = OllamaEmbeddings(model="deepseek-r1:1.5b")
    vectorstore = Chroma.from_documents(
        documents=all_chunks, embedding=embeddings, persist_directory="./chroma_db"
    )
    retriever = vectorstore.as_retriever()

    return vectorstore, retriever


In [22]:
def combine_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)

In [23]:
def ollama_llm(question, context):
    formatted_prompt = f"Question: {question}\n\nContext: {context}"

    response = ollama.chat(
        model="deepseek-r1:1.5b",
        messages=[{"role": "user", "content": formatted_prompt}],
    )

    response_content = response["message"]["content"]

    # Remove content between <think> and </think> tags
    final_answer = re.sub(r"<think>.*?</think>", "", response_content, flags=re.DOTALL).strip()

    return final_answer

In [24]:
def rag_chain(question, retriever):
    retrieved_docs = retriever.invoke(question)
    formatted_content = combine_docs(retrieved_docs)
    return ollama_llm(question, formatted_content)

### Creating Gradio interface

In [31]:
def ask_question(folder_path, question):
    if not folder_path or not os.path.exists(folder_path):
        return "Invalid folder path. Please enter a valid directory."

    vectorstore, retriever = process_pdfs_from_folder(folder_path)

    if retriever is None:
        return "No valid PDFs found in the folder."

    result = rag_chain(question, retriever)
    return result

interface = gr.Interface(
    fn=ask_question,
    inputs=[
        gr.FileExplorer(root_dir="./data", label="PDF Folder Path"),
        gr.Textbox(label="Ask a question"),
    ],
    outputs="text",
    title="Ask questions about PDFs in a folder",
    description="Enter the path to a folder containing PDFs and ask questions about them.",
)

interface.launch()

* Running on local URL:  http://127.0.0.1:7870

To create a public link, set `share=True` in `launch()`.




Traceback (most recent call last):
  File "c:\Users\feder\anaconda3\envs\llm\Lib\site-packages\gradio\queueing.py", line 625, in process_events
    response = await route_utils.call_process_api(
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\feder\anaconda3\envs\llm\Lib\site-packages\gradio\route_utils.py", line 322, in call_process_api
    output = await app.get_blocks().process_api(
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\feder\anaconda3\envs\llm\Lib\site-packages\gradio\blocks.py", line 2098, in process_api
    result = await self.call_function(
             ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\feder\anaconda3\envs\llm\Lib\site-packages\gradio\blocks.py", line 1645, in call_function
    prediction = await anyio.to_thread.run_sync(  # type: ignore
                 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\feder\anaconda3\envs\llm\Lib\site-packages\anyio\to_thread.py", line 56, in run_sync
    return await get