In [14]:
import re
import uuid

import gradio as gr
import ollama
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain_community.document_loaders import PyMuPDFLoader
from langchain_community.embeddings import OllamaEmbeddings


In [15]:
def process_pdf(
    pdf_path,
    chunk_size=500,
    chunk_overlap=100,
    embedding_model="deepseek-r1:1.5b",
    persist_directory=None,
):
    """Load a PDF, split it into chunks and index the chunks in a Chroma store.

    Args:
        pdf_path: Filesystem path of the PDF to index. A falsy value (None or
            an empty string) is treated as "nothing to process".
        chunk_size: Passed to RecursiveCharacterTextSplitter as ``chunk_size``.
        chunk_overlap: Passed to RecursiveCharacterTextSplitter as
            ``chunk_overlap``.
        embedding_model: Name of the Ollama model used to embed the chunks.
        persist_directory: Optional directory handed to Chroma. Left as None
            by default so the per-upload index is not written to disk.

    Returns:
        A ``(text_splitter, vectorstore, retriever)`` tuple, or
        ``(None, None, None)`` when ``pdf_path`` is falsy or the PDF produced
        no chunks (for example a document without extractable text).
    """
    if not pdf_path:
        return None, None, None

    loader = PyMuPDFLoader(pdf_path)  # expects a file path, not raw bytes
    data = loader.load()

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size, chunk_overlap=chunk_overlap
    )
    chunks = text_splitter.split_documents(data)

    # Nothing to embed: report failure through the same sentinel the caller
    # already checks instead of handing an empty batch to the vector store.
    if not chunks:
        return None, None, None

    embeddings = OllamaEmbeddings(model=embedding_model)

    # Each call gets its own collection. Without an explicit name every call
    # writes into the same default collection, so chunks from earlier uploads
    # (or duplicates of this one) would be returned by the retriever.
    collection_name = f"pdf-{uuid.uuid4().hex}"
    vectorstore = Chroma.from_documents(
        documents=chunks,
        embedding=embeddings,
        collection_name=collection_name,
        persist_directory=persist_directory,
    )
    retriever = vectorstore.as_retriever()

    return text_splitter, vectorstore, retriever

In [16]:
def combine_docs(docs):
    """Merge the text of several documents into a single context string.

    Args:
        docs: Iterable of objects exposing a ``page_content`` attribute.

    Returns:
        The ``page_content`` values in their original order, separated by a
        blank line. An empty iterable yields an empty string.
    """
    separator = "\n\n"
    page_texts = [document.page_content for document in docs]
    return separator.join(page_texts)

In [17]:

def _strip_reasoning(text):
    """Remove the model's <think> reasoning from ``text`` and trim whitespace."""
    # Normal case: one or more complete <think>...</think> blocks.
    cleaned = re.sub(r"<think>.*?</think>", "", text, flags=re.DOTALL)

    # A closing tag with no opening tag: everything before it is reasoning.
    if "</think>" in cleaned:
        cleaned = cleaned.rsplit("</think>", 1)[1]

    # An opening tag that was never closed (e.g. a reply cut off while the
    # model was still reasoning): everything after it is reasoning.
    if "<think>" in cleaned:
        cleaned = cleaned.split("<think>", 1)[0]

    return cleaned.strip()


def ollama_llm(question, context, model="deepseek-r1:1.5b"):
    """Ask an Ollama chat model a question together with supporting context.

    Args:
        question: The user's question.
        context: Text placed after the question in the prompt.
        model: Name of the Ollama model to query.

    Returns:
        The model's reply with any ``<think>`` reasoning removed and
        surrounding whitespace stripped.
    """
    formatted_prompt = f"Question: {question}\n\nContext: {context}"

    response = ollama.chat(
        model=model,
        messages=[{"role": "user", "content": formatted_prompt}],
    )

    # Guard against a missing/None content field so the cleanup never raises.
    response_content = response["message"]["content"] or ""

    return _strip_reasoning(response_content)

In [18]:
def rag_chain(question, retriever):
    """Answer ``question`` using documents fetched by ``retriever``.

    The retriever is invoked with the question, the returned documents are
    merged with ``combine_docs``, and the merged text is passed to
    ``ollama_llm`` as context.

    Args:
        question: The user's question.
        retriever: Object exposing ``invoke(question)`` that returns documents.

    Returns:
        Whatever ``ollama_llm`` returns for the question and merged context.
    """
    context = combine_docs(retriever.invoke(question))
    answer = ollama_llm(question, context)
    return answer

### Creating Gradio interface

In [19]:
# Retriever for the most recently indexed PDF, keyed by its path. Only one
# entry is kept so asking several questions about the same upload does not
# re-load and re-embed the whole document each time.
_retriever_cache = {}


def ask_question(pdf_file, question):
    """Gradio handler: answer ``question`` about the uploaded PDF.

    Args:
        pdf_file: The uploaded file as delivered by the ``gr.File`` input.
            Either an object with a ``name`` attribute holding the path or a
            plain path string; ``None`` when nothing was uploaded.
        question: The question typed by the user.

    Returns:
        The model's answer, or a short message explaining what is missing or
        that the PDF could not be processed.
    """
    if pdf_file is None:
        return "Please upload a PDF file."

    if not question or not question.strip():
        return "Please enter a question."

    # Accept both file-like wrappers (path in .name) and plain path strings.
    pdf_path = getattr(pdf_file, "name", pdf_file)

    # Read into a local so a concurrent request for another PDF cannot swap
    # the retriever between the lookup and its use.
    retriever = _retriever_cache.get(pdf_path)
    if retriever is None:
        text_splitter, vectorstore, retriever = process_pdf(pdf_path)

        if text_splitter is None:
            return "Failed to process the PDF."

        _retriever_cache.clear()
        _retriever_cache[pdf_path] = retriever

    return rag_chain(question, retriever)

# Two inputs (the PDF and the question) are passed positionally to
# ask_question; its returned string is shown as plain text.
interface = gr.Interface(
    fn=ask_question,
    inputs=[
        # Only PDFs can be processed downstream, so restrict the upload widget
        # to that extension instead of accepting arbitrary files.
        gr.File(label="Upload PDF", file_types=[".pdf"]),
        gr.Textbox(label="Ask a question"),
    ],
    outputs="text",
    title="Ask questions about your PDF",
    description="Use DeepSeek-R1 to answer your questions about the uploaded PDF document.",
)

# Starts the local web server for the UI.
interface.launch()

* Running on local URL:  http://127.0.0.1:7863

To create a public link, set `share=True` in `launch()`.


