In [None]:
from langchain.document_loaders import PyMuPDFLoader

# Parse the paper with PyMuPDF; UnstructuredPDFLoader is a possible alternative.
# The path is relative, so the PDF must sit in the notebook's working directory.
loader = PyMuPDFLoader(
    "Diabetic_Retinopathy_Image_Classification_Using_Machine_Learning_and_Local_Binary_Patterns_Features copy.pdf"
)
docs = loader.load()

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Break the loaded documents into overlapping chunks for embedding/retrieval.
# The same chunk_size / chunk_overlap pair is used again inside answer_pdf.
splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=200,
    chunk_size=1000,
)
chunks = splitter.split_documents(docs)
print(f"Total chunks: {len(chunks)}")


In [None]:
from langchain_ollama import OllamaEmbeddings
# Embedding backend served by Ollama; reused for the notebook index and for
# every PDF uploaded through the Gradio handler.
embed_model = OllamaEmbeddings(
    model="llama3-chatqa:8b",
)

In [None]:
# Sanity check: embed the text of every chunk and report how many vectors came back.
# NOTE(review): `docs_embeddings` is not referenced by any later visible cell.
chunk_texts = [piece.page_content for piece in chunks]
docs_embeddings = embed_model.embed_documents(chunk_texts)
print(f"Total embeddings: {len(docs_embeddings)}")

In [None]:
from langchain.vectorstores import Chroma
# Index the chunks in a Chroma store kept on disk under ./db.
# NOTE(review): re-running this cell appears to add the same chunks to the
# existing "db" store again — confirm, and clear the directory between runs if so.
vectorstore = Chroma.from_documents(
    chunks,
    embed_model,
    persist_directory="db",
)
vectorstore.persist()

In [None]:
# Retriever over the index, configured with k=4 (the same setting answer_pdf uses).
retriever = vectorstore.as_retriever(
    search_kwargs={"k": 4},
)

In [None]:
from langchain_community.llms import Ollama
# Ollama generation model shared by the summarisation chain and the QA chains.
llm = Ollama(
    callback_manager=None,
    temperature=0.5,
    model="llama3-chatqa:8b",
)

# Smoke test: send a trivial prompt to confirm the model responds.
response = llm.invoke("Tell me a joke")
print(response)

In [None]:
from langchain.chains.summarize import load_summarize_chain

# Summarise the paper with a map-reduce summarisation chain built on `llm`.
summarizer = load_summarize_chain(llm, chain_type="map_reduce")

# `Chain.run` is deprecated; `invoke` is the supported entry point (already used
# for `llm` above). The chain takes its documents under "input_documents" and
# returns the summary text under "output_text".
# docs = list of full Document pages or large chunks
summary = summarizer.invoke({"input_documents": docs})["output_text"]
print(summary)


In [None]:
from langchain.chains import RetrievalQA

# Retrieval-augmented QA over the indexed paper.
qa_chain = RetrievalQA.from_chain_type(
    chain_type="stuff",             # or "map_reduce" if needed
    return_source_documents=True,   # also hand back the retrieved source chunks
    retriever=retriever,
    llm=llm,
)


In [None]:
# Question put to the QA chain in the next cell; edit this string to ask something else.
user_question = "What is the highest accuracy achieved in the study?"

In [None]:
# Run the QA chain on the question. Calling a chain object directly
# (`qa_chain({...})`) is deprecated, so use `invoke`, matching `llm.invoke` above.
result = qa_chain.invoke({"query": user_question})
answer = result["result"]               # generated answer text
sources = result["source_documents"]    # present because return_source_documents=True


In [None]:
# Show the answer produced by the QA chain in the previous cell.
print(f"Answer: {answer}")

In [None]:
import os
import tempfile
import uuid

import gradio as gr

def answer_pdf(pdf_file, question):
    """Answer a question about an uploaded PDF with retrieval-augmented QA.

    The PDF bytes are written to a temporary file, loaded, split into chunks,
    indexed in a throwaway Chroma collection, and queried through a
    RetrievalQA chain built on the notebook-level ``llm`` and ``embed_model``.

    Args:
        pdf_file: Raw bytes of the uploaded PDF (the Gradio File input is
            declared with ``type="binary"``), or ``None`` if nothing was
            uploaded.
        question: The question to ask about the document.

    Returns:
        The chain's answer string, or a human-readable message when an input
        is missing or processing fails. Exceptions are never propagated to
        the caller; they are reported as ``"Error processing file: ..."``.
    """
    if pdf_file is None:
        return "Please upload a PDF file"
    if not question or not question.strip():
        return "Please enter a question"

    temp_path = None
    vectorstore = None
    try:
        # PyMuPDFLoader needs a filesystem path, so spill the bytes to disk.
        # delete=False keeps the file alive after the `with` block closes it.
        with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as temp_file:
            temp_file.write(pdf_file)
            temp_path = temp_file.name

        docs = PyMuPDFLoader(temp_path).load()

        splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
        chunks = splitter.split_documents(docs)
        if not chunks:
            return "No extractable text found in the PDF"

        # Index each upload in its own uniquely named, non-persistent
        # collection. Writing into the shared on-disk "db" default collection
        # would mix this document's chunks with every previously indexed PDF
        # and let retrieval answer from the wrong paper.
        vectorstore = Chroma.from_documents(
            chunks,
            embed_model,
            collection_name=f"upload-{uuid.uuid4().hex}",
        )
        retriever = vectorstore.as_retriever(search_kwargs={"k": 4})

        qa_chain = RetrievalQA.from_chain_type(
            llm=llm,
            retriever=retriever,
            return_source_documents=True,
            chain_type="stuff"
        )

        # `invoke` replaces the deprecated direct call `qa_chain({...})`.
        result = qa_chain.invoke({"query": question})
        return result["result"]

    except Exception as e:
        return f"Error processing file: {str(e)}"

    finally:
        # Cleanup runs on success and on failure alike, so a failed request
        # no longer leaks its temporary PDF or its index.
        if vectorstore is not None:
            try:
                vectorstore.delete_collection()
            except Exception:
                # Best effort: failing to drop the throwaway collection must
                # not replace an answer that was already computed.
                pass
        if temp_path is not None and os.path.exists(temp_path):
            os.unlink(temp_path)

# Web UI: a PDF upload plus a question box, both passed to answer_pdf.
iface = gr.Interface(
    title="Automated Research Assistant",
    fn=answer_pdf,
    inputs=[
        gr.File(type="binary", label="PDF Document"),  # binary: handler gets the file contents
        gr.Textbox(label="Question"),
    ],
    outputs="text",
)

# NOTE(review): share=True requests a shareable Gradio link for this app —
# confirm that exposing the locally running model this way is intended.
iface.launch(pwa=True, share=True)