In [4]:
from langchain.document_loaders import PyMuPDFLoader

loader = PyMuPDFLoader("/Users/aryansinha/Desktop/Lallma Project/Diabetic_Retinopathy_Image_Classification_Using_Machine_Learning_and_Local_Binary_Patterns_Features copy.pdf")    # or use UnstructuredPDFLoader
docs = loader.load()   

In [5]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
chunks = splitter.split_documents(docs)
print(f"Total chunks: {len(chunks)}")


Total chunks: 28


In [6]:
from langchain_ollama import OllamaEmbeddings
embed_model = OllamaEmbeddings(model="llama3.2")

In [7]:
docs_embeddings = embed_model.embed_documents([chunk.page_content for chunk in chunks])
print(f"Total embeddings: {len(docs_embeddings)}")

Total embeddings: 28


In [8]:
from langchain.vectorstores import Chroma
vectorstore = Chroma.from_documents(chunks, embed_model, persist_directory="db")
vectorstore.persist()

  vectorstore.persist()


In [9]:
retriever = vectorstore.as_retriever(search_kwargs={"k": 4})

In [10]:
from langchain_community.llms import Ollama
from langchain.cache import InMemoryCache
from langchain.globals import set_llm_cache

In [11]:

llm = Ollama(
    model="llama3.2",
    temperature=0.7,
    callback_manager=None
)

# Test the connection
response = llm.invoke("Tell me a joke")
print(response)

  llm = Ollama(


Here's one:

What do you call a fake noodle?

An impasta!


In [12]:
from langchain.chains.summarize import load_summarize_chain

summarizer = load_summarize_chain(llm, chain_type="map_reduce")
summary = summarizer.run(docs)  # docs = list of full Document pages or large chunks
print(summary)


  summary = summarizer.run(docs)  # docs = list of full Document pages or large chunks


Here is a concise summary:

A study evaluated the performance of six machine learning algorithms (Random Forest, Adaptive Boosting, K-Nearest Neighbor, Gaussian Naive Bayes, Support Vector Machine, and Quadratic Discriminant Analysis) in classifying diabetic retinopathy images into three categories. The results showed that Random Forest achieved the highest classification accuracy (0.912-0.94), particularly for distinguishing between non-diabetic retinopathy and severe retinopathy. The study highlights the effectiveness of Local Binary Patterns (LBP) features in improving accuracy with machine learning models, suggesting potential future research integrating these features with deep learning techniques.


In [13]:
from langchain.chains import RetrievalQA

qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=retriever,
    return_source_documents=True,   # to get source chunks
    chain_type="stuff",             # or "map_reduce" if needed
)


In [14]:
user_question = "What is the highest accuracy achieved in the study?"

In [15]:
result = qa_chain({"query": user_question})
answer = result["result"]
sources = result["source_documents"]


  result = qa_chain({"query": user_question})


In [16]:
print(f"Answer: {answer}")

Answer: The text doesn't explicitly state the question being asked, but based on the provided information, it appears to be asking for the highest accuracy achieved in the study. According to the text, the RF classifier achieves an accuracy of 0.94 when classifying between noDR and SV.


In [18]:
import gradio as gr
import tempfile
import os

def answer_pdf(pdf_file, question):
    # Handle file upload properly
    if pdf_file is None:
        return "Please upload a PDF file"
    
    try:
        # Create a temporary file and write the PDF content
        with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as temp_file:
            temp_file.write(pdf_file)
            temp_path = temp_file.name
        
        # Process the uploaded file using the temporary file path
        loader = PyMuPDFLoader(temp_path)
        docs = loader.load()
        
        # Create text chunks
        splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
        chunks = splitter.split_documents(docs)
        
        # Create vector store
        vectorstore = Chroma.from_documents(chunks, embed_model, persist_directory="db")
        retriever = vectorstore.as_retriever(search_kwargs={"k": 4})
        
        # Create QA chain
        qa_chain = RetrievalQA.from_chain_type(
            llm=llm,
            retriever=retriever,
            return_source_documents=True,
            chain_type="stuff"
        )
        
        # Get answer
        result = qa_chain({"query": question})
        
        # Clean up the temporary file
        os.unlink(temp_path)
        
        return result["result"]
        
    except Exception as e:
        return f"Error processing file: {str(e)}"

iface = gr.Interface(
    fn=answer_pdf,
    inputs=[
        gr.File(label="PDF Document", type="binary"),
        gr.Textbox(label="Question")
    ],
    outputs="text",
    title="Automated Research Assistant"
)

iface.launch(share=True,pwa=True)

* Running on local URL:  http://127.0.0.1:7861


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)



Could not create share link. Missing file: /Users/aryansinha/.cache/huggingface/gradio/frpc/frpc_darwin_amd64_v0.3. 

Please check your internet connection. This can happen if your antivirus software blocks the download of this file. You can install manually by following these steps: 

1. Download this file: https://cdn-media.huggingface.co/frpc-gradio-0.3/frpc_darwin_amd64
2. Rename the downloaded file to: frpc_darwin_amd64_v0.3
3. Move the file to this location: /Users/aryansinha/.cache/huggingface/gradio/frpc





