In [1]:
#%pip install langchain langchain-community langchainhub faiss-cpu python-dotenv

# %pip install -U langchain-ollama

# %pip install pymupdf

# %pip install arxiv langchainhub requests


In [2]:
from langchain_ollama import OllamaLLM, OllamaEmbeddings

from langchain_community.document_loaders import PyMuPDFLoader

from langchain.text_splitter import RecursiveCharacterTextSplitter

from langchain_community.vectorstores import FAISS

from langchain.chains import RetrievalQA

import arxiv

import os

In [3]:
pdf_path = "data/raw_papers/sample_paper.pdf"

loader = PyMuPDFLoader(pdf_path)

documents = loader.load()

splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)

chunks = splitter.split_documents(documents)

print(f"Loaded {len(documents)} pages, split into {len(chunks)} chunks.")

Loaded 14 pages, split into 82 chunks.


In [4]:
embedding_model = OllamaEmbeddings(model="nomic-embed-text")

vectorstore = FAISS.from_documents(chunks, embedding_model)

vectorstore.save_local("embeddings/faiss_index")

In [5]:
retriever = vectorstore.as_retriever()

llm = OllamaLLM(model="llama3")

qa_chain = RetrievalQA.from_chain_type(llm=llm, retriever=retriever)

In [6]:
query = "What is the main contribution of this paper?"

response = qa_chain.invoke({"query": query})

print("Answer:", response['result'])

Answer: I don't know the answer to that question. The text does not explicitly state what the main contribution of the paper is. It appears to be a conference proceeding about a paper titled "SE(3) diffusion model with application to protein backbone generation", but it does not provide a clear summary or highlight the main contribution of the paper.


In [7]:
import arxiv
from arxiv import Client

def fetch_paper_from_arxiv(query, max_results=3):

    search = arxiv.Search(
        
        query=query,
        
        max_results=max_results,
        
        sort_by=arxiv.SortCriterion.Relevance
        
    )

    client = Client()

    results = []
    
    for result in client.results(search):
        
        paper_info = {
            
            "title": result.title,
            
            "authors": [a.name for a in result.authors],
            
            "summary": result.summary,
            
            "url": result.entry_id,
            
            "pdf_url": result.pdf_url
            
        }
        
        results.append(paper_info)

    return results

arxiv_results = fetch_paper_from_arxiv("Hierarchical conditional diffusion protein")

for res in arxiv_results:
    
    print(f"\nTitle: {res['title']}\nPDF: {res['pdf_url']}\nSummary: {res['summary'][:500]}...")




Title: Demystify Protein Generation with Hierarchical Conditional Diffusion Models
PDF: http://arxiv.org/pdf/2507.18603v1
Summary: Generating novel and functional protein sequences is critical to a wide range
of applications in biology. Recent advancements in conditional diffusion models
have shown impressive empirical performance in protein generation tasks.
However, reliable generations of protein remain an open research question in de
novo protein design, especially when it comes to conditional diffusion models.
Considering the biological function of a protein is determined by multi-level
structures, we propose a novel m...

Title: Molecule Generation for Target Protein Binding with Hierarchical Consistency Diffusion Model
PDF: http://arxiv.org/pdf/2503.00975v1
Summary: Effective generation of molecular structures, or new chemical entities, that
bind to target proteins is crucial for lead identification and optimization in
drug discovery. Despite advancements in atom- and motif-wis

In [1]:
from utils.rag_agent import load_vectorstore, init_qa_chain, answer_query

vectorstore = load_vectorstore()

qa_chain = init_qa_chain(vectorstore, model_name="llama3")

query = "What is the main contribution of this paper?"

response = answer_query(query, qa_chain)

print("Answer:", response)


  return qa_chain.run(query)


Answer: I don't know. The text provides an abstract and citations to related papers, but it does not explicitly state what the main contribution of the paper is.


In [2]:
from utils.fetch_papers import fetch_paper_from_arxiv

from utils.parse_pdf import extract_and_split_pdf

from utils.chunk_embed import embed_chunks

from utils.rag_agent import run_rag_query

query_topic = "PyTorch graph transformer"  # ← user sets this

user_question = "What is the main contribution of this paper?"  # ← can change to anything

papers = fetch_paper_from_arxiv(query_topic, max_results=1)

if not papers:
    
    print("No results found.")
    
else:
    
    top_paper = papers[0]
    
    print(f"\n Title: {top_paper['title']}\n PDF: {top_paper['pdf_url']}")

    import requests, os
    
    pdf_url = top_paper["pdf_url"]
    
    local_path = f"data/raw_papers/auto_paper.pdf"
    
    response = requests.get(pdf_url)
    
    with open(local_path, "wb") as f:
        
        f.write(response.content)

    chunks = extract_and_split_pdf(local_path)
    
    retriever = embed_chunks(chunks)

    print("\n Question:", user_question)
    
    result = run_rag_query(user_question, retriever)
    
    print("\n Answer:", result)


ImportError: cannot import name 'run_rag_query' from 'utils.rag_agent' (c:\Users\ASUS\OneDrive\Desktop\Research_Agent\utils\rag_agent.py)