In [1]:
#%pip install langchain langchain-community langchainhub faiss-cpu python-dotenv

# %pip install -U langchain-ollama

# %pip install pymupdf

# %pip install arxiv langchainhub requests


In [2]:
from langchain_ollama import OllamaLLM, OllamaEmbeddings

from langchain_community.document_loaders import PyMuPDFLoader

from langchain.text_splitter import RecursiveCharacterTextSplitter

from langchain_community.vectorstores import FAISS

from langchain.chains import RetrievalQA

import arxiv

import os

In [3]:
pdf_path = "data/raw_papers/sample_paper.pdf"

loader = PyMuPDFLoader(pdf_path)

documents = loader.load()

splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)

chunks = splitter.split_documents(documents)

print(f"Loaded {len(documents)} pages, split into {len(chunks)} chunks.")

Loaded 14 pages, split into 82 chunks.


In [4]:
embedding_model = OllamaEmbeddings(model="nomic-embed-text")

vectorstore = FAISS.from_documents(chunks, embedding_model)

vectorstore.save_local("embeddings/faiss_index")

In [5]:
retriever = vectorstore.as_retriever()

llm = OllamaLLM(model="llama3")

qa_chain = RetrievalQA.from_chain_type(llm=llm, retriever=retriever)

In [6]:
query = "What is the main contribution of this paper?"

response = qa_chain.invoke({"query": query})

print("Answer:", response['result'])

Answer: I don't know the answer to that question. The provided context appears to be a collection of papers and references related to protein generation and representation learning, but it does not explicitly state the main contribution of the paper "SE(3) diffusion model with application to protein backbone generation". To determine the main contribution, I would need more information about the specific paper being discussed.


In [7]:
import arxiv
from arxiv import Client

def fetch_paper_from_arxiv(query, max_results=3):

    search = arxiv.Search(
        
        query=query,
        
        max_results=max_results,
        
        sort_by=arxiv.SortCriterion.Relevance
        
    )

    client = Client()

    results = []
    
    for result in client.results(search):
        
        paper_info = {
            
            "title": result.title,
            
            "authors": [a.name for a in result.authors],
            
            "summary": result.summary,
            
            "url": result.entry_id,
            
            "pdf_url": result.pdf_url
            
        }
        
        results.append(paper_info)

    return results

arxiv_results = fetch_paper_from_arxiv("Hierarchical conditional diffusion protein")

for res in arxiv_results:
    
    print(f"\nTitle: {res['title']}\nPDF: {res['pdf_url']}\nSummary: {res['summary'][:500]}...")




Title: Demystify Protein Generation with Hierarchical Conditional Diffusion Models
PDF: http://arxiv.org/pdf/2507.18603v1
Summary: Generating novel and functional protein sequences is critical to a wide range
of applications in biology. Recent advancements in conditional diffusion models
have shown impressive empirical performance in protein generation tasks.
However, reliable generations of protein remain an open research question in de
novo protein design, especially when it comes to conditional diffusion models.
Considering the biological function of a protein is determined by multi-level
structures, we propose a novel m...

Title: Molecule Generation for Target Protein Binding with Hierarchical Consistency Diffusion Model
PDF: http://arxiv.org/pdf/2503.00975v1
Summary: Effective generation of molecular structures, or new chemical entities, that
bind to target proteins is crucial for lead identification and optimization in
drug discovery. Despite advancements in atom- and motif-wis

In [8]:
from utils.rag_agent import load_vectorstore, init_qa_chain, answer_query

from langchain_ollama import OllamaEmbeddings 

vectorstore = load_vectorstore()

qa_chain = init_qa_chain(vectorstore, model_name="llama3")  

query = "What is the main contribution of this paper?"

response = answer_query(query, qa_chain)

print("Answer:", response)



Answer: {'query': 'What is the main contribution of this paper?', 'result': "I don't know the answer to that question based on the provided context. The text appears to be an abstract or summary of a conference paper, and it does not explicitly state what the main contribution of the paper is. It seems to provide some background information on protein design methods and representation learning for proteins, but it does not clearly outline the main findings or contributions of the paper."}


In [None]:
from utils.fetch_papers import fetch_paper_from_arxiv

from utils.parse_pdf import extract_and_split_pdf

from utils.chunk_embed import embed_chunks

from utils.rag_agent import load_vectorstore, init_qa_chain, answer_query, ask_multiple_questions

import os

import requests

# ----------- USER INPUT -----------
query_topic = "fundamental concepts in machine learning and deep learning"

user_question = "From the entire content of this research paper, identify the main contribution. If it's not explicitly stated, infer it from the introduction, methodology, and conclusion."

# ----------- STEP 1: Fetch Arxiv paper metadata -----------
arxiv_results = fetch_paper_from_arxiv(query_topic, max_results=1)

paper = arxiv_results[0]

pdf_url = paper['pdf_url']

pdf_filename = f"{pdf_url.split('/')[-1]}.pdf"

pdf_path = f"data/raw_papers/{pdf_filename}"

# ----------- STEP 2: Download the PDF -----------
if not os.path.exists(pdf_path):
    
    response = requests.get(pdf_url)
    
    with open(pdf_path, "wb") as f:
        
        f.write(response.content)
        
    print(f"✅ PDF downloaded: {pdf_path}")
    
else:
    
    print(f"📄 PDF already exists: {pdf_path}")

# ----------- STEP 3: Extract text & split into chunks -----------
chunks = extract_and_split_pdf(pdf_path)

# ----------- STEP 4: Embed and save vectorstore -----------
embed_chunks(chunks)

# ----------- STEP 5: Load vectorstore and create QA chain -----------
vectorstore = load_vectorstore()

qa_chain = init_qa_chain(vectorstore, model_name="llama3")

# ----------- STEP 6: Ask main question -----------
response = answer_query(user_question, qa_chain)

print("🔍 Final Answer:", response)

# ----------- STEP 7: Ask multiple insightful questions -----------
ask_multiple_questions(qa_chain)


✅ PDF downloaded: data/raw_papers/2202.01319v1.pdf
Loaded 35 pages and split into 64 chunks.
FAISS index saved to: embeddings/faiss_index
🔍 Final Answer: {'query': "From the entire content of this research paper, identify the main contribution. If it's not explicitly stated, infer it from the introduction, methodology, and conclusion.", 'result': 'The main contribution of this research paper is not explicitly stated in the provided text. However, based on the introduction and methodology sections, I can infer that the main contribution is the application of deep learning techniques to traditional epidemiology, highlighting its potential for automating feature discovery and model fitting in data analysis.'}
📄 Answering multiple research questions:

➡️ Summarize the abstract of this paper.
🧠 {'query': 'Summarize the abstract of this paper.', 'result': "I don't have enough information to summarize the abstract of this paper, as there is no abstract provided. The text appears to be a mix o

In [14]:
from utils.parse_pdf import extract_and_split_pdf

from utils.chunk_embed import embed_chunks

from utils.rag_agent import load_vectorstore, init_qa_chain, answer_query

import os

pdf_folder = "data/raw_papers"

index_path = "embeddings/faiss_index"

# =============== STEP 1: Extract + Embed if index doesn't exist ===============
if not os.path.exists(index_path):
    
    print("📦 No FAISS index found — embedding all PDFs now...")
    
    pdf_files = [f for f in os.listdir(pdf_folder) if f.endswith(".pdf")]
    
    all_chunks = []

    for pdf_file in pdf_files:
        
        pdf_path = os.path.join(pdf_folder, pdf_file)
        
        print(f"🔍 Processing: {pdf_path}")
        
        chunks = extract_and_split_pdf(pdf_path)
        
        all_chunks.extend(chunks)

    embed_chunks(all_chunks)
    
    print("✅ All PDFs embedded and FAISS index created.")
    
else:
    
    print("✅ FAISS index already exists. Skipping embedding.")

# =============== STEP 2: Load Vectorstore + LLM Chain ===============
vectorstore = load_vectorstore()

qa_chain = init_qa_chain(vectorstore, model_name="llama3")

# =============== STEP 3: User Query from Research Papers ===============
print("📄 You can now ask anything about the research papers in raw_papers/")

user_query = input("🧠 Enter your question:\n→ ")

answer = answer_query(user_query, qa_chain)

print("\n📌 Answer (based strictly on the PDF content):\n", answer)

✅ FAISS index already exists. Skipping embedding.
📄 You can now ask anything about the research papers in raw_papers/

📌 Answer (based strictly on the PDF content):
 {'query': 'explain me in detail about the fundamental concepts in machine learning and deep learning', 'result': "Based on the provided context, I'll try to summarize the fundamental concepts in machine learning and deep learning:\n\n**Machine Learning Fundamentals**\n\n1. **Supervised Learning**: In this approach, you have labeled data (input-output pairs) that allows an algorithm to learn patterns and make predictions.\n2. **Unsupervised Learning**: Without labeled data, algorithms discover hidden patterns or groupings within the data.\n3. **Reinforcement Learning**: This involves learning through trial-and-error by receiving rewards or penalties for certain actions.\n\n**Machine Learning Algorithms**\n\n1. **Supervised Learning Algorithms**: Focus of this article (unsaid but implied).\n2. **Unsupervised Learning Algorit