In [14]:
import requests
import xml.etree.ElementTree as ET
import fitz  

from langchain_community.chat_models import ChatOllama
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain.document_loaders import WebBaseLoader
from langchain.docstore.document import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.embeddings import OllamaEmbeddings
from langchain.chains import LLMChain
from langchain.prompts import load_prompt
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from bs4 import BeautifulSoup

In [3]:
# Step 1: Get arXiv Paper URLs
def get_arxiv_paper_urls(query, max_results=10):
    """Search the arXiv API and return the ``<id>`` URL of every hit.

    Args:
        query: Free-text search string; sent as ``all:<query>`` so it is
            matched against all fields.
        max_results: Maximum number of entries to request from the API.

    Returns:
        list[str]: The text of each Atom ``<entry>``'s ``<id>`` element, in
        feed order. Empty when the feed contains no entries.

    Raises:
        requests.HTTPError: If the API answers with a 4xx/5xx status.
        requests.RequestException: On connection failure or timeout.
        xml.etree.ElementTree.ParseError: If the response body is not valid XML.
    """
    atom_ns = "{http://www.w3.org/2005/Atom}"

    # Hand the query to requests via `params` so it gets percent-encoded.
    # Interpolating it into the URL by hand let characters such as '&', '#'
    # or '+' in the search text corrupt the query string.
    response = requests.get(
        "http://export.arxiv.org/api/query",
        params={
            "search_query": f"all:{query}",
            "start": 0,
            "max_results": max_results,
        },
        timeout=30,  # seconds; without it a stalled connection hangs the kernel
    )
    # Surface HTTP errors here instead of as an XML parse error further down.
    response.raise_for_status()
    root = ET.fromstring(response.content)

    paper_urls = []
    for entry in root.findall(f"{atom_ns}entry"):
        # findtext returns None for an entry without <id>; skip it rather
        # than crash on `.text` of a missing element.
        entry_id = entry.findtext(f"{atom_ns}id")
        if entry_id:
            paper_urls.append(entry_id.strip())

    return paper_urls

In [4]:
# Step 2: Extract Paper Metadata
def extract_paper_content(url):
    """Scrape title and abstract from an arXiv abstract page.

    Args:
        url: Abstract-page URL, expected to contain an ``/abs/`` path segment
            (the form returned by ``get_arxiv_paper_urls``).

    Returns:
        dict: ``{"title", "abstract", "pdf_url"}``. ``title`` and ``abstract``
        fall back to ``"Title not found"`` / ``"Abstract not found"`` when the
        page lacks the expected tags; ``pdf_url`` is derived from ``url``.

    Raises:
        requests.RequestException: On connection failure or timeout.
    """
    # No raise_for_status() here on purpose: a page without the expected tags
    # (including an error page) still yields the placeholder values below.
    response = requests.get(url, timeout=30)
    soup = BeautifulSoup(response.content, "html.parser")

    title_tag = soup.find("meta", {"name": "citation_title"})
    # .get() instead of ["content"] so a <meta> tag without that attribute
    # falls back to the placeholder rather than raising KeyError.
    title = title_tag.get("content") if title_tag is not None else None
    if not title:
        title = "Title not found"

    abstract_tag = soup.find("blockquote", {"class": "abstract"})
    if abstract_tag is not None:
        abstract_text = abstract_tag.text.strip()
        # Drop only the leading "Abstract:" label, not later occurrences of
        # the word inside the abstract itself.
        label = "Abstract:"
        if abstract_text.startswith(label):
            abstract_text = abstract_text[len(label):].strip()
    else:
        abstract_text = "Abstract not found"

    # Swap only the first "/abs/" path segment. A bare replace("abs", "pdf")
    # also rewrote any other "abs" substring in the host name or paper id.
    pdf_url = url.replace("/abs/", "/pdf/", 1) + ".pdf"

    return {"title": title, "abstract": abstract_text, "pdf_url": pdf_url}

In [5]:
# Step 3: Download and Extract PDF
def download_and_extract_pdf(pdf_url):
    """Download a PDF and return the plain text of all its pages.

    The PDF is saved in the current working directory under the last path
    segment of ``pdf_url`` and is left there after extraction.

    Args:
        pdf_url: Direct URL of the PDF file.

    Returns:
        str: The text of every page, concatenated in page order.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection failure or timeout.
    """
    response = requests.get(pdf_url, timeout=60)
    # Fail on an HTTP error before anything is written: otherwise the error
    # page gets saved as "<id>.pdf" and only fails later inside fitz.
    response.raise_for_status()

    pdf_filename = pdf_url.split("/")[-1]
    with open(pdf_filename, "wb") as pdf_file:
        pdf_file.write(response.content)

    doc = fitz.open(pdf_filename)
    try:
        # join() avoids rebuilding the accumulated string once per page.
        return "".join(
            doc.load_page(page_num).get_text("text")
            for page_num in range(doc.page_count)
        )
    finally:
        # Release the file handle even when a page fails to extract.
        doc.close()

In [9]:
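# # NOTE: superseded draft, kept for reference only. It passes plain dicts to
# # split_documents(), which reads `.page_content` from each item; that matches the
# # "'dict' object has no attribute 'page_content'" error recorded at the end of this notebook.
# # Step 4: Combine extracted PDF content with the vectorization pipeline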
# def process_papers_and_add_to_rag(query, max_results=5):
#     # Fetch arXiv paper URLs based on query
#     urls = get_arxiv_paper_urls(query, max_results=max_results)
    
#     # Initialize a list to store documents
#     docs = []
    
#     # Extract content and download each paper
#     for url in urls:
#         content = extract_paper_content(url)
#         print(f"Processing: {content['title']}")
        
#         # Download and extract full paper content (PDF)
#         full_content = download_and_extract_pdf(content["pdf_url"])
#         print(f"Downloaded and extracted paper content, length: {len(full_content)}")
        
#         # Store title and content in a dictionary for processing
#         docs.append({"title": content["title"], "page_content": full_content})
    
#     # Step 5: Prepare the documents for embedding and chunking
#     # Convert to the format needed for text splitting and embedding
#     formatted_docs = [{"page_content": doc["page_content"], "metadata": {"title": doc["title"]}} for doc in docs]
    
#     # Use RecursiveCharacterTextSplitter to chunk the document content
#     text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
#     splits = text_splitter.split_documents(formatted_docs)
    
#     # Step 6: Embedding model and vectorstore setup
#     embed = OllamaEmbeddings(model="all-minilm")
#     vectorstore = Chroma.from_documents(documents=splits, embedding=embed)
    
#     # Step 7: RAG Chain setup
#     retriever = vectorstore.as_retriever()
#     prompt = load_prompt("rlm/rag-prompt")  # Load the RAG prompt from a source or file
#     llm = ChatOllama()  # Load your chosen language model here

#     def format_docs(docs):
#         return "\n\n".join(doc.page_content for doc in docs)

#     # Define the RAG chain
#     rag_chain = (
#         {"context": retriever | format_docs, "question": RunnablePassthrough()}
#         | prompt
#         | llm
#         | StrOutputParser()
#     )

#     return rag_chain

In [16]:


def process_papers_and_add_to_rag(
    query,
    max_results=5,
    question="What are the latest advances in neural networks?",
):
    """Build a retrieval-augmented QA chain over arXiv papers matching ``query``.

    Steps: search arXiv, download each paper's PDF, split the extracted text
    into overlapping chunks, embed the chunks into a Chroma vector store, and
    wire the store's retriever to a chat prompt and ``ChatOllama``.

    Args:
        query: Search string passed to ``get_arxiv_paper_urls``.
        max_results: Maximum number of papers to fetch and index.
        question: Question to run through the chain once it is built; the
            answer is printed. Pass ``None`` to build the chain without
            asking anything.

    Returns:
        The assembled chain. Call ``.invoke("<question>")`` on it to ask
        further questions without downloading and embedding the papers again.

    Raises:
        ValueError: If no text could be collected for ``query`` (no search
            hits, or every PDF yielded empty text).
    """
    urls = get_arxiv_paper_urls(query, max_results=max_results)

    # Build Document objects directly; split_documents() needs `.page_content`
    # and `.metadata`, so plain dicts do not work here.
    paper_docs = []
    for url in urls:
        content = extract_paper_content(url)
        print(f"Processing: {content['title']}")

        full_content = download_and_extract_pdf(content["pdf_url"])
        print(f"Downloaded and extracted paper content, length: {len(full_content)}")

        paper_docs.append(
            Document(page_content=full_content, metadata={"title": content["title"]})
        )

    # Chunk the papers so each embedded piece stays small.
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    splits = text_splitter.split_documents(paper_docs)
    if not splits:
        # Fail with a clear message instead of an opaque error from the
        # vector store when there is nothing to embed.
        raise ValueError(f"No paper text found for query {query!r}; nothing to index.")

    # NOTE(review): no collection name is given, so repeated calls in the same
    # kernel may add to the same default Chroma collection -- confirm, and pass
    # a per-query `collection_name` if that turns out to be the case.
    embed = OllamaEmbeddings(model="all-minilm")
    vectorstore = Chroma.from_documents(documents=splits, embedding=embed)
    retriever = vectorstore.as_retriever()

    prompt = ChatPromptTemplate.from_messages(
        [
            (
                "system",
                "You are an expert in academic research. Answer questions based on the following academic paper context.",
            ),
            ("user", "Here is the academic context: {context}"),
            ("user", "Now answer the following question: {question}"),
        ]
    )
    llm = ChatOllama()  # uses ChatOllama's default model

    def format_docs(retrieved_docs):
        """Join the retrieved chunks into one context string."""
        return "\n\n".join(doc.page_content for doc in retrieved_docs)

    # The input string is used twice: to retrieve context and as the question.
    rag_chain = (
        {"context": retriever | format_docs, "question": RunnablePassthrough()}
        | prompt
        | llm
        | StrOutputParser()
    )

    if question is not None:
        result = rag_chain.invoke(question)
        print(result)

    # Return the chain so it can be reused; previously it was discarded and
    # every new question meant re-downloading and re-embedding the papers.
    return rag_chain

# Example: index papers related to "neural networks" and keep the chain so
# follow-up questions can be asked with rag_chain.invoke(...)
rag_chain = process_papers_and_add_to_rag(query="neural networks", max_results=3)


Processing: Lecture Notes: Neural Network Architectures
Downloaded and extracted paper content, length: 95012
Processing: Self-Organizing Multilayered Neural Networks of Optimal Complexity
Downloaded and extracted paper content, length: 22593
Processing: Neural Network Processing Neural Networks: An efficient way to learn higher order functions
Downloaded and extracted paper content, length: 7378
Based on the academic paper "Neural Network Processing Neural Networks: An Efficient Way to Learn Higher Order Functions" by Firat Tuna, the latest advances in neural networks include:

1. Neural Network Processing Neural Networks (NNPNNs): This is a new class of neural networks that can process and represent rich structures, such as functions, more eflectively than traditional neural networks. NNPNNs input neural networks and numerical values instead of just numerical values, enabling them to reason with these structures more effectively.
2. HyperNetworks: This is a recent development in the 

In [10]:
# Example: Use the function to process papers related to "neural networks"
# rag_chain = process_papers_and_add_to_rag(query="neural networks", max_results=3)

Processing: Lecture Notes: Neural Network Architectures
Downloaded and extracted paper content, length: 95012
Processing: Self-Organizing Multilayered Neural Networks of Optimal Complexity
Downloaded and extracted paper content, length: 22593
Processing: Neural Network Processing Neural Networks: An efficient way to learn higher order functions
Downloaded and extracted paper content, length: 7378


AttributeError: 'dict' object has no attribute 'page_content'