In [None]:
import requests
import feedparser
from urllib.parse import quote
from pypdf import PdfReader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain.schema import Document
import os



  from .autonotebook import tqdm as notebook_tqdm


In [None]:
def fetch_arxiv_papers(query, max_results=3):
    base_url = "http://export.arxiv.org/api/query?"
    encoded_query = quote(query)
    search_query = (
        f"search_query=all:{encoded_query}&start=0&max_results={max_results}"
        "&sortBy=submittedDate&sortOrder=descending"
    )
    feed = feedparser.parse(base_url + search_query)

    papers = []
    for entry in feed.entries:
        paper = {
            "title": entry.title,
            "authors": [author.name for author in entry.authors],
            "summary": entry.summary,
            "pdf_url": entry.id.replace("abs", "pdf") + ".pdf",
        }
        papers.append(paper)
    return papers


In [None]:
def download_paper(output_path, url):
    response = requests.get(url)
    if response.status_code == 200:
        with open(output_path, "wb") as f:
            f.write(response.content)
        print(f" Downloaded: {output_path}")
    else:
        print(" Failed to download PDF.")

In [4]:
def read_pdf(save_path):
    reader = PdfReader(save_path)
    text = ""
    for page in reader.pages:
        text += page.extract_text() or ""
    return text



In [5]:
def chunk_text(text, chunk_size=600, chunk_overlap=100):
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        separators=["\n\n", "\n", ".", " "],
    )
    chunks = splitter.split_text(text)
    return chunks

In [6]:
embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")


In [7]:
def store_in_faiss(chunks, metadatas=None, save_path="faiss_index"):
    docs = [
        Document(page_content=chunk, metadata=meta or {})
        for chunk, meta in zip(chunks, metadatas or [{}] * len(chunks))
    ]

    vectorstore = FAISS.from_documents(docs, embeddings)
    os.makedirs(save_path, exist_ok=True)
    vectorstore.save_local(save_path)
    print(f"✅ FAISS index saved successfully at: {save_path}")
    return vectorstore


In [8]:
papers = fetch_arxiv_papers("large language models", max_results=1)
first_paper = papers[0]
print("📄 Paper Title:", first_paper["title"])

📄 Paper Title: From nuclear star clusters to Little Red Dots: black hole growth,
  mergers, and tidal disruptions


In [9]:
pdf_path = "paper.pdf"
download_paper(pdf_path, first_paper["pdf_url"])

# Read and chunk the paper
text = read_pdf(pdf_path)
chunks = chunk_text(text)

# Create FAISS index
vectorstore = store_in_faiss(chunks, save_path="faiss_index")

# Test loading
vectorstore = FAISS.load_local(
    "faiss_index", embeddings, allow_dangerous_deserialization=True
)
print("✅ FAISS index created, saved, and reloaded successfully!")
print("Files in faiss_index:", os.listdir("faiss_index"))


✅ Downloaded: paper.pdf
✅ FAISS index saved successfully at: faiss_index
✅ FAISS index created, saved, and reloaded successfully!
Files in faiss_index: ['index.faiss', 'index.pkl']
