In [None]:
import os
from dotenv import load_dotenv
import pinecone
from langchain_openai import AzureOpenAIEmbeddings, AzureChatOpenAI
import langchain
import arxiv
import requests 
from PyPDF2 import PdfReader
from io import BytesIO


# Show which LangChain version this kernel has installed.
print(langchain.__version__)

1.0.3


In [40]:
# System prompt text describing the assistant's role (arXiv research helper
# that answers from Pinecone-retrieved context, with citations).
# NOTE(review): no cell visible in this notebook passes it to a chat model yet.
SYSTEM_PROMPT = """
You are a research assistant helping the user understand scientific papers retrieved from arXiv.
Use the context retrieved from Pinecone to provide accurate, concise answers with citations.
If unsure, say so.
"""


In [38]:
# %%capture --no-stderr
# %pip install --quiet -U langchain_openai langchain_core langchain langchain-community langchain_pinecone beautifulsoup4 requests pinecone tabulate


In [41]:
# Pull settings from a local .env file (if any) into the process environment,
# then read each one. A missing variable yields None unless a default is given.
load_dotenv()

_env = os.environ

# Azure OpenAI credentials and endpoint.
AZURE_OPENAI_API_KEY = _env.get("AZURE_OPENAI_API_KEY")
AZURE_OPENAI_ENDPOINT = _env.get("AZURE_OPENAI_ENDPOINT")

# Pinecone credentials; the index name falls back to "arxiv" when unset.
PINECONE_API_KEY = _env.get("PINECONE_API_KEY")
PINECONE_ENV = _env.get("PINECONE_ENV")
PINECONE_INDEX_NAME = _env.get("PINECONE_INDEX_NAME", "arxiv")


In [42]:
# pip install pinecone
from pinecone import Pinecone

# Create the Pinecone client and open the configured index.
# The index name comes from PINECONE_INDEX_NAME (default "arxiv") so that the
# index actually opened always matches the name reported in the message below.
pc = Pinecone(api_key=PINECONE_API_KEY)
index = pc.Index(PINECONE_INDEX_NAME)
print("✅ Connected to Pinecone index:", PINECONE_INDEX_NAME, index)

✅ Connected to Pinecone index: arxiv <pinecone.data.index.Index object at 0x00000143048BEAD0>


In [43]:
from langchain_openai import AzureOpenAIEmbeddings

# Azure OpenAI embedding client; index_papers() uses it to turn paper text
# into vectors.
# NOTE(review): the 1536-dimension output presumably has to match the
# dimension the Pinecone index was created with — confirm.
embeddings_model = AzureOpenAIEmbeddings(
    dimensions=1536,
    azure_deployment="text-embedding-3-small",
)

In [None]:
# pip install arxiv
import arxiv

def fetch_arxiv_papers(query, max_results=5, timeout=30):
    """Search arXiv and download the extracted text of each matching paper.

    Args:
        query: arXiv search query string.
        max_results: Maximum number of search results to fetch.
        timeout: Seconds to wait on each PDF download before giving up,
            so a stalled connection cannot hang the notebook indefinitely.

    Returns:
        A list of dicts, one per paper, with the keys ``title``,
        ``abstract``, ``url`` (the arXiv entry id) and ``pdf_text`` (the
        text of every PDF page, each page followed by a newline).

    Raises:
        requests.HTTPError: If a PDF download answers with an error status.
        requests.Timeout: If a PDF download exceeds ``timeout`` seconds.
    """
    search = arxiv.Search(
        query=query,
        max_results=max_results,
        sort_by=arxiv.SortCriterion.Relevance
    )
    # Search.results() is deprecated in the arxiv package; iterating through
    # a Client is the supported replacement.
    client = arxiv.Client()

    papers = []
    for result in client.results(search):
        response = requests.get(result.pdf_url, timeout=timeout)
        # Fail loudly on an HTTP error instead of handing an error page to
        # the PDF parser.
        response.raise_for_status()

        reader = PdfReader(BytesIO(response.content))
        # `or ""` guards against a page that yields no text object; join
        # avoids repeated string concatenation across many pages.
        full_text = "".join(
            f"{page.extract_text() or ''}\n" for page in reader.pages
        )

        papers.append({
            "title": result.title,
            "abstract": result.summary,
            "url": result.entry_id,
            "pdf_text": full_text
        })
    return papers

In [None]:
def index_papers(papers):
    """Embed each paper and upsert its vector into the Pinecone index.

    Each vector is stored under the paper's ``url`` as its id, with the
    paper's ``title`` as metadata.

    Args:
        papers: Iterable of dicts with the keys ``url``, ``title`` and
            ``pdf_text``; ``abstract`` is optional and used as a fallback.

    Papers whose ``pdf_text`` is blank (for example a PDF from which no text
    could be extracted) are embedded from their ``abstract`` instead. Papers
    with neither are skipped, since there is nothing meaningful to embed.
    """
    for paper in papers:
        text = paper['pdf_text'] or ""
        if not text.strip():
            # Extraction produced only whitespace; fall back to the abstract.
            text = paper.get('abstract') or ""
        if not text.strip():
            continue
        vector = embeddings_model.embed_query(text)
        index.upsert([(paper['url'], vector, {"title": paper['title']})])


In [53]:
# Ask for a research topic, fetch the matching arXiv papers and index them.
topic = input("Enter research topic: ").strip()
if not topic:
    # A blank topic gives the search nothing to match on; stop before making
    # any network calls.
    raise ValueError("Research topic must not be empty.")
papers = fetch_arxiv_papers(topic)
index_papers(papers)
print(f"Indexed {len(papers)} papers on '{topic}' in Pinecone!")


  for result in search.results():


Indexed 5 papers on 'blue flowers' in Pinecone!
