### App 1
The application allows users to enter a research topic, which triggers an agent to search arXiv, retrieve relevant papers, extract their content, convert them to embeddings, and store them in a vector database. Once indexing is complete, the system informs the user.


In [1]:
# %%capture --no-stderr
# !pip install --quiet -U langchain_openai langchain_core langchain langchain-community langchain_pinecone beautifulsoup4 requests pinecone tabulate
# !pip install arxiv
# !pip install PyPDF2


In [2]:
import os 
from dotenv import load_dotenv 
import pinecone 
from langchain_openai import AzureOpenAIEmbeddings, AzureChatOpenAI 
import langchain 
import arxiv 
import requests 
from PyPDF2 import PdfReader 
from io import BytesIO

In [3]:
# Load variables from a local .env file (if present) before the lookups below.
load_dotenv()

# Azure OpenAI credentials; each is None when the variable is not set.
AZURE_OPENAI_API_KEY = os.environ.get("AZURE_OPENAI_API_KEY")
AZURE_OPENAI_ENDPOINT = os.environ.get("AZURE_OPENAI_ENDPOINT")

# Pinecone settings; the index name falls back to "arxiv" when unset.
PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY")
PINECONE_ENV = os.environ.get("PINECONE_ENV")
PINECONE_INDEX_NAME = os.environ.get("PINECONE_INDEX_NAME", "arxiv")


In [4]:
# pip install pinecone
from pinecone import Pinecone

# Open the index named by PINECONE_INDEX_NAME (rather than a hard-coded name)
# so the handle used for upserts always matches the name reported below.
pc = Pinecone(api_key=PINECONE_API_KEY)
index = pc.Index(PINECONE_INDEX_NAME)
print("Connected to Pinecone index:", PINECONE_INDEX_NAME, index)

Connected to Pinecone index: arxiv <pinecone.db_data.index.Index object at 0x00000270180C5160>


In [5]:
# Chat model that research_qa() invokes to write answers.
llm = AzureChatOpenAI(azure_deployment="gpt-4.1", temperature=0.2, max_tokens=1000)

In [6]:
from langchain_openai import AzureOpenAIEmbeddings

# Embedding model shared by indexing (index_papers) and retrieval (the vector store).
# NOTE(review): dimensions=1536 presumably matches the Pinecone index dimension — confirm.
embeddings_model = AzureOpenAIEmbeddings(azure_deployment="text-embedding-3-small", dimensions=1536)

In [10]:
# # pip install arxiv
# import arxiv

# def fetch_arxiv_papers(query, max_results=5):
#     search = arxiv.Search(
#         query=query,
#         max_results=max_results,
#         sort_by=arxiv.SortCriterion.Relevance
#     )
#     papers = []
#     for result in search.results():
#         pdf_url = result.pdf_url 
#         response = requests.get(pdf_url)
#         pdf_file = BytesIO(response.content)
#         reader = PdfReader(pdf_file)
#         full_text = ""
#         for page in reader.pages:
#             full_text += page.extract_text() + "\n"

#         papers.append({
#             "title": result.title,
#             "abstract": result.summary,
#             "url": result.entry_id,
#             "pdf_text": full_text
#         })
#     return papers

def _extract_pdf_text(pdf_bytes):
    """Return the text of every page in a PDF, each page followed by a newline.

    Pages for which the reader yields no text (``None`` or ``""``) are skipped.
    """
    reader = PdfReader(BytesIO(pdf_bytes))
    page_texts = (page.extract_text() for page in reader.pages)
    return "".join(f"{page_text}\n" for page_text in page_texts if page_text)


def fetch_arxiv_papers(query, max_results=5):
    """Search arXiv for ``query`` and download the text of each matching paper.

    Args:
        query: Free-text search string sent to arXiv.
        max_results: Maximum number of search results to request.

    Returns:
        A list with one dict per paper whose PDF could be downloaded and parsed.
        Each dict has the keys ``id``, ``title``, ``abstract``, ``url``,
        ``pdf_text`` (text of all pages) and ``text`` (abstract followed by the
        PDF text, truncated to 2000 characters). Papers that fail to download
        or parse are skipped with a printed warning. A blank or ``None`` query
        returns an empty list without contacting arXiv.
    """
    # A blank topic (e.g. the user just pressed Enter) has nothing to search for.
    if not query or not query.strip():
        print("⚠️ Empty research topic — nothing to search for.")
        return []

    search = arxiv.Search(
        query=query,
        max_results=max_results,
        sort_by=arxiv.SortCriterion.Relevance
    )
    client = arxiv.Client()  # new API client

    papers = []

    for result in client.results(search):
        try:
            response = requests.get(result.pdf_url, timeout=20)
            response.raise_for_status()

            full_text = _extract_pdf_text(response.content)
            combined_text = (result.summary or "") + "\n\n" + full_text

            papers.append({
                "id": result.entry_id,
                "title": result.title,
                "abstract": result.summary,
                "url": result.entry_id,
                "pdf_text": full_text,
                "text": combined_text[:2000],
            })
        except Exception as e:
            # Deliberately broad: one unreadable or unreachable PDF should not
            # abort the whole batch, so report it and move on.
            print(f"⚠️ Skipping {result.title[:50]}... due to error: {e}")

    print(f"✅ Retrieved {len(papers)} papers for query '{query}'")
    return papers

In [None]:
# def index_papers(papers):
#     for paper in papers:
#         vector = embeddings_model.embed_query(paper['pdf_text'])
#         index.upsert([
#             (
#                 paper['url'],
#                 vector,
#                 {
#                     "title": paper['title'],
#                     "url": paper['url'],
#                     "text": paper['pdf_text'][:2000]  # optional: limit size
#                 }
#             )
#         ])

from tqdm import tqdm

def index_papers(papers, index, embeddings_model, batch_size=100):
    """
    Embed a list of papers and upsert them into a Pinecone index.
    No namespace used — everything goes into the default (empty) namespace.

    Args:
        papers: Iterable of dicts. Each paper needs 'pdf_text' or 'content';
            'id', 'url' and 'title' are optional and used for the record ID
            and metadata when present.
        index: Object exposing ``upsert(vectors=...)``.
        embeddings_model: Object exposing ``embed_query(text)``.
        batch_size: Maximum number of records sent per ``upsert`` call.

    Returns:
        The number of papers that were upserted (0 when none had usable text).

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    import hashlib  # local import: only needed for the fallback record ID

    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    max_chars = 3000  # truncate to avoid token limits and oversized metadata
    vectors_to_upsert = []

    for paper in tqdm(papers, desc="Indexing papers"):
        text = paper.get("pdf_text") or paper.get("content")
        if not text or not text.strip():
            print(f"⚠️ Skipping paper '{paper.get('title', 'Unknown')}' — no text found.")
            continue

        snippet = text[:max_chars]
        embedding = embeddings_model.embed_query(snippet)

        # `or` (rather than a .get default) also replaces explicit None values,
        # so the metadata never carries a null.
        metadata = {
            "title": paper.get("title") or "Unknown",
            "url": paper.get("url") or "",
            "text": snippet,   # ✅ this ensures retriever finds the 'text' key
        }

        # Each record needs a non-empty ID. Prefer the paper's own identifiers;
        # if none exist, derive a stable ID from the text so re-indexing the
        # same content overwrites instead of duplicating.
        paper_id = paper.get("id") or paper.get("url") or (paper.get("title") or "")[:50]
        if not paper_id:
            paper_id = hashlib.sha1(snippet.encode("utf-8")).hexdigest()

        vectors_to_upsert.append((paper_id, embedding, metadata))

    if not vectors_to_upsert:
        print("⚠️ No papers indexed.")
        return 0

    # Upsert in bounded batches so a large paper list does not become one
    # oversized request.
    for start in range(0, len(vectors_to_upsert), batch_size):
        index.upsert(vectors=vectors_to_upsert[start:start + batch_size])

    print(f"✅ Indexed {len(vectors_to_upsert)} papers in Pinecone (default namespace).")
    return len(vectors_to_upsert)



In [13]:
# topic = input("Enter research topic: ")
# papers = fetch_arxiv_papers(topic)
# index_papers(papers)
# print(f"Indexed {len(papers)} papers on '{topic}' in Pinecone!")

# Ask for a topic, fetch the matching papers, and index them. The success
# message is only printed when there was something to index, so an empty
# search result is not reported as a successful indexing run.
topic = input("Enter research topic: ").strip()
papers = fetch_arxiv_papers(topic)

if papers:
    index_papers(papers, index, embeddings_model)
    print(f"✅ Indexed {len(papers)} papers on '{topic}' in Pinecone!")
else:
    print(f"⚠️ No papers were retrieved for '{topic}', so nothing was indexed.")


✅ Retrieved 5 papers for query 'motor engines'


Indexing papers: 100%|██████████| 5/5 [00:01<00:00,  2.79it/s]


NameError: name 'namespace' is not defined

---------- End of App 1 --------------


## App 2
The application enables users to query the indexed knowledge base by entering research questions. The agent retrieves relevant papers using semantic search, generates informative responses grounded in those papers, and provides proper citations with links and metadata to the original sources.

In [None]:
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.messages import SystemMessage, HumanMessage
from langchain_openai import AzureChatOpenAI
from langchain_pinecone import Pinecone

In [None]:
# The name `Pinecone` is imported twice in this notebook (the SDK client in
# App 1 and the LangChain vector store in App 2), so which class it refers to
# depends on cell execution order. Import the vector store under its
# unambiguous name instead.
from langchain_pinecone import PineconeVectorStore

# Reuse PINECONE_INDEX_NAME (which defaults to "arxiv") so querying targets the
# configured index and a missing environment variable does not raise KeyError.
vectorstore = PineconeVectorStore(
    index_name=PINECONE_INDEX_NAME,
    embedding=embeddings_model,
)

In [None]:
# llm = AzureChatOpenAI(
#     azure_deployment="gpt-4.1",
#     temperature=0.2,
#     max_tokens=1000
#     )

In [None]:
# def retrieve_context(query, k=5):
#     docs = vectorstore.similarity_search(query, k=k)
#     context_text = "\n\n".join([d.page_content for d in docs])
#     sources = [
#         f"- [{d.metadata.get('title', 'Untitled')}]({d.metadata.get('url', 'Unknown URL')})"
#         for d in docs
#     ]
#     return context_text, sources

def retrieve_context(query, k=5):
    """Fetch the ``k`` most similar documents and build the prompt context.

    Returns a ``(context_text, sources)`` tuple: the usable document texts
    joined by blank lines, and one markdown bullet link per usable document.
    Documents with neither ``page_content`` nor a ``text`` metadata entry are
    left out of both.
    """
    excerpts, sources = [], []

    for doc in vectorstore.similarity_search(query, k=k):
        meta = doc.metadata
        # Prefer the document body; fall back to the text stored in metadata.
        body = doc.page_content if doc.page_content else meta.get("text", "")
        if not body:
            continue

        excerpts.append(body)
        title = meta.get('title', 'Untitled')
        link = meta.get('url', 'Unknown URL')
        sources.append(f"- [{title}]({link})")

    return "\n\n".join(excerpts), sources


In [None]:
# --- Main research Q&A function ---
def research_qa(query, k=5):
    """Answer a research question from the indexed papers and print the result.

    Retrieves up to ``k`` documents with ``retrieve_context``, asks the chat
    model to answer using only that context, then prints the answer followed
    by the source links.

    Args:
        query: The user's research question.
        k: Number of documents to retrieve.

    Returns:
        None. Output is printed. A blank question, or a question for which no
        context is retrieved, prints a notice and returns without calling the
        model.
    """
    # A blank question has nothing to embed or search for.
    if not query or not query.strip():
        print("⚠️ Please enter a research question.")
        return

    context, sources = retrieve_context(query, k=k)

    # With no excerpts there is nothing to ground an answer in, so say so
    # directly instead of letting the model answer without sources.
    if not context.strip():
        print("\nAnswer:")
        print("No relevant papers were found in the index for this question.")
        return

    # Create a structured prompt
    system_prompt = (
        "You are a helpful AI research assistant. "
        "Use the provided academic paper excerpts to answer the user's question clearly and concisely. "
        "Cite your sources at the end in markdown link format like [Title](URL). "
        "If the answer cannot be found in the papers, say so explicitly."
    )

    user_prompt = f"Question:\n{query}\n\nContext:\n{context}"

    messages = [
        SystemMessage(content=system_prompt),
        HumanMessage(content=user_prompt)
    ]

    # Invoke the model
    response = llm.invoke(messages)

    # Print formatted output
    print("\nAnswer:")
    print(response.content)
    print("\nSources:")
    for s in sources:
        print(s)

In [None]:
# Prompt for a question and hand it to research_qa(), which prints the
# answer and its sources (it returns nothing to display).
question = input("Enter your research question: ")
research_qa(question)

In [None]:
# results = vectorstore.similarity_search("sun", k=3)
# for i, doc in enumerate(results, 1):
#     print(f"\nDoc {i}: {doc.metadata.get('title')}")
#     print(doc.page_content or doc.metadata.get("text", "")[:300])


In [None]:
<!-- https://colab.research.google.com/drive/1iA6VqdRi1RPirf3PtLfPT74UJiHdPKxr?usp=sharing#scrollTo=ApA0U6w5v8er -->
<!-- https://colab.research.google.com/drive/1-o4mnBbFtTXfaP-2FaG3X3F6q5zxRUbW?usp=sharing#scrollTo=tD4t0206sPgC -->


to do:
Part-2: Agentic Research Assistant

Extend Part-1 by merging the two separate applications into a single unified system where an LLM-driven agent intelligently decides whether the user is in an indexing phase or a query phase based on user intent. The agent should manage state transitions explicitly—informing the user when indexing is complete and they can begin asking questions, or detecting when a user wants to start a new research topic. The system maintains conversational context and guides the user through natural transition points with clear communication about what can be done next, creating a seamless research workflow without requiring separate interfaces.