In [1]:
import os
import arxiv
from dotenv import load_dotenv
from langchain_groq import ChatGroq
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import JsonOutputParser

In [2]:
# Load variables from a local .env file (if present) into the process
# environment, then read the Groq API key from the environment.
load_dotenv()
GROQ_API_KEY = os.getenv("GROQ_API_KEY")

# Fail loudly when the key is missing or empty. `exit()` is an
# interactive-shell helper and is not a reliable way to stop a notebook;
# raising halts this cell (and a "Run All") with a visible traceback instead
# of letting later cells fail with a less obvious authentication error.
if not GROQ_API_KEY:
    raise RuntimeError(
        "GROQ_API_KEY not found: set it in the environment or in a .env file"
    )

In [3]:
# Reaching this cell means the imports and the API-key check above succeeded.
print("--- Step 1: Environment Setup Complete ---\n")

# 2. Define our Goal
# QUERY is the free-text search term (also passed to the LLM prompt later);
# MAX_RESULTS caps how many papers are requested from ArXiv.
QUERY: str = "Agents"
MAX_RESULTS: int = 5

--- Step 1: Environment Setup Complete ---



In [4]:
# 3. Search ArXiv (The Raw Way)
# The 'arxiv' package is used directly: one search description plus one
# client. Nothing is fetched here; results are pulled when the client
# iterates over the search in the next cell.
print(f"--- Step 2: Searching ArXiv for '{QUERY}' ---")

search = arxiv.Search(
    query=QUERY,
    max_results=MAX_RESULTS,
    sort_by=arxiv.SortCriterion.SubmittedDate,  # sort key: submission date
)
client = arxiv.Client()

--- Step 2: Searching ArXiv for 'Agents' ---


In [5]:
# Keep each search hit as a plain dict in a list so later cells can refer to
# papers by their position (index) in `found_papers`.
found_papers = []
for entry in client.results(search):
    paper = {
        "title": entry.title,
        "summary": entry.summary,
        "url": entry.entry_id,
        # Publication date reduced to an ISO-style YYYY-MM-DD string.
        "published": entry.published.strftime("%Y-%m-%d"),
    }
    found_papers.append(paper)
    # Echo progress as each hit is collected.
    print(f"Found: {entry.title}")

paper_total = len(found_papers)
print(f"\nTotal papers found: {paper_total}\n")

Found: Point Bridge: 3D Representations for Cross Domain Policy Learning
Found: LLM-in-Sandbox Elicits General Agentic Intelligence
Found: Average Unfairness in Routing Games
Found: Delayed Assignments in Online Non-Centroid Clustering with Stochastic Arrivals
Found: Controlling Long-Horizon Behavior in Language Model Agents with Explicit State Dynamics

Total papers found: 5



In [6]:
# 4. Filter with AI (The Logic)
print("--- Step 3: AI Filtering ---")

# Settings for the Groq-hosted chat model that performs the relevance filter.
llm_settings = {
    "model": "llama-3.3-70b-versatile",
    "api_key": GROQ_API_KEY,
    "temperature": 0,
}
llm = ChatGroq(**llm_settings)

--- Step 3: AI Filtering ---


In [7]:
# Build the prompt that tells the AI what to do: a fixed system role plus a
# user message with {query} and {papers} placeholders. The doubled braces in
# the example line are there so the JSON sample is not read as a placeholder.
system_message = "You are a research assistant. Select papers relevant to the user's query."
user_message = """
    User Query: {query}
    
    Here are the papers:
    {papers}
    
    Return ONLY a JSON object with a key 'indices' containing the indices (0-based numbers) of the papers that are relevant.
    Example: {{ "indices": [0, 2] }}
    """
prompt = ChatPromptTemplate.from_messages([
    ("system", system_message),
    ("user", user_message),
])

In [None]:
# Text block the AI reads: one entry per paper carrying its list index, its
# title, and the first 200 characters of its summary followed by "...".
paper_text = "".join(
    f"Index {position}:\nTitle: {entry['title']}\nSummary: {entry['summary'][:200]}...\n\n"
    for position, entry in enumerate(found_papers)
)

In [11]:
# Display the assembled text that is passed to the chain as the "papers" input.
paper_text

'Index 0:\nTitle: Point Bridge: 3D Representations for Cross Domain Policy Learning\nSummary: Robot foundation models are beginning to deliver on the promise of generalist robotic agents, yet progress remains constrained by the scarcity of large-scale real-world manipulation datasets. Simulati...\n\nIndex 1:\nTitle: LLM-in-Sandbox Elicits General Agentic Intelligence\nSummary: We introduce LLM-in-Sandbox, enabling LLMs to explore within a code sandbox (i.e., a virtual computer), to elicit general intelligence in non-code domains. We first demonstrate that strong LLMs, witho...\n\nIndex 2:\nTitle: Average Unfairness in Routing Games\nSummary: We propose average unfairness as a new measure of fairness in routing games, defined as the ratio between the average latency and the minimum latency experienced by users. This measure is a natural co...\n\nIndex 3:\nTitle: Delayed Assignments in Online Non-Centroid Clustering with Stochastic Arrivals\nSummary: Clustering is a fundamental problem, 

In [9]:
# Connect the pieces: Prompt -> Model -> JSON Parser
# The parser is the last stage, so invoking the chain yields parsed JSON
# rather than the raw model message.
json_parser = JsonOutputParser()
chain = prompt | llm | json_parser

In [10]:
# Run the chain once, supplying values for the prompt's {query} and {papers}
# placeholders.
chain_inputs = {"query": QUERY, "papers": paper_text}
result = chain.invoke(chain_inputs)

In [12]:
# Display the parsed response returned by the chain.
result

{'indices': [0, 1, 3, 4]}

In [14]:
# Extract the chosen paper indices from the parsed LLM reply, defensively.
# The reply is model-generated JSON, so it may not be a dict, its 'indices'
# entry may not be a list, and individual entries may not be integers. Any
# of those would otherwise raise here or in the later range check.
raw_indices = result.get("indices", []) if isinstance(result, dict) else []
if not isinstance(raw_indices, list):
    raw_indices = []

# Keep only genuine integers; bool is a subclass of int, so exclude it.
selected_indices = [
    idx for idx in raw_indices
    if isinstance(idx, int) and not isinstance(idx, bool)
]
selected_indices

[0, 1, 3, 4]

In [None]:
# 5. Show Final Results

In [15]:
print("--- Final Selected Papers ---")
paper_count = len(found_papers)
for idx in selected_indices:
    # Skip any index that does not point at one of the papers found.
    if not (0 <= idx < paper_count):
        continue
    selected_paper = found_papers[idx]
    # One print call with a newline separator emits the same four lines.
    print(
        f"Title: {selected_paper['title']}",
        f"Summary: {selected_paper['summary']}",
        f"URL:   {selected_paper['url']}",
        "---",
        sep="\n",
    )

--- Final Selected Papers ---
Title: Point Bridge: 3D Representations for Cross Domain Policy Learning
Summary: Robot foundation models are beginning to deliver on the promise of generalist robotic agents, yet progress remains constrained by the scarcity of large-scale real-world manipulation datasets. Simulation and synthetic data generation offer a scalable alternative, but their usefulness is limited by the visual domain gap between simulation and reality. In this work, we present Point Bridge, a framework that leverages unified, domain-agnostic point-based representations to unlock synthetic datasets for zero-shot sim-to-real policy transfer, without explicit visual or object-level alignment. Point Bridge combines automated point-based representation extraction via Vision-Language Models (VLMs), transformer-based policy learning, and efficient inference-time pipelines to train capable real-world manipulation agents using only synthetic data. With additional co-training on small set