In [1]:
# Basic imports
import requests
import json
import xml.etree.ElementTree as ET
from datetime import datetime, timedelta
import arxiv
from pprint import pprint

print("All imports successful!")
print(f"Current time: {datetime.now()}")

All imports successful!
Current time: 2025-09-20 14:28:04.504664


In [2]:
# Make a raw HTTP request to arXiv API first
base_url = "http://export.arxiv.org/api/query"

# Simple search for AI papers
params = {
    'search_query': 'cat:cs.AI',
    'start': 0,
    'max_results': 3,
    'sortBy': 'submittedDate',
    'sortOrder': 'descending'
}

print("Making request to arXiv API...")
print(f"URL: {base_url}")
print(f"Parameters: {params}")

response = requests.get(base_url, params=params)

print(f"\nResponse status: {response.status_code}")
print(f"Response headers: {dict(response.headers)}")
print(f"Response length: {len(response.content)} bytes")

Making request to arXiv API...
URL: http://export.arxiv.org/api/query
Parameters: {'search_query': 'cat:cs.AI', 'start': 0, 'max_results': 3, 'sortBy': 'submittedDate', 'sortOrder': 'descending'}

Response status: 200
Response headers: {'Connection': 'keep-alive', 'Content-Length': '2906', 'content-encoding': 'gzip', 'server': 'Apache', 'content-type': 'application/atom+xml; charset=UTF-8', 'via': '1.1 varnish, 1.1 varnish, 1.1 varnish', 'access-control-allow-origin': '*', 'Accept-Ranges': 'bytes', 'Age': '0', 'Date': 'Sat, 20 Sep 2025 13:28:29 GMT', 'X-Served-By': 'cache-lga21958-LGA, cache-lga21971-LGA, cache-man4136-MAN', 'X-Cache': 'MISS, MISS, MISS', 'X-Cache-Hits': '0, 0, 0', 'X-Timer': 'S1758374909.921910,VS0,VE276', 'Vary': 'Accept-Encoding', 'Strict-Transport-Security': 'max-age=300'}
Response length: 8883 bytes


In [3]:
# Look at the raw XML response
print("Raw XML response (first 1000 characters):")
print(response.text[:1000])
print("\n" + "="*50 + "\n")

# Parse the XML
root = ET.fromstring(response.content)
print(f"Root tag: {root.tag}")
print(f"Root attributes: {root.attrib}")

# Show the structure
print("\nXML structure:")
for i, child in enumerate(root):
    print(f"  {i}: {child.tag}")
    if i > 10:  # Don't print too many
        print("  ...")
        break

Raw XML response (first 1000 characters):
<?xml version="1.0" encoding="UTF-8"?>
<feed xmlns="http://www.w3.org/2005/Atom">
  <link href="http://arxiv.org/api/query?search_query%3Dcat%3Acs.AI%26id_list%3D%26start%3D0%26max_results%3D3" rel="self" type="application/atom+xml"/>
  <title type="html">ArXiv Query: search_query=cat:cs.AI&amp;id_list=&amp;start=0&amp;max_results=3</title>
  <id>http://arxiv.org/api/R0SPnx8l8tIxrHMPwcpdin6AZn8</id>
  <updated>2025-09-20T00:00:00-04:00</updated>
  <opensearch:totalResults xmlns:opensearch="http://a9.com/-/spec/opensearch/1.1/">142084</opensearch:totalResults>
  <opensearch:startIndex xmlns:opensearch="http://a9.com/-/spec/opensearch/1.1/">0</opensearch:startIndex>
  <opensearch:itemsPerPage xmlns:opensearch="http://a9.com/-/spec/opensearch/1.1/">3</opensearch:itemsPerPage>
  <entry>
    <id>http://arxiv.org/abs/2509.15217v1</id>
    <updated>2025-09-18T17:59:11Z</updated>
    <published>2025-09-18T17:59:11Z</published>
    <title>Generalizable 

In [4]:
# Manual XML parsing to understand the structure
namespaces = {
    'atom': 'http://www.w3.org/2005/Atom',
    'arxiv': 'http://arxiv.org/schemas/atom',
    'opensearch': 'http://a9.com/-/spec/opensearch/1.1/'
}

# Get total results
total_results = root.find('opensearch:totalResults', namespaces)
if total_results is not None:
    print(f"Total results available: {total_results.text}")

# Extract paper entries
entries = root.findall('atom:entry', namespaces)
print(f"Number of papers returned: {len(entries)}")

# Parse first paper in detail
if entries:
    first_paper = entries[0]
    print("\n" + "="*50)
    print("FIRST PAPER DETAILS:")
    print("="*50)
    
    # Extract all the key fields
    paper_data = {}
    
    # ID and URLs
    id_elem = first_paper.find('atom:id', namespaces)
    paper_data['id'] = id_elem.text if id_elem is not None else "N/A"
    
    # Title
    title_elem = first_paper.find('atom:title', namespaces)
    paper_data['title'] = title_elem.text.strip() if title_elem is not None else "N/A"
    
    # Summary (abstract)
    summary_elem = first_paper.find('atom:summary', namespaces)
    paper_data['summary'] = summary_elem.text.strip() if summary_elem is not None else "N/A"
    
    # Published date
    published_elem = first_paper.find('atom:published', namespaces)
    paper_data['published'] = published_elem.text if published_elem is not None else "N/A"
    
    # Authors
    authors = []
    for author in first_paper.findall('atom:author', namespaces):
        name_elem = author.find('atom:name', namespaces)
        if name_elem is not None:
            authors.append(name_elem.text)
    paper_data['authors'] = authors
    
    # Categories
    categories = []
    for category in first_paper.findall('atom:category', namespaces):
        term = category.get('term')
        if term:
            categories.append(term)
    paper_data['categories'] = categories
    
    # Links (especially PDF)
    links = {}
    for link in first_paper.findall('atom:link', namespaces):
        rel = link.get('rel', 'alternate')
        title = link.get('title', rel)
        href = link.get('href')
        links[title] = href
    paper_data['links'] = links
    
    # Print extracted data
    print(json.dumps(paper_data, indent=2))

Total results available: 142084
Number of papers returned: 3

FIRST PAPER DETAILS:
{
  "id": "http://arxiv.org/abs/2509.15217v1",
  "title": "Generalizable Geometric Image Caption Synthesis",
  "summary": "Multimodal large language models have various practical applications that\ndemand strong reasoning abilities. Despite recent advancements, these models\nstill struggle to solve complex geometric problems. A key challenge stems from\nthe lack of high-quality image-text pair datasets for understanding geometric\nimages. Furthermore, most template-based data synthesis pipelines typically\nfail to generalize to questions beyond their predefined templates. In this\npaper, we bridge this gap by introducing a complementary process of\nReinforcement Learning with Verifiable Rewards (RLVR) into the data generation\npipeline. By adopting RLVR to refine captions for geometric images synthesized\nfrom 50 basic geometric relations and using reward signals derived from\nmathematical problem-solvin

In [5]:
# Now let's use the arxiv library to do the same thing
print("Using arxiv library for the same search...")

# Create a search
search = arxiv.Search(
    query="cat:cs.AI",
    max_results=3,
    sort_by=arxiv.SortCriterion.SubmittedDate,
    sort_order=arxiv.SortOrder.Descending
)

print("Fetching results...")
results = list(search.results())

print(f"Got {len(results)} results")
print("\n" + "="*50)
print("FIRST PAPER USING ARXIV LIBRARY:")
print("="*50)

if results:
    paper = results[0]
    
    # Create clean dictionary
    paper_dict = {
        'arxiv_id': paper.entry_id.split('/')[-1],  # Extract ID from URL
        'title': paper.title,
        'summary': paper.summary,
        'published': paper.published.isoformat(),
        'updated': paper.updated.isoformat() if paper.updated else None,
        'authors': [str(author) for author in paper.authors],
        'primary_category': paper.primary_category,
        'categories': paper.categories,
        'pdf_url': paper.pdf_url,
        'comment': paper.comment,
        'journal_ref': paper.journal_ref,
        'doi': paper.doi
    }
    
    print(json.dumps(paper_dict, indent=2))

Using arxiv library for the same search...
Fetching results...


  results = list(search.results())


Got 3 results

FIRST PAPER USING ARXIV LIBRARY:
{
  "arxiv_id": "2509.15217v1",
  "title": "Generalizable Geometric Image Caption Synthesis",
  "summary": "Multimodal large language models have various practical applications that\ndemand strong reasoning abilities. Despite recent advancements, these models\nstill struggle to solve complex geometric problems. A key challenge stems from\nthe lack of high-quality image-text pair datasets for understanding geometric\nimages. Furthermore, most template-based data synthesis pipelines typically\nfail to generalize to questions beyond their predefined templates. In this\npaper, we bridge this gap by introducing a complementary process of\nReinforcement Learning with Verifiable Rewards (RLVR) into the data generation\npipeline. By adopting RLVR to refine captions for geometric images synthesized\nfrom 50 basic geometric relations and using reward signals derived from\nmathematical problem-solving tasks, our pipeline successfully captures the ke

In [6]:
# Let's try different types of searches
search_queries = [
    {
        'name': 'Recent AI Papers',
        'query': 'cat:cs.AI',
        'description': 'All AI papers'
    },
    {
        'name': 'Transformer Papers',
        'query': 'all:transformer',
        'description': 'Papers mentioning "transformer"'
    },
    {
        'name': 'Recent ML + Attention',
        'query': 'cat:cs.LG AND all:attention',
        'description': 'ML papers about attention'
    },
    {
        'name': 'Recent Papers by Date',
        'query': f'cat:cs.AI AND submittedDate:[{(datetime.now() - timedelta(days=7)).strftime("%Y%m%d")} TO *]',
        'description': 'AI papers from last 7 days'
    }
]

for search_config in search_queries:
    print(f"\n{'='*60}")
    print(f"SEARCH: {search_config['name']}")
    print(f"DESCRIPTION: {search_config['description']}")
    print(f"QUERY: {search_config['query']}")
    print('='*60)
    
    try:
        search = arxiv.Search(
            query=search_config['query'],
            max_results=2,
            sort_by=arxiv.SortCriterion.SubmittedDate,
            sort_order=arxiv.SortOrder.Descending
        )
        
        results = list(search.results())
        print(f"Found {len(results)} results")
        
        for i, paper in enumerate(results):
            print(f"\n{i+1}. {paper.title}")
            print(f"   Authors: {', '.join(str(author) for author in paper.authors[:3])}")
            print(f"   Published: {paper.published.strftime('%Y-%m-%d')}")
            print(f"   Categories: {', '.join(paper.categories[:3])}")
            print(f"   Abstract (first 100 chars): {paper.summary[:100]}...")
            
    except Exception as e:
        print(f"Error with search: {e}")


SEARCH: Recent AI Papers
DESCRIPTION: All AI papers
QUERY: cat:cs.AI


  results = list(search.results())


Found 2 results

1. Generalizable Geometric Image Caption Synthesis
   Authors: Yue Xin, Wenyuan Wang, Rui Pan
   Published: 2025-09-18
   Categories: cs.AI, cs.CV, cs.LG
   Abstract (first 100 chars): Multimodal large language models have various practical applications that
demand strong reasoning ab...

2. Explicit Context-Driven Neural Acoustic Modeling for High-Fidelity RIR Generation
   Authors: Chen Si, Qianyi Wu, Chaitanya Amballa
   Published: 2025-09-18
   Categories: cs.SD, cs.AI, cs.LG
   Abstract (first 100 chars): Realistic sound simulation plays a critical role in many applications. A key
element in sound simula...

SEARCH: Transformer Papers
DESCRIPTION: Papers mentioning "transformer"
QUERY: all:transformer
Found 2 results

1. Geometric Image Synchronization with Deep Watermarking
   Authors: Pierre Fernandez, Tomáš Souček, Nikola Jovanović
   Published: 2025-09-18
   Categories: cs.CV
   Abstract (first 100 chars): Synchronization is the task of estimating and invertin

In [7]:
def search_arxiv_papers(query, max_results=5, sort_by='submittedDate'):
    """
    Helper function to search arXiv and return clean data
    """
    sort_criterion = {
        'submittedDate': arxiv.SortCriterion.SubmittedDate,
        'relevance': arxiv.SortCriterion.Relevance,
        'lastUpdatedDate': arxiv.SortCriterion.LastUpdatedDate
    }.get(sort_by, arxiv.SortCriterion.SubmittedDate)
    
    search = arxiv.Search(
        query=query,
        max_results=max_results,
        sort_by=sort_criterion,
        sort_order=arxiv.SortOrder.Descending
    )
    
    papers = []
    for result in search.results():
        paper = {
            'arxiv_id': result.entry_id.split('/')[-1],
            'title': result.title,
            'summary': result.summary,
            'published': result.published.isoformat(),
            'authors': [str(author) for author in result.authors],
            'categories': result.categories,
            'pdf_url': result.pdf_url
        }
        papers.append(paper)
    
    return papers

def print_papers_summary(papers):
    """
    Pretty print papers summary
    """
    print(f"Found {len(papers)} papers:")
    print("-" * 80)
    
    for i, paper in enumerate(papers, 1):
        print(f"{i}. {paper['title']}")
        print(f"   ID: {paper['arxiv_id']}")
        print(f"   Authors: {', '.join(paper['authors'][:2])}")
        print(f"   Published: {paper['published'][:10]}")
        print(f"   Categories: {', '.join(paper['categories'][:3])}")
        print()

# Test the helper functions
print("Testing helper functions...")
papers = search_arxiv_papers("cat:cs.AI", max_results=3)
print_papers_summary(papers)

# Save to JSON file for later use
with open('../data/sample_papers.json', 'w') as f:
    json.dump(papers, f, indent=2)
    
print(f"\nSaved {len(papers)} papers to data/sample_papers.json")

Testing helper functions...


  for result in search.results():


Found 3 papers:
--------------------------------------------------------------------------------
1. Generalizable Geometric Image Caption Synthesis
   ID: 2509.15217v1
   Authors: Yue Xin, Wenyuan Wang
   Published: 2025-09-18
   Categories: cs.AI, cs.CV, cs.LG

2. Explicit Context-Driven Neural Acoustic Modeling for High-Fidelity RIR Generation
   ID: 2509.15210v1
   Authors: Chen Si, Qianyi Wu
   Published: 2025-09-18
   Categories: cs.SD, cs.AI, cs.LG

3. FlowRL: Matching Reward Distributions for LLM Reasoning
   ID: 2509.15207v1
   Authors: Xuekai Zhu, Daixuan Cheng
   Published: 2025-09-18
   Categories: cs.LG, cs.AI, cs.CL


Saved 3 papers to data/sample_papers.json


In [8]:
import time

def safe_arxiv_search(query, max_results=5, delay=1):
    """
    Search with error handling and rate limiting
    """
    try:
        print(f"Searching for: {query}")
        print(f"Adding {delay}s delay to respect rate limits...")
        time.sleep(delay)
        
        search = arxiv.Search(
            query=query,
            max_results=max_results,
            sort_by=arxiv.SortCriterion.SubmittedDate,
            sort_order=arxiv.SortOrder.Descending
        )
        
        results = list(search.results())
        print(f"✅ Successfully retrieved {len(results)} papers")
        return results
        
    except Exception as e:
        print(f"❌ Error occurred: {e}")
        return []

# Test error handling
test_queries = [
    "cat:cs.AI",
    "invalid:query:format",  # This might cause an error
    "cat:cs.LG AND all:neural"
]

for query in test_queries:
    papers = safe_arxiv_search(query, max_results=2, delay=1)
    print(f"Query '{query}' returned {len(papers)} papers\n")

Searching for: cat:cs.AI
Adding 1s delay to respect rate limits...


  results = list(search.results())


✅ Successfully retrieved 2 papers
Query 'cat:cs.AI' returned 2 papers

Searching for: invalid:query:format
Adding 1s delay to respect rate limits...
✅ Successfully retrieved 0 papers
Query 'invalid:query:format' returned 0 papers

Searching for: cat:cs.LG AND all:neural
Adding 1s delay to respect rate limits...
✅ Successfully retrieved 2 papers
Query 'cat:cs.LG AND all:neural' returned 2 papers



In [9]:
query

'cat:cs.LG AND all:neural'

# Where to search for your keywords:

* `all:transformer`     # Search title, abstract, comments, AND author names
* `ti:attention`        # Search ONLY in title  
* `abs:neural`          # Search ONLY in abstract
* `co:preliminary`      # Search ONLY in comments
* `au:lecun`           # Search ONLY in author names