In [1]:
#import liburl
#url = 'https://arxiv.org/find/grp_physics/1/au:+Buonanno/0/1/0/all/0/1?skip=0&query_id=55370787705b21b4'

#https://arxiv.org/find/grp_physics/1/au:+Buonanno/0/1/0/all/0/1?skip=25&query_id=fc4561a224cecdcf


#https://github.com/angusleigh/arXiv_scraper



import urllib
import feedparser

# Base api query url
base_url = 'https://arxiv.org/multi?group=grp_physics';

# Search parameters
search_query = 'au:+Buonanno' # search for electron in all fields
start = 0                     # retreive the first 5 results
max_results = 3

query = 'search_query=%s&start=%i&max_results=%i' % (search_query,
                                                     start,
                                                     max_results)

# Opensearch metadata such as totalResults, startIndex, 
# and itemsPerPage live in the opensearch namespase.
# Some entry metadata lives in the arXiv namespace.
# This is a hack to expose both of these namespaces in
# feedparser v4.1
feedparser._FeedParserMixin.namespaces['http://a9.com/-/spec/opensearch/1.1/'] = 'opensearch'
feedparser._FeedParserMixin.namespaces['http://arxiv.org/schemas/atom'] = 'arxiv'

# perform a GET request using the base_url and query
response = urllib.request.urlopen(base_url+query).read()
 
# parse the response using feedparser
feed = feedparser.parse(response)
print(feed.entries)


# Run through each entry, and print out information
for entry in feed.entries:
    print( 'e-print metadata')
    print( 'arxiv-id: %s' % entry.id.split('/abs/')[-1])
    print( 'Published: %s' % entry.published)
    print( 'Title:  %s' % entry.title)

# print out feed information
#print( 'Feed title: %s' % feed.feed.title)
#print( 'Feed last updated: %s' % feed.feed.updated)

[]


In [26]:
import urllib
url = 'http://export.arxiv.org/api/query?search_query=all:electron&start=0&max_results=1'
data = urllib.request.urlopen(url).read()
print( data)

b'<?xml version="1.0" encoding="UTF-8"?>\n<feed xmlns="http://www.w3.org/2005/Atom">\n  <link href="http://arxiv.org/api/query?search_query%3Dall%3Aelectron%26id_list%3D%26start%3D0%26max_results%3D1" rel="self" type="application/atom+xml"/>\n  <title type="html">ArXiv Query: search_query=all:electron&amp;id_list=&amp;start=0&amp;max_results=1</title>\n  <id>http://arxiv.org/api/cHxbiOdZaP56ODnBPIenZhzg5f8</id>\n  <updated>2018-07-05T00:00:00-04:00</updated>\n  <opensearch:totalResults xmlns:opensearch="http://a9.com/-/spec/opensearch/1.1/">135244</opensearch:totalResults>\n  <opensearch:startIndex xmlns:opensearch="http://a9.com/-/spec/opensearch/1.1/">0</opensearch:startIndex>\n  <opensearch:itemsPerPage xmlns:opensearch="http://a9.com/-/spec/opensearch/1.1/">1</opensearch:itemsPerPage>\n  <entry>\n    <id>http://arxiv.org/abs/cond-mat/0102536v1</id>\n    <updated>2001-02-28T20:12:09Z</updated>\n    <published>2001-02-28T20:12:09Z</published>\n    <title>Impact of Electron-Electron C

In [19]:
"""
arXiv.py

This is a modification of the arXiv parser here:
http://arxiv.org/help/api/examples/python_arXiv_parsing_example.txt

I changed it to query the server to retrieve all abstracts from articles
of a given category. This does not retrieve the pdf's of the articles 
(arXiv does not like that, see: http://arxiv.org/help/robots).

I used it to create the dataset for our machine learning class's Kaggle competition: 
http://inclass.kaggle.com/c/abstract-classification

Command line usage is: arXiv.py category max_results
Example: arXiv.py stat 5000

This is free software.  Feel free to do what you want
with it, but please play nice with the arXiv API!
"""

import sys
import time
import urllib
import feedparser
import csv
import os

# Parse input args
if len(sys.argv) != 3:
    print( "Usage: python arXiv.py category max_results")
    print( "Example: python arXiv.py stat 5000")
    print( "Exiting")
    exit()
category = "physics"
max_total_results = 3

# Open csv file to write abstracts to
#ofile  = open(os.path.dirname("results") + category + '.csv', "wb")
#writer = csv.writer(ofile, delimiter=',', quotechar='"', quoting=csv.QUOTE_ALL) 
#writer.writerow(['category', 'abstract'])

# Base api query url
base_url = 'http://export.arxiv.org/api/query?'
#http://export.arxiv.org/api/query?search_query=all:A.%20Buonanno
# Search parameters
search_query = 'all:A. Buonanno' 

# Opensearch metadata such as totalResults, startIndex, 
# and itemsPerPage live in the opensearch namespase.
# Some entry metadata lives in the arXiv namespace.
# This is a hack to expose both of these namespaces in
# feedparser v4.1
feedparser._FeedParserMixin.namespaces['http://a9.com/-/spec/opensearch/1.1/'] = 'opensearch'
feedparser._FeedParserMixin.namespaces['http://arxiv.org/schemas/atom'] = 'arxiv'

# Iterating so query is broken down into multiple smaller queries
# with time delays in between so their server doesn't hang up on us
results_per_iter = 500
for i in range(0,max_total_results,results_per_iter):    
    # Search parameters
    start = i                     
    max_results = results_per_iter
    query = 'search_query=%s&start=%i&max_results=%i' % (search_query,start,max_results)

    # Repeat GET requests if there's a <Connection timed out> or some other kind of excpetion
    excepetion_count = 0
    successful_response = False
    while successful_response == False:
        try:
            # Perform a GET request using the base_url and query
            response = urllib.request.urlopen(base_url+query).read()
            successful_response = True
        except IOError:
            print( 'IOError exception. Likely a connection time out. Trying again in 20s.')
            time.sleep(20)
            excepetion_count += 1
            if excepetion_count > 10:
                print( 'Too many IOError exceptions. They have likely hung up for good. Stopping' )
                exit()

    # Parse the response using feedparser
    feed = feedparser.parse(response)
    #print(feed)
    # Print opensearch metadata
    if i == 0:
          print('Total results for this query: %s' % feed.feed.opensearch_totalresults)
    print( 'Recieved: %d results so far' % (i+results_per_iter))

    # Run through each entry, and save the abstract and category to a csv file
    for entry in feed.entries:
        print(entry.title)
        print(entry.author)
        print(entry)
        print(20*'=')
        #raw_abstract = entry.summary
        #abstract = ' '.join(raw_abstract.splitlines())
        #writer.writerow([category, abstract])
        #print([category, abstract])
    print( type(feed.feed.opensearch_itemsperpage))
    print( feed.feed.opensearch_itemsperpage)

    # Need additional stopping condition in case the results available < max_total_results
    #if feed.feed.opensearch_itemsperpage < max_results:
    #    break
    
    # Time delay so we don't overload their server, causing them to hang up on us
    time.sleep(5)    

print( 'Finished!')
#ofile.close()

{'feed': {'links': [{'href': 'http://arxiv.org/api/query?search_query%3Dall%3AA.%26id_list%3D%26start%3D0%26max_results%3D10', 'rel': 'self', 'type': 'application/atom+xml'}], 'title': 'ArXiv Query: search_query=all:A.&id_list=&start=0&max_results=10', 'title_detail': {'type': 'text/html', 'language': None, 'base': '', 'value': 'ArXiv Query: search_query=all:A.&id_list=&start=0&max_results=10'}, 'id': 'http://arxiv.org/api/ID1KGYA9B3Mf0hrtYOkHTz8jLgk', 'guidislink': True, 'link': 'http://arxiv.org/api/ID1KGYA9B3Mf0hrtYOkHTz8jLgk', 'updated': '2018-07-06T00:00:00-04:00', 'updated_parsed': time.struct_time(tm_year=2018, tm_mon=7, tm_mday=6, tm_hour=4, tm_min=0, tm_sec=0, tm_wday=4, tm_yday=187, tm_isdst=0), 'opensearch_totalresults': '1300400', 'opensearch_startindex': '0', 'opensearch_itemsperpage': '10'}, 'entries': [{'id': 'http://arxiv.org/abs/physics/0110044v1', 'guidislink': True, 'link': 'http://arxiv.org/abs/physics/0110044v1', 'updated': '2001-10-15T14:58:41Z', 'updated_parsed':

In [9]:
#https://github.com/lukasschwab/arxiv.py

from urllib.parse import quote_plus
from urllib.parse import urlencode
from urllib.request import urlretrieve
import feedparser

root_url = 'http://export.arxiv.org/api/'

def query(search_query="A. Buonanno", 
         id_list=[], 
         prune=True, 
         start=0, 
         max_results=10000, 
         sort_by="relevance", 
         sort_order="descending"):
    url_args = urlencode({"search_query": search_query, 
                          "id_list": ','.join(id_list),
                          "start": start,
                          "max_results": max_results,
                          "sortBy": sort_by,
                          "sortOrder": sort_order})        
    results = feedparser.parse(root_url + 'query?' + url_args)
    if results.get('status') != 200:
        # TODO: better error reporting
        raise Exception("HTTP Error " + str(results.get('status', 'no status')) + " in query")
    else:
        results = results['entries']
    for result in results:
        # Renamings and modifications
        mod_query_result(result)
        if prune:
            prune_query_result(result)
    return results


def mod_query_result(result):
    # Useful to have for download automation
    result['pdf_url'] = None
    for link in result['links']:
        if 'title' in link and link['title'] == 'pdf':
            result['pdf_url'] = link['href']
    result['affiliation'] = result.pop('arxiv_affiliation', 'None')
    result['arxiv_url'] = result.pop('link')
    result['title'] = result['title'].rstrip('\n')
    result['summary'] = result['summary'].rstrip('\n')
    result['authors'] = [d['name'] for d in result['authors']]
    if 'arxiv_comment' in result:
        result['arxiv_comment'] = result['arxiv_comment'].rstrip('\n')
    else:
        result['arxiv_comment'] = None
    if 'arxiv_journal_ref' in result:
        result['journal_reference'] = result.pop('arxiv_journal_ref')
    else:
        result['journal_reference'] = None
    if 'arxiv_doi' in result:
        result['doi'] = result.pop('arxiv_doi')
    else:
        result['doi'] = None

def prune_query_result(result):
    prune_keys = ['updated_parsed',
                  'published_parsed',
                  'arxiv_primary_category',
                  'summary_detail',
                  'author',
                  'author_detail',
                  'links',
                  'guidislink',
                  'title_detail',
                  'tags',
                  'id']
    for key in prune_keys:
        try:
            del result['key']
        except KeyError:
            pass

def to_slug(title):
    # Remove special characters
    filename = ''.join(c if c.isalnum() else '_' for c in title)
    # delete duplicate underscores
    filename = '_'.join(list(filter(None, filename.split('_'))))
    return filename

def download_PDF(obj, dirname='./', prepend_id=False, slugify=False):
    # Downloads file in obj (can be result or unique page) if it has a .pdf link
    if 'pdf_url' in obj and 'title' in obj and obj['pdf_url'] and obj['title']:
        filename = obj['title']
        if slugify:
            filename = to_slug(filename)
        if prepend_id:
            filename = obj['arxiv_url'].split('/')[-1] + '-' + filename
        filename = dirname + filename + '.pdf'
        # Download
        urlretrieve(obj['pdf_url'], filename)
        return filename
    else:
        print("Object obj has no PDF URL, or has no title")


In [2]:
res=query()

In [3]:
print(res.id_list)

AttributeError: 'list' object has no attribute 'id_list'

In [4]:


print( res[0].title)

print( res[0].author)


print(res[0].author_detail)

print(res[0].updated)

print(len(res))

Reduction of the two-body dynamics to a one-body description in
  classical electrodynamics
Alessandra Buonanno
{'name': 'Alessandra Buonanno'}
2000-10-01T19:04:34Z
1000


In [5]:
download_PDF(res[1])

'./Reduction of the two-body dynamics to a one-body description in\n  classical electrodynamics.pdf'