In [None]:
from IPython.core.interactiveshell import InteractiveShell
# Display the value of every top-level expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"

# Overview of data retrieved from arXiv

`arxiv.py` interacts with the arXiv API.

## Library

* [arxiv.py](https://github.com/lukasschwab/arxiv.py)
* [arxivpy](https://github.com/titipata/arxivpy)
* [sotawhat](https://github.com/chiphuyen/sotawhat)
* [arxiv-checker](https://github.com/adamdempsey90/arxiv-checker)
* [arxivscraper](https://github.com/Mahdisadjadi/arxivscraper)

In [1]:
import arxiv

Table: `search_query` field prefixes

| prefix | explanation |
| --- | --- |
| ti | Title |
| au | Author |
| abs | Abstract |
| co | Comment |
| jr | Journal Reference |
| cat | Subject Category |
| rn | Report Number |
| id | Id (use `id_list` instead) |
| all | All of the above |

Search for papers, access the results, and download them, tidying up the file names along the way.

In [None]:
def query(search_query="",
          id_list=None,
          prune=True,
          start=0,
          max_results=10,
          sort_by="relevance",
          sort_order="descending"):
    """Query the arXiv API and return the parsed feed entries.

    Relies on ``urlencode``, ``feedparser``, ``root_url``,
    ``mod_query_result`` and ``prune_query_result`` being available in the
    enclosing module; none of them is defined in this cell.

    Args:
        search_query: Value sent as the ``search_query`` URL parameter.
        id_list: Iterable of arXiv id strings, sent comma-joined as the
            ``id_list`` URL parameter. ``None`` (the default) or an empty
            list sends an empty value. A single id passed as a bare string
            is treated as a one-element list.
        prune: When true, ``prune_query_result`` is applied to every entry.
        start: Value sent as the ``start`` URL parameter.
        max_results: Value sent as the ``max_results`` URL parameter.
        sort_by: Value sent as the ``sortBy`` URL parameter.
        sort_order: Value sent as the ``sortOrder`` URL parameter.

    Returns:
        The list of feed entries that have a non-empty ``title``, after
        ``mod_query_result`` (and, if ``prune`` is true,
        ``prune_query_result``) has been applied to each of them.

    Raises:
        Exception: If the parsed feed does not report HTTP status 200.
    """
    # Avoid a shared mutable default; None means "no explicit ids".
    if id_list is None:
        id_list = []
    elif isinstance(id_list, str):
        # ','.join on a bare string would split it into single characters.
        id_list = [id_list]

    url_args = urlencode({"search_query": search_query,
                          "id_list": ','.join(id_list),
                          "start": start,
                          "max_results": max_results,
                          "sortBy": sort_by,
                          "sortOrder": sort_order})
    results = feedparser.parse(root_url + 'query?' + url_args)
    if results.get('status') != 200:
        # TODO: better error reporting
        raise Exception("HTTP Error " + str(results.get('status', 'no status')) + " in query")

    # Entries without a title are dropped before any post-processing.
    entries = [r for r in results['entries'] if r.get("title", None)]
    for result in entries:
        # Renamings and modifications
        mod_query_result(result)
        if prune:
            prune_query_result(result)
    return entries

In [None]:
def download(obj, dirpath='./', slugify=slugify):
    """Download the PDF referenced by ``obj`` into ``dirpath``.

    Relies on ``urlretrieve`` and the default ``slugify`` being available in
    the enclosing module; neither is defined in this cell.

    Args:
        obj: Mapping with a ``'pdf_url'`` entry; it is also passed to
            ``slugify`` to build the file name.
        dirpath: Target directory. A trailing ``'/'`` is appended when
            missing; an empty string means the current directory.
        slugify: Callable that takes ``obj`` and returns the file name
            without the ``.pdf`` extension.

    Returns:
        The path the PDF was written to, or ``None`` (after printing a
        message) when ``obj`` has no PDF URL.
    """
    if not obj.get('pdf_url', ''):
        print("Object has no PDF URL.")
        return
    if not dirpath:
        # dirpath[-1] on '' would raise IndexError; treat '' as the cwd.
        dirpath = './'
    elif not dirpath.endswith('/'):
        dirpath += '/'
    path = dirpath + slugify(obj) + '.pdf'
    urlretrieve(obj['pdf_url'], path)
    return path

In [None]:
# Query for a paper of interest by its arXiv id, keep the first returned
# entry, then download it.
paper = arxiv.query(id_list=["1707.08567"])[0]
arxiv.download(paper)
# You can skip the query step if you already have the paper info: build a
# dict with "pdf_url" and "title" by hand and pass it straight to download.
paper2 = {"pdf_url": "http://arxiv.org/pdf/1707.08567v1",
          "title": "The Paper Title"}
arxiv.download(paper2)

def custom_slugify(obj):
    """Return the part of the object's 'id' value after its last '/'."""
    identifier = obj.get('id')
    # rpartition yields the whole string as the tail when no '/' is present.
    _, _, tail = identifier.rpartition('/')
    return tail

# Download again, this time naming the file with the custom slugifier
# passed through the `slugify` keyword.
arxiv.download(paper, slugify=custom_slugify)

In [None]:
# Inspect a few fields of the query result; .get returns None for a missing key.
paper.get('title')
paper.get('authors')
paper.get('summary')

In [None]:
# Re-run the id-based query and keep the first returned entry.
paper = arxiv.query(id_list=["1707.08567"])[0]

# Retrieve the data of an author 

In [None]:
# Search using the author prefix (`au`) and show the first returned entry.
DH = arxiv.query(search_query="au: Dieter Horns")
DH[0]

In [None]:
# Affiliation field of the fourth result (index 3).
DH[3]['affiliation']

In [None]:
 # Membership test against the affiliation field (a substring test if the
 # field is a string - TODO confirm its type).
 'a'  in DH[3]['affiliation']

In [None]:
# Publication timestamp of the first result.
DH[0]['published'] # date time format yyyy-MM-ddTHH:mm:ssZ

In [None]:
# Number of authors listed on the first result.
len(DH[0]['authors'])

In [None]:
# Check whether the first listed author of the first result is exactly
# 'Dieter Horns'; prints 2 as a marker when it is.
if DH[0]['authors'][0]=='Dieter Horns':
    print(2)

In [None]:
# Print each result's title followed by its author list, one per line.
for DHevery in DH:
    print(DHevery['title'], DHevery['authors'], sep='\n')
    # print(DHevery['summary'])

In [None]:
# Fetch a single paper by arXiv id and keep the first returned entry.
DHpaper = arxiv.query(id_list= ["1309.3846"])[0]

In [None]:
# Download from a hand-built record: only "pdf_url" and "title" are supplied,
# so no query is needed.
paper2 = {"pdf_url": "http://arxiv.org/pdf/1707.08567v1",
          "title": "The Paper Title"}
arxiv.download(paper2)

## Workflow

In [7]:
# Restrict the author search to the listed subject categories.
# The original query contained the malformed term `cat:id_list=physics.ins-det`;
# the category filter is written as `cat:physics.ins-det`.
DH = arxiv.query(
    search_query=(
        "au:D. AND au:Horns AND "
        "(cat:astro-ph OR cat:hep-ph OR cat:hep-ex OR cat:physics.ins-det "
        "OR cat:astro-ph.HE OR cat:astro-ph.IM OR cat:astro-ph.CO)"
    ),
    max_results=50,
)

In [None]:
# Download every result, passing only the two fields copied below.
for DHevery in DH:
    everyitem = {field: DHevery[field] for field in ("pdf_url", "title")}
    arxiv.download(everyitem)

In [9]:
# Write a markdown index (title, publication date, PDF link, summary) of all
# results. open(..., 'w') creates or truncates the file itself, so no prior
# `touch` is needed, and the `with` block closes the file on exit.
# UTF-8 is requested explicitly because the platform default encoding may not
# be able to represent every character in the metadata.
with open('DH_index.md', 'w', encoding='utf-8') as the_file:
    # the_file.write('## Dieter Horns as First author\n')
    for DHevery in DH:
        # Optional filters tried earlier; to use one, put the write below
        # under it:
        # if DHevery['authors'][0]=='Dieter Horns':
        # if 'University of Hamburg'  in DHevery['affiliation']:
        # if len(DHevery['authors']) < 4:
        the_file.write(
            '* **Title:** {title}\n'
            '\n'
            '  **Published at:** {published}\n'
            '\n'
            '  **pdf_url:** {pdf_url}\n'
            '\n'
            '  **Summary:** {summary}\n'.format(
                title=DHevery["title"],
                published=DHevery['published'],
                pdf_url=DHevery['pdf_url'],
                summary=DHevery["summary"],
            )
        )

In [10]:
# Move the generated index and the PDFs in the working directory into the
# archive folder. The recorded output below shows mv failing on the literal
# name `*.pdf`, i.e. the glob matched nothing - presumably because no PDF had
# been downloaded into the working directory at that point.
!mv DH_index.md *.pdf ~/Documents/GammaRay/DieterHorns/

mv: rename *.pdf to /Users/wangmiao/Documents/GammaRay/DieterHorns/*.pdf: No such file or directory


You can then pull the summaries out of the retrieved data, read them to find the interesting papers, and download those papers via their `pdf_url`.

## How to make sure the results belong to a single author

In [None]:
# Author's observation: this query only picks out names containing "Dieter".
# Entry 26 is inspected as an example; indexing it needs at least 27 results.
DH = arxiv.query(search_query="au:Dieter Horns", max_results= 50) #Only pick out names containing Dieter
DH[26]['authors']

In [None]:
# Alternative: match the initial and the surname as separate author terms,
# restricted to the astro-ph category.
DH = arxiv.query(search_query="au:D. AND au:Horns AND cat:astro-ph ", max_results= 50) 

`arxiv.query` works less satisfactorily than `arxiv.download`.

## Fetch the content between a time interval

In [None]:
# Publication timestamp of the first result; this string is what gets parsed
# in the cells below.
DH[0]['published'] # date time format yyyy-MM-ddTHH:mm:ssZ

In [None]:
import dateutil.parser

# Parse a timestamp string with dateutil, then reformat it as month/day/year.
d = dateutil.parser.parse('2008-09-26T01:51:42.000Z')
print(d.strftime('%m/%d/%Y'))  #==> '09/26/2008'

In [None]:
# Same conversion applied to the first result's publication timestamp.
dateutil.parser.parse(DH[0]['published']).strftime('%m/%d/%Y')

In [None]:
import datetime

In [None]:
# Standard-library alternative: strptime with an explicit format string.
# date_format matches timestamps with fractional seconds and a literal 'Z'.
date_format = "%Y-%m-%dT%H:%M:%S.%fZ" 
datetime.datetime.strptime('2008-09-26T01:51:42.000Z', date_format)
# A second example with a different layout (month name, 12-hour clock).
datetime_object = datetime.datetime.strptime('Jun 1 2005  1:33PM', '%b %d %Y %I:%M%p')
datetime_object

## Store the data using pandas

Compare with the [result](https://arxiv.org/search/?query=+Dieter+Horns&searchtype=author&abstracts=show&order=-announced_date_first&size=50&start=50) of a manual search on arXiv: the result is well displayed and ordered by announcement date.

## With built-in library

Below is a Python code snippet that performs the bare minimum: calling the API and printing the raw Atom result.

In [None]:
import urllib.request
# Call the arXiv API directly and show the raw Atom response bytes.
url = 'http://export.arxiv.org/api/query?search_query=all:electron&start=0&max_results=1'
# The context manager closes the HTTP response once it has been read,
# instead of leaving the connection open until garbage collection.
with urllib.request.urlopen(url) as response:
    data = response.read()
data