In [1]:
from IPython.core.interactiveshell import InteractiveShell
# Display the value of every expression statement in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"

# `urllib`

* [arxiv.py](https://github.com/lukasschwab/arxiv.py)

In [2]:
import json
import re
from io import BytesIO
from urllib.parse import urlencode
from urllib.request import urlretrieve

import feedparser
from requests.exceptions import HTTPError

In [None]:
# Base URL of the arXiv export API; `query` appends "query?<args>" to it.
root_url = 'http://export.arxiv.org/api/'


def query(search_query="",
          id_list=None,
          prune=True,
          start=0,
          max_results=10,
          sort_by="relevance",
          sort_order="descending"):
    """Query the arXiv API and return the matching entries.

    Parameters
    ----------
    search_query : str
        Sent as the ``search_query`` request argument.
    id_list : iterable of str, optional
        arXiv identifiers, sent comma-joined as ``id_list``. ``None`` (the
        default) or an empty sequence sends an empty value.
    prune : bool
        When true, each entry is also passed to ``prune_query_result``.
    start, max_results : int
        Sent as the ``start`` and ``max_results`` request arguments.
    sort_by, sort_order : str
        Sent as the ``sortBy`` and ``sortOrder`` request arguments.

    Returns
    -------
    list
        The feed entries that have a non-empty ``title``, each one modified
        in place by ``mod_query_result`` (and pruned when ``prune`` is true).

    Raises
    ------
    Exception
        If the parsed feed does not report HTTP status 200.
    """
    url_args = urlencode({
        "search_query": search_query,
        # `None` sentinel instead of a mutable default list.
        "id_list": ','.join(id_list or []),
        "start": start,
        "max_results": max_results,
        "sortBy": sort_by,
        "sortOrder": sort_order
    })
    # The parsed feed is dict-like: 'status' carries the HTTP status code and
    # 'entries' the list of result entries.
    feed = feedparser.parse(root_url + 'query?' + url_args)
    if feed.get('status') != 200:
        # TODO: better error reporting
        raise Exception("HTTP Error " +
                        str(feed.get('status', 'no status')) + " in query")
    # Entries without a title are dropped before post-processing.
    entries = [entry for entry in feed['entries'] if entry.get("title", None)]
    for entry in entries:
        mod_query_result(entry)
        if prune:
            prune_query_result(entry)
    return entries

In [None]:
def mod_query_result(result):
    """Reshape one feed entry in place into a flatter, friendlier record.

    Adds ``pdf_url`` (href of the link titled 'pdf', else ``None``), moves
    ``arxiv_affiliation`` to ``affiliation`` (the string 'None' when absent),
    moves ``link`` to ``arxiv_url``, strips trailing newlines from ``title``,
    ``summary`` and ``arxiv_comment``, replaces ``authors`` with a list of
    author names, and moves ``arxiv_journal_ref`` / ``arxiv_doi`` to
    ``journal_reference`` / ``doi`` (``None`` when absent).
    """
    # Handy to have as a top-level key for download automation.
    result['pdf_url'] = None
    for entry_link in result['links']:
        if 'title' not in entry_link or entry_link['title'] != 'pdf':
            continue
        result['pdf_url'] = entry_link['href']

    result['affiliation'] = result.pop('arxiv_affiliation', 'None')
    result['arxiv_url'] = result.pop('link')

    for text_field in ('title', 'summary'):
        result[text_field] = result[text_field].rstrip('\n')

    author_names = []
    for author in result['authors']:
        author_names.append(author['name'])
    result['authors'] = author_names

    has_comment = 'arxiv_comment' in result
    result['arxiv_comment'] = (
        result['arxiv_comment'].rstrip('\n') if has_comment else None)

    # Optional fields: renamed when present, None otherwise.
    result['journal_reference'] = result.pop('arxiv_journal_ref', None)
    result['doi'] = result.pop('arxiv_doi', None)

In [None]:
# Scratch check of `dict.pop` with a default value.
# NOTE(review): `results` is not assigned in any cell visible here — confirm where it comes from.
results[0].pop('arxiv_affiliation', 'None')

In [None]:
def prune_query_result(result):
    """Remove the keys listed in ``prune_keys`` from one entry, in place.

    Keys that are not present are ignored. Returns ``None``.
    """
    prune_keys = ('updated_parsed',
                  'published_parsed',
                  'arxiv_primary_category',
                  'summary_detail',
                  'author',
                  'author_detail',
                  'links',
                  'guidislink',
                  'title_detail',
                  'tags',
                  'id')
    for key in prune_keys:
        # Use the loop variable, not the literal string 'key'; a missing key
        # is simply skipped.
        result.pop(key, None)

In [None]:
def slugify(obj):
    """Build a file-name stem of the form ``<pdf id>.<title words>``.

    The title (``'UNTITLED'`` when the key is missing) is reduced to its word
    characters joined by underscores; the prefix is the last path segment of
    ``obj['pdf_url']``.
    """
    # Strip special characters from the title.
    title_words = re.findall(r'\w+', obj.get('title', 'UNTITLED'))
    # The last segment of the PDF URL acts as the object id.
    paper_id = obj.get('pdf_url').rsplit('/', 1)[-1]
    return "%s.%s" % (paper_id, '_'.join(title_words))

In [None]:
def download(obj, dirpath='./', slugify=slugify):
    """Download the PDF of one entry and return the path it was saved to.

    Parameters
    ----------
    obj : dict-like
        Entry with a ``pdf_url`` key; the slugifier may read further keys.
    dirpath : str
        Target directory. A trailing '/' is added when missing; an empty
        string means the current working directory.
    slugify : callable
        Maps ``obj`` to the file name without the '.pdf' extension.

    Returns
    -------
    str or None
        The path of the saved file, or ``None`` (after printing a message)
        when the entry has no PDF URL.
    """
    if not obj.get('pdf_url', ''):
        print("Object has no PDF URL.")
        return None
    # Guard the empty string: indexing dirpath[-1] would raise IndexError.
    if dirpath and not dirpath.endswith('/'):
        dirpath += '/'
    path = dirpath + slugify(obj) + '.pdf'
    urlretrieve(obj['pdf_url'], path)
    return path

In [None]:
# `query` returns a list of entries, while `download` expects a single entry,
# so take the first (only) match before downloading.
paper = query(id_list=["1707.08567"])[0]
download(paper)

# `requests`

In [5]:
import requests
from bs4 import BeautifulSoup

In [13]:
# Directory holding the sample JSON data. This is a machine-specific absolute
# path — adjust it before re-running the notebook elsewhere.
data_path = "/home/wm/Playground/IdealOps/batch_jobs/data/data_sample/"

In [17]:
# Read the sample records. The encoding is explicit so the non-ASCII keys load
# the same way regardless of the platform's default encoding.
with open(data_path + 'norm_trend.json', encoding='utf-8') as f:
    data = json.load(f)

In [43]:
import urllib.parse
 
# Percent-encode the search term before placing it in the query string;
# each leading space becomes %20.
word='   mama'
word=urllib.parse.quote(word)
url='https://baike.baidu.com/search/word?word=%s'%word
print(url)

https://baike.baidu.com/search/word?word=%20%20%20mama


In [None]:
# Maps a language name to the code sent as the `tl` request parameter.
langs = {'japanese': 'ja',
         'english': 'en'}


def get_sound_file_for_text(text, download=False, lang='japanese'):
    """Return the text-to-speech endpoint URL, or the fetched audio bytes.

    Parameters
    ----------
    text : str
        Text to send; every '*', '/' and 'x' character is removed first.
    download : bool
        When false (default) only the endpoint URL is returned and no
        request is made. When true the endpoint is requested with ``tl``
        (language code) and ``q`` (text) parameters.
    lang : str
        Key into ``langs``.

    Returns
    -------
    str or io.BytesIO
        The endpoint URL, or a binary buffer positioned at the start that
        holds the response body.

    Raises
    ------
    KeyError
        If ``lang`` is not a key of ``langs``.
    """
    glang = langs[lang]
    for unwanted in ('*', '/', 'x'):
        text = text.replace(unwanted, '')
    url = 'http://translate.google.com/translate_tts'
    if not download:
        return url
    result = requests.get(url, params={'tl': glang, 'q': text})
    # `result.content` is bytes, so the buffer must be binary, not text.
    return BytesIO(result.content)

In [39]:
# Hand-build "key=value" pairs joined by "&" from the first record
# (no percent-encoding is applied here).
d = data[0]
'&'.join([key + '=' + value for key, value in d.items()])

'系统名称=system_01&任务名称=task_01&实例名=case_01&开始时间=2019-02-25 09:03:00&完成时间=2019-02-25 11:44:53.307657&是否业务异常=0&系统异常=0'

In [44]:
# Show each of the first three records and encode it as a query string.
# `url_args` is overwritten on every pass, so only the last encoding remains.
for i in data[:3]:
    i 
    url_args = urlencode(i)
    
    

{'系统名称': 'system_01',
 '任务名称': 'task_01',
 '实例名': 'case_01',
 '开始时间': '2019-02-25 09:03:00',
 '完成时间': '2019-02-25 11:44:53.307657',
 '是否业务异常': '0',
 '系统异常': '0'}

{'系统名称': 'system_01',
 '任务名称': 'task_01',
 '实例名': 'case_02',
 '开始时间': '2019-02-25 11:44:53.307657',
 '完成时间': '2019-02-25 15:02:40.222676',
 '是否业务异常': '0',
 '系统异常': '0'}

{'系统名称': 'system_01',
 '任务名称': 'task_01',
 '实例名': 'case_03',
 '开始时间': '2019-02-25 15:02:40.222676',
 '完成时间': '2019-02-25 17:31:16.250314',
 '是否业务异常': '0',
 '系统异常': '0'}

In [47]:
import urllib.parse

# Percent-encode a path-plus-query string containing non-ASCII text, using UTF-8 bytes.
uriencoded = urllib.parse.quote('/s?wd=无人驾驶',encoding='UTF-8')

In [25]:
# Fetch a test endpoint and explore the response object.
r = requests.get('http://httpbin.org/get')
# List the public attributes and methods of the response.
print(*[i for i in dir(r) if not i.startswith('_')])
# Decode the response body as JSON.
r.json()
#r.text
#r.content
# Fetch the HTML landing page and parse it with BeautifulSoup's built-in parser.
cup = requests.get('http://httpbin.org')
soup = BeautifulSoup(cup.text, features='html.parser')

apparent_encoding close connection content cookies elapsed encoding headers history is_permanent_redirect is_redirect iter_content iter_lines json links next ok raise_for_status raw reason request status_code text url


{'args': {},
 'headers': {'Accept': '*/*',
  'Accept-Encoding': 'gzip, deflate',
  'Host': 'httpbin.org',
  'User-Agent': 'python-requests/2.22.0',
  'X-Amzn-Trace-Id': 'Root=1-5e53a6a9-cc6e1fa083b21660fc162020'},
 'origin': '222.129.35.237',
 'url': 'http://httpbin.org/get'}

In [None]:
# api-endpoint
URL = "http://maps.googleapis.com/maps/api/geocode/json"

# location given here
location = "delhi technological university"

# defining a params dict for the parameters to be sent to the API
PARAMS = {'address': location}

# sending get request and saving the response as response object
r = requests.get(url=URL, params=PARAMS)

# extracting data in json format
# NOTE: this rebinds `data`, which earlier cells use for the sample records.
data = r.json()

# The response may contain no matches; indexing results[0] unguarded would
# raise in that case, so report the raw response instead.
if data.get('results'):
    # extracting latitude, longitude and formatted address
    # of the first matching location
    first_match = data['results'][0]
    latitude = first_match['geometry']['location']['lat']
    longitude = first_match['geometry']['location']['lng']
    formatted_address = first_match['formatted_address']

    # printing the output
    print("Latitude:%s\nLongitude:%s\nFormatted Address:%s"
          % (latitude, longitude, formatted_address))
else:
    print("No geocoding result returned:", data)


# Third-party libraries

* [arxivpy](https://github.com/titipata/arxivpy)
* [sotawhat](https://github.com/chiphuyen/sotawhat)
* [arxiv-checker](https://github.com/adamdempsey90/arxiv-checker)
* [arxivscraper](https://github.com/Mahdisadjadi/arxivscraper)

In [None]:
import arxiv

In [None]:
# Look up one paper by its arXiv identifier using the third-party `arxiv` package.
paper = arxiv.query(id_list=["1707.08567"])

In [None]:
# Inspect the return value: is it a list, how many entries, and the first entry.
type(paper) == list
len(paper)
paper[0]

# Print every returned entry.
for i in paper:
    print(i)

Table: search_query field prefixes
* prefix	explanation
* ti	Title
* au	Author
* abs	Abstract
* co	Comment
* jr	Journal Reference
* cat	Subject Category
* rn	Report Number
* id	Id (use id_list instead)
* all	All of the above

Search for a paper, access the result, and download it. Tidy up the names of the downloaded files.

In [None]:
# Take the first entry of the query result and download its PDF.
paper = arxiv.query(id_list=["1707.08567"])[0]
arxiv.download(paper)
# You can skip the query step if you have the paper info!
# Here a hand-written dict with `pdf_url` and `title` is passed instead.
paper2 = {"pdf_url": "http://arxiv.org/pdf/1707.08567v1",
          "title": "The Paper Title"}
arxiv.download(paper2)

def custom_slugify(obj):
    """Return the last '/'-separated segment of the entry's ``id`` value."""
    entry_id = obj.get('id')
    return entry_id.rsplit('/', 1)[-1]

# Download with a specified slugifier function (custom_slugify replaces the default).
arxiv.download(paper, slugify=custom_slugify)

In [None]:
# Read individual fields of the entry; `.get` yields None for a missing key.
paper.get('title')
paper.get('authors')
paper.get('summary')

In [None]:
# Re-fetch the same entry (repeats the query used in an earlier cell).
paper = arxiv.query(id_list=["1707.08567"])[0]

## Retrieve the data of an author 

In [None]:
# Search using the `au` (author) field prefix and count the entries returned.
DH = arxiv.query(search_query="au: Dieter Horns")
len(DH)

In [None]:
# Affiliation value stored on the fourth returned entry.
DH[3]['affiliation']

In [None]:
 'a'  in DH[3]['affiliation']  # membership test against the affiliation value (presumably a string — confirm)

In [None]:
DH[0]['published'] # publication timestamp, formatted as yyyy-MM-ddTHH:mm:ssZ

In [None]:
# Number of authors listed on the first entry.
len(DH[0]['authors'])

In [None]:
# Print a marker when the first listed author of the first entry is exactly 'Dieter Horns'.
if DH[0]['authors'][0]=='Dieter Horns':
    print(2)

In [None]:
# Print the title and the author list of every entry in the result.
for DHevery in DH: 
    print(DHevery['title'])
    print(DHevery['authors'])

In [None]:
# Fetch a single paper by arXiv identifier and keep the first entry.
DHpaper = arxiv.query(id_list= ["1309.3846"])[0]

In [None]:
# Download from a hand-written record with only `pdf_url` and `title`
# (the same record as in an earlier cell).
paper2 = {"pdf_url": "http://arxiv.org/pdf/1707.08567v1",
          "title": "The Paper Title"}
arxiv.download(paper2)

## Workflow

In [None]:
# Author search restricted to a set of subject categories, with up to 50 results.
# The instrumentation category term is `cat:physics.ins-det`; the stray
# `id_list=` inside the category term was removed.
DH = arxiv.query(
    search_query=
    "au:D. AND au:Horns AND (cat:astro-ph OR cat:hep-ph OR cat:hep-ex OR cat:physics.ins-det OR cat:astro-ph.HE OR cat:astro-ph.IM OR cat:astro-ph.CO)",
    max_results=50)

In [None]:
# Download the PDF of every search hit, passing only its `pdf_url` and `title`.
for DHevery in DH:
    everyitem = {"pdf_url": DHevery["pdf_url"], "title": DHevery["title"]}
    arxiv.download(everyitem)

In [None]:
# Write a Markdown index of the entries whose first listed author is 'Dieter Horns'.
# The encoding is explicit so non-ASCII characters in titles or summaries are
# written the same way on every platform.
with open('DH_index.md', 'w', encoding='utf-8') as the_file:
    # the_file.write('## Dieter Horns as First author\n')
    for DHevery in DH:
        if DHevery['authors'][0] == 'Dieter Horns':
            # if 'University of Hamburg'  in DHevery['affiliation']:
            # if len(DHevery['authors']) < 4:
            the_file.write('* **Title:** ' + DHevery["title"] + '\n')
            the_file.write('\n')
            the_file.write('  **Published at:** ' + DHevery['published'] +
                           '\n')
            the_file.write('\n')
            the_file.write('  **pdf_url:** ' + DHevery['pdf_url'] + '\n')
            the_file.write('\n')
            the_file.write('  **Summary:** ' + DHevery["summary"] + '\n')
    # No explicit close(): the `with` block closes the file on exit.

In [None]:
# Shell command: move the index file and every PDF in the working directory into the target folder.
!mv DH_index.md *.pdf ~/Documents/GammaRay/DieterHorns/

You can then pull the summaries out of the retrieved data, read them to find the interesting papers, and download those papers via their `pdf_url`.

## How to determine the author is the only one

In [None]:
# Repeat the author search with up to 50 results, then look at the author list of one entry.
DH = arxiv.query(search_query="au:Dieter Horns",
                 max_results=50)  
DH[26]['authors']

In [None]:
# Combine two author terms with a subject-category filter.
DH = arxiv.query(search_query="au:D. AND au:Horns AND cat:astro-ph ", max_results= 50) 

`arxiv.query` works less satisfactorily than `arxiv.download`.

## Fetch the content within a time interval

In [None]:
# Publication timestamp of the first entry, used as the sample input for date parsing below.
DH[0]['published'] 

In [None]:
import dateutil.parser

In [None]:
# Parse a timestamp string and re-format it as month/day/year.
# NOTE: this rebinds `d`, which an earlier cell used for a data record.
d = dateutil.parser.parse('2008-09-26T01:51:42.000Z')
print(d.strftime('%m/%d/%Y'))  #==> '09/26/2008'

In [None]:
# Apply the same parse-and-format step to the first entry's publication timestamp.
dateutil.parser.parse(DH[0]['published']).strftime('%m/%d/%Y')

## Store the data using pandas

Compare with the [result](https://arxiv.org/search/?query=+Dieter+Horns&searchtype=author&abstracts=show&order=-announced_date_first&size=50&start=50) of a manual search on arXiv, which is well displayed and ordered by announcement date.