In [1]:
from io import StringIO
import urllib.request as libreq
from urllib.parse import urlencode

import arxiv
import feedparser
import pdfminer.high_level
import pdfminer.layout
from pdfminer.layout import LAParams

## Downloading Breakdown

In [2]:
"""
python_arXiv_parsing_example.py

This sample script illustrates a basic arXiv api call
followed by parsing of the results using the
feedparser python module.

Please see the documentation at
http://export.arxiv.org/api_help/docs/user-manual.html
for more information, or email the arXiv api
mailing list at arxiv-api@googlegroups.com.

urllib is included in the standard python library.
feedparser can be downloaded from http://feedparser.org/ .

Author: Julius B. Lucks

This is free software.  Feel free to do what you want
with it, but please play nice with the arXiv API!
"""

# Base api query url
base_url = 'http://export.arxiv.org/api/query?'

# Search parameters
search_query = 'cat:cs.LG'  # papers in the cs.LG category
start = 0                   # index of the first result to retrieve
max_results = 5             # number of results to retrieve

# urlencode percent-escapes the values, so queries containing spaces,
# quotes or '&' still produce a valid URL.
query = urlencode({'search_query': search_query,
                   'start': start,
                   'max_results': max_results})

# Results collected by this cell and read by the cells below.  They are
# reset here so that re-running the cell does not duplicate items.
corpusEntry = []  # full feedparser entry for every result
corpusPDF = []    # one {"pdf_url": ...} dict per pdf link found
corpusID = []     # arXiv id (the part of entry.id after '/abs/')

# Opensearch metadata such as totalResults, startIndex,
# and itemsPerPage live in the opensearch namespace.
# Some entry metadata lives in the arXiv namespace.
# This is a hack to expose both of these namespaces in
# feedparser v4.1.
# The hack touches a private feedparser class, so it is looked up
# defensively instead of failing the whole cell with AttributeError.
# NOTE(review): newer feedparser releases appear not to expose this class
# at package level; the prefixes declared by the feed itself are then
# presumably used - confirm against the installed feedparser version.
_parser_mixin = getattr(feedparser, '_FeedParserMixin', None)
if _parser_mixin is not None:
    _parser_mixin.namespaces['http://a9.com/-/spec/opensearch/1.1/'] = 'opensearch'
    _parser_mixin.namespaces['http://arxiv.org/schemas/atom'] = 'arxiv'

# perform a GET request using the base_url and query
with libreq.urlopen(base_url + query) as url:
    response = url.read()

# parse the response using feedparser
feed = feedparser.parse(response)

# print out feed information
print('Feed title: %s' % feed.feed.title)
print('Feed last updated: %s' % feed.feed.updated)

# print opensearch metadata
print('totalResults for this query: %s' % feed.feed.opensearch_totalresults)
print('itemsPerPage for this query: %s' % feed.feed.opensearch_itemsperpage)
print('startIndex for this query: %s' % feed.feed.opensearch_startindex)

# Run through each entry, store it, and print out information
for entry in feed.entries:
    corpusEntry.append(entry)

    # entry.id is the abs-page URL; everything after '/abs/' is the arXiv id
    arxiv_id = entry.id.split('/abs/')[-1]
    corpusID.append(arxiv_id)

    print('e-print metadata')
    print('arxiv-id: %s' % arxiv_id)
    print('Published: %s' % entry.published)
    print('Title:  %s' % entry.title)

    # entry.author carries a single author name only
    author_string = entry.author

    # grab the affiliation in <arxiv:affiliation> if present
    # - this will only grab the first affiliation encountered
    affiliation = entry.get('arxiv_affiliation')
    if affiliation is not None:
        author_string += ' (%s)' % affiliation

    print('Last Author:  %s' % author_string)

    # feedparser v5.0.1 correctly handles multiple authors, print them all
    try:
        print('Authors:  %s' % ', '.join(author.name for author in entry.authors))
    except AttributeError:
        pass

    # get the links to the abs page and pdf for this e-print
    for link in entry.links:
        if link.get('rel') == 'alternate':
            print('abs page link: %s' % link.href)
        elif link.get('title') == 'pdf':
            # .get() keeps links without a title from raising AttributeError
            corpusPDF.append({"pdf_url": link.href})
            print('pdf link: %s' % link.href)

    # The journal reference, comments and primary_category sections live under
    # the arxiv namespace
    journal_ref = entry.get('arxiv_journal_ref', 'No journal ref found')
    print('Journal reference: %s' % journal_ref)

    comment = entry.get('arxiv_comment', 'No comment found')
    print('Comments: %s' % comment)

    # The primary category is taken to be the first element in entry.tags.
    print('Primary Category: %s' % entry.tags[0]['term'])

    # Lets get all the categories
    all_categories = [t['term'] for t in entry.tags]
    print('All Categories: %s' % ', '.join(all_categories))

    # The abstract is in the <summary> element
    print('Abstract: %s' % entry.summary)

Feed title: ArXiv Query: search_query=cat:cs.LG&id_list=&start=0&max_results=5
Feed last updated: 2020-04-21T00:00:00-04:00
totalResults for this query: 53710
itemsPerPage for this query: 5
startIndex for this query: 0
e-print metadata
arxiv-id: cs/9905014v1
Published: 1999-05-21T14:26:07Z
Title:  Hierarchical Reinforcement Learning with the MAXQ Value Function
  Decomposition
Last Author:  Thomas G. Dietterich
Authors:  Thomas G. Dietterich
abs page link: http://arxiv.org/abs/cs/9905014v1
pdf link: http://arxiv.org/pdf/cs/9905014v1
Journal reference: No journal ref found
Comments: 63 pages, 15 figures
Primary Category: cs.LG
All Categories: cs.LG, I.2.6
Abstract: This paper presents the MAXQ approach to hierarchical reinforcement learning
based on decomposing the target Markov decision process (MDP) into a hierarchy
of smaller MDPs and decomposing the value function of the target MDP into an
additive combination of the value functions of the smaller MDPs. The paper
defines the MAXQ hi

In [3]:
corpusEntry  # Full feedparser entries collected by the query cell; each one exposes fields such as id, links, published and summary.

[{'arxiv_comment': '63 pages, 15 figures',
  'arxiv_primary_category': {'scheme': 'http://arxiv.org/schemas/atom',
   'term': 'cs.LG'},
  'author': 'Thomas G. Dietterich',
  'author_detail': {'name': 'Thomas G. Dietterich'},
  'authors': [{'name': 'Thomas G. Dietterich'}],
  'guidislink': True,
  'id': 'http://arxiv.org/abs/cs/9905014v1',
  'link': 'http://arxiv.org/abs/cs/9905014v1',
  'links': [{'href': 'http://arxiv.org/abs/cs/9905014v1',
    'rel': 'alternate',
    'type': 'text/html'},
   {'href': 'http://arxiv.org/pdf/cs/9905014v1',
    'rel': 'related',
    'title': 'pdf',
    'type': 'application/pdf'}],
  'published': '1999-05-21T14:26:07Z',
  'published_parsed': time.struct_time(tm_year=1999, tm_mon=5, tm_mday=21, tm_hour=14, tm_min=26, tm_sec=7, tm_wday=4, tm_yday=141, tm_isdst=0),
  'summary': 'This paper presents the MAXQ approach to hierarchical reinforcement learning\nbased on decomposing the target Markov decision process (MDP) into a hierarchy\nof smaller MDPs and deco

In [4]:
corpusPDF  # One {'pdf_url': ...} dict per pdf link found by the query cell; an item of this list is what gets passed to arxiv.download.

[{'pdf_url': 'http://arxiv.org/pdf/cs/9905014v1'},
 {'pdf_url': 'http://arxiv.org/pdf/cs/9905015v1'},
 {'pdf_url': 'http://arxiv.org/pdf/cs/0001004v1'},
 {'pdf_url': 'http://arxiv.org/pdf/cs/0002006v1'},
 {'pdf_url': 'http://arxiv.org/pdf/cs/0009001v3'}]

In [6]:
# Download one of the PDFs found by the query cell.
# arxiv.download would normally derive the file name itself; passing our own
# slugify callable names the file after the last path segment of the entry id
# instead (e.g. '0001004v1').
paper_index = 2  # position of the paper in corpusPDF / corpusEntry
library_dir = r'C:\Users\Al\Documents\ByteSizeArxiv\library'
arxiv.download(
    corpusPDF[paper_index],
    library_dir,
    slugify=lambda _paper: corpusEntry[paper_index].get('id').split('/')[-1],
)

'C:\\Users\\Al\\Documents\\ByteSizeArxiv\\library/0001004v1.pdf'

##  Read PDF

In [7]:
# Pull the plain text out of the PDF downloaded above.
text = pdfminer.high_level.extract_text(
    'C:\\Users\\Al\\Documents\\ByteSizeArxiv\\library/0001004v1.pdf',
    codec='utf-8',
    laparams=None,
)

In [8]:
# Show the extracted text as a raw string (line breaks appear as \n).
text

'0\n0\n0\n2\n \nn\na\nJ\n \n7\n \n \n]\n\nG\nL\n.\ns\nc\n[\n \n \n1\nv\n4\n0\n0\n1\n0\n0\n0\n/\ns\nc\n:\nv\ni\nX\nr\na\n\nMultiplicative Algorithm for Orthgonal Groups\nand Independent Component Analysis\n\nToshinao Akuzawa∗\n\nBrain Science Institute\nRIKEN\n2-1 Hirosawa, Wako, Saitama 351-0198, Japan\n\nOctober 26, 2018\n\nAbstract\n\nThe multiplicative Newton-like method developed by the author et al. is extended to the situa-\ntion where the dynamics is restricted to the orthogonal group. A general framework is constructed\nwithout specifying the cost function. Though the restriction to the orthogonal groups makes the\nproblem somewhat complicated, an explicit expression for the amount of individual jumps is ob-\ntained. This algorithm is exactly second-order-convergent. The global instability inherent in the\nNewton method is remedied by a Levenberg-Marquardt-type variation. The method thus con-\nstructed can readily be applied to the independent component analysis. Its remarkable