In [1]:
import datetime
import re
import sys
import time
import urllib
import urllib.error
import urllib.parse
import urllib.request
import xml.etree.ElementTree as ET
from collections import Counter, defaultdict

import numpy as np
import pandas as pd

In [2]:
# XML namespace prefixes in ElementTree's "{uri}tag" notation; prepend them to
# a tag name when calling find()/findall() on the parsed OAI response.
OAI = '{http://www.openarchives.org/OAI/2.0/}'
ARXIV = '{http://arxiv.org/OAI/arXiv/}'

In [3]:
def harvest(arxiv="cs", startdate="2000-01-01", enddate="2001-12-31"):
    """
    Harvest metadata for a specific category on arXiv through its OAI interface.

    Issues ``ListRecords`` requests, following resumption tokens until the
    server stops handing one out. An HTTP 503 answer is treated as "try again
    later": the function sleeps for the advertised ``Retry-After`` interval
    (30 seconds if absent or not an integer) and repeats the same request.

    Args:
        arxiv (str, optional): set (category) to harvest, e.g. "cs" or
            "physics:hep-ex".
        startdate (str, optional): sent as the ``from`` request argument
            (YYYY-MM-DD).
        enddate (str, optional): sent as the ``until`` request argument
            (YYYY-MM-DD).
            NOTE(review): the window appears to select on the record
            datestamp rather than on ``created`` -- harvested rows can carry
            much older ``created`` dates; confirm against the OAI-PMH docs.

    Returns:
        list of dict: one dict per harvested record with the keys ``title``,
        ``arxivid`` (slashes removed), ``abstract``, ``created``
        (``datetime.datetime``), ``categories`` (list of str), ``doi``
        (first DOI or None) and ``authors`` (list of
        ``{'keyname': ..., 'forenames': ...}`` dicts). Empty when the server
        reports ``noRecordsMatch``.

    Raises:
        urllib.error.HTTPError: for any HTTP error other than 503.
        RuntimeError: if a response contains no ``ListRecords`` element for a
            reason other than ``noRecordsMatch``.
    """
    papers = []
    base_url = "http://export.arxiv.org/oai2?verb=ListRecords&"
    url = (base_url +
           "from=%s&until=%s&" % (startdate, enddate) +
           "metadataPrefix=arXiv&set=%s" % arxiv)

    while True:
        print("fetching", url)
        try:
            # Context manager closes the connection even if read() fails.
            with urllib.request.urlopen(url) as response:
                xml = response.read()
        except urllib.error.HTTPError as err:
            if err.code != 503:
                raise
            try:
                to = int(err.headers.get("retry-after", 30))
            except (TypeError, ValueError):
                # Retry-After may also be an HTTP date; fall back to the default.
                to = 30
            print("Got 503. Retrying after {0:d} seconds.".format(to))
            time.sleep(to)
            continue

        root = ET.fromstring(xml)
        listing = root.find(OAI + "ListRecords")
        if listing is None:
            # No ListRecords element: the server answered with an <error>.
            error = root.find(OAI + "error")
            if error is not None and error.get("code") == "noRecordsMatch":
                break
            if error is not None:
                detail = "%s: %s" % (error.get("code"), (error.text or "").strip())
            else:
                detail = "response contained no ListRecords element"
            raise RuntimeError("OAI request failed (%s)" % detail)

        for record in listing.findall(OAI + "record"):
            contents = _parse_record(record)
            if contents is not None:
                papers.append(contents)

        # The list of articles returned by the API comes in chunks of
        # 1000 articles. The presence of a resumptionToken tells us that
        # there is more to be fetched.
        token = listing.find(OAI + "resumptionToken")
        if token is None or not (token.text or "").strip():
            break
        # Tokens are opaque and may contain reserved characters such as "|",
        # so they must be percent-encoded before going into the query string.
        url = base_url + "resumptionToken=%s" % urllib.parse.quote(
            token.text.strip(), safe="")

    return papers


def _parse_record(record):
    """Convert one OAI ``<record>`` element into a paper dict.

    Returns None when the record carries no arXiv metadata (for example a
    header-only record), so the caller can skip it.
    """
    meta = record.find(OAI + "metadata")
    info = meta.find(ARXIV + "arXiv") if meta is not None else None
    if info is None:
        return None

    created = datetime.datetime.strptime(
        info.find(ARXIV + "created").text, "%Y-%m-%d")

    # forenames is optional; findtext() yields '' when it is missing or empty.
    authors = [
        {'keyname': author.findtext(ARXIV + "keyname"),
         'forenames': author.findtext(ARXIV + "forenames", default='')}
        for author in info.findall(ARXIV + "authors/" + ARXIV + "author")
    ]

    # if there is more than one DOI use the first one
    # often the second one (if it exists at all) refers
    # to an erratum or similar
    doi_candidates = (info.findtext(ARXIV + "doi") or "").split()
    doi = doi_candidates[0] if doi_candidates else None

    return {'title': info.findtext(ARXIV + "title"),
            'arxivid': info.find(ARXIV + "id").text.replace('/', ''),
            'abstract': (info.findtext(ARXIV + "abstract") or "").strip(),
            'created': created,
            'categories': (info.findtext(ARXIV + "categories") or "").split(),
            'doi': doi,
            'authors': authors,
            }


In [4]:
# Harvest the "cs" set for 2014-10-01 through 2014-12-01 and show how many
# records came back; `r` is a plain list with one dict per paper.
r = harvest(arxiv="cs", startdate="2014-10-01", enddate="2014-12-01")
len(r)

fetching http://export.arxiv.org/oai2?verb=ListRecords&from=2014-10-01&until=2014-12-01&metadataPrefix=arXiv&set=cs
fetching http://export.arxiv.org/oai2?verb=ListRecords&resumptionToken=4417969|1001
fetching http://export.arxiv.org/oai2?verb=ListRecords&resumptionToken=4417969|2001


2464

In [7]:
# Check what kind of object a single harvested record is.
type(r[0])

dict

In [9]:
# Tabulate the harvested records: each dict in `r` becomes a row and each
# dict key a column. (pandas is imported in the notebook's top import cell.)
df = pd.DataFrame(r)

In [12]:
# Preview the first 30 rows of the harvested table.
df.head(30)

Unnamed: 0,title,arxivid,abstract,created,categories,doi,authors
0,Solution of moving-boundary problems by the sp...,709.0355,This paper describes a novel numerical model a...,2007-09-04,"[cs.CE, cs.NA]",10.1016/j.apnum.2007.04.009,"[{'keyname': 'Bodard', 'forenames': 'Nicolas'}..."
1,The REESSE2+ Public-key Encryption Scheme,801.4817,This paper gives the definitions of an anomalo...,2008-01-30,"[cs.CR, cs.CC]",,"[{'keyname': 'Su', 'forenames': 'Shenghui'}, {..."
2,Projective Expected Utility,802.33,Motivated by several classic decision-theoreti...,2008-02-22,"[quant-ph, cs.GT]",10.1016/j.jmp.2009.02.001,"[{'keyname': 'La Mura', 'forenames': 'Pierfran..."
3,On the Kuratowski graph planarity criterion,802.382,This paper is purely expositional. The stateme...,2008-02-26,"[math.GT, cs.DM, math.CO]",,"[{'keyname': 'Skopenkov', 'forenames': 'A.'}]"
4,From Moments to Functions in Quantum Chromodyn...,902.4095,"Single-scale quantities, like the QCD anomalou...",2009-02-24,"[hep-ph, cs.SC, math-ph, math.AG, math.CO, mat...",,"[{'keyname': 'Blümlein', 'forenames': 'J.'}, {..."
5,Notes on solving and playing peg solitaire on ...,903.3696,We consider the one-person game of peg solitai...,2009-03-23,"[math.CO, cs.DM, math.HO]",,"[{'keyname': 'Bell', 'forenames': 'George I.'}]"
6,Kripke Models for Classical Logic,904.0071,We introduce a notion of Kripke model for clas...,2009-04-01,"[math.LO, cs.LO]",10.1016/j.apal.2010.04.007,"[{'keyname': 'Ilik', 'forenames': 'Danko'}, {'..."
7,Codes over Quaternion Integers with Respect to...,905.416,I want to withdraw this paper.,2009-05-26,"[cs.IT, math.IT]",,"[{'keyname': 'Guzeltepe', 'forenames': 'Murat'..."
8,A criterion for hypothesis testing for station...,905.4937,"Given a finite-valued sample $X_1,...,X_n$ we ...",2009-05-29,"[math.ST, cs.IT, math.IT, math.PR, stat.TH]",,"[{'keyname': 'Ryabko', 'forenames': 'Daniil'}]"
9,Instruction sequence notations with probabilis...,906.3083,This paper concerns instruction sequences that...,2009-06-17,[cs.PL],,"[{'keyname': 'Bergstra', 'forenames': 'J. A.'}..."
