# Notebook that illustrates how to harvest data. This code is partly an adaptation of the code written by Andrej Karpathy in his brilliant project https://github.com/karpathy/arxiv-sanity-preserver.git

In [3]:
import os
import time
import pickle
import shutil
from  urllib.request import urlopen
import urllib.request
import feedparser
import random

from utils import Config, get_id_version 

## The following piece of code is essentially the same as the one contained in the module arXivAPI.py. The code connects to the arXiv search engine through queries and downloads the article metadata. The data is then saved in a binary file.

In [8]:
# --- Harvesting settings -----------------------------------------------------

# Category of papers to be searched by the arXiv API
# (see http://arxiv.org/help/api/user-manual#detailed_examples).
# The example is "Cosmology and Nongalactic Astrophysics".
# If search_cat = None the data relative to all the categories is harvested.
search_cat = 'astro-ph.CO'

# 0 = most recent API result.
start_index = 0

# The harvesting loop stops, for each category, once this many papers
# have been added to the database.
max_index = 10

# Batch of results requested from the arXiv API at every iteration.
results_per_iteration = 100

# Seconds to wait between iterations to avoid being cut out by the arXiv API.
wait_time = 3.0

# Begin with an empty database; it is replaced just below whenever a
# previously saved one can be read from disk.
metadata_db = {}
try:
    with open(Config.metadata_db, 'rb') as db_file:
        metadata_db = pickle.load(db_file)
except Exception as err:
    # Best effort: any failure (missing file, unreadable pickle, ...) only
    # means that the harvest starts from scratch.
    print('error loading existing database: ', err)
    print('Starting from an empty one')

# Endpoint of the arXiv API; the query string is appended to it.
base_url = 'http://export.arxiv.org/api/query?'

# Decide which categories have to be harvested.
if search_cat is not None:
    # A single, explicitly requested category.
    arXiv_categories = [search_cat]
else:
    # No category requested: take the values stored in the pickled
    # 'arXiv_categories' file.
    with open('arXiv_categories', 'rb') as categories_file:
        arXiv_categories = pickle.load(categories_file).values()
        
    
# Harvest the article metadata category by category.
#
# For every category the arXiv API is queried page by page (results sorted by
# last-updated date). An entry is stored in metadata_db when its primary
# category matches the searched one and it is either unknown or a newer
# version of a stored article. Paging for a category stops when max_index
# papers have been added or when the API returns no more entries.
num_added_tot = 0
for cat in arXiv_categories:
    search_query = 'cat:' + cat

    num_cat_added_tot = 0
    # Start paging from the configured offset (previously hard-coded to 0,
    # which silently ignored start_index).
    i = start_index
    while num_cat_added_tot < max_index:
        query = 'search_query=%s&sortBy=lastUpdatedDate&start=%i&max_results=%i' % (
            search_query, i, results_per_iteration)

        with urllib.request.urlopen(base_url + query) as url:
            response = url.read()
        parse = feedparser.parse(response)
        if not parse.entries:
            print('There are no more search results for category %s .' % (cat))
            break

        num_cat_added = 0
        num_cat_old = 0

        for e in parse.entries:
            # NOTE(review): get_id_version is defined in utils; it is used
            # here as returning an (id, version) pair -- confirm there.
            idx, v = get_id_version(e['id'])
            e['raw_id'] = cat + '/' + idx
            e['version'] = v
            # Add the article to the database only if it is not there already
            # (taking the version into consideration) and if its primary
            # category is the same as the one being searched.
            is_primary = e['arxiv_primary_category']['term'] == cat
            if is_primary and (idx not in metadata_db
                               or v > metadata_db[idx]['version']):
                metadata_db[idx] = e
                num_cat_added += 1
                num_cat_added_tot += 1
            else:
                num_cat_old += 1
            # Stop as soon as the per-category limit is reached.
            if num_cat_added_tot >= max_index:
                break

        print('Added %i papers in category %s . Papers skipped %i' % (num_cat_added, cat, num_cat_old))
        i += results_per_iteration
        num_added_tot += num_cat_added

        # Waiting some seconds to avoid being cut out from arXiv
        time.sleep(wait_time + random.uniform(0, 0.1))


print('Total number of papers added %i.' % (num_added_tot))

# Rewrite the database file only when something new was harvested.
if num_added_tot > 0:
    with open(Config.metadata_db, 'wb') as file:
        pickle.dump(metadata_db, file)

error loading existing database:  [Errno 2] No such file or directory: 'data/metadata_db'
Starting from an empty one
Added 10 papers in category astro-ph.CO . Papers skipped 3
Total number of papers added 10.


## The following piece of code is essentially the same as the code contained in the modules download_articles.py and articles_to_txt.py.

In [5]:
# Load the metadata database whose entries drive the download.
with open(Config.metadata_db, 'rb') as file:
    metadata_db = pickle.load(file)

# Create the directory that temporarily stores the pdfs, if not present already.
# exist_ok=True replaces the separate existence check.
os.makedirs(Config.tmp, exist_ok=True)

# Create the directory that stores the converted txt files, if not present already.
os.makedirs(Config.txt_db, exist_ok=True)

timeout = 10  # waiting seconds before stopping the download
# List of papers that are already present. The listing uses Config.txt_db,
# i.e. the same directory the txt files are written to, instead of a
# hard-coded 'txt_db' path that can point somewhere else.
already_have = set(os.listdir(Config.txt_db))

# Download every article that has no txt version yet and convert it to text.
#
# For each entry of metadata_db the pdf is fetched from export.arxiv.org into
# Config.tmp, converted with the external `pdftotext` program into
# Config.txt_db and then deleted. Failures are reported and the loop moves on
# to the next article.
import subprocess  # local to this cell: runs pdftotext without going through a shell

num_to_add = 0
num_added = 0

for arXiv_id, metadata in metadata_db.items():
    pdf = arXiv_id + '.pdf'
    txt = arXiv_id + '.txt'
    # The link of the pdf is taken from the last element of the 'links' list.
    pdf_url = metadata['links'][-1]['href'] + '.pdf'
    # Make the link into the one specifically provided by arXiv for harvesting
    # purposes (skipped when the link already points to the export mirror).
    if 'export.arxiv.org' not in pdf_url:
        pdf_url = pdf_url.replace("arxiv.org", "export.arxiv.org")
    pdf_path = os.path.join(Config.tmp, pdf)
    txt_path = os.path.join(Config.txt_db, txt)

    if txt in already_have:
        print('%s already exists, skipping.' % (arXiv_id))
        continue

    num_to_add += 1
    try:
        # The context manager closes the HTTP response even if the copy fails.
        with urlopen(pdf_url, None, timeout) as req:
            print('Getting article %s' % (pdf_url))
            with open(pdf_path, 'wb') as file:
                shutil.copyfileobj(req, file)

        # Converting the pdf into txt needs pdftotext on the system to run.
        # An argument list (no shell) keeps ids coming from the remote feed
        # from being interpreted as shell syntax.
        try:
            status = subprocess.run(['pdftotext', pdf_path, txt_path]).returncode
        except FileNotFoundError:
            # pdftotext is not installed: report it as a failed conversion,
            # using the code a shell gives for "command not found".
            status = 127

        # Check that everything went well.
        if status == 0:
            num_added += 1
        else:
            print('It seems like there was an error in converting %s. Please try again later. Exit status %i.' % (pdf, status))
            # Remove the article in case the file was created.
            if os.path.isfile(txt_path):
                os.remove(txt_path)

    except Exception as e:
        print('An error incurred while downloading: %s .' % (pdf_url))
        print(e)
    finally:
        # Remove the pdf to save space, also after a failed or partial download.
        if os.path.isfile(pdf_path):
            os.remove(pdf_path)
print('Downloaded %i articles out of %i.' % (num_added, num_to_add))

Getting article http://export.arxiv.org/pdf/0806.0664v6.pdf
Getting article http://export.arxiv.org/pdf/astro-ph/9911331v2.pdf
Getting article http://export.arxiv.org/pdf/astro-ph/0311033v5.pdf
Getting article http://export.arxiv.org/pdf/astro-ph/0010594v3.pdf
Getting article http://export.arxiv.org/pdf/0802.2889v4.pdf
Getting article http://export.arxiv.org/pdf/astro-ph/0312617v2.pdf
Getting article http://export.arxiv.org/pdf/astro-ph/0008166v8.pdf
Getting article http://export.arxiv.org/pdf/0709.2329v2.pdf
Getting article http://export.arxiv.org/pdf/0812.3401v2.pdf
Getting article http://export.arxiv.org/pdf/astro-ph/0410297v2.pdf
2004.01139 already exists, skipping.
2004.01135 already exists, skipping.
1911.08512 already exists, skipping.
2004.00947 already exists, skipping.
2004.00864 already exists, skipping.
2004.00863 already exists, skipping.
1909.13832 already exists, skipping.
2004.00678 already exists, skipping.
2004.00672 already exists, skipping.
2004.00649 already exists