In [9]:
import requests
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor
import os
import shutil
import time
from tqdm import tqdm
import pickle

In [10]:
# Root folder for downloaded PDFs; files are stored under
# DATA_DIR/<archive name>/<year>/.
DATA_DIR = '../Data/Papers'

# Number of entries requested per listing page (the `show=` query parameter).
SHOW = 500
# Upper bound used to derive how many listing pages are requested per year.
MAX_PAPERS = 10000

# Human-readable arXiv archive name -> archive code used in listing URLs.
CODES = {
    'Astrophysics': 'astro-ph',
    'Condensed Matter': 'cond-mat',
    'General Relativity and Quantum Cosmology': 'gr-qc',
    'High Energy Physics - Experiment': 'hep-ex',
    'High Energy Physics - Lattice': 'hep-lat',
    'High Energy Physics - Phenomenology': 'hep-ph',
    'High Energy Physics - Theory': 'hep-th',
    'Mathematical Physics': 'math-ph',
    'Nonlinear Sciences': 'nlin',
    'Nuclear Experiment': 'nucl-ex',
    'Nuclear Theory': 'nucl-th',
    'Physics': 'physics',
    'Quantum Physics': 'quant-ph',
    'Mathematics': 'math',
    'Computer Science': 'cs',
    'Quantitative Biology': 'q-bio',
    'Quantitative Finance': 'q-fin',
    'Statistics': 'stat',
    'Electrical Engineering and Systems Science': 'eess',
    'Economics': 'econ',
}

# Inverse lookup: archive code -> human-readable archive name.
REVERSE_CODES = dict(zip(CODES.values(), CODES.keys()))


def get_urls(archive, year, show, max_papers):
    """Collect PDF URLs from the listing pages of one archive/year.

    Pages start at the index reported by ``get_current_idx`` and stop before
    page ``max_papers // show``. Each listing page is fetched through
    ``get_pdf_urls`` and followed by a 3 second pause.

    Returns:
        A flat list with the PDF URLs of all visited pages, in page order.
    """
    first_page = get_current_idx(archive, year, show)
    last_page = max_papers // show

    listing_urls = []
    for page in range(first_page, last_page):
        listing_urls.append(create_url(archive, year, page, show))

    collected = []
    for listing_url in tqdm(listing_urls):
        collected += get_pdf_urls(listing_url)
        # Pause between listing requests.
        time.sleep(3)
    return collected


def get_pdf_urls(url, timeout=30):
    """Return the PDF URLs linked from one arXiv listing page.

    Args:
        url: URL of the listing page to fetch.
        timeout: Seconds passed to ``requests.get`` so that a stalled
            connection raises instead of blocking forever.

    Returns:
        A list of ``https://export.arxiv.org<href>.pdf`` strings, one per
        ``<a title="Download PDF">`` anchor found on the page.

    Raises:
        requests.exceptions.RequestException: on network failure, timeout, or
            a 4xx/5xx response (which would otherwise be parsed as an empty
            listing and silently drop that page's papers).
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on error pages instead of returning an empty list.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')
    hrefs = [anchor['href'] for anchor in soup.find_all('a', title='Download PDF')]
    return ["https://export.arxiv.org" + href + ".pdf" for href in hrefs]

def create_url(archive,year,idx,show):
    """Build the export.arxiv.org listing URL for one page of an archive/year.

    ``archive`` is a key of ``CODES``; ``idx`` is the zero-based page index,
    converted into the ``skip`` offset as ``show * idx``.
    """
    code = CODES[archive]
    skip = show * idx
    return f'https://export.arxiv.org/list?archive={code}&year={year}&month=all&submit=Go&show={show}&skip={skip}'

def get_num_papers(archive,year):
    """Count the directory entries under ``DATA_DIR/<archive>/<year>``."""
    folder = f'{DATA_DIR}/{archive}/{year}'
    entries = os.listdir(folder)
    return len(entries)

def get_current_idx(archive,year,show=500):
    """Return the listing-page index at which downloading should resume.

    The number of files already stored for ``archive``/``year`` is divided by
    ``show`` and rounded up, so a partially downloaded page counts as done and
    the next page is the resume point.

    The previous implementation tested ``idx % show`` *after* ``idx`` had been
    overwritten with the quotient, so the remainder of the paper count was
    never examined: e.g. exactly 1000 files with ``show=500`` yielded 3 instead
    of 2 (skipping a whole page), while 400 files yielded 0.

    Args:
        archive: Archive name, forwarded to ``get_num_papers``.
        year: Year, forwarded to ``get_num_papers``.
        show: Number of entries per listing page.

    Returns:
        ``ceil(number_of_saved_files / show)`` as an int.
    """
    num_papers = get_num_papers(archive, year)
    pages_done, leftover = divmod(num_papers, show)
    if leftover:
        # A partly filled page is treated as finished.
        pages_done += 1
    return pages_done

def save_paper(url,archive,year,timeout=30):
    """Download one PDF and store it under ``DATA_DIR/<archive>/<year>/``.

    The file name is the last path segment of ``url``.

    Args:
        url: Address of the PDF to download.
        archive: Archive name used as the first sub-directory.
        year: Year used as the second sub-directory.
        timeout: Seconds passed to ``requests.get`` so that a stalled
            connection raises instead of blocking forever.

    Raises:
        requests.exceptions.RequestException: on network failure, timeout, or
            a 4xx/5xx response. Nothing is written in that case, so error
            pages are never stored on disk as ``.pdf`` files.
        OSError: if the destination file cannot be opened for writing.
    """
    response = requests.get(url, timeout=timeout)
    # Check the status before touching the disk: an error body is not a PDF.
    response.raise_for_status()
    file_name = f'{DATA_DIR}/{archive}/{year}/{url.split("/")[-1]}'
    with open(file_name, 'wb') as f:
        f.write(response.content)
        
        

def collection_function(archive,year,show,max_papers):
    """Collect the PDF URLs for one archive/year and download every paper.

    Steps:
      1. Make sure ``DATA_DIR/<archive>/<year>`` exists (a fresh run would
         otherwise fail when the directory is listed or written to).
      2. Gather the PDF URLs with ``get_urls`` and pickle them to
         ``<year>urls.pkl`` in the current working directory.
      3. Download each URL with ``save_paper``, pausing 1 second between
         requests. A failed download is reported and skipped.

    Args:
        archive: Archive name (a key of ``CODES``).
        year: Year passed through to the listing URL and the output path.
        show: Number of entries per listing page.
        max_papers: Upper bound used to derive the number of listing pages.
    """
    os.makedirs(f'{DATA_DIR}/{archive}/{year}', exist_ok=True)

    print('Getting URLs')
    urls = get_urls(archive,year,show, max_papers)
    # Context manager so the pickle file is flushed and closed deterministically.
    with open(f'{year}urls.pkl','wb') as f:
        pickle.dump(urls,f)
    #urls = pickle.load(open(f'{year}urls.pkl','rb'))
    print('Saving Papers')
    for url in tqdm(urls):
        # Route through the export mirror, but only when the URL is not
        # already pointing at it; unconditionally inserting 'export.' turned
        # 'https://export.arxiv.org/...' into 'https://export.export.arxiv.org/...'.
        if not url.startswith('https://export.'):
            url = url[:8] + 'export.'+ url[8:]
        try:
            save_paper(url,archive,year)
        except Exception as exc:
            # `Exception` (not a bare except) so KeyboardInterrupt still stops
            # the run; include the cause so failures can be diagnosed.
            print(f'Error: {url} ({exc!r})')
        # Throttle after failures too: they are often rate-limit responses.
        time.sleep(1)
            


In [None]:
# Run the full collection for the Computer Science archive, newest year first
# (years 23, 22, 21, 20).
archive = 'Computer Science'
for year in (23, 22, 21, 20):
    collection_function(archive, year, SHOW, MAX_PAPERS)