# Library

In [5]:
import arxiv, os, requests

In [6]:
def download_pdfs(pdf_links, download_folder="documents", filename="", timeout=30):
    """Download one PDF and save it as ``<download_folder>/<filename>.pdf``.

    Args:
        pdf_links: URL of the PDF to fetch (a single URL, despite the
            plural name).
        download_folder: Directory to save into; created if it is missing.
        filename: Base name of the saved file, without the ``.pdf``
            extension. Path separators in it are replaced with ``_``.
            When empty, the last segment of the URL is used instead.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``.

    Returns:
        None. Success and failure are reported with ``print``; download
        and file errors are caught here rather than raised to the caller.
    """
    url = pdf_links

    # Create the download folder if it doesn't already exist.
    if not os.path.exists(download_folder):
        # exist_ok guards against another process creating it in between.
        os.makedirs(download_folder, exist_ok=True)
        print(f"Created directory: {download_folder}")

    partial_path = None
    try:
        if filename:
            base_name = filename
        else:
            # No name given: fall back to the last URL segment so the file
            # is not saved as a bare, hidden ".pdf".
            base_name = url.rstrip("/").split("/")[-1]
            if base_name.lower().endswith(".pdf"):
                base_name = base_name[:-len(".pdf")]

        # A separator inside the name (e.g. "/" in a paper title) would
        # point into a sub-directory that does not exist.
        for separator in (os.sep, os.altsep):
            if separator:
                base_name = base_name.replace(separator, "_")

        target_path = os.path.join(download_folder, base_name + '.pdf')
        partial_path = target_path + ".part"

        # The context manager releases the connection even on failure; the
        # timeout keeps a stalled server from hanging the run forever.
        with requests.get(url, stream=True, timeout=timeout) as response:
            # Raise an exception for bad status codes (4xx or 5xx)
            response.raise_for_status()

            # Stream to a temporary file so that an interrupted download
            # never leaves a truncated file under the final name.
            with open(partial_path, 'wb') as f:
                for chunk in response.iter_content(chunk_size=8192):
                    f.write(chunk)

        os.replace(partial_path, target_path)
        print(f"Successfully downloaded: {target_path}")

    except requests.exceptions.RequestException as e:
        print(f"Error downloading {url}: {e}")
    except Exception as e:
        print(f"An unexpected error occurred for {url}: {e}")
    finally:
        # Remove the leftover partial file after a failed download.
        if partial_path and os.path.exists(partial_path):
            try:
                os.remove(partial_path)
            except OSError:
                pass

# arXiv finance retriever
### arXiv q-fin finance topics
- `q-fin.RM`    Risk Management
- `q-fin.CP`    Computational Finance
- `q-fin.ST`    Statistical Finance
- `q-fin.TR`    Trading and Market Microstructure
- `q-fin.EC`    Economics
- `q-fin.GN`    General Finance
- `q-fin.MF`	Mathematical Finance
- `q-fin.PM`	Portfolio Management
- `q-fin.PR`	Pricing of Securities


In [7]:
def arxiv_ingestion(query="cat:q-fin*", max_results=20, data_root="../data"):
    """Search arXiv and download each result's PDF into per-category folders.

    For every search result, the PDF is downloaded once for each of the
    result's categories whose name contains ``q-fin``, into
    ``<data_root>/<category>/``. The file is named after the paper title
    with spaces removed. A separator line is printed after each result.

    Args:
        query: arXiv search query; the default matches the q-fin
            (quantitative finance) categories.
        max_results: Maximum number of search results to request.
        data_root: Directory under which the category folders are created.

    Returns:
        None.
    """
    # API connection
    client = arxiv.Client()
    search = arxiv.Search(
        query=query,
        max_results=max_results,
        sort_by=arxiv.SortCriterion.SubmittedDate,  # sort by submission date
    )

    for result in client.results(search):
        # The file name is the same for every category of this paper.
        file_stem = result.title.replace(" ", "")
        for category in result.categories:
            # Cross-listed papers also carry non-finance categories; skip those.
            if 'q-fin' not in category:
                continue
            download_pdfs(
                pdf_links=result.pdf_url,
                download_folder=f"{data_root}/{category}/",
                filename=file_stem,
            )
        print("-" * 30)

In [8]:
# Entry point: run the arXiv ingestion only when this file is executed
# directly, not when it is imported.
if __name__ == "__main__":
    arxiv_ingestion()

Successfully downloaded: ../data/q-fin.RM/DisasterRiskFinancingthroughTaxation:AFrameworkforRegionalParticipationinCollectiveRisk-Sharing.pdf
------------------------------
Successfully downloaded: ../data/q-fin.EC/TheTheoryofEconomicComplexity.pdf
------------------------------
Successfully downloaded: ../data/q-fin.EC/AnAI-poweredToolforCentralBankBusinessLiaisons:QuantitativeIndicatorsandOn-demandInsightsfromFirms.pdf
------------------------------
Successfully downloaded: ../data/q-fin.PR/Americanoptionsvaluationintime-dependentjump-diffusionmodelsviaintegralequationsandcharacteristicfunctions.pdf
Successfully downloaded: ../data/q-fin.CP/Americanoptionsvaluationintime-dependentjump-diffusionmodelsviaintegralequationsandcharacteristicfunctions.pdf
Successfully downloaded: ../data/q-fin.MF/Americanoptionsvaluationintime-dependentjump-diffusionmodelsviaintegralequationsandcharacteristicfunctions.pdf
------------------------------
Successfully downloaded: ../data/q-fin.TR/CausalInterv