In [271]:
import json
import os
import re
import urllib.parse
import urllib.request

import arxiv
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm


def standard_url_alphanumeric(input_string):
    """Turn free text into a lowercase, hyphen-separated slug.

    Steps, in order:
      1. Every character that is not a word character, whitespace or a
         hyphen is replaced by a space, so punctuation acts as a word break.
      2. Each run of hyphens and/or whitespace collapses to a single hyphen.
      3. Only characters from the URL-safe set below are kept; word
         characters outside that set (e.g. accented letters) are dropped.
      4. The result is lower-cased and stripped of leading/trailing hyphens.

    Args:
        input_string: Text to convert.

    Returns:
        The slug as a ``str`` (possibly empty).
    """
    url_safe = r'[a-zA-Z0-9\-._~:/?#\[\]@!$&\'()*+,;=%]'
    spaced = re.sub(r'[^\w\s-]', ' ', input_string)
    hyphenated = re.sub(r'[-\s]+', '-', spaced).strip("-")
    kept = re.findall(url_safe, hyphenated)
    return "".join(kept).lower().strip('-')


def standard_title_alphanumeric(input_string):
    """Turn a title into a lowercase, underscore-separated identifier.

    Steps, in order:
      1. Every character that is not a word character, whitespace or a
         hyphen is replaced by a space, so punctuation acts as a word break.
      2. Each run of hyphens and/or whitespace collapses to one underscore.
      3. Only characters from the allowed set below are kept; word
         characters outside that set (e.g. accented letters) are dropped.
      4. The result is lower-cased and stripped of leading/trailing
         underscores.

    Args:
        input_string: Title text to convert.

    Returns:
        The normalized title as a ``str`` (possibly empty).
    """
    permitted = r'[a-zA-Z0-9\-._~:/?#\[\]@!$&\'()*+,;=%]'
    spaced = re.sub(r'[^\w\s-]', ' ', input_string)
    underscored = re.sub(r'[-\s]+', '_', spaced).strip("_")
    kept = re.findall(permitted, underscored)
    return "".join(kept).lower().strip('_')


def arxiv_downloader(article_title: str, directory: str):
    """Search arXiv for an article by title and download its PDF into ``directory``.

    Only the single most relevant search hit is considered. It is accepted
    when its title equals ``article_title`` after both have been passed
    through ``standard_title_alphanumeric``, so differences in case,
    punctuation and whitespace are ignored. The PDF is written to
    ``<directory>/<normalized title>.pdf``.

    Args:
        article_title: Title of the article to look for.
        directory: Folder that receives the PDF; created if it does not exist.

    Returns:
        None. The outcome is reported via ``print`` and a ``tqdm`` progress bar.

    Raises:
        requests.HTTPError: If the PDF request answers with an error status.
    """
    search = arxiv.Search(
        query=article_title,
        max_results=1,
        sort_by=arxiv.SortCriterion.Relevance,
    )
    wanted = standard_title_alphanumeric(article_title)

    # max_results=1 means at most one hit; next() with a default also covers
    # a search that returns nothing, which previously printed no message.
    hit = next(iter(arxiv.Client().results(search)), None)
    if hit is None or standard_title_alphanumeric(hit.title) != wanted:
        print("Article not found on arXiv.")
        return

    print("Article found on arXiv.")
    os.makedirs(directory, exist_ok=True)
    # Save inside the requested directory, not the current working directory.
    target_path = os.path.join(directory, f"{wanted}.pdf")

    with requests.get(hit.pdf_url, stream=True, timeout=30) as response:
        # Fail before opening the file so an HTML error page is never saved
        # under a .pdf name.
        response.raise_for_status()
        total_size = int(response.headers.get("content-length", 0))
        with open(target_path, "wb") as f:
            with tqdm(total=total_size, unit="B", unit_scale=True, desc="Downloading...") as pbar:
                for chunk in response.iter_content(chunk_size=1024):
                    f.write(chunk)
                    pbar.update(len(chunk))


def chemrxiv_downloader(article_title: str, directory: str, max_results: int = 1):
    """Search ChemRxiv for an article by title and download its PDF into ``directory``.

    The ChemRxiv public API is queried for up to ``max_results`` items sorted
    by relevance. The first returned item whose title equals
    ``article_title`` after both have been passed through
    ``standard_title_alphanumeric`` is downloaded to
    ``<directory>/<normalized title>.pdf``.

    Args:
        article_title: Title of the article to look for.
        directory: Folder that receives the PDF; created if it does not exist.
        max_results: Maximum number of search hits to request and inspect.

    Returns:
        None when the article was downloaded or not found (the outcome is
        printed). The string ``"An error occurred."`` when any exception was
        raised while searching or downloading; the exception is printed first.
    """
    # Collapse whitespace, then percent-encode so characters such as '&',
    # '#' or '?' inside a title cannot break the query string.
    search_term = urllib.parse.quote(" ".join(article_title.split()), safe="")
    chemrxiv_url = f'https://chemrxiv.org/engage/chemrxiv/public-api/v1/items?term={search_term}&sort=RELEVANT_DESC&limit={max_results}'
    wanted = standard_title_alphanumeric(article_title)

    try:
        req = urllib.request.Request(
            url=chemrxiv_url,
            headers={"User-Agent": "Mozilla/5.0"},
        )
        with urllib.request.urlopen(req, timeout=30) as api_response:
            payload = json.loads(api_response.read().decode("utf-8"))

        # Inspect every returned hit so that max_results > 1 is meaningful.
        matched_id = None
        for hit in payload["itemHits"]:
            item = hit["item"]
            hit_title = standard_title_alphanumeric(item["title"].replace("\n", ""))
            if hit_title == wanted:
                matched_id = item["id"]
                break

        if matched_id is None:
            print("Article not found on ChemRxiv.")
            return

        print("Article found on ChemRxiv.")
        pdf_url = (
            "https://chemrxiv.org/engage/api-gateway/chemrxiv/assets/orp/resource/item/"
            + matched_id + "/original/" + wanted + ".pdf"
        )
        os.makedirs(directory, exist_ok=True)
        # Save inside the requested directory, not the current working directory.
        target_path = os.path.join(directory, f"{wanted}.pdf")

        with requests.get(pdf_url, stream=True, timeout=30) as pdf_response:
            # Fail before opening the file so an error page is never saved
            # under a .pdf name.
            pdf_response.raise_for_status()
            total_size = int(pdf_response.headers.get("content-length", 0))
            with open(target_path, "wb") as f:
                with tqdm(total=total_size, unit="B", unit_scale=True, desc="Downloading...") as pbar:
                    for chunk in pdf_response.iter_content(chunk_size=1024):
                        f.write(chunk)
                        pbar.update(len(chunk))

    except Exception as e:
        # Best-effort contract kept (no raise), but the cause is now visible.
        print(f"ChemRxiv download failed: {e!r}")
        return "An error occurred."
    

def _biomed_rxiv_downloader(article_title: str, directory: str, base_url: str, server_name: str):
    """Shared search-and-download routine behind the bioRxiv and medRxiv downloaders.

    Fetches ``<base_url>/search/<query>``, reads the result titles and links
    from the returned HTML, and downloads the first result's PDF to
    ``<directory>/<normalized title>.pdf`` when its title equals
    ``article_title`` after both have been passed through
    ``standard_title_alphanumeric``.

    Args:
        article_title: Title of the article to look for.
        directory: Folder that receives the PDF; created if it does not exist.
        base_url: Scheme and host of the server, without a trailing slash.
        server_name: Display name used in the printed messages.

    Returns:
        None when the article was downloaded or not found (the outcome is
        printed). The string ``"An error occurred."`` when any exception was
        raised while searching or downloading; the exception is printed first.
    """
    # Percent-encode each word so characters such as '?', '#' or '/' in a
    # title cannot alter the URL path; words stay joined by the "%252B"
    # separator the search URL has always used.
    search_words = [urllib.parse.quote(word, safe="") for word in article_title.split()]
    search_url = f"{base_url}/search/" + "%252B".join(search_words)
    wanted = standard_title_alphanumeric(article_title)

    try:
        req = urllib.request.Request(
            url=search_url,
            headers={"User-Agent": "Mozilla/5.0"},
        )
        with urllib.request.urlopen(req, timeout=30) as search_response:
            soup = BeautifulSoup(search_response.read(), "html.parser")

        titles = [
            span.get_text(strip=True)
            for span in soup.find_all("span", class_="highwire-cite-title")
        ]
        links = [
            f"{base_url}{anchor['href']}"
            for anchor in soup.find_all("a", class_="highwire-cite-linked-title")
        ]
        if not links:
            # No result links at all: previously a ZeroDivisionError that was
            # reported as a generic error.
            print(f"Article not found on {server_name}.")
            return

        # NOTE(review): selection rule kept exactly as before. It presumably
        # relies on every result contributing the same number of title spans,
        # with the wanted one at offset 1 -- confirm against the current page
        # markup.
        titles = titles[1::len(titles) // len(links)]
        if not titles:
            print(f"Article not found on {server_name}.")
            return

        if standard_title_alphanumeric(titles[0]) != wanted:
            print(f"Article not found on {server_name}.")
            return

        print(f"Article found on {server_name}.")
        pdf_url = links[0] + ".full.pdf"
        os.makedirs(directory, exist_ok=True)
        # Save inside the requested directory, not the current working directory.
        target_path = os.path.join(directory, f"{wanted}.pdf")

        with requests.get(pdf_url, stream=True, timeout=30) as pdf_response:
            # Fail before opening the file so an error page is never saved
            # under a .pdf name.
            pdf_response.raise_for_status()
            total_size = int(pdf_response.headers.get("content-length", 0))
            with open(target_path, "wb") as f:
                with tqdm(total=total_size, unit="B", unit_scale=True, desc="Downloading...") as pbar:
                    for chunk in pdf_response.iter_content(chunk_size=1024):
                        f.write(chunk)
                        pbar.update(len(chunk))

    except Exception as e:
        # Best-effort contract kept (no raise), but the cause is now visible.
        print(f"{server_name} download failed: {e!r}")
        return "An error occurred."


def biorxiv_downloader(article_title: str, directory: str):
    """Search bioRxiv for an article by title and download its PDF into ``directory``.

    Args:
        article_title: Title of the article to look for.
        directory: Folder that receives the PDF; created if it does not exist.

    Returns:
        None on download or not-found (outcome is printed), or the string
        ``"An error occurred."`` if searching or downloading raised.
    """
    return _biomed_rxiv_downloader(
        article_title, directory, "https://www.biorxiv.org", "bioRxiv"
    )


def medrxiv_downloader(article_title: str, directory: str):
    """Search medRxiv for an article by title and download its PDF into ``directory``.

    Args:
        article_title: Title of the article to look for.
        directory: Folder that receives the PDF; created if it does not exist.

    Returns:
        None on download or not-found (outcome is printed), or the string
        ``"An error occurred."`` if searching or downloading raised.
    """
    return _biomed_rxiv_downloader(
        article_title, directory, "https://www.medrxiv.org", "medRxiv"
    )

In [240]:
# Demo: look up one arXiv paper by its exact title and download the PDF.
title = "High-Tc superconductor candidates proposed by machine learning"
arxiv_downloader(
    article_title=title,
    directory="/home/siwoo/PythonScripts/Natural_Language_Processing",
)

Article found on arXiv.


Downloading...: 100%|██████████| 8.63M/8.63M [00:01<00:00, 4.56MB/s]


In [272]:
# Demo: look up one ChemRxiv paper by its exact title and download the PDF.
title = "Swern oxidation transition state theory is ok"
chemrxiv_downloader(
    article_title=title,
    directory="/home/siwoo/PythonScripts/Natural_Language_Processing",
)

Article found on ChemRxiv.


Downloading...: 100%|██████████| 9.96M/9.96M [00:02<00:00, 4.11MB/s]


In [262]:
# Demo: look up one bioRxiv paper by its exact title and download the PDF.
title = "Functional identification of soluble uric acid as an endogenous inhibitor of CD38"
biorxiv_downloader(
    article_title=title,
    directory="/home/siwoo/PythonScripts/Natural_Language_Processing",
)

Article found on bioRxiv.


Downloading...: 7.32MB [00:01, 6.29MB/s]                            


In [264]:
# Demo: look up one medRxiv paper by its exact title and download the PDF.
title = "Complex patterns of multimorbidity associated with severe COVID-19 and Long COVID"
medrxiv_downloader(
    article_title=title,
    directory="/home/siwoo/PythonScripts/Natural_Language_Processing",
)

Article found on medRxiv.


Downloading...: 100%|██████████| 9.45M/9.45M [00:02<00:00, 3.25MB/s]


In [267]:
def article_search_and_download(article_title: str, directory: str, pre_print: str):
    """Dispatch a title search and PDF download to the chosen preprint server.

    Args:
        article_title: Title of the article to look for.
        directory: Folder passed through to the server-specific downloader.
        pre_print: One of ``"arxiv"``, ``"chemrxiv"``, ``"biorxiv"`` or
            ``"medrxiv"``.

    Returns:
        Whatever the selected downloader returns.

    Raises:
        ValueError: If ``pre_print`` is not one of the allowed options.
    """
    allowed_options = {"arxiv", "chemrxiv", "biorxiv", "medrxiv"}
    if pre_print not in allowed_options:
        raise ValueError(f"Invalid option '{pre_print}'. Allowed options are: {allowed_options}")

    # One table for all servers, so every branch returns its downloader's
    # result (the medRxiv branch previously dropped it).
    downloaders = {
        "arxiv": arxiv_downloader,
        "chemrxiv": chemrxiv_downloader,
        "biorxiv": biorxiv_downloader,
        "medrxiv": medrxiv_downloader,
    }
    return downloaders[pre_print](article_title, directory)

In [270]:
# Demo: same arXiv download as above, routed through the dispatcher.
article_search_and_download(
    article_title="High-Tc superconductor candidates proposed by machine learning",
    directory="/home/siwoo/PythonScripts/Natural_Language_Processing",
    pre_print="arxiv",
)

Article found on arXiv.


Downloading...: 100%|██████████| 8.63M/8.63M [00:01<00:00, 5.26MB/s]
