In [3]:
import requests
from bs4 import BeautifulSoup
import sys
import urllib.parse
import pandas as pd
import os

In [7]:
def get_pdf_url(doi, base_url):
    """Look up the direct PDF address for a DOI on one Sci-Hub mirror.

    Requests ``{base_url}/{doi}`` and searches the returned HTML for a PDF
    location, first in the ``src`` of the element with ``id="pdf"`` and then
    in a button whose ``onclick`` assigns ``location.href``.

    Args:
        doi: DOI string appended to the mirror address.
        base_url: Mirror root, e.g. ``"https://sci-hub.se"``.

    Returns:
        The absolute PDF URL (resolved against ``base_url``), or ``None`` when
        no link is found or when any error occurs. Errors are printed, not
        raised, so callers can simply try the next mirror.
    """
    url = f"{base_url}/{doi}"
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"
    }

    try:
        response = requests.get(url, headers=headers, timeout=30)
        response.raise_for_status()

        soup = BeautifulSoup(response.content, "html.parser")

        # Find direct PDF link.
        # NOTE(review): <embed id="pdf"> is accepted alongside <iframe id="pdf">
        # on the assumption that both expose the PDF location in `src` - confirm
        # against the mirrors actually in use.
        viewer = soup.find(["iframe", "embed"], id="pdf")
        if viewer:
            pdf_path = viewer.get("src")
            if pdf_path:
                return urllib.parse.urljoin(base_url, pdf_path)

        # Alternative for newer Sci-Hub versions
        button = soup.find("button", onclick=lambda x: x and "location.href" in x)
        if button:
            js_code = button["onclick"]
            # Keep everything after the FIRST "=" that follows location.href:
            # the target itself may contain "=" (e.g. "...pdf?download=true"),
            # which a plain split("=")[1] would cut off.
            assigned = js_code.partition("location.href")[2].partition("=")[2]
            pdf_path = assigned.strip(" \t\r\n'\";")
            if pdf_path:
                return urllib.parse.urljoin(base_url, pdf_path)

        return None

    except Exception as e:
        # Deliberate best-effort: a dead mirror must not stop the caller's loop.
        print(f"Error accessing {base_url}: {str(e)}")
        return None


def download_pdf(pdf_url, filename, folder_path):
    """Stream a PDF from ``pdf_url`` into ``folder_path/filename``.

    The body is first written to ``<filename>.part`` and only renamed to the
    final name once the transfer has completed, so an interrupted or failed
    download never leaves a file that looks like a finished PDF.

    Args:
        pdf_url: Absolute URL of the document.
        filename: Name of the file to create inside ``folder_path``.
        folder_path: Destination directory; created if missing.

    Returns:
        ``True`` when the file was fully written, ``False`` otherwise. Errors
        are printed rather than raised.
    """
    headers = {"User-Agent": "Mozilla/5.0"}
    target_path = os.path.join(folder_path, filename)
    partial_path = target_path + ".part"
    try:
        # timeout: without it requests may wait indefinitely on a stalled
        # server. The context manager releases the streamed connection.
        with requests.get(pdf_url, headers=headers, stream=True, timeout=30) as response:
            response.raise_for_status()

            # Create the folder if it doesn't exist
            if folder_path:
                os.makedirs(folder_path, exist_ok=True)

            saw_pdf_header = False
            with open(partial_path, "wb") as f:
                for chunk in response.iter_content(chunk_size=8192):
                    if not chunk:
                        continue
                    if not saw_pdf_header:
                        # Reject HTML error/captcha pages served with a 200
                        # status: a PDF carries "%PDF" near the very start.
                        if b"%PDF" not in chunk[:1024]:
                            raise ValueError("response is not a PDF document")
                        saw_pdf_header = True
                    f.write(chunk)

            if not saw_pdf_header:
                raise ValueError("response body is empty")

        # Publish the finished file under its final name in one step.
        os.replace(partial_path, target_path)
        return True
    except Exception as e:
        print(f"Error during download: {str(e)}")
        # Best-effort cleanup of the incomplete temporary file.
        try:
            if os.path.exists(partial_path):
                os.remove(partial_path)
        except OSError:
            pass
        return False
    
def get_doi_from_doi_url(doi_url):
    """Extract the bare DOI from a DOI URL or DOI-like string.

    Examples:
        "https://doi.org/10.1149/1.2069372"   -> "10.1149/1.2069372"
        "http://dx.doi.org/10.1149/1.2069372" -> "10.1149/1.2069372"
        "doi:10.1149/1.2069372"               -> "10.1149/1.2069372"
        "10.1149/1.2069372"                   -> "10.1149/1.2069372"

    Args:
        doi_url: String holding a DOI, optionally behind a ``doi.org`` resolver
            URL (any scheme, any letter case) or a ``doi:`` prefix.

    Returns:
        The DOI with the resolver prefix and surrounding whitespace removed.
        A string without any recognised prefix is returned stripped but
        otherwise unchanged.
    """
    import re  # local import: the notebook's import cell does not provide `re`

    text = doi_url.strip()

    # Everything after the first "doi.org/" is the DOI, whatever precedes it
    # (http/https, "dx." sub-domain, upper-case host, ...).
    resolver = re.search(r"doi\.org/", text, flags=re.IGNORECASE)
    if resolver:
        return text[resolver.end():].strip()

    if text[:4].lower() == "doi:":
        return text[4:].strip()

    return text


def get_pdf_from_doi_url(doi_url, folder_path, ID, scihub_domains=None):
    """Locate a paper on the Sci-Hub mirrors and save it as ``<ID>.pdf``.

    Each mirror is queried in order through ``get_pdf_url`` until one returns
    a PDF address; that address is then passed to ``download_pdf``. Progress
    and failures are reported with ``print``.

    Args:
        doi_url: DOI URL of the paper (e.g. "https://doi.org/10.1149/1.2069372").
        folder_path: Directory the PDF is written to.
        ID: Identifier used as the file name stem; may be a string or a number.
        scihub_domains: Optional iterable of mirror root URLs to try, in
            order. Defaults to the built-in list below.

    Returns:
        None.
    """
    # The former `len(sys.argv) != 2` / `sys.exit(1)` guard was a leftover from
    # a command-line script: the process arguments have nothing to do with this
    # function's parameters, and in a notebook they depend on how the kernel
    # was launched, so the check could terminate the cell for no reason.
    if scihub_domains is None:
        scihub_domains = [
            "https://sci-hub.se",
            "https://sci-hub.st",
            "https://sci-hub.ru",
            "https://sci-hub.ee",
        ]

    # The DOI does not depend on the mirror, so extract it once.
    doi = get_doi_from_doi_url(doi_url)

    pdf_url = None
    for domain in scihub_domains:
        print(f"Trying with domain: {domain}")
        pdf_url = get_pdf_url(doi, domain)
        if pdf_url:
            print(f"PDF found on {domain}")
            break

    if not pdf_url:
        print("Failed to locate PDF on all Sci-Hub domains")
        return

    # f-string instead of `ID + ".pdf"`: IDs read from a CSV may be integers,
    # for which string concatenation raises TypeError.
    filename = f"{ID}.pdf"

    print(f"Attempting download from: {pdf_url}")

    if download_pdf(pdf_url, filename, folder_path):
        print(f"PDF successfully saved as: {filename}")
    else:
        print("Download failed")

In [8]:
# Load the cleaned reference table (semicolon-separated, double-quoted fields).
# The loop below reads its 'ID', 'Reference' and 'DOI' columns.
df = pd.read_csv(
    "../4_references_to_doi/references_with_doi_clean.csv",
    quotechar='"',
    engine="python",
    sep=";",
)

destination_folder = "Articles cours sensing JMT"

# Fetch every reference whose PDF is not yet present in the destination folder.
for index, row in df.iterrows():
    expected_pdf = f"{destination_folder}/{row['ID']}.pdf"

    if not os.path.exists(expected_pdf):
        # Not on disk yet: try to fetch it through the Sci-Hub mirrors.
        print(f"Téléchargement du PDF pour {row['Reference']}")
        get_pdf_from_doi_url(row['DOI'], destination_folder, row['ID'])
    else:
        # Already downloaded on a previous run: nothing to do.
        print(f"PDF déjà téléchargé pour {row['Reference']}")

PDF déjà téléchargé pour D. Guyomard and J.M. Tarascon, J. Electrochem. Soc. Vol 175 (1992)
Téléchargement du PDF pour A. Blyr, C. Sigala et al. Jr. Electrochemical Soc. Vol. 145, 1998
Trying with domain: https://sci-hub.se
Error accessing https://sci-hub.se: HTTPSConnectionPool(host='sci-hub.se', port=443): Max retries exceeded with url: /10.1149/1.1838235 (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x113557d90>: Failed to resolve 'sci-hub.se' ([Errno 8] nodename nor servname provided, or not known)"))
Trying with domain: https://sci-hub.st
Error accessing https://sci-hub.st: HTTPSConnectionPool(host='sci-hub.st', port=443): Max retries exceeded with url: /10.1149/1.1838235 (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x1142ac690>: Failed to resolve 'sci-hub.st' ([Errno 8] nodename nor servname provided, or not known)"))
Trying with domain: https://sci-hub.ru
Error accessing https://sci-hub.ru: HTTPSConnectionPool(host