In [1]:
import asyncio
import logging
import os
import re
import sys
from typing import Optional

import httpx
from parsel import Selector

# Set up logging
# Module-level logger shared by every function defined in this notebook.
logger = logging.getLogger(__name__)

# Route log records to stdout so they appear in the cell output.
# NOTE(review): `level` is left commented out below, so unless logging was
# configured elsewhere the root logger keeps its default WARNING threshold and
# the `logger.info(...)` calls in this notebook are not displayed. Uncomment
# `level` (and optionally `format`) to see them.
logging.basicConfig(
    # format="%(asctime)s | %(levelname)s | %(filename)s:%(lineno)d | %(funcName)s | %(message)s",
    # level=logging.DEBUG,
    stream=sys.stdout,
)


In [None]:
def _absolute_url(base_url: str, href: str) -> str:
    """Resolve ``href`` against ``base_url`` and return the absolute URL as a string."""
    # ``httpx.URL`` has no ``unicode_string()`` method; ``str()`` is the supported
    # way to obtain the textual form of the joined URL.
    return str(httpx.URL(base_url).join(href))


def _find_domain_specific_pdf(selector, final_url: str, page_text: str) -> Optional[str]:
    """Apply publisher-specific heuristics to the landing page; return a PDF URL or None."""
    # Most of these patterns come from reverse engineering the Unpaywall Chrome extension.
    for link in selector.xpath("//a"):
        href = link.xpath("@href").get()
        if not href:
            continue

        if "nature.com" in final_url:
            # 1. Nature.com (Pattern 1)
            if re.search(r"/nature/journal/.+?/pdf/(.+?)\.pdf$", href):
                pdf_url = _absolute_url(final_url, href)
                logger.info(f"Found PDF URL (Nature.com Pattern 1): {pdf_url}")
                return pdf_url

            # 2. Nature.com (Pattern 2)
            if re.search(r"/articles/nmicrobiol\d+\.pdf$", href):
                pdf_url = _absolute_url(final_url, href)
                logger.info(f"Found PDF URL (Nature.com Pattern 2): {pdf_url}")
                return pdf_url

        # 3. NEJM
        if "nejm.org" in final_url:
            if link.xpath("@data-download-content").get() == "Article":
                pdf_url = _absolute_url(final_url, href)
                logger.info(f"Found PDF URL (NEJM): {pdf_url}")
                return pdf_url

        # 4. Taylor & Francis Online
        if "tandfonline.com" in final_url:
            if re.search(r"/doi/pdf/10.+?needAccess=true", href, re.IGNORECASE):
                pdf_url = _absolute_url(final_url, href)
                logger.info(f"Found PDF URL (Taylor & Francis): {pdf_url}")
                return pdf_url

        # 5. Centers for Disease Control (CDC)
        if "cdc.gov" in final_url:
            if "noDecoration" == link.xpath("@class").get() and re.search(r"\.pdf$", href):
                pdf_url = _absolute_url(final_url, href)
                logger.info(f"Found PDF URL (CDC): {pdf_url}")
                return pdf_url

        # 6. ScienceDirect
        if "sciencedirect.com" in final_url:
            pdf_url_attribute = link.xpath("@pdfurl").get()
            if pdf_url_attribute:
                pdf_url = _absolute_url(final_url, pdf_url_attribute)
                logger.info(f"Found PDF URL (ScienceDirect): {pdf_url}")
                return pdf_url

    # 7. IEEE Explore (the PDF path is embedded in the page source, not in an <a> tag)
    if "ieeexplore.ieee.org" in final_url:
        match = re.search(r'"pdfPath":"(.+?)\.pdf"', page_text)
        if match:
            pdf_url = "https://ieeexplore.ieee.org" + match.group(1) + ".pdf"
            logger.info(f"Found PDF URL (IEEE Explore): {pdf_url}")
            return pdf_url

    return None


def _find_general_pdf_link(hrefs, final_url: str, doi: str) -> Optional[str]:
    """Fallback heuristic: pick the most plausible PDF-looking link among ``hrefs``.

    Links that contain the last three characters of the DOI are preferred, because
    publishers commonly embed the DOI suffix in the PDF URL. If no such link
    exists, the first link matching any generic pattern is used.
    """
    pdf_patterns = (
        ".pdf",
        "/pdf/",
        "pdf/",
        "download",
        "fulltext",
        "article",
        "viewer",
        "content/pdf",
        "/nature/journal",
        "/articles/",
        "/doi/pdf/",
    )
    candidates = [href for href in hrefs if any(pattern in href.lower() for pattern in pdf_patterns)]
    if not candidates:
        return None

    # Compare case-insensitively: the href is lowercased, so the DOI suffix must be too.
    doi_suffix = doi[-3:].lower()
    chosen = next((href for href in candidates if doi_suffix in href.lower()), candidates[0])

    pdf_url = _absolute_url(final_url, chosen)
    logger.info(f"Found PDF link (General Pattern): {pdf_url}")
    return pdf_url


async def scrape_pdf_link(doi: str) -> Optional[str]:
    """
    Finds a direct PDF link for a paper, first via Unpaywall and then by scraping.

    The lookup proceeds in this order:
      1. Ask the Unpaywall API; if it reports an open-access PDF URL, return it.
      2. Otherwise fetch the article landing page (following redirects) and look for
         a ``citation_pdf_url`` meta tag.
      3. Try publisher-specific link patterns.
      4. Fall back to generic "looks like a PDF link" patterns.

    A failed Unpaywall lookup is logged and does not abort the search; the
    landing page at ``https://doi.org/<doi>`` is scraped instead.

    Args:
        doi: The DOI of the paper (e.g. "10.3390/math13030442").

    Returns:
        The direct PDF URL if found, otherwise None. Errors are logged, never raised.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
        "Referer": "https://scholar.google.com",  # Some sites require a referrer
    }

    unpaywall_url = f"https://api.unpaywall.org/v2/{doi}?email=unpaywall@impactstory.org"

    try:
        # --- Unpaywall Check ---
        data = None
        try:
            async with httpx.AsyncClient(timeout=10) as client:
                response = await client.get(unpaywall_url)
                response.raise_for_status()
                data = response.json()
        except httpx.HTTPStatusError as e:
            logger.error(f"Unpaywall API error ({e.response.status_code}): {e}")
            if e.response.status_code == 404:
                logger.error(f"Paper with DOI {doi} not found by Unpaywall")
        except httpx.RequestError as e:
            logger.error(f"Request error: {e}")

        paper_url = None
        if isinstance(data, dict):
            paper_url = data.get("doi_url")

            if data.get("is_oa"):
                logger.info(f"Paper is Open Access according to Unpaywall. DOI: {doi}")

                direct_pdf_url = (data.get("best_oa_location") or {}).get("url_for_pdf")
                if direct_pdf_url:
                    logger.info(f"Found direct PDF URL from Unpaywall: {direct_pdf_url}")
                    return direct_pdf_url
            else:
                logger.info(f"Paper is NOT Open Access according to Unpaywall. DOI: {doi}")

        # Unpaywall unavailable or without a landing page: use the DOI resolver directly.
        if not paper_url:
            paper_url = f"https://doi.org/{doi}"

        # Get final redirected URL (important for DOI links)
        async with httpx.AsyncClient(timeout=20, follow_redirects=True) as client:
            response = await client.get(paper_url, headers=headers)
            response.raise_for_status()
            logger.info(f"Final URL after redirect: {response.url}")

        final_url = str(response.url)
        selector = Selector(text=response.text)

        # --- Meta Tag Check ---
        meta_pdf_url = selector.xpath("//meta[@name='citation_pdf_url']/@content").get()
        if meta_pdf_url:
            # Resolve against the page URL in case the publisher emitted a relative path.
            meta_pdf_url = _absolute_url(final_url, meta_pdf_url)
            logger.info(f"Found PDF URL in meta tag: {meta_pdf_url}")
            return meta_pdf_url

        # --- Domain-Specific Link Checks ---
        pdf_url = _find_domain_specific_pdf(selector, final_url, response.text)
        if pdf_url:
            return pdf_url

        # --- General PDF Pattern Check (Fallback) ---
        pdf_url = _find_general_pdf_link(selector.css("a::attr(href)").getall(), final_url, doi)
        if pdf_url:
            return pdf_url

        logger.warning("No PDF link found on the page.")
        return None

    except httpx.HTTPStatusError as e:
        # Unpaywall errors are handled above, so this can only be the article page.
        logger.error(f"HTTP error fetching article page ({e.response.status_code}): {e}")
        return None

    except httpx.RequestError as e:
        logger.error(f"Request error: {e}")
        return None

    except Exception as e:
        logger.exception(f"An unexpected error occurred: {e}")
        return None


Example Unpaywall response when the DOI is not openly accessible (`is_oa` is false and `best_oa_location` is null):

https://api.unpaywall.org/v2/10.1109/icetet-sip58143.2023.10151614?email=unpaywall@impactstory.org

{
"doi": "10.1109/icetet-sip58143.2023.10151614",
"doi_url": "https://doi.org/10.1109/icetet-sip58143.2023.10151614",
"title": "Algorithmic Trading Strategy Using Technical Indicators",
"genre": "proceedings-article",
"is_paratext": false,
"published_date": "2023-04-28",
"year": 2023,
"journal_name": "2023 11th International Conference on Emerging Trends in Engineering &amp; Technology - Signal and Information Processing (ICETET - SIP)",
"is_oa": false, <-----
"oa_status": "closed",
"has_repository_copy": false,
"best_oa_location": null,
"first_oa_location": null,
"oa_locations": [],
"oa_locations_embargoed": [],
"updated": "2023-06-20T04:32:38.955606",
}

Example Unpaywall response when the DOI is openly accessible (`is_oa` is true and `best_oa_location` is populated):

https://api.unpaywall.org/v2/10.1109/ACCESS.2024.3516053?email=unpaywall@impactstory.org

{
"doi": "10.1109/access.2024.3516053",
"doi_url": "https://doi.org/10.1109/access.2024.3516053",
"title": "An Algorithmic Trading Approach Merging Machine Learning with Multi-Indicator Strategies for Optimal Performance",
"genre": "journal-article",
"is_paratext": false,
"published_date": "2024-01-01",
...,
"is_oa": true, <-----
"oa_status": "gold",
"has_repository_copy": false,
"best_oa_location": {
"url": "https://doi.org/10.1109/access.2024.3516053",
"pmh_id": null,
...,
}

}


In [3]:
async def download_paper(doi: str, title: str, output_dir: str = "downloads") -> Optional[str]:
    """
    Downloads a paper PDF given its DOI and title.

    The PDF URL is resolved with `scrape_pdf_link(doi)`. The file is saved as
    `<sanitized title>_<sanitized doi>.pdf` inside `output_dir`; if that file
    already exists the download is skipped and the existing path is returned.

    Args:
        doi: The DOI of the paper.
        title: The title of the paper (for the filename).
        output_dir: The directory to save the downloaded PDF (created if missing).

    Returns:
        The file path of the downloaded PDF if successful, otherwise None.
        Errors are logged, never raised.
    """
    try:
        os.makedirs(output_dir, exist_ok=True)

        # Sanitize title AND DOI for a safe filename (DOIs may contain ':', '<', '(' ...).
        unsafe_chars = r"[^\w\-_\.]"
        safe_title = re.sub(unsafe_chars, "_", title)
        safe_doi = re.sub(unsafe_chars, "_", doi)
        suffix = f"_{safe_doi}.pdf"
        # Common filesystems cap a single file name at 255 characters; trim only the
        # title so the DOI (the unique part) survives. Names that already fit are unchanged.
        max_title_len = max(255 - len(suffix), 0)
        file_name = f"{safe_title[:max_title_len]}{suffix}"
        file_path = os.path.join(output_dir, file_name)

        # Check if file already exists
        if os.path.exists(file_path):
            logger.info(f"Skipping download. PDF for DOI: {doi} already exists at {file_path}.")
            return file_path

        pdf_link = await scrape_pdf_link(doi)

        if not pdf_link:
            logger.error(f"Could not find a PDF link for DOI: {doi}")
            return None

        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36",
            "Accept": "application/pdf,text/html,*/*",
            "Referer": "https://scholar.google.com",  # Some sites require a referrer
        }

        # Download the PDF
        async with httpx.AsyncClient() as client:
            try:
                response = await client.get(pdf_link, headers=headers, follow_redirects=True, timeout=30)
                response.raise_for_status()
            except httpx.HTTPStatusError as e:
                logger.error(f"HTTP error downloading PDF for DOI {doi}: {e.response.status_code}")
                return None
            except httpx.RequestError as e:
                logger.error(f"Request error downloading PDF for DOI {doi}: {e}")
                return None

        content_type = response.headers.get("Content-Type", "")
        logger.info(f"Content-Type received: {content_type}")

        # Some servers deliver PDFs with a generic Content-Type such as
        # application/octet-stream, so also accept the "%PDF-" file signature.
        has_pdf_signature = response.content[:1024].lstrip().startswith(b"%PDF-")
        if "pdf" not in content_type.lower() and not has_pdf_signature:
            logger.error("The downloaded file is not a PDF!")
            return None

        # Write to a temporary name and rename afterwards, so an interrupted write never
        # leaves a truncated file that a later run would mistake for a finished download.
        tmp_path = f"{file_path}.part"
        try:
            with open(tmp_path, "wb") as f:
                f.write(response.content)
            os.replace(tmp_path, file_path)
        except OSError:
            if os.path.exists(tmp_path):
                os.remove(tmp_path)
            raise

        logger.info(f"Downloaded PDF for DOI: {doi} to {file_path}")
        return file_path

    except Exception as e:
        logger.exception(f"General error downloading PDF for DOI {doi}: {e}")
        return None


In [4]:
async def download_papers_parallel(results, limit=5, download=False, output_dir="downloads"):
    """
    Downloads the PDFs of the top search results concurrently and reports the outcome.

    Only entries that carry a DOI under ``item["externalIds"]["DOI"]`` are queued;
    entries without one are skipped.

    Args:
        results: The search results dictionary; entries are read from ``results["data"]``.
        limit: The number of top results to consider.
        download: If True, download PDFs for the considered results; if False nothing
            is downloaded.
        output_dir: The directory passed to `download_paper` for saving the PDFs.

    Returns:
        A list with one entry per queued DOI, in queue order: the saved file path, or
        None when that download failed. Empty when nothing was queued.
    """
    print(f"\n  Search Results (Top {limit}):")
    doi_title_pairs_for_download = []

    if download:
        for item in ((results or {}).get("data") or [])[:limit]:
            doi = (item.get("externalIds") or {}).get("DOI")
            if not doi:
                continue
            # `title` may be present but None; fall back to a placeholder in both cases.
            title = item.get("title") or "Unknown_Title"
            doi_title_pairs_for_download.append((doi, title))  # Collect DOI and title for parallel download
            print(f"      Downloading DOI {doi}")  # Indicate download is initiated

    print()

    downloaded_files = []
    if doi_title_pairs_for_download:
        download_tasks = [download_paper(doi, title, output_dir) for doi, title in doi_title_pairs_for_download]
        downloaded_files = await asyncio.gather(*download_tasks)

        successful_downloads = sum(1 for file_path in downloaded_files if file_path)
        failed_downloads = len(downloaded_files) - successful_downloads

        logger.info("--- Parallel Download Statistics ---")
        logger.info(f"Total papers attempted: {len(doi_title_pairs_for_download)}")
        logger.info(f"Successfully downloaded: {successful_downloads}")
        logger.info(f"Failed downloads: {failed_downloads}")
        logger.info("-----------------------------------\n")

        # Pair each result with the DOI it was queued for. Indexing by position in
        # results["data"] would misalign as soon as one entry lacks a DOI.
        for (doi, _title), file_path in zip(doi_title_pairs_for_download, downloaded_files):
            if file_path:
                print(f"      Downloaded DOI: {doi} PDF to: {file_path}")
            else:
                print(f"      Failed to download PDF for DOI: {doi}")
    return downloaded_files


In [5]:
async def main():
    """Demo entry point: download three example papers in parallel and log the outcome."""
    dois_titles = [
        (
            "10.3390/math13030442",
            "Sustainability, Accuracy, Fairness, and Explainability (SAFE) Machine Learning in Quantitative Trading",
        ),
        (
            "10.30574/ijsra.2024.11.1.0292",
            "Machine learning in financial markets: A critical review of algorithmic trading and risk management",
        ),
        ("10.30574/wjaets.2024.11.1.0054", "Algorithmic Trading and AI: A Review of Strategies and Market Impact"),
    ]

    # Wrap the (doi, title) tuples in the {"data": [...]} shape that
    # `download_papers_parallel` reads its entries from.
    results_data = [{"externalIds": {"DOI": doi}, "title": title} for doi, title in dois_titles]
    results_object = {"data": results_data}

    downloaded_files = await download_papers_parallel(results_object, limit=len(results_data), download=True)

    # The result holds one entry per paper (None for a failure), so a plain truthiness
    # check on the list would report success even when every download failed.
    saved_paths = [file_path for file_path in (downloaded_files or []) if file_path]

    if saved_paths:
        logger.info(f"PDF downloaded: {saved_paths}")
    else:
        logger.error("Failed to download PDF.")


await main()


  Search Results (Top 3):
      Downloading DOI 10.3390/math13030442
      Downloading DOI 10.30574/ijsra.2024.11.1.0292
      Downloading DOI 10.30574/wjaets.2024.11.1.0054

      Downloaded DOI: 10.3390/math13030442 PDF to: downloads\Sustainability__Accuracy__Fairness__and_Explainability__SAFE__Machine_Learning_in_Quantitative_Trading_10.3390_math13030442.pdf
      Downloaded DOI: 10.30574/ijsra.2024.11.1.0292 PDF to: downloads\Machine_learning_in_financial_markets__A_critical_review_of_algorithmic_trading_and_risk_management_10.30574_ijsra.2024.11.1.0292.pdf
      Downloaded DOI: 10.30574/wjaets.2024.11.1.0054 PDF to: downloads\Algorithmic_Trading_and_AI__A_Review_of_Strategies_and_Market_Impact_10.30574_wjaets.2024.11.1.0054.pdf


In [6]:
%%script false


def _print_rate_limit_details(response):
    """Print every header of a 429 response, then the Retry-After value if one is set."""
    print("Rate limit exceeded!", flush=True)

    # --- Print ALL headers ---
    print("Response Headers:")
    for header_name, header_value in response.headers.items():
        print(f"  {header_name}: {header_value}")

    retry_after = response.headers.get("Retry-After")
    if retry_after:
        print(f"Retry-After: {retry_after} seconds", flush=True)


async def hit_rate_limit_no_key():
    """
    Repeatedly request one Semantic Scholar paper until the API answers 429.

    Requests are sent without an API key, 0.1 s apart, and each status code is
    printed. The loop stops on the first 429 (after printing the response
    headers) or on the first network-level error; other HTTP error statuses
    are printed and the loop keeps going.
    """
    paper_id = "0f8f25af11d027bb6602639a4dc345c67af996c0"  # Replace with a paper ID
    url = f"https://api.semanticscholar.org/graph/v1/paper/{paper_id}?fields=title"

    async with httpx.AsyncClient() as client:
        attempt = 0
        while True:
            attempt += 1
            try:
                response = await client.get(url)
                print(f"Request: {attempt}, Status Code: {response.status_code}")
                response.raise_for_status()  # Raise HTTPStatusError for bad responses (4xx or 5xx)
            except httpx.HTTPStatusError as status_error:
                failed_response = status_error.response
                print(f"Request: {attempt}, Status Code: {failed_response.status_code}", flush=True)
                if failed_response.status_code == 429:
                    _print_rate_limit_details(failed_response)
                    break  # Rate limit reached: stop probing
            except httpx.RequestError as request_error:
                print(f"Request {attempt} failed: {request_error}", flush=True)
                break  # Exit if network error
            await asyncio.sleep(0.1)


# --- In your Jupyter notebook or interactive session ---
# Schedule the coroutine on the loop that is already running instead of starting a
# new one; the task then runs in the background while the cell returns.
loop = asyncio.get_running_loop()  # Get the *existing* loop (raises RuntimeError if none is running)
task = loop.create_task(hit_rate_limit_no_key())  # asyncio.create_task(...) is equivalent here (both need Python 3.7+)
# You can optionally await the task if you need to wait for it to finish:
# await task

Couldn't find program: 'false'
