In [2]:
import asyncio
import logging
import os
import re
import sys
from typing import Optional

import httpx
from parsel import Selector

# Set up logging
logger = logging.getLogger(__name__)

logging.basicConfig(
    # format="%(asctime)s | %(levelname)s | %(filename)s:%(lineno)d | %(funcName)s | %(message)s",
    # level=logging.DEBUG,
    stream=sys.stdout,
)


In [None]:
async def scrape_pdf_link(paper_url: str, doi: str) -> Optional[str]:
    """
    Extracts a direct PDF link by scraping the final article webpage.

    Args:
        paper_url: The initial article URL (could be a DOI link).

    Returns:
        The direct PDF URL if found, otherwise None.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
        "Referer": "https://scholar.google.com",  # Some sites require a referrer
    }

    try:
        # Get final redirected URL (important for DOI links)
        async with httpx.AsyncClient(timeout=20, follow_redirects=True) as client:
            response = await client.get(paper_url, headers=headers)
            response.raise_for_status()
            logger.info(f"Final URL after redirect: {response.url}")

        final_url = str(response.url)

        # async with httpx.AsyncClient(timeout=20, follow_redirects=True) as client:
        #     response = await client.get(final_url, headers=headers)
        #     response.raise_for_status()

        selector = Selector(text=response.text)
        pdf_links = selector.css("a::attr(href)").getall()

        doi_last_3 = doi[-3:] if len(doi) >= 3 else doi

        PDF_PATTERNS = [
            ".pdf",  # Standard PDFs
            "/pdf",  # PDF paths without extensions
            "pdf/",  # PDF paths without extensions
            "download",  # Download URLs
            "fulltext",  # Full text links
            "article",  # Article-specific PDF links
            "viewer",  # Embedded viewer PDFs
            "content/pdf",  # Used by Springer, Wiley, etc.
        ]

        for link in pdf_links:
            if any(pattern in link.lower() for pattern in PDF_PATTERNS) and doi_last_3 in link.lower():
                full_pdf_link = httpx.URL(final_url).join(link)
                logger.info(f"Found PDF link: {full_pdf_link}")
                return str(full_pdf_link)

        logger.warning("No PDF link found on the page.")
        return None

    except httpx.RequestError as e:
        logger.error(f"Error fetching page {paper_url}: {e}")
        return None


In [32]:
async def download_paper(doi: str, title: str, output_dir: str = "downloads") -> Optional[str]:
    """
    Downloads a paper PDF given its DOI and title.
    If Unpaywall fails, it scrapes the article page to find the PDF.

    Args:
        doi: The DOI of the paper.
        title: The title of the paper (for the filename).
        output_dir: The directory to save the downloaded PDF.

    Returns:
        The file path of the downloaded PDF if successful, otherwise None.
    """
    try:
        os.makedirs(output_dir, exist_ok=True)

        # Sanitize title for a safe filename
        # safe_title = "".join(c if c.isalnum() or c in "._-" else "_" for c in title)
        safe_title = re.sub(r"[^\w\-_\.]", "_", title)
        file_name = f"{safe_title}_{doi.replace('/', '_')}.pdf"
        file_path = os.path.join(output_dir, file_name)

        # Check if file already exists
        if os.path.exists(f"{file_path}"):
            logger.info(f"Skipping download. PDF for DOI: {doi} already exists at {file_path}.")
            return file_path

        pdf_url = f"https://doi.org/{doi}"
        pdf_link = await scrape_pdf_link(pdf_url, doi)

        if not pdf_link:
            logger.error(f"Could not find a PDF link for DOI: {doi}")
            return None

        # Download the PDF
        async with httpx.AsyncClient() as client:
            try:
                headers = {
                    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36",
                    "Accept": "application/pdf,text/html,*/*",
                    "Referer": "https://scholar.google.com",  # Some sites require a referrer
                }

                response = await client.get(pdf_link, headers=headers, follow_redirects=True, timeout=30)
                response.raise_for_status()

                content_type = response.headers.get("Content-Type", "")
                logger.info(f"Content-Type received: {content_type}")

                if "pdf" not in content_type.lower():
                    logger.error("The downloaded file is not a PDF!")
                    return None

                with open(file_path, "wb") as f:
                    f.write(response.content)

                logger.info(f"Downloaded PDF for DOI: {doi} to {file_path}")
                return file_path

            except httpx.HTTPStatusError as e:
                logger.error(f"HTTP error downloading PDF for DOI {doi}: {e.response.status_code}")
                return None
            except httpx.RequestError as e:
                logger.error(f"Request error downloading PDF for DOI {doi}: {e}")
                return None

    except Exception as e:
        logger.exception(f"General error downloading PDF for DOI {doi}: {e}")
        return None


In [1]:
async def download_papers_parallel(results, limit=5, download=False, output_dir="downloads"):
    """
    Displays search results and optionally downloads PDFs in parallel.

    Args:
        results: The search results dictionary.
        limit: The number of top results to display.
        download: If True, download PDFs for the displayed results.
    """
    print(f"\n  Search Results (Top {limit}):")
    doi_title_pairs_for_download = []

    for item in results.get("data", [])[:limit]:
        if download:
            if item.get("externalIds") and item["externalIds"].get("DOI"):
                doi = item["externalIds"]["DOI"]
                title = item.get("title", "Unknown_Title")
                doi_title_pairs_for_download.append((doi, title))  # Collect DOI and title for parallel download
                print(f"      Downloading DOI {doi}")  # Indicate download is initiated

    print()

    downloaded_files = []
    if download and doi_title_pairs_for_download:
        download_tasks = [download_paper(doi, title, output_dir) for doi, title in doi_title_pairs_for_download]
        downloaded_files = await asyncio.gather(*download_tasks)

        successful_downloads = 0
        failed_downloads = 0
        for file_path in downloaded_files:
            if file_path:
                successful_downloads += 1
            else:
                failed_downloads += 1

        logger.info("--- Parallel Download Statistics ---")
        logger.info(f"Total papers attempted: {len(doi_title_pairs_for_download)}")
        logger.info(f"Successfully downloaded: {successful_downloads}")
        logger.info(f"Failed downloads: {failed_downloads}")
        logger.info("-----------------------------------\n")

        for i, item in enumerate(results.get("data", [])[:limit]):
            if item.get("externalIds") and item["externalIds"].get("DOI"):
                doi = item["externalIds"]["DOI"]
                file_path = downloaded_files[i]  # Get corresponding result
                if file_path:
                    print(f"      Downloaded DOI: {doi} PDF to: {file_path}")
                else:
                    print(f"      Failed to download PDF for DOI: {doi}")
    return downloaded_files


In [34]:
async def main():
    dois_titles = [
        (
            "10.3390/math13030442",
            "Sustainability, Accuracy, Fairness, and Explainability (SAFE) Machine Learning in Quantitative Trading",
        ),
        (
            "10.30574/ijsra.2024.11.1.0292",
            "Machine learning in financial markets: A critical review of algorithmic trading and risk management",
        ),
        ("10.30574/wjaets.2024.11.1.0054", "Algorithmic Trading and AI: A Review of Strategies and Market Impact"),
    ]

    # Convert the list of tuples into the desired dictionary structure
    results_data = []
    for doi, title in dois_titles:
        results_data.append({"externalIds": {"DOI": doi}, "title": title})

    # Now, example_results_data is in the format that resembles results.get("data", [])

    # To use it with your `display_results_and_download` function, you would structure your results like this:
    results_object = {"data": results_data}

    file_path = await download_papers_parallel(results_object, limit=len(results_data), download=True)

    if file_path:
        logger.info(f"PDF downloaded: {file_path}")
    else:
        logger.error("Failed to download PDF.")


await main()


  Search Results (Top 3):
      Downloading DOI 10.3390/math13030442
      Downloading DOI 10.30574/ijsra.2024.11.1.0292
      Downloading DOI 10.30574/wjaets.2024.11.1.0054

2025-02-13 18:59:03,332 | INFO | 3446133886.py:25 | download_paper | Skipping download. PDF for DOI: 10.3390/math13030442 already exists at downloads\Sustainability__Accuracy__Fairness__and_Explainability__SAFE__Machine_Learning_in_Quantitative_Trading_10.3390_math13030442.pdf.
2025-02-13 18:59:03,333 | INFO | 3446133886.py:25 | download_paper | Skipping download. PDF for DOI: 10.30574/ijsra.2024.11.1.0292 already exists at downloads\Machine_learning_in_financial_markets__A_critical_review_of_algorithmic_trading_and_risk_management_10.30574_ijsra.2024.11.1.0292.pdf.
2025-02-13 18:59:03,333 | INFO | 3446133886.py:25 | download_paper | Skipping download. PDF for DOI: 10.30574/wjaets.2024.11.1.0054 already exists at downloads\Algorithmic_Trading_and_AI__A_Review_of_Strategies_and_Market_Impact_10.30574_wjaets.2024.1

In [None]:
async def hit_rate_limit_no_key():
    # ... (your coroutine code) ...
    paper_id = "0f8f25af11d027bb6602639a4dc345c67af996c0"  # Replace with a paper ID
    url = f"https://api.semanticscholar.org/graph/v1/paper/{paper_id}?fields=title"

    async with httpx.AsyncClient() as client:
        i = 0
        while True:
            i += 1
            try:
                response = await client.get(url)
                print(f"Request: {i}, Status Code: {response.status_code}")
                response.raise_for_status()  # Raise HTTPStatusError for bad responses (4xx or 5xx)
            except httpx.HTTPStatusError as e:
                print(f"Request: {i}, Status Code: {e.response.status_code}", flush=True)
                if e.response.status_code == 429:
                    print("Rate limit exceeded!", flush=True)

                    # --- Print ALL headers ---
                    print("Response Headers:")
                    for name, value in e.response.headers.items():
                        print(f"  {name}: {value}")

                    retry_after = e.response.headers.get("Retry-After")
                    if retry_after:
                        print(f"Retry-After: {retry_after} seconds", flush=True)
                    break  # Exit the loop
            except httpx.RequestError as e:
                print(f"Request {i} failed: {e}", flush=True)
                break  # Exit if network error
            await asyncio.sleep(0.1)


# --- In your Jupyter notebook or interactive session ---
loop = asyncio.get_running_loop()  # Get the *existing* loop
task = loop.create_task(hit_rate_limit_no_key())  # Or asyncio.create_task if you have python 3.7+
# You can optionally await the task if you need to wait for it to finish:
# await task


Request: 1, Status Code: 200
Request: 2, Status Code: 200
Request: 3, Status Code: 200
Request: 4, Status Code: 200
Request: 5, Status Code: 429
Request: 5, Status Code: 429
Rate limit exceeded!
Response Headers:
  content-type: application/json
  content-length: 174
  connection: keep-alive
  date: Fri, 14 Feb 2025 16:31:23 GMT
  x-amz-apigw-id: F-5qaHezvHcEdwg=
  x-amzn-requestid: 55f58e7c-a53b-4f0d-9bcc-f06dfa089e16
  x-amzn-errortype: TooManyRequestsException
  x-cache: Error from cloudfront
  via: 1.1 9a9e1d634ed04ebc88e3edf6c14456fe.cloudfront.net (CloudFront)
  x-amz-cf-pop: JNB50-C1
  x-amz-cf-id: 0SeBs6fKZDyROtgzLIRo3hDQT79c3loDWHIMWXwCkaWxTsSES-xw0w==
