# Automatic web scraping bot

## This bot is used to find pdfs of linear equations for training of an OCR used to read formulas. In the future other topics will be added(quadratic equations, integrals, derivatives, and more).

Here we download all the necessary libraries. For this bot we will combine beautifulsoup and selenium. We must do this because google scholar detects automatic web scraping and blocks the bots. So we can't only use beautifulsoup. Beautilsoup only scrapes the HTML and ignores the javascript, so first we will use selenium to read the javascript and then beautifulsoup to scrape the HTML.

In [8]:
import os  # Provides functions for interacting with the operating system (e.g., file and folder manipulation).
import re  # Provides support for regular expressions to handle pattern matching, like searching for ".pdf".
import time  # Used for adding time delays, making the script wait for a specified period.
import random  # Generates random numbers, which is used to create random sleep times to mimic human behavior.
import requests  # Allows the script to send HTTP requests to retrieve content from websites.
from selenium import webdriver  # Provides tools for controlling web browsers programmatically.
from selenium.webdriver.chrome.service import Service  # Allows setting up the Chrome WebDriver service.
from selenium.webdriver.chrome.options import Options  # Enables configuration of Chrome's options, like running headless.
from selenium.webdriver.common.by import By  # Provides methods for locating elements on a webpage (e.g., by class name, XPath).
from webdriver_manager.chrome import ChromeDriverManager  # Automatically manages the installation of the appropriate ChromeDriver version.
from bs4 import BeautifulSoup  # A library for parsing HTML and XML documents, used to extract information from web pages.
from urllib.parse import urljoin  # A function to handle relative URLs, ensuring that links are converted to absolute URLs.

Here we select the folder for saving the pdfs.

In [9]:
# Folder to save PDF files
SAVE_FOLDER = "/Users/donpedrodado/Documents/opt/Duck_PA/math_training_sets"

Sets up Selenium WebDriver with headless Chrome.

In [10]:
def setup_driver():
    # Initialize the Chrome options object
    chrome_options = Options()
    
    # Run without opening the browser window (headless mode)
    chrome_options.add_argument("--headless")
    
    # Prevent detection of Selenium as an automated tool
    chrome_options.add_argument("--disable-blink-features=AutomationControlled")  
    
    # Additional options for better performance in a headless environment
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-dev-shm-usage")
    
    # Initialize the Chrome driver with the specified options
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=chrome_options)
    
    return driver

Downloads a PDF file from a given URL.

In [11]:
def download_pdf(url):
    """Downloads a PDF from the given URL."""
    try:
        # Send an HTTP GET request to the URL with a 10-second timeout
        response = requests.get(url, stream=True, timeout=10)
        
        # Raise an exception if the response status code indicates an error (non-2xx)
        response.raise_for_status()

        # Extract the filename from the URL (last part after the last "/")
        filename = url.split("/")[-1]

        # Create the full file path where the PDF will be saved
        filepath = os.path.join(SAVE_FOLDER, filename)

        # Open the file in write-binary mode and download the PDF in chunks
        with open(filepath, "wb") as file:
            for chunk in response.iter_content(chunk_size=8192):  # Download in chunks of 8 KB
                file.write(chunk)

        # Print a success message with the filename
        print(f"✅ Downloaded: {filename}")

    except Exception as e:
        # If an error occurs (like connection failure or invalid URL), print an error message
        print(f"❌ Failed to download {url}: {e}")

Extract PDF links from an external page.

In [12]:
def extract_pdfs_from_page(url):
    try:
        # Make an HTTP GET request to the provided URL
        response = requests.get(url, timeout=10)
        
        # Parse the page content using BeautifulSoup
        soup = BeautifulSoup(response.text, "html.parser")

        # Initialize an empty list to hold the PDF links
        pdf_links = []

        # Loop through all anchor tags (<a>) that have an href attribute (i.e., links)
        for link in soup.find_all("a", href=True):
            href = link["href"]  # Get the href attribute (the URL)

            # Handle relative URLs by joining them with the base URL
            full_url = urljoin(url, href)  # This resolves relative URLs to absolute ones

            # If the link ends with '.pdf', it's likely a direct link to a PDF file
            if full_url.endswith(".pdf"):  # Check if it's a PDF link
                pdf_links.append(full_url)  # Add the PDF link to the list

            #TODO: fix parsing of HTML with beatifulsoup
        # Return the list of PDF links found on this page
        return pdf_links
    except Exception as e:
        # If an error occurs (e.g., network issues), print an error message
        print(f"❌ Error extracting PDFs from {url}: {e}")
        return []  # Return an empty list if an error occurs

Search Google Scholar and extract PDFs from external sites.

In [13]:
def find_pdfs(num_results=5):
    """Search Google Scholar and extract PDFs from external sites."""
    
    # Set up the Selenium WebDriver to interact with the Google Scholar page
    driver = setup_driver()

    # Set the search URL for Google Scholar with a search term for "linear equations"
    search_url = f"https://scholar.google.com/scholar?q=linear+equations"
    driver.get(search_url)  # Open the search results page in the browser
    
    time.sleep(random.uniform(3, 6))  # Sleep for a random amount of time (3-6 seconds) to mimic human behavior

    # Initialize a list to hold external links (to PDFs or articles)
    external_links = []
    
    # Parse the page content with BeautifulSoup
    soup = BeautifulSoup(driver.page_source, "html.parser")
    
    # Iterate through all the anchor tags (<a>) with href attributes
    for result in soup.find_all("a", href=True):
        link = result["href"]  # Get the href attribute (URL) of the link
        
        # Handle relative URLs by ensuring they are absolute URLs
        if link.startswith("//"):
            link = "https:" + link  # Prepend https: if the URL starts with //

        # Check if the link contains "[PDF]" in the text or ends with ".pdf"
        if "[PDF]" in result.text or re.search(r".*\.pdf$", link):
            external_links.append(link)  # If it’s a PDF link, add it to the external_links list

    # If no direct PDF links are found, look for article links instead
    if not external_links:
        for link in soup.find_all("a", href=True):  # Iterate over all the links again
            external_links.append(link["href"])  # Collect all other external article links

    driver.quit()  # Close the browser driver

    # Now, visit those external links and extract any PDFs they might contain
    pdf_links = []  # Initialize a list to store the final PDF links
    for ext_link in external_links[:num_results]:  # Loop through the first `num_results` external links
        pdf_links.extend(extract_pdfs_from_page(ext_link))  # Extract PDFs from the external page

    return pdf_links[:num_results]  # Return the first `num_results` PDF links

Main function to execute the script.

In [14]:
def main():
    # Check if the save folder exists, and create it if it doesn't
    if not os.path.exists(SAVE_FOLDER):
        os.makedirs(SAVE_FOLDER)

    # Call find_pdfs() to search for PDF links (returns a list of PDFs)
    pdf_links = find_pdfs()

    # If PDF links were found, download each PDF
    if pdf_links:
        for pdf in pdf_links:
            download_pdf(pdf)
    else:
        # If no PDF links were found, print a message
        print("❌ No PDF links found.")

In [15]:
if __name__ == "__main__":
    # If the script is being run directly (not imported), call the main function
    main()

❌ Error extracting PDFs from https://annals.math.princeton.edu/wp-content/uploads/annals-v171-n3-p08-p.pdf: The markup you provided was rejected by the parser. Trying a different parser or a different encoding may help.

Original exception(s) from parser:
 AssertionError: expected name token at "<![�$�iu�!\x08R'�#z���n"
❌ No PDF links found.
