# PDF Web Scraper Notebook


This notebook provides a script for crawling a website to find and download PDF files. The script utilizes asynchronous HTTP requests for efficient web scraping and file downloading.

The script performs the following tasks:
1. **Crawls a specified website** to find all PDF links.
2. **Downloads the PDF files** and saves them to a local directory (`data/pdf_files/`).
3. **Logs the progress and errors** encountered during the process.


In [None]:
import asyncio
import logging
import os
from urllib.parse import urldefrag, urljoin, urlsplit

import aiohttp
from bs4 import BeautifulSoup

# Directory to save downloaded PDF files
DOWNLOAD_DIR = 'data/pdf_files'
# Create the directory if it does not exist
os.makedirs(DOWNLOAD_DIR, exist_ok=True)

# Starting URL for crawling
START_URL = 'https://www.telekom.de/hilfe'

# Set to keep track of downloaded PDF filenames
downloaded_files = set()

# Configure logger settings for better traceability
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Function to find PDF links on a given URL
async def find_pdfs(url, session):
    """
    Fetches the content of a page and finds all PDF links.
    
    Args:
        url (str): The URL of the page to fetch.
        session (aiohttp.ClientSession): The aiohttp session used for making HTTP requests.
        
    Returns:
        list: A list of URLs pointing to PDF files.
    """
    pdf_links = []
    try:
        async with session.get(url) as response:
            # Only process if the response is HTML
            if 'text/html' in response.headers.get('Content-Type', '').lower():
                soup = BeautifulSoup(await response.text(), 'html.parser')
                # Extract all links ending with .pdf
                pdf_links = [
                    urljoin(url, a['href'])
                    for a in soup.find_all('a', href=True)
                    if a['href'].lower().endswith('.pdf')
                ]
            else:
                logger.error(f'{url} is not an HTML page.')
    except Exception as e:
        logger.error(f'Failed to fetch {url}. Error: {str(e)}')
    
    return pdf_links

# Function to download a PDF file
async def download_pdf(url, session):
    """
    Downloads a PDF file from the given URL and saves it to the local directory.

    The file name is the last segment of the URL *path*, so a query string or
    fragment never becomes part of the name on disk. Files are de-duplicated
    by that name: a name that was already saved during this run is skipped.
    All failures are logged rather than raised.

    Args:
        url (str): The URL of the PDF file to download.
        session (aiohttp.ClientSession): The aiohttp session used for making HTTP requests.
    """
    try:
        filename = urlsplit(url).path.rsplit('/', 1)[-1]
    except ValueError as e:
        # urlsplit rejects some malformed URLs (e.g. a broken IPv6 host)
        logger.error(f'Failed to download {url}. Error: {str(e)}')
        return

    # A URL whose path ends in '/' has no file name to save under
    if not filename:
        logger.error(f'Failed to download {url}. Error: URL has no file name')
        return

    # Skip download if file has already been downloaded
    if filename in downloaded_files:
        logger.info(f'Already downloaded: {filename}')
        return

    try:
        async with session.get(url) as response:
            # Only process if the response status is OK
            if response.status == 200:
                # Read the whole body before opening the file so that a failed
                # transfer does not leave an empty file behind
                data = await response.read()
                file_path = os.path.join(DOWNLOAD_DIR, filename)
                # Save the PDF file to disk
                with open(file_path, 'wb') as f:
                    f.write(data)
                downloaded_files.add(filename)
                logger.info(f'Downloaded: {file_path}')
            else:
                logger.error(f'Failed to download {url}. Status code: {response.status}')
    except Exception as e:
        logger.error(f'Failed to download {url}. Error: {str(e)}')

# Function to crawl the website and find PDF links
async def crawl_site(start_url):
    """
    Crawls the website starting from the given URL, finds PDF links, and downloads them.

    Pages are visited one at a time. For each page the PDF links are
    downloaded first, then the page's root-relative links (``href`` starting
    with ``/``) are queued, subject to these rules:

    * the resolved link must be on the same host as ``start_url``; this
      rejects protocol-relative links such as ``//other-host/page``, which
      also start with ``/``;
    * the ``#fragment`` is removed, so ``/page`` and ``/page#top`` count as
      one page;
    * links whose path ends in ``.pdf`` are not queued as pages.

    Args:
        start_url (str): The starting URL for the crawl.
    """
    start_host = urlsplit(start_url).netloc.lower()
    urls_to_visit = {start_url}
    visited_urls = set()

    async with aiohttp.ClientSession() as session:
        while urls_to_visit:
            url = urls_to_visit.pop()
            # Skip URLs that have already been visited
            if url in visited_urls:
                continue
            visited_urls.add(url)
            logger.info(f'Crawling: {url}')

            # Find and download PDF links on the current page
            pdf_links = await find_pdfs(url, session)
            for link in pdf_links:
                await download_pdf(link, session)

            # NOTE: the page is requested a second time here because
            # find_pdfs returns only the PDF links, not the page body.
            try:
                async with session.get(url) as response:
                    content_type = response.headers.get('Content-Type', '').lower()
                    # Only parse successful HTML responses; decoding a binary
                    # body as text would be wasted work or raise
                    if response.status != 200 or 'text/html' not in content_type:
                        continue
                    soup = BeautifulSoup(await response.text(), 'html.parser')
                    for a in soup.find_all('a', href=True):
                        href = a['href']
                        if not href.startswith('/'):
                            continue
                        target = urldefrag(urljoin(url, href)).url
                        parts = urlsplit(target)
                        # Stay on the starting host
                        if parts.netloc.lower() != start_host:
                            continue
                        # PDF documents are not pages to crawl
                        if parts.path.lower().endswith('.pdf'):
                            continue
                        if target not in visited_urls:
                            urls_to_visit.add(target)
            except Exception as e:
                logger.error(f'Failed to crawl {url}. Error: {str(e)}')

# Top-level coroutine of the script
async def main():
    """
    Starts the crawl at the configured START_URL and waits until it has finished.
    """
    crawl = crawl_site(START_URL)
    await crawl

# Strong references to tasks scheduled on an already-running loop. The event
# loop itself only keeps weak references, so an otherwise unreferenced task
# could be garbage-collected before it finishes.
_background_tasks = set()

# Function to run an asynchronous coroutine
def run_async(coro):
    """
    Runs an asynchronous coroutine, whether or not an event loop is already running.

    * No loop running in this thread (plain script): the coroutine is run to
      completion with ``asyncio.run`` and its result is returned.
    * A loop is already running (e.g. inside a Jupyter notebook): the
      coroutine is scheduled on that loop and the resulting task is returned
      immediately, so the caller can ``await`` it or inspect it later.

    Args:
        coro (coroutine): The coroutine to run.

    Returns:
        The coroutine's result when it was run to completion, otherwise the
        scheduled task/future.
    """
    try:
        asyncio.get_running_loop()
    except RuntimeError:
        # get_running_loop raises RuntimeError when no loop is running
        return asyncio.run(coro)

    # If the event loop is already running, schedule the coroutine on it
    task = asyncio.ensure_future(coro)
    _background_tasks.add(task)
    # Release the reference once the task is done
    task.add_done_callback(_background_tasks.discard)
    return task

# Start the crawl when this code is executed directly
if __name__ == '__main__':
    entry_coro = main()
    run_async(entry_coro)


2024-08-26 20:53:27,011 - INFO - Crawling: https://www.telekom.de/hilfe
2024-08-26 20:53:28,117 - INFO - Downloaded: data/pdf_files/051.pdf
2024-08-26 20:53:28,181 - INFO - Downloaded: data/pdf_files/000.pdf
2024-08-26 20:53:28,723 - INFO - Crawling: https://www.telekom.de/magenta-tv/inhalte/disney-plus
2024-08-26 20:53:28,811 - INFO - Already downloaded: 051.pdf
2024-08-26 20:53:28,811 - INFO - Already downloaded: 000.pdf
2024-08-26 20:53:28,892 - INFO - Crawling: https://www.telekom.de/telekom-shops?wt_mc=ii_sososoxx_navi-service-kontakt-telekom-shops
2024-08-26 20:53:29,164 - INFO - Already downloaded: 051.pdf
2024-08-26 20:53:29,164 - INFO - Already downloaded: 000.pdf
2024-08-26 20:53:29,450 - INFO - Crawling: https://www.telekom.de/unterwegs/vertragsverlaengerung?wt_mc=ii_sososoxx_navi-mobilfunk-ich-moechte-tarif-wechseln
2024-08-26 20:53:29,645 - INFO - Already downloaded: 051.pdf
2024-08-26 20:53:29,646 - INFO - Already downloaded: 000.pdf
2024-08-26 20:53:29,808 - INFO - Crawl