In [2]:
import base64
import json
import logging
import time
import os
import re
from io import BytesIO
from typing import List
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse

from selenium import webdriver
from selenium.webdriver.chrome.service import Service as ChromeService
from selenium.webdriver.chrome.options import Options as ChromeOptions
from webdriver_manager.chrome import ChromeDriverManager

class PdfGenerator:

    driver = None
    print_options = {
        'landscape': False,
        'displayHeaderFooter': False,
        'printBackground': True,
        'preferCSSPageSize': True,
        'paperWidth': 7.7,
        'paperHeight': 25,
    }

    def __init__(self, urls: List[str], output_folder="Pragins"):
        self.urls = self.extract_urls_from_url(urls)
        self.output_folder = output_folder

    def _get_pdf_from_url(self, url, *args, **kwargs):
        self.driver.get(url)
        time.sleep(0.3)
        print_options = self.print_options.copy()
        result = self._send_devtools(self.driver, "Page.printToPDF", print_options)
        return base64.b64decode(result['data'])

    @staticmethod
    def _send_devtools(driver, cmd, params):
        resource = "/session/%s/chromium/send_command_and_get_result" % driver.session_id
        url = driver.command_executor._url + resource
        body = json.dumps({'cmd': cmd, 'params': params})
        response = driver.command_executor._request('POST', url, body)
        return response.get('value')

    def _generate_pdfs(self):
        pdf_files = []
    
        for url in self.urls:
            result = self._get_pdf_from_url(url)
            pdf_files.append(result)
        return pdf_files

    def _sanitize_filename(self, url):
    # Extract the last part of the URL and remove invalid characters
        filename = os.path.basename(url)
        sanitized_filename = re.sub(r'[\/:*?"<>|]', '_', filename)
        return sanitized_filename

    def extract_urls_from_url(self, start_urls
        visited_urls = set()
        queue = [start_url]
        queued_urls = set([start_url])

        while queue:
            current_url = queue.pop(0)
            if current_url not in visited_urls:
                try:
                    response = requests.get(current_url)
                    if response.status_code == 200:
                        visited_urls.add(current_url)
                        soup = BeautifulSoup(response.content, "html.parser", from_encoding='utf-8')
                        links = soup.find_all("a", href=True)
                        for link in links:
                            absolute_url = urljoin(current_url, link["href"])
                            parsed_url = urlparse(absolute_url)
                            if parsed_url.scheme in ['http', 'https'] and \
                               'mailto:' not in absolute_url and \
                               'tel:' not in absolute_url and \
                               absolute_url.startswith(start_url):
                                if absolute_url not in visited_urls and \
                                   absolute_url not in queued_urls:
                                    queue.append(absolute_url)
                                    queued_urls.add(absolute_url)
                except Exception as e:
                    print(f"An error occurred: {e}")

        return visited_urls

    def main(self):
        webdriver_options = ChromeOptions()
        webdriver_options.add_argument('--headless')
        webdriver_options.add_argument('--disable-gpu')
    
        try:
            self.driver = webdriver.Chrome(
                service=ChromeService(ChromeDriverManager().install()),
                options=webdriver_options
            )
            pdf_files = self._generate_pdfs()
        except Exception as e:
            logging.error(f"An error occurred: {str(e)}")
            pdf_files = []

        if not os.path.exists(self.output_folder):
            os.makedirs(self.output_folder)

        for url, pdf_data in zip(self.urls, pdf_files):
            sanitized_filename = self._sanitize_filename(url)
            file_name = os.path.join(self.output_folder, sanitized_filename + ".pdf")
            with open(file_name, "wb") as outfile:
                outfile.write(pdf_data)

        print("PDFs saved to the 'calliopee' folder.")
        return

pdf_generator = PdfGenerator('https://www.epnyonjuraprangins.ch/')


pdf_generator.main()

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHA

PDFs saved to the 'calliopee' folder.
