In [6]:
import os
import re
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

# Function to sanitize the directory name by removing illegal characters for Windows file systems
def sanitize_directory_name(name):
    """Return ``name`` with text that Windows rejects in a path component removed.

    Removes the reserved characters (backslash, /, :, *, ?, double quote, <, >, |)
    and ASCII control characters (0x00-0x1F), then strips trailing dots and
    spaces, which Windows does not permit at the end of a directory name.

    Args:
        name: Raw text to turn into a directory name (e.g. an author or a title).

    Returns:
        The cleaned string. It can be empty when ``name`` consisted solely of
        removed characters.
    """
    sanitized_name = re.sub(r'[\\/*?:"<>|\x00-\x1f]', "", name)
    # A trailing "." or " " is dropped by the Windows API, so the directory on
    # disk would not match the name used to build later paths; strip it here.
    return sanitized_name.rstrip(". ")

# Function to create a directory if it doesn't exist
def create_directory(path):
    """Create ``path`` (including missing parents) unless something already exists there.

    Prints one line saying whether the directory was created or was already
    present. Returns nothing.
    """
    already_present = os.path.exists(path)
    if already_present:
        print(f"Directory already exists: {path}")
        return

    os.makedirs(path)
    print(f"Directory created: {path}")

# Function to download a file from a given URL
def download_file(url, save_path, timeout=30):
    """Download ``url`` and write the response body to ``save_path``.

    A failed download (network error, timeout, or non-200 status) is reported
    with a printed message instead of raising, so one bad link does not abort
    a long crawl.

    Args:
        url: Address of the file to fetch.
        save_path: Destination file path; opened in binary write mode, so an
            existing file is overwritten.
        timeout: Seconds to wait on the server before giving up. Without a
            timeout ``requests`` can block indefinitely on a stalled connection.

    Returns:
        None.
    """
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as exc:
        print(f"Failed to download {url}: {exc}")
        return

    if response.status_code == 200:
        with open(save_path, 'wb') as file:
            file.write(response.content)
        print(f"Downloaded: {save_path}")
    else:
        print(f"Failed to download {url}, status code: {response.status_code}")

# Main function to scrape the webpage and download XML files
def scrape_and_download(base_url, skip_link=None, timeout=30):
    """Crawl a category page and download every ``.xml`` file linked from its ebooks.

    For each ``<h2>`` containing a link (treated as an author heading), a
    directory named after the author is created under the current working
    directory. Each ebook linked from the ``<ul>`` that follows the heading is
    fetched, a sub-directory named after its ``<h1>`` title is created, and
    every link ending in ``.xml`` on that page is downloaded into it.

    Args:
        base_url: URL of the category page to crawl. Links found on the page
            are resolved relative to it.
        skip_link: Optional resume marker. Ebooks are skipped until one whose
            final URL path segment equals the final segment of ``skip_link``
            (so ``6871``, ``"6871"``, ``"/ebooks/6871"`` and the full ebook URL
            all select the same ebook). Processing resumes with that ebook.
            Author directories are still created while skipping.
        timeout: Seconds to wait for each page fetch made by this function.

    Returns:
        None.
    """
    # Compare whole path segments: a substring test would let "6871" resume
    # early on ".../ebooks/16871".
    skip_id = _last_path_segment(skip_link) if skip_link else ""
    skip_mode = bool(skip_id)

    # Fetch the initial page content (category page)
    soup = _fetch_soup(base_url, timeout)
    if soup is None:
        return

    # Step 1: Find all authors (h2 tags with corresponding <a> tags)
    for h2_tag in soup.find_all('h2'):
        author_link = h2_tag.find('a', href=True)
        if not author_link:
            continue

        # Create a directory for the author
        author_name = sanitize_directory_name(author_link.text.strip())
        author_dir = os.path.join(os.getcwd(), author_name)
        create_directory(author_dir)

        # Step 2: Find all ebook links for this author (within the next <ul>)
        next_ul = h2_tag.find_next_sibling('ul')
        if not next_ul:
            continue

        for li_tag in next_ul.find_all('li'):
            ebook_link = li_tag.find('a', href=True)
            if not ebook_link:
                continue

            # urljoin copes with absolute and page-relative hrefs alike.
            ebook_url = urljoin(base_url, ebook_link['href'])

            # Skip mode for ebooks (if skip_link was specified)
            if skip_mode:
                if _last_path_segment(ebook_url) == skip_id:
                    print(f"Resuming from {ebook_url}...")
                    skip_mode = False
                else:
                    print(f"Skipping ebook: {ebook_url}")
                    continue

            _download_ebook_xml(ebook_url, author_dir, timeout)

    if skip_mode:
        # Still skipping at the end means the marker never matched any ebook.
        print(f"skip_link {skip_link!r} was never found; nothing was downloaded")


def _last_path_segment(link):
    """Return the text after the final '/' in ``link``, ignoring trailing slashes."""
    return str(link).rstrip('/').rsplit('/', 1)[-1]


def _fetch_soup(url, timeout):
    """Fetch ``url`` and parse it as HTML; print a message and return None on failure."""
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as exc:
        print(f"Failed to fetch {url}: {exc}")
        return None
    if response.status_code != 200:
        print(f"Failed to fetch {url}")
        return None
    return BeautifulSoup(response.content, 'html.parser')


def _download_ebook_xml(ebook_url, author_dir, timeout):
    """Create the ebook's directory under ``author_dir`` and download its .xml links."""
    ebook_soup = _fetch_soup(ebook_url, timeout)
    if ebook_soup is None:
        return

    # Step 3: Get the ebook title from <h1> tag
    ebook_title_tag = ebook_soup.find('h1')
    if not ebook_title_tag:
        return
    ebook_title = sanitize_directory_name(ebook_title_tag.text.strip())

    # Create a directory for the ebook under the author directory
    ebook_dir = os.path.join(author_dir, ebook_title)
    create_directory(ebook_dir)

    # Step 4: Find and download the XML file (if available)
    for xml_link in ebook_soup.find_all('a', href=True):
        href = xml_link['href']
        if href.endswith('.xml'):
            xml_url = urljoin(ebook_url, href)
            xml_save_path = os.path.join(ebook_dir, os.path.basename(href))
            download_file(xml_url, xml_save_path)


In [3]:
# Scratch check: a non-zero int is truthy, which is how the bool(skip_link)
# test in scrape_and_download would treat a numeric skip_link.
bool(4949)

True

In [7]:

# Example usage: crawl one gutenberg.org category page, skipping ebooks until
# the link matching "6871" is reached.
base_url = 'https://www.gutenberg.org/browse/categories/4'  # category listing page to crawl
scrape_and_download(base_url, skip_link="6871")

Directory already exists: C:\.School\Thesisv2\Bach, Johann Sebastian, 1685-1750
Skipping ebook: https://www.gutenberg.org/ebooks/4949
Skipping ebook: https://www.gutenberg.org/ebooks/5190
Directory already exists: C:\.School\Thesisv2\Beethoven, Ludwig van, 1770-1827
Skipping ebook: https://www.gutenberg.org/ebooks/5634
Skipping ebook: https://www.gutenberg.org/ebooks/4749
Skipping ebook: https://www.gutenberg.org/ebooks/4950
Skipping ebook: https://www.gutenberg.org/ebooks/12149
Skipping ebook: https://www.gutenberg.org/ebooks/12695
Skipping ebook: https://www.gutenberg.org/ebooks/11001
Skipping ebook: https://www.gutenberg.org/ebooks/13473
Skipping ebook: https://www.gutenberg.org/ebooks/7092
Skipping ebook: https://www.gutenberg.org/ebooks/7093
Resuming from https://www.gutenberg.org/ebooks/6871...
Directory already exists: C:\.School\Thesisv2\Beethoven, Ludwig van, 1770-1827\String Quartet No. 09 in C major Opus 59 by Ludwig van Beethoven
Downloaded: C:\.School\Thesisv2\Beethoven, L