In [7]:
"""If you want to run the present code in the present version use Pycharm"""
import aiohttp
import asyncio
from bs4 import BeautifulSoup
from urllib.parse import urlparse, urljoin, urlunparse
import xml.etree.ElementTree as ET


In [5]:


class SitemapGenerator:
    """Crawl one website asynchronously and write its pages to an XML sitemap.

    Starting from ``root``, pages are downloaded with aiohttp, their ``<a href>``
    links are extracted with BeautifulSoup, and links that stay on the same
    site are followed up to ``max_depth`` levels. Every page that answered
    with HTTP 200 is written to ``filename`` together with a priority derived
    from the depth at which it was first discovered.

    Args:
        root: URL at which the crawl starts; its network location defines
            which links count as internal.
        filename: Path of the sitemap file written by ``generate_file``.
        max_tasks: Maximum number of HTTP requests in flight at once.
        max_depth: Deepest link level (root is level 0) that is still fetched.

    Attributes:
        urls: Mapping ``url -> level`` of the pages fetched successfully.
        session: The aiohttp session used by ``fetch``; set by ``run``.
        semaphore: Limits concurrent requests; created lazily inside the
            running event loop.
    """

    # Total time budget, in seconds, for a single HTTP request.
    REQUEST_TIMEOUT = 10
    # Only these schemes can be fetched; mailto:, javascript:, tel:, ... are skipped.
    _ALLOWED_SCHEMES = ("http", "https")

    def __init__(self, root, filename, max_tasks=10, max_depth=3):
        self.filename = filename
        self.urls = {}
        self.root = root
        parsed_root = urlparse(root)
        self.hostname = parsed_root.hostname
        # Network location (host plus explicit port) used to recognise internal links.
        self._netloc = parsed_root.netloc.lower()
        self.max_tasks = max_tasks
        # Created on first use so that it belongs to the event loop that
        # actually runs the crawl (matters on Python < 3.10, where asyncio
        # primitives bind to a loop at construction time).
        self.semaphore = None
        self.max_depth = max_depth
        self.session = None
        # URLs already scheduled for download, successful or not.
        self._seen = set()

    async def fetch(self, url):
        """Download ``url`` and return its body as text.

        Returns:
            The response text when the server answers with status 200,
            otherwise ``None`` (non-200 status or any error while fetching).
            Failures are reported with ``print`` and never raised.
        """
        if self.semaphore is None:
            self.semaphore = asyncio.Semaphore(self.max_tasks)
        async with self.semaphore:
            try:
                # ``async with`` releases the connection on every path,
                # including non-200 responses whose body is never read.
                async with self.session.get(url) as response:
                    if response.status == 200:
                        return await response.text()
                    print(f"Failed to fetch {url} with status {response.status}")
            except Exception as e:
                print(f"Error fetching {url}: {e}")
            return None

    def normalize_url(self, href, current_url):
        """Turn a raw ``href`` found on ``current_url`` into a crawlable URL.

        The link is resolved against ``current_url`` (so relative and
        protocol-relative links work), its fragment is removed, and it is
        accepted only if it uses http/https and points at the same network
        location as the crawl root.

        Returns:
            The absolute URL without fragment, or ``None`` when the link is
            empty, malformed, uses another scheme, or leaves the site.
        """
        if not href:
            return None
        href = href.strip()
        if not href:
            return None
        try:
            parsed = urlparse(urljoin(current_url, href))
        except ValueError:
            # e.g. a malformed IPv6 literal inside the link
            return None
        if parsed.scheme not in self._ALLOWED_SCHEMES:
            return None
        if parsed.netloc.lower() != self._netloc:
            return None
        # "page#a" and "page#b" are the same document.
        return urlunparse(parsed._replace(fragment=""))

    async def crawl(self, url, level):
        """Fetch ``url`` at depth ``level`` and recurse into its internal links.

        A URL is marked as seen *before* the download starts, so concurrent
        tasks that discover the same link do not fetch it more than once.
        The level stored in ``self.urls`` is the one at which the URL was
        first scheduled.
        """
        if level > self.max_depth:
            return
        url = urlunparse(urlparse(url)._replace(fragment=""))
        if url in self._seen:
            return
        self._seen.add(url)
        print(f"Level: {level} / Explore {url}")
        content = await self.fetch(url)
        if not content:
            return
        self.urls[url] = level
        if level >= self.max_depth:
            # Children would exceed max_depth; skip parsing the page.
            return
        soup = BeautifulSoup(content, "html.parser")
        candidates = (
            self.normalize_url(link.get("href"), url)
            for link in soup.find_all("a", href=True)
        )
        # dict.fromkeys removes duplicates within the page but keeps link order.
        targets = dict.fromkeys(
            target for target in candidates if target and target not in self._seen
        )
        await asyncio.gather(*(self.crawl(target, level + 1) for target in targets))

    async def run(self):
        """Crawl the site from ``self.root`` and write the sitemap file."""
        self.semaphore = asyncio.Semaphore(self.max_tasks)
        timeout = aiohttp.ClientTimeout(total=self.REQUEST_TIMEOUT)
        async with aiohttp.ClientSession(timeout=timeout) as session:
            self.session = session
            try:
                await self.crawl(self.root, 0)
            finally:
                # Do not keep a reference to a session that is about to close.
                self.session = None
        self.generate_file()

    def generate_file(self):
        """Write ``self.urls`` to ``self.filename`` in sitemap XML format.

        URLs are grouped by level and written in ascending level order.
        The priority falls linearly from 1.0 at level 0 to 0.5 at the deepest
        recorded level.
        """
        urls_by_level = {}
        for url, level in self.urls.items():
            urls_by_level.setdefault(level, []).append(url)

        max_level = max(urls_by_level, default=0)
        # Guard against division by zero when only level 0 (or nothing) was crawled.
        step = 1 / (max_level * 2 if max_level > 0 else 1)
        root = ET.Element("urlset", xmlns="http://www.sitemaps.org/schemas/sitemap/0.9")

        for level, urls in sorted(urls_by_level.items()):
            priority = str(round(1 - step * level, 2))
            for url in urls:
                url_element = ET.SubElement(root, "url")
                ET.SubElement(url_element, "loc").text = url
                ET.SubElement(url_element, "priority").text = priority

        ET.ElementTree(root).write(self.filename, encoding="utf-8", xml_declaration=True)
        print(f"Sitemap saved to '{self.filename}'.")



In [None]:

# Crawl configuration: start page, output file, concurrency and depth limits.
root_url = "https://books.toscrape.com"
filename = "sitemap.xml"
max_tasks = 20
max_depth = 2
generator = SitemapGenerator(root_url, filename, max_tasks, max_depth)

# asyncio.run() refuses to start when an event loop is already running in
# this thread, which is the case inside a Jupyter/IPython kernel. Detect that
# situation and schedule the crawl on the existing loop instead.
try:
    running_loop = asyncio.get_running_loop()
except RuntimeError:
    # Plain Python script: no loop yet, so start one and block until done.
    asyncio.run(generator.run())
else:
    # Notebook: the crawl runs in the background on the kernel's loop.
    # Keep the task referenced; `await crawl_task` in a later cell waits for it.
    crawl_task = running_loop.create_task(generator.run())
