In [1]:
import asyncio
import urllib
import urllib.parse
from collections import deque

import aiohttp
from bs4 import BeautifulSoup

In [2]:
class SitemapGenerator:
    """Breadth-first crawler that maps the same-host pages reachable from a start URL.

    After ``crawl()`` finishes, ``parsed_urls`` maps every successfully fetched
    (HTTP 200) URL, with its fragment removed, to the depth at which it was
    first reached.
    """

    def __init__(self, starting_url, max_depth):
        """Store the crawl configuration.

        Args:
            starting_url: URL the crawl starts from; its hostname defines
                which links count as internal.
            max_depth: Deepest link level (relative to the starting level)
                that will still be fetched.
        """
        # Defragmented URL -> depth at which it was fetched successfully.
        self.parsed_urls = {}
        self.starting_url = starting_url
        self.root_hostname = urllib.parse.urlparse(self.starting_url).hostname
        self.max_depth = max_depth

    async def fetch(self, session, url):
        """Download ``url`` with ``session``.

        Args:
            session: Object exposing ``get(url)`` as an async context manager
                (``crawl`` passes an ``aiohttp.ClientSession``).
            url: Address to request.

        Returns:
            ``(body_text, http_status)`` on success. ``(None, None)`` when the
            request or decoding fails; the error is printed, not raised, so
            one bad link cannot abort the whole crawl.
        """
        try:
            async with session.get(url) as response:
                return await response.text(), response.status
        except Exception as e:
            print(f"Failed to fetch {url}: {str(e)}")
            # Always return a pair: the caller unpacks two values, and a bare
            # status code here would raise TypeError on every failed request.
            return None, None

    def _resolve_link(self, base_url, href):
        """Turn an ``href`` found on ``base_url`` into a crawlable absolute URL.

        Returns:
            The absolute, fragment-free URL when it is http(s) and on the same
            hostname as the starting URL; otherwise ``None``.
        """
        if not href:
            return None
        try:
            # Relative links are relative to the page they appear on, not to
            # the crawl's starting URL.
            joined = urllib.parse.urljoin(base_url, href.strip())
            absolute = urllib.parse.urldefrag(joined)[0]
            parts = urllib.parse.urlparse(absolute)
            hostname = parts.hostname
        except ValueError:
            # urllib raises ValueError on malformed URLs (e.g. a broken IPv6 literal).
            return None
        # Rejects mailto:, javascript:, tel:, ... which have no hostname and
        # would otherwise be mistaken for relative paths.
        if parts.scheme not in ('http', 'https'):
            return None
        if hostname != self.root_hostname:
            return None
        return absolute

    def _extract_links(self, html, base_url):
        """Yield every internal link target found in the ``<a>`` tags of ``html``."""
        soup = BeautifulSoup(html, 'html.parser')
        for anchor in soup.find_all('a'):
            target = self._resolve_link(base_url, anchor.get('href'))
            if target:
                yield target

    async def crawl(self, url=None, level=0):
        """Crawl breadth-first from ``url`` and fill ``parsed_urls``.

        Args:
            url: Page to start from; defaults to ``starting_url``.
            level: Depth assigned to the start page. Pages deeper than
                ``max_depth`` are not fetched.
        """
        if url is None:
            url = self.starting_url
        start_url = urllib.parse.urldefrag(url)[0]
        url_to_visit = deque([(start_url, level)])
        # URLs already scheduled, so each page is queued (and fetched) at most once.
        queued = {start_url}

        async with aiohttp.ClientSession() as session:
            while url_to_visit:
                current_url, current_level = url_to_visit.popleft()
                if current_level > self.max_depth:
                    continue
                # Check before fetching so known pages are never downloaded twice.
                if current_url in self.parsed_urls:
                    continue
                content, status = await self.fetch(session, current_url)
                if status != 200:
                    continue
                self.parsed_urls[current_url] = current_level
                print(current_url)  # progress: which URL has been fetched
                if current_level >= self.max_depth:
                    # Children would exceed max_depth; skip parsing them.
                    continue
                for full_url in self._extract_links(content, current_url):
                    if full_url not in queued and full_url not in self.parsed_urls:
                        queued.add(full_url)
                        url_to_visit.append((full_url, current_level + 1))
                                    
                                        
                


In [3]:
async def main():
    """Crawl books.toscrape.com with ``max_depth=2`` and print the URL -> depth map."""
    generator = SitemapGenerator('https://books.toscrape.com/index.html', max_depth=2)
    await generator.crawl()
    # parsed_urls holds every page fetched successfully, keyed by URL.
    print(generator.parsed_urls)

In [4]:
# Run main() whether or not an event loop is already running.
# asyncio.get_running_loop() is used instead of asyncio.get_event_loop():
# the latter is deprecated when no loop is running (and raises RuntimeError
# on recent Python versions), and on older versions it creates a spare loop
# that is never closed before asyncio.run() builds its own.
try:
    loop = asyncio.get_running_loop()
except RuntimeError:
    # No loop is running (plain script): asyncio.run() will create and close one.
    loop = None

if loop is not None:
    # A loop is already running (as the original branch expects inside a
    # notebook): schedule the crawl on it. Keep the reference in `task` so the
    # task is not garbage-collected before it finishes.
    task = loop.create_task(main())
else:
    asyncio.run(main())

https://books.toscrape.com/index.html
https://books.toscrape.com/catalogue/category/books_1/index.html
https://books.toscrape.com/catalogue/category/books/travel_2/index.html
https://books.toscrape.com/catalogue/category/books/mystery_3/index.html
https://books.toscrape.com/catalogue/category/books/historical-fiction_4/index.html
https://books.toscrape.com/catalogue/category/books/sequential-art_5/index.html
https://books.toscrape.com/catalogue/category/books/classics_6/index.html
https://books.toscrape.com/catalogue/category/books/philosophy_7/index.html
https://books.toscrape.com/catalogue/category/books/romance_8/index.html
https://books.toscrape.com/catalogue/category/books/womens-fiction_9/index.html
https://books.toscrape.com/catalogue/category/books/fiction_10/index.html
https://books.toscrape.com/catalogue/category/books/childrens_11/index.html
https://books.toscrape.com/catalogue/category/books/religion_12/index.html
https://books.toscrape.com/catalogue/category/books/nonficti