In [3]:
import requests
import xml.etree.ElementTree as ET

In [4]:
def get_urls(sitemap_url: str = "https://www.krasanamiru.cz/product-sitemap.xml") -> list[str]:
    """
    Extract product URLs from an XML sitemap.

    The sitemap is downloaded with a 30 second timeout and every
    ``<url><loc>`` entry is collected. Elements in the standard sitemap
    namespace are tried first; if none yield a URL, the same lookup is
    repeated without a namespace.

    Args:
        sitemap_url: URL of the XML sitemap

    Returns:
        List of product URLs with surrounding whitespace removed. Empty
        ``<loc>`` elements are skipped. An empty list is returned (and the
        error is printed) if fetching or parsing fails for any reason.
    """
    try:
        print(f"🔍 Fetching sitemap: {sitemap_url}")
        response = requests.get(sitemap_url, timeout=30)
        response.raise_for_status()

        # Parse the raw bytes so the XML declaration's encoding is honoured.
        root = ET.fromstring(response.content)

        namespace = {'ns': 'http://www.sitemaps.org/schemas/sitemap/0.9'}

        # (path to <url> elements, path to their <loc> child, prefix map),
        # in order of preference: namespaced first, bare tags as a fallback.
        lookups = (
            ('.//ns:url', 'ns:loc', namespace),
            ('.//url', 'loc', None),
        )

        urls: list[str] = []
        for url_path, loc_path, prefix_map in lookups:
            for url_element in root.findall(url_path, prefix_map):
                # findtext returns '' for an empty <loc/> and the default
                # when <loc> is missing, so None never reaches the result.
                loc_text = url_element.findtext(loc_path, default='', namespaces=prefix_map)
                # Pretty-printed sitemaps often wrap the URL in whitespace.
                loc_text = loc_text.strip()
                if loc_text:
                    urls.append(loc_text)
            if urls:
                # Only fall through to the next lookup when nothing was found.
                break

        print(f"✅ Found {len(urls)} URLs in sitemap")
        return urls

    except Exception as e:
        # Deliberate best-effort: report the problem and hand back an empty
        # list so the calling cell can decide what to do.
        print(f"❌ Error fetching sitemap: {e}")
        return []

In [5]:
# Fetch the product URLs from the default sitemap; get_urls() returns an
# empty list on failure, so check the printed status before relying on `urls`.
urls = get_urls()

🔍 Fetching sitemap: https://www.krasanamiru.cz/product-sitemap.xml
✅ Found 661 URLs in sitemap


In [12]:
# Spot-check: confirm that a known product page is present in the scraped list.
url = 'https://www.krasanamiru.cz/produkty/kompaktni-bronzujici-pudr-golden-caramel/'
url in urls

True

In [14]:
# Persist the collected URLs, one per line (no trailing newline).
# The encoding is pinned to UTF-8 so the file content does not depend on the
# platform's locale default and non-ASCII URLs cannot fail to encode.
with open('product_urls.txt', 'w', encoding='utf-8') as f:
    f.write('\n'.join(map(str, urls)))