In [None]:
--------------Sitemap Health & Freshness ------------------------------------

In [None]:
Analyzes a website's sitemap health and freshness based on ARI v10.0 Pillar 1, Sub-pillar 1.
Purpose:
    Analyzes a website's sitemap health and freshness to ensure AI agents can efficiently discover all content.

Key Checks:

    1. Presence of sitemap(s) and correct XML format.

    2. Accurate URLs reflecting current site structure.

    3. Timely updates to reflect new, modified, or removed pages.

    4. Validation against schema standards to prevent parsing errors.

    5. Detect broken links or orphaned pages that may reduce crawl efficiency.

Outcome:
    A healthy, fresh sitemap ensures maximum discoverability and trustworthiness for AI agents.


In [1]:
# Pillar 1, Sub-pillar 1
# See Bridge.ipynb cell 1 for logic
# ...existing code...
import requests
import xml.etree.ElementTree as ET
import gzip
from io import BytesIO
from urllib.parse import urlparse, urljoin
from datetime import datetime, timezone

# Helper to print colored and formatted text for better readability
def print_header(text):
    """Print ``text`` as a section banner framed by two 60-character '=' rules.

    A blank line is emitted before the banner so it stands apart from
    whatever was printed previously.
    """
    rule = "=" * 60
    print(f"\n{rule}\n {text}\n{rule}")

def print_subheader(text):
    """Print ``text`` as a sub-section banner framed by two 60-character '-' rules.

    A blank line is emitted before the banner.
    """
    rule = "-" * 60
    print(f"\n{rule}\n {text}\n{rule}")

def print_status(message, status):
    """Print ``message`` left-aligned in a 45-character column, then ``[status]``.

    Messages longer than 45 characters are not truncated; the status tag
    simply follows them after a single space.
    """
    label = format(message, "<45")
    print(label, f"[{status}]")

def print_recommendation(rec):
    """Print a single recommendation as an indented '-' bullet."""
    print("  - {}".format(rec))

class SitemapAnalyzer:
    """
    Analyzes a website's sitemap health and freshness based on ARI v10.0 Pillar 1, Sub-pillar 1.

    Sitemaps are discovered through robots.txt (falling back to /sitemap.xml),
    fetched in FIFO order (sitemap index files enqueue their children), and
    scored on how many <url> entries carry a <lastmod> value. All findings
    accumulate in ``self.report``.
    """
    def __init__(self, base_url):
        """Prepare an analyzer for the site identified by ``base_url``.

        Args:
            base_url: Site address; a missing scheme defaults to https and any
                path/query is discarded (see ``_format_base_url``).
        """
        self.base_url = self._format_base_url(base_url)
        self.sitemaps_to_process = []    # FIFO queue of sitemap URLs still to fetch
        self.processed_sitemaps = set()  # URLs already dequeued, to avoid refetching
        self.report = {
            "sitemap_locations": [],
            "total_urls": 0,
            "urls_with_lastmod": 0,
            "error_log": [],
            "recommendations": [],
            "score": 0,
            "status": "Critical Failure"
        }
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'ARI-Sitemap-Analyzer/1.0'
        })

    def _format_base_url(self, url):
        """Return ``scheme://netloc`` for ``url``, defaulting the scheme to https.

        Surrounding whitespace is ignored. The scheme is detected by looking
        for a well-formed ``scheme://`` prefix rather than trusting
        ``urlparse(...).scheme``, because a scheme-less ``host:port`` input
        can otherwise be parsed with the host name as the scheme.
        """
        candidate = url.strip()
        scheme, separator, _ = candidate.partition("://")
        has_scheme = (
            bool(separator)
            and scheme[:1].isalpha()
            and all(ch.isalnum() or ch in "+-." for ch in scheme)
        )
        if not has_scheme:
            candidate = "https://" + candidate.lstrip("/")
        parsed = urlparse(candidate)
        return f"{parsed.scheme}://{parsed.netloc}"

    def _fetch_url(self, url):
        """GET ``url`` (following redirects, 15 s timeout).

        Returns:
            The response on success; ``None`` on any request failure or
            HTTP error status, in which case the failure is appended to
            ``report["error_log"]``.
        """
        try:
            response = self.session.get(url, timeout=15, allow_redirects=True)
            response.raise_for_status()
            return response
        except requests.exceptions.RequestException as e:
            self.report["error_log"].append(f"Failed to fetch {url}: {e}")
            return None

    def _get_sitemap_content(self, response):
        """Return the sitemap body as text, gunzipping it when it is a gzip file.

        The payload is recognised by the gzip magic number instead of the URL
        suffix or an exact Content-Type match, so ``sitemap.xml.gz?v=2`` and
        ``application/x-gzip`` responses are handled too.

        Returns:
            The decoded text, or ``None`` if a gzip payload cannot be
            decompressed/decoded (the failure is logged in ``error_log``).
        """
        payload = response.content
        if payload[:2] == b"\x1f\x8b":
            try:
                return gzip.decompress(payload).decode('utf-8')
            except Exception as e:
                self.report["error_log"].append(f"Failed to decompress gzipped sitemap {response.url}: {e}")
                return None
        return response.text

    def find_sitemaps_from_robots(self):
        """Queue every sitemap location declared by a ``Sitemap:`` line in robots.txt.

        Directive matching is case-insensitive and tolerant of leading
        whitespace. Blank directive values are skipped, and relative values
        are resolved against the robots.txt URL.
        """
        robots_url = urljoin(self.base_url, 'robots.txt')
        print_status(f"Checking robots.txt at {robots_url}", "IN PROGRESS")
        response = self._fetch_url(robots_url)

        if not response:
            print_status("Could not fetch robots.txt", "WARNING")
            return

        found_count = 0
        for line in response.text.splitlines():
            directive = line.strip()
            if not directive.lower().startswith('sitemap:'):
                continue
            location = directive.split(':', 1)[1].strip()
            if not location:
                # "Sitemap:" with no value would otherwise queue an empty URL.
                continue
            self.sitemaps_to_process.append(urljoin(robots_url, location))
            found_count += 1

        if found_count:
            print_status(f"Found {found_count} sitemap(s) in robots.txt", "OK")
        else:
            print_status("No sitemap directive in robots.txt", "WARNING")

    def _parse_sitemap(self, xml_content, sitemap_url):
        """Parse one sitemap document and fold its contents into the report.

        A ``<sitemapindex>`` root enqueues each child ``<loc>`` not yet
        processed; a ``<urlset>`` root adds to ``total_urls`` and
        ``urls_with_lastmod``. Parse errors and unknown root tags are
        appended to ``error_log``.
        """
        try:
            root = ET.fromstring(xml_content.encode('utf-8'))
        except ET.ParseError as e:
            self.report["error_log"].append(f"XML Parse Error in {sitemap_url}: {e}")
            return

        namespace = root.tag.split('}')[0][1:] if '}' in root.tag else ''

        def qualify(tag):
            # Children share the root's namespace; use bare names when there is none.
            return f'{{{namespace}}}{tag}' if namespace else tag

        # It's a sitemap index file
        if root.tag.endswith('sitemapindex'):
            sitemaps = root.findall(qualify('sitemap'))
            for sitemap in sitemaps:
                loc = sitemap.find(qualify('loc'))
                # An empty <loc/> has text None; treat it as "no location".
                location = (loc.text or '').strip() if loc is not None else ''
                if location and location not in self.processed_sitemaps:
                    self.sitemaps_to_process.append(location)
            print_status(f"Parsed sitemap index: Found {len(sitemaps)} more sitemaps", "INFO")

        # It's a URL set
        elif root.tag.endswith('urlset'):
            urls = root.findall(qualify('url'))
            self.report["total_urls"] += len(urls)
            for url in urls:
                lastmod = url.find(qualify('lastmod'))
                if lastmod is not None and lastmod.text:
                    self.report["urls_with_lastmod"] += 1
            print_status(f"Parsed URL set: Found {len(urls)} URLs", "INFO")

        else:
            self.report["error_log"].append(f"Unknown root tag '{root.tag}' in {sitemap_url}")

    def run_analysis(self):
        """Main execution logic: discover, fetch, parse, score, and print."""
        print_header("ARI Sub-Pillar 1.1: Sitemap Health & Freshness")

        # 1. Discover sitemaps
        self.find_sitemaps_from_robots()
        if not self.sitemaps_to_process:
            print_status("Falling back to default sitemap.xml location", "INFO")
            self.sitemaps_to_process.append(urljoin(self.base_url, 'sitemap.xml'))

        # 2. Process all found sitemaps (including those discovered recursively)
        print_subheader("Processing Sitemaps")
        while self.sitemaps_to_process:
            sitemap_url = self.sitemaps_to_process.pop(0)
            if sitemap_url in self.processed_sitemaps:
                continue

            self.processed_sitemaps.add(sitemap_url)
            print(f"\n-> Fetching: {sitemap_url}")
            response = self._fetch_url(sitemap_url)

            if response:
                self.report["sitemap_locations"].append(sitemap_url)
                xml_content = self._get_sitemap_content(response)
                if xml_content:
                    self._parse_sitemap(xml_content, response.url)
                else:
                    print_status("Failed to get sitemap content", "ERROR")
            else:
                print_status(f"Sitemap not found or inaccessible at {sitemap_url}", "ERROR")

        # 3. Generate Score and Recommendations
        self._generate_final_report()

        # 4. Print Report
        self._print_final_report()

    def _generate_final_report(self):
        """Calculate final score and populate recommendations.

        Scoring: 0 when no sitemap could be fetched, 15 when no URLs were
        found, otherwise 50 base points plus up to 50 for <lastmod> coverage.
        """
        # Critical Failures (BLOCKER)
        if not self.report["sitemap_locations"]:
            self.report["status"] = "Critical Failure"
            self.report["score"] = 0
            self.report["recommendations"].append("Generate sitemap.xml using automated crawling tools as no sitemap was found.")
            self.report["recommendations"].append("Submit the sitemap to Google Search Console and Bing Webmaster Tools.")
            return

        if self.report["error_log"] and any("XML Parse Error" in e for e in self.report["error_log"]):
            # NOTE(review): this status/score is provisional - there is no
            # early return, so the branches below always overwrite it. Only
            # the recommendation survives. Confirm whether a parse error
            # should be terminal.
            self.report["status"] = "Critical Failure"
            self.report["score"] = 10
            self.report["recommendations"].append("Sitemap is malformed. Validate XML structure and correct parsing errors.")

        if self.report["total_urls"] == 0:
            self.report["status"] = "Critical Failure"
            self.report["score"] = 15
            self.report["recommendations"].append("Sitemap is empty or could not be parsed correctly. Ensure it contains URL entries.")
            return

        # Scoring Logic
        # Base score for having a valid sitemap
        score = 50

        lastmod_percentage = self.report["urls_with_lastmod"] / self.report["total_urls"]
        score += 50 * lastmod_percentage # Up to 50 points for lastmod coverage

        self.report["score"] = int(score)

        if self.report["score"] >= 95:
            self.report["status"] = "Excellent"
        elif self.report["score"] >= 70:
            self.report["status"] = "Good"
        elif self.report["score"] >= 40:
            self.report["status"] = "Needs Improvement"
        else:
            self.report["status"] = "Poor"

        # Recommendations for low scores
        if lastmod_percentage < 0.9:
            self.report["recommendations"].append("Improve coverage of <lastmod> timestamps for all URLs to signal content freshness.")
        if lastmod_percentage < 1.0:
            self.report["recommendations"].append("Consider implementing automatic sitemap updates in a CI/CD pipeline to keep timestamps current.")

    def _print_final_report(self):
        """Prints the final formatted report: status, score, summary, recommendations, errors."""
        print_header("Final Assessment Report")
        print_status("Overall Status", self.report['status'])
        print_status("ARI Score (out of 100)", self.report['score'])

        print_subheader("Summary")
        print(f"Discovered and processed {len(self.report['sitemap_locations'])} sitemap file(s).")
        print(f"Found a total of {self.report['total_urls']} URLs.")
        if self.report['total_urls'] > 0:
            lastmod_percent = (self.report['urls_with_lastmod'] / self.report['total_urls']) * 100
            print(f"{self.report['urls_with_lastmod']} URLs have a <lastmod> timestamp ({lastmod_percent:.2f}% coverage).")

        if self.report["recommendations"]:
            print_subheader("Recommendations (Based on ARI v10.0)")
            for rec in self.report["recommendations"]:
                print_recommendation(rec)

        if self.report["error_log"]:
            print_subheader("Error Log")
            for err in self.report["error_log"]:
                print(f"  - {err}")


In [3]:
# FUNCTION CALL TO RUN ANALYSIS "https://www.example.com"
target_url = input('Enter website url ::: ')
print(f"\nRunning Sitemap Health & Freshness analysis for: {target_url}\n")

# Build the analyzer for the requested site, then discover, parse and score its sitemaps
analyzer = SitemapAnalyzer(base_url=target_url)
analyzer.run_analysis()

# Outcome summary: headline figures read back from the analyzer's report
print("\n" + "="*60, "SUMMARY OF OUTCOME", "="*60, sep="\n")
print(
    f"Status: {analyzer.report['status']}",
    f"Score: {analyzer.report['score']}/100",
    f"Total Sitemap Files Processed: {len(analyzer.report['sitemap_locations'])}",
    f"Total URLs Found: {analyzer.report['total_urls']}",
    f"URLs with <lastmod>: {analyzer.report['urls_with_lastmod']}",
    sep="\n",
)

print("\n" + "="*60, "MISSING POINTS / RECOMMENDATIONS", "="*60, sep="\n")
if not analyzer.report["recommendations"]:
    print("No immediate recommendations. Sitemap appears healthy.")
else:
    for rec in analyzer.report["recommendations"]:
        print(f" - {rec}")

# Errors / parsing issues collected while fetching and parsing
if analyzer.report["error_log"]:
    print("\nErrors encountered during analysis:")
    for err in analyzer.report["error_log"]:
        print(f" - {err}")

# List the sitemap URLs that were fetched so they can be inspected by hand
if not analyzer.report["sitemap_locations"]:
    print("\nNo sitemap URLs found. Consider creating sitemap.xml or specifying robots.txt sitemap.")
else:
    print("\nSitemap URLs found (for further inspection):")
    for sm in analyzer.report["sitemap_locations"]:
        print(f" - {sm}")


Enter website url :::  slack.com



Running Sitemap Health & Freshness analysis for: slack.com


 ARI Sub-Pillar 1.1: Sitemap Health & Freshness
Checking robots.txt at https://slack.com/robots.txt [IN PROGRESS]
Found 2 sitemap(s) in robots.txt              [OK]

------------------------------------------------------------
 Processing Sitemaps
------------------------------------------------------------

-> Fetching: https://slack.com/sitemap.xml
Parsed sitemap index: Found 13 more sitemaps  [INFO]

-> Fetching: https://slack.com/sitemaps/blog/news/sitemap.xml
Parsed URL set: Found 1 URLs                  [INFO]

-> Fetching: https://slack.com/sitemaps/sitemap_en-us.xml
Parsed URL set: Found 3785 URLs               [INFO]

-> Fetching: https://slack.com/sitemaps/sitemap_es-la.xml
Parsed URL set: Found 3193 URLs               [INFO]

-> Fetching: https://slack.com/sitemaps/sitemap_pt-br.xml
Parsed URL set: Found 3157 URLs               [INFO]

-> Fetching: https://slack.com/sitemaps/sitemap_de-de.xml
Parsed URL set: Found

In [None]:
------------WHAT IS MISSING -------------------------------
1. Namespace Handling Could Be More Robust

    Currently, you try to extract the namespace from the root tag:

    namespace = root.tag.split('}')[0][1:] if '}' in root.tag else ''


    This works for most sitemaps but may fail if XML uses multiple namespaces or no namespace.

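    Fix: Pass a namespace dictionary (prefix → URI) to find()/findall() for safer lookups. Note that ET.register_namespace() only affects how prefixes are written on serialization; it does not help with lookups.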


2. Handle Multiple Sitemap Formats

    Only XML sitemaps are supported. Modern sites may also use:
    
    sitemap_index.xml (you partially handle this)
    
    .txt sitemaps (plain URL lists)
    
    RSS/Atom feeds acting as sitemaps
    
    Currently, .txt sitemaps will fail parsing.


3. Validation of URLs

    You count URLs but don’t validate if they are properly formatted.
    
    Missing check: ensure URLs are HTTP/HTTPS and belong to the same domain or intended scope.
    
    Could flag external URLs if they appear in the sitemap.

4. Freshness / Lastmod Accuracy

    You count <lastmod> but don’t verify timestamp validity.
    
    Could check:
    
    Correct ISO 8601 format
    
    Reasonable date (not in the future)

5. Duplicate URLs

    Currently, duplicates in sitemaps may inflate the URL count.
    
    Should deduplicate URLs before scoring.

6. Logging & Exception Handling

    You append errors to error_log, which is good.
    
    Missing: differentiating between critical errors vs warnings (e.g., empty sitemap vs minor parse warning).
    
    Could add log_level or categorize errors.

7. Score Calculation

    You calculate score based on lastmod coverage only.
    
    ARI v10.0 may also consider:
    
    Total sitemap discovery
    
    Accessibility of all URLs
    
    HTTP response codes for URLs
    
    Consistency with robots.txt directives
    
    Could improve scoring to reflect these factors.

8. Recommendations / Best Practices

    You suggest <lastmod> coverage improvement, which is good.
    
    Missing:
    
    Recommend using ping to search engines when sitemap updates.
    
    Suggest gzip compression for large sitemaps.
    
    Suggest splitting large sitemaps (>50k URLs) into multiple sitemaps.

9. CI/CD / Automation

    You mention it in the recommendations, but the analyzer does not detect whether an automated update setup actually exists.



    

In [None]:
--------------Summary of Key Missing Pieces----------------------------------
| Area                 | Missing / Improvement                                           |
| -------------------- | --------------------------------------------------------------- |
| Namespace Handling   | More robust support for multiple namespaces                     |
| Sitemap Formats      | Support `.txt` and feed-based sitemaps                          |
| URL Validation       | Check proper URL formatting and domain scope                    |
| Lastmod Verification | Validate ISO format and realistic timestamps                    |
| Duplicates           | Deduplicate URLs before scoring                                 |
| Logging              | Differentiate critical vs warning errors                        |
| Scoring              | Include accessibility, robots.txt compliance, sitemap discovery |
| Recommendations      | Include size, gzip, search engine ping, sitemap splitting       |
| Automation Detection | Optional check for CI/CD sitemap updates                        |


In [1]:
import requests
import xml.etree.ElementTree as ET
import gzip
from io import BytesIO
from urllib.parse import urlparse, urljoin
from datetime import datetime, timezone
import re
from collections import Counter

# Helper functions for formatted output
def print_header(text):
    """Print ``text`` as a section banner framed by two 70-character '=' rules.

    A blank line is emitted before the banner.
    """
    rule = "=" * 70
    print(f"\n{rule}\n {text}\n{rule}")

def print_subheader(text):
    """Print ``text`` as a sub-section banner framed by two 70-character '-' rules.

    A blank line is emitted before the banner.
    """
    rule = "-" * 70
    print(f"\n{rule}\n {text}\n{rule}")

def print_status(message, status):
    """Print ``message`` in a 50-character column followed by a bracketed status tag.

    The tag is wrapped in ANSI colour codes: red for ERROR/CRITICAL, yellow
    for WARNING, green for OK/PASS/INFO. Any other status is printed
    uncoloured.
    """
    colour_table = (
        (("ERROR", "CRITICAL"), "\033[91m"),
        (("WARNING",), "\033[93m"),
        (("OK", "PASS", "INFO"), "\033[92m"),
    )
    status_str = f"[{status}]"
    for names, colour in colour_table:
        if status in names:
            status_str = f"[{colour}{status}\033[0m]"
            break
    print(format(message, "<50"), status_str)

def print_recommendation(rec):
    """Print a single recommendation as an indented '→' bullet."""
    print("  → {}".format(rec))

class EnhancedSitemapAnalyzer:
    """
    Enhanced analyzer for ARI v10.0 Pillar 1, Sub-pillar 1.
    Comprehensive sitemap health, freshness, and compliance checking.
    """
    def __init__(self, base_url):
        """Set up crawl state, an empty report, and the HTTP session.

        Args:
            base_url: Site address; it is normalised by ``_format_base_url``
                and its netloc becomes ``base_domain``, the reference for
                classifying URLs as external.
        """
        normalized = self._format_base_url(base_url)
        self.base_url = normalized
        self.base_domain = urlparse(normalized).netloc
        self.sitemaps_to_process = []
        self.processed_sitemaps = set()
        self.all_urls = set()  # For deduplication
        self.url_details = []  # Store URL metadata

        # The report is assembled in stages; keys are inserted in the same
        # order the report has always used.
        report = {"sitemap_locations": []}
        report.update(dict.fromkeys(
            (
                "total_urls",
                "unique_urls",
                "duplicate_urls",
                "urls_with_lastmod",
                "urls_with_valid_lastmod",
                "urls_with_invalid_format",
                "external_urls",
                "future_dated_urls",
                "txt_sitemaps_found",
                "xml_sitemaps_found",
                "gzipped_sitemaps",
                "oversized_sitemaps",
            ),
            0,
        ))
        report.update(robots_txt_found=False, sitemap_in_robots=False)
        for log_name in ("critical_errors", "warnings", "error_log", "recommendations"):
            report[log_name] = []  # a fresh list per key, never shared
        report["score"] = 0
        report["status"] = "Not Assessed"
        self.report = report

        session = requests.Session()
        session.headers['User-Agent'] = 'ARI-Enhanced-Sitemap-Analyzer/2.0'
        session.headers['Accept-Encoding'] = 'gzip, deflate'
        self.session = session

    def _format_base_url(self, url):
        """Normalize base URL with proper scheme."""
        parsed = urlparse(url)
        if not parsed.scheme:
            url = "https://" + url
            parsed = urlparse(url)
        return f"{parsed.scheme}://{parsed.netloc}"

    def _fetch_url(self, url):
        """Fetch URL with comprehensive error handling."""
        try:
            response = self.session.get(url, timeout=15, allow_redirects=True)
            response.raise_for_status()
            return response
        except requests.exceptions.Timeout:
            self.report["error_log"].append(f"Timeout fetching {url}")
            return None
        except requests.exceptions.HTTPError as e:
            self.report["error_log"].append(f"HTTP {e.response.status_code} for {url}")
            return None
        except requests.exceptions.RequestException as e:
            self.report["error_log"].append(f"Network error for {url}: {str(e)[:100]}")
            return None

    def _get_sitemap_content(self, response):
        """Handle compressed and uncompressed sitemap content."""
        # Check if gzipped
        is_gzipped = (
            response.url.endswith('.gz') or 
            'gzip' in response.headers.get('Content-Encoding', '').lower() or
            response.headers.get('Content-Type') == 'application/gzip'
        )
        
        if is_gzipped:
            self.report["gzipped_sitemaps"] += 1
            try:
                with gzip.GzipFile(fileobj=BytesIO(response.content)) as gz_file:
                    return gz_file.read().decode('utf-8')
            except Exception as e:
                self.report["critical_errors"].append(
                    f"Failed to decompress gzipped sitemap {response.url}: {str(e)[:100]}"
                )
                return None
        
        return response.text

    def _validate_url(self, url):
        """
        Validate URL format and scope.
        Returns: (is_valid, is_external, issues)
        """
        issues = []
        
        # Basic format check
        if not url or not isinstance(url, str):
            return False, False, ["Empty or invalid URL type"]
        
        # Parse URL
        try:
            parsed = urlparse(url.strip())
        except Exception:
            return False, False, ["URL parsing failed"]
        
        # Check scheme
        if parsed.scheme not in ['http', 'https']:
            issues.append(f"Invalid scheme: {parsed.scheme}")
            return False, False, issues
        
        # Check if external
        is_external = parsed.netloc.lower() != self.base_domain.lower()
        
        # Check for suspicious patterns
        if '..' in parsed.path or parsed.path.startswith('//'):
            issues.append("Suspicious path pattern")
        
        return True, is_external, issues

    def _validate_lastmod(self, lastmod_text):
        """
        Validate lastmod timestamp format and reasonableness.
        Returns: (is_valid, parsed_datetime, issues)
        """
        if not lastmod_text:
            return False, None, ["Empty lastmod"]
        
        issues = []
        
        # Common ISO 8601 formats
        formats = [
            "%Y-%m-%dT%H:%M:%S%z",
            "%Y-%m-%dT%H:%M:%SZ",
            "%Y-%m-%d",
            "%Y-%m-%dT%H:%M:%S",
        ]
        
        parsed_dt = None
        for fmt in formats:
            try:
                # Handle 'Z' timezone marker
                lastmod_clean = lastmod_text.strip().replace('Z', '+00:00')
                # Handle timezone format like +00:00
                if '+' in lastmod_clean or lastmod_clean.count('-') > 2:
                    parsed_dt = datetime.fromisoformat(lastmod_clean)
                else:
                    parsed_dt = datetime.strptime(lastmod_clean, fmt)
                break
            except ValueError:
                continue
        
        if not parsed_dt:
            return False, None, ["Invalid date format"]
        
        # Check if date is in the future
        now = datetime.now(timezone.utc)
        if parsed_dt.tzinfo is None:
            parsed_dt = parsed_dt.replace(tzinfo=timezone.utc)
        
        if parsed_dt > now:
            issues.append("Future-dated timestamp")
            return False, parsed_dt, issues
        
        # Check if date is unreasonably old (before 1995 - start of WWW)
        if parsed_dt.year < 1995:
            issues.append("Unreasonably old date")
            return False, parsed_dt, issues
        
        return True, parsed_dt, issues

    def find_sitemaps_from_robots(self):
        """Parse robots.txt to discover sitemap locations.

        Every ``Sitemap:`` line (case-insensitive, surrounding whitespace
        ignored) contributes one URL to ``sitemaps_to_process``. Blank
        directive values are skipped so an empty URL is never queued, and
        relative values are resolved against the robots.txt URL. The
        ``robots_txt_found`` / ``sitemap_in_robots`` flags and the warnings
        list are updated to reflect the outcome.
        """
        print_status("Checking robots.txt", "INFO")
        robots_url = urljoin(self.base_url, 'robots.txt')
        response = self._fetch_url(robots_url)

        if not response:
            print_status("Could not fetch robots.txt", "WARNING")
            self.report["warnings"].append("robots.txt not found or inaccessible")
            return

        self.report["robots_txt_found"] = True
        found_count = 0

        for line in response.text.splitlines():
            line_clean = line.strip()
            if not line_clean.lower().startswith('sitemap:'):
                continue
            location = line_clean.split(':', 1)[1].strip()
            if not location:
                # "Sitemap:" with nothing after it is not a usable location.
                continue
            # Absolute locations pass through urljoin unchanged; relative
            # ones are anchored at the robots.txt URL.
            self.sitemaps_to_process.append(urljoin(robots_url, location))
            found_count += 1

        if found_count > 0:
            self.report["sitemap_in_robots"] = True
            print_status(f"Found {found_count} sitemap(s) in robots.txt", "OK")
        else:
            print_status("No sitemap directive in robots.txt", "WARNING")
            self.report["warnings"].append(
                "robots.txt exists but contains no Sitemap: directive"
            )

    def _parse_txt_sitemap(self, content, sitemap_url):
        """Parse plain text sitemap (one URL per line).

        Blank lines and lines starting with '#' are ignored. Each remaining
        line is validated, de-duplicated against ``all_urls`` and recorded in
        ``url_details`` using the same record shape as XML entries (text
        sitemaps carry no lastmod, so ``lastmod`` is None and
        ``lastmod_valid`` is False). External URLs are counted once per
        unique URL, and ``unique_urls`` / ``total_urls`` are refreshed at the
        end so a site that only publishes text sitemaps is not reported as
        having zero URLs.

        Args:
            content: Decoded body of the text sitemap.
            sitemap_url: URL the sitemap was fetched from, stored as the
                source of every URL recorded here.
        """
        print_status("Parsing as TXT sitemap", "INFO")
        self.report["txt_sitemaps_found"] += 1

        valid_urls = 0

        for line in content.splitlines():
            url = line.strip()
            if not url or url.startswith('#'):  # Skip empty lines and comments
                continue

            is_valid, is_external, _issues = self._validate_url(url)

            if not is_valid:
                self.report["urls_with_invalid_format"] += 1
                continue

            if url in self.all_urls:
                self.report["duplicate_urls"] += 1
                continue

            self.all_urls.add(url)

            if is_external:
                self.report["external_urls"] += 1

            self.url_details.append({
                'url': url,
                'lastmod': None,
                'lastmod_valid': False,
                'is_external': is_external,
                'source_sitemap': sitemap_url
            })
            valid_urls += 1

        self.report["unique_urls"] = len(self.all_urls)
        self.report["total_urls"] = len(self.url_details)
        print_status(f"Found {valid_urls} valid URLs in TXT sitemap", "OK")

    def _parse_xml_sitemap(self, xml_content, sitemap_url):
        """Parse XML sitemap with robust namespace handling."""
        try:
            root = ET.fromstring(xml_content.encode('utf-8'))
        except ET.ParseError as e:
            self.report["critical_errors"].append(
                f"XML Parse Error in {sitemap_url}: {str(e)[:100]}"
            )
            return
        
        # Extract namespace more robustly
        namespace_match = re.match(r'\{(.*?)\}', root.tag)
        namespace = namespace_match.group(1) if namespace_match else ''
        
        # Define namespace map for searching
        ns = {'sm': namespace} if namespace else {}
        
        # Check if it's a sitemap index
        if root.tag.endswith('sitemapindex'):
            self._parse_sitemap_index(root, ns, sitemap_url)
        
        # Check if it's a URL set
        elif root.tag.endswith('urlset'):
            self._parse_urlset(root, ns, sitemap_url)
        
        else:
            self.report["warnings"].append(
                f"Unknown XML root tag '{root.tag}' in {sitemap_url}"
            )

    def _parse_sitemap_index(self, root, ns, sitemap_url):
        """Queue the child sitemaps listed in a <sitemapindex> document.

        Args:
            root: Parsed root element of the index.
            ns: Namespace map binding ``sm`` to the document namespace, or an
                empty dict when the document declares none.
            sitemap_url: URL of the index itself (not used by this method).
        """
        # Tag names carry the 'sm:' prefix only when a namespace map was supplied.
        prefix = 'sm:' if ns else ''
        namespaces = ns or None
        sitemaps = root.findall(f'.//{prefix}sitemap', namespaces)

        for entry in sitemaps:
            loc = entry.find(f'{prefix}loc', namespaces)
            if loc is None or not loc.text:
                continue
            child_url = loc.text.strip()
            if child_url in self.processed_sitemaps:
                continue
            self.sitemaps_to_process.append(child_url)

        print_status(f"Sitemap index: Found {len(sitemaps)} sub-sitemaps", "INFO")

    def _parse_urlset(self, root, ns, sitemap_url):
        """Validate and record every <url> entry of a <urlset> document.

        Per entry, in order: entries without a <loc> or with an invalid URL
        are counted in ``urls_with_invalid_format``; URLs already seen are
        counted in ``duplicate_urls``; remaining URLs are added to
        ``all_urls``, flagged when external, have their <lastmod> validated,
        and are appended to ``url_details``. ``unique_urls`` and
        ``total_urls`` are refreshed once the document has been processed.

        Args:
            root: Parsed <urlset> root element.
            ns: Namespace map binding ``sm`` to the document namespace, or an
                empty dict when the document declares none.
            sitemap_url: URL of this sitemap, stored as each entry's source.
        """
        report = self.report
        report["xml_sitemaps_found"] += 1

        # Tag names carry the 'sm:' prefix only when a namespace map was supplied.
        prefix = 'sm:' if ns else ''
        namespaces = ns or None
        urls = root.findall(f'.//{prefix}url', namespaces)

        # Check for oversized sitemap (>50k URLs per Google guidelines)
        url_count = len(urls)
        if url_count > 50000:
            report["oversized_sitemaps"] += 1
            report["warnings"].append(
                f"Sitemap {sitemap_url} contains {url_count} URLs (>50k limit)"
            )

        for entry in urls:
            loc = entry.find(f'{prefix}loc', namespaces)
            lastmod = entry.find(f'{prefix}lastmod', namespaces)

            if loc is None or not loc.text:
                report["urls_with_invalid_format"] += 1
                continue

            url = loc.text.strip()

            is_valid, is_external, url_issues = self._validate_url(url)
            if not is_valid:
                report["urls_with_invalid_format"] += 1
                report["warnings"].extend(f"URL {url}: {issue}" for issue in url_issues)
                continue

            # Duplicates are counted but contribute nothing else.
            if url in self.all_urls:
                report["duplicate_urls"] += 1
                continue
            self.all_urls.add(url)

            if is_external:
                report["external_urls"] += 1
                report["warnings"].append(f"External URL in sitemap: {url}")

            # Defaults describe an entry with no usable lastmod.
            lastmod_valid = False
            lastmod_datetime = None

            if lastmod is not None and lastmod.text:
                report["urls_with_lastmod"] += 1
                is_valid_date, parsed_dt, date_issues = self._validate_lastmod(lastmod.text)

                if is_valid_date:
                    report["urls_with_valid_lastmod"] += 1
                    lastmod_valid = True
                    lastmod_datetime = parsed_dt
                else:
                    for issue in date_issues:
                        if "Future-dated" in issue:
                            report["future_dated_urls"] += 1
                        report["warnings"].append(f"URL {url} lastmod: {issue}")

            self.url_details.append({
                'url': url,
                'lastmod': lastmod_datetime,
                'lastmod_valid': lastmod_valid,
                'is_external': is_external,
                'source_sitemap': sitemap_url
            })

        report["unique_urls"] = len(self.all_urls)
        report["total_urls"] = len(self.url_details)
        print_status(f"Parsed {url_count} URLs from sitemap", "OK")

    def run_analysis(self):
        """Run the full analysis: discover, fetch, parse, score and print.

        Sitemaps come from robots.txt, or from three conventional default
        locations when robots.txt yields none. The queue is drained in FIFO
        order; parsers may append further sitemaps while it is processed.
        """
        print_header("ARI v10.0 Enhanced Sitemap Analyzer")
        print("Pillar 1, Sub-pillar 1: Sitemap Health & Freshness\n")

        # Step 1: Discover sitemaps from robots.txt
        self.find_sitemaps_from_robots()

        # Step 2: Fallback to default sitemap locations
        if not self.sitemaps_to_process:
            print_status("Trying default sitemap locations", "INFO")
            self.sitemaps_to_process.extend(
                urljoin(self.base_url, filename)
                for filename in ('sitemap.xml', 'sitemap_index.xml', 'sitemap.txt')
            )

        # Step 3: Process all sitemaps
        print_subheader("Processing Sitemaps")

        while self.sitemaps_to_process:
            sitemap_url = self.sitemaps_to_process.pop(0)

            # A sitemap may be referenced from several places; fetch it once.
            if sitemap_url in self.processed_sitemaps:
                continue
            self.processed_sitemaps.add(sitemap_url)

            print(f"\n→ Fetching: {sitemap_url}")
            response = self._fetch_url(sitemap_url)
            if not response:
                print_status("Failed to fetch sitemap", "ERROR")
                continue

            self.report["sitemap_locations"].append(sitemap_url)

            # Get content (handles gzip)
            content = self._get_sitemap_content(response)
            if not content:
                print_status("Failed to read sitemap content", "ERROR")
                continue

            # Anything that is named *.txt, or does not open with '<', is
            # treated as a plain-text URL list; everything else as XML.
            is_plain_text = sitemap_url.endswith('.txt') or not content.strip().startswith('<')
            parse = self._parse_txt_sitemap if is_plain_text else self._parse_xml_sitemap
            parse(content, sitemap_url)

        # Step 4: Generate report
        self._generate_final_report()
        self._print_final_report()

    def _generate_final_report(self):
        """Calculate score and generate recommendations."""
        
        # ═══════════════════════════════════════════════════════════
        # Critical Failures
        # ═══════════════════════════════════════════════════════════
        if not self.report["sitemap_locations"]:
            self.report["status"] = "Critical Failure"
            self.report["score"] = 0
            self.report["recommendations"].append(
                "CRITICAL: No sitemap found. Generate sitemap.xml immediately"
            )
            self.report["recommendations"].append(
                "Submit sitemap to Google Search Console and Bing Webmaster Tools"
            )
            return
        
        if self.report["critical_errors"]:
            self.report["status"] = "Critical Failure"
            self.report["score"] = 10
            self.report["recommendations"].append(
                "CRITICAL: Fix XML parsing errors in sitemap"
            )
            return
        
        if self.report["total_urls"] == 0:
            self.report["status"] = "Critical Failure"
            self.report["score"] = 15
            self.report["recommendations"].append(
                "CRITICAL: Sitemap is empty or all URLs are invalid"
            )
            return
        
        # ═══════════════════════════════════════════════════════════
        # Scoring (100 points total)
        # ═══════════════════════════════════════════════════════════
        score = 0
        
        # 1. Sitemap Discovery (20 points)
        if self.report["robots_txt_found"]:
            score += 10
        if self.report["sitemap_in_robots"]:
            score += 10
        
        # 2. URL Quality (30 points)
        if self.report["unique_urls"] > 0:
            duplicate_ratio = self.report["duplicate_urls"] / (self.report["unique_urls"] + self.report["duplicate_urls"])
            invalid_ratio = self.report["urls_with_invalid_format"] / max(1, self.report["total_urls"])
            external_ratio = self.report["external_urls"] / max(1, self.report["total_urls"])
            
            url_quality = (1 - duplicate_ratio) * (1 - invalid_ratio) * (1 - external_ratio)
            score += int(30 * url_quality)
        
        # 3. Lastmod Coverage (25 points)
        if self.report["total_urls"] > 0:
            lastmod_coverage = self.report["urls_with_lastmod"] / self.report["total_urls"]
            score += int(25 * lastmod_coverage)
        
        # 4. Lastmod Validity (15 points)
        if self.report["urls_with_lastmod"] > 0:
            lastmod_validity = self.report["urls_with_valid_lastmod"] / self.report["urls_with_lastmod"]
            score += int(15 * lastmod_validity)
        
        # 5. Best Practices (10 points)
        best_practices = 0
        if self.report["gzipped_sitemaps"] > 0:
            best_practices += 5  # Using compression
        if self.report["oversized_sitemaps"] == 0:
            best_practices += 5  # No oversized sitemaps
        score += best_practices
        
        self.report["score"] = min(100, int(score))
        
        # ═══════════════════════════════════════════════════════════
        # Status Assignment
        # ═══════════════════════════════════════════════════════════
        if self.report["score"] >= 90:
            self.report["status"] = "Excellent"
        elif self.report["score"] >= 75:
            self.report["status"] = "Good"
        elif self.report["score"] >= 50:
            self.report["status"] = "Needs Improvement"
        else:
            self.report["status"] = "Poor"
        
        # ═══════════════════════════════════════════════════════════
        # Generate Recommendations
        # ═══════════════════════════════════════════════════════════
        if not self.report["robots_txt_found"]:
            self.report["recommendations"].append(
                "Create robots.txt file at domain root"
            )
        
        if not self.report["sitemap_in_robots"]:
            self.report["recommendations"].append(
                "Add 'Sitemap: <URL>' directive to robots.txt"
            )
        
        if self.report["duplicate_urls"] > 0:
            self.report["recommendations"].append(
                f"Remove {self.report['duplicate_urls']} duplicate URLs from sitemap(s)"
            )
        
        if self.report["urls_with_invalid_format"] > 0:
            self.report["recommendations"].append(
                f"Fix {self.report['urls_with_invalid_format']} URLs with invalid format"
            )
        
        if self.report["external_urls"] > 0:
            self.report["recommendations"].append(
                f"Remove {self.report['external_urls']} external URLs from sitemap"
            )
        
        lastmod_coverage = (self.report["urls_with_lastmod"] / self.report["total_urls"]) if self.report["total_urls"] > 0 else 0
        if lastmod_coverage < 0.9:
            self.report["recommendations"].append(
                f"Improve <lastmod> coverage to 90%+ (currently {lastmod_coverage*100:.1f}%)"
            )
        
        if self.report["future_dated_urls"] > 0:
            self.report["recommendations"].append(
                f"Fix {self.report['future_dated_urls']} future-dated timestamps"
            )
        
        if self.report["oversized_sitemaps"] > 0:
            self.report["recommendations"].append(
                "Split oversized sitemaps (>50k URLs) into multiple files with sitemap index"
            )
        
        if self.report["gzipped_sitemaps"] == 0 and self.report["total_urls"] > 1000:
            self.report["recommendations"].append(
                "Enable gzip compression for sitemaps to reduce bandwidth"
            )
        
        if self.report["score"] >= 70:
            self.report["recommendations"].append(
                "Implement automated sitemap updates in CI/CD pipeline"
            )
            self.report["recommendations"].append(
                "Set up automatic ping to search engines on sitemap updates"
            )

    def _print_final_report(self):
        """Write the contents of ``self.report`` to stdout as a sectioned summary.

        Sections, in order: overall status and score, sitemap discovery, URL
        statistics, timestamp quality, then (only when non-empty) critical
        errors, warnings and recommendations. At most five critical errors and
        five warnings are shown.
        """
        report = self.report
        print_header("Final Assessment Report")

        # Headline result
        print_status("Overall Status", report['status'])
        print_status("ARI Score (out of 100)", str(report['score']))

        # Where the sitemaps came from
        print_subheader("Sitemap Discovery")
        print(f"  Sitemaps Processed: {len(report['sitemap_locations'])}")
        for label, key in (
            ("XML Sitemaps", 'xml_sitemaps_found'),
            ("TXT Sitemaps", 'txt_sitemaps_found'),
            ("Gzipped Sitemaps", 'gzipped_sitemaps'),
        ):
            print(f"  {label}: {report[key]}")
        for label, key in (
            ("robots.txt Found", 'robots_txt_found'),
            ("Sitemap in robots.txt", 'sitemap_in_robots'),
        ):
            print(f"  {label}: {'Yes' if report[key] else 'No'}")

        # Raw URL counters
        print_subheader("URL Statistics")
        for label, key in (
            ("Total URLs Found", 'total_urls'),
            ("Unique URLs", 'unique_urls'),
            ("Duplicate URLs", 'duplicate_urls'),
            ("Invalid Format URLs", 'urls_with_invalid_format'),
            ("External URLs", 'external_urls'),
        ):
            print(f"  {label}: {report[key]}")

        # Percentages are printed only when their denominator is non-zero.
        print_subheader("Timestamp Quality")
        if report['total_urls'] > 0:
            with_lastmod = report['urls_with_lastmod']
            coverage_pct = (with_lastmod / report['total_urls']) * 100
            print(f"  URLs with <lastmod>: {with_lastmod} ({coverage_pct:.1f}%)")
        if report['urls_with_lastmod'] > 0:
            valid_count = report['urls_with_valid_lastmod']
            validity_pct = (valid_count / report['urls_with_lastmod']) * 100
            print(f"  Valid Timestamps: {valid_count} ({validity_pct:.1f}%)")
        print(f"  Future-dated URLs: {report['future_dated_urls']}")

        # Problems found while processing (first five of each kind)
        critical = report["critical_errors"]
        if critical:
            print_subheader("Critical Errors")
            for entry in critical[:5]:
                print(f"  ⚠️  {entry}")

        warnings_found = report["warnings"]
        if warnings_found:
            print_subheader("Warnings (Top 5)")
            for entry in warnings_found[:5]:
                print(f"  ⚡ {entry}")

        # Numbered action list
        advice = report["recommendations"]
        if advice:
            print_subheader("Actionable Recommendations")
            for position, text in enumerate(advice, start=1):
                print_recommendation(f"{position}. {text}")



In [2]:
# ============================================================================
# USAGE - Direct execution
# ============================================================================

# Banner, then prompt for the site to audit (surrounding whitespace removed).
print("EnhancedSitemapAnalyzer", "=" * 60, sep="")
target_url = input("Enter URL: ").strip()

# Build the analyzer for that site and run the complete audit.
analyzer = EnhancedSitemapAnalyzer(base_url=target_url)
analyzer.run_analysis()

# Dump the raw report object for dev / API use.
print("\n" + "=" * 60, "RAW REPORT OBJECT", "=" * 60, sep="\n")
print(analyzer.report)



Enter URL:   https://www.example.com/404



 ARI v10.0 Enhanced Sitemap Analyzer
Pillar 1, Sub-pillar 1: Sitemap Health & Freshness

Checking robots.txt                                [INFO]
Trying default sitemap locations                   [INFO]

----------------------------------------------------------------------
 Processing Sitemaps
----------------------------------------------------------------------

→ Fetching: https://www.example.com/sitemap.xml
Failed to fetch sitemap                            [ERROR]

→ Fetching: https://www.example.com/sitemap_index.xml
Failed to fetch sitemap                            [ERROR]

→ Fetching: https://www.example.com/sitemap.txt
Failed to fetch sitemap                            [ERROR]

 Final Assessment Report
Overall Status                                     [Critical Failure]
ARI Score (out of 100)                             [0]

----------------------------------------------------------------------
 Sitemap Discovery
-------------