In [None]:
-----------GEO Readiness & Governance---------------------------

In [None]:
------------ Sub-pillar 2 Crawlability & Directive Integrity--------------------------
Purpose: Verifies robots.txt for AI and general crawler accessibility.
Strengths: Multi-agent checks (e.g., Googlebot, Google-Extended, GPTBot, anthropic-ai).


In [1]:
# Pillar 1, Sub-pillar 2
# See Bridge.ipynb cell 2 for logic
# ...existing code...
import requests
import urllib.robotparser
from urllib.parse import urlparse, urljoin

# Helper to print colored and formatted text for better readability
def print_header(text):
    """Print ``text`` as a section title framed by two 60-character ``=`` rules.

    A blank line is emitted before the top rule and the title is indented by
    one space.
    """
    rule = "=" * 60
    print(f"\n{rule}", f" {text}", rule, sep="\n")

def print_subheader(text):
    """Print ``text`` as a sub-section title framed by two 60-character ``-`` rules.

    A blank line is emitted before the top rule and the title is indented by
    one space.
    """
    rule = "-" * 60
    print(f"\n{rule}", f" {text}", rule, sep="\n")

def print_status(message, status):
    """Print ``message`` left-aligned in a 45-column field followed by ``[status]``.

    The status tag is wrapped in an ANSI colour sequence for the known labels:
    red for FAIL/CRITICAL, yellow for WARN, green for PASS/INFO.  Any other
    value (including non-string values such as a numeric score) is printed
    without colour.
    """
    # Pick the ANSI foreground code, or None when the label is not recognised
    if status in ("FAIL", "CRITICAL"):
        ansi_code = 91  # Red
    elif status == "WARN":
        ansi_code = 93  # Yellow
    elif status in ("PASS", "INFO"):
        ansi_code = 92  # Green
    else:
        ansi_code = None

    if ansi_code is None:
        tag = f"[{status}]"
    else:
        tag = f"[\033[{ansi_code}m{status}\033[0m]"

    # Pad message for alignment
    print(f"{message:<45} {tag}")

def print_recommendation(rec):
    """Print one recommendation as an indented ``  - `` bullet line."""
    print("  -", rec)

class RobotsTxtAnalyzer:
    """
    Analyzes a website's robots.txt for crawlability and integrity based on ARI v10.0 Pillar 1, Sub-pillar 2.

    Workflow (see ``run_analysis``): fetch ``<base_url>/robots.txt``, look for
    a Sitemap directive, test every agent in ``USER_AGENTS`` against the site
    root and each entry of ``CRITICAL_PATHS``, then derive a 0-100 score and a
    status label.  Results accumulate in ``self.report`` (keys
    ``recommendations``, ``findings``, ``score``, ``status``) and are printed.
    """
    def __init__(self, base_url):
        """
        Args:
            base_url: Site to audit.  May be a bare host ("github.com"), a
                host:port pair or a full URL; only scheme and host[:port]
                are kept.
        """
        self.base_url = self._format_base_url(base_url)
        self.robots_url = urljoin(self.base_url, 'robots.txt')
        self.report = {
            "recommendations": [],
            "findings": [],
            "score": 0,
            "status": "Not Assessed"
        }
        self.robots_content = None
        self.parser = urllib.robotparser.RobotFileParser()

        # User agents to test against, including web, general AI, and specific AI crawlers
        self.USER_AGENTS = {
            "Wildcard": "*",
            "Google Search": "Googlebot",
            "Google AI": "Google-Extended",
            "OpenAI AI": "GPTBot",
            "Anthropic AI": "anthropic-ai"
        }

        # Common paths for critical resources
        self.CRITICAL_PATHS = ["/static/", "/assets/", "/css/", "/js/", "/images/", "/media/"]

    def _format_base_url(self, url):
        """Ensures the URL has a scheme and is just the base domain.

        ``https://`` is prepended whenever the input does not parse into both
        a scheme and a network location.  Checking the scheme alone is not
        enough: on recent Python versions ``urlparse("example.com:8080")``
        reports ``example.com`` as the scheme and leaves the netloc empty.

        Returns:
            str: ``"<scheme>://<host[:port]>"`` with no path, query or fragment.
        """
        url = url.strip()
        parsed = urlparse(url)
        if not (parsed.scheme and parsed.netloc):
            # lstrip handles scheme-relative input such as "//example.com"
            parsed = urlparse("https://" + url.lstrip("/"))
        return f"{parsed.scheme}://{parsed.netloc}"

    @staticmethod
    def _decode_robots_body(raw):
        """Decodes raw robots.txt bytes as UTF-8 and drops a leading BOM.

        A byte-order mark left in front of the first ``User-agent`` line makes
        ``RobotFileParser`` ignore that line, which in turn discards the rules
        that follow it.  Undecodable bytes are replaced rather than raising.
        """
        return raw.decode("utf-8-sig", errors="replace")

    def _sitemap_urls(self):
        """Returns the values of all ``Sitemap`` directives in the fetched file.

        Lines are examined one by one with ``#`` comments removed, so a
        comment that merely mentions "sitemap:" is not counted, while
        ``Sitemap : <url>`` (whitespace before the colon) is.
        """
        found = []
        for raw_line in (self.robots_content or "").splitlines():
            line = raw_line.split("#", 1)[0].strip()
            key, sep, value = line.partition(":")
            value = value.strip()
            if sep and value and key.strip().lower() == "sitemap":
                found.append(value)
        return found

    def fetch_robots_txt(self):
        """Fetches the robots.txt file from the target domain.

        On HTTP 200 the decoded body is stored in ``self.robots_content`` and
        fed to ``self.parser``.  Any other status code, or a ``requests``
        error, is recorded as a finding instead.

        Returns:
            bool: True only when the file was downloaded with HTTP 200.
        """
        print_status(f"Fetching {self.robots_url}", "IN PROGRESS")
        try:
            response = requests.get(self.robots_url, timeout=10, headers={'User-Agent': 'ARI-Robots-Analyzer/1.0'})
            if response.status_code == 200:
                # Decode from bytes ourselves so a BOM or a guessed charset cannot corrupt line 1
                self.robots_content = self._decode_robots_body(response.content)
                self.parser.parse(self.robots_content.splitlines())
                print_status("Successfully fetched robots.txt", "PASS")
                return True
            else:
                self.report["findings"].append(f"robots.txt is missing or inaccessible (Status: {response.status_code}).")
                print_status(f"robots.txt inaccessible (HTTP {response.status_code})", "WARN")
                return False
        except requests.exceptions.RequestException as e:
            self.report["findings"].append(f"Could not fetch robots.txt due to a network error: {e}")
            print_status("Failed to fetch robots.txt (Network Error)", "FAIL")
            return False

    def check_sitemap_directive(self):
        """Checks for the presence of a Sitemap directive.

        Records a finding either way.

        Returns:
            bool: True when at least one non-empty Sitemap directive exists.
        """
        if self._sitemap_urls():
            self.report["findings"].append("Sitemap directive is present in robots.txt.")
            print_status("Sitemap directive check", "PASS")
            return True
        else:
            self.report["findings"].append("Sitemap directive is missing from robots.txt.")
            print_status("Sitemap directive check", "WARN")
            return False

    def check_crawlability(self):
        """Checks if key user agents are blocked from the root or critical paths.

        For every agent in ``USER_AGENTS`` this prints one status line for the
        site root and one for the critical paths as a group, appending a
        finding for each failure.

        Returns:
            bool: True when the wildcard agent ``*`` may not fetch the root.
        """
        is_globally_blocked = False

        print_subheader("Agent Crawlability Analysis")

        for name, agent in self.USER_AGENTS.items():
            # Check for root access
            can_fetch_root = self.parser.can_fetch(agent, self.base_url + "/")
            print_status(f"Root access for '{name}' ({agent})", "PASS" if can_fetch_root else "FAIL")

            if not can_fetch_root:
                self.report["findings"].append(f"Agent '{name}' is blocked from crawling the site root.")
                if agent == '*':
                    is_globally_blocked = True

            # Check critical resource paths
            blocked_resources = [
                path for path in self.CRITICAL_PATHS
                if not self.parser.can_fetch(agent, self.base_url + path)
            ]

            if blocked_resources:
                status = "FAIL"
                self.report["findings"].append(f"Agent '{name}' is blocked from critical resource paths: {', '.join(blocked_resources)}")
            else:
                status = "PASS"
            print_status(f"Critical resource access for '{name}'", status)

        return is_globally_blocked

    def run_analysis(self):
        """Main execution logic.

        Scoring as implemented below:
          * robots.txt could not be fetched: score 40, status "Poor".
          * wildcard agent blocked from the root: score 0, "Critical Failure".
          * otherwise: base 70, +20 with a sitemap, -30 if any agent is blocked
            from a critical path, -15 if a non-wildcard agent is blocked from
            the root; >= 90 is "Excellent", >= 65 "Good", else
            "Needs Improvement".
          * an empty file or one without any "user-agent" text caps the score
            at 30 and forces status "Poor".

        The final report is printed in every case; nothing is returned.
        """
        print_header("ARI Sub-Pillar 1.2: Crawlability & Directive Integrity")

        if not self.fetch_robots_txt():
            self.report["status"] = "Poor"
            self.report["score"] = 40  # Not a blocker, but a significant issue
            self.report["recommendations"].append("Create a properly formatted robots.txt file to provide clear instructions to crawlers.")
            self.report["recommendations"].append("Include a sitemap reference in the new robots.txt file.")
            self._print_final_report()
            return

        has_sitemap = self.check_sitemap_directive()
        is_blocked = self.check_crawlability()

        # Scoring Logic
        if is_blocked:
            self.report["status"] = "Critical Failure"
            self.report["score"] = 0 # BLOCKER
            self.report["recommendations"].append("Major Issue: The site is globally blocked (Disallow: / for User-agent: *). This is a critical barrier for all agents.")
        else:
            # Start with a base score for a valid, non-blocking file
            score = 70

            # Bonus for having a sitemap
            if has_sitemap:
                score += 20
            else:
                self.report["recommendations"].append("Include a sitemap reference in robots.txt for better crawl efficiency.")

            # Penalty for any blocked critical resources
            if any("blocked from critical resource" in f for f in self.report["findings"]):
                score -= 30
                self.report["recommendations"].append("Allow access to critical resources (CSS, JS, images) for AI agents to ensure proper page rendering and understanding.")

            # Penalty for blocking specific (non-*) AI agents
            if any("is blocked from crawling" in f and "Wildcard" not in f for f in self.report["findings"]):
                score -= 15
                self.report["recommendations"].append("Review agent-specific directives to ensure key AI crawlers (e.g., Google-Extended, GPTBot) are not unintentionally blocked.")

            self.report["score"] = max(0, score) # Ensure score doesn't go below 0

            if self.report["score"] >= 90:
                self.report["status"] = "Excellent"
            elif self.report["score"] >= 65:
                self.report["status"] = "Good"
            else:
                self.report["status"] = "Needs Improvement"

        # Final recommendation if the file is empty or lacks directives
        if not self.robots_content.strip() or ("user-agent" not in self.robots_content.lower()):
            self.report["recommendations"].append("The robots.txt file is empty or malformed. Add appropriate User-agent and Disallow/Allow directives.")
            self.report["score"] = min(self.report["score"], 30) # Cap score for malformed file
            self.report["status"] = "Poor"

        self._print_final_report()

    def _print_final_report(self):
        """Prints the final formatted report.

        Shows status and score, then every finding (or a placeholder line when
        there are none), then the recommendations section if any exist.
        """
        print_header("Final Assessment Report")
        print_status("Overall Status", self.report['status'])
        print_status("ARI Score (out of 100)", self.report['score'])

        print_subheader("Findings")
        if self.report["findings"]:
            for finding in self.report["findings"]:
                print(f"  - {finding}")
        else:
            print("  - No significant issues found.")

        if self.report["recommendations"]:
            print_subheader("Recommendations (Based on ARI v10.0)")
            for rec in self.report["recommendations"]:
                print_recommendation(rec)



In [2]:
# Ask which site to audit
target = input("Enter URL :::: ").strip()

# Build the analyzer and run every check (prints the formatted report)
RTA = RobotsTxtAnalyzer(target)
RTA.run_analysis()

# Dump the raw report dict beneath a banner
print("\n" + "=" * 60, "RAW REPORT OBJECT", "=" * 60, sep="\n")
print(RTA.report)

Enter URL ::::  github.com



 ARI Sub-Pillar 1.2: Crawlability & Directive Integrity
Fetching https://github.com/robots.txt        [IN PROGRESS]
Successfully fetched robots.txt               [[92mPASS[0m]
Sitemap directive check                       [[93mWARN[0m]

------------------------------------------------------------
 Agent Crawlability Analysis
------------------------------------------------------------
Root access for 'Wildcard' (*)                [[92mPASS[0m]
Critical resource access for 'Wildcard'       [[92mPASS[0m]
Root access for 'Google Search' (Googlebot)   [[92mPASS[0m]
Critical resource access for 'Google Search'  [[92mPASS[0m]
Root access for 'Google AI' (Google-Extended) [[92mPASS[0m]
Critical resource access for 'Google AI'      [[92mPASS[0m]
Root access for 'OpenAI AI' (GPTBot)          [[92mPASS[0m]
Critical resource access for 'OpenAI AI'      [[92mPASS[0m]
Root access for 'Anthropic AI' (anthropic-ai) [[92mPASS[0m]
Critical resource access for 'Anthropic AI'   [[

In [None]:
--------------WHAT IS MISSING ---------------------
️1. Structured Findings (Major Gap)

    Current: All findings are strings:
    
    "Agent 'GPTBot' is blocked from critical resource paths: /static/, /css/"
    
    
    Problem: Hard for downstream tools or automated analysis.
    
    Fix: Use dicts per agent/path, for example:
    
    {
        "agent": "GPTBot",
        "root_access": False,
        "blocked_paths": ["/static/", "/css/"],
        "status": "FAIL"
    }
️2. Per-path Granularity

    Right now, blocked resources are aggregated in one string per agent.
    
    Improvement: Store per-path results so you know exactly which paths are blocked/allowed.
    This helps produce a more precise score.

3️.  More Detailed Scoring

    Penalties are fixed (-30 for blocked resources, -15 for specific agents).
    
    Improvement: Weight penalties based on:
    
      - how many critical paths are blocked
      - the number of AI agents blocked
      - whether the block is global or agent-specific
    
    This makes the score more granular and realistic.

️4. Sitemap Validation

    Currently checks only for 'sitemap:' string.
    
    Improvement: Validate the URL (HTTP 200 + correct format) to ensure it’s actually reachable.

5. Empty / Malformed robots.txt Handling

    The analyzer already checks for empty content or a missing User-agent directive.
    
    Improvement: Add a structured finding for this issue, e.g.:
    
    {"issue": "empty_or_malformed", "status": "FAIL"}

️6.  Machine-Readable Output

    Currently everything is printed and recommendations are strings.
    
    Improvement: Add a JSON-exportable report

    def export_json_report(self, path="robots_report.json"):
        import json
        with open(path, "w") as f:
            json.dump(self.report, f, indent=2)

️7. Optional Enhancements

    Support custom user-agent lists from a config file or input.
    
    Consider asynchronous checks for performance if many agents/paths.
    
    Include timestamp and URL metadata in the report.

In [None]:
-------------------------------IMPROVEMENTS --------------------------------------
Improvements in this version

    1. Structured findings per agent, including:
       root_access, blocked_paths, status, message
    
    2. Per-path crawlability tracked separately.
    
    3. Granular scoring:
       bonus for sitemap, penalties based on number of blocked paths
    
    4. Sitemap validation (presence check)
    
    5. Handles empty/malformed robots.txt properly
    
    6. Machine-readable JSON export
    
    7. Timestamps included in the report



In [5]:
# Pillar 1, Sub-pillar 2 — Structured Robots.txt Analyzer
import requests
import urllib.robotparser
from urllib.parse import urlparse, urljoin
import json

# Helper functions for colored CLI output
def print_header(text):
    """Print a blank line, a 60-char ``=`` rule, ``text`` indented by one space, and a closing rule."""
    bar = "=" * 60
    print("\n".join(["", bar, f" {text}", bar]))

def print_subheader(text):
    """Print a blank line, a 60-char ``-`` rule, ``text`` indented by one space, and a closing rule."""
    bar = "-" * 60
    print("\n".join(["", bar, f" {text}", bar]))

def print_status(message, status):
    """Print ``message`` left-aligned in a 50-column field followed by ``[status]``.

    Known labels are coloured with ANSI escapes: red for FAIL/CRITICAL, yellow
    for WARN, green for PASS/INFO.  Any other value (e.g. "IN PROGRESS" or a
    numeric score) is printed as plain ``[status]`` with no escape sequences,
    so unknown labels do not inject stray reset codes into the output.
    """
    padded_message = f"{message:<50}"
    colors = {"FAIL": 91, "CRITICAL": 91, "WARN": 93, "PASS": 92, "INFO": 92}
    code = colors.get(status)
    if code is None:
        status_str = f"[{status}]"
    else:
        status_str = f"[\033[{code}m{status}\033[0m]"
    print(f"{padded_message} {status_str}")

def print_recommendation(rec):
    """Print a single recommendation as an indented ``  - `` bullet."""
    print("  - " + str(rec))

class RobotsTxtAnalyzer:
    """
    ARI v10.0 Pillar 1, Sub-pillar 2
    Checks robots.txt for crawlability, sitemap presence, and integrity.
    Produces structured findings per agent and per path.

    Results accumulate in ``self.report`` (keys ``url``, ``status``,
    ``score``, ``findings``, ``recommendations``, ``timestamp``).  Findings
    are dicts; recommendations are plain strings.
    """
    # Paths probed for every agent in addition to the site root
    CRITICAL_PATHS = ["/static/", "/assets/", "/css/", "/js/", "/images/", "/media/"]

    # Display name -> user-agent token tested when no custom mapping is given
    DEFAULT_AGENTS = {
        "Wildcard": "*",
        "Google Search": "Googlebot",
        "Google AI": "Google-Extended",
        "OpenAI AI": "GPTBot",
        "Anthropic AI": "anthropic-ai"
    }

    def __init__(self, base_url, agents=None):
        """
        Args:
            base_url: Site to audit.  May be a bare host, a host:port pair or
                a full URL; only scheme and host[:port] are kept.
            agents: Optional ``{display name: user-agent token}`` mapping.
                Falls back to ``DEFAULT_AGENTS`` when omitted or empty.
        """
        self.base_url = self._format_base_url(base_url)
        self.robots_url = urljoin(self.base_url, "robots.txt")
        self.report = {
            "url": self.base_url,
            "status": "Not Assessed",
            "score": 0,
            "findings": [],
            "recommendations": [],
            "timestamp": None
        }
        self.robots_content = ""
        self.parser = urllib.robotparser.RobotFileParser()
        self.USER_AGENTS = agents or self.DEFAULT_AGENTS

    def _format_base_url(self, url):
        """Reduce ``url`` to ``"<scheme>://<host[:port]>"``, defaulting to https.

        ``https://`` is prepended whenever the input does not parse into both
        a scheme and a network location.  Checking the scheme alone is not
        enough: on recent Python versions ``urlparse("example.com:8080")``
        reports ``example.com`` as the scheme and leaves the netloc empty.
        """
        url = url.strip()
        parsed = urlparse(url)
        if not (parsed.scheme and parsed.netloc):
            # lstrip handles scheme-relative input such as "//example.com"
            parsed = urlparse("https://" + url.lstrip("/"))
        return f"{parsed.scheme}://{parsed.netloc}"

    @staticmethod
    def _decode_robots_body(raw):
        """Decode raw robots.txt bytes as UTF-8 and drop a leading BOM.

        A byte-order mark left in front of the first ``User-agent`` line makes
        ``RobotFileParser`` ignore that line, which in turn discards the rules
        that follow it.  Undecodable bytes are replaced rather than raising.
        """
        return raw.decode("utf-8-sig", errors="replace")

    def _sitemap_urls(self):
        """Return the values of all ``Sitemap`` directives in the fetched file.

        Lines are examined one by one with ``#`` comments removed, so a
        comment that merely mentions "sitemap:" is not counted, while
        ``Sitemap : <url>`` (whitespace before the colon) is.
        """
        found = []
        for raw_line in (self.robots_content or "").splitlines():
            line = raw_line.split("#", 1)[0].strip()
            key, sep, value = line.partition(":")
            value = value.strip()
            if sep and value and key.strip().lower() == "sitemap":
                found.append(value)
        return found

    def fetch_robots_txt(self):
        """Download robots.txt and load it into ``self.parser``.

        On HTTP 200 the decoded body is stored in ``self.robots_content``.
        Any other status code yields a ``robots_missing`` finding; a
        ``requests`` error yields a ``robots_network_error`` finding.

        Returns:
            bool: True only when the file was downloaded with HTTP 200.
        """
        print_status(f"Fetching {self.robots_url}", "IN PROGRESS")
        try:
            resp = requests.get(self.robots_url, timeout=10,
                                headers={'User-Agent': 'ARI-Robots-Analyzer/1.0'})
            if resp.status_code == 200:
                # Decode from bytes ourselves so a BOM or a guessed charset cannot corrupt line 1
                self.robots_content = self._decode_robots_body(resp.content)
                self.parser.parse(self.robots_content.splitlines())
                print_status("robots.txt fetched successfully", "PASS")
                return True
            else:
                self.report["findings"].append({
                    "issue": "robots_missing",
                    "status": "WARN",
                    "http_status": resp.status_code,
                    "message": "robots.txt missing or inaccessible"
                })
                print_status(f"robots.txt inaccessible (HTTP {resp.status_code})", "WARN")
                return False
        except requests.exceptions.RequestException as e:
            self.report["findings"].append({
                "issue": "robots_network_error",
                "status": "FAIL",
                "message": str(e)
            })
            print_status("Failed to fetch robots.txt (Network Error)", "FAIL")
            return False

    def check_sitemap(self):
        """Record whether robots.txt declares at least one Sitemap directive.

        The ``sitemap_present`` finding additionally lists the declared URLs
        under ``"sitemaps"``.  The URLs are not fetched.

        Returns:
            bool: True when a non-empty Sitemap directive exists.
        """
        sitemap_urls = self._sitemap_urls()
        if sitemap_urls:
            self.report["findings"].append({
                "issue": "sitemap_present",
                "status": "PASS",
                "message": "Sitemap directive is present",
                "sitemaps": sitemap_urls
            })
            print_status("Sitemap directive check", "PASS")
            return True
        else:
            self.report["findings"].append({
                "issue": "sitemap_missing",
                "status": "WARN",
                "message": "Sitemap directive is missing"
            })
            print_status("Sitemap directive check", "WARN")
            return False

    def check_crawlability(self):
        """Test every configured agent against the root and ``CRITICAL_PATHS``.

        Appends one finding dict per agent with keys ``agent``,
        ``user_agent``, ``root_access``, ``blocked_paths``, ``status`` and
        ``message`` (``status``/``message`` reflect root access only), and
        adds a recommendation for each blocked root or blocked path set.

        Returns:
            bool: True when the wildcard agent ``*`` may not fetch the root.
        """
        print_subheader("Agent Crawlability Analysis")
        global_block = False

        for name, agent in self.USER_AGENTS.items():
            agent_result = {"agent": name, "user_agent": agent, "root_access": True, "blocked_paths": []}

            # Root access
            can_fetch_root = self.parser.can_fetch(agent, self.base_url + "/")
            agent_result["root_access"] = can_fetch_root
            if not can_fetch_root:
                agent_result["status"] = "FAIL"
                agent_result["message"] = "Blocked from root"
                self.report["recommendations"].append(
                    f"Allow '{name}' to crawl the root path '/'"
                )
                if agent == "*":
                    global_block = True
            else:
                agent_result["status"] = "PASS"
                agent_result["message"] = "Root accessible"

            print_status(f"Root access for '{name}' ({agent})", agent_result["status"])

            # Critical paths
            agent_result["blocked_paths"] = [
                path for path in self.CRITICAL_PATHS
                if not self.parser.can_fetch(agent, self.base_url + path)
            ]

            if agent_result["blocked_paths"]:
                print_status(f"Critical resource access for '{name}'", "FAIL")
                self.report["recommendations"].append(
                    f"Agent '{name}' blocked from critical paths: {', '.join(agent_result['blocked_paths'])}"
                )
            else:
                print_status(f"Critical resource access for '{name}'", "PASS")

            self.report["findings"].append(agent_result)

        return global_block

    def run_analysis(self):
        """Run all checks, compute the score and print the final report.

        Scoring as implemented below:
          * robots.txt could not be fetched: score 40, status "Poor".
          * base 70, +20 when a sitemap is declared.
          * wildcard agent blocked from the root: score 0, "Critical Failure".
          * otherwise 5 points are deducted per blocked (agent, path) pair, up
            to 30; >= 65 is "Good", anything lower "Needs Improvement".
          * an empty file or one without any "user-agent" text adds an
            ``empty_or_malformed`` finding, caps the score at 30 and forces
            status "Poor".

        The UTC timestamp of the run is stored in ``report["timestamp"]``.
        Nothing is returned.
        """
        import datetime
        self.report["timestamp"] = datetime.datetime.now(datetime.timezone.utc).isoformat()
        print_header("ARI Sub-Pillar 1.2: Crawlability & Directive Integrity")

        if not self.fetch_robots_txt():
            self.report["status"] = "Poor"
            self.report["score"] = 40
            self.report["recommendations"].append(
                "Create a properly formatted robots.txt file to provide clear instructions."
            )
            self._print_final_report()
            return

        has_sitemap = self.check_sitemap()
        is_blocked = self.check_crawlability()

        # Scoring
        score = 70
        if has_sitemap:
            score += 20
        if is_blocked:
            score = 0
            self.report["status"] = "Critical Failure"
            self.report["recommendations"].append(
                "Site is globally blocked (Disallow: / for User-agent: *). Critical barrier for all agents."
            )
        else:
            # Penalties for blocked paths
            blocked_penalty = sum(len(f.get("blocked_paths", [])) for f in self.report["findings"])
            score -= min(blocked_penalty * 5, 30)  # max -30 for blocked paths
            self.report["status"] = "Good" if score >= 65 else "Needs Improvement"

        # Handle empty/malformed robots.txt
        if not self.robots_content.strip() or "user-agent" not in self.robots_content.lower():
            # Keep the machine-readable findings in step with the recommendation text
            self.report["findings"].append({
                "issue": "empty_or_malformed",
                "status": "FAIL",
                "message": "robots.txt is empty or has no User-agent directive"
            })
            self.report["recommendations"].append(
                "The robots.txt file is empty or malformed. Add User-agent and Allow/Disallow directives."
            )
            score = min(score, 30)
            self.report["status"] = "Poor"

        self.report["score"] = max(0, score)
        self._print_final_report()

    def _print_final_report(self):
        """Print status, score, each finding as indented JSON, and any recommendations."""
        print_header("Final Assessment Report")
        print_status("Overall Status", self.report["status"])
        print_status("ARI Score (out of 100)", self.report["score"])

        print_subheader("Findings")
        for f in self.report["findings"]:
            print(json.dumps(f, indent=2))

        if self.report["recommendations"]:
            print_subheader("Recommendations")
            for rec in self.report["recommendations"]:
                print_recommendation(rec)

    def export_json_report(self, path="robots_report.json"):
        """Write ``self.report`` to ``path`` as indented JSON (UTF-8).

        Args:
            path: Destination file; overwritten if it exists.
        """
        with open(path, "w", encoding="utf-8") as f:
            json.dump(self.report, f, indent=2)
        print_status(f"JSON report exported to {path}", "INFO")


In [7]:
def analyze_robots_txt(url, export_path=None):
    """
    Run the complete robots.txt audit for ``url`` and hand back its report.

    Args:
        url: Site to audit, passed straight to ``RobotsTxtAnalyzer``.
        export_path: When truthy, the report is also written to this path
            via ``export_json_report``.

    Returns:
        The analyzer's ``report`` object.
    """
    robots_audit = RobotsTxtAnalyzer(url)
    robots_audit.run_analysis()

    # Nothing to persist: return the in-memory report directly
    if not export_path:
        return robots_audit.report

    robots_audit.export_json_report(export_path)
    return robots_audit.report


# ---------------------------
# Example function call
# ---------------------------
# Ask which site to audit, run the analysis and persist the JSON report
target_url = input("Enter URL :::: ").strip()
report = analyze_robots_txt(url=target_url, export_path="robots_report.json")

# Pretty-print the returned report dict beneath a banner
import json
print("\n" + "=" * 60, "RAW REPORT OBJECT", "=" * 60, sep="\n")
print(json.dumps(report, indent=2))


Enter URL ::::  https://buildbridges.co/



 ARI Sub-Pillar 1.2: Crawlability & Directive Integrity
Fetching https://buildbridges.co/robots.txt        [[0mIN PROGRESS[0m]
robots.txt fetched successfully                    [[92mPASS[0m]
Sitemap directive check                            [[93mWARN[0m]

------------------------------------------------------------
 Agent Crawlability Analysis
------------------------------------------------------------
Root access for 'Wildcard' (*)                     [[92mPASS[0m]
Critical resource access for 'Wildcard'            [[92mPASS[0m]
Root access for 'Google Search' (Googlebot)        [[92mPASS[0m]
Critical resource access for 'Google Search'       [[92mPASS[0m]
Root access for 'Google AI' (Google-Extended)      [[92mPASS[0m]
Critical resource access for 'Google AI'           [[92mPASS[0m]
Root access for 'OpenAI AI' (GPTBot)               [[92mPASS[0m]
Critical resource access for 'OpenAI AI'           [[92mPASS[0m]
Root access for 'Anthropic AI' (anthropic-ai)   