># Interactive Privacy Notice & Cookie Choice Scraper
>
>    This notebook allows you to interactively scrape privacy notices and cookie choices from websites, capturing the layered interaction patterns. It will:
>    1. Visit a website and capture the initial privacy/cookie notice
>    2. Take screenshots of each layer/state
>    3. Let you choose which options to click and explore
>    4. Record all text, buttons, and links at each state
>    5. Store the data in a hierarchical structure
>
>    This approach gives you complete control over the exploration process while automating the data collection.

## 1. Setup and Dependencies

In [2]:
# Install required packages if needed
#!pip install selenium webdriver-manager beautifulsoup4 pandas requests IPython tqdm pillow openpyxl

In [3]:
# Import libraries
import os
import time
import uuid
import pandas as pd
from datetime import datetime
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, StaleElementReferenceException, NoSuchElementException
from webdriver_manager.chrome import ChromeDriverManager
from IPython.display import display, Image, HTML
import re
from urllib.parse import urlparse
import json
import uuid
from tqdm.notebook import tqdm

## 2. Configure Settings

In [29]:
# Configuration settings
# Directory that take_screenshot() writes PNG files into.
SCREENSHOTS_DIR = "screenshots"
# Name of the Excel workbook intended to hold the recorded states.
DATA_FILE = "privacy_notice_data.xlsx"
# NOTE: explore_notice_interactively() declares its own max_depth=3 default,
# so this value only takes effect when it is passed in explicitly.
MAX_DEPTH = 3  # Maximum click depth to explore
# Used as a fixed time.sleep() delay after page loads and clicks.
WAIT_TIME = 5  # Time in seconds to wait for elements to load


# Create screenshot directory if it doesn't exist
os.makedirs(SCREENSHOTS_DIR, exist_ok=True)


# Websites to analyze
# Each entry is a full URL including the scheme; take_screenshot() derives the
# screenshot file name from the URL's network location.
websites = [
    # Major news/media sites
    "https://www.theguardian.com",
    #"https://www.nytimes.com"
]

In [30]:
# # Configuration settings
# SCREENSHOTS_DIR = "screenshots"
# DATA_FILE = "privacy_notice_data.xlsx"
# MAX_DEPTH = 3  # Maximum click depth to explore
# WAIT_TIME = 5  # Time in seconds to wait for elements to load


# # Create screenshot directory if it doesn't exist
# os.makedirs(SCREENSHOTS_DIR, exist_ok=True)


# # Websites to analyze
# websites = [
#     # Major news/media sites
#     "https://www.theguardian.com",
#     "https://www.nytimes.com",
#     "https://www.bbc.com",
#     "https://www.cnn.com",
    
#     # Tech companies
#     "https://www.microsoft.com",
#     "https://www.google.com",
#     "https://www.apple.com",
#     "https://www.meta.com",
    
#     # E-commerce
#     "https://www.amazon.com",
#     "https://www.ebay.com",
#     "https://www.etsy.com",
#     "https://www.walmart.com",
    
#     # Social media
#     "https://www.reddit.com",
#     "https://www.twitter.com",
#     "https://www.linkedin.com",
#     "https://www.pinterest.com",
    
#     # Entertainment
#     "https://www.netflix.com",
#     "https://www.spotify.com",
#     "https://www.youtube.com"
# ]

## 3. Initialize WebDriver

In [31]:
def initialize_driver(headless=False, max_attempts=3, retry_delay=2, page_load_timeout=30):
    """Create and return a configured Chrome WebDriver, retrying on failure.

    Args:
        headless: When True, start Chrome with the ``--headless`` flag.
        max_attempts: How many times to try starting the browser (>= 1).
        retry_delay: Seconds to sleep between failed attempts.
        page_load_timeout: Page-load timeout, in seconds, set on the driver.

    Returns:
        The started WebDriver instance.

    Raises:
        ValueError: If ``max_attempts`` is smaller than 1.
        Exception: Whatever the final failed attempt raised.
    """
    if max_attempts < 1:
        raise ValueError("max_attempts must be at least 1")

    chrome_options = Options()
    if headless:
        chrome_options.add_argument("--headless")
    for flag in (
        "--disable-gpu",
        "--window-size=1920,1080",
        "--disable-notifications",
        "--no-sandbox",
        "--disable-dev-shm-usage",
    ):
        chrome_options.add_argument(flag)

    # Add user agent to appear more like a regular browser.
    # NOTE(review): this string pins Chrome 91; consider refreshing it.
    chrome_options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36")

    for attempt in range(1, max_attempts + 1):
        driver = None
        try:
            service = Service(ChromeDriverManager().install())
            driver = webdriver.Chrome(service=service, options=chrome_options)
            driver.set_page_load_timeout(page_load_timeout)
            return driver
        except Exception as e:
            print(f"Attempt {attempt} failed: {e}")
            if driver is not None:
                # The browser started but configuring it failed: close it so
                # the retry does not leave an orphaned Chrome process behind.
                try:
                    driver.quit()
                except Exception as quit_error:
                    print(f"Could not close partially initialized browser: {quit_error}")
            if attempt == max_attempts:
                raise
            time.sleep(retry_delay)

# Initialize the driver (visible browser for interaction)
# This creates one module-level WebDriver when the cell runs; headless=False
# keeps the browser window visible.
driver = initialize_driver(headless=False)
print("WebDriver initialized successfully!")

WebDriver initialized successfully!


## 4. Core Scraping Functions

In [32]:
def take_screenshot(driver, website, state_id=None):
    """Save a screenshot of the current browser view and return its path.

    The file name is built from the site's network location (dots replaced by
    underscores), an optional ``_state_<state_id>`` part, and a timestamp with
    one-second resolution. Files are written into SCREENSHOTS_DIR.
    """
    name_parts = [urlparse(website).netloc.replace('.', '_')]
    if state_id:
        name_parts.append(f"_state_{state_id}")
    name_parts.append(datetime.now().strftime("_%Y%m%d_%H%M%S"))

    filepath = os.path.join(SCREENSHOTS_DIR, "".join(name_parts) + ".png")
    driver.save_screenshot(filepath)
    return filepath

In [33]:
def find_cookie_notice_iframes(driver):
    """Return the iframes whose id, class or src hints at a consent notice.

    An iframe qualifies when any of those three attributes contains one of the
    consent-related keywords (case-insensitive). Iframes that go stale while
    being inspected are skipped; any other error is printed and whatever was
    collected so far is returned.
    """
    keywords = ['cookie', 'consent', 'privacy', 'gdpr', 'ccpa', 'notice', 'cmp']
    matches = []
    try:
        for frame in driver.find_elements(By.TAG_NAME, "iframe"):
            try:
                attribute_values = [
                    (frame.get_attribute(attribute) or "").lower()
                    for attribute in ("id", "class", "src")
                ]
            except StaleElementReferenceException:
                continue
            if any(keyword in value for value in attribute_values for keyword in keywords):
                matches.append(frame)
    except Exception as e:
        print(f"Error finding iframes: {e}")

    return matches

In [34]:
def find_cookie_notice(driver):
    """Locate candidate cookie-notice elements in the current document.

    Every XPath in COOKIE_SELECTORS is evaluated against the current browsing
    context. An element is kept when it is displayed and has more than 10
    characters of visible text (an arbitrary threshold that filters out
    unrelated elements). Each element is reported once, in first-match order,
    even when several selectors match it.

    Returns:
        A ``(notice_elements, iframe_elements)`` tuple: the matching elements
        and the iframes reported by find_cookie_notice_iframes().
    """
    notice_elements = []
    seen = set()  # WebElements compare and hash by their remote element id

    # First check for iframes that might contain cookie notices
    iframe_elements = find_cookie_notice_iframes(driver)

    # Then check for cookie notice elements in the main document
    for selector in COOKIE_SELECTORS:
        try:
            for element in driver.find_elements(By.XPATH, selector):
                if element in seen:
                    continue
                seen.add(element)
                try:
                    if not element.is_displayed():
                        continue
                    text = element.text.strip()
                except StaleElementReferenceException:
                    # One element vanishing must not discard the remaining
                    # matches of this selector.
                    continue
                if len(text) > 10:
                    notice_elements.append(element)
        except Exception as e:
            print(f"Error with selector {selector}: {e}")

    return notice_elements, iframe_elements

In [35]:
def extract_notice_details(element):
    """Collect the text, visible buttons and visible links of a notice element.

    Returns a dict with:
        "text": the element's stripped visible text,
        "buttons": ``{"text", "element"}`` dicts for displayed ``<button>``
            elements and span/div elements with ``role='button'`` that have text,
        "links": ``{"text", "href", "element"}`` dicts for displayed ``<a>``
            elements that have both text and an href.

    A failure in one category is printed and does not affect the others.
    """
    found_buttons = []
    found_links = []
    summary = {
        "text": element.text.strip(),
        "buttons": found_buttons,
        "links": found_links
    }

    # Real <button> tags first, then span/div elements acting as buttons.
    button_queries = (
        (By.TAG_NAME, "button", "Error extracting buttons"),
        (By.CSS_SELECTOR, "span[role='button'], div[role='button']", "Error extracting span/div buttons"),
    )
    for locator, query, failure_label in button_queries:
        try:
            for candidate in element.find_elements(locator, query):
                if not candidate.is_displayed():
                    continue
                label = candidate.text.strip()
                if label:
                    found_buttons.append({"text": label, "element": candidate})
        except Exception as e:
            print(f"{failure_label}: {e}")

    # Anchors are only kept when they have both a label and a target.
    try:
        for anchor in element.find_elements(By.TAG_NAME, "a"):
            if not anchor.is_displayed():
                continue
            label = anchor.text.strip()
            target = anchor.get_attribute("href")
            if label and target:
                found_links.append({"text": label, "href": target, "element": anchor})
    except Exception as e:
        print(f"Error extracting links: {e}")

    return summary

In [None]:
def switch_to_iframe_and_extract(driver, iframe):
    """Extract notice details from inside an iframe, then leave the frame.

    The driver is switched into ``iframe``. The first element reported by
    find_cookie_notice() is used, or the frame's ``<body>`` when none is found.
    Each extracted link also gets a ``"direct_url"`` key holding its href so it
    can be navigated to directly.

    Returns:
        The notice-details dict, or an empty one
        (``{"text": "", "buttons": [], "links": []}``) when extraction fails.

    NOTE(review): the returned ``"element"`` references were located inside
    the iframe; they may not be usable after the driver has switched back to
    the top-level document - confirm before clicking them.
    """
    try:
        driver.switch_to.frame(iframe)
        notice_elements, _ = find_cookie_notice(driver)

        if notice_elements:
            target = notice_elements[0]
        else:
            # If no specific notice elements found, use body
            target = driver.find_element(By.TAG_NAME, "body")
        notice_details = extract_notice_details(target)

        # Store actual href values that can be navigated to directly
        for link in notice_details["links"]:
            link["direct_url"] = link["href"]

        return notice_details
    except Exception as e:
        print(f"Error processing iframe: {e}")
        return {"text": "", "buttons": [], "links": []}
    finally:
        # Always return to the top-level document, on success and on failure.
        driver.switch_to.default_content()

In [None]:
def extract_policy_page_content(driver):
    """Extract the main text and links of a privacy/cookie policy page.

    Common content containers are tried in order. The first one holding more
    than 200 characters of text supplies the text and its visible links. If
    none qualifies, the ``<body>`` text is used, truncated to 5000 characters
    plus "..." and without links.

    Returns:
        A dict with "text", "buttons" (always empty) and "links"
        (``{"text", "href", "element"}`` dicts). If extraction fails before
        any text was captured, "text" is "Error extracting policy content".
    """
    policy_details = {
        "text": "",
        "buttons": [],
        "links": []
    }
    content_selectors = (
        "main",
        "article",
        ".content",
        "#content",
        ".privacy-policy",
        ".cookie-policy",
        ".policy-content",
    )

    try:
        for selector in content_selectors:
            try:
                content_element = driver.find_element(By.CSS_SELECTOR, selector)
            except NoSuchElementException:
                continue

            policy_text = content_element.text
            if len(policy_text) <= 200:  # Not substantial enough, try the next container
                continue

            policy_details["text"] = policy_text
            for link in content_element.find_elements(By.TAG_NAME, "a"):
                try:
                    if not link.is_displayed():
                        continue
                    link_text = link.text.strip()
                    link_href = link.get_attribute("href")
                except StaleElementReferenceException:
                    # A single vanished link must not discard the page text.
                    continue
                if link_text and link_href:
                    policy_details["links"].append({
                        "text": link_text,
                        "href": link_href,
                        "element": link
                    })
            return policy_details

        # If no content found with specific selectors, fall back to body.
        # Read the text once: each access is a round-trip to the browser.
        body_text = driver.find_element(By.TAG_NAME, "body").text
        policy_details["text"] = body_text[:5000] + "..." if len(body_text) > 5000 else body_text

    except Exception as e:
        print(f"Error extracting policy content: {e}")
        # Keep text that was already captured; only flag the error otherwise.
        if not policy_details["text"]:
            policy_details["text"] = "Error extracting policy content"

    return policy_details

In [37]:
def display_screenshot(filepath):
    """Render the screenshot stored at ``filepath`` inline, 800 pixels wide."""
    screenshot = Image(filename=filepath, width=800)
    display(screenshot)

In [38]:
def get_actionable_elements(driver):
    """Find the visible, labelled, clickable-looking elements on the page.

    Several overlapping CSS selectors are evaluated (for example a
    ``<button class="btn">`` matches four of them). Each element is reported
    once, under the first selector that matched it.

    Returns:
        A list of dicts with "text" (visible text, else the ``value`` or
        ``title`` attribute), "element", "type" (tag name) and "location".
    """
    # Look for anything that seems clickable
    selectors = (
        "button",
        "a",
        "input[type='button']",
        "input[type='submit']",
        "[role='button']",
        ".button",
        ".btn",
        "[class*='button']",
        "[class*='btn']",
    )

    actionable = []
    seen = set()  # WebElements compare and hash by their remote element id

    for selector in selectors:
        try:
            for element in driver.find_elements(By.CSS_SELECTOR, selector):
                if element in seen:
                    continue
                seen.add(element)
                try:
                    if not element.is_displayed():
                        continue
                    text = element.text.strip() or element.get_attribute("value") or element.get_attribute("title")
                    if text:
                        actionable.append({
                            "text": text,
                            "element": element,
                            "type": element.tag_name,
                            "location": element.location
                        })
                except StaleElementReferenceException:
                    # Skip elements that vanished; keep scanning the rest.
                    continue
        except Exception as e:
            print(f"Error finding {selector} elements: {e}")
    return actionable

In [39]:
def highlight_element(driver, element, duration=0.5):
    """Temporarily outline an element in red, then restore its inline style.

    The highlight is appended to the element's existing inline style, so the
    element keeps its own styling while highlighted. The original style is
    restored even if the wait is interrupted.

    Args:
        driver: WebDriver used to run the style-changing script.
        element: The element to highlight.
        duration: Seconds to keep the highlight visible.
    """
    set_style_script = "arguments[0].setAttribute('style', arguments[1]);"
    highlight_css = "border: 3px solid red; background-color: rgba(255, 0, 0, 0.1);"

    # A missing style attribute is treated as empty so that the restore step
    # never writes a null value back into the attribute.
    original_style = element.get_attribute("style") or ""

    prefix = original_style.strip()
    if prefix and not prefix.endswith(";"):
        prefix += ";"
    highlighted_style = f"{prefix} {highlight_css}".strip()

    driver.execute_script(set_style_script, element, highlighted_style)
    try:
        time.sleep(duration)
    finally:
        driver.execute_script(set_style_script, element, original_style)

In [40]:
def analyze_cookie_mechanisms(website):
    """Return a blank consent-mechanism summary for ``website``.

    All boolean flags start as False, the notice placement as None and the
    dark-pattern list empty. This function only creates the record; callers
    fill it in.
    """
    flag_names = (
        "has_explicit_consent",
        "has_reject_option",
        "has_link_to_policy",
        "has_granular_options",
        "has_preselected_options",
    )
    summary = {"website": website}
    summary.update(dict.fromkeys(flag_names, False))
    summary["notice_placement"] = None
    summary["dark_patterns"] = []
    return summary

In [41]:
def detect_dark_patterns(notice_details):
    """Detect potential dark patterns in a cookie notice.

    Phrases are matched as whole words or phrases, case-insensitively, so
    short terms such as "no" or "yes" do not fire inside unrelated words
    ("Know more", "Notifications", "Disagree").

    Args:
        notice_details: Dict with a "text" string and a "buttons" list of
            dicts that each carry a "text" string.

    Returns:
        A list of pattern names. It may contain "Confirmshaming" (the notice
        text uses a guilt-inducing opt-out phrase) and
        "No explicit reject option" (an accept-style button exists but no
        reject-style button does).
    """
    def mentions(text, phrases):
        # Lookarounds instead of \b so phrases with apostrophes or spaces
        # are still required to start and end on a word edge.
        return any(
            re.search(rf"(?<!\w){re.escape(phrase)}(?!\w)", text)
            for phrase in phrases
        )

    confirmshame_phrases = ["no thanks", "i don't care", "i'm not interested", "ignore", "later", "not now"]
    accept_terms = ["accept", "agree", "allow", "yes", "okay", "got it", "continue"]
    reject_terms = ["reject", "decline", "no", "later", "not now", "opt out", "don't sell"]

    patterns = []

    # Check for confirmshaming (making users feel bad for opting out)
    if mentions(notice_details["text"].lower(), confirmshame_phrases):
        patterns.append("Confirmshaming")

    button_texts = [button["text"].lower() for button in notice_details["buttons"]]
    has_accept = any(mentions(text, accept_terms) for text in button_texts)
    has_reject = any(mentions(text, reject_terms) for text in button_texts)

    # Check for absence of reject button
    if has_accept and not has_reject:
        patterns.append("No explicit reject option")

    return patterns

In [42]:
def decode_hierarchy(data):
    """Convert flat rows with parent/child IDs into a tree.

    Each row must have a "StateID" and a "ParentID" key. Rows whose ParentID
    is None become roots. Every other row is attached to the row whose StateID
    equals its ParentID, wherever that parent appears in ``data``. Rows whose
    parent does not exist, or that are not reachable from a root, are left
    out of the result.

    The input rows are not modified: each node is a shallow copy of its row
    with an added "children" list.

    Returns:
        The list of root nodes, in input order.
    """
    nodes = []
    first_node_by_id = {}
    for row in data:
        node = row.copy()
        node["children"] = []
        nodes.append(node)
        # If a StateID occurs more than once, children attach to the first row.
        first_node_by_id.setdefault(row["StateID"], node)

    hierarchy = []
    for node in nodes:
        parent_id = node["ParentID"]
        if parent_id is None:
            hierarchy.append(node)
            continue
        parent = first_node_by_id.get(parent_id)
        # A row naming itself as parent would create a cycle; skip it.
        if parent is not None and parent is not node:
            parent["children"].append(node)

    return hierarchy

In [43]:
def clean_data_for_excel(data):
    """Return copies of the rows without their 'element' and 'children' keys.

    Those two top-level keys hold Selenium elements and nested child nodes.
    The input rows are left untouched.
    """
    excluded_keys = ('element', 'children')
    return [
        {key: value for key, value in record.items() if key not in excluded_keys}
        for record in data
    ]

>## 5. Interactive Exploration Functions
>
>    This section contains the functions that allow you to interactively explore cookie notices and privacy policies. You'll be able to:
>
>    1. Navigate through different layers of cookie notices
>    2. Click different buttons and links to see how they change the notice
>    3. Record the state of each interaction
>    4. Track the hierarchical structure of choices

In [44]:
# Common selectors for cookie notices
# XPath expressions that find_cookie_notice() evaluates with By.XPATH, in this
# order; the first matching element that passes its filters is the one the
# exploration code uses. XPath contains() is case-sensitive, so these only
# match lower-case class/id fragments exactly as written (except the
# Cookiebot id).
# NOTE(review): the generic entries ('banner', 'alert', 'modal', 'overlay',
# ...) can also match elements unrelated to consent - confirm this breadth is
# intended.
COOKIE_SELECTORS = [
    # General cookie and consent banners
    "//div[contains(@class, 'cookie')]",
    "//div[contains(@class, 'gdpr')]",
    "//div[contains(@class, 'consent')]",
    "//div[contains(@id, 'cookie')]",
    "//div[contains(@id, 'consent')]",
    "//div[contains(@id, 'privacy')]",
    "//div[contains(@class, 'privacy-banner')]",
    "//div[contains(@class, 'banner')]",
    "//div[contains(@class, 'notice')]",
    "//div[contains(@class, 'alert')]",
    "//div[contains(@class, 'notification')]",
    "//div[contains(@class, 'popup')]",
    "//div[contains(@class, 'modal')]",
    "//div[contains(@class, 'overlay')]",
    
    # Common third-party consent platforms
    # NOTE(review): find_cookie_notice() keeps only elements with more than 10
    # characters of visible text; an <iframe> element presumably exposes none
    # from the parent document, so the entry below may never yield a result -
    # verify.
    "//iframe[contains(@id, 'sp_message_iframe')]",  # Sourcepoint CMP
    "//div[contains(@id, 'onetrust')]",              # OneTrust
    "//div[contains(@id, 'truste')]",                # TrustArc
    "//div[contains(@id, 'didomi')]",                # Didomi
    "//div[contains(@id, 'cmp')]",                   # Generic CMP
    "//div[contains(@id, 'CybotCookiebotDialog')]",  # Cookiebot
    "//div[contains(@id, 'quantcast')]",             # Quantcast
    "//div[contains(@id, 'termly')]",                # Termly
    "//div[contains(@class, 'termly')]",             # Termly
    "//div[contains(@id, 'osano')]",                 # Osano
    "//div[contains(@class, 'osano')]",              # Osano
    
    # Fixed position banners (bottom/top of page)
    "//div[contains(@class, 'fixed') and (contains(@class, 'bottom') or contains(@class, 'top'))]",
    "//footer[contains(@class, 'cookie') or contains(@id, 'cookie')]"
]

In [45]:
def find_cookie_notice_iframes(driver):
    """Find iframes that look like they host a cookie/consent notice.

    This cell re-defines the function of the same name from the core scraping
    section with the same behaviour; once it runs, this definition is the one
    in effect.

    An iframe is returned when its id, class or src contains a consent-related
    keyword (case-insensitive). Stale iframes are skipped; other errors are
    printed and the iframes collected so far are returned.
    """
    keywords = ['cookie', 'consent', 'privacy', 'gdpr', 'ccpa', 'notice', 'cmp']
    candidates = []
    try:
        all_frames = driver.find_elements(By.TAG_NAME, "iframe")
        for frame in all_frames:
            try:
                frame_id = frame.get_attribute("id") or ""
                frame_class = frame.get_attribute("class") or ""
                frame_src = frame.get_attribute("src") or ""
            except StaleElementReferenceException:
                continue

            # No keyword contains a newline, so searching the joined string is
            # the same as searching each attribute separately.
            haystack = "\n".join((frame_id, frame_class, frame_src)).lower()
            for keyword in keywords:
                if keyword in haystack:
                    candidates.append(frame)
                    break
    except Exception as e:
        print(f"Error finding iframes: {e}")

    return candidates


In [46]:
def display_options(notice_details):
    """Print the notice text, its buttons and links, and prompt for a choice.

    Buttons and links are numbered from 1. The user's raw answer is returned
    unmodified.
    """
    print(f"\n--- Notice Text ---\n{notice_details['text']}\n")

    print("--- Available Buttons ---")
    for number, button in enumerate(notice_details['buttons'], start=1):
        print(f"{number}. {button['text']}")

    print("\n--- Available Links ---")
    for number, link in enumerate(notice_details['links'], start=1):
        print(f"{number}. {link['text']} => {link['href']}")

    action_help = (
        "\n--- Actions ---",
        "To click a button, enter: B<number> (e.g., B1 for the first button)",
        "To click a link, enter: L<number> (e.g., L1 for the first link)",
        "To go back, enter: BACK",
        "To finish exploration, enter: DONE",
    )
    for help_line in action_help:
        print(help_line)

    return input("Enter your choice: ")


In [47]:
def record_state(data, website, parent_id, state_id, notice_details, screenshot_path, choice=None, max_links=5):
    """Append one row describing the current notice state to ``data``.

    Args:
        data: List of row dicts; it is appended to in place.
        website: URL of the site being explored.
        parent_id: StateID of the state this one was reached from (None for
            the root state).
        state_id: Unique ID of this state.
        notice_details: Dict with "text", "buttons" and "links" as produced
            by the extraction helpers.
        screenshot_path: Path of the screenshot taken for this state.
        choice: Label of the button/link that led to this state, if any.
        max_links: Number of "Link N" / "Link N Detail" column pairs to emit.
            Extra links are dropped and missing ones are filled with None, so
            every row has the same columns.

    Returns:
        The same ``data`` list, for call-site convenience.
    """
    # Fixed set of link columns so every row has an identical layout.
    links = notice_details['links'][:max_links]
    links_data = {}
    for position in range(1, max_links + 1):
        link = links[position - 1] if position <= len(links) else None
        links_data[f"Link {position}"] = link['text'] if link else None
        links_data[f"Link {position} Detail"] = link['href'] if link else None

    # Record every offered choice, not only the first button.
    button_texts = [button['text'] for button in notice_details['buttons']]
    choices_provided = " | ".join(button_texts) if button_texts else "No choices detected"

    row = {
        "Website": website,
        "ParentID": parent_id,  # ID of the parent state (None for root)
        "StateID": state_id,    # Unique ID for this state
        "Snapshot": screenshot_path,
        "Text of Notice": notice_details['text'],
        "Choice Provided": choices_provided,
        "ChoiceMade": choice,  # What choice led to this state
        **links_data             # Add all the link columns
    }

    data.append(row)
    return data

In [None]:
def _find_notice_details(driver):
    """Return details of the first cookie notice on the current page, or None."""
    notice_elements, iframe_elements = find_cookie_notice(driver)
    if notice_elements:
        return extract_notice_details(notice_elements[0])
    if iframe_elements:
        return switch_to_iframe_and_extract(driver, iframe_elements[0])
    return None


def _empty_notice(text):
    """Return a notice-details dict that carries only a message."""
    return {"text": text, "buttons": [], "links": []}


def _capture_page_content(driver):
    """Return details of a page's main content, falling back to a body excerpt."""
    content_queries = (
        (By.TAG_NAME, "main"),
        (By.CSS_SELECTOR, "article, .content, #content, .privacy-policy"),
    )
    for locator, query in content_queries:
        try:
            return extract_notice_details(driver.find_element(locator, query))
        except (NoSuchElementException, StaleElementReferenceException):
            continue
    body = driver.find_element(By.TAG_NAME, "body")
    return _empty_notice("Policy page content (excerpt):\n" + body.text[:500] + "...")


def _resolve_link_target(current_url, href):
    """Return an absolute http(s) URL for ``href``, or None if not navigable."""
    from urllib.parse import urljoin

    target = urljoin(current_url, href)
    return target if urlparse(target).scheme in ("http", "https") else None


def _update_mechanisms(mechanisms, notice_details):
    """Fold the buttons and links of one notice layer into the summary."""
    button_texts = [button['text'].lower() for button in notice_details['buttons']]
    link_texts = [link['text'].lower() for link in notice_details['links']]

    def any_contains(texts, terms):
        return any(term in text for text in texts for term in terms)

    if any_contains(button_texts, ("accept",)):
        mechanisms['has_explicit_consent'] = True
    if any_contains(button_texts, ("reject", "decline")):
        mechanisms['has_reject_option'] = True
    if any_contains(link_texts, ("privacy", "policy")):
        mechanisms['has_link_to_policy'] = True
    if any_contains(button_texts, ("settings", "preferences", "customize")):
        mechanisms['has_granular_options'] = True

    for pattern in detect_dark_patterns(notice_details):
        if pattern not in mechanisms['dark_patterns']:
            mechanisms['dark_patterns'].append(pattern)


def explore_notice_interactively(driver, website, max_depth=3):
    """Interactively explore a site's cookie notice and record every state.

    The site is loaded, its notice is captured as the root state, and the user
    is prompted (via display_options) until they enter DONE or ``max_depth``
    forward steps have been taken. Commands:

        B<n>  click the n-th button of the current notice
        L<n>  navigate to the n-th link of the current notice
        BACK  reload the page the current state was reached from
        DONE  stop exploring

    Each successful forward step records exactly one new state whose ParentID
    is the state it was reached from. State bookkeeping is only updated after
    the step succeeded, so a failed click leaves the exploration where it was.

    The consent-mechanism summary is accumulated from the initial notice and
    from every notice layer reached by clicking a button. Pages reached via
    links are recorded but not counted as notice layers.

    Args:
        driver: WebDriver used for navigation and extraction.
        website: URL to explore.
        max_depth: Maximum number of nested forward steps.

    Returns:
        A ``(data, mechanisms)`` tuple: the list of recorded state rows and
        the consent-mechanism summary dict.
    """
    data = []  # To store all state data
    state_stack = []  # (url, state_id, parent_id) entries for back navigation
    mechanisms = analyze_cookie_mechanisms(website)

    # Initial visit to the website
    print(f"Visiting {website}...")
    try:
        driver.get(website)
        time.sleep(WAIT_TIME)  # Wait for page to load
    except Exception as e:
        print(f"Error loading {website}: {e}")
        return data, mechanisms

    current_state_id = str(uuid.uuid4())[:8]
    parent_id = None

    screenshot_path = take_screenshot(driver, website, current_state_id)
    print("Initial page loaded. Screenshot taken.")
    print(f"Screenshot saved to: {screenshot_path}")

    notice_elements, iframe_elements = find_cookie_notice(driver)
    if notice_elements:
        print(f"Found {len(notice_elements)} potential cookie notice elements")
        notice_details = extract_notice_details(notice_elements[0])
    elif iframe_elements:
        print(f"Found {len(iframe_elements)} potential cookie notice iframes")
        notice_details = switch_to_iframe_and_extract(driver, iframe_elements[0])
    else:
        print("No cookie notice found on this page")
        notice_details = _empty_notice("No cookie notice detected")

    # Record initial state
    data = record_state(data, website, parent_id, current_state_id, notice_details, screenshot_path)
    _update_mechanisms(mechanisms, notice_details)

    depth = 0
    while depth < max_depth:
        choice = display_options(notice_details).strip()
        command = choice.upper()

        if command == "DONE":
            print("Exploration complete!")
            break

        if command == "BACK":
            if not state_stack:
                print("Cannot go back further (at root state)")
                continue

            prev_url, prev_state_id, prev_parent_id = state_stack[-1]
            print(f"Going back to state {prev_state_id}...")
            try:
                driver.get(prev_url)
                time.sleep(WAIT_TIME)  # Wait for page to load
            except Exception as e:
                print(f"Error going back: {e}")
                continue

            # Navigation succeeded: only now commit the state change.
            state_stack.pop()
            current_state_id = prev_state_id
            parent_id = prev_parent_id
            depth = max(depth - 1, 0)

            try:
                found = _find_notice_details(driver)
                notice_details = found if found is not None else _empty_notice("No cookie notice detected")
                screenshot_path = take_screenshot(driver, website, current_state_id)
                print(f"Screenshot saved to: {screenshot_path}")
            except Exception as e:
                print(f"Error going back: {e}")
            continue

        if command.startswith("B") and notice_details['buttons']:
            kind, options = "button", notice_details['buttons']
        elif command.startswith("L") and notice_details['links']:
            kind, options = "link", notice_details['links']
        else:
            print(f"Invalid choice or no such option: {choice}")
            continue

        try:
            index = int(choice[1:]) - 1
        except ValueError:
            print(f"Invalid {kind} choice: {choice}")
            continue
        if not 0 <= index < len(options):
            print(f"Invalid {kind} index: {index}")
            continue
        option = options[index]

        new_state_id = str(uuid.uuid4())[:8]
        try:
            origin_url = driver.current_url

            if kind == "button":
                print(f"Clicking button: {option['text']}")
                option['element'].click()
                time.sleep(WAIT_TIME)  # Wait for any changes

                new_screenshot = take_screenshot(driver, website, new_state_id)
                print(f"Screenshot saved to: {new_screenshot}")

                new_notice = _find_notice_details(driver)
                if new_notice is None:
                    new_notice = _empty_notice("No cookie notice detected after clicking")
                recorded_details = new_notice
                choice_label = option['text']
            else:
                print(f"Clicking link: {option['text']} ({option['href']})")
                target_url = _resolve_link_target(origin_url, option['href'])
                if target_url is None:
                    print(f"Skipping non-navigable link: {option['href']}")
                    continue

                # Navigate by URL instead of clicking: the link element may
                # live in an iframe or be stale by now.
                print(f"Navigating directly to: {target_url}")
                driver.get(target_url)
                time.sleep(WAIT_TIME)  # Wait for page to load

                new_screenshot = take_screenshot(driver, website, new_state_id)
                print(f"Screenshot saved to: {new_screenshot}")

                # The recorded state is the policy page itself. What the user
                # interacts with next is the notice on that page, or the
                # page's main content when no notice is shown.
                recorded_details = extract_policy_page_content(driver)
                new_notice = _find_notice_details(driver)
                if new_notice is None:
                    new_notice = _capture_page_content(driver)
                choice_label = f"Link: {option['text']}"
        except Exception as e:
            print(f"Error clicking {kind}: {e}")
            continue

        # The step succeeded: commit bookkeeping and record the state once.
        state_stack.append((origin_url, current_state_id, parent_id))
        parent_id, current_state_id = current_state_id, new_state_id
        screenshot_path, notice_details = new_screenshot, new_notice
        data = record_state(data, website, parent_id, current_state_id,
                            recorded_details, screenshot_path, choice_label)
        if kind == "button":
            _update_mechanisms(mechanisms, notice_details)
        depth += 1

    print(f"Exploration finished after {depth} levels")

    return data, mechanisms

In [None]:
def explore_policy_links(driver, website, parent_state_id, current_depth, max_depth, data):
    """Recursively follow policy-related links found on the current page.

    Each link returned by ``extract_policy_page_content`` whose text contains
    one of the policy keywords is opened with ``driver.get``. The target page
    is screenshotted, recorded through ``record_state`` as a child of
    ``parent_state_id``, and then explored in turn at ``current_depth + 1``.
    After every link the driver is sent back to the page this call started on.

    A link is skipped when it cannot be navigated to (no href, or a scheme
    other than http/https such as ``javascript:`` or ``mailto:``), or when its
    target -- ignoring any ``#fragment`` -- is the starting page or a target
    already followed during this call. Relative hrefs are resolved against the
    starting page's URL.

    Args:
        driver: WebDriver positioned on the page whose links are explored.
        website: Site identifier forwarded to ``take_screenshot`` and
            ``record_state``.
        parent_state_id: State ID recorded as the parent of each followed link.
        current_depth: Depth of this call in the recursion.
        max_depth: Depth limit; the call returns ``data`` untouched as soon as
            ``current_depth >= max_depth``.
        data: Records accumulated so far; threaded through ``record_state``.

    Returns:
        ``data`` as returned by the last ``record_state`` call (or the input
        ``data`` unchanged when nothing was followed).
    """
    if current_depth >= max_depth:
        return data

    # Function-scope import so the cell works regardless of which names the
    # notebook's import cell pulled from urllib.parse.
    from urllib.parse import urldefrag, urljoin, urlparse

    # Extract all links from the current page
    policy_details = extract_policy_page_content(driver)

    # Save current URL to return to
    current_url = driver.current_url

    policy_keywords = ("privacy", "cookie", "data", "gdpr", "ccpa", "consent", "opt-out", "preferences")

    def return_to_start():
        """Best-effort navigation back to the page this call started on."""
        try:
            driver.get(current_url)
            time.sleep(WAIT_TIME)
        except Exception as nav_error:
            # A failed reload must not abort the remaining links.
            print(f"Error returning to {current_url}: {nav_error}")

    # Targets already handled in this call; seeded with the starting page so
    # "back to top" style self-links are not re-scraped.
    visited = {urldefrag(current_url).url}

    for link in policy_details["links"]:
        link_text = link.get("text") or ""

        # Only follow links whose text looks policy-related
        if not any(keyword in link_text.lower() for keyword in policy_keywords):
            continue

        raw_href = link.get("href")
        if not raw_href:
            continue

        # Resolve relative hrefs and reject schemes the browser cannot load as a page
        target = urljoin(current_url, raw_href)
        if urlparse(target).scheme not in ("http", "https"):
            continue

        target_key = urldefrag(target).url
        if target_key in visited:
            continue
        visited.add(target_key)

        try:
            # Generate new state ID
            new_state_id = str(uuid.uuid4())[:8]

            # Navigate to the link
            print(f"Following policy link: {link_text} -> {target}")
            driver.get(target)
            time.sleep(WAIT_TIME)

            # Take screenshot
            screenshot_path = take_screenshot(driver, website, new_state_id)

            # Extract content
            sub_policy_details = extract_policy_page_content(driver)

            # Record this state
            data = record_state(data, website, parent_state_id, new_state_id,
                                sub_policy_details, screenshot_path, f"Policy link: {link_text}")

            # Recursively explore this page's links
            data = explore_policy_links(driver, website, new_state_id, current_depth + 1, max_depth, data)

        except Exception as e:
            print(f"Error following policy link {link_text}: {e}")
        finally:
            # Return to the original page whether or not the visit succeeded
            return_to_start()

    return data

# Interactive explorer function

In [None]:
# Run the Interactive Explorer
# End-to-end driver cell: explore one site by hand, print the mechanism
# summary, export the recorded states to Excel, and build the hierarchy.
# Requires `driver` and the helper functions used below to exist already.

# Select a website to explore
website = "https://www.theguardian.com"

# Explore the website interactively
# (returns the recorded states and the consent-mechanism summary dict)
data, mechanisms = explore_notice_interactively(driver, website, max_depth=MAX_DEPTH)

# Display the mechanism analysis
print("\n--- Cookie Consent Mechanism Analysis ---")
for key, value in mechanisms.items():
    print(f"{key}: {value}")

# Clean data for Excel export
# NOTE(review): presumably strips values Excel cannot store (e.g. live
# element references) -- confirm against clean_data_for_excel.
cleaned_data = clean_data_for_excel(data)

# Create a DataFrame
df = pd.DataFrame(cleaned_data)

# Preview the data
print(df.head())

# Save to Excel
# index=False keeps the DataFrame's row index out of the spreadsheet
df.to_excel(DATA_FILE, index=False)
print(f"Data saved to {DATA_FILE}")

# Create a hierarchical visualization of the exploration
# (`hierarchy` is iterated as a sequence of root nodes when it is displayed)
hierarchy = decode_hierarchy(data)

Visiting https://www.theguardian.com...
Initial page loaded. Screenshot taken.
Screenshot saved to: screenshots/www_theguardian_com_state_fa447dfd_20250328_135712.png
Found 1 potential cookie notice iframes

--- Notice Text ---
California residents have certain rights with regard to the sale of personal information to third parties. Guardian News and Media and our partners use information collected through cookies or in other forms to improve experience on our site and pages, analyze how it is used and show personalized advertising.
At any point, you can opt out of the sale of all of your personal information by pressing
Do not sell or share my personal information
You can find out more in our privacy policy and cookie policy, and manage your choices by going to ‘California resident – Do Not Sell’ at the bottom of any page.

--- Available Buttons ---
1. Do not sell or share my personal information

--- Available Links ---
1. privacy policy => https://www.theguardian.com/help/privacy-po

In [51]:
def display_hierarchy_text(node, level=0):
    """Print one node of the exploration hierarchy, then its children.

    Each node is printed as a "Step <level>" header with the choice that led
    to it and its state ID, followed by the screenshot path and, when present,
    the first 100 characters of the notice text. Children are printed
    recursively, indented two spaces per level.

    Args:
        node: Mapping with the optional keys "ChoiceMade", "StateID",
            "Snapshot", "Text of Notice" and "children".
        level: Depth of ``node`` in the hierarchy; controls indentation and
            the step number shown.
    """
    indent = "  " * level
    # The key can be present with a None value (the root state has no choice),
    # in which case dict.get's default would not apply -- hence the `or`.
    choice = node.get("ChoiceMade") or "Initial State"
    state_id = node.get("StateID") or "Unknown"

    print(f"{indent}Step {level}: {choice} (State ID: {state_id})")
    print(f"{indent}Screenshot: {node.get('Snapshot', 'None')}")

    # Add notice text summary (first 100 chars)
    notice_text = node.get("Text of Notice", "")
    if notice_text:
        summary = notice_text[:100] + ("..." if len(notice_text) > 100 else "")
        print(f"{indent}Notice: {summary}")

    print()

    # Display children (a missing or None "children" entry means a leaf)
    for child in node.get("children") or []:
        display_hierarchy_text(child, level + 1)

# Display the hierarchy in text format
print("\n--- Exploration Hierarchy ---")
try:
    for root in hierarchy:
        display_hierarchy_text(root)
finally:
    # Cleanup: close the browser even if displaying the hierarchy fails,
    # so a rendering error does not leave a WebDriver session running.
    driver.quit()
    print("WebDriver closed successfully.")


--- Exploration Hierarchy ---
Step 0: None (State ID: fa447dfd)
Screenshot: screenshots/www_theguardian_com_state_fa447dfd_20250328_135712.png
Notice: California residents have certain rights with regard to the sale of personal information to third pa...

WebDriver closed successfully.


In [52]:
# Batch Processing Multiple Websites (Optional)

def _update_mechanisms_from_notice(mechanisms, notice_details):
    """Set the consent-mechanism flags implied by a notice's buttons and links (in place)."""
    if not notice_details:
        return

    button_texts = [str(button.get('text') or '').lower()
                    for button in notice_details.get('buttons') or []]
    link_texts = [str(link.get('text') or '').lower()
                  for link in notice_details.get('links') or []]

    def mentions(texts, *keywords):
        return any(keyword in text for text in texts for keyword in keywords)

    if mentions(button_texts, "accept"):
        mechanisms['has_explicit_consent'] = True

    if mentions(button_texts, "reject", "decline"):
        mechanisms['has_reject_option'] = True

    if mentions(link_texts, "privacy", "policy"):
        mechanisms['has_link_to_policy'] = True

    if mentions(button_texts, "settings", "preferences", "customize"):
        mechanisms['has_granular_options'] = True

    # Detect potential dark patterns
    mechanisms['dark_patterns'] = detect_dark_patterns(notice_details)


def _click_first_button_and_record(batch_driver, website, state_id, notice_details, site_data):
    """Click the notice's first button and record any notice still shown afterwards."""
    if not notice_details['buttons']:
        return site_data

    try:
        first_button = notice_details['buttons'][0]
        print(f"Automatically clicking: {first_button['text']}")
        first_button['element'].click()
        time.sleep(WAIT_TIME)

        # Take after-click screenshot
        after_state_id = str(uuid.uuid4())[:8]
        after_screenshot = take_screenshot(batch_driver, website, after_state_id)

        # Only a notice that is still present after the click becomes a child state
        after_notices, _ = find_cookie_notice(batch_driver)
        if after_notices:
            after_details = extract_notice_details(after_notices[0])
            site_data = record_state(site_data, website, state_id, after_state_id,
                                     after_details, after_screenshot, first_button['text'])
    except Exception as e:
        # Best effort: a failed click must not discard the initial state
        print(f"Error clicking first button: {e}")

    return site_data


def batch_scrape_websites(websites_list, headless=True):
    """Automatically scrape multiple websites without interactive exploration.

    For every site a fresh page load is screenshotted and the first cookie
    notice found (in the page, or failing that in the first candidate iframe)
    is recorded as the initial state and analysed for consent mechanisms and
    dark patterns. For in-page notices the first button is also clicked and
    any notice still visible afterwards is recorded as a child state.

    Args:
        websites_list: Iterable of site URLs to visit.
        headless: Forwarded to ``initialize_driver``.

    Returns:
        Tuple ``(all_data, all_mechanisms)``: the recorded states of all sites
        concatenated, and one mechanisms dict per successfully processed site.
        A site whose processing raises is reported on stdout and contributes
        to neither list.
    """
    all_data = []
    all_mechanisms = []

    # Initialize new driver for batch processing
    batch_driver = initialize_driver(headless=headless)

    try:
        for website in websites_list:
            print(f"\nProcessing {website}...")

            # Basic data for this site
            site_data = []
            mechanisms = analyze_cookie_mechanisms(website)

            # Visit website
            try:
                batch_driver.get(website)
                time.sleep(WAIT_TIME)  # Wait for page to load

                # Take initial screenshot
                state_id = str(uuid.uuid4())[:8]
                screenshot = take_screenshot(batch_driver, website, state_id)

                # Find notice elements
                notice_elements, iframe_elements = find_cookie_notice(batch_driver)

                if notice_elements:
                    notice_details = extract_notice_details(notice_elements[0])

                    # Record initial state
                    site_data = record_state(site_data, website, None, state_id, notice_details, screenshot)

                    # Update mechanism analysis
                    _update_mechanisms_from_notice(mechanisms, notice_details)

                    # Try clicking the first button (usually Accept)
                    site_data = _click_first_button_and_record(
                        batch_driver, website, state_id, notice_details, site_data)
                elif iframe_elements:
                    notice_details = switch_to_iframe_and_extract(batch_driver, iframe_elements[0])
                    site_data = record_state(site_data, website, None, state_id, notice_details, screenshot)

                    # Iframe-hosted notices get the same analysis as in-page ones;
                    # without this their flags would stay at the defaults.
                    _update_mechanisms_from_notice(mechanisms, notice_details)
                    # NOTE(review): no auto-click here -- whether the driver is still
                    # inside the iframe after extraction is not visible from this
                    # function; confirm before adding one.
                else:
                    print("No cookie notice found")
                    notice_details = {"text": "No cookie notice detected", "buttons": [], "links": []}
                    site_data = record_state(site_data, website, None, state_id, notice_details, screenshot)

                # Add data to the global collection
                all_data.extend(site_data)
                all_mechanisms.append(mechanisms)

            except Exception as e:
                print(f"Error processing {website}: {e}")

        return all_data, all_mechanisms

    finally:
        # Clean up: always close the batch browser, even on unexpected errors
        batch_driver.quit()

# To run batch processing (uncomment to use)
# batch_data, batch_mechanisms = batch_scrape_websites(websites)
# batch_df = pd.DataFrame(clean_data_for_excel(batch_data))
# batch_df.to_excel("batch_" + DATA_FILE, index=False)