># Interactive Privacy Notice & Cookie Choice Scraper
>
>    This notebook allows you to interactively scrape privacy notices and cookie choices from websites, capturing the layered interaction patterns. It will:
>    1. Visit a website and capture the initial privacy/cookie notice
>    2. Take screenshots of each layer/state
>    3. Let you choose which options to click and explore
>    4. Record all text, buttons, and links at each state
>    5. Store the data in a hierarchical structure
>    This approach gives you complete control over the exploration process while automating the data collection.

## 1. Setup and Dependencies

In [43]:
# Install required packages if needed
#!pip install selenium webdriver-manager beautifulsoup4 pandas requests IPython tqdm pillow openpyxl

In [44]:
# Import libraries
import os
import time
import uuid
import pandas as pd
from datetime import datetime
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, StaleElementReferenceException, NoSuchElementException
from webdriver_manager.chrome import ChromeDriverManager
from IPython.display import display, Image, HTML
import re
from urllib.parse import urlparse
import json
import uuid
from tqdm.notebook import tqdm
import random

## 2. Configure Settings

In [45]:
# Configuration settings
SCREENSHOTS_DIR = "screenshots"
DATA_FILE = "privacy_notice_data.xlsx"
MAX_DEPTH = 3  # Maximum click depth to explore
WAIT_TIME = 5  # Time in seconds to wait for elements to load


# Create screenshot directory if it doesn't exist
os.makedirs(SCREENSHOTS_DIR, exist_ok=True)


# Websites to analyze
websites = [
    # Major news/media sites
    "https://www.expedia.com",
    #"https://www.nytimes.com"
]

In [46]:
# # Configuration settings
# SCREENSHOTS_DIR = "screenshots"
# DATA_FILE = "privacy_notice_data.xlsx"
# MAX_DEPTH = 3  # Maximum click depth to explore
# WAIT_TIME = 5  # Time in seconds to wait for elements to load


# # Create screenshot directory if it doesn't exist
# os.makedirs(SCREENSHOTS_DIR, exist_ok=True)


# # Websites to analyze
# websites = [
#     # Major news/media sites
#     "https://www.theguardian.com",
#     "https://www.nytimes.com",
#     "https://www.bbc.com",
#     "https://www.cnn.com",
    
#     # Tech companies
#     "https://www.microsoft.com",
#     "https://www.google.com",
#     "https://www.apple.com",
#     "https://www.meta.com",
    
#     # E-commerce
#     "https://www.amazon.com",
#     "https://www.ebay.com",
#     "https://www.etsy.com",
#     "https://www.walmart.com",
    
#     # Social media
#     "https://www.reddit.com",
#     "https://www.twitter.com",
#     "https://www.linkedin.com",
#     "https://www.pinterest.com",
    
#     # Entertainment
#     "https://www.netflix.com",
#     "https://www.spotify.com",
#     "https://www.youtube.com"
# ]

## 3. Initialize WebDriver

In [47]:
def initialize_driver(headless=False):
    # Initialize and return a Chrome WebDriver instance
    chrome_options = Options()
    
    # Add stealth settings to avoid bot detection
    chrome_options.add_argument("--disable-blink-features=AutomationControlled")
    chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
    chrome_options.add_experimental_option("useAutomationExtension", False)
    
    if headless:
        chrome_options.add_argument("--headless")
    chrome_options.add_argument("--disable-gpu")
    chrome_options.add_argument("--window-size=1920,1080")
    chrome_options.add_argument("--disable-notifications")
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-dev-shm-usage")

    # Add user agent to appear more like a regular browser
    chrome_options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36")

    # Initialize with retry logic
    for attempt in range(3):
        try:
            service = Service(ChromeDriverManager().install())
            driver = webdriver.Chrome(service=service, options=chrome_options)
            driver.set_page_load_timeout(30)
            
            # Additional CDP commands after driver is initialized
            driver.execute_cdp_cmd("Network.setUserAgentOverride", {
                "userAgent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
                "platform": "Windows",
                "acceptLanguage": "en-US,en;q=0.9"
            })
            
            return driver
        except Exception as e:
            print(f"Attempt {attempt+1} failed: {e}")
            if attempt == 2:
                raise
            time.sleep(2)

In [48]:
# def initialize_driver(headless=False):

#     # Initialize and return a Chrome WebDriver instance
#     chrome_options = Options()
#     if headless:
#         chrome_options.add_argument("--headless")
#     chrome_options.add_argument("--disable-gpu")
#     chrome_options.add_argument("--window-size=1920,1080")
#     chrome_options.add_argument("--disable-notifications")
#     chrome_options.add_argument("--no-sandbox")
#     chrome_options.add_argument("--disable-dev-shm-usage")

#     # Add user agent to appear more like a regular browser
#     chrome_options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36")

#     # Initialize with retry logic
#     for attempt in range(3):
#         try:
#             service = Service(ChromeDriverManager().install())
#             driver = webdriver.Chrome(service=service, options=chrome_options)
#             driver.set_page_load_timeout(30)
#             return driver
#         except Exception as e:
#             print(f"Attempt {attempt+1} failed: {e}")
#             if attempt == 2:
#                 raise
#             time.sleep(2)



In [49]:
# Initialize the driver (visible browser for interaction)
driver = initialize_driver(headless=False)
print("WebDriver initialized successfully!")

WebDriver initialized successfully!


## 4. Core Scraping Functions

In [50]:
def take_screenshot(driver, website, state_id=None):
    """Take a screenshot and return the file path"""

    domain = urlparse(website).netloc.replace('.', '_')
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    state_suffix = f"_state_{state_id}" if state_id else ""
    filename = f"{domain}{state_suffix}_{timestamp}.png"
    filepath = os.path.join(SCREENSHOTS_DIR, filename)
    
    driver.save_screenshot(filepath)
    return filepath

In [51]:
def find_cookie_notice_iframes(driver):
    """Find potential iframes containing cookie notices"""
    iframe_elements = []
    try:
        iframes = driver.find_elements(By.TAG_NAME, "iframe")
        for iframe in iframes:
            try:
                iframe_id = iframe.get_attribute("id") or ""
                iframe_class = iframe.get_attribute("class") or ""
                iframe_src = iframe.get_attribute("src") or ""
                
                # Check if iframe might contain cookie notice
                keywords = ['cookie', 'consent', 'privacy', 'gdpr', 'ccpa', 'notice', 'cmp']
                if any(keyword in iframe_id.lower() for keyword in keywords) or \
                   any(keyword in iframe_class.lower() for keyword in keywords) or \
                   any(keyword in iframe_src.lower() for keyword in keywords):
                    iframe_elements.append(iframe)
            except StaleElementReferenceException:
                continue
    except Exception as e:
        print(f"Error finding iframes: {e}")
    
    return iframe_elements

In [52]:
def find_cookie_notice(driver):
    """Find cookie notice elements and return them"""
    notice_elements = []
    
    # First check for iframes that might contain cookie notices
    iframe_elements = find_cookie_notice_iframes(driver)
    
    # Then check for cookie notice elements in the main document
    for selector in COOKIE_SELECTORS:
        try:
            elements = driver.find_elements(By.XPATH, selector)
            for element in elements:
                if element.is_displayed():
                    # Check if this element has any visible text or buttons
                    text = element.text.strip()
                    if text and len(text) > 10:  # Arbitrary threshold to filter out unrelated elements
                        notice_elements.append(element)
        except Exception as e:
            print(f"Error with selector {selector}: {e}")
    
    return notice_elements, iframe_elements

In [53]:
def wait_for_cookie_notice(driver, timeout=10):
    """Wait for cookie notice to appear with exponential backoff"""
    start_time = time.time()
    max_time = start_time + timeout
    backoff = 0.5
    
    while time.time() < max_time:
        notice_elements, iframe_elements = find_cookie_notice(driver)
        if notice_elements or iframe_elements:
            return True
            
        # Scroll slightly to trigger lazy-loaded elements
        driver.execute_script("window.scrollTo(0, 100);")
        time.sleep(backoff)
        backoff = min(backoff * 1.5, 2)  # Exponential backoff with cap
        
    return False

In [54]:
def extract_notice_details(element):
    """Extract text, buttons and links from a notice element"""
    details = {
        "text": element.text.strip(),
        "buttons": [],
        "links": []
    }
    
    # Extract buttons
    try:
        buttons = element.find_elements(By.TAG_NAME, "button")
        for button in buttons:
            if button.is_displayed():
                button_text = button.text.strip()
                if button_text:
                    details["buttons"].append({
                        "text": button_text,
                        "element": button
                    })
    except Exception as e:
        print(f"Error extracting buttons: {e}")
    
    # Also look for link-like buttons
    try:
        spans_divs = element.find_elements(By.CSS_SELECTOR, "span[role='button'], div[role='button']")
        for item in spans_divs:
            if item.is_displayed():
                item_text = item.text.strip()
                if item_text:
                    details["buttons"].append({
                        "text": item_text,
                        "element": item
                    })
    except Exception as e:
        print(f"Error extracting span/div buttons: {e}")
    
    # Extract links
    try:
        links = element.find_elements(By.TAG_NAME, "a")
        for link in links:
            if link.is_displayed():
                link_text = link.text.strip()
                link_href = link.get_attribute("href")
                if link_text and link_href:
                    details["links"].append({
                        "text": link_text,
                        "href": link_href,
                        "element": link
                    })
    except Exception as e:
        print(f"Error extracting links: {e}")
    
    return details

In [55]:
def switch_to_iframe_and_extract(driver, iframe):
    """Switch to an iframe and extract notice details"""
    try:
        driver.switch_to.frame(iframe)
        notice_elements, _ = find_cookie_notice(driver)
        
        if not notice_elements:
            # If no specific notice elements found, use body
            body = driver.find_element(By.TAG_NAME, "body")
            notice_details = extract_notice_details(body)
        else:
            # Use the first notice element
            notice_details = extract_notice_details(notice_elements[0])
        
        # Store actual href values that can be navigated to directly
        for link in notice_details["links"]:
            link["direct_url"] = link["href"]
            
        driver.switch_to.default_content()
        return notice_details
    except Exception as e:
        print(f"Error processing iframe: {e}")
        driver.switch_to.default_content()
        return {"text": "", "buttons": [], "links": []}

In [56]:
def extract_policy_page_content(driver):
    """Extract content from a privacy/cookie policy page"""
    policy_details = {
        "text": "",
        "buttons": [],
        "links": []
    }
    
    # Try to find the main content container
    try:
        # Try common selectors for policy content
        for selector in [
            "main", 
            "article", 
            ".content", 
            "#content", 
            ".privacy-policy",
            ".cookie-policy",
            ".policy-content"
        ]:
            try:
                content_element = driver.find_element(By.CSS_SELECTOR, selector)
                if content_element:
                    policy_text = content_element.text
                    if len(policy_text) > 200:  # If substantial text found
                        policy_details["text"] = policy_text
                        # Also get links within this container
                        links = content_element.find_elements(By.TAG_NAME, "a")
                        for link in links:
                            if link.is_displayed():
                                link_text = link.text.strip()
                                link_href = link.get_attribute("href")
                                if link_text and link_href:
                                    policy_details["links"].append({
                                        "text": link_text,
                                        "href": link_href,
                                        "element": link
                                    })
                        return policy_details
            except NoSuchElementException:
                continue
            
        # If no content found with specific selectors, fall back to body
        body = driver.find_element(By.TAG_NAME, "body")
        policy_details["text"] = body.text[:5000] + "..." if len(body.text) > 5000 else body.text
        
    except Exception as e:
        print(f"Error extracting policy content: {e}")
        policy_details["text"] = "Error extracting policy content"
        
    return policy_details

In [57]:
def display_screenshot(filepath):
    #Display a screenshot in the notebook
    display(Image(filename=filepath, width=800))

In [58]:
def get_actionable_elements(driver):
    #Find all visible and clickable elements on the page
    actionable = []
    
    # Look for anything that seems clickable
    selectors = [
        "button",
        "a",
        "input[type='button']",
        "input[type='submit']",
        "[role='button']",
        ".button",
        ".btn",
        "[class*='button']",
        "[class*='btn']"
    ]
    
    for selector in selectors:
        try:
            elements = driver.find_elements(By.CSS_SELECTOR, selector)
            for element in elements:
                if element.is_displayed():
                    text = element.text.strip() or element.get_attribute("value") or element.get_attribute("title")
                    if text:
                        actionable.append({
                            "text": text,
                            "element": element,
                            "type": element.tag_name,
                            "location": element.location
                        })
        except Exception as e:
            print(f"Error finding {selector} elements: {e}")
    return actionable

In [59]:
def highlight_element(driver, element, duration=0.5):
    #Temporarily highlight an element on the page

    original_style = element.get_attribute("style")
    
    # Apply a red border highlight
    driver.execute_script(
        "arguments[0].setAttribute('style', arguments[1]);",
        element,
        "border: 3px solid red; background-color: rgba(255, 0, 0, 0.1);"
    )
    
    # Wait briefly
    time.sleep(duration)
    
    # Restore original style
    driver.execute_script(
        "arguments[0].setAttribute('style', arguments[1]);", 
        element,
        original_style
    )

In [60]:
def analyze_cookie_mechanisms(website):
    # Analyze cookie consent mechanisms on a website

    mechanisms = {
        "website": website,
        "has_explicit_consent": False,
        "has_reject_option": False,
        "has_link_to_policy": False,
        "has_granular_options": False,
        "has_preselected_options": False,
        "notice_placement": None,
        "dark_patterns": []
    }
    # We'll be populating this during our exploration of the site
    return mechanisms

In [61]:
def detect_dark_patterns(notice_details):
    """Detect potential dark patterns in cookie notices"""
    patterns = []
    
    # Check notice text
    text = notice_details["text"].lower()
    
    # Check for confirmshaming (making users feel bad for opting out)
    confirmshame_phrases = ["no thanks", "i don't care", "i'm not interested", "ignore", "later", "not now"]
    if any(phrase in text for phrase in confirmshame_phrases):
        patterns.append("Confirmshaming")
    
    # Check buttons
    accept_buttons = []
    reject_buttons = []
    
    for button in notice_details["buttons"]:
        button_text = button["text"].lower()
        
        # Accept buttons
        if any(term in button_text for term in ["accept", "agree", "allow", "yes", "okay", "got it", "continue"]):
            accept_buttons.append(button)
        
        # Reject buttons
        if any(term in button_text for term in ["reject", "decline", "no", "later", "not now", "opt out", "don't sell"]):
            reject_buttons.append(button)
    
    # Check for color manipulation (accept button highlighted, reject button muted)
    # This is a simplified check since we don't have easy access to CSS attributes
    if len(accept_buttons) > 0 and len(reject_buttons) > 0:
        pass  # Would check button styling here if possible
    
    # Check for absence of reject button
    if len(accept_buttons) > 0 and len(reject_buttons) == 0:
        patterns.append("No explicit reject option")
    
    return patterns

In [62]:
def decode_hierarchy(data):
    """Convert flat data with parent-child relationships into a hierarchical structure"""
    # Create a mapping of state IDs to rows
    state_map = {row["StateID"]: row for row in data}
    
    # Create a tree structure
    hierarchy = []
    
    # Find root nodes (those with no parent)
    for row in data:
        if row["ParentID"] is None:
            # Deep copy the row to avoid modifying the original
            node = row.copy()
            node["children"] = []
            hierarchy.append(node)
    
    # Process child nodes
    for row in data:
        if row["ParentID"] is not None:
            parent = next((node for node in hierarchy if node["StateID"] == row["ParentID"]), None)
            
            if parent:
                # Add as direct child of a root node
                child = row.copy()
                child["children"] = []
                parent["children"].append(child)
            else:
                # Search deeper in the hierarchy
                def add_to_parent(nodes):
                    for node in nodes:
                        if node["StateID"] == row["ParentID"]:
                            child = row.copy()
                            child["children"] = []
                            node["children"].append(child)
                            return True
                        if add_to_parent(node["children"]):
                            return True
                    return False
                
                add_to_parent(hierarchy)
    
    return hierarchy

In [63]:
def clean_data_for_excel(data):
    """Prepare data for Excel by removing Selenium elements"""
    cleaned_data = []
    
    for row in data:
        cleaned_row = {}
        for key, value in row.items():
            # Skip Selenium WebElement objects and children
            if key != 'element' and key != 'children':
                cleaned_row[key] = value
        cleaned_data.append(cleaned_row)
    
    return cleaned_data

>## 5. Interactive Exploration Functions
>
>    This section contains the functions that allow you to interactively explore cookie notices and privacy policies. You'll be able to:
>
>    1. Navigate through different layers of cookie notices
>    2. Click different buttons and links to see how they change the notice
>    3. Record the state of each interaction
>    4. Track the hierarchical structure of choices

In [64]:
# Common selectors for cookie notices
COOKIE_SELECTORS = [
    # General cookie and consent banners
    "//div[contains(@class, 'cookie')]",
    "//div[contains(@class, 'gdpr')]",
    "//div[contains(@class, 'consent')]",
    "//div[contains(@id, 'cookie')]",
    "//div[contains(@id, 'consent')]",
    "//div[contains(@id, 'privacy')]",
    "//div[contains(@class, 'privacy-banner')]",
    "//div[contains(@class, 'banner')]",
    "//div[contains(@class, 'notice')]",
    "//div[contains(@class, 'alert')]",
    "//div[contains(@class, 'notification')]",
    "//div[contains(@class, 'popup')]",
    "//div[contains(@class, 'modal')]",
    "//div[contains(@class, 'overlay')]",
    
    # Common third-party consent platforms
    "//iframe[contains(@id, 'sp_message_iframe')]",  # Sourcepoint CMP
    "//div[contains(@id, 'onetrust')]",              # OneTrust
    "//div[contains(@id, 'truste')]",                # TrustArc
    "//div[contains(@id, 'didomi')]",                # Didomi
    "//div[contains(@id, 'cmp')]",                   # Generic CMP
    "//div[contains(@id, 'CybotCookiebotDialog')]",  # Cookiebot
    "//div[contains(@id, 'quantcast')]",             # Quantcast
    "//div[contains(@id, 'termly')]",                # Termly
    "//div[contains(@class, 'termly')]",             # Termly
    "//div[contains(@id, 'osano')]",                 # Osano
    "//div[contains(@class, 'osano')]",              # Osano
    
    # Fixed position banners (bottom/top of page)
    "//div[contains(@class, 'fixed') and (contains(@class, 'bottom') or contains(@class, 'top'))]",
    "//footer[contains(@class, 'cookie') or contains(@id, 'cookie')]"

    "//div[contains(@class, 'privacy-alert')]",
    "//div[contains(@class, 'ccpa')]",
    "//div[contains(@class, 'data-collection')]",
    "//div[contains(@class, 'site-notice')]",
    "//div[contains(@id, 'privacy-prompt')]",
    "//div[contains(@aria-label, 'privacy')]",
    "//div[contains(@aria-label, 'cookie')]",
    "//div[contains(@class, 'consent-modal')]",
    "//div[contains(@id, 'consent-modal')]",
    "//div[contains(@role, 'dialog') and (contains(@class, 'cookie') or contains(@class, 'privacy'))]",
    "//div[contains(@class, 'consent-banner')]",
    "//section[contains(@id, 'cookie')]",
    "//section[contains(@class, 'cookie')]"
]

In [65]:
def find_cookie_notice_iframes(driver):
    """Find potential iframes containing cookie notices"""
    iframe_elements = []
    try:
        iframes = driver.find_elements(By.TAG_NAME, "iframe")
        for iframe in iframes:
            try:
                iframe_id = iframe.get_attribute("id") or ""
                iframe_class = iframe.get_attribute("class") or ""
                iframe_src = iframe.get_attribute("src") or ""
                
                # Check if iframe might contain cookie notice
                keywords = ['cookie', 'consent', 'privacy', 'gdpr', 'ccpa', 'notice', 'cmp']
                if any(keyword in iframe_id.lower() for keyword in keywords) or \
                   any(keyword in iframe_class.lower() for keyword in keywords) or \
                   any(keyword in iframe_src.lower() for keyword in keywords):
                    iframe_elements.append(iframe)
            except StaleElementReferenceException:
                continue
    except Exception as e:
        print(f"Error finding iframes: {e}")
    
    return iframe_elements

In [66]:
def display_options(notice_details):
    """Display interactive options to the user"""
    print(f"\n--- Notice Text ---\n{notice_details['text']}\n")
    
    print("--- Available Buttons ---")
    for i, button in enumerate(notice_details['buttons']):
        print(f"{i+1}. {button['text']}")
    
    print("\n--- Available Links ---")
    for i, link in enumerate(notice_details['links']):
        print(f"{i+1}. {link['text']} => {link['href']}")
    
    print("\n--- Actions ---")
    print("To click a button, enter: B<number> (e.g., B1 for the first button)")
    print("To click a link, enter: L<number> (e.g., L1 for the first link)")
    print("To take a manual screenshot, enter: S")
    print("To find and expand dropdown sections, enter: E")
    print("To go back, enter: BACK")
    print("To finish exploration, enter: DONE")
    
    return input("Enter your choice: ")

In [67]:
def record_state(data, website, parent_id, state_id, notice_details, screenshot_path, choice=None):
    """Record the current state to the data structure"""
    # Format links for Excel
    links_data = {}
    for i, link in enumerate(notice_details['links'][:5], 1):
        links_data[f"Link {i}"] = link['text']
        links_data[f"Link {i} Detail"] = link['href']
    
    # Ensure all link columns exist
    for i in range(len(notice_details['links']) + 1, 6):
        links_data[f"Link {i}"] = None
        links_data[f"Link {i} Detail"] = None
    
    # Create row with hierarchy information
    row = {
        "Website": website,
        "ParentID": parent_id,  # ID of the parent state (None for root)
        "StateID": state_id,    # Unique ID for this state
        "Snapshot": screenshot_path,
        "Text of Notice": notice_details['text'],
        "Choice Provided": notice_details['buttons'][0]['text'] if notice_details['buttons'] else "No choices detected",
        "ChoiceMade": choice,  # What choice led to this state
        **links_data             # Add all the link columns
    }
    
    data.append(row)
    return data

In [68]:
def extract_site_policies(driver, website):
    """Extract policies by directly navigating to common policy paths"""
    domain = urlparse(website).netloc
    policy_paths = [
        "/privacy", "/privacy-policy", "/privacy-statement",
        "/cookies", "/cookie-policy", "/cookie-statement",
        "/legal/privacy", "/legal/cookies", "/about/privacy"
    ]
    
    results = []
    original_url = driver.current_url
    
    for path in policy_paths:
        try:
            policy_url = f"https://{domain}{path}"
            driver.get(policy_url)
            time.sleep(WAIT_TIME)
            
            # Check if page loaded successfully
            if "404" not in driver.title.lower() and "not found" not in driver.title.lower():
                screenshot_path = take_screenshot(driver, website, f"policy_{path.replace('/', '_')}")
                policy_content = extract_policy_page_content(driver)
                
                results.append({
                    "path": path,
                    "url": policy_url,
                    "screenshot": screenshot_path,
                    "content": policy_content["text"][:500]  # First 500 chars
                })
        except Exception:
            pass
    
    # Return to original page
    driver.get(original_url)
    return results

In [69]:
def find_expandable_sections(driver, container):
    """Enhanced detection of expandable elements with multiple interaction strategies"""
    expandables = []
    
    # 1. First try standard patterns
    selectors = [
        # Standard accordion patterns
        ".accordion-header", "[aria-expanded]", "[data-toggle='collapse']", 
        "details", "summary", ".expandable", ".collapsible",
        
        # Common cookie consent implementations
        ".cookie-category", ".consent-category", ".preference-group",
        ".consent-item", ".cookie-item", ".category-header",
        
        # Elements with + or arrows
        ".plus", ".accordion-toggle", ".arrow", ".caret"
    ]
    
    for selector in selectors:
        try:
            elements = container.find_elements(By.CSS_SELECTOR, selector)
            for element in elements:
                if element.is_displayed():
                    text = element.text.strip()
                    if text and any(term in text.lower() for term in [
                        "cookie", "necessary", "functional", "analytics", 
                        "targeting", "performance", "advertising", "preferences"
                    ]):
                        expandables.append({
                            "element": element,
                            "text": text,
                            "type": "expandable"
                        })
        except Exception as e:
            print(f"Error finding expandables with {selector}: {e}")
    
    # 2. Try OneTrust specific patterns (very common implementation)
    try:
        onetrust_selectors = [
            "#onetrust-pc-sdk .category-item", 
            "#onetrust-pc-sdk .ot-accordion-layout",
            ".ot-accordion-layout .ot-acc-hdr"
        ]
        
        for selector in onetrust_selectors:
            elements = driver.find_elements(By.CSS_SELECTOR, selector)
            for element in elements:
                if element.is_displayed():
                    text = element.text.strip()
                    expandables.append({
                        "element": element,
                        "text": text or "OneTrust Category",
                        "type": "expandable"
                    })
    except Exception:
        pass
    
    # 3. Look for elements with +/- indicators or toggles
    try:
        plus_minus_pattern = ".//*[contains(text(), '+') or contains(text(), '−') or contains(text(), '▼') or contains(text(), '▲')]"
        elements = container.find_elements(By.XPATH, plus_minus_pattern)
        for element in elements:
            if element.is_displayed():
                # Get parent for better context
                try:
                    parent = element.find_element(By.XPATH, "..")
                    text = parent.text.strip()
                    expandables.append({
                        "element": element,
                        "text": text or "Toggle Element",
                        "type": "expandable"
                    })
                except:
                    expandables.append({
                        "element": element,
                        "text": element.text.strip() or "Toggle Element",
                        "type": "expandable"
                    })
    except Exception:
        pass
    
    # 4. Look specifically for text-based matches for common cookie categories
    cookie_category_patterns = [
        ".//div[contains(., 'Strictly Necessary')]",
        ".//div[contains(., 'Necessary Cookies')]",
        ".//div[contains(., 'Functional Cookies')]",
        ".//div[contains(., 'Analytics Cookies')]",
        ".//div[contains(., 'Performance Cookies')]",
        ".//div[contains(., 'Targeting Cookies')]",
        ".//div[contains(., 'Marketing Cookies')]"
    ]
    
    for pattern in cookie_category_patterns:
        try:
            elements = container.find_elements(By.XPATH, pattern)
            for element in elements:
                if element.is_displayed():
                    # Only add if element text is reasonably short (to avoid capturing whole sections)
                    text = element.text.strip()
                    if 10 < len(text) < 150:  # Not too short, not too long
                        expandables.append({
                            "element": element,
                            "text": text,
                            "type": "expandable"
                        })
        except Exception:
            pass
    
    return expandables

In [70]:
def find_and_click_child_element(driver, parent):
    """Find and click potential toggle elements within a parent"""
    # Look for typical toggle elements
    selectors = [
        "button", 
        ".toggle", 
        ".caret", 
        ".arrow", 
        "span[role='button']", 
        "[aria-expanded]"
    ]
    
    for selector in selectors:
        try:
            child = parent.find_element(By.CSS_SELECTOR, selector)
            if child.is_displayed():
                driver.execute_script("arguments[0].click();", child)
                return True
        except:
            pass
    
    # If no specific element found, try the first child
    try:
        children = parent.find_elements(By.XPATH, "./*")
        if children and children[0].is_displayed():
            driver.execute_script("arguments[0].click();", children[0])
            return True
    except:
        pass
    
    raise Exception("No clickable child elements found")

def find_and_click_caret(driver, element):
    """Try to find and click a caret or arrow near the element"""
    try:
        # Look for elements near the target
        parent = element.find_element(By.XPATH, "..")
        carets = parent.find_elements(By.CSS_SELECTOR, ".caret, .arrow, [class*='arrow'], [class*='caret']")
        
        if carets:
            for caret in carets:
                if caret.is_displayed():
                    driver.execute_script("arguments[0].click();", caret)
                    return True
                    
        # Try siblings
        siblings = parent.find_elements(By.XPATH, "./*")
        for sibling in siblings:
            if sibling != element and sibling.is_displayed():
                # Check if it looks like a toggle
                class_attr = sibling.get_attribute("class") or ""
                if "toggle" in class_attr or "arrow" in class_attr or "caret" in class_attr:
                    driver.execute_script("arguments[0].click();", sibling)
                    return True
    except:
        pass
    
    raise Exception("No carets or arrows found")

In [71]:
def try_expand_element(driver, element):
    """Try multiple strategies to expand an element, optimized for OneTrust and other CMPs"""
    
    # First, check if we're dealing with an OneTrust element
    is_onetrust = False
    try:
        parent_html = element.get_attribute("outerHTML")
        if "onetrust" in parent_html.lower() or "ot-" in parent_html.lower():
            is_onetrust = True
    except:
        pass
    
    if is_onetrust:
        # OneTrust-specific expansion strategies
        try:
            # 1. Look for arrow element first (most reliable)
            try:
                arrow = element.find_element(By.CSS_SELECTOR, ".ot-arw-cntr, .ot-arrow, [class*='arrow'], [class*='chevron']")
                driver.execute_script("arguments[0].click();", arrow)
                time.sleep(1)
                return True
            except:
                pass
                
            # 2. Try clicking the header element itself
            driver.execute_script("arguments[0].click();", element)
            time.sleep(1)
            
            # 3. Check if it worked by looking for expanded attribute
            expanded = element.get_attribute("aria-expanded")
            if expanded and expanded == "true":
                return True
                
            # 4. Look for the label to click instead
            try:
                label = element.find_element(By.CSS_SELECTOR, ".ot-cat-header, .ot-always-active")
                driver.execute_script("arguments[0].click();", label)
                time.sleep(1)
                return True
            except:
                pass
                
            # 5. Force expansion by changing the CSS display property of the content
            try:
                content_id = element.get_attribute("aria-controls")
                if content_id:
                    driver.execute_script(f"document.getElementById('{content_id}').style.display = 'block';")
                    driver.execute_script(f"document.getElementById('{content_id}').style.maxHeight = 'none';")
                    return True
            except:
                pass
                
            # 6. Target the plus/minus icon specifically
            try:
                icon = element.find_element(By.CSS_SELECTOR, "[class*='plus'], [class*='minus'], [class*='expand']")
                driver.execute_script("arguments[0].click();", icon)
                time.sleep(1)
                return True
            except:
                pass
        except Exception as e:
            print(f"  OneTrust expansion strategies failed: {e}")
    
    # General strategies for any expandable element
    strategies = [
        # Strategy 1: Simple JavaScript click
        lambda: driver.execute_script("arguments[0].click();", element),
        
        # Strategy 2: Try to find and click a child element that might be the actual toggle
        lambda: find_and_click_child_element(driver, element),
        
        # Strategy 3: Force attribute change (for aria-expanded elements)
        lambda: driver.execute_script(
            "arguments[0].setAttribute('aria-expanded', 'true'); "
            "var content = document.querySelector('#' + arguments[0].getAttribute('aria-controls')); "
            "if(content) { content.style.display = 'block'; content.style.maxHeight = 'none'; }",
            element
        ),
        
        # Strategy 4: Use Actions API for more precise click
        lambda: webdriver.ActionChains(driver).move_to_element(element).click().perform(),
        
        # Strategy 5: Try clicking by coordinates
        lambda: driver.execute_script(
            "var rect = arguments[0].getBoundingClientRect(); "
            "var clickEvent = document.createEvent('MouseEvents'); "
            "clickEvent.initMouseEvent('click', true, true, window, 0, 0, 0, "
            "rect.left + rect.width/2, rect.top + rect.height/2, false, false, false, false, 0, null); "
            "arguments[0].dispatchEvent(clickEvent);",
            element
        ),
    ]
    
    for i, strategy in enumerate(strategies, 1):
        try:
            print(f"  Trying expansion strategy {i}...")
            strategy()
            time.sleep(1)
            
            # Check if it worked by looking for attribute changes or CSS changes
            expanded = element.get_attribute("aria-expanded")
            if expanded and expanded == "true":
                print("  Strategy worked! (aria-expanded is now true)")
                return True
                
            # Take a screenshot after each attempt to help debug
            driver.save_screenshot(f"manual_screenshots/debug_strategy_{i}.png")
            
            return True  # Assume it worked since we can't reliably detect success
        except Exception as e:
            print(f"  Strategy {i} failed: {e}")
    
    # Last resort: Try to find a nearby button that might control expansion
    try:
        # Look one level up and then for button elements
        parent = element.find_element(By.XPATH, "..")
        buttons = parent.find_elements(By.TAG_NAME, "button")
        for button in buttons:
            if button.is_displayed():
                print("  Trying parent's button...")
                driver.execute_script("arguments[0].click();", button)
                time.sleep(1)
                return True
    except:
        pass
    
    return False

In [72]:
def expand_onetrust_sections(driver):
    """Specifically target and expand OneTrust cookie preference sections"""
    print("Attempting to expand OneTrust cookie sections...")
    
    # First, take a screenshot before we start
    driver.save_screenshot("manual_screenshots/onetrust_before.png")
    
    # Method 1: Use OneTrust-specific selectors
    selectors = [
        ".ot-accordion-layout.ot-cat-item",  # Category containers
        ".ot-cat-header",                    # Category headers
        ".ot-acc-hdr",                       # Accordion headers
        ".category-menu-switch-handler"      # Toggle handlers
    ]
    
    success = False
    for selector in selectors:
        try:
            elements = driver.find_elements(By.CSS_SELECTOR, selector)
            if elements:
                print(f"Found {len(elements)} OneTrust elements with selector: {selector}")
                for i, element in enumerate(elements):
                    try:
                        print(f"Clicking OneTrust element {i+1}/{len(elements)}...")
                        driver.execute_script("arguments[0].scrollIntoView({block: 'center'});", element)
                        time.sleep(0.5)
                        
                        # Try direct click with JavaScript
                        driver.execute_script("arguments[0].click();", element)
                        time.sleep(0.5)
                        
                        # Also try to find and click the arrow element
                        try:
                            arrow = element.find_element(By.CSS_SELECTOR, ".ot-arw-cntr")
                            driver.execute_script("arguments[0].click();", arrow)
                        except:
                            pass
                        
                        success = True
                        
                        # Take a screenshot after each click
                        driver.save_screenshot(f"manual_screenshots/onetrust_click_{i+1}.png")
                    except Exception as e:
                        print(f"Error expanding element {i+1}: {e}")
            else:
                print(f"No elements found with selector: {selector}")
        except Exception as e:
            print(f"Error with selector {selector}: {e}")
    
    # Method 2: Force expansion with JavaScript
    try:
        # This tries to force all sections to expand by manipulating the DOM directly
        script = """
        // Find all category containers and mark them as expanded
        var categories = document.querySelectorAll('.ot-cat-item, .ot-accordion-layout');
        for(var i = 0; i < categories.length; i++) {
            categories[i].setAttribute('aria-expanded', 'true');
            
            // Also try to find the content and show it
            var content = categories[i].querySelector('.ot-acc-txt, .ot-cat-desc');
            if(content) {
                content.style.display = 'block';
                content.style.maxHeight = 'none';
            }
            
            // And activate any arrow indicators
            var arrow = categories[i].querySelector('.ot-arw-cntr');
            if(arrow) {
                arrow.classList.add('ot-arw-up');
                arrow.classList.remove('ot-arw-down');
            }
        }
        
        // Also target individual headers
        var headers = document.querySelectorAll('.ot-cat-header, .ot-acc-hdr');
        for(var j = 0; j < headers.length; j++) {
            headers[j].setAttribute('aria-expanded', 'true');
        }
        
        // Return number of elements affected
        return categories.length + headers.length;
        """
        affected = driver.execute_script(script)
        print(f"Forced expansion of {affected} elements with JavaScript")
        if affected > 0:
            success = True
        
        # Take final screenshot
        driver.save_screenshot("manual_screenshots/onetrust_after_js.png")
    except Exception as e:
        print(f"Error with JavaScript expansion: {e}")
    
    return success

In [73]:
def explore_notice_interactively(driver, website, max_depth=3):
    """Interactive exploration of cookie notices with state tracking"""
    data = []  # To store all state data
    visited_states = {}  # To track visited states (avoid loops)
    state_stack = []  # For back navigation
    mechanisms = analyze_cookie_mechanisms(website)  # For tracking consent mechanisms
    
    # Initial visit to the website with anti-bot measures
    print(f"Visiting {website}...")
    try:
        # Clear previous data
        driver.delete_all_cookies()
        
        # Visit site
        driver.get(website)
        
        # Check if we need human intervention for bot detection
        print("Checking if human intervention is needed...")
        needs_intervention = check_for_bot_detection(driver)
        
        if needs_intervention:
            # Prompt user to solve captcha/verification
            input("\n>>> BOT DETECTION DETECTED! Please solve the captcha/verification in the browser window, then press Enter to continue...\n")
            print("Thank you! Continuing with scraping...")
            time.sleep(2)  # Small wait after user interaction
    except Exception as e:
        print(f"Error loading {website}: {e}")
        return data, mechanisms
    
    # Verify we're on the actual site content
    if not verify_page_loaded(driver, website):
        print("Page verification failed - may still be on bot detection page")
        input("Please ensure you're on the main website content, then press Enter...")
    
    # Ask if the user wants to manually interact before proceeding
    initial_action = input("\nWould you like to (S)creenshot, (E)xpand sections, or (C)ontinue? ").upper()
    
    if initial_action == "S":
        manual_screenshot = take_manual_screenshot(driver)
        print(f"Manual screenshot taken: {manual_screenshot}")
    elif initial_action == "E":
        # Find and try to expand sections
        try:
            print("Looking for expandable sections in cookie dialog...")
            # Try to find dialog container first
            dialog_selectors = [
                "#onetrust-consent-sdk", ".cookie-notice", "#cookie-notice",
                "div[role='dialog']", ".modal-dialog", ".consent-modal"
            ]
            
            dialog = None
            for selector in dialog_selectors:
                try:
                    elements = driver.find_elements(By.CSS_SELECTOR, selector)
                    for el in elements:
                        if el.is_displayed():
                            dialog = el
                            break
                    if dialog:
                        break
                except:
                    continue
            
            if not dialog:
                print("No cookie dialog container found, using body")
                dialog = driver.find_element(By.TAG_NAME, "body")
            
            # Find expandable sections
            expandables = find_expandable_sections(driver, dialog)
            
            if expandables:
                print(f"Found {len(expandables)} potential expandable sections:")
                for i, exp in enumerate(expandables):
                    print(f"{i+1}. {exp['text']}")
                
                # Ask which one to expand
                exp_choice = input("Enter number to expand (or 'A' for all): ")
                
                if exp_choice.upper() == "A":
                    # Try to expand all
                    for exp in expandables:
                        try:
                            print(f"Clicking: {exp['text']}")
                            driver.execute_script("arguments[0].scrollIntoView({block: 'center'});", exp["element"])
                            time.sleep(0.5)
                            driver.execute_script("arguments[0].click();", exp["element"])
                            time.sleep(1)
                        except Exception as e:
                            print(f"Error expanding {exp['text']}: {e}")
                else:
                    try:
                        idx = int(exp_choice) - 1
                        if 0 <= idx < len(expandables):
                            exp = expandables[idx]
                            print(f"Clicking: {exp['text']}")
                            driver.execute_script("arguments[0].scrollIntoView({block: 'center'});", exp["element"])
                            time.sleep(0.5)
                            driver.execute_script("arguments[0].click();", exp["element"])
                            time.sleep(1)
                        else:
                            print("Invalid selection")
                    except ValueError:
                        print("Invalid input")
            else:
                print("No expandable sections found")
        except Exception as e:
            print(f"Error processing expandable sections: {e}")
    
    # Generate a root state ID
    root_state_id = str(uuid.uuid4())[:8]
    current_state_id = root_state_id
    parent_id = None
    
    # Take initial screenshot AFTER verification
    screenshot_path = take_screenshot(driver, website, current_state_id)
    print("Initial page loaded. Screenshot taken.")
    print(f"Screenshot saved to: {screenshot_path}")
    
    # Wait for cookie notice with retry logic
    print("Waiting for cookie notices to appear...")
    cookie_found = wait_for_cookie_notice(driver, timeout=15)
    
    # Find notice elements
    notice_elements, iframe_elements = find_cookie_notice(driver)
    
    if notice_elements:
        print(f"Found {len(notice_elements)} potential cookie notice elements")
        notice_details = extract_notice_details(notice_elements[0])
    elif iframe_elements:
        print(f"Found {len(iframe_elements)} potential cookie notice iframes")
        notice_details = switch_to_iframe_and_extract(driver, iframe_elements[0])
    else:
        print("No cookie notice found with standard detection, trying footer links...")
        
        # Look for privacy/cookie links in the footer
        footer_links = find_privacy_footer_links(driver)
        if footer_links:
            print(f"Found {len(footer_links)} privacy-related links in footer")
            notice_details = {
                "text": "Privacy links found in footer (no banner detected)",
                "buttons": [],
                "links": footer_links
            }
        else:
            print("No privacy links found in footer, using placeholder")
            notice_details = {"text": "No cookie notice detected", "buttons": [], "links": []}
    
    # Record initial state
    data = record_state(data, website, parent_id, current_state_id, notice_details, screenshot_path)
    
    # Begin interactive exploration
    depth = 0
    exploring = True
    
    while exploring and depth < max_depth:
        choice = display_options(notice_details)
        
        if choice.upper() == "DONE":
            print("Exploration complete!")
            exploring = False
            break
        
        elif choice.upper() == "S":
            # Take manual screenshot
            manual_screenshot = take_manual_screenshot(driver)
            print(f"Manual screenshot taken: {manual_screenshot}")
            continue
            
        
        # In the section where you handle the "E" command:
        elif choice.upper() == "E":
            # Try to find and expand sections
            try:
                print("Looking for expandable sections...")
                dialogs = [
                    # Try to find dialog container first
                    driver.find_elements(By.CSS_SELECTOR, "#onetrust-consent-sdk"),
                    driver.find_elements(By.CSS_SELECTOR, ".cookie-notice"),
                    driver.find_elements(By.CSS_SELECTOR, "#cookie-notice"),
                    driver.find_elements(By.CSS_SELECTOR, "div[role='dialog']"),
                    driver.find_elements(By.CSS_SELECTOR, ".modal-dialog"),
                    driver.find_elements(By.CSS_SELECTOR, ".consent-modal")
                ]
                
                dialog = None
                for dialog_elements in dialogs:
                    for el in dialog_elements:
                        if el.is_displayed():
                            dialog = el
                            break
                    if dialog:
                        break
                
                if "onetrust" in driver.page_source.lower():
                    onetrust_success = expand_onetrust_sections(driver)
                    print(f"OneTrust expansion {'succeeded' if onetrust_success else 'failed'}")

                if not dialog:
                    print("No cookie dialog container found, using body")
                    dialog = driver.find_element(By.TAG_NAME, "body")
                
                # Find expandable sections
                expandables = find_expandable_sections(driver, dialog)
                
                if expandables:
                    print(f"Found {len(expandables)} potential expandable sections:")
                    for i, exp in enumerate(expandables):
                        print(f"{i+1}. {exp['text'][:50]}..." if len(exp['text']) > 50 else f"{i+1}. {exp['text']}")
                    
                    exp_choice = input("Enter number to expand (or 'A' for all): ")
                    
                    if exp_choice.upper() == "A":
                        # Try to expand all
                        for exp in expandables:
                            try:
                                print(f"Attempting to expand: {exp['text'][:30]}...")
                                driver.execute_script("arguments[0].scrollIntoView({block: 'center'});", exp["element"])
                                time.sleep(0.5)
                                
                                # Try multiple strategies
                                if try_expand_element(driver, exp["element"]):
                                    print("  Success!")
                                else:
                                    print("  Could not expand element with any strategy")
                                
                                # Take a screenshot after each expansion
                                manual_screenshot = take_manual_screenshot(driver)
                                print(f"  Screenshot saved: {manual_screenshot}")
                                time.sleep(1)
                            except Exception as e:
                                print(f"  Error expanding {exp['text'][:30]}: {e}")
                    else:
                        try:
                            idx = int(exp_choice) - 1
                            if 0 <= idx < len(expandables):
                                exp = expandables[idx]
                                print(f"Attempting to expand: {exp['text']}")
                                driver.execute_script("arguments[0].scrollIntoView({block: 'center'});", exp["element"])
                                time.sleep(0.5)
                                
                                # Try multiple strategies
                                if try_expand_element(driver, exp["element"]):
                                    print("Success!")
                                else:
                                    print("Could not expand element with any strategy")
                                    
                                # Take a screenshot after expansion
                                manual_screenshot = take_manual_screenshot(driver)
                                print(f"Screenshot saved: {manual_screenshot}")
                            else:
                                print("Invalid selection")
                        except ValueError:
                            print("Invalid input")
                else:
                    print("No expandable sections found")
                    
                    # As a fallback, list all visible elements that might be expandable
                    print("Listing all potential interactive elements:")
                    all_elements = driver.find_elements(By.CSS_SELECTOR, 
                                                    "div, span, button, a, label")
                    potential_elements = []
                    
                    for el in all_elements:
                        if el.is_displayed():
                            text = el.text.strip()
                            if text and any(term in text.lower() for term in 
                                        ["cookie", "necessary", "functional", "analytics", 
                                        "targeting", "performance", "privacy"]):
                                potential_elements.append({
                                    "element": el,
                                    "text": text
                                })
                    
                    if potential_elements:
                        print(f"Found {len(potential_elements)} elements that might be expandable:")
                        for i, el in enumerate(potential_elements[:10]):  # Show top 10
                            print(f"{i+1}. {el['text'][:50]}...")
                        
                        fallback_choice = input("Enter number to try clicking (or 'S' to skip): ")
                        if fallback_choice.upper() != "S":
                            try:
                                idx = int(fallback_choice) - 1
                                if 0 <= idx < len(potential_elements):
                                    el = potential_elements[idx]
                                    print(f"Attempting to click: {el['text'][:30]}...")
                                    driver.execute_script("arguments[0].click();", el["element"])
                                    time.sleep(1)
                                    
                                    # Take a screenshot after click
                                    manual_screenshot = take_manual_screenshot(driver)
                                    print(f"Screenshot saved: {manual_screenshot}")
                            except Exception as e:
                                print(f"Error: {e}")
                    else:
                        print("No potential elements found")
            except Exception as e:
                print(f"Error processing expandable sections: {e}")
            
            # Take screenshot after all expansions
            expanded_screenshot = take_screenshot(driver, website, current_state_id + "_expanded")
            print(f"Post-expansion screenshot saved to: {expanded_screenshot}")
            continue

            
        elif choice.upper() == "BACK":
            if state_stack:
                # Pop the current state and go back to previous
                prev_url, prev_state_id, prev_parent_id = state_stack.pop()
                print(f"Going back to state {prev_state_id}...")
                
                # Navigate back (either by browser back or by URL)
                try:
                    driver.get(prev_url)
                    time.sleep(WAIT_TIME)  # Wait for page to load
                    
                    # Update current state trackers
                    current_state_id = prev_state_id
                    parent_id = prev_parent_id
                    depth -= 1 if depth > 0 else 0
                    
                    # Rediscover notice elements
                    notice_elements, iframe_elements = find_cookie_notice(driver)
                    
                    if notice_elements:
                        notice_details = extract_notice_details(notice_elements[0])
                    elif iframe_elements:
                        notice_details = switch_to_iframe_and_extract(driver, iframe_elements[0])
                    else:
                        notice_details = {"text": "No cookie notice detected", "buttons": [], "links": []}
                        
                    screenshot_path = take_screenshot(driver, website, current_state_id)
                    print(f"Screenshot saved to: {screenshot_path}")
                    
                except Exception as e:
                    print(f"Error going back: {e}")
            else:
                print("Cannot go back further (at root state)")
                
        elif choice.upper().startswith("B") and notice_details['buttons']:
            try:
                button_idx = int(choice[1:]) - 1
                if 0 <= button_idx < len(notice_details['buttons']):
                    button = notice_details['buttons'][button_idx]
                    print(f"Clicking button: {button['text']}")
                    
                    # Save current state for back navigation
                    state_stack.append((driver.current_url, current_state_id, parent_id))
                    
                    # Update state trackers
                    parent_id = current_state_id
                    current_state_id = str(uuid.uuid4())[:8]
                    
                    # Click the button
                    try:
                        button['element'].click()
                        time.sleep(WAIT_TIME)  # Wait for any changes
                        
                        # Take screenshot of the new state
                        screenshot_path = take_screenshot(driver, website, current_state_id)
                        print(f"Screenshot saved to: {screenshot_path}")
                        
                        # Find new notice state
                        notice_elements, iframe_elements = find_cookie_notice(driver)
                        
                        if notice_elements:
                            notice_details = extract_notice_details(notice_elements[0])
                        elif iframe_elements:
                            notice_details = switch_to_iframe_and_extract(driver, iframe_elements[0])
                        else:
                            notice_details = {"text": "No cookie notice detected after clicking", "buttons": [], "links": []}
                        
                        # Record the new state
                        data = record_state(data, website, parent_id, current_state_id, notice_details, screenshot_path, button['text'])
                        
                        # Increment depth
                        depth += 1
                        
                    except Exception as e:
                        print(f"Error clicking button: {e}")
                else:
                    print(f"Invalid button index: {button_idx}")
            except ValueError:
                print(f"Invalid button choice: {choice}")
                
        elif choice.upper().startswith("L") and notice_details['links']:
            try:
                link_idx = int(choice[1:]) - 1
                if 0 <= link_idx < len(notice_details['links']):
                    link = notice_details['links'][link_idx]
                    print(f"Clicking link: {link['text']} ({link['href']})")
                    
                    # Save current state for back navigation
                    state_stack.append((driver.current_url, current_state_id, parent_id))
                    
                    # Update state trackers
                    parent_id = current_state_id
                    current_state_id = str(uuid.uuid4())[:8]
                    
                    # Click the link
                    try:
                        if link['href'].startswith('http'):
                            print(f"Navigating directly to: {link['href']}")
                            driver.get(link['href'])
                        else:
                            # For relative URLs
                            base_url = urlparse(driver.current_url)
                            full_url = f"{base_url.scheme}://{base_url.netloc}{link['href']}"
                            print(f"Navigating directly to: {full_url}")
                            driver.get(full_url)
                            
                        time.sleep(WAIT_TIME)  # Wait for page to load
                        
                        # Take screenshot of the new state
                        screenshot_path = take_screenshot(driver, website, current_state_id)
                        print(f"Screenshot saved to: {screenshot_path}")

                        # Extract content from the policy page
                        policy_details = extract_policy_page_content(driver)
                                                
                        # Record the new state
                        data = record_state(data, website, parent_id, current_state_id, 
                                        policy_details, screenshot_path, f"Link: {link['text']}")
                        
                        # Check for cookie notices on the new page
                        notice_elements, iframe_elements = find_cookie_notice(driver)
                        
                        if notice_elements:
                            notice_details = extract_notice_details(notice_elements[0])
                        elif iframe_elements:
                            notice_details = switch_to_iframe_and_extract(driver, iframe_elements[0])
                        else:
                            # For policy pages, capture the main content
                            try:
                                main_content = driver.find_element(By.TAG_NAME, "main")
                                notice_details = extract_notice_details(main_content)
                            except:
                                try:
                                    # Look for article or content area
                                    content = driver.find_element(By.CSS_SELECTOR, "article, .content, #content, .privacy-policy")
                                    notice_details = extract_notice_details(content)
                                except:
                                    # Fallback to body
                                    body = driver.find_element(By.TAG_NAME, "body")
                                    notice_details = {
                                        "text": "Policy page content (excerpt):\n" + body.text[:500] + "...",
                                        "buttons": [],
                                        "links": []
                                    }
                        
                        # Record the new state
                        data = record_state(data, website, parent_id, current_state_id, notice_details, screenshot_path, f"Link: {link['text']}")
                        
                        # Increment depth
                        depth += 1
                        
                    except Exception as e:
                        print(f"Error clicking link: {e}")
                else:
                    print(f"Invalid link index: {link_idx}")
            except ValueError:
                print(f"Invalid link choice: {choice}")
        
        else:
            print(f"Invalid choice or no such option: {choice}")
    
    print(f"Exploration finished after {depth} levels")
    
    # Update mechanism analysis
    if any("accept" in button['text'].lower() for button in notice_details['buttons']):
        mechanisms['has_explicit_consent'] = True
    
    if any("reject" in button['text'].lower() or "decline" in button['text'].lower() for button in notice_details['buttons']):
        mechanisms['has_reject_option'] = True
        
    if any("privacy" in link['text'].lower() or "policy" in link['text'].lower() for link in notice_details['links']):
        mechanisms['has_link_to_policy'] = True
    
    if any("settings" in button['text'].lower() or "preferences" in button['text'].lower() or "customize" in button['text'].lower() for button in notice_details['buttons']):
        mechanisms['has_granular_options'] = True
    
    # Detect potential dark patterns
    mechanisms['dark_patterns'] = detect_dark_patterns(notice_details)
    
    return data, mechanisms

In [74]:
def check_for_bot_detection(driver):
    """Check if the current page is a bot detection challenge"""
    # Common bot detection indicators
    bot_indicators = [
        # Page title indicators
        "security check", "captcha", "bot check", "human verification",
        "please verify", "please wait", "detection", "challenge",
        # URL indicators
        "captcha", "challenge", "verify", "security",
        # Content indicators
        "prove you're human", "verify you are human", "security check",
        "confirm you are not a robot", "unusual activity"
    ]
    
    # Check page title
    if any(indicator in driver.title.lower() for indicator in bot_indicators):
        return True
    
    # Check URL
    if any(indicator in driver.current_url.lower() for indicator in bot_indicators):
        return True
    
    # Check page content
    page_text = driver.find_element(By.TAG_NAME, "body").text.lower()
    if any(indicator in page_text for indicator in bot_indicators):
        return True
    
    # Check for specific elements commonly used in verification pages
    try:
        # Check for captcha, challenge elements
        captcha_elements = driver.find_elements(By.CSS_SELECTOR, 
            "iframe[src*='captcha'], iframe[src*='challenge'], div[class*='captcha'], div[class*='challenge']")
        if captcha_elements:
            return True
    except:
        pass
    
    return False

def verify_page_loaded(driver, website):
    """Verify that the actual website content has loaded properly"""
    try:
        # Extract domain from website
        domain = urlparse(website).netloc
        
        # Check if we're on the correct domain
        current_domain = urlparse(driver.current_url).netloc
        if domain not in current_domain:
            return False
        
        # Check for common page elements that indicate a properly loaded page
        common_elements = [
            "header", "nav", "main", "footer", ".header", ".footer",
            "#header", "#footer", ".navigation", "#navigation"
        ]
        
        for selector in common_elements:
            elements = driver.find_elements(By.CSS_SELECTOR, selector)
            if elements and any(el.is_displayed() for el in elements):
                return True
                
        # Check if body has substantial content
        body_text = driver.find_element(By.TAG_NAME, "body").text
        if len(body_text) > 500:  # Arbitrary threshold for "real" content
            return True
            
        return False
    except:
        return False

In [75]:
def find_privacy_footer_links(driver):
    """Find privacy policy links in the footer"""
    privacy_links = []
    
    # Common footer selectors
    footer_selectors = ["footer", ".footer", "#footer", "[role='contentinfo']", 
                        ".site-footer", "#site-footer", ".global-footer"]
    
    for selector in footer_selectors:
        try:
            elements = driver.find_elements(By.CSS_SELECTOR, selector)
            for element in elements:
                if not element.is_displayed():
                    continue
                    
                links = element.find_elements(By.TAG_NAME, "a")
                for link in links:
                    if not link.is_displayed():
                        continue
                        
                    link_text = link.text.lower().strip()
                    link_href = link.get_attribute("href") or ""
                    
                    privacy_terms = ["privacy", "cookie", "gdpr", "ccpa", "data", "notice"]
                    if any(term in link_text for term in privacy_terms) or \
                       any(term in link_href for term in privacy_terms):
                        privacy_links.append({
                            "text": link.text,
                            "href": link_href,
                            "element": link
                        })
        except Exception:
            continue
    
    # If no links found in footers, try scanning all links
    if not privacy_links:
        try:
            all_links = driver.find_elements(By.TAG_NAME, "a")
            for link in all_links:
                if not link.is_displayed():
                    continue
                    
                link_text = link.text.lower().strip()
                link_href = link.get_attribute("href") or ""
                
                privacy_terms = ["privacy", "cookie", "gdpr", "ccpa", "data policy"]
                if any(term in link_text for term in privacy_terms) or \
                   any(f"/{term}" in link_href for term in ["privacy", "cookie", "legal/privacy"]):
                    privacy_links.append({
                        "text": link.text,
                        "href": link_href,
                        "element": link
                    })
        except Exception:
            pass
            
    return privacy_links

In [76]:
def explore_policy_links(driver, website, parent_state_id, current_depth, max_depth, data):
    """Recursively explore links within policy pages up to max_depth"""
    if current_depth >= max_depth:
        return data
    
    # Extract all links from the current page
    policy_details = extract_policy_page_content(driver)
    
    # Save current URL to return to
    current_url = driver.current_url
    
    # Look for relevant policy links
    policy_keywords = ["privacy", "cookie", "data", "gdpr", "ccpa", "consent", "opt-out", "preferences"]
    
    for link in policy_details["links"]:
        link_text = link["text"].lower()
        
        # Check if this link appears to be a policy-related link
        if any(keyword in link_text for keyword in policy_keywords):
            try:
                # Generate new state ID
                new_state_id = str(uuid.uuid4())[:8]
                
                # Navigate to the link
                print(f"Following policy link: {link['text']} -> {link['href']}")
                driver.get(link['href'])
                time.sleep(WAIT_TIME)
                
                # Take screenshot
                screenshot_path = take_screenshot(driver, website, new_state_id)
                
                # Extract content
                sub_policy_details = extract_policy_page_content(driver)
                
                # Record this state
                data = record_state(data, website, parent_state_id, new_state_id, 
                                   sub_policy_details, screenshot_path, f"Policy link: {link['text']}")
                
                # Recursively explore this page's links
                data = explore_policy_links(driver, website, new_state_id, current_depth + 1, max_depth, data)
                
                # Return to the original page
                driver.get(current_url)
                #time.sleep(WAIT_TIME)
                time.sleep(random.uniform(2, 5))
                
            except Exception as e:
                print(f"Error following policy link {link['text']}: {e}")
                driver.get(current_url)  # Return to the original page
                #time.sleep(WAIT_TIME)
                time.sleep(random.uniform(2, 5))
    
    return data

In [77]:
def find_cookie_banners_and_popups(driver):
    """Enhanced detection of cookie consent elements"""
    banner_elements = []
    
    # Common cookie banner selectors (match attributes and common implementations)
    banner_selectors = [
        # Standard implementations
        ".cookie-banner", "#cookie-banner", ".cookie-notice", "#cookie-notice",
        ".cookie-policy", "#cookie-policy", ".consent-banner", "#consent-banner",
        
        # Common CMP implementations
        "#onetrust-banner-sdk", "#onetrust-consent-sdk", 
        ".cc-window", "#CybotCookiebotDialog",
        "div[aria-label*='cookie']", "div[aria-label*='consent']",
        "div[role='dialog'][aria-modal='true']",
        
        # Common structural patterns
        "div.fixed-bottom", "div.fixed-top", "div.sticky-bottom",
        "footer > div:first-child", "body > div:first-child"
    ]
    
    # Check for each selector
    for selector in banner_selectors:
        try:
            elements = driver.find_elements(By.CSS_SELECTOR, selector)
            for element in elements:
                if element.is_displayed():
                    # Check if it has meaningful content
                    text = element.text.strip()
                    # Has reasonable amount of text and is not just a single link
                    if len(text) > 20 and ("cookie" in text.lower() or "privacy" in text.lower() 
                                          or "consent" in text.lower() or "gdpr" in text.lower()):
                        banner_elements.append(element)
        except Exception as e:
            print(f"Error checking selector {selector}: {e}")
    
    # Also try text-based detection
    text_patterns = [
        "//*[contains(text(), 'cookie') or contains(text(), 'Cookie')]",
        "//*[contains(text(), 'consent') or contains(text(), 'Consent')]",
        "//*[contains(text(), 'privacy') or contains(text(), 'Privacy')]",
        "//*[contains(text(), 'GDPR') or contains(text(), 'CCPA')]"
    ]
    
    for pattern in text_patterns:
        try:
            elements = driver.find_elements(By.XPATH, pattern)
            for element in elements:
                if element.is_displayed():
                    # Find the closest container
                    parent = element
                    for _ in range(3):  # Move up to 3 levels up
                        if parent.tag_name == "body":
                            break
                        parent = parent.find_element(By.XPATH, "..")
                        if len(parent.text) > 50:  # If parent has substantial text
                            if parent not in banner_elements:
                                banner_elements.append(parent)
                            break
        except Exception as e:
            pass
    
    return banner_elements

def find_expandable_sections(driver, container):
    """Enhanced detection of expandable elements in cookie consent dialogs"""
    expandables = []
    
    # Common patterns for expandable elements
    selectors = [
        # Standard accordion patterns
        ".accordion-header", "[aria-expanded]", "[data-toggle='collapse']", 
        "details", "summary", ".expandable", ".collapsible",
        
        # Common cookie consent implementations
        ".cookie-category", ".consent-category", ".preference-group",
        ".consent-item", ".cookie-item", ".category-header",
        
        # Elements with + or arrows
        ".plus", ".accordion-toggle", ".arrow", ".caret"
    ]
    
    for selector in selectors:
        try:
            elements = container.find_elements(By.CSS_SELECTOR, selector)
            for element in elements:
                if element.is_displayed():
                    # Check if this might be a category header
                    text = element.text.strip()
                    if text and any(term in text.lower() for term in [
                        "cookie", "necessary", "functional", "analytics", 
                        "targeting", "performance", "advertising", "preferences"
                    ]):
                        expandables.append({
                            "element": element,
                            "text": text,
                            "type": "expandable"
                        })
        except Exception as e:
            print(f"Error finding expandables with {selector}: {e}")
    
    # Also try text-based detection
    try:
        text_patterns = [
            ".//div[contains(., 'Necessary') or contains(., 'Functional') or contains(., 'Analytics') or contains(., 'Targeting') or contains(., 'Performance')]"
        ]
        
        for pattern in text_patterns:
            elements = container.find_elements(By.XPATH, pattern)
            for element in elements:
                if element.is_displayed():
                    text = element.text.strip()
                    if len(text) < 200:  # Not too large
                        expandables.append({
                            "element": element,
                            "text": text,
                            "type": "expandable"
                        })
    except Exception as e:
        print(f"Error with text-based expandable detection: {e}")
    
    return expandables

def take_manual_screenshot(driver, base_dir="manual_screenshots"):
    """Take a manual screenshot with a timestamp"""
    os.makedirs(base_dir, exist_ok=True)
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    filename = f"manual_{timestamp}.png"
    filepath = os.path.join(base_dir, filename)
    
    driver.save_screenshot(filepath)
    print(f"Manual screenshot saved to: {filepath}")
    return filepath

def process_results_for_saving(results):
    """Remove WebElements to make results serializable to JSON"""
    if isinstance(results, dict):
        processed = {}
        for key, value in results.items():
            if key == "element":
                # Skip WebElements
                continue
            processed[key] = process_results_for_saving(value)
        return processed
    elif isinstance(results, list):
        return [process_results_for_saving(item) for item in results]
    else:
        return results

# Interactive explorer function

In [78]:
# Run the Interactive Explorer

# Select a website to explore
website = "https://www.ssrn.com"

print(f"\n{'='*60}")
print(f"STARTING SCRAPE OF {website}")
print(f"{'='*60}")
print("NOTE: You can use special commands during the exploration:")
print("  - S: Take a manual screenshot at any time")
print("  - E: Find and expand dropdown sections")
print("  - Type DONE when exploration is complete")
print(f"{'='*60}\n")

# Explore the website interactively
data, mechanisms = explore_notice_interactively(driver, website, max_depth=MAX_DEPTH)

# Display the mechanism analysis
print("\n--- Cookie Consent Mechanism Analysis ---")
for key, value in mechanisms.items():
    print(f"{key}: {value}")

# Clean data for Excel export
cleaned_data = clean_data_for_excel(data)

# Create a DataFrame
df = pd.DataFrame(cleaned_data)

# Preview the data
print(df.head())

# Save to Excel
df.to_excel(DATA_FILE, index=False)
print(f"Data saved to {DATA_FILE}")

# Optionally save structured data to JSON
try:
    with open(f"{urlparse(website).netloc}_cookie_consent_flow.json", 'w') as f:
        serializable_results = process_results_for_saving(data)
        json.dump(serializable_results, f, indent=2)
    print(f"Detailed results saved to {urlparse(website).netloc}_cookie_consent_flow.json")
except Exception as e:
    print(f"Error saving JSON data: {e}")


# # Display the mechanism analysis
# print("\n--- Cookie Consent Mechanism Analysis ---")
# for key, value in mechanisms.items():
#     print(f"{key}: {value}")

# # Clean data for Excel export
# cleaned_data = clean_data_for_excel(data)

# # Create a DataFrame
# df = pd.DataFrame(cleaned_data)

# # Preview the data
# print(df.head())

# # Save to Excel
# df.to_excel(DATA_FILE, index=False)
# print(f"Data saved to {DATA_FILE}")

# Create a hierarchical visualization of the exploration
hierarchy = decode_hierarchy(data)


STARTING SCRAPE OF https://www.ssrn.com
NOTE: You can use special commands during the exploration:
  - S: Take a manual screenshot at any time
  - E: Find and expand dropdown sections
  - Type DONE when exploration is complete

Visiting https://www.ssrn.com...
Checking if human intervention is needed...
Manual screenshot saved to: manual_screenshots/manual_20250418_155636.png
Manual screenshot taken: manual_screenshots/manual_20250418_155636.png
Initial page loaded. Screenshot taken.
Screenshot saved to: screenshots/www_ssrn_com_state_06409459_20250418_155636.png
Waiting for cookie notices to appear...
Found 8 potential cookie notice elements

--- Notice Text ---
We use cookies that are necessary to make our site work. We may also use additional cookies to analyze, improve, and personalize our content and your digital experience. For more information, see ourCookie Policy
Cookie Settings
Accept all cookies

--- Available Buttons ---
1. Cookie Settings
2. Accept all cookies

--- Availa

In [79]:
def display_hierarchy_text(node, level=0):
    """Display the hierarchy in text format"""
    indent = "  " * level
    choice = node.get("ChoiceMade", "Initial State")
    state_id = node.get("StateID", "Unknown")
    
    print(f"{indent}Step {level}: {choice} (State ID: {state_id})")
    print(f"{indent}Screenshot: {node.get('Snapshot', 'None')}")
    
    # Add notice text summary (first 100 chars)
    notice_text = node.get("Text of Notice", "")
    if notice_text:
        summary = notice_text[:100] + ("..." if len(notice_text) > 100 else "")
        print(f"{indent}Notice: {summary}")
    
    print()
    
    # Display children
    for child in node.get("children", []):
        display_hierarchy_text(child, level + 1)

# Display the hierarchy in text format
print("\n--- Exploration Hierarchy ---")
for root in hierarchy:
    display_hierarchy_text(root)

# Cleanup
driver.quit()
print("WebDriver closed successfully.")


--- Exploration Hierarchy ---
Step 0: None (State ID: 06409459)
Screenshot: screenshots/www_ssrn_com_state_06409459_20250418_155636.png
Notice: We use cookies that are necessary to make our site work. We may also use additional cookies to analy...

  Step 1: Cookie Settings (State ID: 0c07a810)
  Screenshot: screenshots/www_ssrn_com_state_0c07a810_20250418_155713.png
  Notice: We use cookies that are necessary to make our site work. We may also use additional cookies to analy...

    Step 2: Link: Cookie Policy (State ID: 8d0eac68)
    Screenshot: screenshots/www_ssrn_com_state_8d0eac68_20250418_155836.png
    Notice: Home
Elsevier Cookie Notice
Elsevier cookie notice
Last updated: 15 June 2020
Cookies and similar te...

    Step 2: Link: Cookie Policy (State ID: 8d0eac68)
    Screenshot: screenshots/www_ssrn_com_state_8d0eac68_20250418_155836.png
    Notice: Last updated: 15 June 2020
Cookies and similar technologies
This notice supplements our privacy poli...

WebDriver closed succe

In [80]:
# # Batch Processing Multiple Websites (Optional)

# def batch_scrape_websites(websites_list, headless=True):
#     """Automatically scrape multiple websites without interactive exploration"""
#     all_data = []
#     all_mechanisms = []
    
#     # Initialize new driver for batch processing
#     batch_driver = initialize_driver(headless=headless)
    
#     try:
#         for website in websites_list:
#             print(f"\nProcessing {website}...")
            
#             # Basic data for this site
#             site_data = []
#             mechanisms = analyze_cookie_mechanisms(website)
            
#             # Visit website
#             try:
#                 batch_driver.get(website)
#                 time.sleep(WAIT_TIME)  # Wait for page to load
                
#                 # Take initial screenshot
#                 state_id = str(uuid.uuid4())[:8]
#                 screenshot = take_screenshot(batch_driver, website, state_id)
                
#                 # Find notice elements
#                 notice_elements, iframe_elements = find_cookie_notice(batch_driver)
                
#                 if notice_elements:
#                     notice_details = extract_notice_details(notice_elements[0])
                    
#                     # Record initial state
#                     site_data = record_state(site_data, website, None, state_id, notice_details, screenshot)
                    
#                     # Update mechanism analysis
#                     if any("accept" in button['text'].lower() for button in notice_details['buttons']):
#                         mechanisms['has_explicit_consent'] = True
                    
#                     if any("reject" in button['text'].lower() or "decline" in button['text'].lower() for button in notice_details['buttons']):
#                         mechanisms['has_reject_option'] = True
                        
#                     if any("privacy" in link['text'].lower() or "policy" in link['text'].lower() for link in notice_details['links']):
#                         mechanisms['has_link_to_policy'] = True
                    
#                     if any("settings" in button['text'].lower() or "preferences" in button['text'].lower() or "customize" in button['text'].lower() for button in notice_details['buttons']):
#                         mechanisms['has_granular_options'] = True
                    
#                     # Detect potential dark patterns
#                     mechanisms['dark_patterns'] = detect_dark_patterns(notice_details)
                    
#                     # Try clicking the first button (usually Accept)
#                     if notice_details['buttons']:
#                         try:
#                             first_button = notice_details['buttons'][0]
#                             print(f"Automatically clicking: {first_button['text']}")
#                             first_button['element'].click()
#                             #time.sleep(WAIT_TIME)
#                             time.sleep(random.uniform(2, 5))
                            
#                             # Take after-click screenshot
#                             after_state_id = str(uuid.uuid4())[:8]
#                             after_screenshot = take_screenshot(batch_driver, website, after_state_id)
                            
#                             # Check for any remaining notices
#                             after_notices, _ = find_cookie_notice(batch_driver)
#                             if after_notices:
#                                 after_details = extract_notice_details(after_notices[0])
#                                 site_data = record_state(site_data, website, state_id, after_state_id, 
#                                                         after_details, after_screenshot, first_button['text'])
#                         except Exception as e:
#                             print(f"Error clicking first button: {e}")
#                 elif iframe_elements:
#                     notice_details = switch_to_iframe_and_extract(batch_driver, iframe_elements[0])
#                     site_data = record_state(site_data, website, None, state_id, notice_details, screenshot)
#                 else:
#                     print("No cookie notice found")
#                     notice_details = {"text": "No cookie notice detected", "buttons": [], "links": []}
#                     site_data = record_state(site_data, website, None, state_id, notice_details, screenshot)
                
#                 # Add data to the global collection
#                 all_data.extend(site_data)
#                 all_mechanisms.append(mechanisms)
                
#             except Exception as e:
#                 print(f"Error processing {website}: {e}")
        
#         return all_data, all_mechanisms
    
#     finally:
#         # Clean up
#         batch_driver.quit()

# # To run batch processing (uncomment to use)
# # batch_data, batch_mechanisms = batch_scrape_websites(websites)
# # batch_df = pd.DataFrame(clean_data_for_excel(batch_data))
# # batch_df.to_excel("batch_" + DATA_FILE, index=False)