In [None]:
import os
import time
import uuid
import pandas as pd
from datetime import datetime
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, StaleElementReferenceException, NoSuchElementException
from webdriver_manager.chrome import ChromeDriverManager
from IPython.display import display, Image, HTML
import re
from urllib.parse import urlparse
import json
from typing import List, Dict, Optional, Tuple
import logging
import requests
import random

In [None]:
# --- Scraper settings -------------------------------------------------------
# Output locations
SCREENSHOTS_DIR = "screenshots"  # folder that receives page screenshots
DATA_FILE = "privacy_notice_data.xlsx"  # presumably the output workbook; not used in this cell

# Crawl limits and timing
MAX_DEPTH = 3  # deepest click-through level navigate_and_capture will follow
WAIT_TIME = 5  # seconds to pause after a click before inspecting the page
BANNER_WAIT_TIME = 10  # seconds; banners tend to be injected late, so allow longer

# --- Logging ----------------------------------------------------------------
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Make sure the screenshot folder is there before anything writes into it
os.makedirs(SCREENSHOTS_DIR, exist_ok=True)

In [None]:
class ImprovedPrivacyScraper:
    """Enhanced privacy policy scraper with better detection capabilities"""
    
    def __init__(self, headless=False):
        self.driver = self.initialize_driver(headless)
        self.data = []
        self.current_website = None
        
    def initialize_driver(self, headless=False):
        """Create a Chrome WebDriver configured to look less like an automated browser.

        Args:
            headless: When true, Chrome is started with the ``--headless`` flag.

        Returns:
            The ready-to-use ``webdriver.Chrome`` instance.
        """
        chrome_options = Options()

        if headless:
            chrome_options.add_argument("--headless")

        # Anti-detection and stability switches
        chrome_options.add_argument("--disable-blink-features=AutomationControlled")
        chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
        chrome_options.add_experimental_option('useAutomationExtension', False)
        chrome_options.add_argument("--disable-gpu")
        chrome_options.add_argument("--window-size=1920,1080")
        chrome_options.add_argument("--no-sandbox")
        chrome_options.add_argument("--disable-dev-shm-usage")

        # Pick one user agent per driver instance.
        # NOTE(review): the last entry is a Firefox UA served from Chrome, which
        # may itself look inconsistent to fingerprinting scripts - confirm intent.
        user_agents = [
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36",
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:122.0) Gecko/20100101 Firefox/122.0"
        ]
        # `random` is already imported at the top of the notebook.
        chrome_options.add_argument(f"user-agent={random.choice(user_agents)}")

        # Initialize driver
        service = Service(ChromeDriverManager().install())
        driver = webdriver.Chrome(service=service, options=chrome_options)

        # Hide navigator.webdriver. A plain execute_script() only patches the
        # document that is open right now, so the override would be lost on the
        # first driver.get(). Registering the script through the DevTools
        # protocol makes Chrome run it at the start of every new document.
        mask_script = "Object.defineProperty(navigator, 'webdriver', {get: () => undefined})"
        try:
            driver.execute_cdp_cmd(
                "Page.addScriptToEvaluateOnNewDocument",
                {"source": mask_script},
            )
        except Exception as e:
            # Best effort: fall back to the one-shot injection used previously.
            logger.warning(f"CDP script injection unavailable, falling back to execute_script: {e}")
            driver.execute_script(mask_script)

        return driver
    
    def wait_for_consent_banner(self, timeout=BANNER_WAIT_TIME):
        """Poll the current page until something that looks like a consent banner appears.

        Three detection strategies are tried on every polling pass, in order:

        1. Known consent-management-platform markers (ids, classes, attributes)
           for which at least one matching element is displayed.
        2. Iframes whose id/class/src/name hint at consent content (same rule
           as ``find_consent_elements``, via ``_is_consent_iframe``).
        3. Fixed-position elements taller than 50px with more than 50 characters
           of text that mention consent-related words.

        Args:
            timeout: Upper bound, in seconds, on how long to keep polling. A
                value of 0 or less performs exactly one pass.

        Returns:
            True as soon as any strategy matches; False once the deadline has
            passed without a match.
        """
        logger.info(f"Waiting up to {timeout}s for consent banner to appear...")

        cmp_indicators = [
            # OneTrust
            ("id", "onetrust-banner-sdk"),
            ("id", "onetrust-consent-sdk"),
            ("class", "ot-sdk-container"),

            # Cookiebot
            ("id", "CybotCookiebotDialog"),
            ("class", "CybotCookiebotDialog"),

            # TrustArc/TRUSTe
            ("id", "truste-consent-track"),
            ("class", "truste-banner"),

            # Quantcast
            ("class", "qc-cmp2-container"),
            ("class", "qc-cmp-ui-container"),

            # Didomi
            ("id", "didomi-popup"),
            ("class", "didomi-popup-container"),

            # Usercentrics
            ("id", "usercentrics-root"),
            ("data-testid", "uc-container"),

            # Generic patterns
            ("class", "cookie-consent"),
            ("class", "cookie-banner"),
            ("class", "consent-banner"),
            ("class", "privacy-banner"),
            ("class", "gdpr-banner"),
            ("role", "dialog"),  # Many consent banners use role="dialog"
        ]

        # How each indicator kind is turned into a Selenium locator.
        locator_builders = {
            "id": lambda value: (By.ID, value),
            "class": lambda value: (By.CLASS_NAME, value),
            "data-testid": lambda value: (By.CSS_SELECTOR, f"[data-testid='{value}']"),
            "role": lambda value: (By.CSS_SELECTOR, f"[role='{value}']"),
        }

        fixed_elements_script = """
                return Array.from(document.querySelectorAll('*')).filter(el => {
                    const style = window.getComputedStyle(el);
                    return style.position === 'fixed' && 
                           el.offsetHeight > 50 && 
                           el.innerText && 
                           el.innerText.length > 50;
                });
            """
        fixed_text_keywords = ['cookie', 'consent', 'privacy', 'accept', 'decline']

        poll_interval = 0.5  # seconds between passes
        deadline = time.monotonic() + max(timeout, 0)
        reported_errors = set()  # log each failing strategy once, not on every pass

        while True:
            # Strategy 1: known consent management platform markers
            for attr_type, attr_value in cmp_indicators:
                locator = locator_builders[attr_type](attr_value)
                try:
                    candidates = self.driver.find_elements(*locator)
                    if any(candidate.is_displayed() for candidate in candidates):
                        logger.info(f"Found consent banner via {attr_type}='{attr_value}'")
                        return True
                except StaleElementReferenceException:
                    # The node was replaced while we looked at it; the next
                    # pass will see the fresh one.
                    continue

            # Strategy 2: iframes that might contain consent banners
            try:
                for iframe in self.driver.find_elements(By.TAG_NAME, "iframe"):
                    iframe_attrs = self._get_iframe_info(iframe)
                    if self._is_consent_iframe(iframe_attrs):
                        logger.info(f"Found potential consent iframe: {iframe_attrs}")
                        return True
            except Exception as e:
                if "iframes" not in reported_errors:
                    reported_errors.add("iframes")
                    logger.error(f"Error checking iframes: {e}")

            # Strategy 3: fixed position elements (many banners are fixed)
            try:
                fixed_elements = self.driver.execute_script(fixed_elements_script)
                for element in fixed_elements or []:
                    text = element.text.lower()
                    if any(keyword in text for keyword in fixed_text_keywords):
                        logger.info("Found fixed position consent element")
                        return True
            except Exception as e:
                if "fixed" not in reported_errors:
                    reported_errors.add("fixed")
                    logger.error(f"Error checking fixed elements: {e}")

            if time.monotonic() >= deadline:
                return False
            time.sleep(poll_interval)
    
    def find_consent_elements(self):
        """Locate consent banners on the current page plus the controls inside them.

        Returns:
            A dict with the keys "banners", "iframes", "buttons", "links" and
            "toggles", each holding a list of WebElements. Every DOM node is
            listed at most once per category, even when several selectors
            (or several nested banners) match it.
        """
        consent_elements = {
            "banners": [],
            "iframes": [],
            "buttons": [],
            "links": [],
            "toggles": []
        }

        # Selenium's internal element ids already stored, per category. Many of
        # the selectors below overlap, so the same node is routinely returned
        # more than once.
        seen_ids = {category: set() for category in consent_elements}

        def add_unique(category, element):
            """Store `element` under `category` unless that node is already there."""
            element_id = element.id
            if element_id in seen_ids[category]:
                return False
            seen_ids[category].add(element_id)
            consent_elements[category].append(element)
            return True

        # Enhanced selectors for consent banners
        banner_selectors = [
            # ID-based selectors
            "[id*='cookie'][id*='banner']",
            "[id*='cookie'][id*='consent']",
            "[id*='privacy'][id*='banner']",
            "[id*='gdpr']",
            "[id*='consent']",
            "#onetrust-banner-sdk",
            "#CybotCookiebotDialog",
            "#didomi-popup",
            "#usercentrics-root",

            # Class-based selectors
            "[class*='cookie-banner']",
            "[class*='consent-banner']",
            "[class*='privacy-banner']",
            "[class*='gdpr-banner']",
            ".qc-cmp2-container",
            ".ot-sdk-container",

            # Role-based selectors
            "[role='dialog'][aria-label*='cookie']",
            "[role='dialog'][aria-label*='consent']",
            "[role='dialog'][aria-label*='privacy']",

            # Data attribute selectors
            "[data-testid*='cookie']",
            "[data-testid*='consent']",
            "[data-component*='cookie']",
            "[data-component*='consent']"
        ]
        banner_keywords = ['cookie', 'privacy', 'consent', 'data', 'accept']

        button_selectors = [
            "button",
            "[role='button']",
            "a[href='#']",  # Links acting as buttons
            "[class*='btn']",
            "[class*='button']",
            "input[type='button']",
            "input[type='submit']"
        ]
        toggle_selectors = [
            "input[type='checkbox']",
            "input[type='switch']",
            "[role='switch']",
            "[role='checkbox']"
        ]

        # Find banner elements: visible, taller than 50px, with consent-like text
        for selector in banner_selectors:
            try:
                for element in self.driver.find_elements(By.CSS_SELECTOR, selector):
                    if not (element.is_displayed() and element.size['height'] > 50):
                        continue
                    text = element.text.lower()
                    if any(keyword in text for keyword in banner_keywords):
                        if add_unique("banners", element):
                            logger.info(f"Found banner via selector: {selector}")
            except Exception as e:
                # Best effort: one bad selector or stale node must not stop the scan.
                logger.debug(f"Banner selector {selector!r} failed: {e}")
                continue

        # Find iframes that look consent-related
        try:
            for iframe in self.driver.find_elements(By.TAG_NAME, "iframe"):
                if self._is_consent_iframe(self._get_iframe_info(iframe)):
                    add_unique("iframes", iframe)
        except Exception as e:
            logger.debug(f"Iframe scan failed: {e}")

        # Find all interactive elements within banners
        for banner in consent_elements["banners"]:
            try:
                for selector in button_selectors:
                    for button in banner.find_elements(By.CSS_SELECTOR, selector):
                        if button.is_displayed():
                            add_unique("buttons", button)

                for link in banner.find_elements(By.TAG_NAME, "a"):
                    if link.is_displayed() and link.get_attribute("href"):
                        add_unique("links", link)

                # Toggles are kept even when hidden, as before.
                for selector in toggle_selectors:
                    for toggle in banner.find_elements(By.CSS_SELECTOR, selector):
                        add_unique("toggles", toggle)

            except Exception as e:
                logger.error(f"Error extracting elements from banner: {e}")

        return consent_elements
    
    def _get_iframe_info(self, iframe):
        """Get iframe attributes"""
        return {
            "id": iframe.get_attribute("id") or "",
            "class": iframe.get_attribute("class") or "",
            "src": iframe.get_attribute("src") or "",
            "name": iframe.get_attribute("name") or ""
        }
    
    def _is_consent_iframe(self, iframe_info):
        """Check if iframe likely contains consent content"""
        consent_keywords = ['consent', 'cookie', 'privacy', 'gdpr', 'ccpa', 'cmp', 'notice', 'onetrust', 'cookiebot', 'didomi', 'usercentrics', 'quantcast', 'trustarc']
        iframe_str = str(iframe_info).lower()
        return any(keyword in iframe_str for keyword in consent_keywords)
    
    def extract_banner_content(self, consent_elements):
        """Extract comprehensive content from consent elements"""
        content = {
            "banner_text": "",
            "buttons": [],
            "links": [],
            "toggles": [],
            "has_reject_option": False,
            "has_accept_all": False,
            "has_customize": False
        }
        
        # If we have iframes but no banner elements, process iframe first
        if not consent_elements["banners"] and consent_elements["iframes"]:
            logger.info("Processing iframe content as primary banner")
            iframe_content = self.process_iframe_content(consent_elements["iframes"][0])
            return iframe_content
        
        # Extract banner text
        banner_texts = []
        for banner in consent_elements["banners"]:
            try:
                # Get text but exclude button text to avoid duplication
                banner_text = banner.text
                # Remove button texts from banner text
                for button in consent_elements["buttons"]:
                    button_text = button.text
                    if button_text:
                        banner_text = banner_text.replace(button_text, "")
                banner_texts.append(banner_text.strip())
            except Exception:
                continue
        
        content["banner_text"] = " ".join(banner_texts)
        
        # Extract button information
        for button in consent_elements["buttons"]:
            try:
                button_info = {
                    "text": button.text.strip(),
                    "type": self._classify_button(button.text),
                    "element": button,
                    "classes": button.get_attribute("class") or "",
                    "id": button.get_attribute("id") or ""
                }
                
                if button_info["text"]:  # Only add if has text
                    content["buttons"].append(button_info)
                    
                    # Check for specific button types
                    if button_info["type"] == "reject":
                        content["has_reject_option"] = True
                    elif button_info["type"] == "accept_all":
                        content["has_accept_all"] = True
                    elif button_info["type"] == "customize":
                        content["has_customize"] = True
                        
            except Exception as e:
                logger.error(f"Error extracting button: {e}")
        
        # Extract links
        for link in consent_elements["links"]:
            try:
                link_info = {
                    "text": link.text.strip(),
                    "href": link.get_attribute("href"),
                    "element": link
                }
                if link_info["text"] and link_info["href"]:
                    content["links"].append(link_info)
            except Exception:
                continue
        
        # Extract toggle states
        for toggle in consent_elements["toggles"]:
            try:
                # Find associated label
                toggle_id = toggle.get_attribute("id")
                label_text = ""
                if toggle_id:
                    try:
                        label = self.driver.find_element(By.CSS_SELECTOR, f"label[for='{toggle_id}']")
                        label_text = label.text.strip()
                    except:
                        # Try parent element
                        parent = toggle.find_element(By.XPATH, "..")
                        label_text = parent.text.strip()
                
                toggle_info = {
                    "label": label_text,
                    "checked": toggle.is_selected(),
                    "enabled": toggle.is_enabled(),
                    "element": toggle
                }
                content["toggles"].append(toggle_info)
            except Exception:
                continue
        
        # Update classification based on all found content
        self._update_content_classification(content)
        
        return content
    
    def _update_content_classification(self, content):
        """Update content classification flags"""
        # Re-check all buttons for classification
        for button in content.get("buttons", []):
            button_type = button.get("type", "")
            if button_type == "reject":
                content["has_reject_option"] = True
            elif button_type == "accept_all":
                content["has_accept_all"] = True
            elif button_type == "customize":
                content["has_customize"] = True
    
    def _classify_button(self, button_text):
        """Classify button type based on text"""
        text_lower = button_text.lower()
        
        # Reject patterns - INCLUDING "decline"
        reject_patterns = ['reject', 'decline', 'deny', 'refuse', 'no thanks', 'not now', "don't accept", 'opt out', 'disagree']
        if any(pattern in text_lower for pattern in reject_patterns):
            return "reject"
        
        # Accept all patterns
        accept_all_patterns = ['accept all', 'agree all', 'allow all', 'accept cookies', 'i agree', 'got it', 'ok', 'i accept']
        if any(pattern in text_lower for pattern in accept_all_patterns):
            return "accept_all"
        
        # Customize patterns
        customize_patterns = ['customize', 'manage', 'settings', 'preferences', 'more options', 'choose', 'select', 'configure']
        if any(pattern in text_lower for pattern in customize_patterns):
            return "customize"
        
        # Simple accept
        if 'accept' in text_lower or 'allow' in text_lower or 'agree' in text_lower:
            return "accept"
        
        return "other"
    
    def process_iframe_content(self, iframe):
        """Extract banner text, buttons and links from inside a consent iframe.

        The driver is switched into ``iframe`` for the extraction and always
        switched back to the top-level document afterwards.

        Args:
            iframe: The iframe WebElement to inspect.

        Returns:
            A dict with "banner_text", "buttons", "links", "toggles" and the
            flags "has_reject_option", "has_accept_all", "has_customize"
            (derived from the buttons found). On an extraction error the
            partially filled dict is returned and the error is logged.

        Note:
            The "element" references were located while inside the iframe, and
            the driver has been switched back out by the time this returns.
            NOTE(review): callers presumably need to switch into the frame
            again before interacting with them - confirm against Selenium docs.
        """
        content = {
            "banner_text": "",
            "buttons": [],
            "links": [],
            "toggles": [],
            # Callers read these keys directly, so they must always exist.
            "has_reject_option": False,
            "has_accept_all": False,
            "has_customize": False
        }

        try:
            # Switch to iframe
            self.driver.switch_to.frame(iframe)
            time.sleep(1)  # Wait for content to load

            # Extract all visible text
            body = self.driver.find_element(By.TAG_NAME, "body")
            content["banner_text"] = body.text

            # Find buttons in iframe
            buttons = self.driver.find_elements(By.CSS_SELECTOR, "button, [role='button'], a[href='#']")
            for button in buttons:
                if not button.is_displayed():
                    continue
                raw_text = button.text  # read once; each access is a round trip
                caption = raw_text.strip()
                if caption:
                    content["buttons"].append({
                        "text": caption,
                        "type": self._classify_button(raw_text),
                        "element": button
                    })

            # Find links in iframe
            links = self.driver.find_elements(By.CSS_SELECTOR, "a[href]:not([href='#'])")
            for link in links:
                if not link.is_displayed():
                    continue
                link_text = link.text.strip()
                if link_text:
                    content["links"].append({
                        "text": link_text,
                        "href": link.get_attribute("href"),
                        "element": link
                    })

        except Exception as e:
            logger.error(f"Error processing iframe: {e}")
        finally:
            # Always switch back to main content
            self.driver.switch_to.default_content()

        # Derive the has_* flags from whatever buttons were collected.
        self._update_content_classification(content)

        return content
    
    def take_screenshot(self, state_id=None):
        """Take screenshot with proper naming"""
        domain = urlparse(self.current_website).netloc.replace('.', '_')
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        state_suffix = f"_state_{state_id}" if state_id else ""
        filename = f"{domain}{state_suffix}_{timestamp}.png"
        filepath = os.path.join(SCREENSHOTS_DIR, filename)
        
        self.driver.save_screenshot(filepath)
        return filepath
    
    def navigate_and_capture(self, element, parent_state_id, choice_text, depth=0):
        """Click ``element``, then screenshot and record whatever state results.

        If the resulting URL looks like a policy page, the policy text is
        recorded. Otherwise the page is searched for consent elements, and if
        any are found they are recorded and the first "customize" button is
        followed recursively.

        Args:
            element: The WebElement to click.
            parent_state_id: State id recorded as the parent of the new state.
            choice_text: Caption of the clicked element, stored with the row.
            depth: Current recursion depth; nothing happens once it reaches
                ``MAX_DEPTH``.

        Returns:
            None. Any exception is logged and swallowed.
        """
        if depth >= MAX_DEPTH:
            return

        new_state_id = str(uuid.uuid4())[:8]

        try:
            # Click the element
            self.driver.execute_script("arguments[0].scrollIntoView(true);", element)
            time.sleep(0.5)
            element.click()
            time.sleep(WAIT_TIME)

            # Take screenshot
            screenshot_path = self.take_screenshot(new_state_id)

            # Check what type of page we're on
            current_url = self.driver.current_url.lower()
            is_policy_page = any(keyword in current_url for keyword in ['privacy', 'policy', 'cookie', 'terms'])

            if is_policy_page:
                # Extract full policy content
                policy_content = self.extract_policy_content()
                self.record_state(parent_state_id, new_state_id, policy_content, screenshot_path, choice_text, depth)
                return

            # Look for consent elements on the new page
            consent_elements = self.find_consent_elements()
            if not (consent_elements["banners"] or consent_elements["iframes"]):
                return

            content = self.extract_banner_content(consent_elements)
            self.record_state(parent_state_id, new_state_id, content, screenshot_path, choice_text, depth)

            # Continue exploring the first customize option, if there is one.
            # Looking the button up directly (instead of indexing
            # content["has_customize"]) also works for iframe-derived content,
            # which may not carry the has_* keys.
            customize_button = next(
                (b for b in content.get("buttons", []) if b.get("type") == "customize"),
                None,
            )
            if customize_button is not None:
                self.navigate_and_capture(
                    customize_button["element"], new_state_id, customize_button["text"], depth + 1
                )

        except Exception as e:
            logger.error(f"Error navigating to {choice_text}: {e}")
    
    def extract_policy_content(self):
        """Extract content from privacy policy page"""
        content = {
            "banner_text": "",
            "buttons": [],
            "links": [],
            "is_policy_page": True
        }
        
        # Try to find main content area
        content_selectors = [
            "main", "article", ".content", "#content", 
            ".privacy-policy", ".cookie-policy", ".policy-content",
            "[role='main']", ".main-content"
        ]
        
        for selector in content_selectors:
            try:
                main_content = self.driver.find_element(By.CSS_SELECTOR, selector)
                if main_content and len(main_content.text) > 500:
                    content["banner_text"] = main_content.text
                    
                    # Extract links within policy
                    links = main_content.find_elements(By.TAG_NAME, "a")
                    for link in links:
                        if link.is_displayed() and link.text.strip():
                            content["links"].append({
                                "text": link.text.strip(),
                                "href": link.get_attribute("href")
                            })
                    break
            except:
                continue
        
        # Fallback to body if no specific content area found
        if not content["banner_text"]:
            try:
                body = self.driver.find_element(By.TAG_NAME, "body")
                content["banner_text"] = body.text
            except:
                content["banner_text"] = "Could not extract policy content"
        
        return content
    
    def record_state(self, parent_id, state_id, content, screenshot_path, choice_made, depth):
        """Record state data in structured format"""
        # Clean the banner text - remove excess whitespace and newlines
        clean_text = " ".join(content.get("banner_text", "").split())
        
        # Format buttons and links for Excel
        buttons_text = "; ".join([b["text"] for b in content.get("buttons", [])])
        
        # For policy pages, don't extract new links - keep the original banner links
        if content.get("is_policy_page", False):
            # Don't add policy page links to the record
            links_data = {}
            for i in range(1, 6):
                links_data[f"Link {i}"] = None
                links_data[f"Link {i} Detail"] = None
        else:
            # For banner states, record the links
            links_data = {}
            for i, link in enumerate(content.get("links", [])[:5], 1):
                links_data[f"Link {i}"] = link["text"]
                links_data[f"Link {i} Detail"] = link.get("href", "")
            
            # Fill remaining link columns with None
            for i in range(len(content.get("links", [])) + 1, 6):
                links_data[f"Link {i}"] = None
                links_data[f"Link {i} Detail"] = None
        
        row = {
            "Website": self.current_website,
            "ParentID": parent_id,
            "StateID": state_id,
            "Depth": depth,
            "Timestamp": datetime.now().isoformat(),
            "Screenshot": screenshot_path,
            "Text of Notice": clean_text[:1000] + "..." if len(clean_text) > 1000 else clean_text,  # Limit text length
            "All Buttons": buttons_text,
            "Has Reject Option": content.get("has_reject_option", False),
            "Has Accept All": content.get("has_accept_all", False),
            "Has Customize": content.get("has_customize", False),
            "Choice Made": choice_made,
            "Is Policy Page": content.get("is_policy_page", False),
            **links_data
        }
        
        self.data.append(row)
    
    def scrape_website(self, website, interactive=False):
        """Main method to scrape a website"""
        self.current_website = website
        logger.info(f"Scraping {website}")
        
        try:
            # Navigate to website
            self.driver.get(website)
            
            # Wait for consent banner
            banner_found = self.wait_for_consent_banner()
            
            if not banner_found:
                logger.warning("No consent banner detected")
                # Still take a screenshot
                screenshot = self.take_screenshot("no_banner")
                self.record_state(None, "no_banner", 
                                {"banner_text": "No consent banner detected"}, 
                                screenshot, None, 0)
                return
            
            # Find consent elements
            consent_elements = self.find_consent_elements()
            
            # Process main content
            if consent_elements["banners"]:
                content = self.extract_banner_content(consent_elements)
            elif consent_elements["iframes"]:
                # Process iframe content
                content = self.process_iframe_content(consent_elements["iframes"][0])
            else:
                content = {"banner_text": "Banner detected but could not extract content"}
            
            # Record initial state
            root_state_id = str(uuid.uuid4())[:8]
            screenshot = self.take_screenshot(root_state_id)
            self.record_state(None, root_state_id, content, screenshot, None, 0)
            
            if interactive:
                # Interactive mode - let user choose what to click
                self._interactive_exploration(content, root_state_id)
            else:
                # Automated mode - explore all main options
                self._automated_exploration(content, root_state_id)
                
        except Exception as e:
            logger.error(f"Error scraping {website}: {e}")
            # Still record the error state
            screenshot = self.take_screenshot("error")
            self.record_state(None, "error", 
                            {"banner_text": f"Error: {str(e)}"}, 
                            screenshot, None, 0)
    
    def _automated_exploration(self, content, parent_state_id):
        """Smart hybrid automated exploration of consent options"""
        original_url = self.driver.current_url
        
        # Store the original banner links (not policy page links)
        original_links = content.get("links", []).copy()
        
        # Categorize elements for smart exploration
        customize_buttons = []
        action_buttons = []  # Accept/Decline
        policy_links = []
        
        # Categorize buttons
        for button in content.get("buttons", []):
            if button["type"] == "customize":
                customize_buttons.append(button)
            elif button["type"] in ["accept_all", "reject", "accept"]:
                action_buttons.append(button)
        
        # Use the original banner links
        policy_links = original_links
        
        # 1. First explore customize/manage options (depth-first)
        for button in customize_buttons:
            logger.info(f"Exploring customize option: {button['text']}")
            self._explore_customize_flow(button, parent_state_id, original_url)
        
        # 2. Then capture policy links
        for link in policy_links:
            logger.info(f"Capturing policy link: {link['text']}")
            self._capture_policy_page(link, parent_state_id, original_url)
        
        # 3. Finally capture accept/decline end states
        # Note: We do these last because they might close the banner permanently
        for button in action_buttons:
            logger.info(f"Capturing action button result: {button['text']}")
            self._capture_action_result(button, parent_state_id, original_url)
    
    def _explore_customize_flow(self, button, parent_state_id, original_url):
        """Click a customize/manage button and record the panel it opens.

        The button is located again by its visible text, first among the
        buttons returned by ``find_consent_elements`` and then inside each
        consent iframe. After a successful click the resulting panel is
        screenshotted, extracted and recorded as a child of
        ``parent_state_id``. If the panel offers a save/confirm style button,
        that button is clicked too and the outcome is recorded as a child of
        the panel state. The original page is reloaded at the end in all cases.

        Args:
            button: Mapping describing the button; only ``button["text"]`` is read.
            parent_state_id: State ID under which the panel state is recorded.
            original_url: URL reloaded once the exploration is finished.

        Returns:
            None. Results are stored through ``self.record_state``.
        """
        new_state_id = str(uuid.uuid4())[:8]

        try:
            clicked = False
            consent_elements = self.find_consent_elements()

            # Try the main document first.
            for btn in consent_elements["buttons"]:
                if btn.text.strip() == button["text"] and self._click_element_safely(btn):
                    clicked = True
                    break

            # Fall back to each consent iframe.
            if not clicked and consent_elements["iframes"]:
                for iframe in consent_elements["iframes"]:
                    try:
                        self.driver.switch_to.frame(iframe)
                        logger.info("Switched to iframe for customize button")

                        iframe_buttons = self.driver.find_elements(By.CSS_SELECTOR, "button, [role='button']")
                        for btn in iframe_buttons:
                            if btn.text.strip() == button["text"] and self._click_element_safely(btn):
                                clicked = True
                                logger.info(f"Successfully clicked {button['text']} in iframe")
                                break

                        if clicked:
                            # Stay inside the iframe: the customize panel may be
                            # rendered there and is extracted below.
                            break
                        self.driver.switch_to.default_content()
                    except Exception as e:
                        logger.error(f"Error clicking in iframe: {e}")
                        self.driver.switch_to.default_content()

            if not clicked:
                logger.error(f"Could not click customize button: {button['text']}")
                return

            time.sleep(WAIT_TIME)

            # Capture the customize panel (possibly still inside the iframe).
            screenshot_path = self.take_screenshot(new_state_id)
            customize_content = self._extract_customize_panel_content()
            self.record_state(parent_state_id, new_state_id, customize_content,
                              screenshot_path, button["text"], 1)

            if customize_content.get("cookie_categories"):
                self._explore_cookie_categories(customize_content, new_state_id)

            save_buttons = [b for b in customize_content.get("buttons", [])
                            if self._is_save_button_text(b["text"])]

            save_button = save_buttons[0] if save_buttons else None
            save_clicked = False
            if save_button is not None:
                logger.info(f"Clicking save button: {save_button['text']}")
                save_clicked = self._click_element_safely(save_button["element"])
                if save_clicked:
                    time.sleep(WAIT_TIME)

            # Leave any iframe before inspecting the top-level page.
            try:
                self.driver.switch_to.default_content()
            except Exception as e:
                logger.debug(f"Could not switch to default content: {e}")

            if save_button is not None:
                if save_clicked:
                    save_state_id = str(uuid.uuid4())[:8]
                    save_screenshot = self.take_screenshot(save_state_id)
                    result_content = {"banner_text": "Preferences saved", "buttons": [], "links": []}

                    # A URL change means the save action navigated somewhere else.
                    if self.driver.current_url != original_url:
                        result_content = self.extract_policy_content()

                    self.record_state(new_state_id, save_state_id, result_content,
                                      save_screenshot, save_button["text"], 2)
                else:
                    # Do not record a "Preferences saved" state for a click
                    # that never happened.
                    logger.warning(f"Could not click save button: {save_button['text']}")

        except Exception as e:
            logger.error(f"Error exploring customize flow: {e}")
            try:
                self.driver.switch_to.default_content()
            except Exception as switch_error:
                logger.debug(f"Could not switch to default content: {switch_error}")
        finally:
            # Return to the original page so the next exploration starts clean.
            try:
                self.driver.get(original_url)
                time.sleep(WAIT_TIME)
                self.wait_for_consent_banner(timeout=5)
            except Exception as e:
                logger.warning(f"Could not return to {original_url}: {e}")

    @staticmethod
    def _is_save_button_text(text):
        """Return True when a button label reads like a save/confirm action.

        "ok"/"okay" must appear as a whole word: a plain substring test also
        fires on every label containing "cookie" (e.g. "Reject all cookies").
        """
        label = text.lower()
        if any(phrase in label for phrase in ("save", "confirm", "apply", "accept preferences")):
            return True
        return re.search(r"\bok(?:ay)?\b", label) is not None
    
    def _click_element_safely(self, element):
        """Safely click an element with multiple strategies"""
        try:
            # Strategy 1: Direct click
            element.click()
            return True
        except:
            try:
                # Strategy 2: JavaScript click
                self.driver.execute_script("arguments[0].click();", element)
                return True
            except:
                try:
                    # Strategy 3: Scroll and click
                    self.driver.execute_script("arguments[0].scrollIntoView(true);", element)
                    time.sleep(0.5)
                    element.click()
                    return True
                except:
                    return False
    
    def _extract_customize_panel_content(self):
        """Extract text, buttons and cookie categories from the preference panel.

        The first displayed element matching one of the known preference-panel
        selectors (tried in list order) is used as the panel. When none is
        displayed, the ``<body>`` element found by the driver is used instead.

        Returns:
            dict with the keys:
                "banner_text": ``panel.text``.
                "buttons": list of ``{"text", "type", "element"}`` dicts, one
                    per displayed button with non-empty text.
                "links": always an empty list; links are not collected here.
                "cookie_categories": non-empty results of
                    ``_extract_category_info``.
                "has_granular_control": True when at least one category was found.
        """
        content = {
            "banner_text": "",
            "buttons": [],
            "links": [],
            "cookie_categories": [],
            "has_granular_control": False
        }

        # Look for cookie preference panels
        panel_selectors = [
            ".cookie-settings", ".cookie-preferences", ".privacy-settings",
            "[class*='preference-center']", "[class*='cookie-modal']",
            ".ot-pc-content", ".cmp-settings", "#gdpr-settings"
        ]

        panel = None
        for selector in panel_selectors:
            try:
                candidates = self.driver.find_elements(By.CSS_SELECTOR, selector)
                panel = next((c for c in candidates if c.is_displayed()), None)
            except Exception:
                continue
            if panel is not None:
                break

        if panel is None:
            # No dedicated panel is visible: fall back to the whole page body.
            panel = self.driver.find_element(By.TAG_NAME, "body")

        content["banner_text"] = panel.text

        # Find all buttons in the panel
        for button in panel.find_elements(By.CSS_SELECTOR, "button, [role='button'], a[href='#']"):
            try:
                if not button.is_displayed():
                    continue
                raw_text = button.text  # read once; each access is a driver call
            except StaleElementReferenceException:
                # The node was replaced while the panel was being read; skip
                # it rather than abandoning the whole extraction.
                continue
            label = raw_text.strip()
            if label:
                content["buttons"].append({
                    "text": label,
                    "type": self._classify_button(raw_text),
                    "element": button
                })

        # Find cookie categories with toggles
        category_selectors = [
            ".cookie-category", ".consent-category", ".cookie-group",
            "[class*='category-item']", ".ot-cat-item", ".cmp-category"
        ]

        # One query with a selector list yields each matching element once,
        # so an element carrying two of these classes is not recorded twice.
        for category in panel.find_elements(By.CSS_SELECTOR, ", ".join(category_selectors)):
            try:
                cat_info = self._extract_category_info(category)
            except Exception:
                continue
            if cat_info:
                content["cookie_categories"].append(cat_info)

        if content["cookie_categories"]:
            content["has_granular_control"] = True

        return content
    
    def _extract_category_info(self, category_element):
        """Extract name, description and toggle state of one cookie category.

        Args:
            category_element: Element wrapping a single category. It is
                searched with ``find_element`` for headings, description text
                and a toggle control.

        Returns:
            dict with the keys "name", "description", "is_required",
            "default_state" and "has_toggle", or None when no non-empty name
            could be found.
        """
        info = {
            "name": "",
            "description": "",
            "is_required": False,
            "default_state": False,
            "has_toggle": False
        }

        def first_text(selectors):
            """Return the stripped text of the first selector giving non-empty text."""
            for selector in selectors:
                try:
                    text = category_element.find_element(By.CSS_SELECTOR, selector).text.strip()
                except Exception:
                    continue
                if text:
                    return text
            return ""

        info["name"] = first_text(["h1", "h2", "h3", "h4", "h5", ".category-name", ".title"])
        info["description"] = first_text(["p", ".description", ".category-description"])

        # Check for toggle
        toggle_selectors = ["input[type='checkbox']", "input[type='switch']", "[role='switch']"]
        for selector in toggle_selectors:
            try:
                toggle = category_element.find_element(By.CSS_SELECTOR, selector)
                info["has_toggle"] = True
                if toggle.tag_name.lower() == "input":
                    info["default_state"] = toggle.is_selected()
                else:
                    # ARIA switches built from non-input elements expose their
                    # state through aria-checked rather than a native
                    # "selected" state.
                    info["default_state"] = toggle.get_attribute("aria-checked") == "true"
                # Disabled = required
                info["is_required"] = (
                    not toggle.is_enabled()
                    or toggle.get_attribute("aria-disabled") == "true"
                )
                break
            except Exception:
                continue

        # Check if it's marked as required/necessary
        lowered_name = info["name"].lower()
        if any(word in lowered_name for word in ("necessary", "essential", "required")):
            info["is_required"] = True

        return info if info["name"] else None
    
    def _explore_cookie_categories(self, customize_content, parent_state_id):
        """Document the state of cookie categories"""
        categories_summary = []
        
        for category in customize_content.get("cookie_categories", []):
            categories_summary.append({
                "Category": category["name"],
                "Required": category["is_required"],
                "Default On": category["default_state"],
                "Has Toggle": category["has_toggle"]
            })
        
        # Add this information to the current state's record
        if categories_summary:
            # Find the record with parent_state_id and update it
            for record in self.data:
                if record["StateID"] == parent_state_id:
                    record["Cookie Categories"] = str(categories_summary)
                    break
    
    def _capture_policy_page(self, link, parent_state_id, original_url):
        """Open a policy link, record the page, then return to ``original_url``.

        The link is resolved with ``_resolve_policy_url``, loaded in the
        driver, screenshotted and passed through ``extract_policy_content``.
        The extracted content is flagged with ``"is_policy_page": True`` and
        recorded as a child of ``parent_state_id``. Errors during capture are
        logged and not re-raised. The original page is reloaded afterwards in
        all cases.

        Args:
            link: Mapping describing the link; ``link["text"]`` is used for
                the recorded action label and the whole mapping is passed to
                ``_resolve_policy_url``.
            parent_state_id: State ID under which the policy page is recorded.
            original_url: URL reloaded after the capture.

        Returns:
            None. Results are stored through ``self.record_state``.
        """
        new_state_id = str(uuid.uuid4())[:8]

        try:
            # Get the actual policy URL
            policy_url = self._resolve_policy_url(link, original_url)
            logger.info(f"Navigating to policy: {policy_url}")

            self.driver.get(policy_url)
            time.sleep(WAIT_TIME)

            screenshot_path = self.take_screenshot(new_state_id)

            policy_content = self.extract_policy_content()
            policy_content["is_policy_page"] = True

            self.record_state(parent_state_id, new_state_id, policy_content,
                              screenshot_path, f"Policy Link: {link['text']}", 1)

        except Exception as e:
            logger.error(f"Error capturing policy page: {e}")
        finally:
            # Return to the original page. A failure here is logged instead of
            # being hidden, because later captures start from this page.
            try:
                self.driver.get(original_url)
                time.sleep(WAIT_TIME)
                self.wait_for_consent_banner(timeout=3)
            except Exception as e:
                logger.warning(f"Could not return to {original_url}: {e}")
    
    def _resolve_policy_url(self, link, base_url):
        """Resolve the actual policy URL from iframe or relative links"""
        link_text = link["text"].lower()
        href = link["href"]
        
        # Common privacy policy URL patterns
        policy_paths = {
            "privacy policy": ["/privacy", "/privacy-policy", "/legal/privacy", "/help/privacy-policy"],
            "cookie policy": ["/cookies", "/cookie-policy", "/legal/cookies", "/info/cookies"],
            "terms": ["/terms", "/tos", "/terms-of-service", "/legal/terms"],
            "data policy": ["/data-policy", "/data-protection"]
        }
        
        # Check if this is an iframe URL
        if "iframe" in href or "#" in href:
            logger.info(f"Detected iframe/anchor URL: {href}")
            
            # Extract base domain
            from urllib.parse import urlparse
            parsed = urlparse(base_url)
            base_domain = f"{parsed.scheme}://{parsed.netloc}"
            
            # Try to find the right policy path
            for key, paths in policy_paths.items():
                if key in link_text:
                    for path in paths:
                        # Try the path
                        test_url = base_domain + path
                        logger.info(f"Trying policy URL: {test_url}")
                        
                        # Quick check if URL exists
                        try:
                            import requests
                            response = requests.head(test_url, allow_redirects=True, timeout=5)
                            if response.status_code < 400:
                                logger.info(f"Found valid policy URL: {test_url}")
                                return test_url
                        except:
                            continue
            
            # If no valid URL found, try extracting from iframe URL parameters
            if "privacy" in link_text:
                # For Dropbox-like patterns
                if "dropbox.com" in base_url:
                    return "https://www.dropbox.com/privacy"
            elif "cookie" in link_text:
                if "dropbox.com" in base_url:
                    return "https://www.dropbox.com/cookie_policy"
        
        # If it's already a full URL, return as is
        if href.startswith("http"):
            return href
        
        # If it's a relative URL, make it absolute
        if href.startswith("/"):
            parsed = urlparse(base_url)
            return f"{parsed.scheme}://{parsed.netloc}{href}"
        
        # Default: return original href
        return href
    
    def _capture_action_result(self, button, parent_state_id, original_url):
        """Click an accept/decline button in a fresh session and record the outcome.

        Cookies are cleared and ``original_url`` is reloaded so the banner is
        shown again. The button is then located by case-insensitive text
        match, first among the buttons returned by ``find_consent_elements``
        and then inside each consent iframe. After a successful click a
        screenshot is taken and a state is recorded, noting whether any banner
        or consent iframe is still detected. Cookies are cleared again before
        returning.

        Args:
            button: Mapping describing the button; only ``button["text"]`` is read.
            parent_state_id: State ID under which the result is recorded.
            original_url: URL that is reloaded before clicking.

        Returns:
            None. Results are stored through ``self.record_state``.
        """
        new_state_id = str(uuid.uuid4())[:8]
        # Assigned before the try block so the finally clause can always read
        # it, including on the early return when the banner does not reappear.
        iframe_switched = False

        try:
            # For accept/decline, we need a fresh session to see the result
            self.driver.delete_all_cookies()
            self.driver.get(original_url)
            time.sleep(WAIT_TIME)

            # Wait for banner again
            if not self.wait_for_consent_banner(timeout=5):
                logger.warning(f"Banner not found after reload for {button['text']}")
                return

            consent_elements = self.find_consent_elements()
            wanted_text = button["text"].lower()
            clicked = False

            # First try to find button in main content
            for elem in consent_elements["buttons"]:
                try:
                    if elem.text.strip().lower() == wanted_text:
                        if self._click_element_safely(elem):
                            clicked = True
                            logger.info(f"Clicked {button['text']} in main content")
                            break
                except Exception:
                    continue

            # If not found in main content, check iframes
            if not clicked and consent_elements["iframes"]:
                for iframe in consent_elements["iframes"]:
                    try:
                        self.driver.switch_to.frame(iframe)
                        iframe_switched = True
                        logger.info(f"Switched to iframe to find {button['text']}")

                        iframe_buttons = self.driver.find_elements(By.CSS_SELECTOR, "button, [role='button']")
                        for elem in iframe_buttons:
                            if elem.text.strip().lower() == wanted_text:
                                if self._click_element_safely(elem):
                                    clicked = True
                                    logger.info(f"Successfully clicked {button['text']} in iframe")
                                    break

                        if clicked:
                            break
                        self.driver.switch_to.default_content()
                        iframe_switched = False
                    except Exception as e:
                        logger.error(f"Error in iframe: {e}")
                        if iframe_switched:
                            try:
                                self.driver.switch_to.default_content()
                            except Exception as switch_error:
                                logger.debug(f"Could not switch to default content: {switch_error}")
                            iframe_switched = False

            if not clicked:
                logger.error(f"Could not find or click button: {button['text']}")
                return

            time.sleep(WAIT_TIME)

            # Switch back to default content if we were in iframe
            if iframe_switched:
                try:
                    self.driver.switch_to.default_content()
                except Exception as switch_error:
                    logger.debug(f"Could not switch to default content: {switch_error}")

            # Take screenshot of result
            screenshot_path = self.take_screenshot(new_state_id)

            result_content = {
                "banner_text": f"Result after clicking {button['text']}",
                "buttons": [],
                "links": [],
                "banner_closed": True
            }

            # Check if banner is still visible
            try:
                # Quick check without full wait
                time.sleep(1)
                consent_elements_after = self.find_consent_elements()
                if consent_elements_after["banners"] or consent_elements_after["iframes"]:
                    result_content["banner_closed"] = False
                    result_content["banner_text"] = "Banner still visible after action"
                else:
                    result_content["banner_text"] = f"Banner closed after clicking {button['text']}"
            except Exception as check_error:
                # Keep the default "banner_closed": True result when the
                # re-check itself fails.
                logger.debug(f"Could not re-check banner state: {check_error}")

            self.record_state(parent_state_id, new_state_id, result_content,
                              screenshot_path, button["text"], 1)

        except Exception as e:
            logger.error(f"Error capturing action result: {e}")
        finally:
            # Make sure we're back in default content
            if iframe_switched:
                try:
                    self.driver.switch_to.default_content()
                except Exception as switch_error:
                    logger.debug(f"Could not switch to default content: {switch_error}")

            # Clear cookies for next iteration
            try:
                self.driver.delete_all_cookies()
            except Exception as e:
                logger.warning(f"Could not clear cookies: {e}")
    
    def save_results(self, filename=DATA_FILE):
        """Write the collected state records to an Excel file.

        Args:
            filename: Destination path; defaults to ``DATA_FILE``.

        When ``self.data`` is empty nothing is written and a warning is
        logged instead.
        """
        if not self.data:
            logger.warning("No data to save")
            return

        records_frame = pd.DataFrame(self.data)
        records_frame.to_excel(filename, index=False)
        logger.info(f"Results saved to {filename}")
    
    def close(self):
        """Quit the WebDriver session held by this scraper."""
        browser = self.driver
        browser.quit()

INFO:WDM:Get LATEST chromedriver version for google-chrome
INFO:WDM:Get LATEST chromedriver version for google-chrome
INFO:WDM:Driver [/Users/aishanipatil/.wdm/drivers/chromedriver/mac64/137.0.7151.68/chromedriver-mac-arm64/chromedriver] found in cache
INFO:__main__:Scraping https://www.dropbox.com
INFO:__main__:Waiting up to 10s for consent banner to appear...
INFO:__main__:Found potential consent iframe: {'id': 'ccpa-iframe', 'class': '_ccpa-iframe_163y6_1 _banner-visible_163y6_13', 'src': 'https://www.dropbox.com/ccpa_iframe?hide_gdpr=false&should_disable_banner=false&gpc=false&origin=https%253A%252F%252Fwww.dropbox.com&uri_for_logging=dropbox.com%2F&should_show_floating_button=false&should_auto_open_options=undefined&width=1512&locale_override=en&default_non_ccpa=true', 'name': ''}
INFO:__main__:Capturing policy link: Privacy Policy
INFO:__main__:Detected iframe/anchor URL: https://www.dropbox.com/en/ccpa_iframe?default_non_ccpa=true&gpc=false&hide_gdpr=false&locale_override=en&ori


Scraping complete! Total states captured: 6

States captured per website:
  https://www.dropbox.com: 6 states


In [None]:
# Example usage: scrape each configured website, save the captured states to
# Excel and print a per-website summary.
if __name__ == "__main__":
    from collections import Counter

    # List of websites to scrape
    websites = [
        "https://www.dropbox.com",
        # "https://www.theguardian.com",
        # Add more websites as needed
    ]

    # Initialize scraper
    scraper = ImprovedPrivacyScraper(headless=False)
    try:
        for website in websites:
            try:
                # Clear cookies before each website
                scraper.driver.delete_all_cookies()

                # Use interactive=True for manual exploration, False for automated
                scraper.scrape_website(website, interactive=False)

                # Add a delay between websites
                time.sleep(2)

            except Exception as e:
                # One failing site must not stop the remaining ones.
                logger.error(f"Failed to scrape {website}: {e}")

        # Save results
        scraper.save_results()

        # Display summary
        print(f"\nScraping complete! Total states captured: {len(scraper.data)}")

        # Group by website
        website_summary = Counter(record["Website"] for record in scraper.data)

        print("\nStates captured per website:")
        for website, count in website_summary.items():
            print(f"  {website}: {count} states")
    finally:
        # Always release the browser, even when saving or summarising fails.
        scraper.close()