## Best code

In [None]:
"""
ENHANCED: Smooth Navigation with Banner State Preservation
This version implements smart cookie management and banner re-triggering for seamless navigation
"""

import os
import time
import uuid
import pandas as pd
from datetime import datetime
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, StaleElementReferenceException, NoSuchElementException
from webdriver_manager.chrome import ChromeDriverManager
from urllib.parse import urlparse
import json
import random
import re
import requests

# Configuration
# Directory that take_screenshot() writes PNG files into.
SCREENSHOTS_DIR = "screenshots"
# NOTE(review): presumably the Excel export target for the collected rows;
# it is not referenced in the part of the file visible here - confirm.
DATA_FILE = "privacy_notice_data.xlsx"
# NOTE(review): explore_interactively() declares its own max_depth=3 default;
# verify whether this constant is actually used by callers.
MAX_DEPTH = 3
# Seconds slept after direct navigations and after a successful button click.
WAIT_TIME = 5

# Runs at import time; exist_ok=True makes it a no-op when the directory exists.
os.makedirs(SCREENSHOTS_DIR, exist_ok=True)

class EnhancedPrivacyScraper:
    def __init__(self, headless=False):
        self.driver = self.initialize_driver_with_stealth(headless)
        self.data = []
        self.current_iframe = None  # Track if we're in an iframe
        self.original_url = None  # Track the original page URL
        self.banner_state_preserved = False
        
    def initialize_driver_with_stealth(self, headless=False):
        """Build and return a Chrome WebDriver with the scraper's browser options.

        Args:
            headless: When true, ``--headless`` is added to the Chrome arguments.

        Returns:
            The started ``webdriver.Chrome`` instance, after a script that
            redefines ``navigator.webdriver`` has been executed on it.
        """
        options = Options()

        # One user agent is picked at random for each driver instance.
        user_agent = random.choice([
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36",
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:122.0) Gecko/20100101 Firefox/122.0"
        ])

        # Command-line switches, in the order they are handed to Chrome.
        switches = ["--headless"] if headless else []
        switches += [
            "--disable-blink-features=AutomationControlled",
            "--disable-gpu",
            "--window-size=1920,1080",
            "--no-sandbox",
            "--disable-dev-shm-usage",
            f"user-agent={user_agent}",
        ]
        for switch in switches:
            options.add_argument(switch)

        # Experimental options live in their own mapping, independent of the
        # argument list above.
        options.add_experimental_option("excludeSwitches", ["enable-automation"])
        options.add_experimental_option('useAutomationExtension', False)

        driver = webdriver.Chrome(
            service=Service(ChromeDriverManager().install()),
            options=options,
        )

        # Redefine navigator.webdriver in the document that is currently loaded.
        driver.execute_script("Object.defineProperty(navigator, 'webdriver', {get: () => undefined})")

        print(f"Driver initialized with user agent: {user_agent[:50]}...")
        return driver
    
    def wait_for_consent_banner(self, timeout=10):
        """Poll the current page for a visible consent banner for up to ``timeout`` seconds.

        Every known CMP indicator is checked on each polling pass, so the total
        wait is bounded by ``timeout`` rather than growing with the number of
        indicators. If no indicator becomes visible in time, the page's iframes
        are inspected once for consent-related attributes.

        Args:
            timeout: Maximum number of seconds to keep polling for a banner
                element. ``None`` or a non-positive value gives a single pass.

        Returns:
            True if a displayed banner element or a consent-looking iframe was
            found, otherwise False. ``self.banner_state_preserved`` is set to
            the same value.
        """
        print(f"Waiting up to {timeout}s for consent banner to appear...")

        # Strategy 1: Common CMP indicators
        cmp_indicators = [
            ("id", "onetrust-banner-sdk"),
            ("id", "onetrust-consent-sdk"),
            ("class", "ot-sdk-container"),
            ("id", "CybotCookiebotDialog"),
            ("class", "CybotCookiebotDialog"),
            ("id", "truste-consent-track"),
            ("class", "truste-banner"),
            ("class", "qc-cmp2-container"),
            ("class", "qc-cmp-ui-container"),
            ("id", "didomi-popup"),
            ("class", "didomi-popup-container"),
            ("id", "usercentrics-root"),
            ("data-testid", "uc-container"),
            ("class", "cookie-consent"),
            ("class", "cookie-banner"),
            ("class", "consent-banner"),
            ("class", "privacy-banner"),
            ("class", "gdpr-banner"),
            ("role", "dialog"),
        ]

        # Maps an indicator type to the Selenium locator used to look it up.
        locator_builders = {
            "id": lambda value: (By.ID, value),
            "class": lambda value: (By.CLASS_NAME, value),
            "data-testid": lambda value: (By.CSS_SELECTOR, f"[data-testid='{value}']"),
            "role": lambda value: (By.CSS_SELECTOR, f"[role='{value}']"),
        }

        poll_interval = 0.5
        deadline = time.monotonic() + max(timeout or 0, 0)

        while True:
            for attr_type, attr_value in cmp_indicators:
                locator = locator_builders[attr_type](attr_value)
                try:
                    candidates = self.driver.find_elements(*locator)
                    visible = any(candidate.is_displayed() for candidate in candidates)
                except StaleElementReferenceException:
                    # The DOM changed between lookup and visibility check;
                    # the next polling pass will see the fresh element.
                    continue

                if visible:
                    print(f"Found consent banner via {attr_type}='{attr_value}'")
                    self.banner_state_preserved = True
                    return True

            remaining = deadline - time.monotonic()
            if remaining <= 0:
                break
            time.sleep(min(poll_interval, remaining))

        # Strategy 2: Check for iframes
        try:
            for iframe in self.driver.find_elements(By.TAG_NAME, "iframe"):
                iframe_attrs = self._get_iframe_info(iframe)
                if self._is_consent_iframe(iframe_attrs):
                    print(f"Found potential consent iframe: {iframe_attrs}")
                    self.banner_state_preserved = True
                    return True
        except Exception as e:
            print(f"Error checking iframes: {e}")

        self.banner_state_preserved = False
        return False
    
    def force_banner_reappearance(self):
        """Force cookie banner to reappear by manipulating cookies/localStorage"""
        try:
            print("🔄 Attempting to restore banner state...")
            
            # Clear consent cookies specifically
            consent_cookie_patterns = ['OptanonConsent', 'euconsent', 'gdpr', 'cookie_consent', 'consent']
            
            cookies_cleared = 0
            for cookie in self.driver.get_cookies():
                cookie_name = cookie['name'].lower()
                if any(pattern.lower() in cookie_name for pattern in consent_cookie_patterns):
                    self.driver.delete_cookie(cookie['name'])
                    cookies_cleared += 1
                    print(f"🍪 Deleted consent cookie: {cookie['name']}")
            
            # Clear localStorage consent data
            self.driver.execute_script("""
                // Clear common consent localStorage keys
                const consentKeys = Object.keys(localStorage).filter(key => 
                    key.toLowerCase().includes('consent') || 
                    key.toLowerCase().includes('gdpr') ||
                    key.toLowerCase().includes('cookie')
                );
                consentKeys.forEach(key => localStorage.removeItem(key));
                console.log('Cleared localStorage consent keys:', consentKeys);
            """)
            
            # Only refresh if we cleared something
            if cookies_cleared > 0:
                print(f"🔄 Refreshing page after clearing {cookies_cleared} consent cookies...")
                self.driver.refresh()
                time.sleep(3)
                return True
            else:
                print("ℹ️ No consent cookies found to clear")
                return False
                
        except Exception as e:
            print(f"⚠️ Error forcing banner reappearance: {e}")
            return False
    
    def return_to_original_page_with_banner(self):
        """Smart return to original page with banner restoration"""
        if not self.original_url:
            print("⚠️ No original URL stored")
            return False
            
        try:
            print(f"🏠 Returning to original page: {self.original_url}")
            
            # First try browser back navigation (preserves more state)
            current_url = self.driver.current_url
            if current_url != self.original_url:
                try:
                    # Try using browser back if we're just one page away
                    self.driver.back()
                    time.sleep(2)
                    
                    # Check if we're back to the right page
                    if self.original_url not in self.driver.current_url:
                        # Back didn't work, use direct navigation
                        print("   🔄 Browser back didn't work, using direct navigation...")
                        self.driver.get(self.original_url)
                        time.sleep(WAIT_TIME)
                except:
                    # Fallback to direct navigation
                    self.driver.get(self.original_url)
                    time.sleep(WAIT_TIME)
            
            # Check if banner is visible
            if self.wait_for_consent_banner(timeout=3):
                print("✅ Banner restored successfully")
                return True
            else:
                # Try to force banner reappearance
                print("🔧 Banner not visible, attempting to restore...")
                if self.force_banner_reappearance():
                    return self.wait_for_consent_banner(timeout=5)
                return False
                
        except Exception as e:
            print(f"❌ Error returning to original page: {e}")
            return False
    
    def find_consent_elements(self):
        """Locate consent banners and consent iframes on the current page.

        Banner candidates are collected through a list of CSS selectors and
        kept only when they are displayed, taller than 50px and contain a
        consent-related keyword. Because the selectors overlap, an element
        matched by several of them is recorded once, under the first selector
        that matched.

        Returns:
            A dict with the keys ``banners``, ``iframes``, ``buttons``,
            ``links`` and ``toggles``. Only ``banners`` and ``iframes`` are
            populated here; the others stay empty lists.
        """
        consent_elements = {
            "banners": [],
            "iframes": [],
            "buttons": [],
            "links": [],
            "toggles": []
        }
        banners = consent_elements["banners"]

        # Reset iframe tracking
        self.current_iframe = None

        # Enhanced selectors for consent banners
        banner_selectors = [
            "[id*='cookie'][id*='banner']",
            "[id*='cookie'][id*='consent']",
            "[id*='privacy'][id*='banner']",
            "[id*='gdpr']",
            "[id*='consent']",
            "#onetrust-banner-sdk",
            "#CybotCookiebotDialog",
            "#didomi-popup",
            "#usercentrics-root",
            "[class*='cookie-banner']",
            "[class*='consent-banner']",
            "[class*='privacy-banner']",
            "[class*='gdpr-banner']",
            ".qc-cmp2-container",
            ".ot-sdk-container",
            "[role='dialog'][aria-label*='cookie']",
            "[role='dialog'][aria-label*='consent']",
            "[role='dialog'][aria-label*='privacy']",
            "[data-testid*='cookie']",
            "[data-testid*='consent']",
            "[data-component*='cookie']",
            "[data-component*='consent']"
        ]
        banner_keywords = ('cookie', 'privacy', 'consent', 'data', 'accept')

        # Find banner elements
        for selector in banner_selectors:
            try:
                for element in self.driver.find_elements(By.CSS_SELECTOR, selector):
                    # Skip elements already recorded via an earlier selector.
                    if element in banners:
                        continue
                    if element.is_displayed() and element.size['height'] > 50:
                        text = element.text.lower()
                        if any(keyword in text for keyword in banner_keywords):
                            banners.append(element)
                            print(f"Found banner via selector: {selector}")
            except Exception:
                # Best effort: a failing selector or a stale element must not
                # prevent the remaining selectors from being tried.
                continue

        # Find iframes and store the active one
        try:
            for iframe in self.driver.find_elements(By.TAG_NAME, "iframe"):
                iframe_info = self._get_iframe_info(iframe)
                if self._is_consent_iframe(iframe_info):
                    consent_elements["iframes"].append(iframe)
                    # Store the iframe for later use.
                    # NOTE(review): this is set even when a banner was found in
                    # the main document; confirm that the click helper is not
                    # misled into switching to this iframe in that case.
                    self.current_iframe = iframe
        except Exception:
            pass

        return consent_elements
    
    def _get_iframe_info(self, iframe):
        """Get iframe attributes"""
        return {
            "id": iframe.get_attribute("id") or "",
            "class": iframe.get_attribute("class") or "",
            "src": iframe.get_attribute("src") or "",
            "name": iframe.get_attribute("name") or ""
        }
    
    def _is_consent_iframe(self, iframe_info):
        """Check if iframe likely contains consent content"""
        consent_keywords = ['consent', 'cookie', 'privacy', 'gdpr', 'ccpa', 'cmp', 'notice']
        iframe_str = str(iframe_info).lower()
        return any(keyword in iframe_str for keyword in consent_keywords)
    
    def extract_notice_details(self, element):
        """Extract text, buttons and links from a notice element"""
        details = {
            "text": element.text.strip(),
            "buttons": [],
            "links": [],
            "is_in_iframe": False
        }
        
        # Extract buttons
        try:
            buttons = element.find_elements(By.TAG_NAME, "button")
            for button in buttons:
                if button.is_displayed():
                    button_text = button.text.strip()
                    if button_text:
                        details["buttons"].append({
                            "text": button_text,
                            "element": button
                        })
        except Exception as e:
            print(f"Error extracting buttons: {e}")
        
        # Also look for span/div buttons
        try:
            spans_divs = element.find_elements(By.CSS_SELECTOR, "span[role='button'], div[role='button']")
            for item in spans_divs:
                if item.is_displayed():
                    item_text = item.text.strip()
                    if item_text:
                        details["buttons"].append({
                            "text": item_text,
                            "element": item
                        })
        except Exception as e:
            print(f"Error extracting span/div buttons: {e}")
        
        # Extract links
        try:
            links = element.find_elements(By.TAG_NAME, "a")
            for link in links:
                if link.is_displayed():
                    link_text = link.text.strip()
                    link_href = link.get_attribute("href")
                    if link_text and link_href:
                        # Resolve iframe URLs to actual policy URLs
                        resolved_href = self._resolve_policy_url(link_href, link_text)
                        details["links"].append({
                            "text": link_text,
                            "href": resolved_href,
                            "original_href": link_href,
                            "element": link
                        })
        except Exception as e:
            print(f"Error extracting links: {e}")
        
        return details
    
    def _resolve_policy_url(self, href, link_text):
        """ENHANCED URL resolution with website base detection"""
        # If it's already a normal URL (not iframe), return as is
        if href.startswith("http") and "iframe" not in href and "#" not in href:
            return href
        
        # Get base domain from current page or original URL
        base_url = self.original_url or self.driver.current_url
        parsed = urlparse(base_url)
        base_domain = f"{parsed.scheme}://{parsed.netloc}"
        
        # Enhanced resolution logic from second code
        link_lower = link_text.lower()
        
        # Common policy URL patterns for any website
        if "privacy policy faq" in link_lower or ("faq" in link_lower and "privacy" in link_lower):
            # Try common FAQ URL patterns
            common_faq_patterns = [
                "/privacy-faq",
                "/help/privacy-faq", 
                "/privacy/faq",
                "/help/security/privacy-policy-faq",
                "/legal/privacy-faq",
                "/privacy#faq"
            ]
            
            # Test which URL exists
            for pattern in common_faq_patterns:
                test_url = base_domain + pattern
                if self._test_url_exists(test_url):
                    return test_url
            
            # Fallback to first pattern
            return f"{base_domain}{common_faq_patterns[0]}"
            
        elif "privacy policy" in link_lower or "privacy" in link_lower:
            return f"{base_domain}/privacy"
            
        elif "cookie policy" in link_lower or "cookie" in link_lower:
            return f"{base_domain}/cookie-policy"
            
        elif "terms" in link_lower:
            return f"{base_domain}/terms"
            
        elif "legal" in link_lower:
            return f"{base_domain}/legal"
        else:
            # Generic fallback
            return f"{base_domain}/privacy"
    
    def _test_url_exists(self, url):
        """Return True when a HEAD request to ``url`` answers with a status below 400.

        Redirects are followed and the request times out after 3 seconds. Any
        ``requests`` failure (connection error, timeout, invalid URL, ...) is
        treated as "does not exist". Only ``requests.RequestException`` is
        caught, so Ctrl+C and programming errors are no longer swallowed.
        """
        try:
            response = requests.head(
                url,
                allow_redirects=True,
                timeout=3,
                headers={'User-Agent': 'Mozilla/5.0'},
            )
        except requests.RequestException:
            return False
        return response.status_code < 400
    
    def switch_to_iframe_and_extract(self, iframe):
        """Switch to iframe and extract content with proper context tracking"""
        try:
            print(f"🔄 Switching to iframe for content extraction...")
            self.driver.switch_to.frame(iframe)
            self.current_iframe = iframe  # Track that we're in iframe
            
            # Check for notice elements in iframe
            try:
                body = self.driver.find_element(By.TAG_NAME, "body")
                notice_details = self.extract_notice_details(body)
                notice_details["is_in_iframe"] = True
                
                print(f"📦 Extracted iframe content: {len(notice_details['buttons'])} buttons, {len(notice_details['links'])} links")
                
            except:
                notice_details = {"text": "", "buttons": [], "links": [], "is_in_iframe": True}
            
            # DON'T switch back to default content yet - stay in iframe for interactions
            return notice_details
        except Exception as e:
            print(f"❌ Error processing iframe: {e}")
            self.driver.switch_to.default_content()
            self.current_iframe = None
            return {"text": "", "buttons": [], "links": [], "is_in_iframe": False}
    
    def _click_element_safely_with_iframe_context(self, element):
        """Enhanced element clicking with iframe context awareness"""
        try:
            print(f"🎯 Attempting to click element (iframe context: {self.current_iframe is not None})")
            
            # If we have an iframe and we're not in it, switch to it
            if self.current_iframe and not self._is_in_iframe():
                print(f"🔄 Switching to iframe for button click...")
                self.driver.switch_to.frame(self.current_iframe)
                time.sleep(1)
                
                # Re-find the button in iframe context
                button_text = element.text if hasattr(element, 'text') else "Unknown"
                buttons = self.driver.find_elements(By.CSS_SELECTOR, "button, [role='button']")
                
                target_button = None
                for btn in buttons:
                    if btn.is_displayed() and btn.text.strip() == button_text:
                        target_button = btn
                        break
                
                if target_button:
                    element = target_button
                    print(f"✅ Found button in iframe: {button_text}")
                else:
                    print(f"❌ Button not found in iframe: {button_text}")
                    return False
            
            # Try multiple click strategies
            strategies = [
                ("Direct click", lambda: element.click()),
                ("JavaScript click", lambda: self.driver.execute_script("arguments[0].click();", element)),
                ("Scroll and click", self._scroll_and_click),
                ("Actions click", self._actions_click)
            ]
            
            for strategy_name, strategy_func in strategies:
                try:
                    print(f"   🚀 Trying: {strategy_name}")
                    if strategy_name in ["Scroll and click", "Actions click"]:
                        strategy_func(element)
                    else:
                        strategy_func()
                    print(f"   ✅ Success with: {strategy_name}")
                    return True
                except Exception as e:
                    print(f"   ❌ Failed {strategy_name}: {e}")
                    continue
            
            return False
            
        except Exception as e:
            print(f"❌ All click strategies failed: {e}")
            return False
    
    def _is_in_iframe(self):
        """Check if we're currently in an iframe"""
        try:
            # Try to access the main window - if we're in iframe, this will be different
            main_window = self.driver.execute_script("return window.top === window;")
            return not main_window
        except:
            return False
    
    def _scroll_and_click(self, element):
        """Scroll to element and click"""
        self.driver.execute_script("arguments[0].scrollIntoView({block: 'center'});", element)
        time.sleep(0.5)
        element.click()
    
    def _actions_click(self, element):
        """Click ``element`` through Selenium's ActionChains: move to it, then click."""
        from selenium.webdriver.common.action_chains import ActionChains

        chain = ActionChains(self.driver)
        chain.move_to_element(element)
        chain.click()
        chain.perform()
    
    def display_options(self, notice_details):
        """Print the notice with its numbered buttons and links and read a command.

        Args:
            notice_details: Dict with ``text``, ``buttons`` and ``links`` as
                produced by ``extract_notice_details``.

        Returns:
            The raw string the user typed at the prompt.
        """
        print(f"\n--- Notice Text ---\n{notice_details['text']}\n")

        print("--- Available Buttons ---")
        for number, button in enumerate(notice_details['buttons'], start=1):
            print(f"{number}. {button['text']}")

        print("\n--- Available Links ---")
        for number, link in enumerate(notice_details['links'], start=1):
            print(f"{number}. {link['text']} => {link['href']}")

        # Command reference, one line per supported action
        action_help = (
            "\n--- Actions ---",
            "To click a button, enter: B<number> (e.g., B1 for the first button)",
            "To click a link, enter: L<number> (e.g., L1 for the first link)",
            "To take a manual screenshot, enter: S",
            "To find and expand dropdown sections, enter: E",
            "To go back, enter: BACK",
            "To finish exploration, enter: DONE",
        )
        for line in action_help:
            print(line)

        return input("Enter your choice: ")
    
    def take_screenshot(self, state_id=None, custom_name=None):
        """Save a screenshot of the current page into ``SCREENSHOTS_DIR``.

        The file name is built from the current website's host, then either
        ``custom_name`` or ``state_<state_id>`` (``custom_name`` wins when both
        are given), then a timestamp. Each component is reduced to letters,
        digits, ``_``, ``.`` and ``-`` and capped at 100 characters, because
        callers derive ``custom_name`` from button labels, which may contain
        slashes, colons, newlines or very long text that would produce an
        invalid path. Components that were already safe are unchanged.

        Args:
            state_id: Optional state identifier used when no custom name is given.
            custom_name: Optional label for the screenshot.

        Returns:
            The path the screenshot was requested to be written to.
        """
        if hasattr(self, 'current_website'):
            domain = urlparse(self.current_website).netloc.replace('.', '_')
        else:
            domain = "unknown"
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

        if custom_name:
            parts = [domain, custom_name, timestamp]
        elif state_id:
            parts = [domain, "state", state_id, timestamp]
        else:
            parts = [domain, timestamp]

        # Replace every run of characters that is unsafe in a file name.
        safe_parts = [re.sub(r"[^\w.-]+", "_", str(part))[:100] for part in parts]
        filename = "_".join(safe_parts) + ".png"

        filepath = os.path.join(SCREENSHOTS_DIR, filename)
        self.driver.save_screenshot(filepath)
        return filepath
    
    def record_state(self, website, parent_id, state_id, notice_details, screenshot_path, choice=None):
        """Record state"""
        # Format links for Excel
        links_data = {}
        for i, link in enumerate(notice_details['links'][:5], 1):
            links_data[f"Link {i}"] = link['text']
            links_data[f"Link {i} Detail"] = link['href']
        
        # Ensure all link columns exist
        for i in range(len(notice_details['links']) + 1, 6):
            links_data[f"Link {i}"] = None
            links_data[f"Link {i} Detail"] = None
        
        row = {
            "Website": website,
            "ParentID": parent_id,
            "StateID": state_id,
            "Snapshot": screenshot_path,
            "Text of Notice": notice_details['text'],
            "Choice Provided": notice_details['buttons'][0]['text'] if notice_details['buttons'] else "No choices detected",
            "ChoiceMade": choice,
            "In Iframe": notice_details.get('is_in_iframe', False),
            "Banner State Preserved": self.banner_state_preserved,
            **links_data
        }
        
        self.data.append(row)
        return self.data
    
    def explore_interactively(self, website, max_depth=3):
        """Main interactive exploration method with enhanced navigation"""
        self.current_website = website
        self.original_url = website  # Store original URL
        print(f"🌐 Visiting {website}...")
        
        try:
            # Navigate to website (NO INITIAL COOKIE CLEARING)
            self.driver.get(website)
            time.sleep(3)
            
            # Check for bot detection
            print("🤖 Checking for bot detection...")
            page_title = self.driver.title.lower()
            page_text = self.driver.find_element(By.TAG_NAME, "body").text.lower()
            
            bot_indicators = ["security check", "captcha", "bot check", "human verification", "please verify"]
            if any(indicator in page_title or indicator in page_text for indicator in bot_indicators):
                print("⚠️  BOT DETECTION DETECTED!")
                input("Please solve the captcha/verification in the browser window, then press Enter to continue...")
                print("✅ Continuing with scraping...")
                time.sleep(2)
            
            # Generate root state
            root_state_id = str(uuid.uuid4())[:8]
            screenshot_path = self.take_screenshot(root_state_id, "initial")
            print(f"📸 Initial screenshot: {screenshot_path}")
            
            # Wait for banner
            banner_found = self.wait_for_consent_banner()
            
            # Find notice elements
            consent_elements = self.find_consent_elements()
            
            if consent_elements["banners"]:
                print(f"🎯 Found {len(consent_elements['banners'])} banner elements")
                notice_details = self.extract_notice_details(consent_elements["banners"][0])
            elif consent_elements["iframes"]:
                print(f"🎯 Found {len(consent_elements['iframes'])} iframe elements")
                notice_details = self.switch_to_iframe_and_extract(consent_elements["iframes"][0])
            else:
                print("❌ No consent banner found")
                notice_details = {"text": "No cookie notice detected", "buttons": [], "links": []}
            
            # Record initial state
            self.record_state(website, None, root_state_id, notice_details, screenshot_path)
            
            # Start interactive exploration
            current_state_id = root_state_id
            parent_id = None
            depth = 0
            state_stack = []
            
            while depth < max_depth:
                choice = self.display_options(notice_details)
                
                if choice.upper() == "DONE":
                    print("✅ Exploration complete!")
                    break
                
                elif choice.upper() == "S":
                    manual_screenshot = self.take_screenshot(custom_name=f"manual_{datetime.now().strftime('%H%M%S')}")
                    print(f"📸 Manual screenshot: {manual_screenshot}")
                    continue
                
                elif choice.upper() == "BACK":
                    print(f"⬅️ Going back to previous state...")
                    
                    # Use smart back navigation
                    new_state_id, new_parent_id, new_notice_details = self.smart_back_navigation(
                        state_stack, current_state_id, parent_id, depth
                    )
                    
                    if new_state_id is not None:
                        # Update state variables
                        current_state_id = new_state_id
                        parent_id = new_parent_id
                        notice_details = new_notice_details
                        depth -= 1 if depth > 0 else 0
                        
                        # Take screenshot of restored state
                        screenshot_path = self.take_screenshot(current_state_id, "back_navigation")
                        print(f"📸 Screenshot: {screenshot_path}")
                    else:
                        print("❌ Back navigation failed")
                
                elif choice.upper().startswith("B") and notice_details['buttons']:
                    # Handle button clicks with iframe awareness
                    try:
                        button_idx = int(choice[1:]) - 1
                        if 0 <= button_idx < len(notice_details['buttons']):
                            button = notice_details['buttons'][button_idx]
                            print(f"🖱️ Clicking button: {button['text']}")
                            
                            # Save current state for back navigation
                            state_stack.append((self.driver.current_url, current_state_id, parent_id, self.current_iframe))
                            
                            # Update state trackers
                            parent_id = current_state_id
                            current_state_id = str(uuid.uuid4())[:8]
                            
                            # Use iframe-aware clicking
                            if self._click_element_safely_with_iframe_context(button['element']):
                                print("✅ Button clicked successfully!")
                                time.sleep(WAIT_TIME)
                                
                                # Take screenshot
                                screenshot_path = self.take_screenshot(current_state_id, f"after_{button['text'].replace(' ', '_')}")
                                print(f"📸 Screenshot: {screenshot_path}")
                                
                                # Find new notice state (stay in iframe if that's where we are)
                                try:
                                    if self.current_iframe and self._is_in_iframe():
                                        # We're in iframe, extract content from here
                                        body = self.driver.find_element(By.TAG_NAME, "body")
                                        notice_details = self.extract_notice_details(body)
                                        notice_details["is_in_iframe"] = True
                                    else:
                                        # We're in main content
                                        new_consent_elements = self.find_consent_elements()
                                        
                                        if new_consent_elements["banners"]:
                                            notice_details = self.extract_notice_details(new_consent_elements["banners"][0])
                                        elif new_consent_elements["iframes"]:
                                            notice_details = self.switch_to_iframe_and_extract(new_consent_elements["iframes"][0])
                                        else:
                                            notice_details = {"text": "No notice detected after clicking", "buttons": [], "links": []}
                                except Exception as e:
                                    print(f"⚠️ Error extracting new state: {e}")
                                    notice_details = {"text": "Error extracting new state", "buttons": [], "links": []}
                                
                                # Record new state
                                self.record_state(website, parent_id, current_state_id, notice_details, 
                                                screenshot_path, button['text'])
                                depth += 1
                                
                            else:
                                print("❌ Failed to click button")
                                # Restore state trackers since click failed
                                current_state_id = parent_id
                                parent_id = state_stack[-1][2] if state_stack else None
                                state_stack.pop() if state_stack else None
                        else:
                            print(f"❌ Invalid button index: {button_idx}")
                    except ValueError:
                        print(f"❌ Invalid button choice: {choice}")
                
                elif choice.upper().startswith("L") and notice_details['links']:
                    # Handle link clicks with smart navigation
                    try:
                        link_idx = int(choice[1:]) - 1
                        if 0 <= link_idx < len(notice_details['links']):
                            link = notice_details['links'][link_idx]
                            print(f"🔗 Clicking link: {link['text']} ({link['href']})")
                            
                            # Save current state for back navigation
                            state_stack.append((self.driver.current_url, current_state_id, parent_id, self.current_iframe))
                            
                            # Update state trackers
                            parent_id = current_state_id
                            current_state_id = str(uuid.uuid4())[:8]
                            
                            try:
                                # Switch back to main content for navigation
                                if self.current_iframe:
                                    self.driver.switch_to.default_content()
                                    self.current_iframe = None
                                
                                # Navigate to resolved link
                                print(f"🌐 Navigating to: {link['href']}")
                                self.driver.get(link['href'])
                                time.sleep(WAIT_TIME)
                                
                                # Take screenshot
                                screenshot_path = self.take_screenshot(current_state_id, "policy_page")
                                print(f"📸 Screenshot: {screenshot_path}")
                                
                                # Extract policy content
                                policy_content = self.extract_policy_page_content()
                                
                                # Record new state
                                self.record_state(website, parent_id, current_state_id, policy_content,
                                                screenshot_path, f"Link: {link['text']}")
                                
                                # Update notice_details for next iteration
                                notice_details = policy_content
                                depth += 1
                                
                            except Exception as e:
                                print(f"❌ Error clicking link: {e}")
                        else:
                            print(f"❌ Invalid link index: {link_idx}")
                    except ValueError:
                        print(f"❌ Invalid link choice: {choice}")
                
                elif choice.upper() == "E":
                    # Enhanced expandable section detection
                    print("🔧 Enhanced expandable section detection...")
                    
                    # Find dialog container (prioritize iframe context if we're in one)
                    dialog = None
                    if self.current_iframe and self._is_in_iframe():
                        print("📦 Using iframe context for expansion detection")
                        try:
                            dialog = self.driver.find_element(By.TAG_NAME, "body")
                        except:
                            pass
                    
                    if not dialog:
                        print("📦 Searching for dialog container in main content")
                        dialog_selectors = [
                            "#onetrust-consent-sdk", ".cookie-notice", "#cookie-notice",
                            "div[role='dialog']", ".modal-dialog", ".consent-modal"
                        ]
                        
                        for selector in dialog_selectors:
                            try:
                                elements = self.driver.find_elements(By.CSS_SELECTOR, selector)
                                for el in elements:
                                    if el.is_displayed():
                                        dialog = el
                                        print(f"📦 Found dialog container: {selector}")
                                        break
                                if dialog:
                                    break
                            except:
                                continue
                    
                    if not dialog:
                        print("📦 Using body as container")
                        if self.current_iframe and self._is_in_iframe():
                            dialog = self.driver.find_element(By.TAG_NAME, "body")
                        else:
                            # Switch to iframe if we have one
                            if self.current_iframe:
                                self.driver.switch_to.frame(self.current_iframe)
                                dialog = self.driver.find_element(By.TAG_NAME, "body")
                                print("📦 Switched to iframe and using body")
                            else:
                                dialog = self.driver.find_element(By.TAG_NAME, "body")
                    
                    # Find expandable sections with enhanced detection
                    expandables = self.find_expandable_sections_enhanced(dialog)
                    
                    if expandables:
                        print(f"🎯 Found {len(expandables)} expandable sections:")
                        for i, exp in enumerate(expandables):
                            display_text = exp['text'][:50] + "..." if len(exp['text']) > 50 else exp['text']
                            print(f"   {i+1}. {display_text} ({exp['type']})")
                        
                        print("\n🔧 Options:")
                        print("   A - Expand ALL sections")
                        print("   1-9 - Expand specific section number")
                        print("   SKIP - Skip expansion and continue")
                        
                        exp_choice = input("Enter your choice: ").strip()
                        
                        if exp_choice.upper() == "A":
                            print("🚀 Expanding ALL sections...")
                            expanded_count = 0
                            for i, exp in enumerate(expandables):
                                print(f"\n--- Expanding {i+1}/{len(expandables)}: {exp['text'][:30]}... ---")
                                success = self.try_expand_element_enhanced(exp)
                                if success:
                                    expanded_count += 1
                                    screenshot = self.take_screenshot(custom_name=f"expanded_{i+1}_{exp['type']}")
                                    print(f"📸 Screenshot saved: {screenshot}")
                                else:
                                    print(f"⚠️ Could not expand section {i+1}")
                                time.sleep(1)  # Brief pause between expansions
                            
                            print(f"\n✅ Expansion complete! Successfully expanded {expanded_count}/{len(expandables)} sections")
                            
                            # Take a final screenshot showing all expanded sections
                            final_screenshot = self.take_screenshot(custom_name="all_expanded_final")
                            print(f"📸 Final screenshot with all expansions: {final_screenshot}")
                            
                        elif exp_choice.upper() == "SKIP":
                            print("⏭️ Skipping expansion...")
                            
                        else:
                            # Handle specific section selection
                            try:
                                idx = int(exp_choice) - 1
                                if 0 <= idx < len(expandables):
                                    exp = expandables[idx]
                                    print(f"\n🚀 Expanding selected section: {exp['text'][:50]}...")
                                    success = self.try_expand_element_enhanced(exp)
                                    if success:
                                        screenshot = self.take_screenshot(custom_name=f"expanded_section_{idx+1}")
                                        print(f"📸 Screenshot saved: {screenshot}")
                                    else:
                                        print(f"⚠️ Could not expand section: {exp['text'][:50]}")
                                else:
                                    print(f"❌ Invalid selection. Please enter 1-{len(expandables)}, A, or SKIP")
                            except ValueError:
                                print("❌ Invalid input. Please enter a number, A for all, or SKIP")
                    else:
                        print("❌ No expandable sections found")
                    
                    continue
                
                elif choice.upper() == "BACK":
                    if state_stack:
                        print(f"⬅️ Going back to previous state...")
                        
                        try:
                            # Switch back to main content first
                            if self.current_iframe:
                                self.driver.switch_to.default_content()
                                self.current_iframe = None
                            
                            # Smart return to original page with banner restoration
                            if self.return_to_original_page_with_banner():
                                print("✅ Successfully returned with banner state preserved")
                                
                                # Restore state trackers
                                prev_url, prev_state_id, prev_parent_id, prev_iframe = state_stack.pop()
                                current_state_id = prev_state_id
                                parent_id = prev_parent_id
                                self.current_iframe = prev_iframe
                                depth -= 1 if depth > 0 else 0
                                
                                # Rediscover notice elements
                                consent_elements = self.find_consent_elements()
                                
                                if consent_elements["banners"]:
                                    notice_details = self.extract_notice_details(consent_elements["banners"][0])
                                elif consent_elements["iframes"]:
                                    notice_details = self.switch_to_iframe_and_extract(consent_elements["iframes"][0])
                                else:
                                    notice_details = {"text": "No notice detected", "buttons": [], "links": []}
                                
                                screenshot_path = self.take_screenshot(current_state_id, "back")
                                print(f"📸 Screenshot: {screenshot_path}")
                                
                            else:
                                print("⚠️ Could not restore banner state, but navigation successful")
                                # Still pop the state stack
                                state_stack.pop()
                                
                        except Exception as e:
                            print(f"❌ Error going back: {e}")
                    else:
                        print("❌ Cannot go back further (at root state)")
                        
        except Exception as e:
            print(f"❌ Error during exploration: {e}")
            return self.data
        finally:
            # Ensure we're back to main content
            try:
                if self.current_iframe:
                    self.driver.switch_to.default_content()
            except:
                pass
            
        return self.data
    
    def find_expandable_sections_enhanced(self, container):
        """Collect candidate expandable consent sections inside ``container``.

        Two passes are made and their results concatenated:

        1. For each well-known cookie/privacy category phrase, look for a
           displayed element whose own text contains the phrase (compared
           case-insensitively) and record it, preferring a clickable ancestor
           as the element to interact with.
        2. For each common accordion/collapsible CSS selector, record every
           displayed match whose text mentions a consent-related keyword.

        Near-duplicates (``_text_similarity`` above 0.8) are then dropped,
        keeping the first occurrence.

        Args:
            container: object exposing ``find_elements(by, value)``, e.g. a
                WebElement, that bounds the search.

        Returns:
            list[dict]: entries with keys ``element``, ``text``, ``type``
            (``"category"`` or ``"accordion"``) and ``strategy``.
        """
        found = []

        print("🔍 Looking for expandable sections with enhanced detection...")

        # Pass 1: well-known category phrases (site independent).
        category_phrases = (
            # Essential categories
            "strictly necessary", "necessary", "essential", "required",
            "functional", "functionality", "performance",
            "analytics", "analytical", "statistics", "statistical",

            # Marketing categories
            "marketing", "advertising", "targeting", "targeted",
            "social media", "social", "personalization", "personalized",

            # Data categories
            "third party", "data collection", "tracking", "cookies",
            "do not sell", "do not share", "opt out", "privacy choices",
        )
        upper_letters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
        lower_letters = 'abcdefghijklmnopqrstuvwxyz'

        for category in category_phrases:
            try:
                needle = category.lower()
                # XPath 1.0 has no lower-case(); translate() emulates it.
                text_test = (
                    f"[contains(translate(text(), '{upper_letters}', "
                    f"'{lower_letters}'), '{needle}')]"
                )

                for tag in ("*", "div", "span"):
                    xpath = f".//{tag}{text_test}"
                    try:
                        for candidate in container.find_elements(By.XPATH, xpath):
                            if not candidate.is_displayed():
                                continue

                            label = candidate.text.strip()
                            # Reject wrappers whose text is (nearly) the whole page.
                            if not (5 < len(label) < 200 and needle in label.lower()):
                                continue

                            print(f"   Found category element: {label[:50]}...")

                            # Prefer a clickable ancestor as the click target.
                            click_target = self._find_clickable_parent(candidate) or candidate
                            found.append({
                                "element": click_target,
                                "text": label,
                                "type": "category",
                                "strategy": f"text_match:{category}"
                            })
                            break

                        # Once the latest entry covers this category, the
                        # remaining XPath variants are not needed.
                        if found and needle in found[-1]["text"].lower():
                            break

                    except Exception:
                        continue

            except Exception as e:
                print(f"   Error searching for {category}: {e}")

        # Pass 2: accordion / collapsible widgets.
        accordion_selectors = (
            ".accordion-header", "[aria-expanded]", "[data-toggle='collapse']",
            "details", "summary", ".expandable", ".collapsible",
            ".toggle", "[role='button']", ".category-header",
        )
        consent_keywords = (
            "cookie", "necessary", "functional", "analytics",
            "advertising", "marketing", "social", "performance",
            "targeting", "privacy", "data",
        )

        for selector in accordion_selectors:
            try:
                for candidate in container.find_elements(By.CSS_SELECTOR, selector):
                    if not candidate.is_displayed():
                        continue

                    label = candidate.text.strip()
                    if not label:
                        continue

                    # Keep only widgets that talk about consent topics.
                    lowered = label.lower()
                    if any(word in lowered for word in consent_keywords):
                        found.append({
                            "element": candidate,
                            "text": label[:100],
                            "type": "accordion",
                            "strategy": selector
                        })
            except Exception:
                continue

        # Drop near-duplicates, keeping the first occurrence of each text.
        unique_sections = []
        for entry in found:
            already_kept = any(
                self._text_similarity(entry["text"], kept["text"]) > 0.8
                for kept in unique_sections
            )
            if not already_kept:
                unique_sections.append(entry)

        print(f"🎯 Total unique expandable sections found: {len(unique_sections)}")
        return unique_sections
    
    def _find_clickable_parent(self, element):
        """Return the nearest clickable ancestor of ``element``, if any.

        Walks up at most three ancestor levels and stops when ``<body>`` is
        reached. An ancestor counts as clickable when it has an ``onclick``
        attribute, has ``role="button"``, or its ``class`` attribute contains
        "click" (case-insensitive).

        Args:
            element: WebElement to start from.

        Returns:
            The first clickable ancestor, or ``None`` when none is found
            within three levels or any element lookup fails.
        """
        try:
            current = element
            for _ in range(3):  # Go up to 3 levels
                parent = current.find_element(By.XPATH, "..")
                if parent.tag_name == "body":
                    break

                # Each signal is tested on its own. Written as a single
                # ``a or b or c if cls else False`` expression, the conditional
                # binds loosest, so ancestors without a class attribute were
                # rejected even when they had onclick / role="button".
                css_class = parent.get_attribute("class") or ""
                if (parent.get_attribute("onclick")
                        or parent.get_attribute("role") == "button"
                        or "click" in css_class.lower()):
                    return parent

                current = parent
        except Exception:
            # Best effort: a failed lookup simply means "no clickable parent".
            pass
        return None
    
    def _text_similarity(self, text1, text2):
        """Return the Jaccard similarity of the two texts' word sets.

        Words are obtained by lower-casing and splitting on whitespace.

        Args:
            text1: First text.
            text2: Second text.

        Returns:
            float: shared words divided by distinct words across both texts,
            in ``[0.0, 1.0]``; ``0.0`` when either text is empty or neither
            contains any word.
        """
        if not (text1 and text2):
            return 0.0

        tokens_a = set(text1.lower().split())
        tokens_b = set(text2.lower().split())

        combined = tokens_a | tokens_b
        if not combined:
            # Both texts were whitespace only.
            return 0.0

        shared = tokens_a & tokens_b
        return len(shared) / len(combined)
    
    def try_expand_element_enhanced(self, expandable_info, settle_time=2):
        """Try a sequence of click strategies until one expands the section.

        Strategies are attempted in order: direct click, JavaScript click,
        double click, parent click, ActionChains click. The first one that
        reports success ends the attempt.

        Args:
            expandable_info: dict with keys ``element``, ``type``,
                ``strategy`` and ``text`` describing the section.
            settle_time: seconds to wait after a successful click so the
                expansion animation can finish (default 2).

        Returns:
            bool: ``True`` if any strategy succeeded, ``False`` otherwise.
        """
        element = expandable_info["element"]
        element_type = expandable_info["type"]
        strategy = expandable_info["strategy"]

        print(f"🚀 Attempting to expand: {expandable_info['text'][:50]}...")
        print(f"   Type: {element_type}, Strategy: {strategy}")

        def _javascript_click():
            # execute_script returns the script's value (None for a bare
            # click()), which would read as "failed" and let the following
            # strategies click again, possibly collapsing the section that
            # was just opened. Not raising is the success signal here.
            self.driver.execute_script("arguments[0].click();", element)
            return True

        # Enhanced universal expansion strategies
        universal_strategies = [
            ("Direct click", lambda: self._click_element_safely_with_iframe_context(element)),
            ("JavaScript click", _javascript_click),
            ("Double click", lambda: self._double_click_element(element)),
            ("Parent click", lambda: self._click_parent_element(element)),
            ("Actions click", lambda: self._actions_click(element)),
        ]

        for strategy_name, strategy_func in universal_strategies:
            try:
                print(f"   Trying strategy: {strategy_name}")
                if strategy_func():
                    print(f"   ✅ Success with: {strategy_name}")
                    time.sleep(settle_time)  # Wait for expansion animation
                    return True
                print(f"   ❌ Failed: {strategy_name}")
            except Exception as e:
                print(f"   ❌ Error with {strategy_name}: {e}")

        print(f"   ⚠️ All strategies failed for: {expandable_info['text'][:30]}")
        return False
    
    def _double_click_element(self, element):
        """Double-click ``element`` through Selenium's ActionChains.

        Args:
            element: WebElement to double-click.

        Returns:
            bool: ``True`` if the action was performed without error,
            ``False`` otherwise.
        """
        try:
            from selenium.webdriver.common.action_chains import ActionChains
            ActionChains(self.driver).double_click(element).perform()
            return True
        except Exception:
            # Interaction failures are an expected outcome here; catching
            # Exception (not a bare except) lets Ctrl+C still interrupt the
            # interactive session.
            return False
    
    def _click_parent_element(self, element):
        """Click the direct parent of ``element``.

        Args:
            element: WebElement whose parent should be clicked.

        Returns:
            The result of ``_click_element_safely_with_iframe_context`` for
            the parent, or ``False`` if the parent could not be located or
            the click raised.
        """
        try:
            parent = element.find_element(By.XPATH, "..")
            return self._click_element_safely_with_iframe_context(parent)
        except Exception:
            # Best effort, but without swallowing KeyboardInterrupt/SystemExit
            # the way a bare ``except:`` would.
            return False
    
    def handle_customize_panel_back_navigation(self):
        """Try to dismiss a consent "customize" panel to get back to the banner.

        Strategies run in order and the first one that completes wins:

        1. click a displayed close/cancel/back control,
        2. click whatever element sits at viewport point (50, 50),
        3. send ESC to ``<body>``,
        4. leave the current iframe (only when inside one).

        Returns:
            bool: ``True`` as soon as a strategy completes without raising,
            ``False`` when none did or an unexpected error occurred.
        """
        try:
            print("🔄 Attempting to navigate back from customize panel...")

            # Strategy 1: explicit close/back/cancel controls in the panel.
            # XPath entries start with "//"; the rest are CSS selectors.
            dismiss_selectors = (
                "//button[contains(text(), 'Close')]",
                "//button[contains(text(), 'Cancel')]",
                "//button[contains(text(), 'Back')]",
                "//button[contains(@aria-label, 'close')]",
                "//button[contains(@aria-label, 'back')]",
                "//button[contains(@class, 'close')]",
                "//button[contains(@class, 'back')]",
                "//span[contains(@class, 'close')]",
                "//div[contains(@class, 'close')][@role='button']",
                "[data-testid*='close']",
                "[data-testid*='back']",
            )

            for selector in dismiss_selectors:
                try:
                    locator_kind = (
                        By.XPATH if selector.startswith(("//", "*")) else By.CSS_SELECTOR
                    )
                    for control in self.driver.find_elements(locator_kind, selector):
                        if not control.is_displayed():
                            continue

                        label = control.text or control.get_attribute('aria-label') or 'unlabeled'
                        print(f"   Found close/back button: {label}")
                        if self._click_element_safely_with_iframe_context(control):
                            print("   ✅ Successfully clicked close/back button")
                            time.sleep(2)
                            return True
                except Exception:
                    continue

            # Strategy 2: click outside the panel (dismisses some modals).
            try:
                print("   Trying to click outside panel...")
                # (50, 50) is usually outside a centred panel.
                self.driver.execute_script("document.elementFromPoint(50, 50).click();")
                time.sleep(1)
                return True
            except Exception:
                pass

            # Strategy 3: ESC key sent to the document body.
            try:
                print("   Trying ESC key...")
                from selenium.webdriver.common.keys import Keys
                self.driver.find_element(By.TAG_NAME, "body").send_keys(Keys.ESCAPE)
                time.sleep(1)
                return True
            except Exception:
                pass

            # Strategy 4: leaving the iframe might reveal the original banner.
            inside_frame = self.current_iframe and self._is_in_iframe()
            if inside_frame:
                print("   Switching back to main content...")
                self.driver.switch_to.default_content()
                self.current_iframe = None
                time.sleep(2)
                return True

            print("   ⚠️ Could not find way to close customize panel")
            return False

        except Exception as e:
            print(f"   ❌ Error in customize panel back navigation: {e}")
            return False
    
    def smart_back_navigation(self, state_stack, current_state_id, parent_id, depth):
        """Go back one exploration state, choosing a strategy by context.

        When the current URL still contains ``self.original_url`` and an
        iframe is being tracked, the customize panel is dismissed in place
        first. If that does not bring the banner back, or the context is a
        separate page, the original page is reloaded through
        ``return_to_original_page_with_banner``.

        Args:
            state_stack: list of ``(url, state_id, parent_id, iframe)``
                tuples; the top entry is popped when navigation is attempted.
            current_state_id: accepted for call compatibility; not read.
            parent_id: accepted for call compatibility; not read.
            depth: accepted for call compatibility; not read.

        Returns:
            tuple: ``(state_id, parent_id, notice_details)`` for the restored
            state, or ``(None, None, None)`` when there is nothing to go
            back to.
        """
        if not state_stack:
            print("❌ Cannot go back further (at root state)")
            return None, None, None

        try:
            # Check what type of state we're in
            current_url = self.driver.current_url

            # Customize panel context: original URL is a substring of the
            # current one and we are tracking an iframe.
            if self.original_url and self.original_url in current_url and self.current_iframe:
                print("🎯 Detected customize panel context, trying panel-specific navigation...")
                if self.handle_customize_panel_back_navigation():
                    print("✅ Successfully navigated back within customize context")

                    # Check if we're back to the main banner
                    time.sleep(2)
                    if self.wait_for_consent_banner(timeout=3):
                        # Find the original banner state
                        consent_elements = self.find_consent_elements()
                        if consent_elements["banners"]:
                            notice_details = self.extract_notice_details(consent_elements["banners"][0])
                        elif consent_elements["iframes"]:
                            notice_details = self.switch_to_iframe_and_extract(consent_elements["iframes"][0])
                        else:
                            notice_details = {"text": "Banner restored but content not detected", "buttons": [], "links": []}

                        # Pop the state stack and return updated values
                        prev_url, prev_state_id, prev_parent_id, prev_iframe = state_stack.pop()
                        return prev_state_id, prev_parent_id, notice_details

            # Standard navigation for policy pages and other external pages
            print("🏠 Using standard page navigation...")

            # Switch back to main content if in iframe
            if self.current_iframe:
                self.driver.switch_to.default_content()
                self.current_iframe = None

            # Smart return to original page with banner restoration
            if self.return_to_original_page_with_banner():
                print("✅ Successfully returned with banner state preserved")

                # Rediscover notice elements
                consent_elements = self.find_consent_elements()

                if consent_elements["banners"]:
                    notice_details = self.extract_notice_details(consent_elements["banners"][0])
                elif consent_elements["iframes"]:
                    notice_details = self.switch_to_iframe_and_extract(consent_elements["iframes"][0])
                else:
                    notice_details = {"text": "No notice detected after return", "buttons": [], "links": []}

                # Pop the state stack and return updated values
                prev_url, prev_state_id, prev_parent_id, prev_iframe = state_stack.pop()
                return prev_state_id, prev_parent_id, notice_details
            else:
                print("⚠️ Could not restore banner state")
                # Still pop the state stack
                prev_url, prev_state_id, prev_parent_id, prev_iframe = state_stack.pop()
                return prev_state_id, prev_parent_id, {"text": "Navigation completed but banner not restored", "buttons": [], "links": []}

        except Exception as e:
            print(f"❌ Error in smart back navigation: {e}")
            # Still try to pop the state stack
            if state_stack:
                prev_url, prev_state_id, prev_parent_id, prev_iframe = state_stack.pop()
                return prev_state_id, prev_parent_id, {"text": "Error during navigation", "buttons": [], "links": []}
            return None, None, None

    def extract_policy_page_content(self):
        """Extract content from policy pages.

        The method header was missing, which left this body as unreachable
        code after the final ``return`` of ``smart_back_navigation``.

        The first of several common content containers whose text is longer
        than 200 characters is used; failing that, the ``<body>`` text is
        used, truncated to 5000 characters plus ``"..."``.

        Returns:
            dict: ``text`` (str), ``buttons`` (always an empty list) and
            ``links``, a list of ``{"text", "href", "element"}`` dicts taken
            from the first 10 anchors of the chosen container. On an
            unexpected error ``text`` is ``"Error extracting policy content"``.
        """
        policy_details = {
            "text": "",
            "buttons": [],
            "links": []
        }

        try:
            # Try to find main content container
            for selector in ["main", "article", ".content", "#content", ".privacy-policy", ".cookie-policy"]:
                try:
                    content_element = self.driver.find_element(By.CSS_SELECTOR, selector)
                    policy_text = content_element.text
                    if len(policy_text) <= 200:
                        # Too short to be the policy itself; try the next one.
                        continue

                    # Get links within policy
                    found_links = []
                    for link in content_element.find_elements(By.TAG_NAME, "a")[:10]:  # Limit to 10 links
                        if not link.is_displayed():
                            continue
                        link_text = link.text.strip()
                        link_href = link.get_attribute("href")
                        if link_text and link_href:
                            found_links.append({
                                "text": link_text,
                                "href": link_href,
                                "element": link
                            })

                    # Commit text and links together so a failure while
                    # reading links leaves no half-filled result behind.
                    policy_details["text"] = policy_text
                    policy_details["links"] = found_links
                    return policy_details
                except Exception:
                    continue

            # Fallback to body
            body_text = self.driver.find_element(By.TAG_NAME, "body").text
            policy_details["text"] = body_text[:5000] + "..." if len(body_text) > 5000 else body_text

        except Exception as e:
            print(f"❌ Error extracting policy content: {e}")
            policy_details["text"] = "Error extracting policy content"

        return policy_details
    
    def save_results(self, filename=None):
        """Write the captured states to an Excel workbook and print a summary.

        Args:
            filename: Target path; ``DATA_FILE`` is used when falsy.

        Returns:
            The exported ``pandas.DataFrame``, or ``None`` when ``self.data``
            is empty.
        """
        if not self.data:
            print("❌ No data to save")
            return None

        target = filename or DATA_FILE

        # Rows are exported without their 'element' entry (WebElement objects).
        exportable_rows = [
            {column: value for column, value in record.items() if column != 'element'}
            for record in self.data
        ]

        frame = pd.DataFrame(exportable_rows)
        frame.to_excel(target, index=False)
        print(f"💾 Results saved to {target}")

        # Summary of what was written
        print("\n📊 Summary:")
        print(f"   Total states captured: {len(frame)}")
        print(f"   Websites processed: {frame['Website'].nunique()}")

        return frame
    
    def close(self):
        """Leave any tracked iframe and quit the browser.

        Both steps are best effort and report failures instead of raising.
        The driver is quit even when leaving the iframe fails, so a broken
        frame context cannot leave a browser process behind.
        """
        # Make sure we're out of any iframe
        if getattr(self, "current_iframe", None):
            try:
                self.driver.switch_to.default_content()
            except Exception as e:
                print(f"⚠️ Could not leave iframe before closing: {e}")

        try:
            self.driver.quit()
            print("🔌 Driver closed successfully")
        except Exception as e:
            print(f"⚠️ Error closing driver: {e}")

# Example usage
def main():
    """Explore one website interactively, export the results and preview them.

    The driver is always closed at the end, whether the run succeeds, fails
    or is interrupted with Ctrl-C.
    """
    print("🚀 Enhanced Privacy Policy Scraper with Smart Navigation")
    print("=" * 70)

    # Initialize scraper
    scraper = EnhancedPrivacyScraper(headless=False)

    # Website to test
    website = 'https://www.dropbox.com'

    try:
        # Run interactive exploration (recorded states accumulate on the scraper)
        scraper.explore_interactively(website, max_depth=MAX_DEPTH)

        # Save results
        df = scraper.save_results()

        # save_results() returns None when nothing was recorded, so guard
        # against that before touching the frame.
        if df is not None and not df.empty:
            preview_columns = ['Website', 'StateID', 'ChoiceMade', 'Choice Provided',
                               'In Iframe', 'Banner State Preserved']
            # Only preview the columns that were actually exported; a missing
            # column must not turn a successful run into a reported failure.
            available = [column for column in preview_columns if column in df.columns]
            print("\n📋 Data Preview:")
            print((df[available] if available else df).head())

    except KeyboardInterrupt:
        print("\n⚠️ Scraping interrupted by user")
    except Exception as e:
        print(f"\n❌ Scraping failed: {e}")
    finally:
        scraper.close()

# Run the scraper only when this file is executed directly, not when it is imported.
if __name__ == "__main__":
    main()

## Attempt to improve

In [3]:
"""
ENHANCED: Smooth Navigation with Banner State Preservation
This version implements smart cookie management and banner re-triggering for seamless navigation
"""

import os
import time
import uuid
import pandas as pd
from datetime import datetime
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, StaleElementReferenceException, NoSuchElementException
from webdriver_manager.chrome import ChromeDriverManager
from urllib.parse import urlparse
import json
import random
import re
import requests

# Configuration
SCREENSHOTS_DIR = "screenshots"  # directory take_screenshot() saves PNG files into
DATA_FILE = "privacy_notice_data.xlsx"  # Excel workbook name — presumably the default export target; confirm against save_results
MAX_DEPTH = 3  # exploration depth limit — presumably passed to explore_interactively; confirm against caller
WAIT_TIME = 5  # seconds to sleep after navigating directly to a page

# Create the screenshot directory up front; exist_ok makes this a no-op when it already exists
os.makedirs(SCREENSHOTS_DIR, exist_ok=True)

class EnhancedPrivacyScraper:
    
    def __init__(self, headless=False):
        """Initialize scraper with enhanced tracking"""
        self.driver = self.initialize_driver_with_stealth(headless)
        self.data = []
        self.current_iframe = None
        self.original_url = None
        self.banner_state_preserved = False
        self.current_depth = 0  # NEW: Track current exploration depth
    
    def switch_to_iframe_and_extract(self, iframe):
        """Enter *iframe* and extract the notice details from its ``<body>``.

        On success the driver is deliberately left inside the iframe (and
        ``self.current_iframe`` points at it) so that follow-up interactions
        act on the frame's content. If entering the frame fails, the driver
        is returned to the top-level document and the tracking is cleared.

        Args:
            iframe: The iframe element to switch into.

        Returns:
            A notice-details dict. ``is_in_iframe`` is True whenever the
            switch succeeded, even if the extraction itself failed and the
            remaining fields are blank.
        """
        def blank_details(in_iframe):
            # Same shape as the dict produced by extract_notice_details()
            return {
                "text": "", "buttons": [], "links": [], "is_in_iframe": in_iframe,
                "all_buttons_formatted": "", "has_reject": False,
                "has_accept_all": False, "has_customize": False
            }

        try:
            print(f"🔄 Switching to iframe for content extraction...")
            self.driver.switch_to.frame(iframe)
            self.current_iframe = iframe  # remember that we are inside the frame
        except Exception as switch_error:
            print(f"❌ Error processing iframe: {switch_error}")
            self.driver.switch_to.default_content()
            self.current_iframe = None
            return blank_details(in_iframe=False)

        # Look for notice content inside the frame
        try:
            frame_body = self.driver.find_element(By.TAG_NAME, "body")
            extracted = self.extract_notice_details(frame_body)
            extracted["is_in_iframe"] = True

            print(f"📦 Extracted iframe content: {len(extracted['buttons'])} buttons, {len(extracted['links'])} links")
            print(f"📦 Button analysis - Reject: {extracted['has_reject']}, Accept All: {extracted['has_accept_all']}, Customize: {extracted['has_customize']}")
        except Exception as extract_error:
            print(f"Error extracting iframe content: {extract_error}")
            extracted = blank_details(in_iframe=True)

        # No switch back to the default content here: stay in the iframe for interactions
        return extracted
        
    def initialize_driver_with_stealth(self, headless=False):
        """Create a Chrome driver configured to look less like automation.

        Args:
            headless: When True, Chrome is started with ``--headless``.

        Returns:
            The started ``webdriver.Chrome`` instance.
        """
        chrome_options = Options()

        if headless:
            chrome_options.add_argument("--headless")

        # Enhanced anti-detection measures
        chrome_options.add_argument("--disable-blink-features=AutomationControlled")
        chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
        chrome_options.add_experimental_option('useAutomationExtension', False)
        chrome_options.add_argument("--disable-gpu")
        chrome_options.add_argument("--window-size=1920,1080")
        chrome_options.add_argument("--no-sandbox")
        chrome_options.add_argument("--disable-dev-shm-usage")

        # Rotate user agents
        # NOTE(review): the last entry is a Firefox user agent although the
        # browser being driven is Chrome — confirm this mismatch is intended.
        user_agents = [
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36",
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:122.0) Gecko/20100101 Firefox/122.0"
        ]
        user_agent = random.choice(user_agents)
        chrome_options.add_argument(f"user-agent={user_agent}")

        # Initialize driver
        service = Service(ChromeDriverManager().install())
        driver = webdriver.Chrome(service=service, options=chrome_options)

        # Mask the automation flag. execute_script() alone only patches the
        # document that is currently loaded and is lost on the first
        # navigation, so the patch is also registered through the DevTools
        # protocol to have it injected into every new document.
        mask_webdriver_js = "Object.defineProperty(navigator, 'webdriver', {get: () => undefined})"
        try:
            driver.execute_cdp_cmd(
                "Page.addScriptToEvaluateOnNewDocument",
                {"source": mask_webdriver_js},
            )
        except Exception as e:
            # Best effort: keep going with the per-document patch below
            print(f"⚠️ Could not register stealth script for new documents: {e}")
        driver.execute_script(mask_webdriver_js)

        print(f"Driver initialized with user agent: {user_agent[:50]}...")
        return driver
    
    def wait_for_consent_banner(self, timeout=10):
        """Wait for a consent banner to show up on the current page.

        All known CMP indicators are polled together until one of them is
        displayed or ``timeout`` seconds have elapsed, so the call does not
        block longer than advertised (giving every indicator its own fixed
        2-second wait would ignore ``timeout`` entirely). If nothing becomes
        visible in time, the page's iframes are checked for consent-related
        attributes.

        Args:
            timeout: Maximum number of seconds to wait for a visible banner.

        Returns:
            True when a visible banner or a consent-looking iframe was found,
            otherwise False. ``self.banner_state_preserved`` is set to the
            same value.
        """
        print(f"Waiting up to {timeout}s for consent banner to appear...")

        # Strategy 1: Common CMP indicators
        cmp_indicators = [
            ("id", "onetrust-banner-sdk"),
            ("id", "onetrust-consent-sdk"),
            ("class", "ot-sdk-container"),
            ("id", "CybotCookiebotDialog"),
            ("class", "CybotCookiebotDialog"),
            ("id", "truste-consent-track"),
            ("class", "truste-banner"),
            ("class", "qc-cmp2-container"),
            ("class", "qc-cmp-ui-container"),
            ("id", "didomi-popup"),
            ("class", "didomi-popup-container"),
            ("id", "usercentrics-root"),
            ("data-testid", "uc-container"),
            ("class", "cookie-consent"),
            ("class", "cookie-banner"),
            ("class", "consent-banner"),
            ("class", "privacy-banner"),
            ("class", "gdpr-banner"),
            ("role", "dialog"),
        ]

        def to_locator(attr_type, attr_value):
            """Translate an (attribute, value) pair into a Selenium locator."""
            if attr_type == "id":
                return (By.ID, attr_value)
            if attr_type == "class":
                return (By.CLASS_NAME, attr_value)
            # Any other attribute (data-testid, role, ...) is matched exactly
            return (By.CSS_SELECTOR, f"[{attr_type}='{attr_value}']")

        def first_visible_indicator(driver):
            """Return the first indicator with a displayed element, else False."""
            for attr_type, attr_value in cmp_indicators:
                try:
                    matches = driver.find_elements(*to_locator(attr_type, attr_value))
                    if any(match.is_displayed() for match in matches):
                        return (attr_type, attr_value)
                except (StaleElementReferenceException, NoSuchElementException):
                    # The DOM changed under us; the next poll will look again
                    continue
            return False

        try:
            attr_type, attr_value = WebDriverWait(self.driver, timeout).until(first_visible_indicator)
            print(f"Found consent banner via {attr_type}='{attr_value}'")
            self.banner_state_preserved = True
            return True
        except TimeoutException:
            pass

        # Strategy 2: Check for iframes
        try:
            for iframe in self.driver.find_elements(By.TAG_NAME, "iframe"):
                iframe_attrs = self._get_iframe_info(iframe)
                if self._is_consent_iframe(iframe_attrs):
                    print(f"Found potential consent iframe: {iframe_attrs}")
                    self.banner_state_preserved = True
                    return True
        except Exception as e:
            print(f"Error checking iframes: {e}")

        self.banner_state_preserved = False
        return False
    
    def force_banner_reappearance(self, settle_seconds=3):
        """Make the cookie banner reappear by wiping stored consent state.

        Consent-related cookies and localStorage keys are removed and, if
        anything was removed from either store, the page is refreshed so the
        site asks for consent again. localStorage removals count as well:
        a site that keeps its consent only there still needs the refresh.

        Args:
            settle_seconds: Seconds to sleep after the refresh so the page
                can render the banner.

        Returns:
            True when consent state was cleared and the page refreshed,
            False when nothing was found to clear or an error occurred.
        """
        try:
            print("🔄 Attempting to restore banner state...")

            # Clear consent cookies specifically
            consent_cookie_patterns = ['OptanonConsent', 'euconsent', 'gdpr', 'cookie_consent', 'consent']
            lowered_patterns = [pattern.lower() for pattern in consent_cookie_patterns]

            cookies_cleared = 0
            for cookie in self.driver.get_cookies():
                cookie_name = cookie['name'].lower()
                if any(pattern in cookie_name for pattern in lowered_patterns):
                    self.driver.delete_cookie(cookie['name'])
                    cookies_cleared += 1
                    print(f"🍪 Deleted consent cookie: {cookie['name']}")

            # Clear localStorage consent data; the script reports how many
            # keys it removed so that this can trigger the refresh too.
            removed_keys = self.driver.execute_script("""
                // Clear common consent localStorage keys
                const consentKeys = Object.keys(localStorage).filter(key => 
                    key.toLowerCase().includes('consent') || 
                    key.toLowerCase().includes('gdpr') ||
                    key.toLowerCase().includes('cookie')
                );
                consentKeys.forEach(key => localStorage.removeItem(key));
                console.log('Cleared localStorage consent keys:', consentKeys);
                return consentKeys.length;
            """)
            storage_cleared = removed_keys if isinstance(removed_keys, int) else 0

            # Only refresh if we cleared something
            if cookies_cleared > 0 or storage_cleared > 0:
                print(f"🔄 Refreshing page after clearing {cookies_cleared} consent cookies "
                      f"and {storage_cleared} localStorage keys...")
                self.driver.refresh()
                time.sleep(settle_seconds)
                return True

            print("ℹ️ No consent cookies or localStorage entries found to clear")
            return False

        except Exception as e:
            print(f"⚠️ Error forcing banner reappearance: {e}")
            return False
    
    def return_to_original_page_with_banner(self):
        """Navigate back to ``self.original_url`` and get the banner showing.

        Browser "back" is tried first; if that does not land on the original
        page (or fails), the URL is loaded directly. Afterwards the banner is
        awaited and, if it is missing, stored consent state is cleared to
        force it back.

        Returns:
            True when the banner is visible on the original page, False when
            no original URL is stored, the banner could not be restored, or
            navigation failed. Ctrl-C is not swallowed: ``KeyboardInterrupt``
            propagates to the caller.
        """
        if not self.original_url:
            print("⚠️ No original URL stored")
            return False

        try:
            print(f"🏠 Returning to original page: {self.original_url}")

            # First try browser back navigation (preserves more state)
            if self.driver.current_url != self.original_url:
                try:
                    # Try using browser back if we're just one page away
                    self.driver.back()
                    time.sleep(2)

                    # Check if we're back to the right page
                    if self.original_url not in self.driver.current_url:
                        # Back didn't work, use direct navigation
                        print("   🔄 Browser back didn't work, using direct navigation...")
                        self.driver.get(self.original_url)
                        time.sleep(WAIT_TIME)
                except Exception as nav_error:
                    # Fallback to direct navigation. Only ordinary errors are
                    # handled here so that an interrupt from the user is not
                    # mistaken for a failed "back".
                    print(f"   ⚠️ Browser back failed ({nav_error}), using direct navigation...")
                    self.driver.get(self.original_url)
                    time.sleep(WAIT_TIME)

            # Check if banner is visible
            if self.wait_for_consent_banner(timeout=3):
                print("✅ Banner restored successfully")
                return True

            # Try to force banner reappearance
            print("🔧 Banner not visible, attempting to restore...")
            if self.force_banner_reappearance():
                return self.wait_for_consent_banner(timeout=5)
            return False

        except Exception as e:
            print(f"❌ Error returning to original page: {e}")
            return False
    
    def find_consent_elements(self):
        """Collect consent banners and consent iframes on the current page.

        Several selectors overlap (for example an element whose id contains
        both "cookie" and "consent"), so every banner element is collected
        only once even when more than one selector matches it.

        Side effects:
            ``self.current_iframe`` is reset to None and then set to the last
            consent-looking iframe found, if any.

        Returns:
            A dict with the keys ``banners``, ``iframes``, ``buttons``,
            ``links`` and ``toggles``; only ``banners`` and ``iframes`` are
            filled in here.
        """
        consent_elements = {
            "banners": [],
            "iframes": [],
            "buttons": [],
            "links": [],
            "toggles": []
        }

        # Reset iframe tracking
        self.current_iframe = None

        # Enhanced selectors for consent banners
        banner_selectors = [
            "[id*='cookie'][id*='banner']",
            "[id*='cookie'][id*='consent']",
            "[id*='privacy'][id*='banner']",
            "[id*='gdpr']",
            "[id*='consent']",
            "#onetrust-banner-sdk",
            "#CybotCookiebotDialog",
            "#didomi-popup",
            "#usercentrics-root",
            "[class*='cookie-banner']",
            "[class*='consent-banner']",
            "[class*='privacy-banner']",
            "[class*='gdpr-banner']",
            ".qc-cmp2-container",
            ".ot-sdk-container",
            "[role='dialog'][aria-label*='cookie']",
            "[role='dialog'][aria-label*='consent']",
            "[role='dialog'][aria-label*='privacy']",
            "[data-testid*='cookie']",
            "[data-testid*='consent']",
            "[data-component*='cookie']",
            "[data-component*='consent']"
        ]
        banner_keywords = ['cookie', 'privacy', 'consent', 'data', 'accept']
        banners = consent_elements["banners"]

        # Find banner elements
        for selector in banner_selectors:
            try:
                for element in self.driver.find_elements(By.CSS_SELECTOR, selector):
                    if element in banners:
                        # Already collected through an earlier, overlapping selector
                        continue
                    # Ignore hidden or tiny matches; they are not the banner itself
                    if element.is_displayed() and element.size['height'] > 50:
                        text = element.text.lower()
                        if any(keyword in text for keyword in banner_keywords):
                            banners.append(element)
                            print(f"Found banner via selector: {selector}")
            except Exception:
                # A selector that fails (e.g. stale element) must not stop the scan
                continue

        # Find iframes and store the active one
        try:
            for iframe in self.driver.find_elements(By.TAG_NAME, "iframe"):
                iframe_info = self._get_iframe_info(iframe)
                if self._is_consent_iframe(iframe_info):
                    consent_elements["iframes"].append(iframe)
                    # Store the iframe for later use
                    self.current_iframe = iframe
        except Exception:
            pass

        return consent_elements
    
    def _get_iframe_info(self, iframe):
        """Get iframe attributes"""
        return {
            "id": iframe.get_attribute("id") or "",
            "class": iframe.get_attribute("class") or "",
            "src": iframe.get_attribute("src") or "",
            "name": iframe.get_attribute("name") or ""
        }
    
    def _is_consent_iframe(self, iframe_info):
        """Check if iframe likely contains consent content"""
        consent_keywords = ['consent', 'cookie', 'privacy', 'gdpr', 'ccpa', 'cmp', 'notice']
        iframe_str = str(iframe_info).lower()
        return any(keyword in iframe_str for keyword in consent_keywords)
    
    def extract_notice_details(self, element):
        """Extract the text, clickable controls and links of a notice element.

        Both real ``<button>`` elements and ``span``/``div`` elements with
        ``role='button'`` are collected. Each visible, non-empty control is
        classified by its label into reject / accept-all / customize.
        "agree" only counts as acceptance when it starts a word, so a
        "Disagree" button is not reported as an accept option.

        Args:
            element: The banner (or iframe body) element to inspect.

        Returns:
            A dict with the keys ``text``, ``buttons`` (dicts of text and
            element), ``links`` (dicts of text, resolved href, original href
            and element), ``is_in_iframe`` (always False here),
            ``all_buttons_formatted`` (labels joined with "; ") and the
            booleans ``has_reject``, ``has_accept_all`` and ``has_customize``.
        """
        details = {
            "text": element.text.strip(),
            "buttons": [],
            "links": [],
            "is_in_iframe": False,
            "all_buttons_formatted": "",  # semicolon-separated button text
            "has_reject": False,          # boolean flags
            "has_accept_all": False,
            "has_customize": False
        }

        reject_words = ('reject', 'decline', 'deny')
        customize_words = ('customize', 'preferences', 'settings', 'manage')
        # The lookbehind keeps "disagree" from matching "agree"
        accept_pattern = re.compile(r"accept all|allow all|(?<![a-z])agree")

        button_texts = []

        def register_control(control):
            """Record one visible, labelled control and update the type flags."""
            if not control.is_displayed():
                return
            label = control.text.strip()
            if not label:
                return

            details["buttons"].append({"text": label, "element": control})
            button_texts.append(label)

            lowered = label.lower()
            if any(word in lowered for word in reject_words):
                details["has_reject"] = True
            if accept_pattern.search(lowered):
                details["has_accept_all"] = True
            if any(word in lowered for word in customize_words):
                details["has_customize"] = True

        # Extract buttons
        try:
            for button in element.find_elements(By.TAG_NAME, "button"):
                register_control(button)
        except Exception as e:
            print(f"Error extracting buttons: {e}")

        # Also look for span/div buttons
        try:
            for item in element.find_elements(By.CSS_SELECTOR, "span[role='button'], div[role='button']"):
                register_control(item)
        except Exception as e:
            print(f"Error extracting span/div buttons: {e}")

        # Join once from the collected labels, so a label that itself
        # contains "; " cannot be split apart.
        details["all_buttons_formatted"] = "; ".join(button_texts)

        # Extract links
        try:
            for link in element.find_elements(By.TAG_NAME, "a"):
                if link.is_displayed():
                    link_text = link.text.strip()
                    link_href = link.get_attribute("href")
                    if link_text and link_href:
                        # Resolve iframe URLs to actual policy URLs
                        resolved_href = self._resolve_policy_url(link_href, link_text)
                        details["links"].append({
                            "text": link_text,
                            "href": resolved_href,
                            "original_href": link_href,
                            "element": link
                        })
        except Exception as e:
            print(f"Error extracting links: {e}")

        return details

    def _resolve_policy_url(self, href, link_text):
        """ENHANCED URL resolution with website base detection"""
        # If it's already a normal URL (not iframe), return as is
        if href.startswith("http") and "iframe" not in href and "#" not in href:
            return href
        
        # Get base domain from current page or original URL
        base_url = self.original_url or self.driver.current_url
        parsed = urlparse(base_url)
        base_domain = f"{parsed.scheme}://{parsed.netloc}"
        
        # Enhanced resolution logic from second code
        link_lower = link_text.lower()
        
        # Common policy URL patterns for any website
        if "privacy policy faq" in link_lower or ("faq" in link_lower and "privacy" in link_lower):
            # Try common FAQ URL patterns
            common_faq_patterns = [
                "/privacy-faq",
                "/help/privacy-faq", 
                "/privacy/faq",
                "/help/security/privacy-policy-faq",
                "/legal/privacy-faq",
                "/privacy#faq"
            ]
            
            # Test which URL exists
            for pattern in common_faq_patterns:
                test_url = base_domain + pattern
                if self._test_url_exists(test_url):
                    return test_url
            
            # Fallback to first pattern
            return f"{base_domain}{common_faq_patterns[0]}"
            
        elif "privacy policy" in link_lower or "privacy" in link_lower:
            return f"{base_domain}/privacy"
            
        elif "cookie policy" in link_lower or "cookie" in link_lower:
            return f"{base_domain}/cookie-policy"
            
        elif "terms" in link_lower:
            return f"{base_domain}/terms"
            
        elif "legal" in link_lower:
            return f"{base_domain}/legal"
        else:
            # Generic fallback
            return f"{base_domain}/privacy"
    
    def _test_url_exists(self, url):
        """Test if a URL exists"""
        try:
            response = requests.head(url, allow_redirects=True, timeout=3, 
                                headers={'User-Agent': 'Mozilla/5.0'})
            return response.status_code < 400
        except:
            return False
        
    def _click_element_safely_with_iframe_context(self, element):
        """Enhanced element clicking with iframe context awareness"""
        try:
            print(f"🎯 Attempting to click element (iframe context: {self.current_iframe is not None})")
            
            # If we have an iframe and we're not in it, switch to it
            if self.current_iframe and not self._is_in_iframe():
                print(f"🔄 Switching to iframe for button click...")
                self.driver.switch_to.frame(self.current_iframe)
                time.sleep(1)
                
                # Re-find the button in iframe context
                button_text = element.text if hasattr(element, 'text') else "Unknown"
                buttons = self.driver.find_elements(By.CSS_SELECTOR, "button, [role='button']")
                
                target_button = None
                for btn in buttons:
                    if btn.is_displayed() and btn.text.strip() == button_text:
                        target_button = btn
                        break
                
                if target_button:
                    element = target_button
                    print(f"✅ Found button in iframe: {button_text}")
                else:
                    print(f"❌ Button not found in iframe: {button_text}")
                    return False
            
            # Try multiple click strategies
            strategies = [
                ("Direct click", lambda: element.click()),
                ("JavaScript click", lambda: self.driver.execute_script("arguments[0].click();", element)),
                ("Scroll and click", self._scroll_and_click),
                ("Actions click", self._actions_click)
            ]
            
            for strategy_name, strategy_func in strategies:
                try:
                    print(f"   🚀 Trying: {strategy_name}")
                    if strategy_name in ["Scroll and click", "Actions click"]:
                        strategy_func(element)
                    else:
                        strategy_func()
                    print(f"   ✅ Success with: {strategy_name}")
                    return True
                except Exception as e:
                    print(f"   ❌ Failed {strategy_name}: {e}")
                    continue
            
            return False
            
        except Exception as e:
            print(f"❌ All click strategies failed: {e}")
            return False

    def _is_in_iframe(self):
        """Check if we're currently in an iframe"""
        try:
            # Try to access the main window - if we're in iframe, this will be different
            main_window = self.driver.execute_script("return window.top === window;")
            return not main_window
        except:
            return False

    def _scroll_and_click(self, element):
        """Bring *element* to the middle of the viewport, then click it."""
        center_in_viewport = "arguments[0].scrollIntoView({block: 'center'});"
        self.driver.execute_script(center_in_viewport, element)
        # Short pause between scrolling and clicking
        time.sleep(0.5)
        element.click()

    def _actions_click(self, element):
        """Click *element* through the Actions API: move to it, then click."""
        from selenium.webdriver.common.action_chains import ActionChains

        chain = ActionChains(self.driver)
        chain.move_to_element(element)
        chain.click()
        chain.perform()
    
    def display_options(self, notice_details):
        """Print the notice, its buttons and links, and prompt for an action.

        Args:
            notice_details: Dict with the notice ``text`` and the ``buttons``
                and ``links`` lists to show.

        Returns:
            The raw string the user typed at the prompt.
        """
        print(f"\n--- Notice Text ---\n{notice_details['text']}\n")

        print("--- Available Buttons ---")
        for number, button in enumerate(notice_details['buttons'], start=1):
            print(f"{number}. {button['text']}")

        print("\n--- Available Links ---")
        for number, link in enumerate(notice_details['links'], start=1):
            print(f"{number}. {link['text']} => {link['href']}")

        # Menu of commands understood by the interactive loop
        menu_lines = (
            "\n--- Actions ---",
            "To click a button, enter: B<number> (e.g., B1 for the first button)",
            "To click a link, enter: L<number> (e.g., L1 for the first link)",
            "To take a manual screenshot, enter: S",
            "To find and expand dropdown sections, enter: E",
            "To go back, enter: BACK",
            "To finish exploration, enter: DONE",
        )
        for menu_line in menu_lines:
            print(menu_line)

        return input("Enter your choice: ")
    
    def take_screenshot(self, state_id=None, custom_name=None):
        """Save a screenshot of the current page into ``SCREENSHOTS_DIR``.

        The file name is built from the site's domain (or "unknown" when no
        ``current_website`` attribute is set), an optional label and a
        second-resolution timestamp. ``custom_name`` takes precedence over
        ``state_id`` for the label.

        Args:
            state_id: State identifier to embed in the file name.
            custom_name: Free-form label to embed instead of the state id.

        Returns:
            The path of the written PNG file.
        """
        if hasattr(self, 'current_website'):
            domain = urlparse(self.current_website).netloc.replace('.', '_')
        else:
            domain = "unknown"
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

        # Optional middle part of the file name
        if custom_name:
            label = f"_{custom_name}"
        elif state_id:
            label = f"_state_{state_id}"
        else:
            label = ""
        filename = f"{domain}{label}_{timestamp}.png"

        filepath = os.path.join(SCREENSHOTS_DIR, filename)
        self.driver.save_screenshot(filepath)
        return filepath
    
    def record_state(self, website, parent_id, state_id, notice_details, screenshot_path, choice=None, action_type=None):
        """Append one exploration state to ``self.data`` in the Excel layout.

        Args:
            website: Site the state belongs to.
            parent_id: Id of the state this one was reached from, if any.
            state_id: Id of the state being recorded.
            notice_details: Dict with at least ``text`` and ``links``; the
                button summary and flags are optional.
            screenshot_path: Path of the screenshot taken for this state.
            choice: The choice that led here; also used to derive the action
                type when ``action_type`` is not given.
            action_type: Explicit action label; overrides the derived one.

        Returns:
            ``self.data``, including the newly appended row.
        """
        # A page counts as a policy page when we have left the original URL
        # and the new URL mentions a policy-related keyword.
        current_url = self.driver.current_url
        policy_keywords = ('privacy', 'policy', 'legal', 'terms', 'cookie')
        is_policy_page = (current_url != self.original_url and
                          any(keyword in current_url.lower() for keyword in policy_keywords))

        # Exactly five link slots are exported; unused slots stay blank
        links = notice_details['links']
        links_data = {}
        for slot in range(5):
            has_link = slot < len(links)
            links_data[f"Link {slot + 1}"] = links[slot]['text'] if has_link else ""
            links_data[f"Link {slot + 1} Detail"] = links[slot]['href'] if has_link else ""

        # Derive the action type from the choice unless one was given
        if action_type is None:
            if not choice:
                action_type = ""
            else:
                lowered_choice = choice.lower()
                if "customize" in lowered_choice:
                    action_type = "Customize cookies"
                elif "reject" in lowered_choice or "decline" in lowered_choice:
                    action_type = "Decline"
                elif "accept" in lowered_choice:
                    action_type = "Accept All"
                else:
                    # Includes "Policy Link: [name]" choices, which are kept verbatim
                    action_type = choice

        # One row per state, with all required columns
        row = {
            "Website": website,
            "ParentID": parent_id if parent_id else "",
            "StateID": state_id,
            "Depth": self.current_depth,
            "Timestamp": datetime.now().isoformat(),
            "Screenshot": screenshot_path,
            "Text of Notice": notice_details['text'],
            "All Buttons": notice_details.get('all_buttons_formatted', ''),
            "Has Reject Option": notice_details.get('has_reject', False),
            "Has Accept All": notice_details.get('has_accept_all', False),
            "Has Customize": notice_details.get('has_customize', False),
            "Choice Made": choice if choice else "",
            "Is Policy Page": is_policy_page,
        }
        row.update(links_data)
        row["Action Type"] = action_type

        self.data.append(row)
        print(f"📝 Recorded state: {state_id} (Depth: {self.current_depth}, Policy Page: {is_policy_page})")
        return self.data
  
    def extract_policy_page_content(self):
        """Extract text, buttons and policy-related links from a policy page.

        The main content container is the first displayed element, among a
        list of common content selectors, holding more than 200 characters of
        text; ``<body>`` is used when none qualifies.  The container text is
        whitespace-normalised and stripped of short navigation/copyright
        lines, then buttons and links are collected from inside the container.

        Returns:
            dict: ``text`` (cleaned page text, or an error message if the
            extraction failed), ``buttons`` (list of ``{"text", "element"}``),
            ``links`` (up to 10 ``{"text", "href", "element"}`` entries whose
            label contains a policy keyword), ``is_in_iframe`` (always
            ``False`` here), ``all_buttons_formatted`` ("; "-joined button
            labels) and the ``has_reject`` / ``has_accept_all`` /
            ``has_customize`` flags derived from the button labels.
        """
        policy_details = {
            "text": "",
            "buttons": [],
            "links": [],
            "is_in_iframe": False,
            "all_buttons_formatted": "",
            "has_reject": False,
            "has_accept_all": False,
            "has_customize": False
        }

        content_selectors = [
            "main",
            "article",
            ".content",
            "#content",
            ".privacy-policy",
            ".cookie-policy",
            "[role='main']",
            ".policy-content",
            ".legal-content",
            ".page-content",
            "#main-content"
        ]
        button_selectors = [
            "button",
            "[role='button']",
            "input[type='submit']",
            "input[type='button']",
            ".btn",
            ".button"
        ]
        link_keywords = ('privacy', 'cookie', 'terms', 'legal', 'policy', 'data', 'gdpr', 'ccpa')
        max_links = 10

        # Lines that are pure navigation or copyright clutter.
        skip_line = re.compile(
            r'^(home|about|contact|login|sign up|menu|navigation)$'
            r'|^©.*\d{4}'
            r'|^(cookies|privacy|terms|legal)\s*$',
            re.IGNORECASE,
        )
        # Leading word boundary: a "Disagree" button must not count as "agree".
        accept_label = re.compile(r'\b(?:accept all|allow all|agree)')

        try:
            print("📄 Extracting policy page content...")

            # Strategy 1: first displayed, text-heavy match among the selectors
            main_content = None
            for selector in content_selectors:
                try:
                    candidates = self.driver.find_elements(By.CSS_SELECTOR, selector)
                    main_content = next(
                        (el for el in candidates
                         if el.is_displayed() and len(el.text.strip()) > 200),
                        None,
                    )
                except Exception:
                    # A broken selector or stale element only disqualifies this selector
                    continue
                if main_content is not None:
                    print(f"Found main content using selector: {selector}")
                    break

            # Strategy 2: fall back to the whole body
            if main_content is None:
                print("Using body as fallback for content extraction")
                main_content = self.driver.find_element(By.TAG_NAME, "body")

            # Normalise whitespace
            cleaned_text = main_content.text.strip()
            cleaned_text = re.sub(r'\n\s*\n', '\n\n', cleaned_text)  # collapse blank-line runs
            cleaned_text = re.sub(r'[ \t]+', ' ', cleaned_text)      # collapse spaces/tabs
            cleaned_text = re.sub(r'\n\s+', '\n', cleaned_text)      # drop line indentation

            # Keep substantial lines (> 3 chars) that are not navigation clutter
            stripped_lines = (line.strip() for line in cleaned_text.split('\n'))
            policy_details["text"] = '\n'.join(
                line for line in stripped_lines
                if len(line) > 3 and not skip_line.match(line)
            )

            # Buttons inside the container.  The selectors overlap (e.g. a
            # <button class="btn"> matches two of them), so each element is
            # processed only once.
            seen_buttons = set()
            button_labels = []
            for selector in button_selectors:
                try:
                    for button in main_content.find_elements(By.CSS_SELECTOR, selector):
                        if button in seen_buttons:
                            continue
                        seen_buttons.add(button)
                        if not button.is_displayed():
                            continue
                        label = button.text.strip()
                        if not label or len(label) >= 100:  # unreasonable button text
                            continue

                        policy_details["buttons"].append({
                            "text": label,
                            "element": button
                        })
                        button_labels.append(label)

                        label_lower = label.lower()
                        if any(word in label_lower for word in ('reject', 'decline', 'deny', 'opt out')):
                            policy_details["has_reject"] = True
                        if accept_label.search(label_lower):
                            policy_details["has_accept_all"] = True
                        if any(word in label_lower for word in ('customize', 'preferences', 'settings', 'manage')):
                            policy_details["has_customize"] = True
                except Exception:
                    continue

            policy_details["all_buttons_formatted"] = "; ".join(button_labels)

            # Policy-related links inside the container (first `max_links` only)
            try:
                for link in main_content.find_elements(By.TAG_NAME, "a"):
                    if len(policy_details["links"]) >= max_links:
                        break  # cap reached: stop querying the browser
                    if not link.is_displayed():
                        continue
                    link_text = link.text.strip()
                    link_href = link.get_attribute("href")
                    if not (link_text and link_href and len(link_text) < 100):
                        continue
                    link_text_lower = link_text.lower()
                    if any(keyword in link_text_lower for keyword in link_keywords):
                        policy_details["links"].append({
                            "text": link_text,
                            "href": link_href,
                            "element": link
                        })
            except Exception as e:
                print(f"Error extracting policy links: {e}")

            print(f"📄 Extracted policy content: {len(policy_details['text'])} characters, {len(policy_details['buttons'])} buttons, {len(policy_details['links'])} links")

        except Exception as e:
            print(f"❌ Error extracting policy content: {e}")
            policy_details["text"] = f"Error extracting policy content: {str(e)}"

        return policy_details

    def explore_interactively(self, website, max_depth=3):
        """Open *website*, record its initial consent notice, then start the interactive loop.

        Resets the per-site tracking attributes, loads the page, pauses for a
        manual captcha solve when bot-detection wording is present, captures
        the root state (screenshot + notice details) and finally delegates to
        ``_run_exploration_loop``.

        Args:
            website: URL to visit; also stored as ``original_url``.
            max_depth: Depth limit forwarded to the exploration loop.

        Returns:
            The accumulated ``self.data`` (also returned when setup fails).
        """
        self.current_website = website
        self.original_url = website  # remembered as the page to come back to
        self.current_depth = 0       # every exploration starts at the root

        print(f"🌐 Visiting {website}...")

        try:
            # Load the page as-is; cookies are deliberately not cleared first
            self.driver.get(website)
            time.sleep(3)

            # Pause for the operator when the page looks like a bot challenge
            print("🤖 Checking for bot detection...")
            title_lc = self.driver.title.lower()
            body_lc = self.driver.find_element(By.TAG_NAME, "body").text.lower()

            challenged = False
            for marker in ("security check", "captcha", "bot check", "human verification", "please verify"):
                if marker in title_lc or marker in body_lc:
                    challenged = True
                    break

            if challenged:
                print("⚠️  BOT DETECTION DETECTED!")
                input("Please solve the captcha/verification in the browser window, then press Enter to continue...")
                print("✅ Continuing with scraping...")
                time.sleep(2)

            # Root state: short random id plus a screenshot of the landing page
            root_state_id = str(uuid.uuid4())[:8]
            screenshot_path = self.take_screenshot(root_state_id, "initial")
            print(f"📸 Initial screenshot: {screenshot_path}")

            # Give the banner a chance to appear, then look for it
            self.wait_for_consent_banner()
            consent_elements = self.find_consent_elements()

            if consent_elements["banners"]:
                banner_count = len(consent_elements['banners'])
                print(f"🎯 Found {banner_count} banner elements")
                notice_details = self.extract_notice_details(consent_elements["banners"][0])
            elif consent_elements["iframes"]:
                iframe_count = len(consent_elements['iframes'])
                print(f"🎯 Found {iframe_count} iframe elements")
                notice_details = self.switch_to_iframe_and_extract(consent_elements["iframes"][0])
            else:
                print("❌ No consent banner found")
                notice_details = {
                    "text": "No cookie notice detected",
                    "buttons": [],
                    "links": [],
                    "all_buttons_formatted": "",
                    "has_reject": False,
                    "has_accept_all": False,
                    "has_customize": False,
                }

            # Depth-0 record
            self.record_state(website, None, root_state_id, notice_details, screenshot_path)

            # Summary of what was found on the landing page
            print(f"📊 Initial banner analysis:")
            print(f"   Buttons found: {notice_details['all_buttons_formatted']}")
            print(f"   Has Reject: {notice_details['has_reject']}")
            print(f"   Has Accept All: {notice_details['has_accept_all']}")
            print(f"   Has Customize: {notice_details['has_customize']}")
            print(f"   Links found: {len(notice_details['links'])}")

            return self._run_exploration_loop(website, root_state_id, notice_details, max_depth)

        except Exception as e:
            print(f"❌ Error during exploration setup: {e}")
            return self.data

    def _run_exploration_loop(self, website, root_state_id, initial_notice_details, max_depth):
        """Main exploration loop with proper depth tracking and data recording"""
        
        # Initialize exploration variables
        current_state_id = root_state_id
        parent_id = None
        notice_details = initial_notice_details
        state_stack = []
        
        while self.current_depth < max_depth:
            print(f"\n🔍 Current depth: {self.current_depth}/{max_depth}")
            choice = self.display_options(notice_details)
            
            if choice.upper() == "DONE":
                print("✅ Exploration complete!")
                break
            
            elif choice.upper() == "S":
                manual_screenshot = self.take_screenshot(custom_name=f"manual_{datetime.now().strftime('%H%M%S')}")
                print(f"📸 Manual screenshot: {manual_screenshot}")
                continue
            
            elif choice.upper() == "BACK":
                print(f"⬅️ Going back to previous state...")
                
                # Use smart back navigation with depth tracking
                new_state_id, new_parent_id, new_notice_details = self.smart_back_navigation(
                    state_stack, current_state_id, parent_id, self.current_depth
                )
                
                if new_state_id is not None:
                    # Update state variables (depth already decremented in smart_back_navigation)
                    current_state_id = new_state_id
                    parent_id = new_parent_id
                    notice_details = new_notice_details
                    
                    # Take screenshot of restored state
                    screenshot_path = self.take_screenshot(current_state_id, "back_navigation")
                    print(f"📸 Screenshot: {screenshot_path}")
                    print(f"📊 Current depth after back: {self.current_depth}")
                else:
                    print("❌ Back navigation failed")
            
            elif choice.upper().startswith("B") and notice_details['buttons']:
                # Handle button clicks with enhanced tracking
                if self._handle_button_click(choice, notice_details, website, current_state_id, state_stack):
                    # Update state variables for successful button click
                    parent_id = current_state_id
                    current_state_id = str(uuid.uuid4())[:8]
                    self.current_depth += 1
                    
                    # Get new notice details and record state
                    notice_details = self._get_new_notice_state()
                    screenshot_path = self.take_screenshot(current_state_id, f"after_button_{self.current_depth}")
                    self.record_state(website, parent_id, current_state_id, notice_details, 
                                    screenshot_path, self._get_choice_text(choice, "button", notice_details))
            
            elif choice.upper().startswith("L") and notice_details['links']:
                # Handle link clicks with enhanced tracking
                if self._handle_link_click(choice, notice_details, website, current_state_id, state_stack):
                    # Update state variables for successful link click
                    parent_id = current_state_id
                    current_state_id = str(uuid.uuid4())[:8]
                    self.current_depth += 1
                    
                    # Get policy page content and record state
                    notice_details = self.extract_policy_page_content()
                    screenshot_path = self.take_screenshot(current_state_id, "policy_page")
                    choice_text = self._get_choice_text(choice, "link", notice_details)
                    self.record_state(website, parent_id, current_state_id, notice_details, 
                                    screenshot_path, choice_text)
            
            elif choice.upper() == "E":
                # Handle expandable sections (no state change, just screenshots)
                self._handle_expandable_sections()
                continue
            
            else:
                print(f"❌ Invalid choice: {choice}")
                continue
        
        return self.data

    def _get_choice_text(self, choice, choice_type, notice_details):
        """Get properly formatted choice text for recording"""
        try:
            if choice_type == "button":
                button_idx = int(choice[1:]) - 1
                if 0 <= button_idx < len(notice_details['buttons']):
                    return notice_details['buttons'][button_idx]['text']
            elif choice_type == "link":
                link_idx = int(choice[1:]) - 1
                if 0 <= link_idx < len(notice_details['links']):
                    link = notice_details['links'][link_idx]
                    return f"Policy Link: {link['text']}"
        except:
            pass
        return choice
    
    def find_expandable_sections_enhanced(self, container):
        """Find elements inside *container* that look like expandable consent sections.

        Two strategies are combined:

        1. For each well-known cookie category name, take the first displayed
           element whose own text contains the name (case-insensitive) and
           whose full text is 6-199 characters long.  A clickable ancestor is
           preferred over the element itself.
        2. Displayed elements matching common accordion/collapsible selectors
           whose text mentions a consent-related keyword.

        Candidates whose text is more than 80% similar (per
        ``_text_similarity``) to an earlier one are dropped.

        Args:
            container: Element (or driver) offering ``find_elements``.

        Returns:
            list[dict]: Entries with ``element``, ``text``, ``type``
            (``"category"`` or ``"accordion"``) and ``strategy``.
        """
        expandables = []

        print("🔍 Looking for expandable sections with enhanced detection...")

        # Strategy 1: common cookie category names (all lower-case)
        common_categories = [
            # Essential categories
            "strictly necessary", "necessary", "essential", "required",
            "functional", "functionality", "performance",
            "analytics", "analytical", "statistics", "statistical",

            # Marketing categories
            "marketing", "advertising", "targeting", "targeted",
            "social media", "social", "personalization", "personalized",

            # Data categories
            "third party", "data collection", "tracking", "cookies",
            "do not sell", "do not share", "opt out", "privacy choices"
        ]
        lowered_text = "translate(text(), 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz')"

        for category in common_categories:
            # A single wildcard query suffices: div-only or span-only variants
            # can only return a subset of what ".//*" already returned.
            xpath = f".//*[contains({lowered_text}, '{category}')]"
            try:
                for element in container.find_elements(By.XPATH, xpath):
                    if not element.is_displayed():
                        continue
                    element_text = element.text.strip()
                    # Reasonable length only (not the entire page)
                    if 5 < len(element_text) < 200 and category in element_text.lower():
                        print(f"   Found category element: {element_text[:50]}...")

                        clickable_element = self._find_clickable_parent(element)

                        expandables.append({
                            "element": clickable_element or element,
                            "text": element_text,
                            "type": "category",
                            "strategy": f"text_match:{category}"
                        })
                        break  # first match per category is enough
            except Exception as e:
                print(f"   Error searching for {category}: {e}")

        # Strategy 2: accordion/collapsible patterns
        accordion_selectors = [
            ".accordion-header", "[aria-expanded]", "[data-toggle='collapse']",
            "details", "summary", ".expandable", ".collapsible",
            ".toggle", "[role='button']", ".category-header"
        ]
        relevant_keywords = (
            "cookie", "necessary", "functional", "analytics",
            "advertising", "marketing", "social", "performance",
            "targeting", "privacy", "data"
        )

        for selector in accordion_selectors:
            try:
                for element in container.find_elements(By.CSS_SELECTOR, selector):
                    if not element.is_displayed():
                        continue
                    text = element.text.strip()
                    text_lower = text.lower()
                    if text and any(keyword in text_lower for keyword in relevant_keywords):
                        expandables.append({
                            "element": element,
                            "text": text[:100],
                            "type": "accordion",
                            "strategy": selector
                        })
            except Exception:
                # Unsupported selector or stale element: try the next selector
                continue

        # Drop near-duplicates, keeping the first occurrence
        unique_expandables = []
        for exp in expandables:
            if not any(self._text_similarity(exp["text"], kept["text"]) > 0.8
                       for kept in unique_expandables):
                unique_expandables.append(exp)

        print(f"🎯 Total unique expandable sections found: {len(unique_expandables)}")
        return unique_expandables
    
    def _find_clickable_parent(self, element):
        """Find a clickable parent element"""
        try:
            current = element
            for _ in range(3):  # Go up to 3 levels
                parent = current.find_element(By.XPATH, "..")
                if parent.tag_name == "body":
                    break
                
                # Check if parent is clickable
                if (parent.get_attribute("onclick") or 
                    parent.get_attribute("role") == "button" or
                    "click" in parent.get_attribute("class").lower() if parent.get_attribute("class") else False):
                    return parent
                
                current = parent
        except:
            pass
        return None
    
    def _text_similarity(self, text1, text2):
        """Simple text similarity calculation"""
        if not text1 or not text2:
            return 0.0
        
        # Convert to sets of words
        words1 = set(text1.lower().split())
        words2 = set(text2.lower().split())
        
        # Calculate Jaccard similarity
        intersection = words1.intersection(words2)
        union = words1.union(words2)
        
        if not union:
            return 0.0
        
        return len(intersection) / len(union)
    
    def try_expand_element_enhanced(self, expandable_info):
        """Try several click strategies until one expands the given section.

        Args:
            expandable_info: Dict with ``element``, ``type``, ``strategy`` and
                ``text`` keys, as produced by
                ``find_expandable_sections_enhanced``.

        Returns:
            ``True`` as soon as one strategy reports success (after a 2 s
            pause for the expansion animation), ``False`` if all fail.
        """
        element = expandable_info["element"]
        element_type = expandable_info["type"]
        strategy = expandable_info["strategy"]

        print(f"🚀 Attempting to expand: {expandable_info['text'][:50]}...")
        print(f"   Type: {element_type}, Strategy: {strategy}")

        def javascript_click():
            # execute_script returns None for this statement-only script, so
            # its return value cannot signal success.  Reaching the next line
            # without an exception means the click was dispatched; reporting
            # failure here would let later strategies click again and toggle
            # the section back.
            self.driver.execute_script("arguments[0].click();", element)
            return True

        # Ordered from least to most intrusive
        universal_strategies = [
            ("Direct click", lambda: self._click_element_safely_with_iframe_context(element)),
            ("JavaScript click", javascript_click),
            ("Double click", lambda: self._double_click_element(element)),
            ("Parent click", lambda: self._click_parent_element(element)),
            ("Actions click", lambda: self._actions_click(element)),
        ]

        for strategy_name, strategy_func in universal_strategies:
            try:
                print(f"   Trying strategy: {strategy_name}")
                result = strategy_func()
                if result:
                    print(f"   ✅ Success with: {strategy_name}")
                    time.sleep(2)  # Wait for expansion animation
                    return True
                else:
                    print(f"   ❌ Failed: {strategy_name}")
            except Exception as e:
                print(f"   ❌ Error with {strategy_name}: {e}")

        print(f"   ⚠️ All strategies failed for: {expandable_info['text'][:30]}")
        return False

    def _handle_expandable_sections(self):
        """Detect expandable sections in the consent dialog and expand them on request.

        The search container is, in order of preference: the ``<body>`` of the
        current iframe (when inside one), the first displayed element matching
        a known dialog selector, or ``<body>`` (switching into the tracked
        iframe first if there is one).  The sections found are listed and the
        user chooses ``A`` (expand all), a section number, or ``SKIP``.  A
        screenshot is taken after every successful expansion.  Nothing is
        returned and no exploration state is recorded.
        """
        print("🔧 Enhanced expandable section detection...")

        # Find dialog container (prioritize iframe context if we're in one)
        dialog = None
        if self.current_iframe and self._is_in_iframe():
            print("📦 Using iframe context for expansion detection")
            try:
                dialog = self.driver.find_element(By.TAG_NAME, "body")
            except Exception:
                pass

        if dialog is None:
            print("📦 Searching for dialog container in main content")
            dialog_selectors = [
                "#onetrust-consent-sdk", ".cookie-notice", "#cookie-notice",
                "div[role='dialog']", ".modal-dialog", ".consent-modal"
            ]

            for selector in dialog_selectors:
                try:
                    for el in self.driver.find_elements(By.CSS_SELECTOR, selector):
                        if el.is_displayed():
                            dialog = el
                            print(f"📦 Found dialog container: {selector}")
                            break
                except Exception:
                    continue
                if dialog is not None:
                    break

        if dialog is None:
            print("📦 Using body as container")
            if self.current_iframe and not self._is_in_iframe():
                # A tracked iframe exists but we are outside it: enter it first
                self.driver.switch_to.frame(self.current_iframe)
                dialog = self.driver.find_element(By.TAG_NAME, "body")
                print("📦 Switched to iframe and using body")
            else:
                dialog = self.driver.find_element(By.TAG_NAME, "body")

        # Find expandable sections with enhanced detection
        expandables = self.find_expandable_sections_enhanced(dialog)

        if not expandables:
            print("❌ No expandable sections found")
            return

        print(f"🎯 Found {len(expandables)} expandable sections:")
        for i, exp in enumerate(expandables):
            display_text = exp['text'][:50] + "..." if len(exp['text']) > 50 else exp['text']
            print(f"   {i+1}. {display_text} ({exp['type']})")

        print("\n🔧 Options:")
        print("   A - Expand ALL sections")
        print("   1-9 - Expand specific section number")
        print("   SKIP - Skip expansion and continue")

        exp_choice = input("Enter your choice: ").strip()

        if exp_choice.upper() == "A":
            print("🚀 Expanding ALL sections...")
            expanded_count = 0
            for i, exp in enumerate(expandables):
                print(f"\n--- Expanding {i+1}/{len(expandables)}: {exp['text'][:30]}... ---")
                success = self.try_expand_element_enhanced(exp)
                if success:
                    expanded_count += 1
                    screenshot = self.take_screenshot(custom_name=f"expanded_{i+1}_{exp['type']}")
                    print(f"📸 Screenshot saved: {screenshot}")
                else:
                    print(f"⚠️ Could not expand section {i+1}")
                time.sleep(1)  # Brief pause between expansions

            print(f"\n✅ Expansion complete! Successfully expanded {expanded_count}/{len(expandables)} sections")

            # Take a final screenshot showing all expanded sections
            final_screenshot = self.take_screenshot(custom_name="all_expanded_final")
            print(f"📸 Final screenshot with all expansions: {final_screenshot}")

        elif exp_choice.upper() == "SKIP":
            print("⏭️ Skipping expansion...")

        else:
            # Handle specific section selection
            try:
                idx = int(exp_choice) - 1
            except ValueError:
                print("❌ Invalid input. Please enter a number, A for all, or SKIP")
                return

            if 0 <= idx < len(expandables):
                exp = expandables[idx]
                print(f"\n🚀 Expanding selected section: {exp['text'][:50]}...")
                success = self.try_expand_element_enhanced(exp)
                if success:
                    screenshot = self.take_screenshot(custom_name=f"expanded_section_{idx+1}")
                    print(f"📸 Screenshot saved: {screenshot}")
                else:
                    print(f"⚠️ Could not expand section: {exp['text'][:50]}")
            else:
                print(f"❌ Invalid selection. Please enter 1-{len(expandables)}, A, or SKIP")

    def _double_click_element(self, element):
        """Try double-clicking the element"""
        try:
            from selenium.webdriver.common.action_chains import ActionChains
            ActionChains(self.driver).double_click(element).perform()
            return True
        except:
            return False
    
    def _click_parent_element(self, element):
        """Try clicking parent elements"""
        try:
            parent = element.find_element(By.XPATH, "..")
            return self._click_element_safely_with_iframe_context(parent)
        except:
            return False
        """Handle going back from customize panels to main banner - GENERALIZED METHOD"""
        try:
            print("🔄 Attempting to navigate back from customize panel...")
            
            # Strategy 1: Look for close/back/cancel buttons in the panel
            close_button_selectors = [
                "//button[contains(text(), 'Close')]",
                "//button[contains(text(), 'Cancel')]", 
                "//button[contains(text(), 'Back')]",
                "//button[contains(@aria-label, 'close')]",
                "//button[contains(@aria-label, 'back')]",
                "//button[contains(@class, 'close')]",
                "//button[contains(@class, 'back')]",
                "//span[contains(@class, 'close')]",
                "//div[contains(@class, 'close')][@role='button']",
                "[data-testid*='close']",
                "[data-testid*='back']"
            ]
            
            for selector in close_button_selectors:
                try:
                    if selector.startswith("//") or selector.startswith("*"):
                        # XPath selector
                        buttons = self.driver.find_elements(By.XPATH, selector)
                    else:
                        # CSS selector
                        buttons = self.driver.find_elements(By.CSS_SELECTOR, selector)
                    
                    for button in buttons:
                        if button.is_displayed():
                            print(f"   Found close/back button: {button.text or button.get_attribute('aria-label') or 'unlabeled'}")
                            if self._click_element_safely_with_iframe_context(button):
                                print("   ✅ Successfully clicked close/back button")
                                time.sleep(2)
                                return True
                except Exception:
                    continue
            
            # Strategy 2: Try clicking outside the panel (works for some modal dialogs)
            try:
                print("   Trying to click outside panel...")
                # Click at a safe position that's usually outside panels
                self.driver.execute_script("document.elementFromPoint(50, 50).click();")
                time.sleep(1)
                return True
            except Exception:
                pass
            
            # Strategy 3: Try ESC key
            try:
                print("   Trying ESC key...")
                from selenium.webdriver.common.keys import Keys
                body = self.driver.find_element(By.TAG_NAME, "body")
                body.send_keys(Keys.ESCAPE)
                time.sleep(1)
                return True
            except Exception:
                pass
            
            # Strategy 4: If we're in an iframe, try switching back to default content
            # This might reveal the original banner
            if self.current_iframe and self._is_in_iframe():
                print("   Switching back to main content...")
                self.driver.switch_to.default_content()
                self.current_iframe = None
                time.sleep(2)
                return True
            
            print("   ⚠️ Could not find way to close customize panel")
            return False
            
        except Exception as e:
            print(f"   ❌ Error in customize panel back navigation: {e}")
            return False

    def smart_back_navigation(self, state_stack, current_state_id, parent_id, depth):
        """Go back one exploration step and re-read the consent notice.

        Two routes are tried. When the browser is still on the original URL
        and an iframe is being tracked, the customize-panel route is tried
        first. If that route does not bring the banner back, or does not
        apply, the original page is reloaded through
        ``return_to_original_page_with_banner``.

        Args:
            state_stack: List of ``(url, state_id, parent_id, iframe)`` tuples.
                The last entry is popped on every path except the "already at
                root" one.
            current_state_id: Not read by this method.
            parent_id: Not read by this method.
            depth: Not read by this method; ``self.current_depth`` is used.

        Returns:
            tuple: ``(prev_state_id, prev_parent_id, notice_details)`` taken
            from the popped entry, or ``(None, None, None)`` when there is
            nothing to go back to.
        """
        if not state_stack:
            print("❌ Cannot go back further (at root state)")
            return None, None, None

        def placeholder_notice(message):
            # Stand-in notice details used when no banner content could be read.
            return {"text": message, "buttons": [], "links": [],
                    "all_buttons_formatted": "", "has_reject": False,
                    "has_accept_all": False, "has_customize": False}

        def read_current_notice(fallback_message):
            # Prefer an in-page banner, then an iframe, then the placeholder.
            found = self.find_consent_elements()
            if found["banners"]:
                return self.extract_notice_details(found["banners"][0])
            if found["iframes"]:
                return self.switch_to_iframe_and_extract(found["iframes"][0])
            return placeholder_notice(fallback_message)

        def pop_previous_state():
            # Drop the top entry and step the depth counter down, never below its
            # current value when that is already zero or negative.
            _url, state_id, parent, _iframe = state_stack.pop()
            self.current_depth -= (1 if self.current_depth > 0 else 0)
            return state_id, parent

        try:
            current_url = self.driver.current_url

            # Customize panels live on the original page inside a tracked iframe.
            in_customize_panel = (
                self.original_url
                and self.original_url in current_url
                and self.current_iframe
            )
            if in_customize_panel:
                print("🎯 Detected customize panel context, trying panel-specific navigation...")
                if self.handle_customize_panel_back_navigation():
                    print("✅ Successfully navigated back within customize context")

                    # Give the page a moment, then confirm the banner is back.
                    time.sleep(2)
                    if self.wait_for_consent_banner(timeout=3):
                        notice = read_current_notice("Banner restored but content not detected")
                        state_id, parent = pop_previous_state()
                        return state_id, parent, notice

            # Fallback used for policy pages and any other external page.
            print("🏠 Using standard page navigation...")

            if self.current_iframe:
                self.driver.switch_to.default_content()
                self.current_iframe = None

            if not self.return_to_original_page_with_banner():
                print("⚠️ Could not restore banner state")
                # The stack is still unwound so the caller stays in sync.
                state_id, parent = pop_previous_state()
                return state_id, parent, placeholder_notice("Navigation completed but banner not restored")

            print("✅ Successfully returned with banner state preserved")
            notice = read_current_notice("No notice detected after return")
            state_id, parent = pop_previous_state()
            return state_id, parent, notice

        except Exception as e:
            print(f"❌ Error in smart back navigation: {e}")
            # Unwind the stack even after a failure, if anything is left on it.
            if not state_stack:
                return None, None, None
            state_id, parent = pop_previous_state()
            return state_id, parent, placeholder_notice("Error during navigation")

    def handle_customize_panel_back_navigation(self):
        """Try to leave a customize/preferences panel and return to the main banner.

        Strategies are attempted in order; the first one that completes
        without raising ends the search:

        1. Click a visible close/back/cancel control (XPath and CSS selectors).
        2. Click whatever element sits at viewport point (50, 50).
        3. Send ESC to ``<body>``.
        4. If currently inside the tracked iframe, switch to default content.

        Returns:
            bool: True when a strategy ran without raising; False when none
            applied or an unexpected error occurred.

        NOTE(review): strategies 2 and 3 report success as soon as the action
        itself does not raise. They do not verify the panel actually closed,
        so later strategies are only reached when the earlier ones raise.
        Callers re-check for the banner afterwards.
        """
        try:
            print("🔄 Attempting to navigate back from customize panel...")

            # Strategy 1: Look for close/back/cancel buttons in the panel
            close_button_selectors = (
                "//button[contains(text(), 'Close')]",
                "//button[contains(text(), 'Cancel')]",
                "//button[contains(text(), 'Back')]",
                "//button[contains(@aria-label, 'close')]",
                "//button[contains(@aria-label, 'back')]",
                "//button[contains(@class, 'close')]",
                "//button[contains(@class, 'back')]",
                "//span[contains(@class, 'close')]",
                "//div[contains(@class, 'close')][@role='button']",
                "[data-testid*='close']",
                "[data-testid*='back']",
            )

            for selector in close_button_selectors:
                try:
                    # Selectors starting with "//" or "*" are XPath, the rest CSS.
                    locator = By.XPATH if selector.startswith(("//", "*")) else By.CSS_SELECTOR
                    for button in self.driver.find_elements(locator, selector):
                        if not button.is_displayed():
                            continue
                        print(f"   Found close/back button: {button.text or button.get_attribute('aria-label') or 'unlabeled'}")
                        if self._click_element_safely_with_iframe_context(button):
                            print("   ✅ Successfully clicked close/back button")
                            time.sleep(2)
                            return True
                except Exception:
                    # Best effort: a bad selector or stale element just moves
                    # on to the next selector.
                    continue

            # Strategy 2: Try clicking outside the panel (works for some modal dialogs)
            try:
                print("   Trying to click outside panel...")
                # Click at a position that is usually outside panels
                self.driver.execute_script("document.elementFromPoint(50, 50).click();")
                time.sleep(1)
                return True
            except Exception:
                pass

            # Strategy 3: Try ESC key
            try:
                print("   Trying ESC key...")
                from selenium.webdriver.common.keys import Keys
                body = self.driver.find_element(By.TAG_NAME, "body")
                body.send_keys(Keys.ESCAPE)
                time.sleep(1)
                return True
            except Exception:
                pass

            # Strategy 4: If we're in an iframe, try switching back to default content
            # This might reveal the original banner
            if self.current_iframe and self._is_in_iframe():
                print("   Switching back to main content...")
                self.driver.switch_to.default_content()
                self.current_iframe = None
                time.sleep(2)
                return True

            print("   ⚠️ Could not find way to close customize panel")
            return False

        except Exception as e:
            print(f"   ❌ Error in customize panel back navigation: {e}")
            return False

    def _handle_button_click(self, choice, notice_details, website, current_state_id, state_stack):
        """Handle button clicks with proper error handling and state tracking"""
        try:
            button_idx = int(choice[1:]) - 1
            if 0 <= button_idx < len(notice_details['buttons']):
                button = notice_details['buttons'][button_idx]
                print(f"🖱️ Clicking button: {button['text']}")
                
                # Determine parent_id for state tracking
                if state_stack:
                    # If we have previous states, the parent is the current state
                    parent_id = current_state_id
                else:
                    # If this is the first action from root, parent is None
                    parent_id = None
                
                # Save current state for back navigation
                state_stack.append((self.driver.current_url, current_state_id, 
                                parent_id, self.current_iframe))
                
                # Use iframe-aware clicking
                if self._click_element_safely_with_iframe_context(button['element']):
                    print("✅ Button clicked successfully!")
                    time.sleep(WAIT_TIME)
                    return True
                else:
                    print("❌ Failed to click button")
                    # Remove the state we just added since click failed
                    state_stack.pop()
                    return False
            else:
                print(f"❌ Invalid button index: {button_idx}")
                return False
        except ValueError:
            print(f"❌ Invalid button choice: {choice}")
            return False

    def _handle_link_click(self, choice, notice_details, website, current_state_id, state_stack):
        """Handle link clicks with proper error handling and state tracking"""
        try:
            link_idx = int(choice[1:]) - 1
            if 0 <= link_idx < len(notice_details['links']):
                link = notice_details['links'][link_idx]
                print(f"🔗 Clicking link: {link['text']} ({link['href']})")
                
                # Save current state for back navigation
                state_stack.append((self.driver.current_url, current_state_id,
                                None if not state_stack else current_state_id,
                                self.current_iframe))
                
                try:
                    # Switch back to main content for navigation
                    if self.current_iframe:
                        self.driver.switch_to.default_content()
                        self.current_iframe = None
                    
                    # Navigate to resolved link
                    print(f"🌐 Navigating to: {link['href']}")
                    self.driver.get(link['href'])
                    time.sleep(WAIT_TIME)
                    return True
                    
                except Exception as e:
                    print(f"❌ Error clicking link: {e}")
                    # Remove the state we just added since navigation failed
                    state_stack.pop()
                    return False
            else:
                print(f"❌ Invalid link index: {link_idx}")
                return False
        except ValueError:
            print(f"❌ Invalid link choice: {choice}")
            return False

    def _get_new_notice_state(self):
        """Get new notice state after button click with proper format"""
        try:
            if self.current_iframe and self._is_in_iframe():
                # We're in iframe, extract content from here
                body = self.driver.find_element(By.TAG_NAME, "body")
                notice_details = self.extract_notice_details(body)
                notice_details["is_in_iframe"] = True
            else:
                # We're in main content
                new_consent_elements = self.find_consent_elements()
                
                if new_consent_elements["banners"]:
                    notice_details = self.extract_notice_details(new_consent_elements["banners"][0])
                elif new_consent_elements["iframes"]:
                    notice_details = self.switch_to_iframe_and_extract(new_consent_elements["iframes"][0])
                else:
                    notice_details = {
                        "text": "No notice detected after clicking", "buttons": [], "links": [],
                        "all_buttons_formatted": "", "has_reject": False, 
                        "has_accept_all": False, "has_customize": False
                    }
            
            print(f"📊 New state analysis:")
            print(f"   Buttons: {notice_details['all_buttons_formatted']}")
            print(f"   Has Reject: {notice_details['has_reject']}")
            print(f"   Has Accept All: {notice_details['has_accept_all']}")
            print(f"   Has Customize: {notice_details['has_customize']}")
            
            return notice_details
            
        except Exception as e:
            print(f"⚠️ Error extracting new state: {e}")
            return {
                "text": "Error extracting new state", "buttons": [], "links": [],
                "all_buttons_formatted": "", "has_reject": False, 
                "has_accept_all": False, "has_customize": False
            }

    def _handle_expandable_sections(self):
        """Handle expandable section detection and expansion"""
        print("🔧 Enhanced expandable section detection...")
        
        # Find dialog container (prioritize iframe context if we're in one)
        dialog = None
        if self.current_iframe and self._is_in_iframe():
            print("📦 Using iframe context for expansion detection")
            try:
                dialog = self.driver.find_element(By.TAG_NAME, "body")
            except:
                pass
        
        if not dialog:
            print("📦 Searching for dialog container in main content")
            dialog_selectors = [
                "#onetrust-consent-sdk", ".cookie-notice", "#cookie-notice",
                "div[role='dialog']", ".modal-dialog", ".consent-modal"
            ]
            
            for selector in dialog_selectors:
                try:
                    elements = self.driver.find_elements(By.CSS_SELECTOR, selector)
                    for el in elements:
                        if el.is_displayed():
                            dialog = el
                            print(f"📦 Found dialog container: {selector}")
                            break
                    if dialog:
                        break
                except:
                    continue
        
        if not dialog:
            print("📦 Using body as container")
            if self.current_iframe and self._is_in_iframe():
                dialog = self.driver.find_element(By.TAG_NAME, "body")
            else:
                # Switch to iframe if we have one
                if self.current_iframe:
                    self.driver.switch_to.frame(self.current_iframe)
                    dialog = self.driver.find_element(By.TAG_NAME, "body")
                    print("📦 Switched to iframe and using body")
                else:
                    dialog = self.driver.find_element(By.TAG_NAME, "body")
        
        # Find expandable sections with enhanced detection
        expandables = self.find_expandable_sections_enhanced(dialog)
        
        if expandables:
            print(f"🎯 Found {len(expandables)} expandable sections:")
            for i, exp in enumerate(expandables):
                display_text = exp['text'][:50] + "..." if len(exp['text']) > 50 else exp['text']
                print(f"   {i+1}. {display_text} ({exp['type']})")
            
            print("\n🔧 Options:")
            print("   A - Expand ALL sections")
            print("   1-9 - Expand specific section number")
            print("   SKIP - Skip expansion and continue")
            
            exp_choice = input("Enter your choice: ").strip()
            
            if exp_choice.upper() == "A":
                print("🚀 Expanding ALL sections...")
                expanded_count = 0
                for i, exp in enumerate(expandables):
                    print(f"\n--- Expanding {i+1}/{len(expandables)}: {exp['text'][:30]}... ---")
                    success = self.try_expand_element_enhanced(exp)
                    if success:
                        expanded_count += 1
                        screenshot = self.take_screenshot(custom_name=f"expanded_{i+1}_{exp['type']}")
                        print(f"📸 Screenshot saved: {screenshot}")
                    else:
                        print(f"⚠️ Could not expand section {i+1}")
                    time.sleep(1)  # Brief pause between expansions
                
                print(f"\n✅ Expansion complete! Successfully expanded {expanded_count}/{len(expandables)} sections")
                
                # Take a final screenshot showing all expanded sections
                final_screenshot = self.take_screenshot(custom_name="all_expanded_final")
                print(f"📸 Final screenshot with all expansions: {final_screenshot}")
                
            elif exp_choice.upper() == "SKIP":
                print("⏭️ Skipping expansion...")
                
            else:
                # Handle specific section selection
                try:
                    idx = int(exp_choice) - 1
                    if 0 <= idx < len(expandables):
                        exp = expandables[idx]
                        print(f"\n🚀 Expanding selected section: {exp['text'][:50]}...")
                        success = self.try_expand_element_enhanced(exp)
                        if success:
                            screenshot = self.take_screenshot(custom_name=f"expanded_section_{idx+1}")
                            print(f"📸 Screenshot saved: {screenshot}")
                        else:
                            print(f"⚠️ Could not expand section: {exp['text'][:50]}")
                    else:
                        print(f"❌ Invalid selection. Please enter 1-{len(expandables)}, A, or SKIP")
                except ValueError:
                    print("❌ Invalid input. Please enter a number, A for all, or SKIP")
        else:
            print("❌ No expandable sections found")


        """Handle expandable section detection and expansion"""
        print("🔧 Enhanced expandable section detection...")
        
        # Find dialog container (prioritize iframe context if we're in one)
        dialog = None
        if self.current_iframe and self._is_in_iframe():
            print("📦 Using iframe context for expansion detection")
            try:
                dialog = self.driver.find_element(By.TAG_NAME, "body")
            except:
                pass
        
        if not dialog:
            print("📦 Searching for dialog container in main content")
            dialog_selectors = [
                "#onetrust-consent-sdk", ".cookie-notice", "#cookie-notice",
                "div[role='dialog']", ".modal-dialog", ".consent-modal"
            ]
            
            for selector in dialog_selectors:
                try:
                    elements = self.driver.find_elements(By.CSS_SELECTOR, selector)
                    for el in elements:
                        if el.is_displayed():
                            dialog = el
                            print(f"📦 Found dialog container: {selector}")
                            break
                    if dialog:
                        break
                except:
                    continue
        
        if not dialog:
            print("📦 Using body as container")
            if self.current_iframe and self._is_in_iframe():
                dialog = self.driver.find_element(By.TAG_NAME, "body")
            else:
                # Switch to iframe if we have one
                if self.current_iframe:
                    self.driver.switch_to.frame(self.current_iframe)
                    dialog = self.driver.find_element(By.TAG_NAME, "body")
                    print("📦 Switched to iframe and using body")
                else:
                    dialog = self.driver.find_element(By.TAG_NAME, "body")
        
        # Find expandable sections with enhanced detection
        expandables = self.find_expandable_sections_enhanced(dialog)
        
        if expandables:
            print(f"🎯 Found {len(expandables)} expandable sections:")
            for i, exp in enumerate(expandables):
                display_text = exp['text'][:50] + "..." if len(exp['text']) > 50 else exp['text']
                print(f"   {i+1}. {display_text} ({exp['type']})")
            
            print("\n🔧 Options:")
            print("   A - Expand ALL sections")
            print("   1-9 - Expand specific section number")
            print("   SKIP - Skip expansion and continue")
            
            exp_choice = input("Enter your choice: ").strip()
            
            if exp_choice.upper() == "A":
                print("🚀 Expanding ALL sections...")
                expanded_count = 0
                for i, exp in enumerate(expandables):
                    print(f"\n--- Expanding {i+1}/{len(expandables)}: {exp['text'][:30]}... ---")
                    success = self.try_expand_element_enhanced(exp)
                    if success:
                        expanded_count += 1
                        screenshot = self.take_screenshot(custom_name=f"expanded_{i+1}_{exp['type']}")
                        print(f"📸 Screenshot saved: {screenshot}")
                    else:
                        print(f"⚠️ Could not expand section {i+1}")
                    time.sleep(1)  # Brief pause between expansions
                
                print(f"\n✅ Expansion complete! Successfully expanded {expanded_count}/{len(expandables)} sections")
                
                # Take a final screenshot showing all expanded sections
                final_screenshot = self.take_screenshot(custom_name="all_expanded_final")
                print(f"📸 Final screenshot with all expansions: {final_screenshot}")
                
            elif exp_choice.upper() == "SKIP":
                print("⏭️ Skipping expansion...")
                
            else:
                # Handle specific section selection
                try:
                    idx = int(exp_choice) - 1
                    if 0 <= idx < len(expandables):
                        exp = expandables[idx]
                        print(f"\n🚀 Expanding selected section: {exp['text'][:50]}...")
                        success = self.try_expand_element_enhanced(exp)
                        if success:
                            screenshot = self.take_screenshot(custom_name=f"expanded_section_{idx+1}")
                            print(f"📸 Screenshot saved: {screenshot}")
                        else:
                            print(f"⚠️ Could not expand section: {exp['text'][:50]}")
                    else:
                        print(f"❌ Invalid selection. Please enter 1-{len(expandables)}, A, or SKIP")
                except ValueError:
                    print("❌ Invalid input. Please enter a number, A for all, or SKIP")
        else:
            print("❌ No expandable sections found")

    def save_results(self, filename=None):
        """Write the captured states to an Excel file and print a short summary.

        Args:
            filename: Output path; ``DATA_FILE`` is used when this is falsy.

        Returns:
            The DataFrame that was written, or None when ``self.data`` is
            empty (nothing is written in that case).
        """
        if not self.data:
            print("❌ No data to save")
            return None

        target = filename if filename else DATA_FILE

        # The 'element' key holds WebElement objects, which are left out of
        # the export.
        exportable_rows = [
            {column: cell for column, cell in record.items() if column != 'element'}
            for record in self.data
        ]

        frame = pd.DataFrame(exportable_rows)
        frame.to_excel(target, index=False)
        print(f"💾 Results saved to {target}")

        # Console summary of what was written.
        print("\n📊 Summary:")
        print(f"   Total states captured: {len(frame)}")
        print(f"   Websites processed: {frame['Website'].nunique()}")

        return frame
    
    def close(self):
        """Shut down the browser, always attempting ``driver.quit()``.

        Leaving the tracked iframe is best effort: if that step fails (for
        example because the session is already gone) the failure is reported
        and ``quit()`` is still attempted, so the browser process is not left
        running. This method never raises an ``Exception``; it only reports
        problems on stdout.

        Returns:
            None
        """
        # getattr keeps close() safe on a partially initialised instance.
        if getattr(self, "current_iframe", None):
            try:
                # Make sure we're out of any iframe
                self.driver.switch_to.default_content()
            except Exception as e:
                print(f"⚠️ Could not leave iframe before closing: {e}")

        try:
            self.driver.quit()
            print("🔌 Driver closed successfully")
        except Exception as e:
            print(f"⚠️ Error while closing driver: {e}")

# Example usage
def main():
    """Run one interactive exploration of a sample site and save the results.

    A visible (non-headless) scraper is created, ``explore_interactively`` is
    run against the sample website, the captured states are saved through
    ``save_results`` and a preview of selected columns is printed. The driver
    is closed in all cases once the scraper exists.
    """
    print("🚀 Enhanced Privacy Policy Scraper with Smart Navigation")
    print("=" * 70)

    # Initialize scraper
    scraper = EnhancedPrivacyScraper(headless=False)

    # Website to test
    website = 'https://www.dropbox.com'

    try:
        # Run interactive exploration; captured states accumulate on the scraper
        scraper.explore_interactively(website, max_depth=MAX_DEPTH)

        # Save results
        df = scraper.save_results()

        # save_results() returns None when nothing was captured, so guard
        # before touching DataFrame attributes.
        if df is not None and not df.empty:
            print("\n📋 Data Preview:")
            print(df[['Website', 'StateID', 'ChoiceMade', 'Choice Provided', 'In Iframe', 'Banner State Preserved']].head())

    except KeyboardInterrupt:
        print("\n⚠️ Scraping interrupted by user")
    except Exception as e:
        print(f"\n❌ Scraping failed: {e}")
    finally:
        scraper.close()

# Run the demo only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()

🚀 Enhanced Privacy Policy Scraper with Smart Navigation
Driver initialized with user agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWeb...
🌐 Visiting https://www.dropbox.com...
🤖 Checking for bot detection...
📸 Initial screenshot: screenshots/www_dropbox_com_initial_20250712_173701.png
Waiting up to 10s for consent banner to appear...
Found potential consent iframe: {'id': 'ccpa-iframe', 'class': '_ccpa-iframe_163y6_1 _banner-visible_163y6_13', 'src': 'https://www.dropbox.com/ccpa_iframe?hide_gdpr=false&should_disable_banner=false&gpc=false&origin=https%253A%252F%252Fwww.dropbox.com&uri_for_logging=dropbox.com%2F&should_show_floating_button=false&should_auto_open_options=undefined&width=1370&locale_override=en&default_non_ccpa=true', 'name': ''}
🎯 Found 1 iframe elements
🔄 Switching to iframe for content extraction...
📦 Extracted iframe content: 3 buttons, 2 links
📦 Button analysis - Reject: True, Accept All: True, Customize: True
📝 Recorded state: f15b40f7 (Depth: 0, Policy Pag