# In [15]:
"""
Enhanced Privacy/Cookie Banner Scraper
- Shadow DOM & iframe-aware banner detection
- Robust extraction of text/buttons/links
- Safe URL resolution and click strategies
- Headless parity and relaxed banner heuristics
"""

import os
import time
import uuid
import json
import random
from datetime import datetime
from urllib.parse import urlparse, urljoin

import pandas as pd
import requests

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.action_chains import ActionChains

from webdriver_manager.chrome import ChromeDriverManager

# ----------------- Configuration -----------------
# Directory that take_screenshot() joins its PNG filenames onto.
SCREENSHOTS_DIR = "screenshots"
# Default workbook path used by save_results() and passed explicitly by main().
DATA_FILE = "privacy_notice_data.xlsx"
# Default max_depth for explore_interactively(); its loop runs while the current
# depth is below this value.
MAX_DEPTH = 3
WAIT_TIME = 6  # seconds slept after a button click or link navigation; allow slower CMPs to attach
DETECT_TIMEOUT = 12  # total poll time (seconds) for banner detection

# Runs at import time so the screenshot directory exists before anything is saved.
os.makedirs(SCREENSHOTS_DIR, exist_ok=True)

# ----------------- Scraper -----------------
class EnhancedPrivacyScraper:
    def __init__(self, headless=False):
        """Start the browser session and reset all per-run exploration state.

        Args:
            headless: Forwarded unchanged to ``initialize_driver_with_stealth``.
        """
        # The driver is created first; everything below is plain bookkeeping.
        self.driver = self.initialize_driver_with_stealth(headless)
        # Rows accumulated by record_state(), later written by save_results().
        self.data = []
        # Nothing has been visited yet: no iframe, no start URL, no site.
        self.current_iframe = self.original_url = self.current_website = None
        self.current_depth = 0

    # ---------- Driver ----------
    def initialize_driver_with_stealth(self, headless=False):
        """Build a Chrome WebDriver with the usual automation markers toned down.

        Args:
            headless: When true, Chrome is started with ``--headless=new``.

        Returns:
            The started ``webdriver.Chrome`` instance.
        """
        chrome_options = Options()
        if headless:
            chrome_options.add_argument("--headless=new")  # modern headless
        chrome_options.add_argument("--disable-blink-features=AutomationControlled")
        chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
        chrome_options.add_experimental_option('useAutomationExtension', False)
        for flag in (
            "--disable-gpu",
            "--window-size=1280,900",
            "--no-sandbox",
            "--disable-dev-shm-usage",
            "--lang=en-US",
            "--accept-lang=en-US,en;q=0.9",
        ):
            chrome_options.add_argument(flag)

        # Rotate Chrome UAs (keep Chrome UA for Chrome session)
        user_agents = [
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36"
        ]
        ua = random.choice(user_agents)
        chrome_options.add_argument(f"user-agent={ua}")

        service = Service(ChromeDriverManager().install())
        driver = webdriver.Chrome(service=service, options=chrome_options)

        hide_webdriver = "Object.defineProperty(navigator, 'webdriver', {get: () => undefined})"
        # A script injected with execute_script only patches the document that is
        # loaded right now (the initial blank page) and is gone after the first
        # navigation. Registering it through CDP re-applies it to every new
        # document before page scripts run.
        try:
            driver.execute_cdp_cmd(
                "Page.addScriptToEvaluateOnNewDocument", {"source": hide_webdriver}
            )
        except Exception as e:
            # Best effort: keep going with the one-shot override below.
            print(f"Could not register navigator.webdriver override for new documents: {e}")
        driver.execute_script(hide_webdriver)
        print(f"Driver initialized with user agent: {ua}")
        return driver

    # ---------- Utilities ----------
    def take_screenshot(self, state_id=None, custom_name=None):
        """Save a screenshot of the current window under ``SCREENSHOTS_DIR``.

        The filename is ``<domain>_<label>_<timestamp>.png``. ``custom_name``
        takes precedence over ``state_id`` as the label; with neither, only the
        domain and timestamp are used.

        Args:
            state_id: Optional state identifier used as ``state_<id>`` label.
            custom_name: Optional free-form label.

        Returns:
            The path the screenshot was written to.
        """
        import re

        def _safe(part):
            # Anything outside [A-Za-z0-9_-] (dots, a port colon, slashes, ...)
            # becomes "_" so the name is a valid filename on every platform.
            return re.sub(r"[^A-Za-z0-9_-]", "_", str(part))

        netloc = urlparse(self.current_website).netloc if self.current_website else ""
        domain = _safe(netloc) if netloc else "unknown"
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        if custom_name:
            filename = f"{domain}_{_safe(custom_name)}_{timestamp}.png"
        elif state_id:
            filename = f"{domain}_state_{_safe(state_id)}_{timestamp}.png"
        else:
            filename = f"{domain}_{timestamp}.png"
        filepath = os.path.join(SCREENSHOTS_DIR, filename)
        self.driver.save_screenshot(filepath)
        return filepath

    def _js_dispatch_click(self, el):
        try:
            self.driver.execute_script("""
                const el = arguments[0];
                try { el.scrollIntoView({block: 'center'}); } catch(e){}
                const ev = new PointerEvent('click', {bubbles:true, cancelable:true});
                el.dispatchEvent(ev);
            """, el)
            return True
        except Exception:
            return False

    # ---------- Shadow DOM helpers ----------
    def _deep_shadow_query_all(self, root, css):
        """Run ``querySelectorAll(css)`` under ``root`` and inside open shadow roots.

        The search covers ``root``'s own light DOM, ``root``'s own shadow root
        (if it has an open one), and the shadow root of every element reached
        along the way.

        Args:
            root: Element to start from; a falsy value means ``document``.
            css: CSS selector to match.

        Returns:
            Whatever ``execute_script`` returns for the collected JS array of
            matching elements.
        """
        js = """
        const results = [];
        const query = arguments[0];
        const start = arguments[1] || document;
        const stack = [start];
        // The start element's own shadow root is not a descendant of its light
        // DOM, so it has to be queued explicitly or a pure shadow host yields
        // nothing.
        if (start.shadowRoot) stack.push(start.shadowRoot);
        while (stack.length) {
          const node = stack.pop();
          try {
            node.querySelectorAll('*').forEach(el => { if (el.shadowRoot) stack.push(el.shadowRoot); });
            node.querySelectorAll(query).forEach(el => results.push(el));
          } catch (e) {}
        }
        return results;
        """
        return self.driver.execute_script(js, css, root)

    def _find_banners_shadow_dom(self):
        """Find candidate consent-banner hosts that own a shadow root.

        Returns:
            A list of ``(host, matches)`` tuples, where ``matches`` is the
            non-empty result of ``_deep_shadow_query_all`` for that host.
        """
        selectors = [
            "#usercentrics-root", "div[id*='usercentrics']",
            "div[id*='truste']", "div[id*='onetrust']",
            "div[id*='cookie']", "div[class*='cookie']",
            "div[class*='consent']", "[role='dialog']"
        ]
        # Common UI found inside a banner's shadow subtree.
        inner_ui = "button, a, [role='dialog'], [class*='banner'], [id*='banner']"
        found = []
        for sel in selectors:
            try:
                hosts = self.driver.find_elements(By.CSS_SELECTOR, sel)
            except Exception:
                continue
            for host in hosts:
                try:
                    # The selectors overlap; report each host only once.
                    if any(host == seen for seen, _ in found):
                        continue
                    # _extract_from_shadow_host reads host.shadow_root, so a plain
                    # light-DOM container must not be reported as a shadow hit.
                    owns_root = self.driver.execute_script(
                        "return !!arguments[0].shadowRoot;", host
                    )
                    if not owns_root:
                        continue
                    matches = self._deep_shadow_query_all(host, inner_ui)
                    if matches:
                        found.append((host, matches))
                except Exception:
                    # Best effort: a stale or detached host is simply skipped.
                    continue
        return found

    def _extract_from_shadow_host(self, host):
        # Access the shadow root
        try:
            sr = host.shadow_root
        except Exception:
            sr = self.driver.execute_script("return arguments[0].shadowRoot", host)

        # Buttons
        try:
            buttons = sr.find_elements(By.CSS_SELECTOR, "button, [role='button']")
        except Exception:
            buttons = []

        # Links
        try:
            links = sr.find_elements(By.CSS_SELECTOR, "a")
        except Exception:
            links = []

        # Main text (best effort)
        try:
            dialog_like = sr.find_elements(By.CSS_SELECTOR, "[role='dialog'], [class*='banner'], [id*='banner']")
            container = dialog_like[0] if dialog_like else sr
            text_val = container.get_attribute("innerText") or container.text or ""
        except Exception:
            text_val = ""

        details = self._build_details_from_elements(text_val, buttons, links, shadow=True)
        return details

    # ---------- Iframe helpers ----------
    def _search_iframes_for_banner(self, selectors):
        """Look inside every iframe for the first displayed element matching a selector.

        On a hit, the owning iframe is remembered in ``self.current_iframe``.
        The driver is switched back to the default content after each iframe,
        including the one that produced the hit.

        Returns:
            The first displayed matching element, or None when no iframe has one.
        """
        for frame in self.driver.find_elements(By.TAG_NAME, "iframe"):
            try:
                self.driver.switch_to.frame(frame)
                for css in selectors:
                    candidates = self.driver.find_elements(By.CSS_SELECTOR, css)
                    shown = [c for c in candidates if c.is_displayed()]
                    if not shown:
                        continue
                    self.current_iframe = frame
                    return shown[0]
            except Exception:
                # Unreachable or stale frame: move on to the next one.
                pass
            finally:
                self.driver.switch_to.default_content()
        return None

    def switch_to_iframe_and_extract(self, banner_root):
        try:
            # already switched by _search_iframes_for_banner; ensure we are inside
            try:
                self.driver.switch_to.frame(self.current_iframe)
            except Exception:
                # in case we are not yet inside
                pass

            container = banner_root
            if not container:
                try:
                    container = self.driver.find_element(By.TAG_NAME, "body")
                except Exception:
                    container = None

            if container:
                return self.extract_notice_details(container, is_in_iframe=True)
        except Exception as e:
            print(f"Error extracting iframe content: {e}")
        finally:
            # remain in iframe for possible interactions; caller manages switching back
            pass

        return {
            "text": "", "buttons": [], "links": [], "is_in_iframe": True,
            "all_buttons_formatted": "", "has_reject": False, "has_accept_all": False, "has_customize": False
        }

    # ---------- Detection ----------
    def detect_consent_banner(self, timeout=DETECT_TIMEOUT):
        """Poll the page for a consent banner in the DOM, shadow DOM, or iframes.

        Detection always starts from the top-level document and clears
        ``self.current_iframe``; it is set again only when the banner is found
        inside an iframe.

        Args:
            timeout: Total number of seconds to keep polling.

        Returns:
            ``(kind, element)`` where ``kind`` is ``"dom"``, ``"shadow"`` (the
            element is the shadow host) or ``"iframe"``; ``(None, None)`` when
            nothing was found before the timeout.
        """
        print(f"Detecting consent banner (timeout={timeout}s)...")
        banner_selectors = [
            "#onetrust-banner-sdk", "#CybotCookiebotDialog", "#didomi-popup", "#usercentrics-root",
            "[class*='cookie']", "[class*='consent']", "[role='dialog']",
            "[id*='cookie']", "[id*='consent']"
        ]

        # A previous iframe extraction leaves the driver inside that frame.
        # Without resetting, the first pass would search the old frame instead
        # of the page, and current_iframe would stay stale on a "dom" result.
        try:
            self.driver.switch_to.default_content()
        except Exception:
            pass
        self.current_iframe = None

        end = time.time() + timeout

        # small nudge to trigger scroll-based CMPs
        try:
            self.driver.execute_script("window.scrollBy(0, 1);")
            time.sleep(0.2)
            self.driver.execute_script("window.scrollBy(0, -1);")
        except Exception:
            pass

        while time.time() < end:
            # 1) Plain DOM
            try:
                for sel in banner_selectors:
                    els = self.driver.find_elements(By.CSS_SELECTOR, sel)
                    vis = [e for e in els if e.is_displayed()]
                    if vis:
                        return ("dom", vis[0])
            except Exception:
                pass

            # 2) Shadow DOM
            try:
                shadow_hits = self._find_banners_shadow_dom()
                if shadow_hits:
                    return ("shadow", shadow_hits[0][0])  # return host; extraction uses shadow root
            except Exception:
                pass

            # 3) Iframes (search ALL iframes)
            try:
                iframe_el = self._search_iframes_for_banner(banner_selectors)
                if iframe_el:
                    return ("iframe", iframe_el)
            except Exception:
                pass

            time.sleep(0.5)

        return (None, None)

    # ---------- Extraction ----------
    def _build_details_from_elements(self, raw_text, buttons, links, shadow=False, is_in_iframe=False):
        details = {
            "text": (raw_text or "").strip(),
            "buttons": [],
            "links": [],
            "is_in_iframe": is_in_iframe,
            "all_buttons_formatted": "",
            "has_reject": False,
            "has_accept_all": False,
            "has_customize": False
        }

        # Buttons
        btn_texts = []
        for b in buttons:
            try:
                if not b.is_displayed():
                    continue
            except Exception:
                continue
            try:
                t = b.text.strip()
                if not t:
                    # innerText is often non-empty when .text is empty (shadow)
                    t = b.get_attribute("innerText") or ""
                t = t.strip()
                if t:
                    details["buttons"].append({"text": t, "element": b})
                    btn_texts.append(t)
                    low = t.lower()
                    if any(w in low for w in ["reject", "decline", "deny", "opt out"]):
                        details["has_reject"] = True
                    if any(w in low for w in ["accept all", "allow all", "agree"]):
                        details["has_accept_all"] = True
                    if any(w in low for w in ["customize", "preferences", "settings", "manage"]):
                        details["has_customize"] = True
            except Exception:
                continue
        details["all_buttons_formatted"] = "; ".join(btn_texts)

        # Links
        for a in links:
            try:
                if not a.is_displayed():
                    continue
                txt = (a.text or "").strip()
                if not txt:
                    txt = (a.get_attribute("innerText") or "").strip()
                href = a.get_attribute("href")
                if href:
                    href_resolved = self._resolve_policy_url(href, txt)
                    details["links"].append({
                        "text": txt, "href": href_resolved,
                        "original_href": href, "element": a
                    })
            except Exception:
                continue

        return details

    def extract_notice_details(self, element, is_in_iframe=False):
        # Text
        try:
            raw_text = element.text.strip()
            if not raw_text:
                raw_text = (element.get_attribute("innerText") or "").strip()
        except Exception:
            raw_text = ""

        # Buttons/links
        try:
            buttons = element.find_elements(By.CSS_SELECTOR, "button, [role='button']")
        except Exception:
            buttons = []
        try:
            links = element.find_elements(By.TAG_NAME, "a")
        except Exception:
            links = []

        return self._build_details_from_elements(raw_text, buttons, links, shadow=False, is_in_iframe=is_in_iframe)

    def _resolve_policy_url(self, href, link_text):
        base_url = self.original_url or self.driver.current_url
        if not href:
            return base_url
        if href.startswith("http://") or href.startswith("https://"):
            return href
        return urljoin(base_url, href)

    # ---------- Clicking ----------
    def _is_in_iframe(self):
        try:
            return not self.driver.execute_script("return window.top === window;")
        except Exception:
            return False

    def _scroll_and_click(self, element):
        try:
            self.driver.execute_script("arguments[0].scrollIntoView({block: 'center'});", element)
            time.sleep(0.4)
            element.click()
            return True
        except Exception:
            return False

    def _actions_click(self, element):
        """Move the pointer onto ``element`` and click it through an ActionChains sequence.

        Returns:
            True when the chain performed without raising, False otherwise.
        """
        try:
            chain = ActionChains(self.driver)
            chain.move_to_element(element)
            chain.click()
            chain.perform()
        except Exception:
            return False
        return True

    def _click_element_safely_with_iframe_context(self, element):
        try:
            strategies = [
                ("Direct click", lambda: element.click() or True),
                ("Scroll and click", lambda: self._scroll_and_click(element)),
                ("Actions click", lambda: self._actions_click(element)),
                ("JS click", lambda: self.driver.execute_script("arguments[0].click();", element) or True),
                ("PointerEvent click", lambda: self._js_dispatch_click(element)),
            ]
            for name, fn in strategies:
                try:
                    ok = fn()
                    if ok:
                        print(f"   ✅ Clicked with: {name}")
                        return True
                except Exception as e:
                    print(f"   ❌ {name} failed: {e}")
            return False
        except Exception as e:
            print(f"Click failed: {e}")
            return False

    # ---------- Recording ----------
    def record_state(self, website, parent_id, state_id, notice_details, screenshot_path, choice=None, action_type=None):
        current_url = self.driver.current_url
        is_policy_page = (
            current_url != self.original_url and
            any(k in current_url.lower() for k in ['privacy', 'policy', 'legal', 'terms', 'cookie'])
        )

        # Links (up to 5)
        links_data = {}
        for i in range(1, 6):
            if i - 1 < len(notice_details.get('links', [])):
                link = notice_details['links'][i - 1]
                links_data[f"Link {i}"] = link.get('text', '')
                links_data[f"Link {i} Detail"] = link.get('href', '')
            else:
                links_data[f"Link {i}"] = ""
                links_data[f"Link {i} Detail"] = ""

        if action_type is None:
            action_type = choice or ""

        row = {
            "Website": website,
            "ParentID": parent_id or "",
            "StateID": state_id,
            "Depth": self.current_depth,
            "Timestamp": datetime.now().isoformat(),
            "Screenshot": screenshot_path,
            "Text of Notice": notice_details.get('text', ''),
            "All Buttons": notice_details.get('all_buttons_formatted', ''),
            "Has Reject Option": notice_details.get('has_reject', False),
            "Has Accept All": notice_details.get('has_accept_all', False),
            "Has Customize": notice_details.get('has_customize', False),
            "Choice Made": choice or "",
            "Is Policy Page": is_policy_page,
            **links_data,
            "Action Type": action_type
        }
        self.data.append(row)
        print(f"📝 Recorded state: {state_id} (Depth {self.current_depth}, Policy Page: {is_policy_page})")

    # ---------- Policy page scraping ----------
    def extract_policy_page_content(self):
        """Extract text, buttons and policy-related links from the current page.

        The first displayed element matching a list of content selectors with
        more than 150 characters of text is used as the main content; ``body``
        is the fallback.

        Returns:
            A details dict shaped like the banner details. If extraction fails
            as a whole, ``text`` holds the error message.
        """
        import re

        details = {
            "text": "",
            "buttons": [],
            "links": [],
            "is_in_iframe": False,
            "all_buttons_formatted": "",
            "has_reject": False,
            "has_accept_all": False,
            "has_customize": False
        }
        try:
            print("📄 Extracting policy page content...")
            content_selectors = [
                "main", "article", ".content", "#content", ".privacy-policy",
                ".cookie-policy", "[role='main']", ".policy-content",
                ".legal-content", ".page-content", "#main-content", "body"
            ]
            main_content = None
            for sel in content_selectors:
                try:
                    for el in self.driver.find_elements(By.CSS_SELECTOR, sel):
                        txt = (el.text or "").strip()
                        if el.is_displayed() and len(txt) > 150:
                            main_content = el
                            break
                except Exception:
                    continue
                if main_content is not None:
                    break
            if main_content is None:
                main_content = self.driver.find_element(By.TAG_NAME, "body")

            raw_text = (main_content.text or "").strip()
            if not raw_text:
                raw_text = (main_content.get_attribute("innerText") or "").strip()

            # Whitespace clean-up. Leading spaces are stripped with [ \t]+ and
            # not \s+, because \s also matches newlines and would delete the
            # paragraph breaks that the last step normalises.
            cleaned = re.sub(r'[ \t]+', ' ', raw_text)
            cleaned = re.sub(r'\n[ \t]+', '\n', cleaned)
            cleaned = re.sub(r'\n\s*\n', '\n\n', cleaned)

            details["text"] = cleaned

            # Buttons
            btn_texts = []
            try:
                btns = main_content.find_elements(By.CSS_SELECTOR, "button, [role='button'], input[type='submit'], input[type='button'], .btn, .button")
            except Exception:
                btns = []
            for b in btns:
                try:
                    if not b.is_displayed():
                        continue
                    t = (b.text or "").strip()
                    if not t:
                        t = (b.get_attribute("innerText") or "").strip()
                    if t and len(t) < 100:
                        details["buttons"].append({"text": t, "element": b})
                        btn_texts.append(t)
                        low = t.lower()
                        # "disagree" contains "agree": count it as a rejection
                        # and keep it out of the accept check.
                        disagrees = "disagree" in low
                        if disagrees or any(w in low for w in ("reject", "decline", "deny", "opt out")):
                            details["has_reject"] = True
                        if (any(w in low for w in ("accept all", "allow all"))
                                or "agree" in low.replace("disagree", "")):
                            details["has_accept_all"] = True
                        if any(w in low for w in ("customize", "preferences", "settings", "manage")):
                            details["has_customize"] = True
                except Exception:
                    continue
            details["all_buttons_formatted"] = "; ".join(btn_texts)

            # Links (first 10 policy-ish)
            try:
                anchors = main_content.find_elements(By.TAG_NAME, "a")
            except Exception:
                anchors = []
            policy_words = ('privacy', 'cookie', 'terms', 'legal', 'policy', 'data', 'gdpr', 'ccpa')
            for a in anchors:
                if len(details["links"]) >= 10:
                    break
                try:
                    if not a.is_displayed():
                        continue
                    t = (a.text or "").strip()
                    if not t:
                        t = (a.get_attribute("innerText") or "").strip()
                    href = a.get_attribute("href")
                    if href and t and len(t) < 100:
                        low = t.lower()
                        if any(k in low for k in policy_words):
                            details["links"].append({"text": t, "href": href, "element": a})
                except Exception:
                    continue

            print(f"📄 Policy content: {len(details['text'])} chars, {len(details['buttons'])} buttons, {len(details['links'])} links")
        except Exception as e:
            details["text"] = f"Error extracting policy content: {e}"

        return details

    # ---------- Interactive exploration ----------
    def display_options(self, notice_details):
        """Print the notice, its buttons and links, and the action menu; read a choice.

        Returns:
            The user's input with surrounding whitespace removed.
        """
        # Only the first 1000 characters of the notice are shown.
        preview = notice_details.get('text','')[:1000]
        print(f"\n--- Notice Text ---\n{preview}\n")

        print("--- Available Buttons ---")
        for number, button in enumerate(notice_details.get('buttons', []), start=1):
            print(f"{number}. {button['text']}")

        print("\n--- Available Links ---")
        for number, link in enumerate(notice_details.get('links', []), start=1):
            print(f"{number}. {link['text']} => {link['href']}")

        menu = (
            "\n--- Actions ---",
            "B<number>  Click button (e.g., B1)",
            "L<number>  Open link (e.g., L1)",
            "S          Screenshot",
            "BACK       Back to previous state",
            "DONE       Finish",
        )
        for line in menu:
            print(line)
        answer = input("Enter choice: ")
        return answer.strip()

    def explore_interactively(self, website, max_depth=MAX_DEPTH):
        """Open ``website`` and let the user walk through its consent UI step by step.

        Every visited state (initial page, after a button click, after opening
        a link, after BACK) is screenshotted and recorded via ``record_state``.
        The loop ends on ``DONE`` or once ``max_depth`` steps have been taken.

        Args:
            website: URL to open; also used as the original URL for BACK.
            max_depth: Maximum number of button/link steps from the root state.

        Returns:
            ``self.data``, the list of recorded rows.
        """
        self.current_website = website
        self.original_url = website
        self.current_depth = 0
        print(f"🌐 Visiting {website}...")
        self.driver.get(website)
        time.sleep(3)

        # Light resize/scroll nudges (helps some CMPs)
        try:
            self.driver.set_window_size(1280, 900)
            self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight/3);")
            time.sleep(0.4)
            self.driver.execute_script("window.scrollTo(0, 0);")
        except Exception:
            pass

        # Initial screenshot
        root_state_id = str(uuid.uuid4())[:8]
        screenshot_path = self.take_screenshot(root_state_id, "initial")
        print(f"📸 Initial screenshot: {screenshot_path}")

        notice_details, found = self._detect_and_extract_notice(
            DETECT_TIMEOUT, "No cookie/consent notice detected"
        )
        if not found:
            print("❌ No consent banner detected")

        self.record_state(website, None, root_state_id, notice_details, screenshot_path)

        # Exploration loop
        current_state_id = root_state_id
        parent_id = None
        state_stack = []  # (url, state_id, parent_id, iframe)

        while self.current_depth < max_depth:
            print(f"\n🔍 Depth {self.current_depth}/{max_depth}")
            choice = self.display_options(notice_details)
            command = choice.upper()

            if command == "DONE":
                break

            if command == "S":
                sc = self.take_screenshot(custom_name=f"manual_{datetime.now().strftime('%H%M%S')}")
                print(f"📸 {sc}")
                continue

            if command == "BACK":
                if not state_stack:
                    print("At root; cannot go back.")
                    continue
                _prev_url, prev_state_id, prev_parent_id, _prev_iframe = state_stack.pop()
                self.current_depth = max(0, self.current_depth - 1)
                # Try to navigate back to original URL (simplified/robust)
                try:
                    self.driver.get(self.original_url)
                    time.sleep(3)
                except Exception:
                    pass
                notice_details, _ = self._detect_and_extract_notice(
                    6, "Returned but no banner restored"
                )
                current_state_id = prev_state_id
                parent_id = prev_parent_id
                sc = self.take_screenshot(current_state_id, "back")
                self.record_state(website, parent_id, current_state_id, notice_details, sc, action_type="Back")
                continue

            # Click Button
            if command.startswith("B") and notice_details.get("buttons"):
                try:
                    idx = int(choice[1:]) - 1
                except ValueError:
                    print("Invalid button index.")
                    continue
                buttons = notice_details["buttons"]
                if not 0 <= idx < len(buttons):
                    print("Button index out of range.")
                    continue

                # Capture the label now: notice_details is replaced after the
                # click, and index idx may then point at a different button.
                clicked_text = buttons[idx]["text"] or "Button"
                # Save current state
                state_stack.append((self.driver.current_url, current_state_id, parent_id, self.current_iframe))
                if not self._click_element_safely_with_iframe_context(buttons[idx]["element"]):
                    print("Failed to click button.")
                    state_stack.pop()
                    continue
                time.sleep(WAIT_TIME)
                parent_id = current_state_id
                current_state_id = str(uuid.uuid4())[:8]
                self.current_depth += 1

                # After click, re-detect banner (could morph/close)
                notice_details, _ = self._detect_and_extract_notice(
                    6, "No notice detected after clicking"
                )

                sc = self.take_screenshot(current_state_id, f"after_button_{self.current_depth}")
                self.record_state(website, parent_id, current_state_id, notice_details, sc,
                                  choice=clicked_text)
                continue

            # Click Link
            if command.startswith("L") and notice_details.get("links"):
                try:
                    idx = int(choice[1:]) - 1
                except ValueError:
                    print("Invalid link index.")
                    continue
                links = notice_details["links"]
                if not 0 <= idx < len(links):
                    print("Link index out of range.")
                    continue

                link = links[idx]
                href = link.get("href")
                if not href:
                    print("Link has no href.")
                    continue

                state_stack.append((self.driver.current_url, current_state_id, parent_id, self.current_iframe))
                try:
                    if self.current_iframe:
                        self.driver.switch_to.default_content()
                        self.current_iframe = None
                    self.driver.get(href)
                    time.sleep(WAIT_TIME)
                except Exception as e:
                    print(f"Failed to open link: {e}")
                    state_stack.pop()
                    continue

                parent_id = current_state_id
                current_state_id = str(uuid.uuid4())[:8]
                self.current_depth += 1

                # Extract policy page
                policy_details = self.extract_policy_page_content()
                sc = self.take_screenshot(current_state_id, "policy_page")
                self.record_state(website, parent_id, current_state_id, policy_details, sc,
                                  choice=f"Policy Link: {link.get('text','')}")
                # The browser is now on the policy page; the old banner's
                # elements are gone, so offer the new page's options instead.
                notice_details = policy_details
                continue

            print("Unrecognized action.")
        return self.data

    def _detect_and_extract_notice(self, timeout, fallback_text):
        """Detect a banner and extract its details with the matching extractor.

        Args:
            timeout: Poll time in seconds passed to ``detect_consent_banner``.
            fallback_text: ``text`` of the placeholder details returned when no
                banner is detected.

        Returns:
            ``(details, found)`` where ``found`` is False for the placeholder.
        """
        kind, el = self.detect_consent_banner(timeout=timeout)
        if kind == "dom":
            return self.extract_notice_details(el, is_in_iframe=False), True
        if kind == "iframe":
            return self.switch_to_iframe_and_extract(el), True
        if kind == "shadow":
            return self._extract_from_shadow_host(el), True
        placeholder = {
            "text": fallback_text, "buttons": [], "links": [],
            "is_in_iframe": False, "all_buttons_formatted": "",
            "has_reject": False, "has_accept_all": False, "has_customize": False
        }
        return placeholder, False

    # ---------- Save & Close ----------
    def save_results(self, filename=None):
        if not self.data:
            print("No data to save.")
            return pd.DataFrame()

        filename = filename or DATA_FILE
        # Convert to DataFrame (strip WebElements)
        cleaned = []
        for row in self.data:
            new_row = {}
            for k, v in row.items():
                if isinstance(v, dict) and "element" in v:
                    v = {kk: vv for kk, vv in v.items() if kk != "element"}
                new_row[k] = v
            cleaned.append(new_row)
        df = pd.DataFrame(cleaned)
        df.to_excel(filename, index=False)
        print(f"💾 Results saved to {filename}")

        # Safe preview
        cols = [c for c in ["Website", "StateID", "Choice Made", "Is Policy Page"] if c in df.columns]
        if cols:
            print("\n📋 Data Preview:")
            print(df[cols].head())
        return df

    def close(self):
        try:
            if self.current_iframe and self._is_in_iframe():
                self.driver.switch_to.default_content()
            self.driver.quit()
            print("🔌 Driver closed.")
        except Exception:
            pass


# ----------------- Runner -----------------
def main():
    """Run one interactive exploration against the example site and save the rows.

    Rows recorded before an interrupt or error are still written, and the
    browser is always closed.
    """
    print("🚀 Enhanced Privacy/Cookie Banner Scraper")
    scraper = EnhancedPrivacyScraper(headless=False)

    # Example target (replace with your list/loop)
    website = "https://www.typepad.com/"  # change this as needed

    try:
        scraper.explore_interactively(website, max_depth=MAX_DEPTH)
    except KeyboardInterrupt:
        print("\nInterrupted by user.")
    except Exception as e:
        print(f"\n❌ Scraping failed: {e}")
    finally:
        # Save in the cleanup path so a Ctrl+C or a mid-run failure does not
        # throw away the states recorded so far.
        try:
            scraper.save_results(DATA_FILE)
        except Exception as e:
            print(f"\n❌ Saving results failed: {e}")
        finally:
            scraper.close()


# Start the interactive scraper only when this file is run as a script,
# not when it is imported.
if __name__ == "__main__":
    main()


🚀 Enhanced Privacy/Cookie Banner Scraper
Driver initialized with user agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36
🌐 Visiting https://www.typepad.com/...
📸 Initial screenshot: screenshots\www_typepad_com_initial_20250905_115646.png
Detecting consent banner (timeout=12s)...
📝 Recorded state: 97901edb (Depth 0, Policy Page: False)

🔍 Depth 0/3

--- Notice Text ---
About Cookies
We use cookies to ensure that we give you the best experience on our website. This includes cookies from third party social media websites and advertising cookies that may analyze your use of this site. Click "Got it" to agree or "Cookie Settings" to opt out.
Cookie Notice
Cookies Settings Got It

--- Available Buttons ---
1. Cookies Settings
2. Got It

--- Available Links ---
1. Cookie Notice => https://newfold.com/privacy-center/cookie-policy

--- Actions ---
B<number>  Click button (e.g., B1)
L<number>  Open link (e.g., L1)
S       

Enter choice:  B1


   ✅ Clicked with: Direct click
Detecting consent banner (timeout=6s)...
📝 Recorded state: 3998b03f (Depth 1, Policy Page: False)

🔍 Depth 1/3

--- Notice Text ---
About Cookies
We use cookies to ensure that we give you the best experience on our website. This includes cookies from third party social media websites and advertising cookies that may analyze your use of this site. Click "Got it" to agree or "Cookie Settings" to opt out.
Cookie Notice
Cookies Settings Got It

--- Available Buttons ---
1. Cookies Settings
2. Got It

--- Available Links ---
1. Cookie Notice => https://newfold.com/privacy-center/cookie-policy

--- Actions ---
B<number>  Click button (e.g., B1)
L<number>  Open link (e.g., L1)
S          Screenshot
BACK       Back to previous state
DONE       Finish


Enter choice:  done


💾 Results saved to privacy_notice_data.xlsx

📋 Data Preview:
                    Website   StateID       Choice Made  Is Policy Page
0  https://www.typepad.com/  97901edb                             False
1  https://www.typepad.com/  3998b03f  Cookies Settings           False
🔌 Driver closed.
