# Navigation Scraper Debug Notebook

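Test navigation extraction on multiple brands. The batch cells (ARIA diff test, STEP 1, STEP 2) loop through the brand list; STEP 3 and the step-by-step explorer run against a single selected brand.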

In [2]:
# Setup - Run this first
import sys
from pathlib import Path

# Add backend to path so later cells can import the `scraper` package.
# The backend directory is taken to be two levels above the kernel's working
# directory, so this only resolves correctly when the notebook is started from
# its own folder.
backend_path = Path.cwd().parent.parent
if str(backend_path) not in sys.path:
    sys.path.insert(0, str(backend_path))

# Load env
from dotenv import load_dotenv
env_file = backend_path.parent / 'config' / '.env'
if not env_file.is_file():
    # load_dotenv() does not raise for a missing file, so without this check a
    # wrong working directory would only show up later as missing settings.
    print(f"WARNING: env file not found: {env_file}")
load_dotenv(env_file)

print(f"Backend path: {backend_path}")
print("Setup complete. Run browser cell next.")

Backend path: /Users/bhavyajain/Code/fashion_archive/backend
Setup complete. Run browser cell next.


In [3]:
# Start browser (handles existing browser)
from playwright.async_api import async_playwright

# Helper function for cleanup - used throughout notebook
def reload_modules():
    """Drop cached navigation modules and re-import ``dynamic_explorer``.

    Every name in ``modules_to_clear`` is removed from ``sys.modules`` (if it is
    present) so the next import re-executes the module source from disk. The
    list mirrors the one used by the step-explorer reload cell, including
    ``llm_popup_dismiss`` and ``extraction.nav_elements``.

    Side effects:
        Rebinds the notebook global ``dynamic_explorer`` to the freshly
        imported module.

    Returns:
        The freshly imported ``scraper.navigation.dynamic_explorer`` module.
    """
    import sys
    modules_to_clear = [
        'scraper.navigation.dynamic_explorer',
        'scraper.navigation.llm_popup_dismiss',
        'scraper.navigation.step_explorer',
        'scraper.navigation.extraction.links',
        'scraper.navigation.extraction.nav_elements',
        'scraper.navigation.aria.elements',
        'scraper.navigation.aria.diff',
        'scraper.navigation.menu.context',
        'scraper.navigation.llm.classification',
        'scraper.navigation.llm.client',
        'dynamic_explorer',
        'step_explorer'
    ]
    for mod in modules_to_clear:
        # pop() with a default is a no-op for modules that were never imported.
        sys.modules.pop(mod, None)
    # Re-import dynamic_explorer now that its cached copy is gone.
    import scraper.navigation.dynamic_explorer as de
    globals()['dynamic_explorer'] = de
    return de

# Close existing browser if any
async def cleanup_browser():
    """Best-effort shutdown of a browser/Playwright left over from an earlier run.

    Reads the notebook globals ``browser`` and ``playwright``. Each one that is
    defined and not ``None`` is closed / stopped. A failure in one step is
    reported and does not prevent the other step from running, so the cell
    that calls this can always go on to start a fresh browser.
    """
    global browser, playwright
    try:
        if 'browser' in globals() and browser is not None:
            await browser.close()
    except Exception as exc:
        # Catch Exception rather than using a bare except, so task cancellation
        # and Ctrl-C still propagate; the failure is reported, not hidden.
        print(f"  ⚠️ browser.close() failed: {exc}")
    try:
        if 'playwright' in globals() and playwright is not None:
            await playwright.stop()
    except Exception as exc:
        print(f"  ⚠️ playwright.stop() failed: {exc}")

# Tear down any browser/Playwright instance left over from a previous run of this cell.
await cleanup_browser()

# Start fresh
# `playwright`, `browser` and `page` are plain notebook globals; later cells
# drive `page` and the final cell closes `browser` / stops `playwright`.
playwright = await async_playwright().start()
# headless=False launches Chromium with a visible window.
browser = await playwright.chromium.launch(headless=False)
# NOTE(review): the 768x900 viewport presumably makes sites render their
# mobile/tablet menu — confirm before changing it.
page = await browser.new_page(viewport={'width': 768, 'height': 900})

# Import dynamic_explorer
import scraper.navigation.dynamic_explorer as dynamic_explorer

print("Browser started. reload_modules() available.")

Browser started. reload_modules() available.


In [None]:
# =============================================================================
# TEST BRANDS - Comment/uncomment to choose which to test
# =============================================================================

# Brand key -> landing page URL used as the starting point for each test.
TEST_BRANDS = {
    "entire_studios": "https://www.entirestudios.com",
    "prod_bldg": "https://prod.net",
    "named_collective": "https://namedcollective.com",
    "alexander_mcqueen": "https://www.alexandermcqueen.com",
    "balenciaga": "https://www.balenciaga.com",
    "zalando_kids": "https://www.zalando.de/kinder-home/",
    "uniqlo": "https://www.uniqlo.com",
    "aelfric_eden": "https://www.aelfriceden.com",
    "eckhaus_latta": "https://www.eckhauslatta.com",
    "axel_arigato": "https://www.axelarigato.com",
}

# Comment out brands you don't want to test:
ACTIVE_BRANDS = [
    "entire_studios",
    "prod_bldg",
    "named_collective",
    "alexander_mcqueen",
    "balenciaga",
    "zalando_kids",
    "uniqlo",
    "aelfric_eden",
    "eckhaus_latta",
    "axel_arigato",
]

# Fail fast on a typo: every active brand must have a URL in TEST_BRANDS,
# otherwise the batch cells would only hit the KeyError part-way through a run.
unknown_brands = [name for name in ACTIVE_BRANDS if name not in TEST_BRANDS]
if unknown_brands:
    raise KeyError(f"ACTIVE_BRANDS entries missing from TEST_BRANDS: {unknown_brands}")

print(f"Testing {len(ACTIVE_BRANDS)} brands")

In [None]:
# =============================================================================
# TEST: Open menu and capture ARIA diff for all brands
# =============================================================================
reload_modules()

results = {}
for brand in ACTIVE_BRANDS:
    url = TEST_BRANDS[brand]
    print(f"\n{'='*60}")
    print(f"{brand}")
    print(f"{'='*60}")
    
    try:
        await page.goto(url, wait_until="domcontentloaded")
        await page.wait_for_timeout(1500)
        
        # Open menu and capture before/after diff
        result = await dynamic_explorer.open_menu_and_capture(page)
        
        if not result['opened']:
            print("  ❌ Menu not opened")
            results[brand] = {"status": "❌ Menu not opened"}
            continue
        
        if result['menu_start']:
            line_idx, first_line = result['menu_start']
            saved = len(result['after_aria']) - len(result['menu_aria'])
            print(f"  ✅ Menu opened")
            print(f"     Menu starts at line {line_idx}")
            print(f"     First new: {first_line}")
            print(f"     Saved: {saved} chars")
            print(f"\n--- MENU ARIA ({len(result['menu_aria'])} chars) ---")
            print(result['menu_aria'])
            results[brand] = {
                "status": "✅",
                "line": line_idx,
                "first_new": first_line,
                "saved": saved,
                "result": result
            }
        else:
            print("  ⚠️ Menu opened but no ARIA diff detected")
            print(f"\n--- FULL ARIA (no diff) ---")
            print(result['after_aria'][:5000])
            results[brand] = {"status": "⚠️ No diff", "result": result}
            
    except Exception as e:
        print(f"  ⚠️ Error: {e}")
        results[brand] = {"status": f"⚠️ ERROR: {str(e)[:30]}"}

print(f"\n{'='*60}")
print("SUMMARY")
print(f"{'='*60}")
for brand, r in results.items():
    status = r['status']
    extra = f" | Line {r['line']}, saved {r['saved']} chars" if r.get('saved') else ""
    print(f"  {brand:20} {status}{extra}")

In [None]:
# =============================================================================
# OPTIONAL: Manually set cached results (skip LLM entirely)
# Uncomment and edit if you already know the tab names from previous runs
# =============================================================================

# CACHED_RESULTS = {
#     "uniqlo": {
#         "status": "ok",
#         "url": "https://www.uniqlo.com",
#         "tabs": [
#             {"text": "women", "role": "tab", "x": 24, "y": 64},
#             {"text": "men", "role": "tab", "x": 252, "y": 64},
#             {"text": "kids", "role": "tab", "x": 476, "y": 64},
#             {"text": "baby", "role": "tab", "x": 700, "y": 64},
#         ],
#         "menu_aria": None  # Not needed for interaction
#     },
#     "axel_arigato": {
#         "status": "ok",
#         "url": "https://www.axelarigato.com",
#         "tabs": [
#             {"text": "Men", "role": "button", "x": 0, "y": 46},
#             {"text": "Women", "role": "button", "x": 384, "y": 46},
#         ],
#         "menu_aria": None
#     },
# }
# print(f"Manually set {len(CACHED_RESULTS)} cached results")

In [None]:
# =============================================================================
# STEP 1: Detect tabs (runs LLM) - SAVE RESULTS
# Run this ONCE, then use cached results in subsequent cells
# =============================================================================
reload_modules()
from IPython.display import display, Image
import importlib
import scraper.navigation.llm_popup_dismiss as popup_module
importlib.reload(popup_module)
from scraper.navigation.llm_popup_dismiss import dismiss_popups_with_llm
import json

# Cache to store results - persists across cell runs
CACHED_RESULTS = {}

for brand in ACTIVE_BRANDS:
    url = TEST_BRANDS[brand]
    print(f"\n{'='*70}")
    print(f"{brand}")
    print(f"{'='*70}")
    
    try:
        await page.goto(url, wait_until="domcontentloaded")
        await page.wait_for_timeout(1500)
        
        # Dismiss popups BEFORE menu opens
        await page.wait_for_timeout(500)
        await dismiss_popups_with_llm(page, max_attempts=2)
        
        # Open menu
        result = await dynamic_explorer.open_menu_and_capture(page)
        
        if not result['opened']:
            print("  ❌ Menu not opened")
            CACHED_RESULTS[brand] = {"status": "no_menu", "tabs": None, "menu_aria": None}
            continue
        
        print("  ✅ Menu opened")
        
        # Dismiss popups AFTER menu - use menu_is_open=True!
        await page.wait_for_timeout(500)
        await dismiss_popups_with_llm(page, max_attempts=1, menu_is_open=True)
        
        # Take screenshot for display only (not sending to LLM again)
        screenshot = await page.screenshot()
        display(Image(data=screenshot, width=400))
        
        # Identify tabs (LLM call)
        llm_result = await dynamic_explorer.identify_tabs_with_llm(page, result['menu_aria'])
        print(f"  LLM tab names: {llm_result['tab_names']}")
        
        if not llm_result['tab_names']:
            CACHED_RESULTS[brand] = {
                "status": "no_tabs",
                "tabs": None,
                "menu_aria": result['menu_aria']
            }
            continue
        
        # Find tabs in DOM
        dom_result = await dynamic_explorer.find_tabs_in_dom(
            page, llm_result['tab_names'], result['menu_aria']
        )
        
        if dom_result['found']:
            print(f"  ✅ Found {len(dom_result['tabs'])} tabs: {[t['text'] for t in dom_result['tabs']]}")
            CACHED_RESULTS[brand] = {
                "status": "ok",
                "tabs": dom_result['tabs'],
                "menu_aria": result['menu_aria'],
                "url": url
            }
        else:
            CACHED_RESULTS[brand] = {
                "status": "tabs_not_found",
                "tabs": None,
                "menu_aria": result['menu_aria']
            }
            
    except Exception as e:
        print(f"  ⚠️ Error: {e}")
        CACHED_RESULTS[brand] = {"status": "error", "tabs": None, "menu_aria": None}

print(f"\n{'='*70}")
print("CACHED RESULTS (use in next cell without re-running LLM)")
print(f"{'='*70}")
for brand, r in CACHED_RESULTS.items():
    if r['tabs']:
        print(f"  {brand:20} ✅ {[t['text'] for t in r['tabs']]}")
    else:
        print(f"  {brand:20} {r['status']}")

In [None]:
# =============================================================================
# STEP 2: Click/hover tabs using CACHED results (NO LLM calls)
# Uses CACHED_RESULTS from previous cell
# =============================================================================
reload_modules()
import importlib
import scraper.navigation.llm_popup_dismiss as popup_module
importlib.reload(popup_module)
from scraper.navigation.llm_popup_dismiss import dismiss_popups_with_llm

# Check if we have cached results
if not CACHED_RESULTS:
    print("❌ No cached results! Run STEP 1 first.")
else:
    for brand, cached in CACHED_RESULTS.items():
        if cached['status'] != 'ok' or not cached['tabs']:
            print(f"\n{brand}: Skipping (status={cached['status']})")
            continue
            
        url = cached['url']
        tabs = cached['tabs']
        menu_aria = cached['menu_aria']
        
        print(f"\n{'='*70}")
        print(f"{brand} - Using cached tabs: {[t['text'] for t in tabs]}")
        print(f"{'='*70}")
        
        try:
            # Navigate and open menu
            await page.goto(url, wait_until="domcontentloaded")
            await page.wait_for_timeout(1500)
            
            await page.wait_for_timeout(500)
            await dismiss_popups_with_llm(page, max_attempts=2)
            
            result = await dynamic_explorer.open_menu_and_capture(page)
            if not result['opened']:
                print("  ❌ Menu not opened")
                continue
            
            # IMPORTANT: menu_is_open=True after menu opens!
            await page.wait_for_timeout(500)
            await dismiss_popups_with_llm(page, max_attempts=1, menu_is_open=True)
            
            # Interact with each cached tab (NO LLM calls)
            for tab in tabs:
                print(f"\n  {'─'*60}")
                print(f"  TAB: {tab['text']} (role={tab['role']})")
                print(f"  {'─'*60}")
                
                try:
                    # FIRST: Try hover
                    revealed, aria_after = await dynamic_explorer.hover_and_check(
                        page, tab['text'], item_type=tab['role']
                    )
                    
                    if revealed:
                        print(f"    ✅ Hover revealed content")
                        tab_aria = aria_after
                    else:
                        # SECOND: Try click
                        print(f"    Hover didn't work, clicking...")
                        clicked = await dynamic_explorer.click_button(
                            page, tab['text'], prefer_role=tab['role']
                        )
                        if not clicked:
                            print(f"    ⚠️ Could not click")
                            continue
                        await page.wait_for_timeout(300)
                        tab_aria = await page.locator('body').aria_snapshot()
                        print(f"    ✅ Clicked")
                    
                    # Show content under tab
                    lines = tab_aria.split('\n')
                    start_idx = 0
                    for i, line in enumerate(lines):
                        if f'"{tab["text"]}"' in line:
                            start_idx = i + 1
                            break
                    
                    content_lines = lines[start_idx:start_idx + 30]
                    print(f"\n    ARIA CONTENT:")
                    print("    " + "\n    ".join(content_lines[:20]))
                    
                except Exception as e:
                    print(f"    ⚠️ Error: {e}")
                    
        except Exception as e:
            import traceback
            print(f"  ⚠️ Error: {e}")
            traceback.print_exc()

print(f"\n{'='*70}")
print("DONE")
print(f"{'='*70}")

In [None]:
# =============================================================================
# STEP 3: Full exploration using explore_all_tabs (NEW UNIFIED APPROACH)
# Automatically detects BULK vs DFS mode per tab
# Handles both tabs and no-tabs cases
# =============================================================================
reload_modules()
from scraper.navigation.llm_popup_dismiss import dismiss_popups_with_llm
import importlib
import scraper.navigation.llm_popup_dismiss as popup_module
importlib.reload(popup_module)
from scraper.navigation.llm_popup_dismiss import dismiss_popups_with_llm

BRANDS = [
    "entire_studios",
    "prod_bldg",
    "named_collective",
    "alexander_mcqueen",
    "balenciaga",
    "zalando_kids",
    "uniqlo",
    "aelfric_eden",
    "eckhaus_latta",
    "axel_arigato",
]

# Select ONE brand to test (full exploration can be slow)
TEST_BRAND = BRANDS[2]  # Change this to test different brands

# Check if CACHED_RESULTS exists
if 'CACHED_RESULTS' not in dir():
    print("❌ CACHED_RESULTS not defined. Run STEP 1 first.")
elif TEST_BRAND not in CACHED_RESULTS:
    print(f"❌ '{TEST_BRAND}' not in CACHED_RESULTS.")
    print(f"   Available brands: {list(CACHED_RESULTS.keys())}")
else:
    cached = CACHED_RESULTS[TEST_BRAND]
    status = cached['status']
    
    # Get tabs (may be None for no_tabs status)
    tabs = cached.get('tabs') or []
    url = cached.get('url') or TEST_BRANDS[TEST_BRAND]
    
    print(f"{'='*70}")
    print(f"FULL EXPLORATION: {TEST_BRAND}")
    print(f"Status: {status}")
    if tabs:
        print(f"Tabs: {[t['text'] for t in tabs]}")
    else:
        print(f"Tabs: None (will explore menu directly)")
    print(f"{'='*70}")
    
    try:
        # Navigate and setup
        await page.goto(url, wait_until="domcontentloaded")
        await page.wait_for_timeout(1500)
        
        await dismiss_popups_with_llm(page, max_attempts=2)
        
        # Open menu
        menu_result = await dynamic_explorer.open_menu_and_capture(page)
        if not menu_result['opened']:
            print("❌ Could not open menu")
        else:
            # IMPORTANT: menu_is_open=True to avoid closing the menu!
            await dismiss_popups_with_llm(page, max_attempts=1, menu_is_open=True)
            
            # Run full exploration (handles both tabs and no-tabs)
            results = await dynamic_explorer.explore_all_tabs(
                page=page,
                tabs=tabs,  # Empty list if no tabs
                base_url=url,
                menu_result=menu_result
            )
            
            # Print full results (no truncation)
            print(f"\n{'='*70}")
            print("RESULTS")
            print(f"{'='*70}")
            
            for tab_name, tab_result in results['tabs'].items():
                mode = tab_result.get('mode', 'unknown')
                cats = tab_result.get('categories', {})
                print(f"\n[{tab_name}] Mode: {mode}, Categories: {len(cats)}")
                for name, cat_url in cats.items():
                    print(f"    {name} → {cat_url}")
            
            print(f"\n{'='*70}")
            print(f"TOTAL: {len(results['all_categories'])} categories")
            print(f"LLM Usage: {results['llm_usage']}")
            
    except Exception as e:
        import traceback
        print(f"❌ Error: {e}")
        traceback.print_exc()

In [None]:
# Check LLM usage
# Prints the module-level `_llm_usage` attribute of dynamic_explorer. The leading
# underscore marks it as private by convention, so this is a debugging peek only.
print(f"Total LLM usage: {dynamic_explorer._llm_usage}")

In [None]:
# Close browser when done
await browser.close()
await playwright.stop()
print("Browser closed.")

# =============================================================================
# STEP-BY-STEP EXPLORER - Debug one interaction at a time
# =============================================================================

In [1]:
# RELOAD: Reload modules without restarting browser
import importlib
import sys

# Force remove all cached modules to pick up changes
modules_to_clear = [
    'scraper.navigation.dynamic_explorer',
    'scraper.navigation.llm_popup_dismiss', 
    'scraper.navigation.step_explorer',
    'scraper.navigation.extraction.links',
    'scraper.navigation.extraction.nav_elements',  # ADDED - was missing!
    'scraper.navigation.aria.elements',
    'scraper.navigation.aria.diff',
    'scraper.navigation.menu.context',
    'scraper.navigation.llm.classification',
    'scraper.navigation.llm.client',
    'dynamic_explorer',
    'step_explorer'
]
for mod in modules_to_clear:
    if mod in sys.modules:
        del sys.modules[mod]

# Fresh import
from step_explorer import NavExplorer

# Create new explorer with SAME page (browser stays open)
explorer = NavExplorer(page)
print("Modules reloaded. Run explorer.setup(URL) to initialize.")

# SETUP: Create step explorer
import importlib
import sys

# Force reload of all navigation modules
modules_to_clear = [
    'scraper.navigation.dynamic_explorer',
    'scraper.navigation.step_explorer',
    'scraper.navigation.extraction.links',
    'scraper.navigation.extraction.nav_elements',  # ADDED - was missing!
    'scraper.navigation.aria.elements',
    'scraper.navigation.aria.diff',
    'scraper.navigation.menu.context',
    'scraper.navigation.llm.classification',
    'scraper.navigation.llm.client',
    'dynamic_explorer',
    'step_explorer'
]
for mod in modules_to_clear:
    if mod in sys.modules:
        del sys.modules[mod]

# Fresh import
from step_explorer import NavExplorer

TEST_BRANDS = {
    "entire_studios": "https://www.entirestudios.com",
    "prod_bldg": "https://prod.net",
    "named_collective": "https://namedcollective.com",
    "alexander_mcqueen": "https://www.alexandermcqueen.com",
    "balenciaga": "https://www.balenciaga.com",
    "zalando_kids": "https://www.zalando.de/kinder-home/",
    "uniqlo": "https://www.uniqlo.com",
    "aelfric_eden": "https://www.aelfriceden.com",
    "eckhaus_latta": "https://www.eckhauslatta.com",
    "axel_arigato": "https://www.axelarigato.com",
}

# Choose URL to test
URL = TEST_BRANDS["axel_arigato"]

explorer = NavExplorer(page)
result = await explorer.setup(URL)
print(result)

ModuleNotFoundError: No module named 'scraper'

In [311]:
# RUN ONE STEP (re-run this cell to retry same item)
# Advances the explorer by a single step and prints the fields of the returned
# step result.
result = await explorer.step()
print(f"\nAction: {result.action}")
# item_path is joined with " > " to show where the item sits in the menu tree.
print(f"Item: {' > '.join(result.item_path)}")
print(f"Success: {result.success}")
# error and links_found are only printed when they are truthy.
if result.error:
    print(f"Error: {result.error}")
if result.links_found:
    print(f"Links found: {list(result.links_found.keys())}")
print(f"Children added: {result.children_added}")


[STEP] Men (button)
  Stack: 1 remaining
  [ARIA BEFORE] 15788 chars, 523 lines
    - main:
      - button "FREE RETURNS":
        - link "FREE RETURNS":
          - /url: /assistance?group=returns-and-exchanges&faq=what-is-your-return-policy
      - link:
        - /url: /
        - img
      - button "Search":
        - img
      - button "Toggle mobile wishlist":
        - img
      - button "Toggle mobile cart":
        - img
      - button "Toggle mobile primaryMenu":
        - img
      - navigation
      - link:
        - /url: /
        - img
      - button "Search":
        - img
      - button "Toggle mobile wishlist":
        - img
      - button "Toggle mobile cart":
        - img
      - button "Toggle mobile primaryMenu":
        - img
      - searchbox "Search"
      - button "Close search":
        - img
      - text: Search
      - link:
        - /url: /
        - img
      - button "Search":
        - img
      - button "Toggle mobile wishlist":
        - img
      

In [308]:
# RUN MULTIPLE STEPS
for i in range(37):  # adjust number as needed
    if explorer.done():
        print("Done!")
        break
    result = await explorer.step()
    print(f"{i+1}. {result.item_name}: {len(result.links_found)} links, {result.children_added} children")

explorer.show_state()


[STEP] Women (button)
  Stack: 0 remaining
  [ARIA BEFORE] 15761 chars, 524 lines
    - main:
      - button "DISCOVER THE SS26 EDIT":
        - link "DISCOVER THE SS26 EDIT":
          - /url: /men-ss26-edit
      - link:
        - /url: /
        - img
      - button "Search":
        - img
      - button "Toggle mobile wishlist":
        - img
      - button "Toggle mobile cart":
        - img
      - button "Toggle mobile primaryMenu":
        - img
      - navigation
      - link:
        - /url: /
        - img
      - button "Search":
        - img
      - button "Toggle mobile wishlist":
        - img
      - button "Toggle mobile cart":
        - img
      - button "Toggle mobile primaryMenu":
        - img
      - searchbox "Search"
      - button "Close search":
        - img
      - text: Search
      - link:
        - /url: /
        - img
      - button "Search":
        - img
      - button "Toggle mobile wishlist":
        - img
      - button "Toggle mobile cart":
   

In [309]:
# SHOW TREE: Print categories as tree
explorer.print_tree()

# Progress counters: number of explored entries and of items still on the stack.
print(f"\nExplored: {len(explorer.explored)}")
print(f"Remaining: {len(explorer.stack)}")


[TREE] 2 categories
├── Men
│   └── FREE RETURNS
│       └── FREE RETURNS → /assistance?group=returns-and-exchanges&faq=what-is-your-return-policy
└── Women
    └── FREE EXPRESS DELIVERY
        └── FREE EXPRESS DELIVERY → /assistance?group=shipping-and-delivery&faq=which-delivery-services-do-you-offer-how-much-do-they-cost-and-how-long-do-they-take

Explored: 2
Remaining: 0


In [279]:
# UTILITIES
# explorer.stack  # Direct access to stack
# explorer.explored  # Set of explored paths
# explorer.categories  # Dict of path -> url
# explorer.css_cache  # Cached CSS classes per depth

In [None]:
# Scratch cell
