In [None]:
#!/usr/bin/env python3
"""
AI-enabled link crawler (improved link discovery + Playwright interactions)

Usage:
  python ai_crawler_full.py --start-url https://www.netcomlearning.com/ --output netcom_links.json --max-pages 200 --concurrency 2

Notes:
- Install dependencies:
    pip install playwright requests beautifulsoup4 python-dotenv
    playwright install
- OpenAI integration is optional and disabled if OPENAI_API_KEY is not set.
"""

import argparse
import asyncio
import json
import logging
import os
import re
import time
from collections import deque
from urllib.parse import urlparse, urljoin, urlunparse, parse_qsl

from dotenv import load_dotenv
load_dotenv()

import requests
from bs4 import BeautifulSoup
import urllib.robotparser as robotparser

# Playwright
try:
    from playwright.async_api import async_playwright, TimeoutError as PlaywrightTimeoutError
except Exception:
    async_playwright = None
    PlaywrightTimeoutError = Exception

# Optional OpenAI (modern client)
try:
    from openai import OpenAI
except Exception:
    OpenAI = None

# Logging setup
logger = logging.getLogger('ai_crawler_full')
handler = logging.StreamHandler()
formatter = logging.Formatter('%(asctime)s [%(levelname)s] %(message)s')
handler.setFormatter(formatter)
# logging.getLogger() returns the same process-wide object on every call, so
# re-running this code (notebook cell re-execution, module reload) would stack
# another handler and print each message several times. Attach only once.
if not logger.handlers:
    logger.addHandler(handler)
logger.setLevel(logging.INFO)

# Constants
# Identity sent with every plain HTTP request and used for robots.txt checks.
USER_AGENT = 'AICompanyLinkCrawler/1.0 (+https://example.com)'
HEADERS = {
    'User-Agent': USER_AGENT,
}
# URL suffixes treated as static assets/downloads rather than crawlable pages.
EXTENSION_SKIP = (
    '.jpg', '.jpeg', '.png', '.gif', '.svg',
    '.pdf',
    '.zip', '.rar', '.exe', '.tar', '.gz',
)
# CSS selectors hovered in the browser to try to reveal hidden menu links.
COMMON_MENU_SELECTORS = [
    '.menu',
    '.nav',
    '.dropdown',
    '[data-toggle]',
    '[aria-haspopup]',
]

# Optional OpenAI setup (if available)
# Credentials/model come from the environment; the client stays None unless
# both the openai package and an API key are present and construction succeeds.
OPENAI_API_KEY = os.environ.get('OPENAI_API_KEY')
OPENAI_MODEL = os.environ.get('OPENAI_MODEL', 'gpt-4o-mini')
openai_client = None
if OpenAI is None or not OPENAI_API_KEY:
    logger.info('OpenAI client not available/disabled; using heuristics only')
else:
    try:
        openai_client = OpenAI(api_key=OPENAI_API_KEY)
    except Exception as e:
        logger.warning(f'Failed to init OpenAI client: {e}')
        openai_client = None
    else:
        logger.info('OpenAI client initialized (optional features enabled)')

# Helper: normalize urls to deduplicate
def normalize_url(u, base=None):
    """Return a canonical form of *u* for de-duplication, or None.

    Args:
        u: URL or URL reference (may be relative when *base* is given).
        base: Optional absolute URL that *u* is resolved against.

    Returns:
        The normalized URL string, or None when *u* is empty or cannot be
        parsed as a URL.

    Normalization steps: resolve against *base*, drop the fragment, lower-case
    scheme and host (defaulting the scheme to ``http``), drop a trailing slash
    from non-root paths, remove tracking query parameters (``utm_*``,
    ``fbclid``, ``gclid``, ``icid``) and re-encode the remaining query.
    """
    # Local import keeps this function self-contained; the module-level
    # import line does not bring urlencode into scope.
    from urllib.parse import urlencode

    if not u:
        return None
    try:
        if base:
            u = urljoin(base, u)
        # strip fragments
        parsed = urlparse(u.split('#', 1)[0])
    except ValueError:
        # e.g. an unbalanced '[' in the host ("Invalid IPv6 URL")
        return None

    scheme = (parsed.scheme or 'http').lower()
    netloc = parsed.netloc.lower()
    path = parsed.path or '/'
    # normalize path: remove trailing slash except root; a path made only of
    # slashes must still collapse to the root rather than to an empty path
    if path != '/' and path.endswith('/'):
        path = path.rstrip('/') or '/'

    # remove common tracking params
    tracking_keys = ('fbclid', 'gclid', 'icid')
    kept = [
        (k, v)
        for k, v in parse_qsl(parsed.query, keep_blank_values=True)
        if not (k.lower().startswith('utm_') or k.lower() in tracking_keys)
    ]
    # parse_qsl percent-decodes keys and values, so they have to be encoded
    # again; joining them raw would turn "q=a%26b" into "q=a&b".
    query = urlencode(kept)
    return urlunparse((scheme, netloc, path, '', query, ''))

# Plain fetch with requests (runs in thread)
def fetch_plain(url, timeout=15):
    """Download *url* with requests and return the response body as text.

    Any failure (connection error, timeout, non-2xx status, ...) is logged at
    debug level and reported to the caller as None.
    """
    logger.debug(f'fetch_plain start: {url}')
    try:
        response = requests.get(url, headers=HEADERS, timeout=timeout)
        response.raise_for_status()
        body = response.text
        logger.debug(f'fetch_plain success: {url} (len={len(body)})')
    except Exception as e:
        logger.debug(f'fetch_plain failed {url}: {e}')
        return None
    return body

# Playwright fetch with additional JS-based link collection and optional hover attempts
async def fetch_with_playwright(context, url, wait_until='networkidle', timeout=30000):
    """Render *url* in a new browser page and return ``(html, extra_links)``.

    Args:
        context: Playwright browser context used to open the page.
        url: Page to load.
        wait_until: Load state passed to ``page.goto``.
        timeout: Navigation timeout in milliseconds passed to ``page.goto``.

    Returns:
        A tuple ``(html, extra_links)``. ``html`` is the serialized page after
        the hover attempts; ``extra_links`` is the list of raw (not yet
        normalized) URL strings collected by in-page JavaScript. On any
        failure the result is ``(None, [])``.

    The page is always closed before returning, including when the coroutine
    is cancelled.
    """
    logger.debug(f'fetch_with_playwright start: {url}')
    try:
        page = await context.new_page()
    except Exception as e:
        # Keep the "(None, []) on failure" contract even when the page cannot
        # be opened, instead of letting the error escape into the caller.
        logger.debug(f'Playwright could not open a page for {url}: {e}')
        return None, []

    try:
        await page.set_extra_http_headers({"User-Agent": USER_AGENT})
        # goto
        await page.goto(url, wait_until=wait_until, timeout=timeout)
        # short wait for dynamic content
        await asyncio.sleep(0.35)

        # Try hovering common menu selectors to reveal hidden links
        for sel in COMMON_MENU_SELECTORS:
            try:
                if await page.query_selector(sel):
                    # hover() waits for the element to become actionable; use
                    # a short timeout so a hidden menu cannot stall the crawl
                    # for the default action timeout on every selector.
                    await page.hover(sel, timeout=1500)
                    logger.debug(f'hovered selector {sel} on {url}')
                    await asyncio.sleep(0.15)
            except Exception:
                # hovering is best-effort; a failure must not abort the fetch
                logger.debug(f'hover failed for selector {sel} on {url}')

        # Evaluate JS to collect additional link candidates created by scripts, data attributes, onclick handlers
        js_collect = r"""
        () => {
          const urls = new Set();
          function add(u){ if(u && typeof u === 'string') urls.add(u); }
          // anchors
          for(const a of document.querySelectorAll('a[href]')) add(a.getAttribute('href'));
          // data attributes often used by SPA menus
          const dataAttrs = ['data-href','data-url','data-link','data-target'];
          for(const attr of dataAttrs){
            for(const el of document.querySelectorAll('['+attr+']')){
              add(el.getAttribute(attr));
            }
          }
          // onclick patterns
          for(const el of document.querySelectorAll('[onclick]')){
            const s = el.getAttribute('onclick') || '';
            let m = s.match(/location\.href\s*=\s*['"]([^'"]+)['"]/);
            if(m) add(m[1]);
            m = s.match(/window\.location(?:\.href)?\s*=\s*['"]([^'"]+)['"]/);
            if(m) add(m[1]);
            m = s.match(/window\.open\(\s*['"]([^'"]+)['"]/);
            if(m) add(m[1]);
          }
          // attributes that may contain urls
          for(const el of document.querySelectorAll('[href], [src], [data-href], [data-url]')){
            for(const k of ['href','src','data-href','data-url']) {
              if(el.getAttribute(k)) add(el.getAttribute(k));
            }
          }
          // meta/link rel canonical/alternate
          for(const link of document.querySelectorAll('link[href], meta[content]')) {
            const rel = link.getAttribute('rel') || '';
            const href = link.getAttribute('href') || link.getAttribute('content') || '';
            if(rel && ['canonical','prev','next','alternate'].some(r=>rel.includes(r))) add(href);
          }
          return Array.from(urls);
        }
        """
        collected = await page.evaluate(js_collect)
        # page content after possible hover
        content = await page.content()

        # Return tuple: html + extra links found by JS
        logger.debug(f'fetch_with_playwright success: {url} (len={len(content)}), collected {len(collected)} extra links')
        return content, collected
    except PlaywrightTimeoutError:
        logger.debug(f'PlaywrightTimeoutError for {url}')
        return None, []
    except Exception as e:
        logger.debug(f'Playwright fetch error {url}: {e}')
        return None, []
    finally:
        # Single cleanup point for every exit path (success, error, cancel).
        try:
            await page.close()
        except Exception as e:
            logger.debug(f'page close failed for {url}: {e}')

# Enhanced link extraction and normalization
def extract_links_from_html(base_url, html):
    """Collect normalized http(s) page links referenced by an HTML document.

    Args:
        base_url: URL the document was fetched from; relative references are
            resolved against it.
        html: HTML source text.

    Returns:
        A set of normalized absolute URLs. Candidates are taken from anchor
        ``href`` values, ``data-href``/``data-url``/``data-link``/
        ``data-target`` attributes, simple ``onclick`` navigation patterns and
        ``<link rel="canonical|prev|next|alternate">`` tags. Non-http(s)
        schemes (``mailto:``, ``tel:``, ``javascript:``, ...) and URLs whose
        path ends with one of EXTENSION_SKIP are dropped.
    """
    soup = BeautifulSoup(html, 'html.parser')
    candidates = []

    # 1) anchors
    candidates.extend(a['href'] for a in soup.find_all('a', href=True))

    # 2) data attributes commonly used by JS
    for attr in ('data-href', 'data-url', 'data-link', 'data-target'):
        candidates.extend(el.get(attr) for el in soup.find_all(attrs={attr: True}))

    # 3) onclick patterns (every pattern that matches contributes a candidate)
    onclick_patterns = (
        r"""location\.href\s*=\s*['"]([^'"]+)['"]""",
        r"""window\.location(?:\.href)?\s*=\s*['"]([^'"]+)['"]""",
        r"""window\.open\(\s*['"]([^'"]+)['"]""",
    )
    for el in soup.find_all(attrs={'onclick': True}):
        onclick = el.get('onclick') or ''
        for pattern in onclick_patterns:
            m = re.search(pattern, onclick)
            if m:
                candidates.append(m.group(1))

    # 4) head link rel canonical/alternate
    for tag in soup.find_all('link', href=True):
        rel = tag.get('rel') or []
        if isinstance(rel, list) and any(r in ('canonical', 'prev', 'next', 'alternate') for r in rel):
            candidates.append(tag['href'])

    # normalize, filter and return
    out = set()
    for raw in candidates:
        if not isinstance(raw, str) or not raw.strip():
            continue
        try:
            n = normalize_url(raw.strip(), base=base_url)
            parts = urlparse(n) if n else None
        except ValueError:
            # a single malformed attribute value must not abort the whole page
            continue
        if parts is None:
            continue
        # Only http(s) URLs with a host can be crawled; this also removes
        # mailto:, tel:, javascript: and data: pseudo-links.
        if parts.scheme not in ('http', 'https') or not parts.netloc:
            continue
        # Test the path only, so "file.pdf?dl=1" is skipped while
        # "page?doc=a.pdf" is kept.
        if parts.path.lower().endswith(EXTENSION_SKIP):
            continue
        out.add(n)

    logger.info(f'Extracted {len(out)} normalized links from {base_url}')
    return out

# Optional simple classifier fallback (no OpenAI)
def simple_classify(title_or_text):
    """Label a page by keyword lookup in its title/text.

    The text is lower-cased and checked against keyword groups in a fixed
    priority order; the label of the first group with any substring hit is
    returned, or 'other' when nothing matches (including empty/None input).
    """
    text = (title_or_text or '').lower()
    # Order matters: earlier entries win when several groups match.
    keyword_groups = (
        ('course', ('course', 'enroll', 'training')),
        ('certification', ('certif', 'certificate', 'exam')),
        ('product', ('product', 'buy', 'price')),
        ('announcement', ('press', 'news', 'announcement')),
        ('blog', ('blog', 'case study', 'case-study')),
    )
    for label, keywords in keyword_groups:
        for keyword in keywords:
            if keyword in text:
                return label
    return 'other'

# Main crawl function
async def crawl(start_url, output='out.json', max_pages=500, concurrency=3, delay=0.4, ignore_robots=False, use_playwright=True):
    """Crawl pages on the host of *start_url* and write a JSON report.

    Args:
        start_url: First URL to visit; only URLs on its host are followed.
        output: Path of the JSON file to write.
        max_pages: Upper bound on the number of URLs taken from the queue
            (skipped and failed URLs count as well).
        concurrency: Number of worker coroutines.
        delay: Seconds each worker sleeps after handling a URL.
        ignore_robots: When true, robots.txt is neither fetched nor enforced.
        use_playwright: Allow the Playwright fallback for pages whose plain
            HTML is missing or shorter than 500 characters.

    The report contains ``start_url``, ``scraped_at`` (UTC), ``visited_count``
    and ``results`` - one entry per fetched page with its url, title,
    classification, whether the rendered HTML was used, and up to 30 sample
    links.
    """
    logger.info(f'start crawl for {start_url}')
    parsed_start = urlparse(start_url)
    # normalize_url() lower-cases hosts, so the reference host must be
    # lower-cased too or a mixed-case start URL would be rejected as external.
    base_host = parsed_start.netloc.lower()
    base_root = f'{parsed_start.scheme}://{parsed_start.netloc}'

    # robots.txt
    rp = None
    if not ignore_robots:
        rp = robotparser.RobotFileParser()
        try:
            rp.set_url(base_root + '/robots.txt')
            # read() performs blocking network I/O; keep it off the event loop
            await asyncio.to_thread(rp.read)
            logger.debug('robots.txt loaded')
        except Exception:
            rp = None
            logger.debug('robots.txt not available or failed to load')

    q = deque([normalize_url(start_url)])
    visited = set()
    discovered = set(q)
    results = []
    active = 0  # workers currently fetching/processing a page

    playwright_driver = None
    browser = None
    browser_context = None
    use_pw = use_playwright and (async_playwright is not None)

    async def start_playwright():
        nonlocal playwright_driver, browser, browser_context
        playwright_driver = await async_playwright().start()
        browser = await playwright_driver.chromium.launch(headless=True)
        browser_context = await browser.new_context(user_agent=USER_AGENT)
        logger.info('Playwright started')

    async def stop_playwright():
        """Release whatever Playwright resources exist; safe to call twice."""
        nonlocal playwright_driver, browser, browser_context
        resources = (
            ('context', browser_context, 'close'),
            ('browser', browser, 'close'),
            ('driver', playwright_driver, 'stop'),
        )
        playwright_driver = browser = browser_context = None
        for label, resource, closer in resources:
            if resource is None:
                continue
            # Each step is isolated so one failure does not skip the others.
            try:
                logger.info(f'closing Playwright {label}')
                await getattr(resource, closer)()
            except Exception as e:
                logger.warning(f'Error during Playwright shutdown ({label}): {e}')

    def is_page_url(link):
        """True for http(s) URLs whose path is not a static asset."""
        parts = urlparse(link)
        return parts.scheme in ('http', 'https') and not parts.path.lower().endswith(EXTENSION_SKIP)

    async def process_page(worker_id, url):
        """Fetch one page, record its result and enqueue its same-host links."""
        logger.info(f'worker {worker_id} crawling: {url} (visited {len(visited)}/{max_pages})')
        extra_links = []
        used_playwright = False
        # try plain fetch
        html = await asyncio.to_thread(fetch_plain, url)
        # if no html or small html and playwright available, use it
        if (not html or len(html) < 500) and use_pw and browser_context is not None:
            logger.debug(f'worker {worker_id} using Playwright for {url}')
            rendered, extra_links = await fetch_with_playwright(browser_context, url)
            # Keep the short plain HTML when rendering fails instead of
            # discarding a page that was actually fetched.
            if rendered:
                html = rendered
                used_playwright = True

        if not html:
            logger.warning(f'worker {worker_id} failed to fetch html for {url}')
            return

        # extract links from HTML
        links = extract_links_from_html(url, html)
        # add any extra js-collected links from Playwright evaluation
        for candidate in extra_links:
            try:
                link = normalize_url(candidate, base=url)
                if link and is_page_url(link):
                    links.add(link)
            except ValueError:
                continue  # malformed URL string collected from the page

        logger.info(f'worker {worker_id} found {len(links)} links on {url}')

        # simple content sampling: title/meta
        soup = BeautifulSoup(html, 'html.parser')
        title = None
        h1 = soup.find('h1')
        h1_text = h1.get_text(strip=True) if h1 else ''
        if h1_text:
            title = h1_text
        elif soup.title and soup.title.string:
            title = soup.title.string.strip()

        classification = simple_classify((title or '') + ' ' + (soup.get_text(' ')[:800] or ''))

        # store result for this page
        results.append({
            'url': url,
            'title': title,
            'classification': classification,
            'rendered_with_playwright': used_playwright,
            'links_found': sorted(links)[:30],  # store up to 30 sample links per page to avoid huge output
        })
        logger.info(f'worker {worker_id} saved page {url} classified as {classification} (links sample stored)')

        # enqueue normalized links
        enqueued = 0
        for link in links:
            if not link or link in discovered:
                continue
            if urlparse(link).netloc == base_host and is_page_url(link):
                discovered.add(link)
                q.append(link)
                enqueued += 1
        logger.debug(f'worker {worker_id} enqueued {enqueued} new links from {url}')

    async def worker(worker_id):
        nonlocal active
        logger.info(f'worker {worker_id} started')
        while len(visited) < max_pages:
            if not q:
                # An empty queue only means "done" when nobody is still
                # processing a page that could add more links; otherwise wait.
                if active == 0:
                    break
                await asyncio.sleep(0.05)
                continue
            url = q.popleft()
            if not url or url in visited:
                continue
            # Claim the URL before the first await so the page budget is exact.
            visited.add(url)
            # host restriction
            if urlparse(url).netloc != base_host:
                logger.debug(f'worker {worker_id} skipping external host: {url}')
                continue
            # robots
            if rp and not ignore_robots and not rp.can_fetch(USER_AGENT, url):
                logger.debug(f'worker {worker_id} blocked by robots: {url}')
                continue

            active += 1
            try:
                await process_page(worker_id, url)
            except Exception:
                # One broken page must not end the crawl for this worker.
                logger.exception(f'worker {worker_id} failed while processing {url}')
            finally:
                active -= 1

            await asyncio.sleep(delay)

        logger.info(f'worker {worker_id} finished')

    if use_pw:
        # start playwright upfront
        try:
            await start_playwright()
        except Exception as e:
            logger.warning(f'Could not start Playwright: {e}')
            use_pw = False
            # release anything that did start before the failure
            await stop_playwright()

    try:
        await asyncio.gather(*(worker(i) for i in range(concurrency)))
    finally:
        # shutdown playwright cleanly, also on errors and cancellation
        await stop_playwright()

    # write output
    out = {
        'start_url': start_url,
        'scraped_at': time.strftime('%Y-%m-%dT%H:%M:%SZ', time.gmtime()),
        'results': results,
        'visited_count': len(visited),
    }
    with open(output, 'w', encoding='utf-8') as f:
        json.dump(out, f, ensure_ascii=False, indent=2)

    logger.info(f'crawl complete. visited={len(visited)} saved={output}')

# CLI
if __name__ == '__main__':
    # Command-line entry point: parse options, then run the crawl to completion.
    parser = argparse.ArgumentParser(description='AI-enabled link crawler (improved discovery)')
    option_specs = (
        ('--start-url', {'required': True}),
        ('--output', {'default': 'netcom_links.json'}),
        ('--max-pages', {'type': int, 'default': 200}),
        ('--concurrency', {'type': int, 'default': 3}),
        ('--delay', {'type': float, 'default': 0.4}),
        ('--ignore-robots', {'action': 'store_true'}),
        ('--no-playwright', {'action': 'store_true'}),
    )
    for flag, spec in option_specs:
        parser.add_argument(flag, **spec)
    args = parser.parse_args()

    # Rendering needs both the user's consent and an importable Playwright.
    use_playwright = not args.no_playwright and async_playwright is not None
    crawl_options = {
        'output': args.output,
        'max_pages': args.max_pages,
        'concurrency': args.concurrency,
        'delay': args.delay,
        'ignore_robots': args.ignore_robots,
        'use_playwright': use_playwright,
    }
    try:
        asyncio.run(crawl(args.start_url, **crawl_options))
    except KeyboardInterrupt:
        logger.info('Interrupted by user')


2025-09-22 14:34:03,832 [INFO] Starting crawl: https://www.netcomlearning.com/ (max 100 pages)
Pages crawled:   1%|          | 1/100 [00:03<05:33,  3.37s/it]
2025-09-22 14:34:19,428 [INFO] Crawl finished. Pages visited: 1. Products found: 0
2025-09-22 14:34:19,428 [INFO] Wrote products to product.json
