In [1]:
#@title Run this cell
# --- Cell 1: Installs and Imports ---
# (Unchanged from the previous Colab version)
import asyncio
import platform
import xml.etree.ElementTree as ET
from urllib.parse import urlparse, urljoin
import logging
import traceback
from pathlib import Path
import httpx
import threading
import queue
import time
from typing import Optional, List, Tuple, Dict, Any
import ipywidgets as widgets
from IPython.display import display, clear_output
from google.colab import files

# Install the runtime dependencies. The `!` lines are IPython/Colab shell
# escapes, not Python, so this cell only runs inside a notebook kernel.
# NOTE(review): httpx and ipywidgets are already imported at the top of this
# cell, i.e. before this install runs; on a fresh kernel that only works if
# the runtime ships them preinstalled - confirm, or move the install first.
print("Installing dependencies...")
!pip install crawl4ai httpx ipywidgets --quiet
# Download the Playwright browsers together with their system dependencies.
!playwright install --with-deps
print("Dependencies installed.")
# Drop the (long) installer output; presumably wait=True delays the clearing
# until the next output arrives, which avoids flicker - see IPython docs.
clear_output(wait=True)

# Crawl4AI can only be imported once the install above has finished. A
# failure is reported and re-raised so the cell stops instead of leaving the
# names below undefined for later cells.
try:
    from crawl4ai import (
        AsyncWebCrawler,
        CrawlerRunConfig,
        CacheMode,
        DefaultMarkdownGenerator,
        PruningContentFilter,
    )
    print("Crawl4AI imported successfully.")
except ImportError as e:
    print(f"Error importing Crawl4AI: {e}")
    raise

Crawl4AI imported successfully.


In [2]:
#@title Configuration and functions
# --- Cell 2: Configuration and Logging ---
# (Unchanged from the previous Colab version)
# Defaults used to pre-fill the UI widgets and the initial logging setup.
# Sitemap whose <loc> entries are crawled.
DEFAULT_SITEMAP_URL = "https://docs.crewai.com/sitemap.xml"
# Markdown file that receives all scraped sections.
DEFAULT_OUTPUT_FILE = Path("/content/crew_docs.md")
# Upper bound for the asyncio.Semaphore that throttles concurrent scrapes.
DEFAULT_CONCURRENCY_LIMIT = 5
# Timeout in seconds handed to the httpx client that downloads the sitemap.
DEFAULT_REQUEST_TIMEOUT = 45
# Passed to CrawlerRunConfig(word_count_threshold=...) and to
# PruningContentFilter(min_word_threshold=...).
DEFAULT_WORD_COUNT_THRESHOLD = 10
# Passed to PruningContentFilter(threshold=...) with threshold_type="fixed".
DEFAULT_PRUNING_DENSITY_THRESHOLD = 0.45
# Log file written by setup_logging() (opened in 'w' mode, i.e. truncated).
DEFAULT_LOG_FILE = "/content/crew.log"

def setup_logging(log_file=DEFAULT_LOG_FILE):
    """Route all logging to ``log_file`` at DEBUG level.

    The parent directory is created when missing and the file is opened in
    write mode, so every call starts with an empty log. ``force=True``
    replaces any handlers installed by an earlier call. Chatty third-party
    loggers are capped at a higher level to keep the file readable.

    Args:
        log_file: Path (str) of the log file to (re)create.
    """
    Path(log_file).parent.mkdir(parents=True, exist_ok=True)

    file_handler = logging.FileHandler(log_file, mode='w', encoding='utf-8')
    logging.basicConfig(
        handlers=[file_handler],
        format="%(asctime)s - %(levelname)s - %(threadName)s - %(name)s:%(lineno)d - %(message)s",
        level=logging.DEBUG,
        force=True,
    )

    # Per-library verbosity caps, applied in the listed order.
    library_levels = (
        ("httpx", logging.WARNING),
        ("httpcore", logging.WARNING),
        ("crawl4ai", logging.INFO),
        ("playwright", logging.WARNING),
    )
    for library_name, library_level in library_levels:
        logging.getLogger(library_name).setLevel(library_level)

# Configure file logging once at import time so the crawler class below can
# log immediately; the UI handler calls setup_logging() again with the log
# path chosen by the user.
setup_logging(DEFAULT_LOG_FILE)
# Module-wide logger shared by the crawler class and the UI code.
logger = logging.getLogger("UnifiedWebCrawlerColab")
logger.setLevel(logging.DEBUG)

print(f"Logging configured. Level: DEBUG. File: {DEFAULT_LOG_FILE}")
print("="*30)


# --- Cell 3: Crawler Class (changes in run_crawler) ---
class UnifiedWebCrawlerColab:

    # fetch_sitemap_urls, scrape_url, write_output (unchanged from the previous Colab version)
    async def fetch_sitemap_urls(self, url: str, timeout: int) -> List[str]:
        """Download a sitemap and return the URLs found in its ``<loc>`` elements.

        The document is decoded as UTF-8 (falling back to httpx's detected
        encoding), parsed with ElementTree, and searched for ``<loc>`` first
        in the standard sitemap namespace and then without a namespace.

        Args:
            url: Address of the sitemap XML document.
            timeout: Timeout in seconds for the HTTP client.

        Returns:
            The stripped, non-empty ``<loc>`` texts in document order. An
            empty list is returned for an empty response and for any
            download or parse error; errors are logged, never raised.
        """
        logger.debug(f"Entering fetch_sitemap_urls for {url}")
        urls: List[str] = []
        logger.debug(f"Attempting download: {url} (Timeout: {timeout}s)")
        try:
            async with httpx.AsyncClient(timeout=timeout, follow_redirects=True) as client:
                logger.debug(f"[{url}] Sending GET...")
                response = await client.get(url)
                logger.debug(f"[{url}] Status: {response.status_code}")
                response.raise_for_status()
                if not response.content:
                    logger.warning(f"[{url}] Sitemap empty.")
                    return []
                logger.debug(f"[{url}] Downloaded {len(response.content)} bytes. Decoding...")
                try:
                    xml_content = response.content.decode('utf-8')
                    logger.debug(f"[{url}] Decoded as UTF-8.")
                except UnicodeDecodeError:
                    logger.warning(f"[{url}] UTF-8 decode failed, fallback to auto.")
                    xml_content = response.text
                    logger.debug(f"[{url}] Decoded using: {response.encoding}")
                logger.debug(f"[{url}] Parsing XML...")
                root = ET.fromstring(xml_content)
                namespaces = {'ns': 'http://www.sitemaps.org/schemas/sitemap/0.9'}
                url_elements = root.findall('.//ns:loc', namespaces)
                if not url_elements:
                    logger.warning(f"[{url}] Standard NS failed, trying without.")
                    url_elements = root.findall('.//loc')
                # Strip BEFORE testing for emptiness: a <loc> holding only
                # whitespace has truthy text but is not a usable URL.
                # NOTE: every <loc> is returned as-is, so the entries of a
                # sitemap *index* are treated as page URLs (no recursion).
                urls = []
                for elem in url_elements:
                    loc_text = (elem.text or "").strip()
                    if loc_text:
                        urls.append(loc_text)
                logger.info(f"[{url}] Found {len(urls)} URLs.")
        except httpx.TimeoutException:
            logger.error(f"[{url}] Timeout fetching sitemap.", exc_info=True)
            urls = []  # Guarantee an empty list on failure
        except httpx.HTTPStatusError as e:
            logger.error(f"[{url}] HTTP error {e.response.status_code} fetching sitemap.", exc_info=True)
            urls = []
        except httpx.RequestError:
            logger.error(f"[{url}] Network error fetching sitemap.", exc_info=True)
            urls = []
        except ET.ParseError:
            logger.error(f"[{url}] XML parse error for sitemap.", exc_info=True)
            urls = []
        except Exception:
            logger.exception(f"[{url}] Unexpected error fetching sitemap:")
            urls = []
        # Single exit point for both the success and the error paths.
        logger.debug(f"Exiting fetch_sitemap_urls for {url}, returning {len(urls)} URLs.")
        return urls

    async def scrape_url(
        self,
        crawler: AsyncWebCrawler,
        url: str,
        semaphore: asyncio.Semaphore,
        config: CrawlerRunConfig
    ) -> Optional[Tuple[str, str]]:
        """Scrape one page while holding a slot of ``semaphore``.

        Args:
            crawler: Initialized crawler whose ``arun`` performs the fetch.
            url: Page to scrape.
            semaphore: Limits how many scrapes run at the same time.
            config: Run configuration forwarded to ``arun``.

        Returns:
            ``(url, stripped_markdown)`` when the crawl succeeded and yielded
            non-empty markdown; ``None`` in every other case (no result,
            unsuccessful result, empty content, or any exception). Failures
            are logged and never propagate to the caller.
        """
        logger.debug(f"[{url}] Entering scrape_url.")
        async with semaphore:
            logger.debug(f"[{url}] Acquired semaphore. Calling arun...")
            try:
                outcome = await crawler.arun(url=url, config=config)
                logger.debug(f"[{url}] arun finished.")

                # Guard clauses, from "nothing came back" to "usable content".
                if not outcome:
                    logger.error(f"[{url}] FAIL arun returned None.")
                    return None
                if not outcome.success:
                    logger.error(f"[{url}] FAIL Status={outcome.status_code}, Err={outcome.error_message}")
                    return None

                page_text = outcome.markdown.strip() if outcome.markdown else ""
                if not page_text:
                    logger.warning(f"[{url}] OK, but no content.")
                    return None

                logger.info(f"[{url}] OK, len={len(page_text)}")
                return url, page_text
            except httpx.TimeoutException:
                logger.error(f"[{url}] FAIL Timeout.", exc_info=True)
                return None
            except Exception:
                logger.exception(f"[{url}] FAIL Unexpected error:")
                return None
            finally:
                # Logged on every path, including the early returns above.
                logger.debug(f"[{url}] Exiting scrape_url.")

    async def write_output(
        self,
        filepath: Path,
        content_list: List[Tuple[str, str]],
        status_queue: queue.Queue
        ) -> bool:
        """Write all scraped sections into a single Markdown file.

        Each section starts with a ``## Source URL: ...`` heading followed by
        its markdown; consecutive sections are separated by a ``---`` rule.

        Args:
            filepath: Destination file; parent directories are created.
            content_list: ``(url, markdown)`` pairs to write, in order.
            status_queue: Receives human-readable ``'message'`` updates.

        Returns:
            ``True`` when the file was written or when there was nothing to
            write; ``False`` when writing failed (the error is logged and
            reported on ``status_queue``).
        """
        logger.debug(f"Entering write_output for {filepath}.")

        # Nothing collected: not an error, but no file is produced either.
        if not content_list:
            logger.warning("No content collected, skipping file write.")
            status_queue.put({'type': 'message', 'value': "No content collected, skipping file write."})
            logger.debug("Exiting write_output (no content).")
            return True

        logger.info(f"Writing {len(content_list)} sections to: {filepath}")
        status_queue.put({'type': 'message', 'value': f"Writing output to {filepath}..."})

        write_success = False
        try:
            filepath.parent.mkdir(parents=True, exist_ok=True)
            with filepath.open("w", encoding="utf-8") as handle:
                for position, (source_url, body) in enumerate(content_list):
                    # The separator goes between sections, i.e. before every
                    # section except the first one.
                    if position:
                        handle.write("\n\n---\n\n")
                    handle.write(f"## Source URL: {source_url}\n\n")
                    handle.write(body)
            logger.info(f"Successfully wrote content to {filepath}")
            write_success = True
        except OSError as io_err:
            logger.error(f"Failed to write output file {filepath}", exc_info=True)
            status_queue.put({'type': 'message', 'value': f"Error writing file: {io_err}"})
            write_success = False
        except Exception as other_err:
            logger.exception(f"Unexpected error during file writing to {filepath}")
            status_queue.put({'type': 'message', 'value': f"Error during file writing: {other_err}"})
            write_success = False

        # Single exit point so the outcome is always logged.
        logger.debug(f"Exiting write_output for {filepath}. Success: {write_success}")
        return write_success

    # --- run_crawler with fixes ---
    async def run_crawler(
        self,
        sitemap_url: str,
        output_file: Path,
        concurrency_limit: int,
        request_timeout: int,
        word_count_threshold: int,
        pruning_density_threshold: float,
        status_queue: queue.Queue # Queue for reporting back
    ):
        """Run the whole pipeline: fetch the sitemap, scrape each URL, write the file.

        Progress is reported through ``status_queue`` as dictionaries:

        * ``{'type': 'progress', 'value': float}`` - fraction in ``[0, 1]``
        * ``{'type': 'message', 'value': str}`` - human-readable status
        * ``{'type': 'finished', 'success': bool, 'message': str}`` - sent
          from the ``finally`` block, so the consumer gets it on every path

        Args:
            sitemap_url: Sitemap whose ``<loc>`` entries are scraped.
            output_file: Markdown file that receives the scraped content.
            concurrency_limit: Maximum number of simultaneous scrapes
                (values below 1 are treated as 1).
            request_timeout: Timeout in seconds for the sitemap download.
            word_count_threshold: Forwarded to ``CrawlerRunConfig`` and to
                ``PruningContentFilter(min_word_threshold=...)``.
            pruning_density_threshold: Forwarded to
                ``PruningContentFilter(threshold=...)``.
            status_queue: Queue on which the updates above are published.

        Returns:
            None. The outcome is communicated via the ``'finished'`` update;
            exceptions are logged and converted into a failed outcome.
        """
        logger.info(f"Entering run_crawler (Sitemap: {sitemap_url}).")
        # --- State variables ---
        success_flag = True  # Assume success; flipped to False on any failure
        final_status_message = "Crawler started."  # Replaced as phases complete
        processed_count = 0
        total_urls = 0
        successful_scrapes = 0
        failed_scrapes = 0
        collected_content: List[Tuple[str, str]] = []
        current_url_being_processed = "N/A"  # Error context for log messages

        try:
            # --- Phase 1: fetch the sitemap ---
            logger.debug("--- Starting Unified Crawler ---")
            logger.debug(f"Config: Output={output_file}, Concurrency={concurrency_limit}, Timeout={request_timeout}, WordThreshold={word_count_threshold}, PruningDensity={pruning_density_threshold}")
            status_queue.put({'type': 'progress', 'value': 0.0})
            status_queue.put({'type': 'message', 'value': "Fetching sitemap..."})

            logger.debug(f"Await fetch_sitemap_urls for {sitemap_url}...")
            urls = await self.fetch_sitemap_urls(sitemap_url, request_timeout)
            logger.debug(f"Await fetch_sitemap_urls done. Result len: {len(urls)}")

            if not urls:
                logger.error("No URLs obtained from sitemap. Setting status to failed.")
                final_status_message = "Error: Could not fetch or parse sitemap."
                success_flag = False
                # Do not return here: the finally block sends the final state.
            else:
                total_urls = len(urls)
                logger.info(f"Processing {total_urls} URLs with concurrency {concurrency_limit}.")
                final_status_message = f"Fetched {total_urls} URLs. Initializing crawler..."
                status_queue.put({'type': 'message', 'value': final_status_message})

            # --- Phase 2: crawl (only when the sitemap was obtained) ---
            if success_flag:
                logger.debug("Configuring CrawlerRunConfig...")
                crawler_config = CrawlerRunConfig(
                    cache_mode=CacheMode.DISABLED,
                    excluded_tags=['nav', 'footer', 'aside', 'header', 'script', 'style', 'button', 'form', 'input'],
                    remove_overlay_elements=True,
                    word_count_threshold=word_count_threshold,
                    markdown_generator=DefaultMarkdownGenerator(
                        content_filter=PruningContentFilter(
                            threshold=pruning_density_threshold,
                            threshold_type="fixed",
                            min_word_threshold=word_count_threshold,
                        ),
                        options={"ignore_links": True, "ignore_images": True},
                    ),
                    js_code=None,
                    exclude_external_links=True,
                    exclude_social_media_links=True,
                    exclude_external_images=True,
                )
                logger.debug("CrawlerRunConfig created.")
                # A semaphore created with 0 permits would block every scrape
                # forever, so always allow at least one concurrent task.
                effective_concurrency = max(1, concurrency_limit)
                semaphore = asyncio.Semaphore(effective_concurrency)
                logger.debug(f"Semaphore created (limit {effective_concurrency}).")
                logger.info("Initializing AsyncWebCrawler...")
                status_queue.put({'type': 'progress', 'value': 0.05})

                logger.debug("Entering main crawl loop (async with AsyncWebCrawler)...")
                crawl_loop_success = True  # Outcome flag for this phase only
                try:
                    async with AsyncWebCrawler(verbose=False) as crawler:
                        logger.info("AsyncWebCrawler initialized.")
                        logger.debug("Creating scrape tasks...")
                        # Every awaitable carries its own URL in the result.
                        # asyncio.as_completed() yields *new* awaitables, so
                        # the completed item cannot be mapped back to its URL
                        # by object identity.
                        tasks = [
                            self._scrape_with_url(crawler, url, semaphore, crawler_config)
                            for url in urls
                        ]
                        logger.debug(f"Created {len(tasks)} tasks.")
                        logger.info(f"Starting processing of {len(tasks)} tasks...")
                        status_queue.put({'type': 'message', 'value': f"Starting scrape of {total_urls} URLs..."})

                        tasks_processed_count_in_loop = 0
                        try:
                            for future in asyncio.as_completed(tasks):
                                logger.debug("Awaiting next completed task...")
                                original_url, result = await future
                                current_url_being_processed = original_url
                                logger.debug(f"[{original_url}] Task completed.")

                                tasks_processed_count_in_loop += 1
                                # Scraping covers the 5%..90% span of the bar.
                                progress = 0.05 + 0.85 * (tasks_processed_count_in_loop / total_urls)
                                status_queue.put({'type': 'progress', 'value': progress})

                                if result is not None:
                                    scraped_url, markdown_content = result
                                    collected_content.append((scraped_url, markdown_content))
                                    successful_scrapes += 1
                                    status_queue.put({'type': 'message', 'value': f"Scraped {original_url} ({tasks_processed_count_in_loop}/{total_urls})"})
                                else:
                                    failed_scrapes += 1
                                    status_queue.put({'type': 'message', 'value': f"Failed/No content: {original_url} ({tasks_processed_count_in_loop}/{total_urls})"})

                            # Reached only when the loop finished without raising.
                            processed_count = tasks_processed_count_in_loop
                            logger.info(f"Finished processing all {processed_count} tasks in loop.")

                        except Exception as loop_exc:
                            # Error INSIDE the as_completed loop.
                            logger.error(f"Error WITHIN asyncio.as_completed loop (around URL: {current_url_being_processed}):", exc_info=True)
                            final_status_message = f"Error during task processing: {loop_exc}. See logs."
                            success_flag = False
                            crawl_loop_success = False
                            processed_count = tasks_processed_count_in_loop  # Keep what was reached

                except Exception as crawl_e:
                    # Error OUTSIDE the loop (e.g. crawler start-up/shutdown).
                    logger.error(f"Error in main crawling phase (around URL: {current_url_being_processed}):", exc_info=True)
                    final_status_message = f"Crawling phase error: {crawl_e}. See logs."
                    success_flag = False
                    crawl_loop_success = False

                if crawl_loop_success:
                    logger.info(f"--- Scraping Phase Finished OK ---")
                    final_status_message = f"Scraping complete ({successful_scrapes} OK, {failed_scrapes} Fail). Writing output..."
                    status_queue.put({'type': 'message', 'value': final_status_message})
                else:
                    # The error message was already set in the except blocks.
                    logger.warning("Crawling phase finished WITH ERRORS.")

            # --- Phase 3: write (only when the previous phases succeeded) ---
            if success_flag:
                logger.info(f"Proceeding to write output. Successful scrapes: {successful_scrapes}")
                status_queue.put({'type': 'progress', 'value': 0.90})
                logger.debug("Calling write_output...")
                try:
                    write_success = await self.write_output(output_file, collected_content, status_queue)
                    logger.debug(f"write_output returned: {write_success}")
                    if write_success:
                        final_status_message = f"OK. Scraped: {successful_scrapes}, Failed: {failed_scrapes}. Output: {output_file}"
                        success_flag = True
                        logger.info(final_status_message)
                        status_queue.put({'type': 'progress', 'value': 1.0})
                    else:
                        final_status_message = f"Crawled ({successful_scrapes} OK, {failed_scrapes} Fail), BUT FAILED writing output: {output_file}."
                        success_flag = False
                        logger.error(final_status_message)
                        status_queue.put({'type': 'progress', 'value': 1.0})  # Still mark 100%

                except Exception as write_exc:
                    logger.exception("Unexpected error DURING await write_output:")
                    final_status_message = f"Error during file writing phase: {write_exc}. See logs."
                    success_flag = False
                    status_queue.put({'type': 'progress', 'value': 1.0})
            else:
                # An earlier phase failed: finish the bar and keep its message.
                logger.warning("Skipping output writing due to earlier errors.")
                if not final_status_message or final_status_message == "Crawler started.":
                    final_status_message = "Crawling failed before output writing phase. Check logs."
                status_queue.put({'type': 'progress', 'value': 1.0})

        except Exception as e:
            # Very early or otherwise unexpected errors in run_crawler.
            logger.exception(f"CRITICAL UNEXPECTED error in run_crawler:")
            final_status_message = f"Critical unexpected error: {e}. Check logs."
            success_flag = False
            try:
                status_queue.put({'type': 'progress', 'value': 1.0})
            except Exception:
                pass  # Best effort: the queue itself may be the problem

        finally:
            # --- Always executed: publish the final state ---
            logger.info(f"Entering run_crawler finally block. Final Success Flag: {success_flag}")
            try:
                status_queue.put({'type': 'progress', 'value': 1.0})
                status_queue.put({'type': 'finished', 'success': success_flag, 'message': final_status_message})
                logger.info(f"Sent 'finished' signal. Success: {success_flag}. Message: {final_status_message}")
            except Exception as q_err:
                # Nothing more can be done if the queue rejects the signal.
                logger.error(f"Could not put final 'finished' signal on queue: {q_err}")
            logger.info(f"Exiting run_crawler.")

    async def _scrape_with_url(
        self,
        crawler: AsyncWebCrawler,
        url: str,
        semaphore: asyncio.Semaphore,
        config: CrawlerRunConfig
    ) -> Tuple[str, Optional[Tuple[str, str]]]:
        """Run ``scrape_url`` and pair its result with the URL it was asked for.

        Returns:
            ``(url, result)`` where ``result`` is whatever ``scrape_url``
            returned, or ``None`` when it raised (the exception is logged).
        """
        try:
            outcome = await self.scrape_url(crawler, url, semaphore, config)
        except Exception:
            logger.error(f"[{url}] Exception awaiting task future:", exc_info=True)
            outcome = None  # Treat as a failed scrape
        return url, outcome

    # run_crawler_thread (unchanged from the previous Colab version)
    def run_crawler_thread(self, status_queue: queue.Queue, **kwargs):
        """Thread entry point: drive ``run_crawler`` with ``asyncio.run``.

        Args:
            status_queue: Queue forwarded to ``run_crawler``; also used here
                to report a failure of the ``asyncio.run`` call itself.
            **kwargs: Remaining keyword arguments for ``run_crawler``.

        Any exception escaping ``asyncio.run`` is logged and turned into a
        ``'message'`` update followed by a failed ``'finished'`` update.
        """
        logger.info("Crawler thread started.")
        try:
            logger.debug("Calling asyncio.run(self.run_crawler)...")
            crawl_coroutine = self.run_crawler(status_queue=status_queue, **kwargs)
            asyncio.run(crawl_coroutine)
            logger.debug("asyncio.run completed normally.")
        except Exception as thread_exc:
            failure_text = "Crawling failed critically in thread execution."
            logger.exception(failure_text)
            queue_text = f"{failure_text}: {type(thread_exc).__name__} - See logs."
            try:
                # Send the message first, then a 'finished' signal, so the
                # consumer is released even after a critical error.
                for payload in (
                    {'type': 'message', 'value': queue_text},
                    {'type': 'finished', 'success': False, 'message': queue_text},
                ):
                    status_queue.put(payload)
                logger.info("Sent 'finished' signal after critical thread error.")
            except Exception as queue_exc:
                logger.error(f"Failed to put critical thread error on queue: {queue_exc}")
        finally:
            logger.info("Crawler thread finished.")



Logging configured. Level: DEBUG. File: /content/crew.log


In [3]:
#@title RUN
# --- Cell 4: Colab UI and Execution Logic ---
# (Unchanged from the previous Colab version - the polling loop
# should now receive the 'finished' message correctly)

# --- Create Widgets ---
# Free-text inputs; each one gets its own Layout instance.
sitemap_url_input = widgets.Text(
    value=DEFAULT_SITEMAP_URL,
    description="Sitemap URL:",
    layout=widgets.Layout(width='80%'),
)
output_file_input = widgets.Text(
    value=str(DEFAULT_OUTPUT_FILE),
    description="Output File:",
    layout=widgets.Layout(width='80%'),
)
log_file_input = widgets.Text(
    value=DEFAULT_LOG_FILE,
    description="Log File:",
    layout=widgets.Layout(width='80%'),
)

# Numeric tuning knobs, pre-set to the module defaults.
concurrency_input = widgets.IntSlider(
    value=DEFAULT_CONCURRENCY_LIMIT, min=1, max=20, step=1,
    description='Concurrency:',
)
timeout_input = widgets.IntSlider(
    value=DEFAULT_REQUEST_TIMEOUT, min=5, max=120, step=5,
    description='Timeout (s):',
)
word_threshold_input = widgets.IntSlider(
    value=DEFAULT_WORD_COUNT_THRESHOLD, min=0, max=100, step=5,
    description='Min Words:',
)
pruning_density_input = widgets.FloatSlider(
    value=DEFAULT_PRUNING_DENSITY_THRESHOLD, min=0.0, max=1.0, step=0.05,
    description='Pruning:', readout_format='.2f',
)

# Controls and status area.
crawl_button = widgets.Button(
    description="Start Crawling",
    button_style='success',
    icon='play',
)
progress_bar = widgets.FloatProgress(
    value=0.0, min=0.0, max=1.0,
    description='Progress:', bar_style='info',
)
status_output = widgets.Output()

# --- Display Widgets ---
print("--- Configuration ---")
display(
    sitemap_url_input,
    output_file_input,
    log_file_input,
    widgets.HBox([concurrency_input, timeout_input]),
    widgets.HBox([word_threshold_input, pruning_density_input]),
)
print("\n--- Control & Status ---")
display(crawl_button, progress_bar, status_output)

# --- Crawler Instance ---
crawler_instance = UnifiedWebCrawlerColab()
# Set by the click handler once a crawl has FINISHED (cleared when one starts).
crawl_active = threading.Event()

# --- Button Click Handler ---
def on_crawl_button_clicked(b):
    """Handle a click on the crawl button: start the crawl and poll its status.

    Reads the widget values, reconfigures logging, starts the crawler in a
    daemon thread and then blocks, draining the status queue into the
    progress bar and the status output until a 'finished' update arrives,
    the thread dies, or polling itself fails. Afterwards the result is
    printed, a download of the output file is attempted on success, and the
    button is re-enabled.

    Args:
        b: The clicked button widget (unused).
    """

    def restore_button():
        # Return the button to its idle, clickable state.
        crawl_button.disabled = False
        crawl_button.description = "Start Crawling"
        crawl_button.button_style = 'success'

    def mark_polling_failed():
        # Finish the bar in the error style and flag the crawl as over.
        progress_bar.value = 1.0
        progress_bar.bar_style = 'danger'
        crawl_active.set()

    # Switch the UI into its "running" state.
    crawl_button.disabled = True
    crawl_button.description = "Crawling..."
    crawl_button.button_style = 'warning'
    progress_bar.value = 0.0
    progress_bar.bar_style = 'info'
    crawl_active.clear()

    with status_output:
        clear_output(wait=True)
        print("Initializing...")

    # Snapshot of the user's configuration.
    output_file = Path(output_file_input.value)
    log_file = log_file_input.value

    try:
        setup_logging(log_file)
        logger.info(f"Logging reconfigured. Level: DEBUG. File: {log_file}")
    except Exception as log_e:
        with status_output:
            clear_output(wait=True)
            print(f"❌ Failed to configure logging: {log_e}")
        restore_button()
        return

    status_queue = queue.Queue()
    crawler_thread = threading.Thread(
        target=crawler_instance.run_crawler_thread,
        kwargs={
            "sitemap_url": sitemap_url_input.value,
            "output_file": output_file,
            "concurrency_limit": concurrency_input.value,
            "request_timeout": timeout_input.value,
            "word_count_threshold": word_threshold_input.value,
            "pruning_density_threshold": pruning_density_input.value,
            "status_queue": status_queue,
        },
        daemon=True,
        name="CrawlerThreadColab",
    )
    crawler_thread.start()
    logger.info("Crawler thread started from UI.")

    final_success = False
    final_message = "Polling started..."
    try:
        while not crawl_active.is_set():
            try:
                update = status_queue.get(timeout=0.5)
                update_type = update.get('type')

                if update_type == 'finished':
                    logger.info(f"Polling loop received 'finished'. Success: {update.get('success')}")
                    final_success = update.get('success', False)
                    final_message = update.get('message', final_message)
                    progress_bar.value = 1.0
                    crawl_active.set()
                    break  # Leave the polling loop
                elif update_type == 'message':
                    message = update.get('value', '...')
                    with status_output:
                        clear_output(wait=True)
                        print(message)
                    final_message = message
                elif update_type == 'progress':
                    progress_bar.value = update.get('value', progress_bar.value)

            except queue.Empty:
                # No update within the timeout: only a problem when the
                # worker is gone without having reported 'finished'.
                if not (crawler_thread.is_alive() or crawl_active.is_set()):
                    logger.error("Crawler thread died unexpectedly without sending 'finished'.")
                    final_message = "Error: Crawler thread stopped responding. Check logs."
                    final_success = False
                    mark_polling_failed()
                    break
            except Exception as poll_e:
                logger.error(f"Error during queue polling: {poll_e}", exc_info=True)
                final_message = f"Error processing status: {poll_e}"
                final_success = False
                mark_polling_failed()
                break
    finally:
        # --- Post Crawl Actions ---
        with status_output:
            clear_output(wait=True)
            if not final_success:
                print(f"❌ {final_message}")
                progress_bar.bar_style = 'danger'
            else:
                print(f"✅ {final_message}")
                progress_bar.bar_style = 'success'
                # Offer the download only for an existing, non-empty file.
                if output_file.exists() and output_file.stat().st_size > 0:
                    print("\nAttempting to provide download link...")
                    try:
                        files.download(str(output_file))
                    except Exception as dl_e:
                        print(f"Could not automatically trigger download: {dl_e}")
                        print(f"You can download manually: {output_file}")
                elif not output_file.exists():
                    print(f"\nNote: Output file was not created ({output_file}).")
            print(f"\nLog file located at: {log_file}")

        restore_button()
        logger.info("Crawling process finished from UI perspective.")

# --- Attach Handler ---
# Register the callback; it runs each time the "Start Crawling" button is clicked.
crawl_button.on_click(on_crawl_button_clicked)

--- Configuration ---


Text(value='https://docs.crewai.com/sitemap.xml', description='Sitemap URL:', layout=Layout(width='80%'))

Text(value='/content/crew_docs.md', description='Output File:', layout=Layout(width='80%'))

Text(value='/content/crew.log', description='Log File:', layout=Layout(width='80%'))

HBox(children=(IntSlider(value=5, description='Concurrency:', max=20, min=1), IntSlider(value=45, description=…

HBox(children=(IntSlider(value=10, description='Min Words:', step=5), FloatSlider(value=0.45, description='Pru…


--- Control & Status ---


Button(button_style='success', description='Start Crawling', icon='play', style=ButtonStyle())

FloatProgress(value=0.0, bar_style='info', description='Progress:', max=1.0)

Output()

[INIT].... → Crawl4AI 0.5.0.post8
[FETCH]... ↓ https://docs.crewai.com/tools/browserbaseloadtool... | Status: True | Time: 17.87s
[SCRAPE].. ◆ https://docs.crewai.com/tools/browserbaseloadtool... | Time: 0.235s
[COMPLETE] ● https://docs.crewai.com/tools/browserbaseloadtool... | Status: True | Total: 18.13s
[FETCH]... ↓ https://docs.crewai.com/how-to/custom-manager-agen... | Status: True | Time: 18.72s
[SCRAPE].. ◆ https://docs.crewai.com/how-to/custom-manager-agen... | Time: 0.577s
[COMPLETE] ● https://docs.crewai.com/how-to/custom-manager-agen... | Status: True | Total: 19.33s
[SCRAPE].. ◆ https://docs.crewai.com/how-to/customizing-agents... | Time: 0.588s
[COMPLETE] ● https://docs.crewai.com/how-to/customizing-agents... | Status: True | Total: 19.99s
[FETCH]... ↓ https://docs.crewai.com/concepts/llamaindex-tools... | Status: True | Time: 20.05s
[SCRAPE].. ◆ https://docs.crewai.com/concepts/llamaindex-tools... | Time: 0.517s
[COMPLETE] ● https://docs.crewai.com/concepts/llamaindex-too