# Website Crawling and Data Extraction

In this section, we use `crawl4ai` to crawl one or more websites. The extracted text is saved as `.md` files in a folder (e.g., `./crawled_data`).

You can enter the URLs interactively using the provided widget.


In [1]:
# nest_asyncio is applied so asyncio code can run inside Jupyter's already-running event loop (comment out the next two lines if you do not need it):
import nest_asyncio
nest_asyncio.apply()

# Imports used for Crawl4AI part
import asyncio
import threading
import ipywidgets as widgets
from IPython.display import display
from typing import Set, Any, List
import os
import json
import hashlib
from urllib.parse import urlparse
from urllib.parse import urlparse, urljoin

# Import Crawl4AI classes based on the documentation
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode

In [2]:
# --- Set Windows Proactor Event Loop (for Windows users) ---
# On Windows, switch asyncio to the Proactor event loop policy; every other
# platform is left untouched.
# NOTE(review): presumably needed because the crawler launches a browser
# subprocess, which requires the Proactor loop on Windows — confirm.
import sys
if sys.platform == "win32":
    asyncio.set_event_loop_policy(asyncio.WindowsProactorEventLoopPolicy())

### Run the following cell and then use the widget to set the URLs for the crawler

In [None]:
# --- URL Input Widget ---
# Multi-line text box pre-filled with two example start URLs, one per line.
# A later cell reads `url_input.value`, so edit the box before running that cell.
url_input = widgets.Textarea(
    value='https://crawl4ai.com/mkdocs/\nhttps://stable-learn.com/en/lightrag-introduction/',
    placeholder='Enter one URL per line',
    description='URLs:',
    layout=widgets.Layout(width='100%', height='80px')
)
display(url_input)  # render the widget in the cell output

In [4]:
def get_safe_filename(url: str, extension: str = ".md") -> str:
    """
    Generate a safe and unique filename from a URL.

    The filename is built from the domain, path, and (if present) fragment of
    the URL. Every character that is not alphanumeric, '.', '_' or '-' is
    replaced with '_'. The readable part is capped at 200 UTF-8 bytes so that
    very long URLs still yield a usable filename, and a short hash of the full
    URL is appended to keep names unique (the query string only influences
    the hash).

    Examples:
      - "https://crawl4ai.com/" becomes "crawl4ai.com_index_<hash>.md"
      - "https://crawl4ai.com/#quick-start" becomes
        "crawl4ai.com_index_quick-start_<hash>.md"

    Params:
        url: The URL to generate a filename for.
        extension: The file extension to use (default: ".md").

    Returns:
        A safe and unique filename based on the URL.
    """
    # Many filesystems reject names longer than 255 bytes; leave room for the
    # "_<hash>" suffix and the extension.
    max_base_bytes = 200

    parsed = urlparse(url)

    # Default to "index" when the URL has no path (e.g. a bare domain).
    path = parsed.path.strip("/") or "index"

    parts = [parsed.netloc, path]
    fragment = parsed.fragment.strip()
    if fragment:
        parts.append(fragment)

    # One sanitizing pass over the joined parts; this also turns the ':' of a
    # "host:port" netloc and the '/' separators of the path into underscores.
    safe_base = "".join(
        c if c.isalnum() or c in "._-" else "_" for c in "_".join(parts)
    )

    # Truncate by encoded size (non-ASCII alphanumerics take several bytes);
    # errors="ignore" drops a character that the cut split in half.
    encoded_base = safe_base.encode("utf-8")
    if len(encoded_base) > max_base_bytes:
        safe_base = encoded_base[:max_base_bytes].decode("utf-8", errors="ignore")

    # Short hash of the full URL: used only to disambiguate names, not for security.
    hash_digest = hashlib.md5(url.encode("utf-8")).hexdigest()[:8]

    return f"{safe_base}_{hash_digest}{extension}"

In [5]:
def normalize_content(content: str) -> str:
    """
    Unwrap content that is enclosed in a triple-backtick code fence.

    When the whitespace-trimmed content both starts and ends with ```,
    the first line (the opening fence plus any language hint such as "json")
    and the last line (the closing fence) are dropped, and the remaining lines
    are joined with newlines and trimmed. Fenced content that sits on a single
    line therefore normalizes to an empty string.

    Params:
        - content: The content to normalize

    Returns:
        - The unwrapped content, or the original content untouched when it is
          not wrapped in a fence
    """
    fence = "```"
    trimmed = content.strip()
    if not (trimmed.startswith(fence) and trimmed.endswith(fence)):
        return content

    # The first line always carries the opening fence and the last line always
    # carries the closing one, so only the lines in between are kept.
    inner_lines = trimmed.splitlines()[1:-1]
    return "\n".join(inner_lines).strip()

def is_not_found_content(content: str) -> bool:
    """
    Return True if content can be parsed as JSON and equals {"detail": "Not Found"}
    (ignoring a wrapping code fence, surrounding whitespace, and letter case
    in the "detail" value).

    Missing input (None, an empty string, or a non-string value) is treated as
    "not a Not Found payload" instead of raising.

    Params:
        - content: The content to check

    Returns:
        - True if the content matches the expected pattern, False otherwise
    """
    # Nothing to parse; callers may hand over None when a page had no content.
    if not isinstance(content, str) or not content:
        return False

    try:
        data = json.loads(normalize_content(content))
    except (ValueError, RecursionError):
        # Not JSON at all (json.JSONDecodeError is a ValueError) or nested too
        # deeply to parse: ordinary page content, not an error payload.
        return False

    if not isinstance(data, dict):
        return False

    # "detail" may hold a non-string value in other error shapes; only a
    # string can match.
    detail = data.get("detail", "")
    return isinstance(detail, str) and detail.strip().lower() == "not found"

In [6]:
def sanitize_href(href: str) -> str:
    """
    Trim angle brackets from both ends of an href.

    Only leading and trailing '<' / '>' characters are removed; brackets in
    the middle of the string are kept.

    Params:
        - href: the URL to sanitize

    Returns:
        - the sanitized URL
    """
    without_leading = href.lstrip("<>")
    return without_leading.rstrip("<>")

def resolve_relative_url(href: str, default_base: str) -> str:
    """
    Resolve a relative href using a default base URL.
    If href is already an absolute http(s) URL, it is returned unchanged
    (apart from the angle-bracket cleanup done by sanitize_href).

    Params:
        - href:         the URL to resolve
        - default_base: the default base URL to use if href is relative

    Returns:
        - the resolved URL
    """
    clean_href = sanitize_href(href)
    # Match the full "scheme://" prefix, case-insensitively. A bare "http"
    # prefix check would mistake relative paths such as "http-guide.html"
    # for absolute URLs and return them unresolved.
    if clean_href.lower().startswith(("http://", "https://")):
        return clean_href
    return urljoin(default_base, clean_href)

In [None]:
# --- Convert Widget Content to a URL List ---
# One URL per line: trim surrounding whitespace and drop blank lines so stray
# empty rows in the text box are not passed to the crawler as URLs.
urls = [line.strip() for line in url_input.value.splitlines() if line.strip()]
print("List of URLs to crawl:", urls)

# --- Set Up Browser Configuration ---
# Headless Chromium with a 1280x720 viewport; this object is handed to
# AsyncWebCrawler in crawl_main().
browser_cfg = BrowserConfig(
    browser_type="chromium",
    headless=True,
    verbose=True,
    viewport_width=1280,
    viewport_height=720,
)

# CrawlerRunConfig is set up to:
# - Exclude external links (only internal links remain in result.links)
# This object is passed to every crawler.arun() call in crawl_recursive().
run_cfg = CrawlerRunConfig(
    cache_mode=CacheMode.BYPASS,    # NOTE(review): presumably forces a fresh fetch on every run — confirm
    word_count_threshold=15,        # minimum word count for content
    exclude_external_links=True,
    stream=True,  # For efficient processing when using arun_many()/arun()
)

# --- Create output folder for crawled data ---
# crawl_recursive() writes its files here; exist_ok=True makes re-running the cell safe.
output_folder = "crawled_data"
os.makedirs(output_folder, exist_ok=True)

# --- Define the Recursive Crawling Function ---
async def crawl_recursive(
    crawler: AsyncWebCrawler,
    url: str,
    visited: Set[str],
    depth: int,
    max_depth: int
) -> None:
    """
    Recursively crawl a given URL using an AsyncWebCrawler, save its output, and follow internal links.

    The page is saved into the module-level `output_folder` as Markdown when
    the result carries markdown, otherwise as cleaned HTML. Content that is
    just a JSON {"detail": "Not Found"} payload is not saved. Crawling uses
    the module-level `run_cfg`.

    Params:
        crawler (AsyncWebCrawler):  The crawler instance used to perform the web crawl.
        url (str):                  The URL to crawl.
        visited (Set[str]):         A set of URLs that have already been visited to avoid duplicate processing.
                                    It is updated in place.
        depth (int):                The current recursion depth.
        max_depth (int):            The maximum allowed recursion depth.

    Returns:
        None
    """
    if depth > max_depth or url in visited:
        return
    visited.add(url)

    # Perform the crawl for a single URL
    result = await crawler.arun(url, config=run_cfg)
    if not result.success:
        print(f"Failed to crawl URL: {url}")
        print(f"Error: {result.error_message}")
        return

    print(f"Crawled URL: {result.url} at depth {depth}")

    # Prefer markdown output: either a plain string or an object exposing
    # `raw_markdown`.
    md_content = None
    markdown = result.markdown
    if markdown:
        if isinstance(markdown, str):
            md_content = markdown
        elif hasattr(markdown, "raw_markdown"):
            md_content = markdown.raw_markdown
    else:
        # Read the legacy field defensively so a result object without a
        # usable `markdown_v2` falls through to the HTML fallback instead of
        # raising. NOTE(review): some crawl4ai releases appear to have
        # removed `markdown_v2` — confirm against the installed version.
        markdown_v2 = getattr(result, "markdown_v2", None)
        if markdown_v2 and hasattr(markdown_v2, "raw_markdown"):
            md_content = markdown_v2.raw_markdown

    if md_content:
        content, extension, label = md_content, ".md", "markdown"
    else:
        # Fallback to cleaned HTML if no markdown is available.
        content, extension, label = result.cleaned_html or "", ".html", "cleaned HTML"

    # Check the content that would actually be written (in the HTML fallback
    # the markdown is empty, so it must not be the value that is checked).
    if is_not_found_content(content):
        print(f"Skipping saving {label} for {url} because it indicates Not Found.")
    else:
        filename = get_safe_filename(url, extension=extension)
        file_path = os.path.join(output_folder, filename)
        with open(file_path, "w", encoding="utf-8") as f:
            f.write(content)
        print(f"Saved {label} to {file_path}")

    # Children would be at depth + 1 and return immediately once that exceeds
    # max_depth, so skip resolving links altogether in that case.
    if depth >= max_depth:
        return

    # According to the API, result.links is a dictionary that may contain an "internal" key.
    internal_links = (result.links or {}).get("internal", [])
    current_netloc = urlparse(url).netloc
    for link_info in internal_links:
        href = link_info.get("href")
        if not href:
            continue
        # Convert relative URLs to absolute using the current URL as base
        absolute_url = resolve_relative_url(href, url)
        # Only follow links that stay on the same host as the current page
        if urlparse(absolute_url).netloc == current_netloc:
            await crawl_recursive(crawler, absolute_url, visited, depth + 1, max_depth)

# --- Main Recursive Crawling Function ---
async def crawl_main(start_urls: List[str], max_depth: int = 2) -> None:
    """
    Entry point of the crawl: open one crawler and walk each start URL in turn.

    A single AsyncWebCrawler (configured with the module-level `browser_cfg`)
    and a single set of visited URLs are shared across all start URLs, so a
    page reached from one start URL is not crawled again from another.

    Params:
        start_urls (List[str]): List of URLs to begin crawling.
        max_depth (int):        Maximum allowed recursion depth.

    Returns:
        None
    """
    seen: Set[str] = set()
    async with AsyncWebCrawler(config=browser_cfg) as crawler:
        # Start URLs are crawled one after another, each beginning at depth 0.
        for start_url in start_urls:
            await crawl_recursive(
                crawler,
                start_url,
                seen,
                depth=0,
                max_depth=max_depth,
            )

In [None]:
def run_crawl_in_thread() -> None:
    """
    Run the recursive crawl in a separate thread.

    Creates a fresh event loop for the calling thread, runs
    crawl_main(urls, max_depth=1) on it to completion, and always closes the
    loop afterwards.

    Returns:
        None
    """
    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)
    try:
        loop.run_until_complete(crawl_main(urls, max_depth=1))
    finally:
        # Close the loop even when the crawl raises, so it is not leaked.
        loop.close()

# Run the crawl on a worker thread and block this cell until it has finished.
# NOTE(review): presumably a thread is used so the crawl gets its own event
# loop instead of the notebook's already-running one — confirm.
t = threading.Thread(target=run_crawl_in_thread)
t.start()
t.join()

# **LightRAG**

Set up the Ollama embeddings and the parameters for LightRAG.

In [5]:
import os
import logging
import glob
import numpy as np
from lightrag import LightRAG, QueryParam
from lightrag.llm.ollama import ollama_model_complete, ollama_embedding
from lightrag.utils import EmbeddingFunc

In [6]:
# Define a wrapper for the embedding function that converts output to np.float32
async def my_embedding_func(texts: list[str]) -> np.ndarray:
    """
    Embed texts through ollama_embedding and return them as a float32 array.

    The request uses the "nomic-embed-text" model on the Ollama service at
    http://localhost:11434.

    Params:
        texts (list[str]): List of input texts to embed.

    Returns:
        np.ndarray: Embeddings as a float32 NumPy array.
    """
    ollama_kwargs = {
        "embed_model": "nomic-embed-text",
        "host": "http://localhost:11434",
    }
    raw_vectors = await ollama_embedding(texts, **ollama_kwargs)
    # Convert whatever ollama_embedding returned into a float32 NumPy array.
    return np.array(raw_vectors, dtype=np.float32)


In [None]:
# Set logging level to see info messages
logging.basicConfig(format="%(levelname)s:%(message)s", level=logging.INFO)

# Create a working directory where LightRAG will store its cache and index data
WORKING_DIR = "./lightRAG_db"
os.makedirs(WORKING_DIR, exist_ok=True)

# Initialize LightRAG with the Ollama model for completion and embedding,
# and use FAISS as the vector storage backend.
rag = LightRAG(
    working_dir=WORKING_DIR,
        addon_params={
        "insert_batch_size": 1             # Process 1 document per batch
    },
    llm_model_func=ollama_model_complete,  # Function for generating completions via Ollama
    llm_model_name="deepseek-r1:1.5b",     # Ollama model tag to use (DeepSeek-R1 1.5B here)
    llm_model_max_async=1,                 # Maximum concurrent requests to the LLM service
    llm_model_max_token_size=32768,        # LLM must support at least 32k tokens of context
    llm_model_kwargs={
        "host": "http://localhost:11434",       # Ollama service address
        # NOTE(review): reasoning_tag is nested under "options" here, while the
        # note at the end of this cell says to pass it in llm_model_kwargs —
        # confirm which level LightRAG actually reads.
        "options": {"num_ctx": 32768,           # Additional options; set context window size
                    "reasoning_tag": "think"},  # For DeepSeek models, set reasoning_tag to "think"
    },
    embedding_func=EmbeddingFunc(
        embedding_dim=768,                # Dimension of the embedding vectors
        max_token_size=8192,              # Maximum token size for embedding generation
        func=my_embedding_func,           # Function to generate embeddings
    ),
    vector_storage="FaissVectorDBStorage",  # Use FAISS for vector storage (use faiss-gpu if available)
    vector_db_storage_cls_kwargs={
        "cosine_better_than_threshold": 0.3  # Threshold for retrieval similarity; adjust as needed
    },
    chunk_token_size=1000,
    chunk_overlap_token_size=100,
)
# Interesting models for my use case (GPU with 3 GB of memory):
# qwen2.5:1.5b, granite3.1-moe:1b

# Rule of thumb: double the model's parameter count to estimate the GPU memory requirement: 1.5b -> 3 GB, 7b -> 14 GB
# Don't forget to leave headroom for the context that goes into the model

# deepseek-r1:1.5b - To return only the model's response, you can pass reasoning_tag in llm_model_kwargs.
# For example, for DeepSeek models, reasoning_tag should be set to "think"

In [None]:
# --- Load Crawled Documents ---
data_folder = "crawled_data"
md_files = glob.glob(os.path.join(data_folder, "*.md"))
documents = []
for file in md_files:
    with open(file, "r", encoding="utf-8") as f:
        text = f.read().strip()
        # Optionally, skip documents that are empty
        if text:
            documents.append(text)
print(f"Loaded {len(documents)} documents from {data_folder}")

# --- Insert the Crawled Documents into LightRAG Asynchronously ---
await rag.ainsert(documents)
print("Documents inserted into LightRAG.")

In [None]:
# --- Querying the LightRAG System ---
# Define a query and test different retrieval modes: "naive", "local", "global", and "hybrid".
# The same question is sent once per mode so the answers can be compared side by side.
query = "Please explain why Crawl4AI uses markdown for its output."
modes = ["naive", "local", "global", "hybrid"]

for mode in modes:
    print(f"\nResults using {mode} mode:")
    # NOTE(review): rag.query is called synchronously from a notebook cell;
    # presumably this relies on nest_asyncio.apply() from the first cell — confirm.
    result = rag.query(query, param=QueryParam(mode=mode))
    print(result)