In [1]:
import asyncio
from crawl4ai import (
    LLMExtractionStrategy,
    CrawlerRunConfig,
    AsyncWebCrawler,
    BrowserConfig,
    CacheMode,
    CrawlResult,
)
from pydantic import BaseModel

In [2]:
class ApartmentExtractionSchema(BaseModel):
    address: str
    price: str
    bedrooms: int
    bathrooms: int
    square_feet: int
    description: str
    amenities: list[str]
    # images: list[str] # TODO: Can the LLM handle URls directly?
    # url: str
    # source: str

In [3]:
async def main():
    llm_strat = LLMExtractionStrategy(
        provider="ollama/deepseek-r1:70b",
        schema=ApartmentExtractionSchema,
        extraction_type="schema",
        instruction="Extract entities from the following apartment listing page. Return the entities in a JSON format.",
        chunk_token_threshold=800,
        apply_chunking=True,
        input_format="html",
    )

    crawl_config = CrawlerRunConfig(
        extraction_strategy=llm_strat,
        cache_mode=CacheMode.BYPASS,
    )

    async with AsyncWebCrawler(config=BrowserConfig(headless=False)) as crawler:
        res = await crawler.arun(
            url="https://www.apartments.com/new-york-ny/",
            config=crawl_config,
        )

        if isinstance(res, CrawlResult) and res.success:
            with open("res.json", "w", encoding="utf-8") as f:
                f.write(res.extracted_content)
        else:
            raise ValueError("Failed to crawl the page", res.error_message)

In [4]:
asyncio.run(main())

RuntimeError: asyncio.run() cannot be called from a running event loop