In [22]:
# Install crawl4ai, the package the crawler cell imports
# (AsyncWebCrawler, JsonCssExtractionStrategy, BrowserConfig, CrawlerRunConfig).
!pip install crawl4ai



In [23]:
# Refresh the apt package index before installing system packages.
!apt update
# System utilities plus the distro's Chromium/chromedriver package.
!apt install -y wget gnupg
!apt install -y chromium-chromedriver
# nest_asyncio is applied in the crawler cell; playwright is presumably the
# browser backend crawl4ai drives -- TODO confirm against crawl4ai's docs.
!pip install nest_asyncio playwright
# Fetch the Chromium build that Playwright manages.
!playwright install chromium


Hit:1 http://security.ubuntu.com/ubuntu jammy-security InRelease
Get:2 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,632 B]
Hit:3 http://archive.ubuntu.com/ubuntu jammy InRelease
Hit:4 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease
Hit:5 https://r2u.stat.illinois.edu/ubuntu jammy InRelease
Hit:6 http://archive.ubuntu.com/ubuntu jammy-updates InRelease
Hit:7 http://archive.ubuntu.com/ubuntu jammy-backports InRelease
Hit:8 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease
Hit:9 https://ppa.launchpadcontent.net/graphics-drivers/ppa/ubuntu jammy InRelease
Hit:10 https://ppa.launchpadcontent.net/ubuntugis/ppa/ubuntu jammy InRelease
Fetched 3,632 B in 2s (1,718 B/s)
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
34 packages can be upgraded. Run 'apt list --upgradable' to see them.
[1;33mW: [0mSkipping acquire of configured file 'main/source/Sources' as rep

In [None]:
import asyncio, json, re
import nest_asyncio; nest_asyncio.apply()

from crawl4ai import AsyncWebCrawler
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig

def parse_specs(specs_str):
    """Split a free-text laptop spec string into labelled fields.

    The text is split on bullet characters ("•") and commas. Each trimmed
    fragment is tested against a fixed sequence of regex rules and stored
    under the first rule that matches; when several fragments match the same
    rule, the last one wins.

    Args:
        specs_str: Raw spec text. ``None`` or an empty string is accepted and
            yields a result in which every field is ``None``.

    Returns:
        dict with the keys "Inches", "ScreenResolution", "Cpu", "Ram",
        "Memory", "Gpu" and "OpSys"; each value is the matching fragment
        (stripped of surrounding whitespace) or ``None`` if nothing matched.
    """
    out = {"Inches": None, "ScreenResolution": None, "Cpu": None,
           "Ram": None, "Memory": None, "Gpu": None, "OpSys": None}

    # A missing/None value would make re.split raise TypeError.
    if not specs_str:
        return out

    for part in (p.strip() for p in re.split(r'•|,', specs_str)):
        # Vendor names also appear in GPU fragments ("Intel UHD Graphics",
        # "AMD Radeon Graphics"); such fragments must not be filed as the CPU,
        # otherwise they overwrite the real CPU and "Gpu" stays empty.
        mentions_gpu = re.search(r'Graphics|GPU', part) is not None

        if re.search(r'\d+(\.\d+)?\"', part):
            out["Inches"] = part
        elif re.search(r'\d{3,4}x\d{3,4}', part):
            out["ScreenResolution"] = part
        elif re.search(r'Intel|AMD', part) and not mentions_gpu:
            out["Cpu"] = part
        elif re.search(r'GB RAM|GB DDR', part):
            out["Ram"] = part
        elif re.search(r'SSD|HDD', part):
            out["Memory"] = part
        elif mentions_gpu:
            out["Gpu"] = part
        elif re.search(r'Windows|DOS|Ubuntu', part):
            out["OpSys"] = part
    return out

async def main():
    """Crawl the Amazon India laptop search page and parse each listing's specs.

    Runs a headless crawl of ``https://www.amazon.in/s?k=laptops``, extracts
    the title, spec line, weight line and price of every search-result card
    via CSS selectors, and feeds each card's "Specs" text through
    ``parse_specs``.

    Returns:
        list[dict]: one record per extracted listing, containing the raw
        extracted fields merged with the fields returned by ``parse_specs``.
        An empty list is returned (after printing the error) when the crawl
        reports failure. Previously the parsed specs were computed and then
        discarded, so the function produced no usable result.
    """
    # CSS extraction schema: each search-result card is one item.
    schema = {
        "name": "Amazon Laptop Specs",
        "baseSelector": "[data-component-type='s-search-result']",
        "fields": [
            {"name": "ListingTitle", "selector": "h2 a span.a-text-normal", "type": "text"},
            {"name": "Specs",        "selector": ".a-size-base-plus.s-line-clamp-2", "type": "text"},
            {"name": "Weight",       "selector": ".a-size-base.s-line-clamp-2",      "type": "text"},
            {"name": "Price",        "selector": ".a-price .a-offscreen",             "type": "text"}
        ]
    }

    extractor = JsonCssExtractionStrategy(schema=schema)

    browser_conf = BrowserConfig(headless=True)
    run_conf = CrawlerRunConfig(
        # Same selector as the schema's baseSelector.
        wait_for="css:[data-component-type='s-search-result']",
        extraction_strategy=extractor,
        scan_full_page=True,
        scroll_delay=1.0,
        delay_before_return_html=2.0
    )

    url = "https://www.amazon.in/s?k=laptops"
    async with AsyncWebCrawler(config=browser_conf) as crawler:
        result = await crawler.arun(url=url, config=run_conf)
        if not result.success:
            print("❌ Crawl failed:", result.error_message)
            return []

        items = json.loads(result.extracted_content or "[]")

    records = []
    for item in items:
        # "Specs" may be absent or null for a card; normalise both to "".
        specs = parse_specs(item.get("Specs") or "")
        records.append({**item, **specs})
    return records

# Entry point: run the crawl coroutine to completion. The value returned by
# main() is not used here.
# NOTE(review): nest_asyncio.apply() in the import section is presumably what
# lets asyncio.run() be called from a notebook whose event loop is already
# running -- confirm before removing it.
if __name__ == "__main__":
    asyncio.run(main())
