In [None]:
# Install crawl4ai directly from its GitHub repository (no tag or commit is pinned).
!pip install "crawl4ai @ git+https://github.com/unclecode/crawl4ai.git"


Collecting crawl4ai@ git+https://github.com/unclecode/crawl4ai.git
  Cloning https://github.com/unclecode/crawl4ai.git to /tmp/pip-install-v9etbdej/crawl4ai_19af2dc538ea4069af9b721487f39104
  Running command git clone --filter=blob:none --quiet https://github.com/unclecode/crawl4ai.git /tmp/pip-install-v9etbdej/crawl4ai_19af2dc538ea4069af9b721487f39104
  Resolved https://github.com/unclecode/crawl4ai.git to commit 0afc3e9e5e38b09d0995042ecaa9c77de66842e1
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting aiosqlite~=0.20 (from crawl4ai@ git+https://github.com/unclecode/crawl4ai.git)
  Downloading aiosqlite-0.20.0-py3-none-any.whl.metadata (4.3 kB)
Collecting litellm>=1.53.1 (from crawl4ai@ git+https://github.com/unclecode/crawl4ai.git)
  Downloading litellm-1.59.6-py3-none-any.whl.metadata (36 kB)
Collecting pillow~=10.4 (from crawl4ai@ git+https://github.co

In [None]:
# Install crawl4ai and its dependencies (if not already installed).
# NOTE(review): the first cell already installs crawl4ai from GitHub, so this
# PyPI install is presumably a no-op after it — confirm which source is intended.
!pip install crawl4ai
!playwright install  # Required for browser automation
!pip install nest_asyncio  # To support asyncio in environments like Colab or Jupyter

# Import nest_asyncio and apply it to allow asyncio in Colab/Jupyter environments;
# the scraping cell below calls asyncio.run(main()) and depends on this patch.
import nest_asyncio
nest_asyncio.apply()

print('Setup complete!')

Downloading Chromium 131.0.6778.33 (playwright build v1148)[2m from https://playwright.azureedge.net/builds/chromium/1148/chromium-linux.zip[22m
[1G161.3 MiB [] 0% 10.7s[0K[1G161.3 MiB [] 0% 35.3s[0K[1G161.3 MiB [] 0% 65.7s[0K[1G161.3 MiB [] 0% 14.6s[0K[1G161.3 MiB [] 0% 8.3s[0K[1G161.3 MiB [] 1% 6.0s[0K[1G161.3 MiB [] 2% 4.9s[0K[1G161.3 MiB [] 2% 4.1s[0K[1G161.3 MiB [] 3% 3.8s[0K[1G161.3 MiB [] 4% 3.4s[0K[1G161.3 MiB [] 5% 3.1s[0K[1G161.3 MiB [] 5% 3.0s[0K[1G161.3 MiB [] 6% 2.9s[0K[1G161.3 MiB [] 6% 3.1s[0K[1G161.3 MiB [] 7% 3.1s[0K[1G161.3 MiB [] 8% 3.0s[0K[1G161.3 MiB [] 8% 2.9s[0K[1G161.3 MiB [] 9% 2.8s[0K[1G161.3 MiB [] 9% 2.9s[0K[1G161.3 MiB [] 10% 2.8s[0K[1G161.3 MiB [] 11% 2.8s[0K[1G161.3 MiB [] 11% 2.7s[0K[1G161.3 MiB [] 12% 2.7s[0K[1G161.3 MiB [] 13% 2.6s[0K[1G161.3 MiB [] 14% 2.5s[0K[1G161.3 MiB [] 15% 2.5s[0K[1G161.3 MiB [] 16% 2.4s[0K[1G161.3 MiB [] 17% 2.3s[0K[1G161.3 MiB [] 18% 2.2s[0K[1G161.3 MiB [] 19% 2.2s[

TEXT SCRAPING USING CRAWL4AI


In [None]:

import os
import requests
import asyncio
import json
from urllib.parse import urljoin
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy


# Directory (relative to the working directory) that downloaded images are written to.
IMAGE_DIR = "images"

# exist_ok=True creates the directory only when it is missing, without a separate
# existence check that another process could race against. Unlike the check-then-create
# form, this raises FileExistsError if the path exists but is not a directory.
os.makedirs(IMAGE_DIR, exist_ok=True)

def save_image(image_url, index, base_url, timeout=30):
    """Download one image and store it as ``<IMAGE_DIR>/<index>.jpg``.

    Args:
        image_url: Absolute or relative URL of the image.
        index: Value used as the file name stem.
        base_url: URL that a relative ``image_url`` is resolved against.
        timeout: Seconds to wait for the server before giving up on the request.

    Returns:
        The path of the saved file, or ``None`` when the URL is empty or the
        download or the file write fails (a message is printed in that case).
    """
    if not image_url:
        # Report a missing URL explicitly instead of letting requests reject it.
        print("Failed to download image: no image URL provided")
        return None

    try:
        if not image_url.startswith("http"):
            image_url = urljoin(base_url, image_url)
        # Without a timeout a single stalled connection would block the whole crawl.
        response = requests.get(image_url, timeout=timeout)
        response.raise_for_status()
        file_path = os.path.join(IMAGE_DIR, f"{index}.jpg")
        with open(file_path, 'wb') as f:
            f.write(response.content)
    except (requests.exceptions.RequestException, OSError) as e:
        # OSError covers failures while writing the file (e.g. missing directory).
        print(f"Failed to download image: {e}")
        return None

    print(f"Image saved to {file_path}")
    return file_path

def save_json(data, filename="books_details.json"):
    """Serialize ``data`` to ``filename`` as UTF-8 JSON and report the target path.

    The output is indented by four spaces and non-ASCII characters are written
    as-is rather than as ``\\uXXXX`` escapes.
    """
    with open(filename, mode='w', encoding='utf-8') as out:
        json.dump(data, out, ensure_ascii=False, indent=4)
    print(f"Data saved to {filename}")

async def main(max_pages=4):
    """Scrape book listings from books.toscrape.com and save them as JSON.

    Crawls catalogue pages ``1..max_pages``, extracts one record per
    ``article.product_pod`` element using the CSS schema below, downloads each
    record's image with ``save_image`` and finally writes every collected
    record with ``save_json``.

    Crawling stops early when a page fails to load or yields no records; the
    records gathered up to that point are still saved.

    Args:
        max_pages: Highest catalogue page number to crawl (inclusive).
    """
    schema = {
        "name": "Books",
        "baseSelector": "article.product_pod",
        "fields": [
            {
                "name": "title",
                "selector": "h3 > a",
                "type": "attribute",
                "attribute": "title"
            },
            {
                "name": "price",
                "selector": "p.price_color",
                "type": "text"
            },
            {
                "name": "availability",
                "selector": "p.instock.availability",
                "type": "text"
            },
            {
                # Stores the raw class attribute of a <p> element.
                # NOTE(review): presumably the star-rating paragraph — confirm.
                "name": "rating",
                "selector": "p",
                "type": "attribute",
                "attribute": "class"
            },
            {
                "name": "image",
                "selector": "img.thumbnail",
                "type": "attribute",
                "attribute": "src"
            },
            {
                "name": "url",
                "selector": "h3 > a",
                "type": "attribute",
                "attribute": "href"
            }
        ]
    }

    page_url_template = "http://books.toscrape.com/catalogue/page-{}.html"
    all_data = []

    # The run configuration is the same for every page, so build it once
    # instead of once per loop iteration.
    run_config = CrawlerRunConfig(
        cache_mode=CacheMode.BYPASS,
        extraction_strategy=JsonCssExtractionStrategy(schema),
        excluded_tags=["form", "header", "footer"],
        wait_for_images=True,
        scan_full_page=True
    )

    async with AsyncWebCrawler() as crawler:
        for current_page in range(1, max_pages + 1):
            url = page_url_template.format(current_page)  # Add page number to URL
            print(f"Scraping page {current_page}: {url}")
            result = await crawler.arun(url=url, config=run_config)

            if not result.success:
                print(f"Failed to crawl the page: {result.error_message}")
                break

            # Treat missing extracted content like an empty page rather than
            # letting json.loads(None) raise TypeError.
            data = json.loads(result.extracted_content or "[]")
            if not data:
                print("No data found on this page. Stopping.")
                break

            # Continue numbering after the records collected so far so that
            # image file names stay unique across pages.
            for index, book in enumerate(data, len(all_data) + 1):
                image_url = book.get("image")
                if image_url:
                    # Resolve relative links against the real page URL, not
                    # the unformatted "page-{}.html" template.
                    image_path = save_image(image_url, index, url)
                    if image_path:
                        # Replace the remote URL with the local file path.
                        book["image"] = image_path
                if "url" in book:
                    book["url"] = urljoin(url, book["url"])

            all_data.extend(data)
        save_json(all_data)

# Run the scraper. In a notebook this call depends on the nest_asyncio.apply()
# patch from the setup cell.
# NOTE(review): presumably because Jupyter/Colab already run an event loop — confirm.
if __name__ == "__main__":
    asyncio.run(main())


[INIT].... → Crawl4AI 0.4.3b2
Scraping page 1: http://books.toscrape.com/catalogue/page-1.html
[FETCH]... ↓ http://books.toscrape.com/catalogue/page-1.html... | Status: True | Time: 3.55s
[SCRAPE].. ◆ Processed http://books.toscrape.com/catalogue/page-1.html... | Time: 89ms
[EXTRACT]. ■ Completed for http://books.toscrape.com/catalogue/page-1.html... | Time: 0.08097539100026552s
[COMPLETE] ● http://books.toscrape.com/catalogue/page-1.html... | Status: True | Total: 4.12s
Image saved to images/1.jpg
Image saved to images/2.jpg
Image saved to images/3.jpg
Image saved to images/4.jpg
Image saved to images/5.jpg
Image saved to images/6.jpg
Image saved to images/7.jpg
Image saved to images/8.jpg
Image saved to images/9.jpg
Image saved to images/10.jpg
Image saved to images/11.jpg
Image saved to images/12.jpg
Image saved to images/13.jpg
Image saved to images/14.jpg
Image saved to images/15.jpg
Image saved to images/16.jpg
Image saved to images/17.jpg
Image saved to images/18.jpg
Image saved

In [None]:
import csv
import json

# Function to convert JSON data to CSV
def json_to_csv(json_data, csv_filename="books_details.csv"):
    """Write a list of dict records to a CSV file.

    The header row is the union of all record keys in first-seen order, so
    records do not need identical keys; a key missing from a record is written
    as an empty cell.

    Args:
        json_data: List of dicts, one per CSV row.
        csv_filename: Path of the CSV file to create or overwrite.

    Returns:
        None. When ``json_data`` is empty nothing is written and a message is
        printed instead of raising ``IndexError``.
    """
    if not json_data:
        print(f"No data to convert; {csv_filename} was not written")
        return

    # dict.fromkeys de-duplicates while preserving insertion order.
    headers = list(dict.fromkeys(key for row in json_data for key in row))

    # newline='' lets the csv module control line endings itself.
    with open(csv_filename, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=headers)
        writer.writeheader()
        writer.writerows(json_data)

    print(f"Data has been successfully converted to {csv_filename}")

# Example usage: load the saved JSON records, then export them as CSV
with open("books_details.json", encoding='utf-8') as json_file:
    data = json.load(json_file)
json_to_csv(data)


Data has been successfully converted to books_details.csv


In [None]:
from google.colab import files
import shutil

# Path to the directory you want to download
dir_path = '/content/images'

# Create a zip file of the directory. make_archive returns the path of the
# archive it wrote, so the download below cannot drift out of sync with the
# base name given here.
archive_path = shutil.make_archive('/content/images_archive', 'zip', dir_path)

# Download the zip file
files.download(archive_path)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>