In [1]:
import requests
from bs4 import BeautifulSoup
import os
import time

In [67]:
def scrape_images(query, save_dir, num_images=100):
    """
    Scrapes images from a search engine and saves them to a directory.

    Args:
        query (str): The search term for the images.
        save_dir (str): The directory to save the images.
        num_images (int): The number of images to download.
    """

    if not os.path.exists(save_dir):
        os.makedirs(save_dir)
        print(f"Created directory: {save_dir}")

    url = f'https://www.google.com/search?q={query}&tbm=isch'

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
    }

    print(f"Searching for images of: {query}...")

    try:
        response = requests.get(url, headers=headers)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"Error fetching URL: {e}")
        return

    soup = BeautifulSoup(response.text, 'html.parser')
    img_tags = soup.find_all('img')
    downloaded_count = 0

    for i, img in enumerate(img_tags):
        if downloaded_count >= num_images:
            break

        try:
            img_url = img.get('src') or img.get('data-src')
            if not img_url or not img_url.startswith('http'):
                continue

            image_data = requests.get(img_url, headers=headers, timeout=10)
            image_data.raise_for_status()

            filename = f"{query.replace(' ', '_')}_{i:04d}.jpg"
            filepath = os.path.join(save_dir, filename)

            with open(filepath, 'wb') as f:
                f.write(image_data.content)

            print(f"Downloaded: {filename}")
            downloaded_count += 1
            time.sleep(0.1)

        except (requests.exceptions.RequestException, IOError) as e:
            print(f"Could not download image {i+1}: {e}")
            continue

    print(f"\nFinished scraping. Downloaded {downloaded_count} images to {save_dir}")

# --- Main script execution for a single category ---
base_dir = "dataset"

# Define the category and its search queries you want to scrape
category_to_scrape = "human_with_weapon" # <--- CHANGE THIS CATEGORY
queries_to_use = ["people with axe"] # <--- CHANGE THESE QUERIES

category_path = os.path.join(base_dir, category_to_scrape)
print(f"\n--- Starting to scrape for category: {category_to_scrape} ---")

for query in queries_to_use:
    scrape_images(query, category_path, num_images=100) # Adjust num_images as needed


--- Starting to scrape for category: human_with_weapon ---
Searching for images of: people with axe...
Downloaded: people_with_axe_0001.jpg
Downloaded: people_with_axe_0002.jpg
Downloaded: people_with_axe_0003.jpg
Downloaded: people_with_axe_0004.jpg
Could not download image 6: HTTPSConnectionPool(host='encrypted-tbn0.gstatic.com', port=443): Max retries exceeded with url: /images?q=tbn:ANd9GcRaCHovSw7Z5_FlHgRsGfhqUO-yLP5rfpqTYdItg2AtDojVWrvR-1hG8XkZaA&s (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x000002BC4CA44DA0>, 'Connection to encrypted-tbn0.gstatic.com timed out. (connect timeout=10)'))
Downloaded: people_with_axe_0006.jpg
Could not download image 8: HTTPSConnectionPool(host='encrypted-tbn0.gstatic.com', port=443): Max retries exceeded with url: /images?q=tbn:ANd9GcTiBw0Gpupzz06vcSf8xjKldWURVv6ROepKhhW19GAiquzE5UOsSayCQGUB2w&s (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x000002BC4CD28200>, 'Connection to encrypt