In [1]:
!pip install requests beautifulsoup4



In [6]:
import requests
from bs4 import BeautifulSoup
import os
import time

In [7]:
def scrape_images(query, save_dir, num_images=100):
    """
    Scrapes images from a search engine and saves them to a directory.

    Args:
        query (str): The search term for the images.
        save_dir (str): The directory to save the images.
        num_images (int): The number of images to download.
    """
    
    # Create the directory if it doesn't exist
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)
        print(f"Created directory: {save_dir}")
    
    # URL for a search engine image search
    # This example uses a simplified URL, you may need to find a more robust one
    # or use a library like `selenium` for dynamic pages.
    url = f'https://www.google.com/search?q={query}&tbm=isch'
    
    # Set a User-Agent to mimic a browser, preventing being blocked
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
    }
    
    print(f"Searching for images of: {query}...")
    
    try:
        response = requests.get(url, headers=headers)
        response.raise_for_status()  # Raise an exception for bad status codes
    except requests.exceptions.RequestException as e:
        print(f"Error fetching URL: {e}")
        return

    soup = BeautifulSoup(response.text, 'html.parser')
    
    # Find all 'img' tags
    img_tags = soup.find_all('img')
    
    downloaded_count = 0
    
    for i, img in enumerate(img_tags):
        if downloaded_count >= num_images:
            break
        
        try:
            # Get the image source URL, which may be in 'src' or 'data-src'
            img_url = img.get('src') or img.get('data-src')
            if not img_url:
                continue

            # Ensure the URL is a full HTTP/HTTPS link
            if img_url.startswith('http'):
                image_data = requests.get(img_url, headers=headers, timeout=10)
                image_data.raise_for_status()
                
                # Create a unique filename
                filename = f"{query.replace(' ', '_')}_{i:04d}.jpg"
                filepath = os.path.join(save_dir, filename)
                
                with open(filepath, 'wb') as f:
                    f.write(image_data.content)
                
                print(f"Downloaded: {filename}")
                downloaded_count += 1
                time.sleep(0.1) # Be a good citizen and don't flood the server
                
        except (requests.exceptions.RequestException, IOError) as e:
            print(f"Could not download image {i+1}: {e}")
            continue

    print(f"\nFinished scraping. Downloaded {downloaded_count} images to {save_dir}")

# --- Main script execution ---
base_dir = "dataset"

# Define your categories and corresponding search queries
categories = {
    "no_threat": ["empty room", "blank image", "empty guard booth"],
    "human_only": ["people walking", "people standing", "crowd of people"],
    "weapon_only": ["gun close up", "rifle isolated", "knife on table"],
    "human_with_weapon": ["soldier with rifle", "police officer with gun", "person holding a handgun"]
}

# Iterate through categories and scrape images
for category, queries in categories.items():
    category_path = os.path.join(base_dir, category)
    print(f"\n--- Starting to scrape for category: {category} ---")
    for query in queries:
        scrape_images(query, category_path, num_images=20) # Adjust num_images as needed


--- Starting to scrape for category: no_threat ---
Created directory: dataset\no_threat
Searching for images of: empty room...
Error fetching URL: HTTPSConnectionPool(host='www.google.com', port=443): Max retries exceeded with url: /search?q=empty%20room&tbm=isch (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x000001DD32E01C10>, 'Connection to www.google.com timed out. (connect timeout=None)'))
Searching for images of: blank image...
Error fetching URL: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
Searching for images of: empty guard booth...
Downloaded: empty_guard_booth_0001.jpg
Downloaded: empty_guard_booth_0002.jpg
Downloaded: empty_guard_booth_0003.jpg
Downloaded: empty_guard_booth_0004.jpg
Downloaded: empty_guard_booth_0005.jpg
Could not download image 7: HTTPSConnectionPool(host='encrypted-tbn0.gstatic.com', port=443): Max retries exceeded with url: /images?q=tbn:ANd9GcRnFPcrQ2YTbv0DpEvcwIMP6dTQ2tt6H