In [2]:
# import dependencies
import selenium
import os
import time
from selenium import webdriver
import requests
import io
from PIL import Image
import hashlib

In [3]:
# Path to the ChromeDriver executable. Set the CHROMEDRIVER_PATH environment
# variable to point at your own copy; the hard-coded location below is only
# used as a fallback when that variable is not set.
DRIVER_PATH = os.environ.get('CHROMEDRIVER_PATH', '/Users/ADMIN/Desktop/chromedriverjust.exe')

In [4]:
# instantiate a Chrome webdriver, using the ChromeDriver binary at DRIVER_PATH
# NOTE(review): newer Selenium releases replace `executable_path` with a
# Service object - confirm against the installed Selenium version
wd = webdriver.Chrome(executable_path=DRIVER_PATH)

In [5]:
# navigate the browser controlled by `wd` to google.com
wd.get('https://google.com')

In [6]:
# locate the search input (an <input> with CSS class gLFyf) and type the
# query "Black Americans" into it; the query is only typed, not submitted
search_box = wd.find_element_by_css_selector('input.gLFyf')
search_box.send_keys('Black Americans')

In [7]:
# shut down the webdriver now that the demo above is finished
wd.quit()

### Searching for a particular phrase and getting the image links


In [13]:
def fetch_image_urls(query:str, max_links_to_fetch:int, wd:"webdriver.Chrome", sleep_between_interactions:float=1):
    """Collect image URLs from a Google Images search.

    Loads the image-search results page for ``query`` in ``wd``, clicks every
    thumbnail and records the ``src`` of the enlarged image(s) displayed for
    it. Stops as soon as ``max_links_to_fetch`` distinct URLs are collected or
    when no further thumbnails appear.

    Args:
        query: Search term. It is URL-encoded before being placed in the
            query string, so spaces and special characters are safe.
        max_links_to_fetch: Number of distinct image URLs to collect.
        wd: An already instantiated webdriver used to drive the browser.
        sleep_between_interactions: Seconds to pause after each scroll and
            after each thumbnail click.

    Returns:
        A set of image URL strings. It holds fewer than
        ``max_links_to_fetch`` entries when the search results run out, and
        may hold slightly more because one click can reveal several images.
        The function always returns a set, never ``None``.
    """
    from urllib.parse import quote_plus

    def scroll_to_end(wd):
        wd.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(sleep_between_interactions)

    # build the google query
    search_url = "https://www.google.com/search?safe=off&site=&tbm=isch&source=hp&q={q}&oq={q}&gs_l=img"

    # load the page (the query is percent-encoded so it cannot break the URL)
    wd.get(search_url.format(q=quote_plus(query)))

    image_urls = set()
    results_start = 0
    while len(image_urls) < max_links_to_fetch:
        scroll_to_end(wd)

        # get all image thumbnail results
        # (the CSS class names used here are tied to Google's page markup)
        thumbnail_results = wd.find_elements_by_css_selector("img.Q4LuWd")
        number_results = len(thumbnail_results)

        print(f"Found: {number_results} search results. Extracting links from {results_start}:{number_results}")

        for img in thumbnail_results[results_start:number_results]:
            # try to click every thumbnail such that we can get the real image behind it
            try:
                img.click()
                time.sleep(sleep_between_interactions)
            except Exception:
                # best effort: a thumbnail that cannot be clicked is skipped
                continue

            # extract image urls
            for actual_image in wd.find_elements_by_css_selector('img.n3VNCb'):
                src = actual_image.get_attribute('src')
                if src and 'http' in src:
                    image_urls.add(src)

            if len(image_urls) >= max_links_to_fetch:
                print(f"Found: {len(image_urls)} image links, done!")
                break
        else:
            print("Found:", len(image_urls), "image links, looking for more ...")

            if number_results == results_start:
                # Neither scrolling nor the previous "load more" attempt
                # produced new thumbnails: stop instead of looping forever.
                print("No further search results available, stopping.")
                break

            # pause before looking for the "load more" control
            # (duration kept from the original code)
            time.sleep(30)

            # find_elements returns an empty list (instead of raising) when
            # the button is absent, so the check below is safe
            if wd.find_elements_by_css_selector(".mye4qd"):
                wd.execute_script("document.querySelector('.mye4qd').click();")

        # move the result startpoint further down
        results_start = len(thumbnail_results)

    return image_urls

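The function `fetch_image_urls` expects three required input parameters and one optional parameter:  
1. `query` : Search term, like Dog
2. `max_links_to_fetch` : Number of links the scraper is supposed to collect
3. `wd` : An instantiated webdriver
4. `sleep_between_interactions` (optional, default 1) : Seconds to wait after each scroll and each thumbnail click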

### Downloading the images
For the following snippet to work, we will first have to install PIL by running pip install Pillow.

In [8]:
pip install Pillow

Note: you may need to restart the kernel to use updated packages.


In [9]:
def persist_image(folder_path:str, url:str):
    """Download the image at ``url`` and save it as a JPEG in ``folder_path``.

    The image is converted to RGB and written with JPEG quality 85. The file
    name is the first 10 hexadecimal characters of the SHA-1 hash of the
    downloaded bytes plus ``.jpg``, so identical content maps to one file.

    Failures are reported with an ``ERROR`` message on stdout instead of being
    raised; a failed download skips the save step entirely.

    Args:
        folder_path: Existing directory the image file is written to.
        url: Address of the image to download.

    Returns:
        None.
    """
    try:
        # A timeout keeps one unresponsive server from blocking the whole run;
        # raise_for_status turns HTTP error responses into download errors.
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        image_content = response.content
    except Exception as e:
        print(f"ERROR - Could not download {url} - {e}")
        # nothing was downloaded, so there is nothing to save
        return

    try:
        image_file = io.BytesIO(image_content)
        image = Image.open(image_file).convert('RGB')
        file_path = os.path.join(folder_path,hashlib.sha1(image_content).hexdigest()[:10] + '.jpg')
        with open(file_path, 'wb') as f:
            image.save(f, "JPEG", quality=85)
        print(f"SUCCESS - saved {url} - as {file_path}")
    except Exception as e:
        print(f"ERROR - Could not save {url} - {e}")

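The `persist_image` function downloads the image at `url` and saves it as a JPEG in `folder_path`. The file name is not random: it is the first 10 hexadecimal characters of the SHA-1 hash of the downloaded image content, so identical images always map to the same file.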

### Putting it all together
The following function `search_and_download` combines the previous two functions and adds some resiliency to how we use the ChromeDriver. More precisely, we use the ChromeDriver within a `with` context, which guarantees that the browser is shut down properly even if something inside the `with` block raises an error. `search_and_download` allows you to specify `number_images`, which is set to 5 by default but can be set to however many images you want to download.

In [10]:
def search_and_download(search_term:str,driver_path:str,target_path='./images',number_images=5):
    """Search Google Images for ``search_term`` and download the results.

    Images are stored in a sub-folder of ``target_path`` named after the
    lower-cased search term with spaces replaced by underscores; the folder
    is created when it does not exist yet.

    Args:
        search_term: Phrase to search for.
        driver_path: Path to the ChromeDriver executable.
        target_path: Base directory for downloaded images.
        number_images: Number of image links to request from the search.

    Returns:
        None.
    """
    target_folder = os.path.join(target_path, search_term.lower().replace(' ', '_'))

    # exist_ok avoids the race between checking for and creating the folder
    os.makedirs(target_folder, exist_ok=True)

    # the with-block shuts the browser down even if the search raises
    with webdriver.Chrome(executable_path=driver_path) as wd:
        res = fetch_image_urls(search_term, number_images, wd=wd, sleep_between_interactions=0.5)

    # tolerate a search that yields nothing (None or an empty collection)
    for elem in res or ():
        persist_image(target_folder,elem)

In [19]:
# Fetch and store 50 images for this query, using the configured ChromeDriver
search_term = 'Black Americans'
search_and_download(
    search_term=search_term,
    driver_path=DRIVER_PATH,
    number_images=50,
)

Found: 100 search results. Extracting links from 0:100
Found: 50 image links, done!
SUCCESS - saved https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcQzCEzt4kvllVImB_Noqv-bd8q-7fnGuUHk9Q&usqp=CAU - as ./images\black_americans\fd7763f4f5.jpg
SUCCESS - saved https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcQeMTe-dNno9bNOMcu8EPr7LaKvviUEiVmvBg&usqp=CAU - as ./images\black_americans\e7f44e83ac.jpg
SUCCESS - saved https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcSPebAvyCEew4W_uq5KcwLIY_QpbGIQsP37bA&usqp=CAU - as ./images\black_americans\171f0254d9.jpg
SUCCESS - saved https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcSve1e9YeDt3i8WLGIbrttGK7tdX4Osh9XxWg&usqp=CAU - as ./images\black_americans\52c1865ddc.jpg
SUCCESS - saved https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcS6W_tTnFB-CvIcCVgazD1FgrygKF2u1pMvUg&usqp=CAU - as ./images\black_americans\438f4c3d5f.jpg
SUCCESS - saved https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcRABgwsPhm-bMblYeqDT2REvGKzNe_Z_-k7WA&usqp

In [20]:
# Fetch and store 50 images for this query, using the configured ChromeDriver
search_term = 'Asian Americans'
search_and_download(
    search_term=search_term,
    driver_path=DRIVER_PATH,
    number_images=50,
)

Found: 100 search results. Extracting links from 0:100
Found: 50 image links, done!
SUCCESS - saved https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcT1WlWsDOiZhoKBi89vTH7eQ4x5Ia6pm6thTQ&usqp=CAU - as ./images\asian_americans\53cf91c121.jpg
SUCCESS - saved https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcRr_9dh7fKHML2EOwSlNcuZoyXnvHJjvE042w&usqp=CAU - as ./images\asian_americans\7fc2e3914e.jpg
SUCCESS - saved https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcSAVpv6j6QeOrNONIOFfeLgErmklOyt4VmTWw&usqp=CAU - as ./images\asian_americans\08e1bead1d.jpg
SUCCESS - saved https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcTOGFGSyrtHD0fr5_4cexCZB5KyFsbRVN70-g&usqp=CAU - as ./images\asian_americans\8e44fb6d20.jpg
SUCCESS - saved https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcQCMnif1jKLqmF9PxNcLupxWyp1GJXbHlJ7Ew&usqp=CAU - as ./images\asian_americans\a69a28db71.jpg
SUCCESS - saved https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcS_2QIXZyNODWbnUflMv5si5F3j_8F2T2HLTg&usqp