In [6]:
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.firefox.service import Service
from selenium.webdriver.common.by import By
from urllib.parse import urljoin
from selenium import webdriver
from io import BytesIO
import requests as rq
from PIL import Image
import hashlib
import time
import os

In [1]:
# Filesystem path of the geckodriver executable; it is passed to selenium's
# Firefox `Service` when the browser is started.
# NOTE(review): the value below looks like a placeholder — point it at the real
# geckodriver location before running the scraping cell.
geckodriver_path = 'path/to/your/geckodriver'

In [None]:
# Scrape every image from the paginated iStock "more like this" listing.
#
# For each results page: collect the <img> children of all <picture> tags,
# download each image, de-duplicate by an MD5 of the decoded RGB pixels and
# save new images as hydrocycles_db/hydrocycle_<n>.jpg. Pagination follows the
# "next" button until it can no longer be found.
#
# Names left in the notebook namespace afterwards: service, driver (already
# quit), base_url, list_of_all_imgs_hashes, page_count, i, is_end.
service = Service(geckodriver_path)
driver = webdriver.Firefox(service = service)
base_url = 'https://www.istockphoto.com/ru/search/more-like-this/98462741?assettype=image'

os.makedirs('hydrocycles_db', exist_ok = True)
list_of_all_imgs_hashes = set()

# page_count: pages advanced past so far; i: images saved so far;
# is_end: set once the "next" button is missing (last page reached).
page_count, i, is_end = 0, 0, False

# Everything that talks to the browser lives inside try/finally so Firefox is
# always shut down, even when a click or lookup raises unexpectedly.
try:
    try:
        driver.get(base_url)
        time.sleep(3)  # give the page time to render its first batch of results
    except Exception as e:
        print(f'\nFailed to get to the another page; error code: {e}\n')
        # Re-raise instead of calling exit(): in a notebook exit() asks the
        # kernel to shut down, while raising just fails this cell.
        raise

    while True:

        try:
            next_button = driver.find_element(By.CSS_SELECTOR, "[data-testid='pagination-button-next']")
        except Exception:
            print('\nFailed to find the "Next Page" button...(seems to be the end of the site)\n')
            is_end = True

        img_elements = []
        try:
            picture_elements = WebDriverWait(driver, 10).until(
                EC.presence_of_all_elements_located((By.TAG_NAME, 'picture'))
            )

            for picture in picture_elements:
                img_elements.extend(picture.find_elements(By.TAG_NAME, 'img'))

        except Exception as e:
            print(f'\nFalied to get all the images from the page; error code: {e}\n')
            if is_end:
                break
            # Do NOT click "next" here: execution falls through to the single
            # click at the bottom of the loop. Clicking in both places would
            # skip a page or hit a stale element.

        for img_element in img_elements:
            src = None
            try:
                # Inside the try so a stale element only skips this one image.
                src = img_element.get_attribute('src')
                if not src:
                    print('\nImage has no src; skipping...\n')
                    continue

                # A timeout keeps one stuck download from hanging the whole run;
                # raise_for_status keeps HTTP error pages away from PIL.
                response = rq.get(src, timeout = 30)
                response.raise_for_status()

                image = Image.open(BytesIO(response.content)).convert('RGB')
                # Hash decoded pixels (not the file bytes) so the same picture
                # served with different encoding/metadata still counts as a duplicate.
                img_hash = hashlib.md5(image.tobytes()).hexdigest()

                if img_hash in list_of_all_imgs_hashes:
                    print('\nDoublicate found! Skipping...\n')
                    continue

                img_name = f'hydrocycle_{i + 1}.jpg'
                image.save(f'hydrocycles_db/{img_name}', format = 'JPEG')

                # Bump the counter only after a successful save so file
                # numbering has no gaps when a save fails.
                i += 1
                list_of_all_imgs_hashes.add(img_hash)

            except Exception as e:
                print(f'\nFailed to download or save image {src}; error code: {e}\n')

        if is_end:
            break

        page_count += 1
        print(f'Page {page_count} has been parsed...')

        next_button.click()
        time.sleep(4)  # wait for the next page of results to load

    time.sleep(1)
finally:
    driver.quit()

print('\ndone\n')

In [2]:
# ...