In [1]:
from selenium.webdriver.firefox.service import Service
from selenium.webdriver.common.by import By
from urllib.parse import urljoin
from selenium import webdriver
from io import BytesIO
import requests as rq
from PIL import Image
import hashlib
import time
import os

In [2]:
# Location of the geckodriver executable handed to selenium's Firefox Service.
# Set the GECKODRIVER_PATH environment variable to override the placeholder
# below without editing the notebook.
geckodriver_path = os.environ.get('GECKODRIVER_PATH', 'path/to/your/geckodriver')

In [5]:
def login(username = None, password = None):
    """Submit the fleetphoto.ru login form using the module-level ``driver``.

    Credentials are resolved in this order, so that real secrets never have
    to be written into the notebook:

    1. the explicit ``username`` / ``password`` arguments;
    2. the ``FLEETPHOTO_USERNAME`` / ``FLEETPHOTO_PASSWORD`` environment
       variables;
    3. the ``'...'`` placeholder, which is what this function used to send.

    Args:
        username: Login name to type into the ``username`` field, or ``None``
            to use the fallbacks above.
        password: Password to type into the ``password`` field, or ``None``
            to use the fallbacks above.

    Returns:
        None. The function only drives the browser; it does not check
        whether the site accepted the credentials.
    """
    if username is None:
        username = os.environ.get('FLEETPHOTO_USERNAME', '...')
    if password is None:
        password = os.environ.get('FLEETPHOTO_PASSWORD', '...')

    # ``driver`` is the global WebDriver created by the main cell; it must
    # exist before login() is called.
    driver.get('https://fleetphoto.ru/login.php')

    username_input = driver.find_element(By.ID, 'username')
    password_input = driver.find_element(By.ID, 'password')
    login_button = driver.find_element(By.ID, 'loginbtn')

    username_input.send_keys(username)
    password_input.send_keys(password)
    login_button.click()

In [6]:
def wait_for_page_load(driver, timeout = 5, poll_interval = 0.5):
    """Poll ``document.readyState`` until the page reports ``'complete'``.

    Args:
        driver: Object exposing ``execute_script(script)`` (a selenium
            WebDriver in this notebook).
        timeout: Maximum number of seconds to keep polling.
        poll_interval: Seconds to sleep between two polls.

    Returns:
        True as soon as a poll returns ``'complete'``; False (after printing
        a notice) if the deadline passes first.

    The ready state is always checked at least once, even with a zero or
    negative ``timeout``, and once more when the deadline is reached, so a
    page that finishes during the last sleep is still detected.
    """
    # monotonic() is immune to wall-clock adjustments, unlike time.time().
    deadline = time.monotonic() + timeout
    while True:
        if driver.execute_script('return document.readyState') == 'complete':
            return True
        remaining = deadline - time.monotonic()
        if remaining <= 0:
            break
        # Never sleep past the deadline.
        time.sleep(min(poll_interval, remaining))
    print(f'\nThe page hasn\'t been fully loaded for {timeout} seconds...\n')
    return False

In [15]:
def load_ship_page(driver, ship_link, retries = 3):
    """Navigate to ``ship_link``, re-requesting it until it loads completely.

    Args:
        driver: WebDriver used for navigation.
        ship_link: URL of the ship page to open.
        retries: How many times the page is requested before giving up.

    Returns:
        True once ``wait_for_page_load`` succeeds for an attempt, False when
        every attempt timed out (a notice is printed in that case).
    """
    for attempt_index in range(retries):
        driver.get(ship_link)
        page_ready = wait_for_page_load(driver)
        if not page_ready:
            print(f'\nAttempt {attempt_index + 1}/{retries} failed for {ship_link}, retrying...\n')
            continue
        return True

    # Reached only when no attempt produced a fully loaded page.
    print(f'\nThe ship on [{ship_link}] hasn\'t been fully loaded after {retries} attempts, skipping...\n')
    return False

In [7]:
def click_next_button(driver, retries = 3):
    """Click the element with id ``next`` and wait for the resulting page.

    Each attempt looks the button up again, clicks it through JavaScript and
    waits for the page to finish loading. A timed-out wait or any exception
    raised during the attempt triggers a page refresh before the next try.

    Args:
        driver: WebDriver showing the current page.
        retries: Maximum number of attempts.

    Returns:
        True when a click was followed by a fully loaded page, False after
        ``retries`` unsuccessful attempts (a notice is printed).
    """
    for attempt_index in range(retries):
        try:
            button = driver.find_element(By.ID, 'next')
            # Clicking via JS rather than WebElement.click().
            driver.execute_script('arguments[0].click();', button)
            page_ready = wait_for_page_load(driver)
            if page_ready:
                return True
            print(f'\nAttempt {attempt_index + 1}/{retries} to load next image failed, retrying...\n')
            driver.refresh()
        except Exception as e:
            # Covers a missing button as well as a failure of the refresh above.
            print(f'\nAttempt {attempt_index + 1}/{retries} failed: {e}, retrying...\n')
            driver.refresh()

    print(f'\nThe next image for the ship hasn\'t been fully loaded after {retries} attempts, breaking...\n')
    return False

In [None]:
# Tunables for the crawl, gathered here instead of being scattered as literals.
SITE_ROOT = 'https://fleetphoto.ru'
OUTPUT_DIR = 'motor_yachts_db_2'
MAX_IMGS_PER_SHIP = 1000   # upper bound on images saved from one ship's gallery
DOWNLOAD_TIMEOUT = 30      # seconds allowed for a single image download

service = Service(geckodriver_path)
driver = webdriver.Firefox(service = service)

# Everything after the browser starts runs under try/finally so that Firefox
# is shut down even when the crawl raises or is interrupted.
try:
    driver.implicitly_wait(10)

    base_url = 'https://fleetphoto.ru/search.php?cid=18&mid=434&eid_own=0&eid_mgr=0&gid=0&state=1&num=&place1=&place2=&place3=&notes=&konk=0&cammod=&aid=-1&order=0&anypub=1&st=30'
    login()

    os.makedirs(OUTPUT_DIR, exist_ok = True)
    # MD5 digests of the decoded RGB pixels of every image saved so far.
    # Used to detect both a gallery wrapping around to an image already
    # saved and an image that was already stored for another ship.
    list_of_all_imgs_hashes = set()

    i, the_last_page = 1, False
    while True:

        driver.get(base_url)
        if not wait_for_page_load(driver):
            print(f'\nFailed to load next page {i}...(breaking the main loop)\n')
            break
        # Links carrying the 'prw' class (presumably the result thumbnails - TODO confirm).
        ship_links = [urljoin(SITE_ROOT, link.get_attribute('href'))
                      for link in driver.find_elements(By.CLASS_NAME, 'prw')]

        # Resolve the following results page now, before navigating away.
        try:
            next_page = driver.find_element(By.ID, 'NextLink')
            base_url = urljoin(SITE_ROOT, next_page.get_attribute('href'))
        except Exception:
            print('\n### No "NextLink" found, last page reached... ###\n')
            the_last_page = True

        print(4 * '===============')
        print(f'-> {i} page parsing... <-\n')

        # j numbers the ships that were successfully opened on this page.
        j, common_page_count = 1, 0
        for ship_link in ship_links:

            try:
                driver.get(ship_link)
                if not wait_for_page_load(driver):
                    print(f'\nThe ship on [{ship_link}] hasn\'t been fully loaded, skipping...\n')
                    continue
            except Exception as e:
                print(f'\nCouldn\'t reach the {ship_link} because of {e}...\n')
                continue

            # Number of images saved for this ship. Starts at 0 so the first
            # file is numbered 1 and the totals printed below are exact
            # (it used to start at 1, over-reporting by one per ship).
            count = 0
            while True:
                # Best-effort check for the "not found" page; an absent <h1>
                # raises and is deliberately ignored.
                try:
                    not_found_text = driver.find_element(By.TAG_NAME, 'h1').text
                    if 'Picture\'s not found' in not_found_text:
                        print(f'\nNo image found for {ship_link}, skipping...\n')
                        break
                except Exception:
                    pass

                if count >= MAX_IMGS_PER_SHIP:
                    break

                try:
                    ship_image_element = driver.find_element(By.ID, 'ph')
                    ship_image_url = urljoin(SITE_ROOT, ship_image_element.get_attribute('src'))
                except Exception as e:
                    print(f'\nError retrieving image element: {e}\n')
                    # Nothing changes on the page between iterations, so
                    # retrying here would loop forever; give up on this ship.
                    break

                # A failed download or an undecodable payload ends this ship's
                # gallery instead of aborting the whole crawl.
                try:
                    img_data = rq.get(ship_image_url, timeout = DOWNLOAD_TIMEOUT)
                    img_data.raise_for_status()
                    img = Image.open(BytesIO(img_data.content)).convert('RGB')
                except Exception as e:
                    print(f'\nCouldn\'t download or decode {ship_image_url}: {e}\n')
                    break

                img_hash = hashlib.md5(img.tobytes()).hexdigest()

                # Already saved (for this ship or an earlier one): stop here.
                if img_hash in list_of_all_imgs_hashes:
                    break

                count += 1
                img_name = f'motor_yacht_2-{i}-{j}-{count}.jpg'
                img.save(f'{OUTPUT_DIR}/{img_name}', format = 'JPEG')
                list_of_all_imgs_hashes.add(img_hash)

                try:
                    next_button = driver.find_element(By.ID, 'next')
                    # Click via JS rather than WebElement.click().
                    driver.execute_script('arguments[0].click();', next_button)
                    if not wait_for_page_load(driver):
                        # Keep going: the next iteration re-reads whatever image
                        # is shown, and the hash check stops on a repeat.
                        print(f'\nThe next image for the [{ship_link}] ship hasn\'t been fully loaded, skipping...\n')
                except Exception as e:
                    print(f'\nError finding or clicking "Next" button: {e}\n')
                    break

            print(f'{j}/30 is done on the {i} page: got {count}/{MAX_IMGS_PER_SHIP} imgs')

            j += 1
            common_page_count += count

        print(f'\n{i} page is done... (got {common_page_count}/300 imgs in common)')
        print(4 * '===============')
        i += 1

        if the_last_page:
            break

    print('parsing is done')

finally:
    time.sleep(1)
    driver.quit()

In [1]:
# ...