In [1]:
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.firefox.service import Service
from selenium.webdriver.common.by import By
from urllib.parse import urljoin
from selenium import webdriver
from io import BytesIO
import requests as rq
from PIL import Image
import hashlib
import time
import os

In [2]:
# Filesystem path to the geckodriver executable; it is handed to selenium's
# Firefox Service when the parser cell creates the browser. The value here is a
# placeholder: replace it with the real geckodriver location before running.
geckodriver_path = 'path/to/your/geckodriver'

#### *Parser_v4.1.5* ####

In [None]:
# Vessel photo scraper for the vesselfinder.com listing filtered by flag=RU.
#
# Flow for every listing page:
#   1. load the page and read, in one pass, the name, type and detail link of
#      every row of the results table;
#   2. visit each ship's detail page -> its photo gallery -> every photo page;
#   3. download each photo, skip duplicates (by file name or by MD5 of the
#      decoded RGB pixels) and save the rest as JPEG files.
# The run stops at the first row whose thumbnail is the site's "no image"
# placeholder, or after several listing pages in a row could not be read.
#
# Row data is collected up front on purpose: the row WebElements become stale
# as soon as the browser navigates to a ship page, so they must not be touched
# after the first navigation.
service = Service(geckodriver_path)
driver = webdriver.Firefox(service = service)

common_url = 'https://www.vesselfinder.com/ru/'
all_pages_url = 'https://www.vesselfinder.com/ru/vessels?page=_&flag=RU'

# Thumbnail source used as the stop marker ("NO_IMAGE"-link).
# NOTE(review): stopping here presumably relies on the listing showing ships
# with photos first - confirm against the site.
no_image_src = 'https://static.vesselfinder.net/images/cool-ship2@2.png?v1'
rows_xpath = '//table[@class="results"]/tbody/tr'
# Give up after this many listing pages in a row that fail to load or contain
# no rows; without a cap an unreachable site or the end of pagination would
# keep the loop running forever.
max_consecutive_page_errors = 3
# Seconds to wait for the image server before abandoning a single download.
image_request_timeout = 30

os.makedirs('structured_sea_vessels_db', exist_ok = True)
list_of_all_imgs_names = []
list_of_all_imgs_hashes = []

i, got_empty_img = 0, False
consecutive_page_errors = 0

try:
    while True:
        i += 1
        current_page_url = all_pages_url.replace('_', str(i))

        # --- 1. load the listing page and grab its rows ---------------------
        try:
            driver.get(current_page_url)
            WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.XPATH, rows_xpath))
            )
            little_ships_imgs_curls = driver.find_elements(By.XPATH, rows_xpath)
        except Exception as e:
            print(f'Error with downloading the {i} page: {e}')
            consecutive_page_errors += 1
            if consecutive_page_errors >= max_consecutive_page_errors:
                print(f'{consecutive_page_errors} listing pages in a row could not be read. Stopping the main loop of the parser...\n')
                break
            continue
        consecutive_page_errors = 0

        # Read everything needed from the rows while the listing is still the
        # current document.
        ships_on_page = []
        for little_ship_img_curl in little_ships_imgs_curls:
            try:
                name_element = little_ship_img_curl.find_element(By.CLASS_NAME, 'slna')
                type_element = little_ship_img_curl.find_element(By.CLASS_NAME, 'slty')
                ship_element = little_ship_img_curl.find_element(By.CLASS_NAME, 'ship-link')

                # '/' is replaced as well so the name can never be read as a
                # sub-directory when the file is saved.
                ship_name = name_element.text.strip().replace(' ', '-').replace('.', '-').replace('/', '-').lower()
                ship_type = type_element.text.strip().title().replace(' ', '').replace('/', '&')
                ship_href_little_img = ship_element.get_attribute('href')
            except Exception as e:
                print(f'Error with getting the ship data on the {i} page: {e}')
                continue

            try:
                img_tag = little_ship_img_curl.find_element(By.TAG_NAME, 'img')
                ship_src_little_img = img_tag.get_attribute('data-src')
            except Exception as e:
                print(f'Error with getting the little ship source on its image on the {i} page: {e}')
                continue

            if ship_src_little_img == no_image_src:
                # Ships collected before this row are still processed below.
                got_empty_img = True
                break

            ships_on_page.append((ship_name, ship_type, ship_href_little_img))

        # --- 2. walk every collected ship ------------------------------------
        for ship_name, ship_type, ship_href_little_img in ships_on_page:
            ship_page_url = urljoin(common_url, ship_href_little_img)

            try:
                driver.get(ship_page_url)
            except Exception as e:
                print(f'Error with tring to download the current ship pre-page [{ship_page_url}] on the {i} page: {e}')
                continue

            try:
                pre_page_img = WebDriverWait(driver, 10).until(
                    EC.presence_of_element_located((By.CSS_SELECTOR, 'a.img-holder.s0'))
                )
                pre_page_href = pre_page_img.get_attribute('href')
            except Exception as e:
                # Report the page that was being read: the href itself is not
                # available when this lookup fails.
                print(f'Error with trying to get the final ship bigger image href from [{ship_page_url}] on the {i} page: {e}')
                continue

            gallery_url = urljoin(common_url, pre_page_href)

            try:
                driver.get(gallery_url)
                time.sleep(2)
            except Exception as e:
                print(f'Error with downloading the final page of bigger ships [{gallery_url}] on the {i} page: {e}')
                continue

            try:
                final_page_div = WebDriverWait(driver, 10).until(
                    EC.presence_of_element_located((By.ID, 'thumbs-container'))
                )
                ships_links_big_imgs = final_page_div.find_elements(By.CSS_SELECTOR, 'a.ship-thumb')
                # Resolve the hrefs now; the link elements go stale after the
                # next navigation.
                ships_hrefs_big_imgs = [link.get_attribute('href') for link in ships_links_big_imgs]
            except Exception as e:
                print(f'Error with trying tp get the links of the bigger ships [{gallery_url}] on the {i} page: {e}')
                continue

            # --- 3. download and store every photo of this ship -------------
            count = 0
            for j in ships_hrefs_big_imgs:
                photo_page_url = urljoin(common_url, j)

                try:
                    driver.get(photo_page_url)
                    time.sleep(0.2)
                except Exception as e:
                    print(f'Error with downloading the image of the bigger ship [{photo_page_url}] on the {i} page: {e}')
                    continue

                # Reset per photo so the handler below never reports the URL
                # of a previous photo (or fails on an unbound name).
                final_image_url = None
                try:
                    final_image_element = WebDriverWait(driver, 10).until(
                        EC.presence_of_element_located((By.ID, 'main-photo'))
                    )
                    final_image_url = final_image_element.get_attribute('src')

                    # A timeout keeps one unresponsive download from hanging
                    # the whole run; the body is read in full right below, so
                    # streaming is not requested.
                    img_data = rq.get(final_image_url, timeout = image_request_timeout)
                    img_data.raise_for_status()
                except Exception as e:
                    print(f'Error with downloading the bigger image {final_image_url}: {e}')
                    continue

                try:
                    img = Image.open(BytesIO(img_data.content)).convert('RGB')
                    img_bytes = img.tobytes()
                    img_hash = hashlib.md5(img_bytes).hexdigest()

                    # The counter advances for every decoded photo, including
                    # ones that turn out to be duplicates.
                    count += 1
                    img_name = f'{ship_name}-{count}_{ship_type}.jpg'

                    if img_name in list_of_all_imgs_names or img_hash in list_of_all_imgs_hashes:
                        print(f'Found dublicatie of the {img_name}, skipping...')
                        continue

                    img.save(f'structured_sea_vessels_db/{img_name}', format = 'JPEG')
                    list_of_all_imgs_names.append(img_name)
                    list_of_all_imgs_hashes.append(img_hash)
                except Exception as e:
                    print(f'Error with proccessing the image [{final_image_url}]: {e}')
                    continue

        if got_empty_img:
            print('Got the "NO_IMAGE"-link. Stopping the main loop of the parser...\n')
            driver.delete_all_cookies()
            break

        print(f'{i} page is done...\n')

    print('all pages are done.')
finally:
    # Always close the browser, also when the cell is interrupted or an
    # unexpected error escapes, so no Firefox/geckodriver process is left over.
    driver.quit()

In [40]:
# ...