In [56]:
import os
import time
import csv
import re
import yaml
import inspect
from tqdm import tqdm
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.chrome.options import Options

In [57]:
# Directory the notebook kernel is running in
current_dir = os.getcwd()

# Project root: two directory levels above the working directory
project_dir = os.path.abspath(os.path.join(current_dir, os.pardir, os.pardir))

In [58]:
# Path to the YAML configuration file, two directory levels above the working directory
yaml_file = os.path.abspath(os.path.join(current_dir, "../../config.yaml"))

# Read the YAML file.
# Opened in binary mode on purpose: PyYAML then decodes the stream itself
# (UTF-8 by default, UTF-16 when a BOM is present) instead of depending on the
# platform's locale encoding, which would garble non-ASCII values.
with open(yaml_file, 'rb') as config_file:
    config = yaml.safe_load(config_file)

# Values used by the following cells; a missing key raises KeyError here.
vessel_type = config['vessel_type']  # sub-folder name in the input and output paths
class_type = config['class']         # sub-folder name in the output path
base = config['base']                # base number the input split number is derived from

In [59]:
# Edit as needed: the offset added to the configured base selects which
# numbered input split this run processes.
num = base + 8
filenum = "{}".format(num)

In [60]:
# CSV locations: the input split that is read, and the output file that is written
read_path = os.path.join(project_dir, "input", vessel_type, f"split_{filenum}.csv")
write_path = os.path.join(
    project_dir, "output", class_type, vessel_type, f"{filenum}.csv"
)

In [61]:
# Show the resolved input path so it can be checked before the scrape starts
print(read_path)

C:\Users\abinp\Documents\ITS\side projects\new\ship data scrapping\input\tanker\split_16.csv


In [62]:
# Check if input file exists. The scraping cell cannot do anything without it,
# so stop here instead of only announcing an exit and carrying on.
if os.path.exists(read_path):
    print("CSV read file is found, continuing...")
else:
    print("CSV read file is not found")
    print('exiting....')
    raise FileNotFoundError(f"Input CSV not found: {read_path}")


# Check if output file exists and create it if necessary
if os.path.exists(write_path):
    print("CSV write file  is found, continuing...")
else:
    print("CSV write file is not found, creating one...")
    os.makedirs(os.path.dirname(write_path), exist_ok=True)  # Create output directory if it doesn't exist
    # Only an empty placeholder file is needed here; nothing is written to it
    # in this cell.
    with open(write_path, 'w', newline=''):
        pass

CSV read file is found, continuing...
CSV write file is not found, creating one...


In [63]:
def search(IMO):
    """Open the CCS ship list for ``IMO`` and switch into the detail iframe.

    Uses the module-level ``driver``. The steps are: load the list URL with
    the IMO number in the query string, click the search input, click the
    button in the result table, pause two seconds, then switch the driver's
    context to the first ``<iframe>`` on the page.

    Args:
        IMO: IMO number; interpolated into the URL as-is.

    Any exception raised by the Selenium calls propagates to the caller.
    """
    driver.get(
        f'https://www.ccs.org.cn/ccswzen/internationalShipsList?columnid=201900002000000123&imono={IMO}'
    )

    # Absolute XPaths, clicked in order: the search input, then the button in
    # the result table.
    click_targets = (
        '/html/body/div[4]/div[2]/div/div[1]/ul[3]/li[2]/input[1]',
        '/html/body/div[4]/div[2]/div/div[2]/div[1]/table/tbody/tr/td[2]/button',
    )
    for xpath in click_targets:
        driver.find_element(By.XPATH, xpath).click()

    # Fixed pause before looking for the iframe
    time.sleep(2)

    detail_frame = driver.find_element(By.TAG_NAME, "iframe")
    driver.switch_to.frame(detail_frame)


In [64]:
# Data mining on the website
def shipdata():
    """Locate the ship-detail table cells and publish them as module globals.

    Uses the module-level ``driver`` and expects it to already be inside the
    detail frame (``search`` switches into it). Each global below is bound to
    the located element itself, not to its text; callers read ``.text``.

    Globals assigned: NAME, REGISTER, IMO, vessel_type, LOA, LPP, B, H, T,
    GT, NT, DWT and ``link`` (the list URL rebuilt from the scraped IMO text).
    ``LT`` is declared global but never assigned here.

    NOTE: this rebinds the module-level ``IMO`` and ``vessel_type``; the
    latter initially holds the value loaded from the YAML config.

    Any exception raised by ``find_element`` propagates; globals assigned
    before the failing lookup keep their new values.
    """
    global LOA, LPP, B, H, T, DWT, NT, GT, LT
    global NAME, REGISTER, IMO, vessel_type, link

    def cell(row, col):
        # <span> inside the given table row / column of the detail table
        return driver.find_element(
            By.XPATH,
            f'/html/body/div/div/div/table/tbody/tr[{row}]/td[{col}]/span',
        )

    # Identification
    NAME = cell(2, 1)
    REGISTER = cell(1, 1)
    IMO = cell(3, 1)
    vessel_type = cell(6, 1)

    # Principal dimensions
    LOA = cell(8, 2)
    LPP = cell(9, 1)
    B = cell(9, 2)
    H = cell(10, 1)
    T = cell(11, 1)

    # Ship tonnage
    GT = cell(7, 1)
    NT = cell(7, 2)
    DWT = cell(8, 1)

    link = f'https://www.ccs.org.cn/ccswzen/internationalShipsList?columnid=201900002000000123&imono={IMO.text}'




In [65]:
# Column order of the output CSV (header row and every data row)
fieldnames = [
    'NAME', 'REGISTER', 'IMO', 'TYPE',
    'LOA', 'LPP', 'B', 'H', 'T',
    'GT', 'NT', 'DWT',
    'Link',
]

In [66]:
# Read all IMO numbers up front: the file is opened once and the progress bar
# gets its total from the same list. Blank rows are skipped because they
# carry no IMO and would raise IndexError on row[0].
with open(read_path, newline='') as csvfile:
    imo_numbers = [row[0] for row in csv.reader(csvfile) if row and row[0].strip()]

# Counters for ships found and total IMOs
ship_count = 0
total_imo = len(imo_numbers)

# Set up the webdriver (after the input is read, so a bad input file does not
# leave a browser running)
driver = webdriver.Chrome()

try:
    # Open the CSV file for writing
    with open(write_path, mode='w', newline='', encoding='utf-8') as details_file:
        writer = csv.DictWriter(details_file, fieldnames=fieldnames)
        writer.writeheader()

        # The context manager closes the progress bar even on error/interrupt
        with tqdm(total=total_imo, desc='Processing IMOs') as progress_bar:
            # `imo_number` is deliberately not called `IMO`: shipdata()
            # rebinds the global `IMO` to the scraped page element.
            for imo_number in imo_numbers:
                try:
                    search(imo_number)
                    shipdata()

                    # Read the element texts inside the try block so a page
                    # that changes under us skips the row instead of
                    # aborting the whole run.
                    details_dict = {
                        'NAME': NAME.text, 'REGISTER': REGISTER.text, 'IMO': IMO.text,
                        'TYPE': vessel_type.text, 'LOA': LOA.text, 'LPP': LPP.text,
                        'B': B.text, 'H': H.text, 'T': T.text, 'GT': GT.text,
                        'NT': NT.text, 'DWT': DWT.text, 'Link': link
                    }
                except NoSuchElementException:
                    # An expected page element is missing - presumably the
                    # IMO is not listed on the site. Skip it quietly.
                    pass
                except Exception as exc:
                    # Still best-effort, but no longer silent, and
                    # KeyboardInterrupt is no longer swallowed.
                    tqdm.write(f"IMO {imo_number}: skipped ({type(exc).__name__})")
                else:
                    writer.writerow(details_dict)
                    ship_count += 1

                # Refresh the bar after the row is handled so the displayed
                # count is current.
                progress_bar.update(1)
                progress_bar.set_postfix({'Ships Found ': ship_count})
finally:
    # Close the webdriver, whatever happened above
    driver.quit()

# Display the total number of ships found
print('Total Ships Found:', ship_count)

Processing IMOs: 100%|████████████████████████████████████████████| 1001/1001 [33:54<00:00,  2.03s/it, Ships Found =50]


Total Ships Found: 50
