In [None]:
# Set this to True to scrape only a small slice of the IMO list (rows 70-84)
# for quick debugging; leave it False to scrape the whole list.
debugging = False

### Web scraping data from BalticShipping website

To use this file, all you have to do is two things:

1. Upload IMO input CSV file to "/work/data/webscraping/"
2. Change the name assigned to 'input_file_name' in the cell below.

The code below then scrapes the site for those IMO numbers and saves the results to several output files. Each output file name begins with the name of your input file (without its extension), followed by a suffix that describes the contents and a timestamp.

For example input of:

    imo_list_2021_2022.csv

will result in this output in /work/data/webscraping/:

    imo_list_2021_2022.not_found.2022-10-04_02-15-13.csv
    imo_list_2021_2022.search_result.2022-10-04_02-15-13.csv      # the main file of interest
    imo_list_2021_2022.search_result.2022-10-04_02-15-13.json

In [None]:
# Input source file. Replace this with *** YOUR *** IMO list.
# The file must be in the directory "/work/data/webscraping/" and must have
# an 'imo' column, which is the column the scraper reads.

# Put just the file name; do not include the directory path.

input_file_name = 'imo_list_2020_2021_2022.csv'

Imports and helper functions here.

In [None]:
import pandas as pd
import numpy as np
import requests
import json
import urllib.parse
from codecs import encode
import re
import time
from datetime import timedelta
import subprocess
import datetime



In [None]:
# This is the name that will be used as the base for all output files:
# the input file name with its final extension removed.
# Splitting on the last '.' keeps names that contain '-', spaces or extra
# dots intact (e.g. 'imo-list.v2.csv' -> 'imo-list.v2'); taking only the
# first run of word characters would truncate such a name to 'imo'.
base_file_name = input_file_name.rsplit('.', 1)[0]
base_file_name

In [None]:
# The input CSV is read from, and all output is written to, this directory.
# File names are appended directly with no separator added, so the trailing
# slash is required.
base_directory = '/work/data/webscraping/'
# (unused) alternative: a separate output sub-directory per input file
# base_directory_out = base_directory + base_file_name + '_output/'
# base_directory_out

In [None]:
# helper functions
def display_all(df_all):
    """Render a DataFrame in full, without row or column truncation.

    The display limits are lifted only inside the ``with`` block, so the
    notebook-wide pandas display options are unchanged afterwards.

    Parameters
    ----------
    df_all : pandas.DataFrame
        The frame to show.
    """
    with pd.option_context('display.max_rows', None, 'display.max_columns', None):
        # display() returns None, so it must not be wrapped in print():
        # that would emit a stray "None" line after the table.
        display(df_all)

In [None]:
# Address that get_ship_info() sends its POST requests to.
url = "https://www.balticshipping.com/"

This next function is the main scraping part. We started out trying to use the normal web page where part of the URL is the ship's IMO number. For example this will get the ship named OOCL Korea: https://www.balticshipping.com/vessel/imo/9627992. However, it was challenging getting BeautifulSoup and others to work properly because the web page's javascript delays rendering the page. After some research and debugging using the Google Chrome debugger's network trace, we found that the web page itself was calling a REST API to actually get the data. From that point we played around with figuring out how to call the API correctly. This function get_ship_info() is the final result. 

In [None]:
def get_ship_info(ship_id, timeout=30):
    """Look up one vessel on BalticShipping by its IMO number.

    Sends a single form-encoded POST to the module-level ``url`` asking the
    ``ships`` / ``list`` module for the vessel whose ``imo`` matches, and
    prints a one-line progress message.

    Parameters
    ----------
    ship_id : str, int or float
        IMO identifier. Only the trailing run of digits is used, so
        ``'IMO9627992'``, ``'9627992'``, ``9627992`` and ``9627992.0`` all
        resolve to ``'9627992'``.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30).

    Returns
    -------
    dict
        The ``data`` dict of the first matching ship with its ``gallery``
        entry removed. When the ship is not found, the request fails, the
        response cannot be parsed, or ``ship_id`` contains no digits, the
        placeholder ``{"id": "ERROR", "imo": <id>}`` is returned instead, so
        one bad row never aborts a long scraping run.
    """
    # pandas yields ints (or floats, when the column has blanks) for a purely
    # numeric column; normalise to text so the regex can run, and avoid
    # turning 9627992.0 into the string '9627992.0'.
    if isinstance(ship_id, float) and ship_id.is_integer():
        ship_id = int(ship_id)
    raw_id = str(ship_id).strip()

    digits = re.findall(r'\d+$', raw_id)
    if not digits:
        print('Ship info for IMO# {}...Error: no IMO number in input'.format(raw_id))
        return {"id": "ERROR", "imo": raw_id}
    clean_id = digits[0]

    print('Ship info for IMO# {}...'.format(clean_id), end='')

    # Form fields are reproduced verbatim from the request that is known to
    # work against the site.
    # NOTE(review): the 'templates[]=...' and '&dictionary[]=...' keys look
    # like fragments of a pasted query string; they are kept exactly as-is
    # because the accepted request depends on them -- confirm before tidying.
    payload = {
        'templates[]=modal_validation_errors': 0,
        'templates[]=modal_email_verificate': 0,
        'templates[]=r_vessel_types_multi': 0,
        'templates[]=r_positions_single': 0,
        'templates[]=vessel_profile': 0,
        'request[0][module]': 'ships',
        'request[0][action]': 'list',
        'request[0][id]': 0,
        'request[0][data][0][name]': 'imo',
        'request[0][data][0][value]': clean_id,
        'request[0][sort]': '',
        'request[0][limit]': 1,
        'request[0][stamp]': 0,
        'request[1][module]': 'top_stat',
        'request[1][action]': 'list',
        'request[1][id]': 1,
        'request[1][data]': '',
        'request[1][sort]': '',
        'request[1][limit]': '',
        'request[1][stamp]': 0,
        'dictionary[]=countrys': 0,
        '&dictionary[]=vessel_types': 0,
        '&dictionary[]=positions': 0
    }

    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/105.0.0.0 Safari/537.36',
        'Accept': '*/*',
        'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8'
    }

    # Returned unchanged unless a ship is successfully parsed below.
    ship = {
        "id": "ERROR",
        "imo": clean_id,
    }

    # A timeout keeps one stalled connection from hanging the whole run, and
    # catching the failure here records the ship as an error instead of
    # discarding every result gathered so far.
    try:
        response = requests.request("POST", url, headers=headers, data=payload, timeout=timeout)
    except Exception as e:
        print('Error request:', e)
        return ship

    # parse out the ship's data
    try:
        j = response.json()
        first_result = j['data']['request'][0]
        if first_result['ships_found'] > 0:
            ship = first_result['ships'][0]['data']
            # The photo gallery is bulky and not needed in the output.
            if 'gallery' in ship:
                del ship['gallery']
            print("Found.")
        else:
            print("Error: Not Found")
    except Exception as e:
        print('Error parse json:', e)

    return ship


In [None]:
def get_all_ship_info(imos):
    """Scrape every IMO in ``imos`` one after another.

    Returns a list with the get_ship_info() result for each input, in the
    same order as the inputs.
    """
    return [get_ship_info(imo) for imo in imos]

### Web scrape! Get all ship info for the input IMO list above.

This part takes the longest: for 2700+ ships it takes about 19 minutes. You may need to make sure your computer does not go to sleep while it runs.

In [None]:
# Load the IMO list from the working directory, then show the ids that will
# be scraped together with the frame's dimensions.
input_path = '{}{}'.format(base_directory, input_file_name)
df_imos = pd.read_csv(input_path)
(df_imos.imo.values, df_imos.shape)

In [None]:
%%time

# A debug run scrapes only rows 70-84, a slice of the 2021-2022 list that is
# a good example of what found and not found look like; a normal run
# scrapes every IMO.
all_ships = get_all_ship_info(
    df_imos.imo.values[70:85] if debugging else df_imos.imo.values
)

### Save scraped data to JSON

We just did a time-consuming data download and don't want to lose it if the system crashes, so do a raw save to a JSON file just in case. It is also useful for cross-referencing and debugging.

In [None]:
# Embed the run time in the output file name.
# NOTE(review): the fixed 7-hour shift presumably converts a UTC kernel clock
# to local time -- confirm; it does not follow daylight-saving changes.
shifted_now = datetime.datetime.now() - timedelta(hours=7)
str_datetime_now = shifted_now.strftime("%Y-%m-%d_%H-%M-%S")
filename = '{}{}.search_result.{}.json'.format(base_directory, base_file_name, str_datetime_now)
print(filename)

# Serialise the whole list to text first, then write it out in one go.
all_ships_json = json.dumps(all_ships, indent=4)
with open(filename, 'w') as f:
    f.write(all_ships_json)

### Save scraped data to data frame then to CSV

This is the main file of interest for data cleaning and merging later. Convert the JSON records to a data frame and replace our 'ERROR' ids with NaN so that they are saved as empty cells, which read back in from the CSV as NaN.

In [None]:
# One row per scraped ship. Keys missing from a record (such as the two-key
# 'ERROR' placeholders) become NaN in the corresponding columns.
df_all = pd.DataFrame(all_ships)
df_all.shape

In [None]:
# Replace the 'ERROR' placeholder ids with NaN so they are written to the CSV
# as empty cells. np.nan (lowercase) is used because the np.NaN alias was
# removed in NumPy 2.0 and raises AttributeError there.
df_all['id'] = np.where(df_all['id'] == 'ERROR', np.nan, df_all['id'])
print(df_all.shape)
df_all

In [None]:
# Embed the run time in the output file name.
# NOTE(review): the fixed 7-hour shift presumably converts a UTC kernel clock
# to local time -- confirm; it does not follow daylight-saving changes.
shifted_now = datetime.datetime.now() - timedelta(hours=7)
str_datetime_now = shifted_now.strftime("%Y-%m-%d_%H-%M-%S")
filename = '{}{}.search_result.{}.csv'.format(base_directory, base_file_name, str_datetime_now)
print(filename)

# The row index carries no information, so it is left out of the file.
df_all.to_csv(filename, index=False)

Read it back from the file just to check that the NaNs were stored properly.

In [None]:
# Re-read the CSV that was just written to check that it round-trips
# (the blanked ids should load as NaN).
df_temp = pd.read_csv(filename)
df_temp

### Save list of ships not found to CSV

In [None]:
# Ships whose id was blanked are the ones the scrape could not find; keep
# only their IMO numbers, renumbered from zero.
df_imo_not_found = (
    df_all.loc[df_all['id'].isna(), ['imo']]
    .copy()
    .reset_index(drop=True)
)
df_imo_not_found

In [None]:
# Embed the run time in the output file name.
# NOTE(review): the fixed 7-hour shift presumably converts a UTC kernel clock
# to local time -- confirm; it does not follow daylight-saving changes.
str_datetime_now = (datetime.datetime.now() - timedelta(hours=7)).strftime("%Y-%m-%d_%H-%M-%S")
filename = '{}{}.not_found.{}.csv'.format(base_directory, base_file_name, str_datetime_now)
print(filename)
# index=False matches the main search-result CSV: the index was just reset
# to 0..n-1, so writing it would only add a meaningless unnamed column.
df_imo_not_found.to_csv(filename, index=False)

### Done! 

Check "/work/data/webscraping/" for your output files.

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=6b18b33d-3a56-4f49-ad6e-71ecea9f0183' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>