## Web scraping of RouteScanner

A first notebook for web scraping container-shipping connection data from Routescanner.

In [1]:
# Workaround for a PyCharm interpreter that does not pick up the virtual
# environment: add the venv and its site-packages to the import path by hand.
ewouts_broken_PyCharm = True
if ewouts_broken_PyCharm:
    import sys
    sys.path.extend([
        'C:\\Users\\Ewout\\Documents\\python_venv\\Py310',
        'C:\\Users\\Ewout\\Documents\\python_venv\\Py310\\lib\\site-packages',
    ])

### Direct connections

In this part, a DataFrame is created with all direct connections between sea terminals in South America and the Netherlands/Belgium.

Data: https://www.routescanner.com/services/direct-connections

In [2]:
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from bs4 import BeautifulSoup
import os
import itertools
import pandas as pd
import pickle
from numpy import random
from time import sleep

#### Webscraping

In [3]:
# Country codes of the origins and the destinations
origin = ["BR", "CO", "VE", "SR", "CW", "GY", "GF", "UY", "AR", "CL", "PE", "EC"]
destination = ["NL", "BE"]

# Every (origin, destination) pair, grouped per origin
od_list = [(o, d) for o in origin for d in destination]
print(od_list)

# One direct-connections results URL per country pair
urls = [
    f"https://www.routescanner.com/services/direct-connections/results?destinationCountries={d}&originCountries={o}&search=advanced"
    for o, d in od_list
]

[('BR', 'NL'), ('BR', 'BE'), ('CO', 'NL'), ('CO', 'BE'), ('VE', 'NL'), ('VE', 'BE'), ('SR', 'NL'), ('SR', 'BE'), ('CW', 'NL'), ('CW', 'BE'), ('GY', 'NL'), ('GY', 'BE'), ('GF', 'NL'), ('GF', 'BE'), ('UY', 'NL'), ('UY', 'BE'), ('AR', 'NL'), ('AR', 'BE'), ('CL', 'NL'), ('CL', 'BE'), ('PE', 'NL'), ('PE', 'BE'), ('EC', 'NL'), ('EC', 'BE')]


In [4]:
def get_webpage(url, headless=False):
    """Scrape all result pages of one direct-connections search.

    Opens ``url`` in Chrome, accepts the cookie banner and collects the
    rendered HTML of the first result page. When more than 20 results are
    reported, the follow-up pages are requested as ``&page=2``, ``&page=3``, ...

    Parameters
    ----------
    url : str
        Search-results URL to load.
    headless : bool, default False
        Run Chrome without a visible window.

    Returns
    -------
    soups : list of BeautifulSoup
        One parsed document per result page that was loaded.
    n_results : int
        Number of results reported on the first page; 0 when the page shows
        no result counter.
    """
    results_per_page = 20

    # Instantiate options
    opts = Options()
    # Raw string, so the backslashes are not read as (invalid) escape sequences.
    # Only point Selenium at this binary when it exists; otherwise Selenium
    # locates Chrome itself.
    chrome_binary = r"C:\Program Files\Google\Chrome\Application\chrome.exe"
    if os.path.exists(chrome_binary):
        opts.binary_location = chrome_binary
    opts.add_argument("window-size=2880,1920")
    if headless:
        # The Options.headless property is deprecated in Selenium 4; the
        # command-line switch works across versions.
        opts.add_argument("--headless")

    # Set the location of the webdriver
    s = Service(os.path.join(os.getcwd(), "drivers", "chromedriver.exe"))

    # Instantiate a webdriver
    driver = webdriver.Chrome(options=opts, service=s)
    try:
        # Load the HTML page
        driver.get(url)

        # Accept cookies
        driver.implicitly_wait(2)
        driver.find_element(By.CLASS_NAME, "acceptButton__P2szu").click()

        # Function to get soup from the driver
        def get_soup():
            # Wait until route data is loaded; a page without route cards is
            # still parsed and returned.
            try:
                WebDriverWait(driver, 3).until(EC.presence_of_element_located((By.CLASS_NAME, 'card__hoI9D')))
            except Exception:
                print('No connections found on this route')

            # Parse the rendered page with an explicit parser, so the result
            # does not depend on which parsers happen to be installed
            return BeautifulSoup(driver.page_source, features="html.parser")

        # Save the first soup in a new list
        soups = [get_soup()]

        # Read the number of results; the counter may be missing when nothing was found
        counter = soups[0].find("h6", class_="totalResults__mTaYp")
        n_results = int(counter.text.split(' ')[0]) if counter is not None else 0

        # Get results from other pages
        if n_results > results_per_page:
            # Number of pages beyond the first one
            extra_pages = (n_results - 1) // results_per_page
            for n in range(2, extra_pages + 2):
                # Go to extra page, get soup and append to soups list
                driver.get(f"{url}&page={n}")
                soups.append(get_soup())

        return soups, n_results
    finally:
        # quit() ends the whole browser session (close() only closes the
        # window), and runs even when scraping raised an exception
        driver.quit()

In [5]:
# A test run with a single webpage
soups1, n1 = get_webpage(urls[0])
print(f"{n1} results across {len(soups1)} pages")
# Preview only the start of the first page: the complete HTML document is
# far too large to keep in the notebook output
print(str(soups1[0])[:1000])

31 results across 2 pages


<html lang="en" translate="no" xmlns:fb="http://ogp.me/ns/fb#"><head><meta content="width=device-width,initial-scale=1,shrink-to-fit=no" name="viewport"/><meta content="notranslate" name="google"/><meta charset="utf-8"/><title>Explore direct container connections from one place to another</title><meta content="Find the best door-to-door shipping routes by sea and inland. Compare options on lead time and CO2 emissions." name="description"/><meta content="routescanner, container shipping routes, maritime route, routeplanner, shortsea connections, deepsea connections, rail connections, barge connections" name="keywords"/><meta content="For the best container shipping route comparison, use Routescanner to find the best connection by deepsea, feeder, rail, barge and truck." itemprop="name"/><meta content="Routescanner – Make better container shipping decisions" property="og:title"/><meta content="Routescanner – Plan your door-to-door container shipping route" property="og:site_name"/><meta 

#### Extract data from HTML

In [6]:
# Returns route data for list of routes
def get_route_data(soups):
    """Extract one row per route card from scraped direct-connection pages.

    Parameters
    ----------
    soups : iterable of BeautifulSoup
        Parsed result pages, e.g. the first return value of ``get_webpage``.

    Returns
    -------
    list of list
        One ``[company, origin, destination, service code, duration, frequency]``
        row per route card. ``frequency`` (and ``duration``) are ``None`` when
        the card does not list them.
    """
    n_time_fields = 2  # duration and frequency

    route_data = []
    for soup in soups:
        # Every route is rendered as one card
        for route in soup.find_all("div", class_="card__hoI9D"):
            company = route.find("h6").text
            o, d = route.find("small", class_="locodes__tdIbs").text.split(' - ')
            service = route.find("p", class_="serviceCode__igooW").text
            times = [div.text for div in route.find("div", class_="times__xgwdu").find_all("div")]
            # Not every card shows a frequency. Pad (or trim) to a fixed length so
            # that every row has six fields; ragged rows make the DataFrame
            # constructor fail when no row at all has the full length.
            times = (times + [None] * n_time_fields)[:n_time_fields]
            route_data.append([company, o, d, service, *times])
    return route_data

In [7]:
# Extract the routes from the pages of the test run
data1 = get_route_data(soups1)

# Column names, in the order of the fields of each route row
columns = ["Company", "Origin", "Destination", "Service Code", "Duration", "Frequency"]

# Build a DataFrame from the sample and preview its first rows
df1 = pd.DataFrame(data=data1, columns=columns)
print(f"Created a DataFrame with {df1.index.size} entries")
df1.head(5)

Created a DataFrame with 40 entries


Unnamed: 0,Company,Origin,Destination,Service Code,Duration,Frequency
0,CMA CGM,BRFOR,NLRTM,North Europe French Guiana North Brazil,17 days,
1,MSC,BRSSZ,NLRTM,NWC TO SAEC - STRING I,18 - 19 days,1 time per week
2,Hapag-Lloyd,BRPEC,NLRTM,ECX-EUROPE EAST COAST EXPRESS,10 - 11 days,1 time per week
3,Hapag-Lloyd,BRSSZ,NLRTM,ECX-EUROPE EAST COAST EXPRESS,18 - 19 days,1 time per week
4,MSC,BRSSA,NLRTM,NWC TO SAEC - STRING I,13 - 14 days,1 time per week


#### Run on all routes

In [8]:
# The data can be loaded from a pickle, so the script doesn't need to re-run every time.
# Set use_pickle1 to False if you want to scrape all the data again.
# NOTE: only load pickle files you created yourself; unpickling untrusted files can run arbitrary code.
use_pickle1 = True
routes_pickle = "pickles/routes_between_ports.pickle"

if use_pickle1 and os.path.exists(routes_pickle):
    route_df = pd.read_pickle(routes_pickle)

else:
    if use_pickle1:
        # No cached data yet (e.g. a fresh checkout): scrape instead of failing
        print(f"{routes_pickle} not found, scraping the data instead")

    # Empty list for data and len
    route_data = []
    combinations = len(urls)

    for n, url in enumerate(urls):
        # Scrape the webpage
        soups, _ = get_webpage(url, headless=True)
        # Get all the route data, process them and add (in place, no list copy per page)
        new_route_data = get_route_data(soups)
        route_data.extend(new_route_data)
        # Nice print message
        print(f"Done with {n+1}/{combinations} webpages, {len(new_route_data)} new routes found (total {len(route_data)})")

    # Create a dataframe from all the route data
    route_df = pd.DataFrame(route_data, columns=columns)
route_df.head()

Unnamed: 0,Company,Origin,Destination,Service Code,Duration,Frequency
0,MSC,BRRIG,NLRTM,NWC TO SAEC - STRING I,25 days,
1,Hamburg Sud,BRPNG,NLRTM,SOUTH AMERICA EAST COAST -- NORTH EUROPE SLING...,21 days,
2,MSC,BRPNG,NLRTM,NWC TO SAEC - STRING I,21 days,1 time per week
3,Hapag-Lloyd,BRPNG,NLRTM,ECX-EUROPE EAST COAST EXPRESS,20 - 21 days,1 time per week
4,CMA CGM,BRSSZ,NLRTM,SOUTH AMERICA FR. NORTH EUR. PLATE SLING,19 days,


In [9]:
# Save as Pickle and CSV; create the output folders first so a fresh checkout does not fail
os.makedirs("pickles", exist_ok=True)
os.makedirs("data", exist_ok=True)
route_df.to_pickle("pickles/routes_between_ports.pickle")
route_df.to_csv("data/routes_between_ports.csv")

### Planned routes

Scrapes planned connections from Routescanner and collects them in a DataFrame.

Data: https://www.routescanner.com/voyages

In [25]:
# Unique origin and destination ports that occur in the direct connections
o_ports, d_ports = (route_df[col].unique() for col in ("Origin", "Destination"))
print(f"{len(o_ports)} origin ports: {o_ports}\n{len(d_ports)} destination ports: {d_ports}")

# Pair every origin port with every destination port
od_ports = [(o_port, d_port) for o_port in o_ports for d_port in d_ports]
print(f"{len(od_ports)} combinations of ports")

25 origin ports: ['BRRIG' 'BRPNG' 'BRSSZ' 'BRPEC' 'BRRIO' 'BRITJ' 'BRNAT' 'BRSSA' 'BRFOR'
 'COCTG' 'COTRB' 'COSMR' 'SRPBM' 'CWWIL' 'GFDDC' 'UYMVD' 'ARBUE' 'CLSAI'
 'CLVAP' 'CLCNL' 'PECLL' 'PEPAI' 'ECPSJ' 'ECGYE' 'ECPBO']
4 destination ports: ['NLRTM' 'BEANR' 'NLVLI' 'BEZEE']
100 combinations of ports


In [26]:
# The search date defaults to the day on which the notebook is run
from datetime import date
today = date.today()
print(today.isoformat())

2022-10-13


In [27]:
# Search criteria used for every planned-voyages query
limit = 100
date = today             # Change this to anything later than today, in "YYYY-MM-DD" format
                         # NOTE: this rebinds `date` (the imported datetime class) to the search date
sort_on = "transfers"    # Sort options (str): "emission_co2", "transfers", "arrival", "duration"
modalities = ["sea"]     # Modalities options (list): ["sea", "barge", "rail", "truck"]
modalaties_string = "%2C".join(modalities)   # "%2C" is a URL-encoded comma

# Create list of URLs to scrape
def generate_urls2(od_ports=od_ports):
    """Return one voyages-search URL per (origin port, destination port) pair.

    The search criteria (limit, date, sort_on, modalaties_string) are read
    from the module-level variables at the moment the function is called.
    The default for od_ports is the list that exists when this cell is run.
    """
    voyage_urls = []
    for o_port, d_port in od_ports:
        voyage_urls.append(f"https://www.routescanner.com/voyages?limit={limit}&from={o_port}&fromType=locode&originsNearby=1&to={d_port}&toType=locode&destinationsNearby=1&departure={date}&sort={sort_on}&modalities={modalaties_string}")
    return voyage_urls

urls2 = generate_urls2()
print(urls2[0])

https://www.routescanner.com/voyages?limit=100&from=BRRIG&fromType=locode&originsNearby=1&to=NLRTM&toType=locode&destinationsNearby=1&departure=2022-10-13&sort=transfers&modalities=sea


In [28]:
# Keep every URL together with the (origin, destination) port pair it was built from
urls12 = generate_urls2(od_ports)
combis = [(u, od) for u, od in zip(urls12, od_ports)]
print(combis[0])

('https://www.routescanner.com/voyages?limit=100&from=BRRIG&fromType=locode&originsNearby=1&to=NLRTM&toType=locode&destinationsNearby=1&departure=2022-10-13&sort=transfers&modalities=sea', ('BRRIG', 'NLRTM'))


In [29]:
def get_webpages2(od_ports, headless=True):
    """Scrape the planned-voyages page for every origin/destination port pair.

    One Chrome session is reused for a few pages and then replaced by a fresh
    one. A page that fails to load is retried with a growing pause between
    the attempts; scraping stops early when a page keeps failing or the pause
    has grown too long.

    Parameters
    ----------
    od_ports : list of tuple
        (origin port, destination port) pairs, turned into URLs by
        ``generate_urls2``.
    headless : bool, default True
        Run Chrome without a visible window.

    Returns
    -------
    list of (BeautifulSoup, tuple)
        The parsed page and its port pair for every page that was scraped.
        Shorter than ``od_ports`` when scraping stopped early.
    """
    urls = generate_urls2(od_ports)
    u_od_zip = list(zip(urls, od_ports))
    if not u_od_zip:
        # Nothing to scrape, so do not start a browser at all
        return []

    max_driver_age = 4  # pages scraped with one browser session before it is replaced

    # Instantiate options
    opts = Options()
    # Only point Selenium at this binary when it exists; otherwise Selenium
    # locates Chrome itself (e.g. on Windows or macOS).
    chrome_binary = "/usr/bin/google-chrome"
    if os.path.exists(chrome_binary):
        opts.binary_location = chrome_binary
    if headless:
        # The Options.headless property is deprecated in Selenium 4; the
        # command-line switch works across versions.
        opts.add_argument('--headless')
    opts.add_argument('--remote-debugging-port=9222')
    opts.add_argument('--disable-gpu')
    opts.add_argument("--window-size=2880,2160")

    # Some options to make Chrome (hopefully) look less like an automated browser
    opts.add_argument('--disable-blink-features=AutomationControlled')
    opts.add_experimental_option('useAutomationExtension', False)
    opts.add_experimental_option("excludeSwitches", ["enable-automation"])

    # Set the location of the webdriver
    os.environ["WDM_PROGRESS_BAR"] = '0'
    chrome_service = Service(ChromeDriverManager().install())

    def quit_browser(browser):
        """Shut a browser down; errors are ignored because it may already be gone."""
        try:
            browser.quit()
        except Exception:
            pass

    def start_browser(url):
        """Open a new browser on ``url`` and accept the cookie banner."""
        print("(Re)starting browser")
        # Instantiate a webdriver
        new_driver = webdriver.Chrome(options=opts, service=chrome_service)
        try:
            # Load the HTML page
            new_driver.get(url)
            # Accept cookies
            new_driver.implicitly_wait(5)
            new_driver.find_element(By.CLASS_NAME, "acceptButton__P2szu").click()
        except Exception:
            # Do not leave a half-initialised browser running
            quit_browser(new_driver)
            raise
        return new_driver

    driver = start_browser(urls[0])
    # True right after a browser (re)start: the page for the current URL is
    # already loaded and must not be requested a second time
    already_got = True

    soups = []
    sleeptime = 20
    driver_age = 0
    try:
        for n, (url, od) in enumerate(u_od_zip):
            i = 1
            while True:  # Keep trying until successfully scraped
                try:
                    # If there is no usable browser, or it is too old, first (re)start it
                    if driver is None or driver_age >= max_driver_age:
                        if driver is not None:
                            quit_browser(driver)
                            driver = None
                        driver_age = 0
                        driver = start_browser(url)
                        already_got = True
                    # Load the HTML page, if not done earlier by a browser start
                    if not already_got:
                        driver.get(url)
                    already_got = False
                    # Wait until route data is loaded
                    WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.CLASS_NAME, "voyages__NVlid")))
                    # Parse processed webpage with BeautifulSoup
                    soup = BeautifulSoup(driver.page_source, features="html.parser")
                except Exception:
                    # `Exception` (not a bare except), so Ctrl-C still interrupts the run
                    if i > 6 or sleeptime > 120:
                        print(f"Stopped after {i} sequential failed attempts. {n} routes successfully collected.")
                        return soups
                    sleeptime = sleeptime + 10
                    print(f"Failed attempt {i} on route {n+1}, trying again (sleeptime {sleeptime})")

                    i += 1
                    # Drop the browser after 2 unsuccessful tries, or if it got old.
                    # The new one is started at the top of the next attempt, where
                    # a failing start is retried instead of aborting the whole run.
                    if i % 3 == 0 or driver_age >= max_driver_age:
                        if driver is not None:
                            quit_browser(driver)
                            driver = None
                        driver_age = 0
                    sleep(random.uniform(sleeptime, sleeptime+2))
                    continue

                # Success: store the page, age the driver and decrease the sleeptime
                soups.append((soup, od))
                driver_age += 1
                sleeptime = max(7, sleeptime - 3)
                # Print and sleep
                print(f"Scraped route {n+1}/{len(u_od_zip)}, took {i} tries (sleeptime {sleeptime})")
                sleep(random.uniform(sleeptime, sleeptime+2))
                break
    finally:
        # Always shut the browser down: normal end, early stop, or an interrupt
        if driver is not None:
            quit_browser(driver)

    return soups

In [30]:
# Returns connection data for a list of (soup, (origin, destination)) pairs
def get_route_data2(soups):
    """Extract one row per voyage from scraped planned-voyages pages.

    Parameters
    ----------
    soups : iterable of (soup, (origin, destination))
        Parsed pages paired with the ports they were requested for.

    Returns
    -------
    list of list
        One ``[lead time, transfers, origin, destination, departure, arrival,
        co2, ports, carriers]`` row per voyage. ``departure`` and ``arrival``
        are lists of text fragments, ``ports`` holds one list of ports per
        leg, and ``carriers`` is a list of carrier names.
    """
    def field_parts(field):
        # Text of every span in a time field, without the ' - ' separators
        return [span.text for span in field.find_all("span") if span.text != ' - ']

    def leg_ports(leg):
        # Origin, intermediate stops and destination of a single leg, in order
        first = leg.find("li", class_="origin__Vu5JN").text
        stops = [stop.text for stop in leg.find_all("span", class_="stopLocode__V9cgh")]
        last = leg.find("li", class_="destination__lN9tN").text
        return [first, *stops, last]

    rows = []
    for soup, od in soups:
        # Every voyage is rendered as one list item
        voyages = soup.find_all("li", class_="voyage voyage__e0GtV")
        o, d = od

        for voyage in voyages:
            # Lead time is the first plain meta column
            lead = voyage.find("li", class_="voyageMetaColumn__qkBJv").text

            # The two small meta columns hold the transfers and the CO2 figure
            small = voyage.find_all("li", class_="voyageMetaColumn__qkBJv voyageMetaColumnSmall__QMXjZ", limit=2)
            transfers = small[0].text
            co2 = small[1].text

            # Departure and arrival are the two smaller-text meta columns
            time_fields = voyage.find_all("li", class_="voyageMetaColumn__qkBJv smallerText__ED-90", limit=2)
            dep = field_parts(time_fields[0])
            arr = field_parts(time_fields[1])

            companies = [tag.text for tag in voyage.find_all("span", class_="carrierNames__Dn5bC")]

            # One list of ports per leg of the voyage
            ports = [leg_ports(leg) for leg in voyage.find_all("ul", class_="leg__9DiZ9")]

            # A voyage with k transfers should consist of k+1 legs
            if int(transfers) + 1 != len(ports):
                print("Warning: Ports data may be incomplete")

            rows.append([lead, transfers, o, d, dep, arr, co2, ports, companies])
    return rows

In [17]:
# Scrape the first port pair only, as a small sample
soup_od_2 = get_webpages2([od_ports[0]])

# Extract the connections from the sample
data2 = get_route_data2(soup_od_2)

# Column names, in the order of the fields of each connection row
columns2 = ["Lead time", "Transfers", "Origin", "Destination", "Departure", "Arrival", "kg CO2e/TEU", "Ports", "Carriers"]

# Build a DataFrame from the sample and show it
df2 = pd.DataFrame(data=data2, columns=columns2)
print(f"Created a DataFrame with {df2.index.size} entries")
df2

Scraped route 1/1, took 1 tries (sleeptime 2)
Created a DataFrame with 8 entries


Unnamed: 0,Lead time,Transfers,Origin,Destination,Departure,Arrival,kg CO2e/TEU,Ports,Carriers
0,26 days,1,BRRIG,NLRTM,"[21 OCT, 06:00, UTC-03:00]","[16 NOV, 10:00, UTC+01:00]",1060,"[[RIG, ITJ], [ITJ, PNG, SSZ, RIO, SSA, PEC, PT...","[Hapag-Lloyd, Maersk, ONE, Hapag-Lloyd, MSC]"
1,31 days,1,BRRIG,NLRTM,"[18 OCT, 00:00, UTC-03:00]","[18 NOV, 00:00, UTC+01:00]",1065,"[[RIG, IOA, ITJ, SSZ, SPB, PTM], [PTM, TNG, RTM]]","[CMA CGM, Hamburg Sud, Maersk, CMA CGM, OOCL]"
2,23 days,1,BRRIG,NLRTM,"[19 OCT, 18:02, UTC-03:00]","[11 NOV, 00:00, UTC+01:00]",1695,"[[RIG, PNG], [PNG, SSZ, PTM, TNG, LGP, RTM]]","[CMA CGM, Hamburg Sud, Maersk]"
3,36 days,1,BRRIG,NLRTM,"[18 OCT, 07:00, UTC-03:00]","[22 NOV, 16:40, UTC+01:00]",1725,"[[RIG, PNG, ITJ, SSZ, RIO, SSA, SUA, CTB], [CT...","[MSC, MSC]"
4,32 days,1,BRRIG,NLRTM,"[21 OCT, 06:00, UTC-03:00]","[22 NOV, 07:00, UTC+01:00]",1785,"[[RIG, ITJ, SSZ, RIO, CTG], [CTG, CAU, RTM]]","[Hapag-Lloyd, Maersk, MSC, ONE, COSCO, Hapag-L..."
5,34 days,2,BRRIG,NLRTM,"[15 OCT, 17:00, UTC-03:00]","[18 NOV, 00:41, UTC+01:00]",1205,"[[RIG, ITJ, PNG, SSZ, RIO, SSA, PEC, PTM], [PT...","[Hapag-Lloyd, Hapag-Lloyd]"
6,37 days,2,BRRIG,NLRTM,"[17 OCT, 12:33, UTC-03:00]","[23 NOV, 11:51, UTC+01:00]",1505,"[[RIG, MVD], [MVD, ANR], [ANR, RTM]]",[MSC]
7,34 days,2,BRRIG,NLRTM,"[18 OCT, 07:00, UTC-03:00]","[21 NOV, 09:26, UTC+01:00]",1975,"[[RIG, PNG, ITJ, SSZ, RIO, SSA, SUA, CTB, CAU]...","[MSC, MSC]"


In [31]:
# The data can be loaded from a pickle, so the script doesn't need to re-run every time.
# Set use_pickle2 to False if you want to scrape all the data again.
# NOTE: only load pickle files you created yourself; unpickling untrusted files can run arbitrary code.
use_pickle2 = True
# The file name contains the search date, so a pickle only exists for dates that were scraped before
connections_pickle = f"pickles/connections_{date}.pickle"

if use_pickle2 and os.path.exists(connections_pickle):
    connection_df = pd.read_pickle(connections_pickle)

else:
    if use_pickle2:
        # No cached data for this date: scrape instead of failing
        print(f"{connections_pickle} not found, scraping the data instead")

    # Scrape the webpage
    soups_ods_2 = get_webpages2(od_ports, headless=True)
    # Get all the connection data, process them and add
    connection_data = get_route_data2(soups_ods_2)

    # Create a dataframe from all the route data
    connection_df = pd.DataFrame(connection_data, columns=columns2)

print(f"DataFrame has {connection_df.index.size} entries")
connection_df.head()

Scraped route 1/100, took 1 tries (sleeptime 17)
Scraped route 2/100, took 1 tries (sleeptime 14)
Failed attempt 1 on route 3, trying again (sleeptime 24)
Scraped route 3/100, took 2 tries (sleeptime 21)
Failed attempt 1 on route 4, trying again (sleeptime 31)
Scraped route 4/100, took 2 tries (sleeptime 28)
Scraped route 5/100, took 1 tries (sleeptime 25)
Failed attempt 1 on route 6, trying again (sleeptime 35)
Scraped route 6/100, took 2 tries (sleeptime 32)
Failed attempt 1 on route 7, trying again (sleeptime 42)
Failed attempt 2 on route 7, trying again (sleeptime 52)
Restarting browser
Scraped route 7/100, took 3 tries (sleeptime 49)
Scraped route 8/100, took 1 tries (sleeptime 46)
Scraped route 9/100, took 1 tries (sleeptime 43)
Scraped route 10/100, took 1 tries (sleeptime 40)
Scraped route 11/100, took 1 tries (sleeptime 37)
Failed attempt 1 on route 12, trying again (sleeptime 47)
Failed attempt 2 on route 12, trying again (sleeptime 57)
Restarting browser
Scraped route 12/100

Unnamed: 0,Lead time,Transfers,Origin,Destination,Departure,Arrival,kg CO2e/TEU,Ports,Carriers
0,26 days,1,BRRIG,NLRTM,"[21 OCT, 06:00, UTC-03:00]","[16 NOV, 10:00, UTC+01:00]",1060,"[[RIG, ITJ], [ITJ, PNG, SSZ, RIO, SSA, PEC, PT...","[Hapag-Lloyd, Maersk, ONE, Hapag-Lloyd, MSC]"
1,31 days,1,BRRIG,NLRTM,"[18 OCT, 00:00, UTC-03:00]","[18 NOV, 00:00, UTC+01:00]",1065,"[[RIG, IOA, ITJ, SSZ, SPB, PTM], [PTM, TNG, RTM]]","[CMA CGM, Hamburg Sud, Maersk, CMA CGM, OOCL]"
2,23 days,1,BRRIG,NLRTM,"[19 OCT, 18:02, UTC-03:00]","[11 NOV, 00:00, UTC+01:00]",1695,"[[RIG, PNG], [PNG, SSZ, PTM, TNG, LGP, RTM]]","[CMA CGM, Hamburg Sud, Maersk]"
3,36 days,1,BRRIG,NLRTM,"[18 OCT, 07:00, UTC-03:00]","[22 NOV, 16:40, UTC+01:00]",1725,"[[RIG, PNG, ITJ, SSZ, RIO, SSA, SUA, CTB], [CT...","[MSC, MSC]"
4,32 days,1,BRRIG,NLRTM,"[21 OCT, 06:00, UTC-03:00]","[22 NOV, 07:00, UTC+01:00]",1785,"[[RIG, ITJ, SSZ, RIO, CTG], [CTG, CAU, RTM]]","[Hapag-Lloyd, Maersk, MSC, ONE, COSCO, Hapag-L..."


In [32]:
# Save as Pickle and CSV; create the output folders first so a fresh checkout does not fail
os.makedirs("pickles", exist_ok=True)
os.makedirs("data", exist_ok=True)
connection_df.to_pickle(f"pickles/connections_{date}.pickle")
connection_df.to_csv(f"data/connections_{date}.csv")