## Web scraping of RouteScanner

A first notebook to web scrape Route Scanner.

In [1]:
# Quick and dirty hack because my PyCharm interpreter is not playing nice
ewouts_broken_PyCharm = True
if ewouts_broken_PyCharm:
    import sys
    sys.path.append('C:\\Users\\Ewout\\Documents\\python_venv\\Py310')
    sys.path.append('C:\\Users\\Ewout\\Documents\\python_venv\\Py310\\lib\\site-packages')

### Direct connections

In this part a dataframe is created with all direct connections between sea terminals in South America and The Netherlands/Belgium.

Data: https://www.routescanner.com/services/direct-connections

**TODO:** Make sure more than 20 routes per webpage can be scraped (the results on page 2 and onwards)

In [2]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup
import os
import itertools
import pandas as pd
import pickle

#### Webscraping

In [3]:
# Define origin and destination countries
origin = ["BR", "CO", "VE", "SR", "CW", "GY", "GF", "UY", "AR", "CL", "PE", "EC"]
destination = ["NL", "BE"]

# Make list with all combinations
od_list = list(itertools.product(origin, destination))
print(od_list)

# Create URL list to scrape
urls = [f"https://www.routescanner.com/services/direct-connections/results?destinationCountries={o}&originCountries={d}&search=advanced" for o, d in od_list]

[('BR', 'NL'), ('BR', 'BE'), ('CO', 'NL'), ('CO', 'BE'), ('VE', 'NL'), ('VE', 'BE'), ('SR', 'NL'), ('SR', 'BE'), ('CW', 'NL'), ('CW', 'BE'), ('GY', 'NL'), ('GY', 'BE'), ('GF', 'NL'), ('GF', 'BE'), ('UY', 'NL'), ('UY', 'BE'), ('AR', 'NL'), ('AR', 'BE'), ('CL', 'NL'), ('CL', 'BE'), ('PE', 'NL'), ('PE', 'BE'), ('EC', 'NL'), ('EC', 'BE')]


In [4]:
def get_webpage(url, headless=False):
    # Instantiate options
    opts = Options()
    opts.binary_location = "C:\Program Files\Google\Chrome\Application\chrome.exe"
    if headless:
        opts.headless = True

    # Set the location of the webdriver
    chrome_driver = os.getcwd() + "/drivers/chromedriver.exe"

    # Instantiate a webdriver
    driver = webdriver.Chrome(options=opts, executable_path=chrome_driver)

    # Load the HTML page
    driver.get(url)

    # Accept cookies
    driver.implicitly_wait(2)
    driver.find_element(By.CLASS_NAME,"acceptButton__P2szu").click()

    # Wait untill route data is loaded
    try:
        elem = WebDriverWait(driver, 3).until(EC.presence_of_element_located((By.CLASS_NAME, 'card__hoI9D')))
    except:
        print('No connections found on this route')

    # Parse processed webpage with BeautifulSoup
    soup = BeautifulSoup(driver.page_source)

    # Close the browser
    driver.close()
    return soup

In [5]:
# A test run with a single webpage
soup1 = get_webpage(urls[0])

  driver = webdriver.Chrome(options=opts, executable_path=chrome_driver)


#### Extract data from HTML

In [6]:
# Returns route data for list of routes
def get_route_data(soup):
    # Create a list with all route cards
    routes = soup.find_all("div", class_="card__hoI9D")

    # Create empty list for the route data
    route_data = []

    # For each route, find the company, origin, destination, service code, duration and if known frequency
    for route in routes:
        c = route.find("h6").text
        o, d = route.find("small", class_="locodes__tdIbs").text.split(' - ')
        s = route.find("p", class_="serviceCode__igooW").text
        t = route.find("div", class_="times__xgwdu").find_all("div")
        t = [i.text for i in t]
        route_data.append([c, o, d, s, *t])
    return route_data

In [7]:
# Get route data sample
data1 = get_route_data(soup1)

# Define dict keys / dataframe columns
columns = ["Company", "Origin", "Destination", "Service Code", "Duration", "Frequency"]

# Create dataframe from the data sample
df1 = pd.DataFrame(data1, columns=columns)
df1.head()

Unnamed: 0,Company,Origin,Destination,Service Code,Duration,Frequency
0,MSC,NLRTM,BRRIG,NWC TO SAEC - STRING I,43 days,2 times per week
1,MSC,NLRTM,BRITJ,NWC TO SAEC - STRING I,35 days,1 time per week
2,MSC,NLRTM,BRRIO,NWC TO SAEC - STRING I,30 days,
3,Hapag-Lloyd,NLRTM,BRITJ,ECX-EUROPE EAST COAST EXPRESS,34 days,
4,MSC,NLRTM,BRRIO,NWC TO SAEC - STRING I,31 days,1 time per week


#### Run on all routes

In [8]:
# The data can be loaded from a pickle, so the script doesn't need to re-run every time.
# Set use_pickle1 to False if you want to scrape all the data again.
use_pickle1 = True

if use_pickle1:
    route_df = pd.read_pickle("pickles/routes_between_ports.pickle")

else:
    # Empty list for data and len
    route_data = []
    combinations = len(urls)

    for n, url in enumerate(urls):
        # Scrape the webpage
        soup = get_webpage(url, headless=True)
        # Get all the route data, process them and add
        new_route_data = get_route_data(soup)
        route_data = route_data + new_route_data
        # Nice print message
        print(f"Done with {n}/{combinations} webpages, {len(new_route_data)} new routes found (total {len(route_data)})")

    # Create a dataframe from all the route data
    route_df = pd.DataFrame(route_data, columns=columns)
route_df.head()

Unnamed: 0,Company,Origin,Destination,Service Code,Duration,Frequency
0,MSC,NLRTM,BRRIG,NWC TO SAEC - STRING I,43 days,2 times per week
1,MSC,NLRTM,BRITJ,NWC TO SAEC - STRING I,35 days,1 time per week
2,MSC,NLRTM,BRRIO,NWC TO SAEC - STRING I,30 days,
3,Hapag-Lloyd,NLRTM,BRITJ,ECX-EUROPE EAST COAST EXPRESS,34 days,
4,MSC,NLRTM,BRRIO,NWC TO SAEC - STRING I,31 days,1 time per week


In [9]:
# Save as Pickle and CSV
route_df.to_pickle("pickles/routes_between_ports.pickle")
route_df.to_csv("data/routes_between_ports.csv")