In [1]:
import itertools
import time
from datetime import date
from pathlib import Path

import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager

In [2]:
# Some definitions used in the explanation of the code:
# Connection: All possibilities from the origin port to the destination port
# Route: A specific possibility from origin to destination on a specific departure date
# Transfer: Within a route, the container can switch from one vessel to another and continue its journey

In [3]:
# Sets up the options of the chromedriver and starts the browser session
opts = Options()
opts.add_argument("window-size=1280,720") #locks the window size
opts.add_argument("user-agent=Chrome/106.0.5249.119") #Prevents sites from blocking traffic
headless = False

if headless: #if True, open chrome on the background without window
    # The Options.headless setter is deprecated in Selenium 4.8 and removed in
    # later 4.x releases; passing the Chrome flag directly works on all versions.
    opts.add_argument("--headless")

# webdriver_manager downloads a chromedriver binary and returns its path
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=opts)

In [4]:
#!!! Instructions on port selection start
# The same ports and methods as for scraping_routescanner_v2 were used.
#           origin = ["BR", "CO", "VE", "SR", "CW", "GY", "GF", "UY", "AR", "CL", "PE", "EC", "VN", "PY", "GY", "KH"]
#           destination = ["NL", "BE"]
# The UN-LOCODES were picked from the following CSV (contains country codes):
#           country_df = pd.read_csv("../utils/country-codes.csv")
# Furthermore the ports in South-America, Vietnam and Benelux were selected
#        with open('../pickles/msc_country_port_codes.pickle', 'rb') as handle:
#                country_port_codes = pickle.load(handle)

# However unlike scraping_routescanner_v2, the site of Maersk doesn't accept port-codes such as NLRTM
# Therefore, the correct port names that work in the site of Maersk were selected by hand
# Using both the port name according to the list above and the latitude and longitude
#!!! Instructions on port selection end

# Origin ports, spelled exactly as the Maersk search field expects them
o_names = [
    "Arica, Chile",
    "Belem (Para), Brazil",
    "Buenaventura (Valle Del Cauca), Colombia",
    "Buenos Aires (Buenos Aires), Argentina",
    "Asuncion, Paraguay",
    "Pilar, Paraguay",
    "Callao, Peru",
    "Campana (Buenos Aires), Argentina",
    "Cartagena (Bolivar), Colombia",
    "Coronel, Chile",
    "Da Nang (Da Nang), Vietnam",
    "Encarnacion, Paraguay",
    "Georgetown, Guyana",
    "Guayaquil, Ecuador",
    "Haiphong (Hai Phong), Vietnam",
    "Ho Chi Minh city - ICD Phuoc Long (Ho Chi Minh), Vietnam",
    "Iquique, Chile",
    "Itajai (Santa Catarina), Brazil",
    "Itapoa (Santa Catarina), Brazil",
    "Sihanoukville, Cambodia",
    "La Guaira, Venezuela",
    "Lirquen, Chile",
    "Manaus (Amazonas), Brazil",
    "Montevideo, Uruguay",
    "Navegantes (Santa Catarina), Brazil",
    "Nueva Palmira, Uruguay",
    "Paita, Peru",
    "Paramaribo, Suriname",
    "Paranagua (Parana), Brazil",
    "Pecem (Ceara), Brazil",
    "Phnom Penh, Cambodia",
    "Porto Velho (Rondonia), Brazil",
    "Posorja - Guayas, Ecuador",
    "Puerto Angamos, Chile",
    "Puerto Bolivar - El Oro, Ecuador",
    "Puerto Cabello, Venezuela",
    "Villeta, Paraguay",
    "Qui Nhon (Binh Dinh), Vietnam",
    "Rio de Janeiro (Rio de Janeiro), Brazil",
    "Rio Grande (Rio Grande do Sul), Brazil",
    "Rosario (Santa Fe), Argentina",
    "Salvador (Bahia), Brazil",
    "San Antonio, Chile",
    "San Vicente, Chile",
    "Santarem (Para), Brazil",
    "Santos (Sao Paulo), Brazil",
    "Suape (Pernambuco), Brazil",
    "Terport, Paraguay",
    "Turbo (Antioquia), Colombia",
    "Ushuaia (Tierra del Fuego), Argentina",
    "Valparaiso, Chile",
    "Vila do Conde (Para), Brazil",
    "Vitoria (Espirito Santo), Brazil",
    "Vung Tau (Ba Ria - Vung Tau), Vietnam",
    "ZARATE (Buenos Aires), Argentina",
]

# "Puerto Seguro Fluvial" is entered as Villeta: judging by latitude and longitude this was the most logical match.
# "Terport Villeta Paraguay" had no latitude or longitude to check against,
# but Maersk lists only one port called Terport.

# Destination ports, spelled exactly as the Maersk search field expects them
d_names = [
    "Amsterdam (Noord-Holland), Netherlands",
    "Antwerp (Antwerp), Belgium",
    "Moerdijk (Noord-Brabant), Netherlands",
    "Rotterdam (Zuid-Holland), Netherlands",
    "Vlissingen (Zeeland), Netherlands",
    "Zeebrugge (West Flanders), Belgium",
]

# Every (origin, destination) pair, origins varying slowest
od_names = [(origin_name, destination_name) for origin_name in o_names for destination_name in d_names]
print(f"{(n_combs := len(od_names))} combinations of ports ({len(o_names)} origins * {len(d_names)} destinations)")

330 combinations of ports (55 origins * 6 destinations)


In [5]:
### This part fills in all the origin destination locations and saves the soup which will be processed later on
soups = []

def _select_port(field_id, port_name, settle_seconds):
    """Type a port name into a search field and click the dropdown suggestion below it."""
    field = driver.find_element(By.ID, field_id)
    field.send_keys(port_name)
    # The site only accepts a location after a dropdown entry is clicked;
    # wait so that the suggestion list is actually clickable
    time.sleep(settle_seconds)
    # Click at a 50 px vertical offset from the field, where the suggestion is shown
    ActionChains(driver).move_to_element_with_offset(field, 0, 50).click().perform()


def get_webpages(od_names):
    """Search the Maersk point-to-point page for every origin/destination pair.

    For each pair the search form is filled in and submitted. When at least one
    'show route details' button is present, up to five result rows are expanded
    and the parsed page is appended to the module-level ``soups`` list; otherwise
    a "No route found" line is printed.

    Parameters
    ----------
    od_names : iterable of (origin, destination) name pairs, spelled as the
        Maersk search fields expect them.

    Side effects
    ------------
    Uses the module-level ``driver`` and always quits it when done, also when
    an exception interrupts the scrape. Pages collected up to that point stay
    available in ``soups``.
    """
    search_url = "https://www.maersk.com/schedules/pointToPoint"
    # The 'show route details' buttons of result rows 1..5 live in div[3]..div[7]
    details_xpath = "//*[@id='app']/div[2]/div[1]/div[{row}]/div/div[4]/button/span"

    try:
        #Open Maersk point to point site
        driver.get(search_url)
        time.sleep(3)
        #Click to allow cookies (the banner may be absent, e.g. when already accepted)
        cookie_buttons = driver.find_elements(By.XPATH,"//*[@id='coiPage-1']/div[2]/button[3]")
        if cookie_buttons:
            cookie_buttons[0].click()

        for od_pair in od_names:
            #Open the site again to start from an empty form
            driver.get(search_url)
            time.sleep(3)

            #Fill in and confirm the origin, then the destination
            _select_port('originLocation', od_pair[0], 4)
            _select_port('destinationLocation', od_pair[1], 3)

            #Click the search button
            driver.find_element(By.XPATH,'//*[@id="app"]/div[2]/span/form/div[6]/button').click()

            # There are 2 known possibilities that result in not finding routes:
            # 1: There is no route
            # 2: Sometimes Maersk site gives an error for either origin or destination
            #    even when the names are correctly filled in. Error seems to appear randomly
            # Checking for the 'show route details' button before clicking makes sure
            # the code doesn't fail when a route is not found
            time.sleep(5)
            opened_rows = 0
            for row in range(3, 8):
                detail_buttons = driver.find_elements(By.XPATH, details_xpath.format(row=row))
                if not detail_buttons:
                    # Rows are consecutive: the first missing row ends the result list
                    break
                detail_buttons[0].click()
                opened_rows += 1

            if opened_rows:
                #Copies the page to use in Beautifulsoup; the parser is named explicitly
                #so the result does not depend on which optional parsers are installed
                soups.append(BeautifulSoup(driver.page_source, "html.parser"))
            else:
                print("No route found for:",od_pair)
    finally:
        #Always close the webdriver, also when the scrape is interrupted by an error
        driver.stop_client()
        driver.quit()

get_webpages(od_names)

No route found for: ('Arica, Chile', 'Zeebrugge (West Flanders), Belgium')
No route found for: ('Belem (Para), Brazil', 'Amsterdam (Noord-Holland), Netherlands')
No route found for: ('Belem (Para), Brazil', 'Antwerp (Antwerp), Belgium')
No route found for: ('Belem (Para), Brazil', 'Moerdijk (Noord-Brabant), Netherlands')
No route found for: ('Belem (Para), Brazil', 'Rotterdam (Zuid-Holland), Netherlands')
No route found for: ('Belem (Para), Brazil', 'Vlissingen (Zeeland), Netherlands')


NoSuchWindowException: Message: no such window: target window already closed
from unknown error: web view not found
  (Session info: chrome=107.0.5304.107)
Stacktrace:
Backtrace:
	Ordinal0 [0x00D8ACD3+2075859]
	Ordinal0 [0x00D1EE61+1633889]
	Ordinal0 [0x00C1B7BD+571325]
	Ordinal0 [0x00C02E1A+470554]
	Ordinal0 [0x00C6AA0B+895499]
	Ordinal0 [0x00C7AC96+961686]
	Ordinal0 [0x00C67136+880950]
	Ordinal0 [0x00C3FEFD+720637]
	Ordinal0 [0x00C40F3F+724799]
	GetHandleVerifier [0x0103EED2+2769538]
	GetHandleVerifier [0x01030D95+2711877]
	GetHandleVerifier [0x00E1A03A+521194]
	GetHandleVerifier [0x00E18DA0+516432]
	Ordinal0 [0x00D2682C+1665068]
	Ordinal0 [0x00D2B128+1683752]
	Ordinal0 [0x00D2B215+1683989]
	Ordinal0 [0x00D36484+1729668]
	BaseThreadInitThunk [0x757AFEF9+25]
	RtlGetAppContainerNamedObjectPath [0x77507BBE+286]
	RtlGetAppContainerNamedObjectPath [0x77507B8E+238]


In [6]:
# (CSS class, text prefix to strip) for every vessel detail, in output order
_ROUTE_VESSEL_FIELDS = (
    ("imo", "IMO Number"),
    ("service", "Service"),
    ("flag", "Flag"),
    ("callsign", "Call Sign"),
    ("built", "Built"),
)


def _route_vessel_name(label_text):
    """Reduce a label such as 'Departing on CAP SAN LORENZO / 249S' to 'CAP SAN LORENZO'.

    Labels lacking the expected words (for example the placeholder '-') are
    returned with whatever parts are present removed, instead of raising.
    """
    tokens = label_text.split()
    for filler in ("Departing", "on"):
        if filler in tokens:
            tokens.remove(filler)
    if "/" in tokens:
        # The token after the slash is the voyage number, which is not part of the name
        tokens.remove("/")
        if tokens:
            tokens.pop(-1)
    return " ".join(tokens)


def process_data_route(route,list_ports,route_data):
    """Extract one route from a parsed result block and append it to ``route_data``.

    Parameters
    ----------
    route : parsed element of one route; must support ``find``/``find_all`` by CSS class.
    list_ports : list of port names on the route; first is the origin, last the destination.
    route_data : list that receives the new row (mutated in place).

    Returns
    -------
    list
        ``route_data``. For a direct route the appended row is
        ``[origin, destination, departure, arrival, ports, vessels, dates]``.
        With more than two ports the first four fields are appended here and
        ``process_data_transfer`` adds the last three.
    """
    #The origin port is the first port in the list, destination the last
    origin = list_ports[0]
    destination = list_ports[-1]

    # The arrival date is the second small-font entry in the label of the final box
    final_item = route.find(class_="ptp-results__transport-plan--item-final")
    arrival_label = final_item.find(class_="transport-label font--small")
    arrival_date = arrival_label.find_all(class_="font--small")[1].text

    # The first transport label of the route holds the departure date and first vessel
    departure_label = route.find(class_="transport-label font--small")
    departure_date = departure_label.find(class_="font--small").text
    vessel_name = _route_vessel_name(departure_label.find(class_="rich-text").text)

    # Details of the first vessel, each with its caption prefix stripped
    vessel_info = route.find(class_="vessel")
    vessel_details = [
        vessel_info.find(class_=css_class).text.removeprefix(caption)
        for css_class, caption in _ROUTE_VESSEL_FIELDS
    ]

    # One list per used vessel; vessels of later legs are added by process_data_transfer
    vessels = [[vessel_name, *vessel_details]]

    if len(list_ports)>2: # If there is a transfer, store data and also run process_data_transfer
        route_data.append([origin,destination,departure_date,arrival_date])
        process_data_transfer(route,list_ports,route_data,vessels)
    else:
        # Just store the route_data
        route_data.append([origin,destination,departure_date,arrival_date,[origin,destination],vessels,[departure_date,arrival_date]])
    # Return the same object on both branches so callers can rely on the result
    return route_data

In [7]:
# (CSS class, text prefix to strip) for every vessel detail, in output order
_TRANSFER_VESSEL_FIELDS = (
    ("imo", "IMO Number"),
    ("service", "Service"),
    ("flag", "Flag"),
    ("callsign", "Call Sign"),
    ("built", "Built"),
)


def _transfer_vessel_name(label_text):
    """Reduce a label such as 'Departing on CAP SAN LORENZO / 249S' to 'CAP SAN LORENZO'.

    Labels lacking the expected words (for example the placeholder '-') are
    returned with whatever parts are present removed, instead of raising.
    """
    tokens = label_text.split()
    for filler in ("Departing", "on"):
        if filler in tokens:
            tokens.remove(filler)
    if "/" in tokens:
        # The token after the slash is the voyage number, which is not part of the name
        tokens.remove("/")
        if tokens:
            tokens.pop(-1)
    return " ".join(tokens)


def process_data_transfer(route,list_ports,route_data,vessels):
    """Complete the last row of ``route_data`` with the data of all transfers.

    Parameters
    ----------
    route : parsed element of one route; must support ``find_all`` by CSS class.
    list_ports : all ports of the route (origin, transfer ports, destination).
    route_data : list whose last row holds ``[origin, destination, departure, arrival]``
        as stored by process_data_route; that row is extended in place.
    vessels : list already holding the first vessel; one entry per transfer is appended.

    Returns
    -------
    list
        ``route_data``, whose last row now also holds ``list_ports``, ``vessels`` and
        the list ``[departure, arrival_1, departure_1, ..., arrival]``.
    """
    # Looked up once: the list does not change while iterating
    plan_items = route.find_all(class_="ptp-results__transport-plan--item")
    transfer_arrival_departure = []

    # item 1 is a port, 2 a ship, 3 a port and so on, so transfer number k sits at
    # items 2k-1 (port) and 2k (ship). Deriving the index from the transfer number
    # reads every transfer; looping the item index up to the number of ports
    # would stop after the first half of them.
    # NOTE(review): relies on the item layout described above - confirm against the live page.
    transfer_count = len(list_ports) - 2
    for transfer_no in range(1, transfer_count + 1):
        port_index = 2 * transfer_no - 1
        ship_index = port_index + 1
        if ship_index >= len(plan_items):
            # Fewer items than expected: keep what was read instead of failing
            break

        transfer_port = plan_items[port_index]
        info_arrival = transfer_port.find(class_="transport-label font--small")
        transfer_arrival_departure.append(info_arrival.find_all(class_="font--small")[1].text)

        transfer_ship = plan_items[ship_index]
        info_departure = transfer_ship.find(class_="transport-label font--small")
        transfer_arrival_departure.append(info_departure.find(class_="font--small").text)

        vessel_name = _transfer_vessel_name(info_departure.find(class_="rich-text").text)
        vessel_info = transfer_ship.find(class_="vessel")
        vessel_details = [
            vessel_info.find(class_=css_class).text.removeprefix(caption)
            for css_class, caption in _TRANSFER_VESSEL_FIELDS
        ]
        vessels.append([vessel_name, *vessel_details])

    # The origin, destination, departure and arrival were already stored in the
    # last row of route_data by process_data_route; that row is extended here.
    current_row = route_data[-1]

    # Overall departure, then every transfer arrival/departure, then overall arrival
    arrival_departure = [current_row[2], *transfer_arrival_departure, current_row[3]]

    # Store the other transfer data in route_data
    current_row.append(list_ports)
    current_row.append(vessels)
    # Store the data on all departure and arrival dates (including transfer) in the route_data
    current_row.append(arrival_departure)
    return route_data

In [8]:
### process_data_route, process_data_transfer and initialize_processing
### together process the soups into usable data.
### First, initialize_processing selects a soup and prepares it for processing.
### Then process_data_route processes the information about the origin, destination, arrival date, departure date and the first vessel.
### Finally, process_data_transfer is used if a transfer takes place.
### A transfer means that the container is moved from one vessel to another and continues the journey.

# Collects one row per route, filled by process_data_route
route_data = []

def initialize_processing(soups):
    """Walk every stored result page and hand each route to process_data_route.

    The Maersk site does not show all ports on a route: only the origin, the
    destination and the transfer ports (where the container moves to a
    different ship). For each route those ports are collected as
    '<city> <terminal>' strings and passed on together with the route element;
    the extracted rows end up in the module-level ``route_data`` list.
    """
    for soup in soups:
        # The data on routes is already grouped: one element per route
        for route in soup.find_all("div", class_="ptp-results__transport-plan"):
            # Every 'location' element names one port as city plus terminal
            list_ports = [
                location.find("div", class_="font--default--bold").text
                + ' '
                + location.find("div", class_="font--small").text
                for location in route.find_all("div", class_="location")
            ]
            process_data_route(route, list_ports, route_data)

In [9]:
# Parse every stored result page; the extracted rows accumulate in route_data
initialize_processing(soups)

In [10]:
# Build a pandas dataframe from the processed rows: one row per route,
# one column per field stored by process_data_route / process_data_transfer
columns = [
    "Origin",
    "Destination",
    "Departure time",
    "Arrival time",
    "Ports",
    "Vessels",
    "Dates",
]

connection_df = pd.DataFrame(data=route_data, columns=columns)
connection_df

Unnamed: 0,Origin,Destination,Departure time,Arrival time,Ports,Vessels,Dates
0,Arica Terminal Puerto Arica (TPA),Amsterdam Container Terminal Vrede Amsterdam,23 Nov 2022 21:30,28 Dec 2022 17:00,"[Arica Terminal Puerto Arica (TPA), Balboa Bal...","[[JENS MAERSK, -, -, -, -, -], [MAERSK BULAN, ...","[23 Nov 2022 21:30, 06 Dec 2022 12:30, 09 Dec ..."
1,Arica Terminal Puerto Arica (TPA),Amsterdam Container Terminal Vrede Amsterdam,30 Nov 2022 15:30,04 Jan 2023 10:00,"[Arica Terminal Puerto Arica (TPA), Balboa Bal...","[[FORT DESAIX, -, -, -, -, -], [MAERSK BRANI, ...","[30 Nov 2022 15:30, 13 Dec 2022 12:30, 16 Dec ..."
2,Arica Terminal Puerto Arica (TPA),Amsterdam Container Terminal Vrede Amsterdam,07 Dec 2022 15:30,11 Jan 2023 10:00,"[Arica Terminal Puerto Arica (TPA), Balboa Bal...","[[JEPPESEN MAERSK, -, -, -, -, -], [MAERSK BAT...","[07 Dec 2022 15:30, 20 Dec 2022 12:30, 23 Dec ..."
3,Arica Terminal Puerto Arica (TPA),Amsterdam Container Terminal Vrede Amsterdam,14 Dec 2022 15:30,18 Jan 2023 10:00,"[Arica Terminal Puerto Arica (TPA), Balboa Bal...","[[JENS MAERSK, -, -, -, -, -], [SAFMARINE BENG...","[14 Dec 2022 15:30, 27 Dec 2022 12:30, 30 Dec ..."
4,Arica Terminal Puerto Arica (TPA),Amsterdam Container Terminal Vrede Amsterdam,21 Dec 2022 15:30,27 Jan 2023 10:00,"[Arica Terminal Puerto Arica (TPA), Balboa Bal...","[[FORT DESAIX, -, -, -, -, -], [MAERSK BUTON, ...","[21 Dec 2022 15:30, 03 Jan 2023 12:30, 08 Jan ..."
5,Arica Terminal Puerto Arica (TPA),Antwerp PSA Antwerp K913 Noordzee,23 Nov 2022 21:30,31 Dec 2022 06:00,"[Arica Terminal Puerto Arica (TPA), Callao APM...","[[JENS MAERSK, -, -, -, -, -], [MAERSK BRANI, ...","[23 Nov 2022 21:30, 29 Nov 2022 07:00, 03 Dec ..."
6,Arica Terminal Puerto Arica (TPA),Antwerp PSA Antwerp K913 Noordzee,30 Nov 2022 15:30,07 Jan 2023 06:00,"[Arica Terminal Puerto Arica (TPA), Callao APM...","[[FORT DESAIX, -, -, -, -, -], [MAERSK BATAM, ...","[30 Nov 2022 15:30, 06 Dec 2022 07:00, 10 Dec ..."
7,Arica Terminal Puerto Arica (TPA),Antwerp PSA Antwerp K913 Noordzee,07 Dec 2022 15:30,14 Jan 2023 06:00,"[Arica Terminal Puerto Arica (TPA), Callao APM...","[[JEPPESEN MAERSK, -, -, -, -, -], [SAFMARINE ...","[07 Dec 2022 15:30, 13 Dec 2022 07:00, 17 Dec ..."
8,Arica Terminal Puerto Arica (TPA),Antwerp PSA Antwerp K913 Noordzee,14 Dec 2022 15:30,23 Jan 2023 06:00,"[Arica Terminal Puerto Arica (TPA), Callao APM...","[[JENS MAERSK, -, -, -, -, -], [MAERSK BUTON, ...","[14 Dec 2022 15:30, 20 Dec 2022 07:00, 24 Dec ..."
9,Arica Terminal Puerto Arica (TPA),Antwerp PSA Antwerp K913 Noordzee,21 Dec 2022 15:30,28 Jan 2023 06:00,"[Arica Terminal Puerto Arica (TPA), Callao APM...","[[FORT DESAIX, -, -, -, -, -], [MAERSK BRATAN,...","[21 Dec 2022 15:30, 27 Dec 2022 07:00, 31 Dec ..."


In [11]:
# Store as both pickle and CSV, one file per scrape date
today = date.today()
pickle_path = Path(f"../pickles/maersk_daily/connections_{today}.pickle")
csv_path = Path(f"../data/maersk_daily/connections_{today}.csv")

# pandas does not create missing folders; make sure they exist so that a
# finished scrape is not lost on a fresh checkout
for output_path in (pickle_path, csv_path):
    output_path.parent.mkdir(parents=True, exist_ok=True)

connection_df.to_pickle(pickle_path)
connection_df.to_csv(csv_path)