In [2]:
import itertools
import pandas as pd
import requests
import pickle

### Scaper
https://www.msc.com/en/search-a-schedule

In [12]:
# Define origin and destination countries
origin = ["BR", "CO", "VE", "SR", "CW", "GY", "GF", "UY", "AR", "CL", "PE", "EC", "VN", "PY", "GY", "KH"]
destination = ["NL", "BE"]

In [13]:
# Create DataFrames from UN-LOCODE CSVs
country_df = pd.read_csv("../utils/country-codes.csv")
country_df.set_index("CountryCode", inplace=True)
country_dict = country_df.to_dict()["CountryName"]

In [14]:
o_dict = {code: name for code, name in country_dict.items() if code in origin}
d_dict = {code: name for code, name in country_dict.items() if code in destination}

In [15]:
with open('../pickles/msc_country_port_codes.pickle', 'rb') as handle:
    country_port_codes = pickle.load(handle)

In [16]:
o_ports = [port for port in country_port_codes["Ports"] if port["CountryIsoCode"] in origin]
d_ports = [port for port in country_port_codes["Ports"] if port["CountryIsoCode"] in destination]

o_ids = [port['PortId'] for port in o_ports]
d_ids = [port['PortId'] for port in d_ports]

port_codes = {port['PortId']: port['LocationCode'] for port in o_ports + d_ports}

In [17]:
# Make list with all combinations
od_ids = list(itertools.product(o_ids, d_ids))
print(f"{(n_combs := len(od_ids))} combinations of ports ({len(o_ids)} origins * {len(d_ids)} destinations)")

330 combinations of ports (55 origins * 6 destinations)


In [18]:
# Use today's date, by default
from datetime import date
today = date.today()
print(today)

2022-10-31


In [61]:
# Get machine_token for device
def request_machine_token():
    token_url = "https://www.routescanner.com/home-vars"

    token_headers = {
        "Accept": "*/*",
        "Accept-Language": "en-US,en;q=0.9",
        "Connection": "keep-alive",
        "Content-Type": "application/json",
    }
    response = requests.request("GET", token_url, headers=token_headers)

    r = response.json()
    machine_token = r['machineToken']
    return machine_token
machine_token = request_machine_token()

In [64]:
# Scrape RouteScanner
url = "https://api.routescanner.com/voyages/v1"

headers = {
    "Accept": "*/*",
    "Accept-Language": "en-US,en;q=0.9,nl-NL;q=0.8,nl;q=0.7",
    "Connection": "keep-alive",
    "Content-Type": "application/json",
    "Origin": "https://www.routescanner.com",
    "machine_token": machine_token,
}

data = []
for n, (o, d) in enumerate(od_ids[:20]):
        o_code = port_codes[int(o)]
        d_code = port_codes[int(d)]

        sort_by = "transfers"  # transfers, emission_co2, arrival or duration
        modalities = "sea"     # rail, barge, and truck can be added

        querystring = {"offset":"0","limit":"100","origin":o_code,"originType":"locode","destination":d_code,"destinationType":"locode","destinationsNearby":"true","originsNearby":"true","minDeparture":today,"sort":sort_by,"modalities":modalities}

        response = requests.request("GET", url, headers=headers, params=querystring)

        # Check if request was succesfull
        if response.status_code != 200:
            print(f"Warning: Status code != 200 (on {o_code} to {d_code}, #{n}).")
            continue

        # Save data
        rdict = response.json()
        new_data = rdict
        data.append(new_data)

        if n % 5 == 0:
            print(f"Scraped {n}/{n_combs}")

Scraped 0/330


In [54]:
pd.DataFrame(data)

Unnamed: 0,hash,origin,destination,totalResults,voyages,requestId
0,e3ff42847940387a44f8f7622b25488a,"{'type': 'locode', 'name': 'Port of Arica', 'l...","{'type': 'locode', 'name': 'Amsterdam', 'locat...",3,"[{'totalTravelTimeInMinutes': 44912, 'totalDis...",3f80985b23538eaf
1,e3ff42847940387a44f8f7622b25488a,"{'type': 'locode', 'name': 'Port of Arica', 'l...","{'type': 'locode', 'name': 'Port of Antwerp', ...",2,"[{'totalTravelTimeInMinutes': 39330, 'totalDis...",f90db8a3e98a070b
2,e3ff42847940387a44f8f7622b25488a,"{'type': 'locode', 'name': 'Port of Arica', 'l...","{'type': 'locode', 'name': 'Moerdijk', 'locati...",2,"[{'totalTravelTimeInMinutes': 39445, 'totalDis...",34899c9dde38b6a3
3,e3ff42847940387a44f8f7622b25488a,"{'type': 'locode', 'name': 'Port of Arica', 'l...","{'type': 'locode', 'name': 'Port of Rotterdam'...",3,"[{'totalTravelTimeInMinutes': 44745, 'totalDis...",00ae72320fee79cf
4,e3ff42847940387a44f8f7622b25488a,"{'type': 'locode', 'name': 'Port of Arica', 'l...","{'type': 'locode', 'name': 'Port of Vlissingen...",2,"[{'totalTravelTimeInMinutes': 39426, 'totalDis...",ee22e139cec14511
5,e3ff42847940387a44f8f7622b25488a,"{'type': 'locode', 'name': 'Port of Arica', 'l...","{'type': 'locode', 'name': 'Port of Zeebrugge'...",2,"[{'totalTravelTimeInMinutes': 33360, 'totalDis...",0ea8d397831a2a4e
6,e3ff42847940387a44f8f7622b25488a,"{'type': 'locode', 'name': 'Belem', 'location'...","{'type': 'locode', 'name': 'Amsterdam', 'locat...",3,"[{'totalTravelTimeInMinutes': 31193, 'totalDis...",ac32b14d89827c72
7,e3ff42847940387a44f8f7622b25488a,"{'type': 'locode', 'name': 'Belem', 'location'...","{'type': 'locode', 'name': 'Port of Antwerp', ...",2,"[{'totalTravelTimeInMinutes': 33665, 'totalDis...",feac350b30d2fef6
8,e3ff42847940387a44f8f7622b25488a,"{'type': 'locode', 'name': 'Belem', 'location'...","{'type': 'locode', 'name': 'Moerdijk', 'locati...",2,"[{'totalTravelTimeInMinutes': 33780, 'totalDis...",5d8b3c5e540692e4
9,e3ff42847940387a44f8f7622b25488a,"{'type': 'locode', 'name': 'Belem', 'location'...","{'type': 'locode', 'name': 'Port of Rotterdam'...",3,"[{'totalTravelTimeInMinutes': 31026, 'totalDis...",d0df59fe0f873ce0


In [55]:
data

[{'hash': 'e3ff42847940387a44f8f7622b25488a',
  'origin': {'type': 'locode',
   'name': 'Port of Arica',
   'location': {'lat': -18.474563, 'lng': -70.32761},
   'locode': 'CLARI'},
  'destination': {'type': 'locode',
   'name': 'Amsterdam',
   'location': {'lat': 52.37403, 'lng': 4.88969},
   'locode': 'NLAMS'},
  'totalResults': 3,
  'voyages': [{'totalTravelTimeInMinutes': 44912,
    'totalDistanceInMeters': 12894138,
    'totalCo2EmissionsInKg': 1150,
    'voyageStart': '2022-11-03T22:30:00-03:00',
    'voyageEnd': '2022-12-05T07:02:14+01:00',
    'transferCo2EmissionsInKg': [15, 15],
    'legs': [{'origin': {'uuid': 'a9d25a4a-a784-417b-8196-2f785767018a',
       'type': 'terminal',
       'name': 'Terminal Puerto Arica (TPA)',
       'location': {'lat': -18.475591981372105, 'lng': -70.32629734475317},
       'locode': 'CLARI',
       'locodeName': 'Port of Arica'},
      'destination': {'uuid': 'e4a94011-35c3-4614-ba06-a9e1df5fcbf9',
       'type': 'terminal',
       'name': 'APM 

In [59]:
# Save list with dicts as Pickle
with open(f'../pickles/routescanner_daily_v2/connections_{today}.pickle', 'wb') as handle:
    pickle.dump(data, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [56]:
# Flatten route data
route_data = []
for d in data:
    od = {"Origin": d["origin"]["locode"],
          "Destination": d["destination"]["locode"]}
    for route in d["voyages"]:
        route_data.append(od | route)

In [57]:
df = pd.DataFrame(route_data)
print(f"Done. DataFrame has {df.index.size} entries")
df.head(10)

Done. DataFrame has 433 entries


Unnamed: 0,Origin,Destination,totalTravelTimeInMinutes,totalDistanceInMeters,totalCo2EmissionsInKg,voyageStart,voyageEnd,transferCo2EmissionsInKg,legs,carriers
0,CLARI,NLAMS,44912,12894138,1150,2022-11-03T22:30:00-03:00,2022-12-05T07:02:14+01:00,"[15, 15]",[{'origin': {'uuid': 'a9d25a4a-a784-417b-8196-...,
1,CLARI,NLAMS,30676,13132075,1860,2022-11-06T13:32:30-05:00,2022-11-28T02:49:28+01:00,"[15, 15]","[{'origin': {'type': 'locode', 'name': 'Port o...",
2,CLARI,NLAMS,28258,12962474,1885,2022-11-06T13:32:30-05:00,2022-11-26T10:30:48+01:00,"[15, 15]","[{'origin': {'type': 'locode', 'name': 'Port o...",
3,CLARI,BEANR,39330,12629105,1045,2022-11-03T22:30:00-03:00,2022-12-01T10:00:00+01:00,[15],[{'origin': {'uuid': 'a9d25a4a-a784-417b-8196-...,[{'companyId': 'fd43615f-f076-4d7c-8f91-c44c50...
4,CLARI,BEANR,27987,12804509,1750,2022-11-06T13:32:30-05:00,2022-11-26T06:00:00+01:00,[15],"[{'origin': {'type': 'locode', 'name': 'Port o...",
5,CLARI,NLMOE,39445,12696287,1110,2022-11-03T22:30:00-03:00,2022-12-01T11:55:10+01:00,"[15, 15]",[{'origin': {'uuid': 'a9d25a4a-a784-417b-8196-...,
6,CLARI,NLMOE,28090,12864649,1810,2022-11-06T13:32:30-05:00,2022-11-26T07:43:06+01:00,"[15, 15]","[{'origin': {'type': 'locode', 'name': 'Port o...",
7,CLARI,NLRTM,44745,12796586,1060,2022-11-03T22:30:00-03:00,2022-12-05T04:15:00+01:00,[15],[{'origin': {'uuid': 'a9d25a4a-a784-417b-8196-...,[{'companyId': 'fd43615f-f076-4d7c-8f91-c44c50...
8,CLARI,NLRTM,30508,13033807,1770,2022-11-06T13:32:30-05:00,2022-11-28T00:01:00+01:00,[15],"[{'origin': {'type': 'locode', 'name': 'Port o...",
9,CLARI,NLRTM,28145,12896634,1835,2022-11-06T13:32:30-05:00,2022-11-26T08:37:56+01:00,"[15, 15]","[{'origin': {'type': 'locode', 'name': 'Port o...",


In [58]:
# Save as CSV
df.to_csv(f"../data/routescanner_daily_v2/connections_{today}.csv")