In [75]:
# use pathlib to scan for files
import pathlib

# use polars to read csv files faster
import polars as pl 
# progress bar
import tqdm.auto as tqdm
# geopandas for coordinates
import geopandas as gpd
import pandas as pd
# dask for reading multiple files in parallel
import dask.dataframe as dd
import dask.distributed 
import dask_geopandas
import shapely

In [19]:
# Start a fully-featured local Dask cluster and connect a client to it.
cluster = dask.distributed.LocalCluster()
client = cluster.get_client()
# Leave the client as the last expression so the notebook shows its summary.
client

Perhaps you already have a cluster running?
Hosting the HTTP server on port 49633 instead


0,1
Connection method: Cluster object,Cluster type: distributed.LocalCluster
Dashboard: http://127.0.0.1:49633/status,

0,1
Dashboard: http://127.0.0.1:49633/status,Workers: 5
Total threads: 10,Total memory: 64.00 GiB
Status: running,Using processes: True

0,1
Comm: tcp://127.0.0.1:49634,Workers: 5
Dashboard: http://127.0.0.1:49633/status,Total threads: 10
Started: Just now,Total memory: 64.00 GiB

0,1
Comm: tcp://127.0.0.1:49650,Total threads: 2
Dashboard: http://127.0.0.1:49653/status,Memory: 12.80 GiB
Nanny: tcp://127.0.0.1:49637,
Local directory: /var/folders/fh/tcbrjg6n28b0lzjzh07b5t6m0000gn/T/dask-scratch-space/worker-ayl5onmd,Local directory: /var/folders/fh/tcbrjg6n28b0lzjzh07b5t6m0000gn/T/dask-scratch-space/worker-ayl5onmd

0,1
Comm: tcp://127.0.0.1:49649,Total threads: 2
Dashboard: http://127.0.0.1:49654/status,Memory: 12.80 GiB
Nanny: tcp://127.0.0.1:49639,
Local directory: /var/folders/fh/tcbrjg6n28b0lzjzh07b5t6m0000gn/T/dask-scratch-space/worker-r9mf0tyo,Local directory: /var/folders/fh/tcbrjg6n28b0lzjzh07b5t6m0000gn/T/dask-scratch-space/worker-r9mf0tyo

0,1
Comm: tcp://127.0.0.1:49648,Total threads: 2
Dashboard: http://127.0.0.1:49652/status,Memory: 12.80 GiB
Nanny: tcp://127.0.0.1:49641,
Local directory: /var/folders/fh/tcbrjg6n28b0lzjzh07b5t6m0000gn/T/dask-scratch-space/worker-v1adz8lp,Local directory: /var/folders/fh/tcbrjg6n28b0lzjzh07b5t6m0000gn/T/dask-scratch-space/worker-v1adz8lp

0,1
Comm: tcp://127.0.0.1:49647,Total threads: 2
Dashboard: http://127.0.0.1:49656/status,Memory: 12.80 GiB
Nanny: tcp://127.0.0.1:49643,
Local directory: /var/folders/fh/tcbrjg6n28b0lzjzh07b5t6m0000gn/T/dask-scratch-space/worker-biz20o6r,Local directory: /var/folders/fh/tcbrjg6n28b0lzjzh07b5t6m0000gn/T/dask-scratch-space/worker-biz20o6r

0,1
Comm: tcp://127.0.0.1:49651,Total threads: 2
Dashboard: http://127.0.0.1:49655/status,Memory: 12.80 GiB
Nanny: tcp://127.0.0.1:49645,
Local directory: /var/folders/fh/tcbrjg6n28b0lzjzh07b5t6m0000gn/T/dask-scratch-space/worker-dns6sgn3,Local directory: /var/folders/fh/tcbrjg6n28b0lzjzh07b5t6m0000gn/T/dask-scratch-space/worker-dns6sgn3


# Combine all IVS data files into one dataset
This notebook reads the IVS data from the Rijkswaterstaat [goederenvervoer](https://downloads.rijkswaterstaatdata.nl/scheepvaart/goederenvervoer/archief/) archive and stores it as a single file for easier processing.

In [16]:
# Download / update the files with the following command
# wget -c -nd -r -np -l 1 -A zip 'https://downloads.rijkswaterstaatdata.nl/scheepvaart/goederenvervoer/archief/'

In [17]:
# GeoPackage with the merged UN/LOCODE locations, obtained from Fedor. The merged
# list is sent to the UN to give them the opportunity to update their codes.
unlo_path = pathlib.Path('~/data/unlo/arcgis-results-c0067387-935e-4cec-84e1-c32f8d503a67-merged-unlo.gpkg').expanduser()
# Directory that holds the IVS csv files; the parquet and gpkg outputs are written here too.
ivs_path = pathlib.Path('~/data/ivs').expanduser()

In [18]:
# Column names and polars dtypes used for every IVS csv file, in file order.
schema = {
    'Jaarmaand': pl.Int64,
    'Jaar': pl.Int64,
    'Maand': pl.Int64,
    'Weeknr': pl.Int64,
    'v05_06_begindt_evenement_iso': pl.String,
    'v05_06_Begindt_evenement': pl.String,
    'UNLO_herkomst': pl.String,
    'UNLO_bestemming': pl.String,
    'v15_1_Scheepstype_RWS': pl.String,
    'SK_CODE': pl.String,
    'v18_Laadvermogen': pl.Float64,
    'v28_Beladingscode': pl.Int64,
    # TODO: check the unit of the transported weight (kg?)
    'v38_Vervoerd_gewicht': pl.Int64,
    'v30_4_Containers_TEU_S': pl.Int64,
    # NSTR categories: see
    # https://www.cbs.nl/en-gb/our-services/methods/definitions/commodity-nomenclature-nstr
    'nstr_nw': pl.String,
    'nst2007_nw': pl.String,
}


In [8]:
# Read one file strictly (ignore_errors=False), so a value that does not fit the
# schema raises instead of being skipped — presumably a check of the schema; TODO confirm.
df = pl.read_csv(ivs_path / 'IVS_weekmonitor_01APR2021.csv', separator=';', quote_char='"', schema=schema, ignore_errors=False)

In [9]:
# All csv files directly inside the IVS data directory (glob is not recursive here).
paths = list(ivs_path.glob('*.csv'))

In [10]:
# Convert every csv file to a parquet file with the same name, stored next to it.
# NOTE(review): this uses ignore_errors=True, whereas the single-file read uses
# ignore_errors=False — values that fail to parse are presumably dropped silently
# here; confirm this is intended.
for path in tqdm.tqdm(paths):
    df = pl.read_csv(path, separator=';', quote_char='"', schema=schema, ignore_errors=True)
    df.write_parquet(path.with_suffix('.parquet'))

  0%|          | 0/1079 [00:00<?, ?it/s]

Now we can re-read all the individual files. They should now all have the same, fixed column types. We'll combine them into one file using dask, which can read multiple files and treat them as a single dataframe.

In [53]:
# Treat all per-file parquet outputs as one dask dataframe, drop records that
# occur more than once, and persist the result so later cells reuse it.
ddf = (
    dd.read_parquet(list(ivs_path.glob('IVS*.parquet')))
    .drop_duplicates()
    .persist()
)



In [24]:
# Write the dask dataframe to 'ivs-2024.parquet', replacing any existing output.
# Duplicate records were already dropped when `ddf` was built.
# NOTE(review): this path is relative to the working directory, unlike the other
# outputs, which are written under ivs_path — confirm this is intended.

ddf.to_parquet('ivs-2024.parquet', overwrite=True)



In [25]:
# Convert to one file (to read into memory): compute() collects the whole dask
# dataframe in this process before it is written as a single parquet file.
ddf.compute().to_parquet(ivs_path / 'ivs-2024-one-file.parquet')




In [67]:
# Read the single-file dataset back into pandas; this also serves as a test of
# the read performance.
ivs_df = pd.read_parquet(ivs_path / 'ivs-2024-one-file.parquet')

In [93]:
# Load the merged UN/LOCODE locations from the GeoPackage at unlo_path.
unlo_gdf = gpd.read_file(unlo_path)
def unlo_code(row):
    """Return the full location code of a record.

    The code is the value under ``'country_code'`` followed directly by the
    value under ``'location_code'``.
    """
    country = row['country_code']
    location = row['location_code']
    return country + location
# Build the full code (country code + location code) for every location.
unlo_gdf['unlo_code'] = unlo_gdf.apply(unlo_code, axis=1)
# Keep only the geometry, indexed by code, so it can be used as a lookup table.
# Note: this rebinds unlo_gdf without the 'country_code' / 'location_code'
# columns, so the line above cannot be re-run without reloading the file first.
unlo_gdf = unlo_gdf[['unlo_code', 'geometry']].set_index('unlo_code')
# Add the final missing record in Veghel.
# NOTE(review): the coordinates are given as (5.509574, 51.619686), presumably
# longitude/latitude — confirm this matches the CRS of the loaded layer.
unlo_gdf.loc['NLVEG'] = shapely.Point(5.509574, 51.619686)

In [97]:
# Attach a point to both ends of every record by looking up the origin
# ('UNLO_herkomst') and destination ('UNLO_bestemming') codes in the index of
# unlo_gdf. Left joins keep records whose code has no match.
# The second merge meets an existing 'geometry' column, so the default merge
# suffixes rename the pair to 'geometry_x' (origin) and 'geometry_y' (destination).
ivs_gdf = (
    ivs_df
        .merge(unlo_gdf, left_on='UNLO_herkomst', right_index=True, how='left')
        .merge(unlo_gdf, left_on='UNLO_bestemming', right_index=True, how='left')
)

In [102]:
def points2line(row):
    """Build the straight origin -> destination line for one merged record.

    Parameters
    ----------
    row : mapping
        Record with the origin point under ``'geometry_x'`` and the
        destination point under ``'geometry_y'``.

    Returns
    -------
    shapely.LineString or None
        Line from the origin point to the destination point, or ``None`` when
        either end point is missing (``None``, ``pd.NA`` or NaN) or falsy.
    """
    origin = row['geometry_x']
    destination = row['geometry_y']
    for point in (origin, destination):
        # A missing end point can be None or NaN. NaN is truthy, so a plain
        # truth test would pass it on to LineString, which would then raise.
        if point is None or point is pd.NA:
            return None
        if isinstance(point, float) and pd.isna(point):
            return None
        # Keep the original behaviour for falsy geometries.
        if not point:
            return None
    return shapely.LineString([origin, destination])
    
# Build one origin -> destination line per record; points2line is called row by row.
ivs_gdf['geometry'] = ivs_gdf.apply(points2line, axis=1)

In [110]:
# Drop the two helper point columns and wrap the result as a GeoDataFrame whose
# active geometry is the line column.
# NOTE(review): no `crs` is passed here, so the result presumably carries no
# coordinate reference system — confirm, and set it to the CRS of the location layer.
ivs_gdf = gpd.GeoDataFrame(
    ivs_gdf.drop(columns=['geometry_x', 'geometry_y']),
    geometry='geometry',
)

In [111]:
# Write the geocoded records to a GeoPackage in the IVS data directory.
ivs_gdf.to_file(ivs_path / 'ivs-2024-geocoded.gpkg')

In [112]:
!open ivs_path

The file /Users/baart_f/src/digitaltwin-waterway/notebooks/ivs/ivs_path does not exist.


In [113]:
import numpy as np

In [116]:
# Two generators created with the same seed give the same first draw, as the
# displayed pair shows.
rng = np.random.default_rng(0)
x = rng.random()
# Create a fresh generator with the same seed.
rng = np.random.default_rng(0)
y = rng.random()

# Show both draws side by side.
x, y

(0.6369616873214543, 0.6369616873214543)

[0;31mDocstring:[0m
random(size=None, dtype=np.float64, out=None)

Return random floats in the half-open interval [0.0, 1.0).

Results are from the "continuous uniform" distribution over the
stated interval.  To sample :math:`Unif[a, b), b > a` use `uniform`
or multiply the output of `random` by ``(b - a)`` and add ``a``::

    (b - a) * random() + a

Parameters
----------
size : int or tuple of ints, optional
    Output shape.  If the given shape is, e.g., ``(m, n, k)``, then
    ``m * n * k`` samples are drawn.  Default is None, in which case a
    single value is returned.
dtype : dtype, optional
    Desired dtype of the result, only `float64` and `float32` are supported.
    Byteorder must be native. The default value is np.float64.
out : ndarray, optional
    Alternative output array in which to place the result. If size is not None,
    it must have the same shape as the provided size and must match the type of
    the output values.

Returns
-------
out : float or ndarray of f