In [9]:
# use pathlib to scan for files
import pathlib

# use polars to read csv files faster
import polars as pl 
# progress bar
import tqdm.auto as tqdm
# geopandas for coordinates
import geopandas as gpd
# dask for reading multiple files in parallel
import dask.dataframe as dd

# Combine all IVS data files into one dataset
This notebook reads the IVS data from the Rijkswaterstaat [goederenvervoer](https://downloads.rijkswaterstaatdata.nl/scheepvaart/goederenvervoer/archief/) archive and stores it as one file for easier processing.

In [19]:
# Download / update the files with the following command
# wget -c -nd -r -np -l 1 -A zip 'https://downloads.rijkswaterstaatdata.nl/scheepvaart/goederenvervoer/archief/'

In [2]:
# Column name -> polars dtype for every field in the IVS csv files.
# Passing this to the csv reader fixes the column types up front so they are
# the same for every file.
schema = {
    'Jaarmaand': pl.Int64,
    'Jaar': pl.Int64,
    'Maand': pl.Int64,
    'Weeknr': pl.Int64,
    'v05_06_begindt_evenement_iso': pl.String,
    'v05_06_Begindt_evenement': pl.String,
    'UNLO_herkomst': pl.String,
    'UNLO_bestemming': pl.String,
    'v15_1_Scheepstype_RWS': pl.String,
    'SK_CODE': pl.String,
    'v18_Laadvermogen': pl.Float64,
    'v28_Beladingscode': pl.Int64,
    # TODO: check the units of this column (kg?)
    'v38_Vervoerd_gewicht': pl.Int64,
    'v30_4_Containers_TEU_S': pl.Int64,
    # categories: see https://www.cbs.nl/en-gb/our-services/methods/definitions/commodity-nomenclature-nstr
    'nstr_nw': pl.String,
    'nst2007_nw': pl.String,
}


In [3]:
# Trial read of a single file with strict parsing: with ignore_errors=False a
# value that does not fit the schema raises instead of being passed over.
df = pl.read_csv(
    './IVS_weekmonitor_01APR2021.csv',
    separator=';',
    quote_char='"',
    schema=schema,
    ignore_errors=False,
)

In [4]:
# directory that holds the downloaded csv files (the notebook's working directory)
data_dir = pathlib.Path('.')
# Path.glob yields entries in arbitrary, filesystem-dependent order; sort them
# so every run processes the files in the same sequence.
paths = sorted(data_dir.glob('*.csv'))

In [5]:
# Convert every csv file into a parquet file with the same name next to it.
for path in tqdm.tqdm(paths):
    df = pl.read_csv(
        path,
        separator=';',
        quote_char='"',
        schema=schema,
        # lenient on purpose here: a parse problem in one file must not abort
        # the whole batch (NOTE(review): affected values are presumably read
        # as null -- confirm this is acceptable for the analysis)
        ignore_errors=True,
    )
    df.write_parquet(path.with_suffix('.parquet'))

  0%|          | 0/1079 [00:00<?, ?it/s]

Now we can re-read all the individual files. They should now all have static column types. We'll convert them into one file using dask. Dask can read multiple files and treat them as one. 

In [7]:
# Read only the per-file parquet conversions. A bare '*.parquet' glob would
# also match the combined outputs this notebook writes ('ivs-2024.parquet' and
# 'ivs-2024-one-file.parquet') as soon as it is run a second time, feeding the
# results back in as input and making the overwrite of 'ivs-2024.parquet'
# target a path that is being read.
combined_outputs = {'ivs-2024.parquet', 'ivs-2024-one-file.parquet'}
parquet_paths = sorted(
    str(parquet_path)
    for parquet_path in pathlib.Path('.').glob('*.parquet')
    if parquet_path.name not in combined_outputs
)
# dask builds one lazy dataframe over all listed files
ddf = dd.read_parquet(parquet_paths)

In [8]:
# Drop all duplicate records.
# NOTE(review): the original comment broke off after "this might need some";
# presumably a warning about the time/memory this step needs -- confirm.
ddf = ddf.drop_duplicates()
# store the de-duplicated dataset, replacing the output of any earlier run
ddf.to_parquet(
    'ivs-2024.parquet',
    overwrite=True,
)

In [10]:
# Materialise the lazy dask dataframe in memory and write it out as a single
# parquet file, so it can later be loaded in one go.
(
    ddf
    .compute()
    .to_parquet('ivs-2024-one-file.parquet')
)


In [13]:
# Read the combined file back with polars to check read performance.
ivs_df = pl.read_parquet(
    'ivs-2024-one-file.parquet',
)