# 1. dfsu-Zarr-dfsu conversion

Starting with `MIKEConverter` from `src/zarrcatalogue/converters/mike.py`

In [12]:
# required imports
import sys
sys.path.append('../src')
from pathlib import Path
from zarrcatalogue.converters.mike import MIKEConverter
from zarrcatalogue.utils import analyze_zarr_storage

import mikeio
import numpy as np
import zarr

In [13]:
# Initialize the MIKE converter used for the dfsu <-> Zarr conversions in this section
converter = MIKEConverter()

# Input dfsu file, given relative to the notebook's working directory
data_path = Path('../tests/testdata/oresundHD_run1.dfsu')
# Absolute-path alternative (environment-specific), kept for reference:
#data_path = Path('/teamspace/studios/this_studio/DataCatalogue/tests/testdata/oresundHD_run1.dfsu')

# Path of the Zarr store to write, relative to the working directory
zarr_path = Path('oresundHD_run1.zarr')
# Absolute-path alternative (environment-specific), kept for reference:
#zarr_path = Path('/teamspace/studios/this_studio/oresundHD_run1.zarr')

### convert single file from dfsu to zarr

In [2]:
# Example chunk layout for the Zarr store, keyed by dimension name
chunks = dict(time=2, elements=320)

# Convert the dfsu file to Zarr using the chunking above and compression level 7;
# the value returned by the converter is kept in `metadata`
metadata = converter.to_zarr(data_path, zarr_path, chunks=chunks, compression_level=7)

Check the results of the conversion (number of files and total size of the Zarr store) to assess whether the chosen chunking is suitable for the problem.

In [3]:
# Collect storage statistics for the Zarr store written above
stats = analyze_zarr_storage(zarr_path)

# Report the file count and the human-readable total size, one line each
for label, key in (
    ("Total number of files", "total_files"),
    ("Total size", "total_size_formatted"),
):
    print(f"{label}: {stats[key]}")

Total number of files: 19
Total size: 384.86 KB


### convert from zarr (back) to dfsu

and check that the round-tripped file is consistent with the original

In [3]:
# Write the Zarr store back to a dfsu file named "foo.dfsu"; the value returned by
# the call (the conversion metadata) is displayed as this cell's output
converter.from_zarr(zarr_path, "foo.dfsu")

{'model_type': 'MIKE',
 'converter_version': '0.1.0',
 'conversion_time': '2025-04-01T09:33:15.809831',
 'input_file': '/teamspace/studios/this_studio/oresundHD_run1.zarr',
 'output_file': 'foo.dfsu',
 'mikeio_version': '2.4.0',
 'geometry_type': 'GeometryFM2D',
 'n_elements': 3612,
 'n_nodes': 2046,
 'n_timesteps': 5,
 'variables': ['Total water depth',
  'V velocity',
  'U velocity',
  'Surface elevation'],
 'time_range': ['2018-03-07 00:00:00', '2018-03-11 00:00:00']}

In [9]:
# Open the original file and the round-tripped file once each
original_dfs = mikeio.open(data_path)
roundtrip_dfs = mikeio.open("foo.dfsu")

# Show both file summaries one after the other for a visual comparison
display(original_dfs, roundtrip_dfs)

# Distinct differences between consecutive time stamps of the round-tripped file
# (the first difference has no predecessor and therefore shows up as NaT)
display(roundtrip_dfs.time.diff().unique())

<mikeio.Dfsu2DH>
number of elements: 3612
number of nodes: 2046
projection: UTM-33
items:
  0:  Surface elevation <Surface Elevation> (meter)
  1:  Total water depth <Water Depth> (meter)
  2:  U velocity <u velocity component> (meter per sec)
  3:  V velocity <v velocity component> (meter per sec)
time: 2018-03-07 00:00:00 - 2018-03-11 00:00:00 (5 records)

<mikeio.Dfsu2DH>
number of elements: 3612
number of nodes: 2046
projection: UTM-33
items:
  0:  Surface elevation <Surface Elevation> (meter)
  1:  Total water depth <Water Depth> (meter)
  2:  U velocity <u velocity component> (meter per sec)
  3:  V velocity <v velocity component> (meter per sec)
time: 2018-03-07 00:00:00 - 2018-03-11 00:00:00 (5 records)

TimedeltaIndex([NaT, '1 days'], dtype='timedelta64[ns]', freq=None)

------

# 2. Building a Data catalogue 

A data catalogue can in principle include all kinds of data, from observed time series to 1D, 2D and 3D simulation results.
All data is automatically labelled with metadata during conversion; optional metadata can be added by the user.
The goal is to make everything (1) quickly searchable and (2) accessible from a single source.


## Initialize Catalogue


In [17]:
from zarrcatalogue.catalog import SimulationCatalog, CustomJSONEncoder
import json
import shutil

# Location of the catalogue, relative to the notebook's working directory
catalog_path = Path("catalog") # will create a folder "catalog" in the specified path

# Remove any catalogue folder left over from a previous run, so the catalogue
# below is created from scratch (this deletes the folder and everything in it)
if catalog_path.exists():
    shutil.rmtree(catalog_path)

# Initialize catalog
catalog = SimulationCatalog(catalog_path)

### add single simulation

with custom metadata and tags

In [18]:
# User-supplied metadata and tags to attach to the simulation
user_metadata = dict(
    scenario="baseline",
    model_version="1.0",
    description="Oresund simulation",
)
simulation_tags = ["2D", "HD", "baseline", "Bathymetry2017"]

# Register the dfsu file (data_path is defined in the conversion section) under a fixed id
simulation_entry = catalog.add_simulation(
    sim_id="oresundHD_run1",
    source_file=data_path,
    metadata=user_metadata,
    tags=simulation_tags,
)

# Pretty-print the returned entry; CustomJSONEncoder is used for serialisation
print("Added simulation:")
entry_json = json.dumps(simulation_entry, indent=2, cls=CustomJSONEncoder)
print(entry_json)

Added simulation:
{
  "id": "oresundHD_run1",
  "source_file": "/teamspace/studios/this_studio/DataCatalogue/tests/testdata/oresundHD_run1.dfsu",
  "zarr_store": "catalog/simulations/oresundHD_run1/data.zarr",
  "added_date": "2025-04-01T12:24:29.849847",
  "converter": "MIKE",
  "converter_version": "0.1.0",
  "conversion_metadata": {
    "model_type": "MIKE",
    "converter_version": "0.1.0",
    "conversion_time": "2025-04-01T12:24:29.842667",
    "input_file": "/teamspace/studios/this_studio/DataCatalogue/tests/testdata/oresundHD_run1.dfsu",
    "mikeio_version": "2.4.0",
    "geometry_type": "GeometryFM2D",
    "n_elements": 3612,
    "n_nodes": 2046,
    "n_timesteps": 5,
    "variables": [
      "Surface elevation",
      "Total water depth",
      "U velocity",
      "V velocity"
    ],
    "time_range": [
      "2018-03-07 00:00:00",
      "2018-03-11 00:00:00"
    ],
    "element_info": {
      "max_nodes_per_element": 3,
      "min_nodes_per_element": 3,
      "element_types

### Import in bulk to catalogue

In [None]:
# Imports required by the metadata generator below. They were previously
# commented out, which made this cell fail with a NameError on a fresh kernel.
from datetime import datetime
from typing import Dict


# Optional: Define a metadata generator function
def generate_metadata(file_path: Path) -> Dict:
    """Generate user metadata for a simulation file from its path.

    Args:
        file_path: Path of the file that is being imported.

    Returns:
        A dict with three string entries:
        ``source_file`` (the path as a string), ``scenario`` (the part of the
        file stem before the first underscore, or the whole stem if it has no
        underscore) and ``date_processed`` (the current local time in ISO 8601
        format).
    """
    return {
        "source_file": str(file_path),
        # e.g. "baseline_run1.dfsu" -> "baseline"
        "scenario": file_path.stem.split('_')[0],
        "date_processed": datetime.now().isoformat()
    }

# Directory scanned for files to import.
# NOTE(review): this is an absolute, environment-specific path — adjust it to your setup.
bulk_source_dir = Path("/teamspace/studios/this_studio/data")

# Import all files matching the pattern in a single call; every imported
# simulation gets the generated metadata and the tags listed here
results = catalog.bulk_import(
    source_dir=bulk_source_dir,
    pattern="*.dfsu",
    metadata_generator=generate_metadata,
    tags=["bulk_import", "2025"],
    parallel=True,
    max_workers=4,
    skip_existing=True,
)

# List the id and source file of every entry reported as successful
print("\nSuccessfully imported simulations:")
for entry in results["successful"]:
    print("- {}: {}".format(entry['id'], entry['source_file']))

-----

# 3. Search and analyze catalogue

#### get summary

In [21]:
# Show an overview of the catalogue; the returned value is displayed as the cell
# output (simulation count, geometry types, variables, tags, last update time)
catalog.get_summary()

{'n_simulations': 1,
 'geometry_types': ['GeometryFM2D'],
 'variables': ['Surface elevation',
  'Total water depth',
  'U velocity',
  'V velocity'],
 'tags': ['2D', 'HD', 'baseline'],
 'last_updated': '2025-04-01T12:24:29.849862'}

#### search

by metadata attributes, tags or recency

In [20]:
# Search the catalogue by geometry type. The commented-out keyword arguments are
# further filters that can be enabled — NOTE(review): confirm the supported
# keywords against SimulationCatalog.search.
# NOTE: this rebinds `results`, which held the bulk-import result in the previous section.
results = catalog.search(
    geometry_type="GeometryFM2D",
    #variables=["U velocity"],
    #tags=["baseline"],
)
# Last expression of the cell: display the matches as the cell output
results


Unnamed: 0,simulation_id,id,source_file,zarr_store,added_date,converter,converter_version,conversion_metadata,user_metadata,tags
0,oresundHD_run1,oresundHD_run1,/teamspace/studios/this_studio/DataCatalogue/t...,catalog/simulations/oresundHD_run1/data.zarr,2025-04-01T12:24:29.849847,MIKE,0.1.0,"{'model_type': 'MIKE', 'converter_version': '0...","{'scenario': 'baseline', 'model_version': '1.0...","[2D, HD, baseline]"
