# Testing X to Zarr Conversion

This section starts with the `MIKEConverter` from `src/zarrcatalogue/converters/mike.py`.

In [8]:
import sys
sys.path.append('../src')
from pathlib import Path
from zarrcatalogue.converters.mike import MIKEConverter
import mikeio
import numpy as np
import zarr

# Input dfsu file to convert. Other sample files that can be swapped in:
#   /teamspace/studios/this_studio/data/basin_2dv.dfsu
#   /teamspace/studios/this_studio/data/basin_3d.dfsu
#   /teamspace/studios/this_studio/data/oresund_sigma_z.dfsu
data_path = Path('/teamspace/studios/this_studio/data/oresundHD_run1.dfsu')

# Destination Zarr store for the converted data.
zarr_path = Path('/teamspace/studios/this_studio/data_zarr/oresundHD_run1.zarr')

# Converter instance used by the conversion cell that follows.
converter = MIKEConverter()

In [9]:
# Convert the dfsu file to Zarr with custom chunks and compression, then
# check the result against the original file.
chunks = {'time': 2, 'elements': 320}  # Example chunking
metadata = converter.to_zarr(
    data_path,
    zarr_path,
    chunks=chunks,
    compression_level=7
)

# Print metadata
print("\nConversion metadata:")
for key, value in metadata.items():
    print(f"{key}: {value}")

# Validate conversion
validation_results = converter.validate_conversion(data_path, zarr_path)
print("\nValidation results:")
for key, value in validation_results.items():
    print(f"{key}: {value}")

# Examine Zarr structure. The store is opened once, read-only, and reused for
# the data check below; `mode` is passed by keyword so the call does not rely
# on its positional slot.
store = zarr.open(zarr_path, mode='r')
print("\nZarr structure:")
print(store.tree())

# Basic data validation
print("\nData validation:")
original_ds = mikeio.read(data_path)
zarr_store = store  # same read-only handle; name kept for later use

# Compare first timestep of first variable
var_name = original_ds.names[0]
original_data = original_ds[var_name].to_numpy()[0]
zarr_data = zarr_store[f'data/{var_name}'][0]

print(f"\nComparing {var_name} data:")
print(f"Original shape: {original_data.shape}")
print(f"Zarr shape: {zarr_data.shape}")
# np.max would return NaN as soon as either array contains a NaN, hiding the
# real difference; nanmax reports the largest difference among finite values.
print(f"Max difference: {np.nanmax(np.abs(original_data - zarr_data))}")


Conversion metadata:
model_type: MIKE
converter_version: 0.1.0
conversion_time: 2024-12-10T15:05:38.288162
input_file: /teamspace/studios/this_studio/data/oresundHD_run1.dfsu
mikeio_version: 2.2.0
geometry_type: GeometryFM2D
n_elements: 3612
n_nodes: 2046
n_timesteps: 5
variables: ['Surface elevation', 'Total water depth', 'U velocity', 'V velocity']
time_range: ['2018-03-07 00:00:00', '2018-03-11 00:00:00']
element_info: {'max_nodes_per_element': 3, 'min_nodes_per_element': 3, 'element_types_present': [3], 'n_elements_3_nodes': 3612}
chunks: {'time': 2, 'elements': 320}
compression_level: 7

Validation results:
element_count_match: True
node_count_match: True
time_steps_match: True
variables_match: True
geometry_type_match: True
all_valid: True

Zarr structure:
/
 ├── data
 │   ├── Surface elevation (5, 3612) float32
 │   ├── Total water depth (5, 3612) float32
 │   ├── U velocity (5, 3612) float32
 │   ├── V velocity (5, 3612) float32
 │   └── time (5,) float64
 └── topology
     ├─

# Adding to catalogue 

## single file

In [2]:
import sys
sys.path.append('../src')
# Example usage with proper JSON serialization
from pathlib import Path
from zarrcatalogue.catalog import SimulationCatalog
import json

# First, let's clean up any corrupted files
import shutil
from pathlib import Path

catalog_path = Path("/teamspace/studios/this_studio/catalog")

# Start from a clean slate: drop whatever an earlier run left at this path.
stale_catalog_present = catalog_path.exists()
if stale_catalog_present:
    shutil.rmtree(catalog_path)

# Create the catalogue object on the (now absent) directory.
catalog = SimulationCatalog(catalog_path)


In [3]:

# Add a simulation
simulation_entry = catalog.add_simulation(
    sim_id="basin_2dv_20241210",
    source_file=Path('/teamspace/studios/this_studio/data/basin_2dv.dfsu'),
    metadata={
        "scenario": "baseline",
        "model_version": "2.2.0",
        "description": "Vertical profile simulation"
    },
    tags=["vertical_profile", "baseline"]
)


def _json_fallback(obj):
    """Convert values that ``json`` cannot serialize natively.

    Passed as ``default=`` to ``json.dumps``, so it is only called for
    objects the standard encoder rejects.

    Args:
        obj: The non-serializable value.

    Returns:
        ``obj.tolist()`` when available (NumPy scalars and arrays),
        ``obj.isoformat()`` for date/time objects, otherwise ``str(obj)``
        (e.g. ``Path``).
    """
    if hasattr(obj, "tolist"):
        return obj.tolist()
    if hasattr(obj, "isoformat"):
        return obj.isoformat()
    return str(obj)


# Print the entry. The previous `cls=CustomJSONEncoder` referenced a name that
# is not defined or imported anywhere in this notebook, so the cell failed on
# a fresh kernel; a local fallback keeps the cell self-contained.
print("Added simulation:")
print(json.dumps(simulation_entry, indent=2, default=_json_fallback))

Added simulation:
{
  "id": "basin_2dv_20241210",
  "source_file": "/teamspace/studios/this_studio/data/basin_2dv.dfsu",
  "zarr_store": "/teamspace/studios/this_studio/catalog/simulations/basin_2dv_20241210/data.zarr",
  "added_date": "2024-12-10T16:14:06.585226",
  "converter": "MIKE",
  "converter_version": "0.1.0",
  "conversion_metadata": {
    "model_type": "MIKE",
    "converter_version": "0.1.0",
    "conversion_time": "2024-12-10T16:14:06.584966",
    "input_file": "/teamspace/studios/this_studio/data/basin_2dv.dfsu",
    "mikeio_version": "2.2.0",
    "geometry_type": "GeometryFMVerticalProfile",
    "n_elements": 640,
    "n_nodes": 715,
    "n_timesteps": 3,
    "variables": [
      "U velocity",
      "V velocity",
      "W velocity"
    ],
    "time_range": [
      "2004-01-01 00:00:00",
      "2004-01-01 00:20:00"
    ],
    "element_info": {
      "max_nodes_per_element": 4,
      "min_nodes_per_element": 4,
      "element_types_present": [
        4
      ],
      "n_e

## Bulk

In [7]:
import sys
sys.path.append('../src')
# Example usage with proper JSON serialization
from pathlib import Path
from zarrcatalogue.catalog import SimulationCatalog
import json

from typing import Dict
from datetime import datetime


# Catalogue object for the directory that the bulk import writes into.
catalog = SimulationCatalog(
    Path("/teamspace/studios/this_studio/catalog")
)

# Optional: Define a metadata generator function
def generate_metadata(file_path: Path) -> Dict:
    """Build the user metadata attached to one imported file.

    Args:
        file_path: Path of the source file being imported.

    Returns:
        Dict with three entries, in this order: ``source_file`` (the path as
        a string), ``scenario`` (the part of the file stem before the first
        underscore, or the whole stem if there is none) and
        ``date_processed`` (current local time as an ISO 8601 string).
    """
    scenario, _, _ = file_path.stem.partition('_')
    metadata = {"source_file": str(file_path), "scenario": scenario}
    metadata["date_processed"] = datetime.now().isoformat()
    return metadata

# Import every file matching the pattern in the data directory in one call;
# the options are collected first so they are easy to scan and tweak.
bulk_import_options = {
    "source_dir": Path("/teamspace/studios/this_studio/data"),
    "pattern": "*.dfsu",
    "metadata_generator": generate_metadata,
    "tags": ["bulk_import", "2024"],
    "parallel": True,
    "max_workers": 4,
    "skip_existing": True,
}
results = catalog.bulk_import(**bulk_import_options)

# List each successfully imported simulation as "- <id>: <source file>".
print("\nSuccessfully imported simulations:")
for imported in results["successful"]:
    sim_id, source = imported['id'], imported['source_file']
    print(f"- {sim_id}: {source}")

Found 4 files to process
4 files remaining after removing existing entries


Processing files: 100%|██████████| 4/4 [00:00<00:00,  7.40it/s]


Import Summary:
Successful imports: 4
Failed imports: 0

Successfully imported simulations:
- basin_2dv: /teamspace/studios/this_studio/data/basin_2dv.dfsu
- basin_3d: /teamspace/studios/this_studio/data/basin_3d.dfsu
- oresundHD_run1: /teamspace/studios/this_studio/data/oresundHD_run1.dfsu
- oresund_sigma_z: /teamspace/studios/this_studio/data/oresund_sigma_z.dfsu





# Searching and analyzing the catalogue

## summary / overview

In [4]:
from zarrcatalogue.catalog import SimulationCatalog

# Catalogue object for the directory populated in the sections above.
catalog = SimulationCatalog(
    Path("/teamspace/studios/this_studio/catalog")
)

In [5]:
# Fetch the catalogue-wide summary, then print it.
catalog_summary = catalog.get_summary()
print(catalog_summary)

{'n_simulations': 5, 'geometry_types': ['GeometryFM2D', 'GeometryFM3D', 'GeometryFMVerticalProfile'], 'variables': ['Salinity', 'Surface elevation', 'Temperature', 'Total water depth', 'U velocity', 'V velocity', 'W velocity'], 'tags': ['2024', 'baseline', 'bulk_import', 'vertical_profile'], 'last_updated': '2024-12-10T16:17:47.421943'}


## search

In [6]:
# Search the catalogue by geometry type only.
# Further filters, currently disabled:
#   variables=["U velocity"]
#   tags=["baseline"]
results = catalog.search(geometry_type="GeometryFM2D")
results


Unnamed: 0,simulation_id,id,source_file,zarr_store,added_date,converter,converter_version,conversion_metadata,user_metadata,tags
0,oresundHD_run1,oresundHD_run1,/teamspace/studios/this_studio/data/oresundHD_...,/teamspace/studios/this_studio/catalog/simulat...,2024-12-10T16:17:47.349039,MIKE,0.1.0,"{'model_type': 'MIKE', 'converter_version': '0...",{'source_file': '/teamspace/studios/this_studi...,"[bulk_import, 2024]"


# Zarr to MIKE Conversion

In [7]:
import sys
sys.path.append('../src')
from pathlib import Path
from zarrcatalogue.converters.mike import MIKEConverter


converter = MIKEConverter()

# Round trip for the oresundHD_run1 store. Alternative pair for basin_2dv:
#   zarr_path=Path("/teamspace/studios/this_studio/catalog/simulations/basin_2dv/data.zarr")
#   output_file=Path("/teamspace/studios/this_studio/data/basin_2dv_backconversion.dfsu")
backconversion_paths = {
    "zarr_path": Path("/teamspace/studios/this_studio/catalog/simulations/oresundHD_run1/data.zarr"),
    "output_file": Path("/teamspace/studios/this_studio/data/oresundHD_run1_backconversion.dfsu"),
}

# Write the Zarr store back out as a dfsu file; keep the returned metadata.
metadata = converter.from_zarr(**backconversion_paths)

In [12]:
# Reload the back-converted file and compare it to the original.
import mikeio
import numpy as np

ds = mikeio.read("/teamspace/studios/this_studio/data/oresundHD_run1.dfsu")
ds_backconversion = mikeio.read("/teamspace/studios/this_studio/data/oresundHD_run1_backconversion.dfsu")
display(ds, ds_backconversion)

# The summaries above only show dims, time range and item names; they would
# look identical even if the values differed. Report the largest absolute
# difference per item so the round trip is checked on the data itself.
# nanmax is used because np.max returns NaN if any compared value is NaN.
for item_name in ds.names:
    abs_diff = np.abs(
        ds[item_name].to_numpy() - ds_backconversion[item_name].to_numpy()
    )
    print(f"{item_name}: max abs difference = {np.nanmax(abs_diff)}")

<mikeio.Dataset>
dims: (time:5, element:3612)
time: 2018-03-07 00:00:00 - 2018-03-11 00:00:00 (5 records)
geometry: Dfsu2D (3612 elements, 2046 nodes)
items:
  0:  Surface elevation <Surface Elevation> (meter)
  1:  Total water depth <Water Depth> (meter)
  2:  U velocity <u velocity component> (meter per sec)
  3:  V velocity <v velocity component> (meter per sec)

<mikeio.Dataset>
dims: (time:5, element:3612)
time: 2018-03-07 00:00:00 - 2018-03-11 00:00:00 (5 records)
geometry: Dfsu2D (3612 elements, 2046 nodes)
items:
  0:  Surface elevation <Surface Elevation> (meter)
  1:  Total water depth <Water Depth> (meter)
  2:  U velocity <u velocity component> (meter per sec)
  3:  V velocity <v velocity component> (meter per sec)