# Testing MIKE (dfsu) to Zarr Conversion

In [8]:
import sys
sys.path.append('../src')
from pathlib import Path
from zarrcatalogue.converters.mike import MIKEConverter
import mikeio
import numpy as np
import zarr

# Initialize converter (constructed with no arguments).
# This one instance is reused in the next cell for both to_zarr() and
# validate_conversion().
converter = MIKEConverter()

# Path to your test data.
#
# Other sample files that have been used with this notebook (set DATASET_FILE
# to one of them to switch):
#   basin_2dv.dfsu
#   basin_3d.dfsu
#   oresund_sigma_z.dfsu
STUDIO_ROOT = Path('/teamspace/studios/this_studio')
DATASET_FILE = 'oresundHD_run1.dfsu'

data_path = STUDIO_ROOT / 'data' / DATASET_FILE

# The output store name is derived from the input file name so that switching
# DATASET_FILE can never leave the converter writing into another dataset's
# store.
zarr_path = STUDIO_ROOT / 'data_zarr' / f'{data_path.stem}.zarr'

In [9]:
# Run the conversion with explicit chunk sizes and compression level
chunks = dict(time=2, elements=320)  # Example chunking
metadata = converter.to_zarr(data_path, zarr_path, chunks=chunks, compression_level=7)


def _print_mapping(heading, mapping):
    """Print ``heading`` followed by one ``key: value`` line per item of ``mapping``."""
    print(heading)
    for name, entry in mapping.items():
        print(f"{name}: {entry}")


# Show the metadata returned by the converter
_print_mapping("\nConversion metadata:", metadata)

# Run the converter's own validation of the written store and show the result
validation_results = converter.validate_conversion(data_path, zarr_path)
_print_mapping("\nValidation results:", validation_results)

# Examine Zarr structure
# The store is opened a single time, read-only. `mode` is passed by keyword so
# the call does not depend on the positional parameter order of zarr.open().
store = zarr.open(zarr_path, mode='r')
print("\nZarr structure:")
print(store.tree())

# Basic data validation
print("\nData validation:")
original_ds = mikeio.read(data_path)
# Reuse the handle opened above instead of opening the same store a second
# time; the `zarr_store` name is kept so existing references keep working.
zarr_store = store

# Compare first timestep of first variable
var_name = original_ds.names[0]
original_data = original_ds[var_name].to_numpy()[0]
zarr_data = zarr_store[f'data/{var_name}'][0]

print(f"\nComparing {var_name} data:")
print(f"Original shape: {original_data.shape}")
print(f"Zarr shape: {zarr_data.shape}")
print(f"Max difference: {np.max(np.abs(original_data - zarr_data))}")


Conversion metadata:
model_type: MIKE
converter_version: 0.1.0
conversion_time: 2024-12-10T15:05:38.288162
input_file: /teamspace/studios/this_studio/data/oresundHD_run1.dfsu
mikeio_version: 2.2.0
geometry_type: GeometryFM2D
n_elements: 3612
n_nodes: 2046
n_timesteps: 5
variables: ['Surface elevation', 'Total water depth', 'U velocity', 'V velocity']
time_range: ['2018-03-07 00:00:00', '2018-03-11 00:00:00']
element_info: {'max_nodes_per_element': 3, 'min_nodes_per_element': 3, 'element_types_present': [3], 'n_elements_3_nodes': 3612}
chunks: {'time': 2, 'elements': 320}
compression_level: 7

Validation results:
element_count_match: True
node_count_match: True
time_steps_match: True
variables_match: True
geometry_type_match: True
all_valid: True

Zarr structure:
/
 ├── data
 │   ├── Surface elevation (5, 3612) float32
 │   ├── Total water depth (5, 3612) float32
 │   ├── U velocity (5, 3612) float32
 │   ├── V velocity (5, 3612) float32
 │   └── time (5,) float64
 └── topology
     ├─

# Adding to catalogue

In [6]:
import sys
sys.path.append('../src')
# Example usage with proper JSON serialization
from pathlib import Path
from zarrcatalogue.catalog import SimulationCatalog
import json

class CustomJSONEncoder(json.JSONEncoder):
    """JSON encoder that also serialises NumPy scalars and arrays.

    Pass it as ``cls=CustomJSONEncoder`` to ``json.dumps`` / ``json.dump``.
    NumPy booleans, integers and floating-point scalars are converted to the
    matching built-in Python type and ``numpy.ndarray`` objects to (nested)
    lists. Anything else is left to ``json.JSONEncoder``, which raises
    ``TypeError`` for unsupported objects.
    """
    def default(self, obj):
        """Return a JSON-serialisable equivalent of ``obj``.

        Args:
            obj: The object the standard encoder could not serialise.

        Returns:
            ``bool``, ``int``, ``float`` or ``list`` for the NumPy types
            described on the class.

        Raises:
            TypeError: If ``obj`` is not a supported NumPy type (raised by the
                base class implementation).
        """
        import numpy as np
        # The abstract scalar base classes are used instead of listing every
        # concrete width: the alias ``np.float_`` no longer exists in
        # NumPy 2.0, and the base classes also cover any width not listed.
        if isinstance(obj, np.bool_):
            return bool(obj)
        if isinstance(obj, np.integer):
            return int(obj)
        if isinstance(obj, np.floating):
            return float(obj)
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        return super().default(obj)

# First, let's clean up any corrupted files
# WARNING: this is destructive. If the catalog directory already exists it is
# removed recursively on every run of this cell, so whatever an earlier
# session stored under it is discarded before the catalog is re-created.
import shutil
from pathlib import Path

catalog_path = Path("/teamspace/studios/this_studio/catalog")
if catalog_path.exists():
    shutil.rmtree(catalog_path)

# Initialize catalog
# Must run after the cleanup above: the catalog is constructed on the same
# path that was just removed.
catalog = SimulationCatalog(catalog_path)


In [7]:

# Register the basin_2dv sample file in the catalog
sim_metadata = {
    "scenario": "baseline",
    "model_version": "2.2.0",
    "description": "Vertical profile simulation",
}
sim_tags = ["vertical_profile", "baseline"]

simulation_entry = catalog.add_simulation(
    sim_id="basin_2dv_20241210",
    source_file=Path('/teamspace/studios/this_studio/data/basin_2dv.dfsu'),
    metadata=sim_metadata,
    tags=sim_tags,
)

# Display the returned entry as indented JSON, serialised with the custom encoder
print("Added simulation:")
entry_json = json.dumps(simulation_entry, indent=2, cls=CustomJSONEncoder)
print(entry_json)

Added simulation:
{
  "id": "basin_2dv_20241210",
  "source_file": "/teamspace/studios/this_studio/data/basin_2dv.dfsu",
  "zarr_store": "/teamspace/studios/this_studio/catalog/simulations/basin_2dv_20241210/data.zarr",
  "added_date": "2024-12-10T15:40:22.079617",
  "converter": "MIKE",
  "converter_version": "0.1.0",
  "conversion_metadata": {
    "model_type": "MIKE",
    "converter_version": "0.1.0",
    "conversion_time": "2024-12-10T15:40:22.079433",
    "input_file": "/teamspace/studios/this_studio/data/basin_2dv.dfsu",
    "mikeio_version": "2.2.0",
    "geometry_type": "GeometryFMVerticalProfile",
    "n_elements": 640,
    "n_nodes": 715,
    "n_timesteps": 3,
    "variables": [
      "U velocity",
      "V velocity",
      "W velocity"
    ],
    "time_range": [
      "2004-01-01 00:00:00",
      "2004-01-01 00:20:00"
    ],
    "element_info": {
      "max_nodes_per_element": 4,
      "min_nodes_per_element": 4,
      "element_types_present": [
        4
      ],
      "n_e

In [8]:
# Query the catalog by geometry type, variable name and tag
# NOTE(review): `results` is assigned here but not displayed by this cell.
search_filters = dict(
    geometry_type="GeometryFMVerticalProfile",
    variables=["U velocity"],
    tags=["baseline"],
)
results = catalog.search(**search_filters)

# Show the catalog summary
catalog_summary = catalog.get_summary()
print(catalog_summary)

{'n_simulations': 1, 'geometry_types': ['GeometryFMVerticalProfile'], 'variables': ['U velocity', 'V velocity', 'W velocity'], 'tags': ['baseline', 'vertical_profile'], 'last_updated': '2024-12-10T15:40:22.079624'}
