Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,8 @@ dependencies = [
"dask (>=2024.12.0)",
"tqdm (>=4.67.0,<5.0.0)",
"psutil (>=6.1.0,<7.0.0)",
"pydantic (>=2.8.2,<3.0.0)",
"pydantic-settings (>=2.4.0,<3.0.0)",
"fsspec (>=2024.10.0)",
"segy (>=0.4.0,<0.5.0)",
"rich (>=13.9.4,<14.0.0)",
Expand Down
18 changes: 18 additions & 0 deletions src/mdio/schema/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
"""MDIO schemas for different data types."""

from mdio.schema.compressors import ZFP
from mdio.schema.compressors import Blosc
from mdio.schema.dimension import NamedDimension
from mdio.schema.dtype import ScalarType
from mdio.schema.dtype import StructuredField
from mdio.schema.dtype import StructuredType


# Public names re-exported by the schema package, grouped by source module.
__all__ = [
    # mdio.schema.compressors
    "Blosc",
    "ZFP",
    # mdio.schema.dimension
    "NamedDimension",
    # mdio.schema.dtype
    "ScalarType",
    "StructuredField",
    "StructuredType",
]
43 changes: 43 additions & 0 deletions src/mdio/schema/base.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
"""Base models to subclass from."""

from pydantic import ConfigDict
from pydantic import Field
from pydantic.json_schema import GenerateJsonSchema

from mdio.schema.compressors import ZFP
from mdio.schema.compressors import Blosc
from mdio.schema.core import CamelCaseStrictModel
from mdio.schema.dimension import NamedDimension
from mdio.schema.dtype import DataTypeModel


# Dialect URI exposed by Pydantic's default JSON schema generator.
JSON_SCHEMA_DIALECT = GenerateJsonSchema.schema_dialect


class BaseDataset(CamelCaseStrictModel):
    """A base class for MDIO datasets.

    Extends the configuration of `CamelCaseStrictModel` with a `$schema`
    entry that is added to the generated JSON schema. The dialect value is
    read from Pydantic's default schema generator, `GenerateJsonSchema`, so
    the advertised dialect matches the one that generator declares.
    """

    model_config = ConfigDict(json_schema_extra={"$schema": JSON_SCHEMA_DIALECT})


class BaseArray(DataTypeModel, CamelCaseStrictModel):
    """Schema shared by array definitions.

    Adds dimensions and optional compression settings on top of whatever
    `DataTypeModel` declares.
    """

    # Required: either full dimension objects or plain dimension names.
    dimensions: list[NamedDimension] | list[str] = Field(
        ...,
        description="List of Dimension collection or reference to dimension names.",
    )

    # Optional: no compressor is configured unless one is given.
    compressor: Blosc | ZFP | None = Field(
        default=None,
        description="Compression settings.",
    )


class NamedArray(BaseArray):
    """A `BaseArray` that also carries a name and an optional long name."""

    # Required identifier of the array.
    name: str = Field(
        ...,
        description="Name of the array.",
    )

    # Optional human-readable name; defaults to None.
    long_name: str | None = Field(
        default=None,
        description="Fully descriptive name.",
    )
44 changes: 44 additions & 0 deletions src/mdio/schema/chunk_grid.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
"""This module contains data models for Zarr's chunk grid."""

from __future__ import annotations

from pydantic import Field

from mdio.schema.core import CamelCaseStrictModel


class RegularChunkShape(CamelCaseStrictModel):
    """Chunk shape of a regular grid, given as a flat list of integers."""

    # Required; the description states one chunk length per array dimension.
    chunk_shape: list[int] = Field(
        ...,
        description="Lengths of the chunk along each dimension of the array.",
    )


class RectilinearChunkShape(CamelCaseStrictModel):
    """Chunk shape of an irregular grid, given as a nested list of integers."""

    # Required. NOTE(review): presumably one inner list of chunk lengths per
    # dimension -- confirm against the Zarr rectilinear chunk grid definition.
    chunk_shape: list[list[int]] = Field(
        ..., description="Lengths of the chunk along each dimension of the array."
    )


class RegularChunkGrid(CamelCaseStrictModel):
    """Chunk grid description whose name defaults to ``"regular"``."""

    # Grid type name; callers may omit it to get the default.
    name: str = Field(
        default="regular",
        description="The name of the chunk grid.",
    )

    # Required chunk shape settings for this grid.
    configuration: RegularChunkShape = Field(
        ...,
        description="Configuration of the regular chunk grid.",
    )


class RectilinearChunkGrid(CamelCaseStrictModel):
    """Chunk grid description whose name defaults to ``"rectilinear"``."""

    # Grid type name; callers may omit it to get the default.
    name: str = Field(
        default="rectilinear",
        description="The name of the chunk grid.",
    )

    # Required chunk shape settings for this grid.
    configuration: RectilinearChunkShape = Field(
        ...,
        description="Configuration of the irregular chunk grid.",
    )
159 changes: 159 additions & 0 deletions src/mdio/schema/compressors.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,159 @@
"""This module contains a Pydantic model to parameterize compressors.

Important Objects:
- Blosc: A Pydantic model that represents a Blosc compression setup.
- ZFP: Class that represents the ZFP compression model.
"""

from __future__ import annotations

from enum import IntEnum
from enum import StrEnum

from pydantic import Field
from pydantic import model_validator

from mdio.schema.core import CamelCaseStrictModel


class BloscAlgorithm(StrEnum):
"""Enum for Blosc algorithm options."""

BLOSCLZ = "blosclz"
LZ4 = "lz4"
LZ4HC = "lz4hc"
ZLIB = "zlib"
ZSTD = "zstd"


class BloscShuffle(IntEnum):
    """Integer codes of the shuffle options selectable for Blosc.

    Members are integers, so they compare equal to their numeric code.
    """

    NOSHUFFLE = 0
    SHUFFLE = 1
    BITSHUFFLE = 2
    # The only negative code; declared last, so it also iterates last.
    AUTOSHUFFLE = -1


class Blosc(CamelCaseStrictModel):
    """Schema for the parameters of a Blosc compressor."""

    name: str = Field(
        default="blosc",
        description="Name of the compressor.",
    )

    algorithm: BloscAlgorithm = Field(
        default=BloscAlgorithm.LZ4,
        description="The Blosc compression algorithm to be used.",
    )

    # Validated to lie within 0..9 inclusive.
    level: int = Field(
        default=5,
        ge=0,
        le=9,
        description="The compression level.",
    )

    shuffle: BloscShuffle = Field(
        default=BloscShuffle.SHUFFLE,
        description="The shuffle strategy to be applied before compression.",
    )

    blocksize: int = Field(
        default=0,
        description="The size of the block to be used for compression.",
    )

    def make_instance(self):  # noqa: ANN201
        """Build the Blosc codec object configured by this model.

        Returns:
            A `zarr.codecs.Blosc` instance created from the model's
            algorithm, level, shuffle and blocksize.
        """
        # Imported here rather than at module level, so zarr is only needed
        # when a codec is actually built.
        from zarr.codecs import Blosc as _Blosc

        codec_kwargs = {
            "cname": self.algorithm,
            "clevel": self.level,
            "shuffle": self.shuffle,
            "blocksize": self.blocksize,
        }
        return _Blosc(**codec_kwargs)


zfp_mode_map = {
"fixed_rate": 2,
"fixed_precision": 3,
"fixed_accuracy": 4,
"reversible": 5,
}


class ZFPMode(StrEnum):
"""Enum for ZFP algorithm modes."""

FIXED_RATE = "fixed_rate"
FIXED_PRECISION = "fixed_precision"
FIXED_ACCURACY = "fixed_accuracy"
REVERSIBLE = "reversible"

@property
def int_code(self) -> int:
"""Return the integer code of ZFP mode."""
return zfp_mode_map[self.value]


class ZFP(CamelCaseStrictModel):
    """Schema for the parameters of a ZFP compressor.

    `mode` selects the ZFP mode; `check_requirements` enforces which of
    `tolerance`, `rate` and `precision` must accompany it.
    """

    name: str = Field(default="zfp", description="Name of the compressor.")

    # Required; declared without a default.
    mode: ZFPMode = Field()

    tolerance: float | None = Field(
        default=None,
        description="Fixed accuracy in terms of absolute error tolerance.",
    )
    rate: float | None = Field(
        default=None,
        description="Fixed rate in terms of number of compressed bits per value.",
    )
    precision: int | None = Field(
        default=None,
        description="Fixed precision in terms of number of uncompressed bits per value.",
    )
    write_header: bool = Field(
        default=True,
        description="Encode array shape, scalar type, and compression parameters.",
    )

    @model_validator(mode="after")
    def check_requirements(self) -> ZFP:
        """Validate that the parameters are consistent with the mode.

        Returns:
            The validated model instance itself.

        Raises:
            ValueError: If REVERSIBLE mode is combined with any of
                tolerance, rate or precision, or if the parameter required
                by a FIXED_* mode is missing.
        """
        selected = self.mode

        # Reversible mode takes no tuning parameters at all.
        if selected == ZFPMode.REVERSIBLE:
            tunables = (self.tolerance, self.rate, self.precision)
            if not all(value is None for value in tunables):
                msg = "Other fields must be None in REVERSIBLE mode"
                raise ValueError(msg)
            return self

        # Each remaining mode requires its own companion parameter; the
        # other parameters are not checked.
        requirements = {
            ZFPMode.FIXED_ACCURACY: (
                self.tolerance,
                "Tolerance required for FIXED_ACCURACY mode",
            ),
            ZFPMode.FIXED_RATE: (
                self.rate,
                "Rate required for FIXED_RATE mode",
            ),
            ZFPMode.FIXED_PRECISION: (
                self.precision,
                "Precision required for FIXED_PRECISION mode",
            ),
        }
        if selected in requirements:
            required_value, msg = requirements[selected]
            if required_value is None:
                raise ValueError(msg)

        return self

    def make_instance(self):  # noqa: ANN201
        """Build the ZFPY codec object configured by this model.

        Returns:
            A `zarr.codecs.ZFPY` instance created from the mode's integer
            code plus tolerance, rate and precision. `write_header` is not
            forwarded to the codec.
        """
        # Imported here rather than at module level, so zarr is only needed
        # when a codec is actually built.
        from zarr.codecs import ZFPY as _ZFPY

        codec_kwargs = {
            "mode": self.mode.int_code,
            "tolerance": self.tolerance,
            "rate": self.rate,
            "precision": self.precision,
        }
        return _ZFPY(**codec_kwargs)


class CompressorModel(CamelCaseStrictModel):
    """Schema holding a single optional compressor setting."""

    # Either a Blosc or a ZFP configuration; None when left unset.
    compressor: Blosc | ZFP | None = Field(
        default=None,
        description="Compression settings.",
    )
49 changes: 49 additions & 0 deletions src/mdio/schema/core.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
"""This module implements the core components of the MDIO schemas."""

from __future__ import annotations

from typing import Any
from typing import get_type_hints

from pydantic import BaseModel
from pydantic import ConfigDict
from pydantic.alias_generators import to_camel


def model_fields(model: type[BaseModel]) -> dict[str, tuple[Any, Any]]:
    """Map each field of a Pydantic model to its type hint and field entry.

    Args:
        model: The model class whose fields will be extracted.

    Returns:
        A dictionary keyed by field name, in the iteration order of
        ``model.model_fields``. Each value is an ``(annotation, field)``
        tuple: ``annotation`` is the resolved type hint of the field and
        ``field`` is the corresponding entry of ``model.model_fields``
        (the field definition object, not the bare default value).

    Example:
        >>> class MyModel(BaseModel):
        ...     name: str
        ...     age: int = 0
        ...
        >>> list(model_fields(MyModel))
        ['name', 'age']
        >>> model_fields(MyModel)["age"][0]
        <class 'int'>
    """
    # Resolve annotations once; this also evaluates string annotations.
    annotations = get_type_hints(model)

    return {
        field_name: (annotations[field_name], field)
        for field_name, field in model.model_fields.items()
    }


class StrictModel(BaseModel):
    """Base model configured to forbid extra fields.

    The config also sets ``populate_by_name=True``.
    """

    model_config = ConfigDict(
        extra="forbid",
        populate_by_name=True,
    )


class CamelCaseStrictModel(StrictModel):
    """A `StrictModel` whose field aliases are generated by `to_camel`.

    Only the alias generator is set here; Pydantic merges a subclass config
    with the parent's, so the `StrictModel` settings still apply.
    """

    model_config = ConfigDict(
        alias_generator=to_camel,
    )
12 changes: 12 additions & 0 deletions src/mdio/schema/dimension.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
"""Dimension schema."""

from pydantic import Field

from mdio.schema.core import CamelCaseStrictModel


class NamedDimension(CamelCaseStrictModel):
    """A dimension described by its name and its size."""

    # Required identifier of the dimension.
    name: str = Field(
        ...,
        description="Unique identifier for the dimension.",
    )

    # Required and validated to be strictly positive.
    size: int = Field(
        ...,
        gt=0,
        description="Total size of the dimension.",
    )
Loading
Loading