diff --git a/pyproject.toml b/pyproject.toml index 92c59f2e..f5b3abfa 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -29,6 +29,8 @@ dependencies = [ "dask (>=2024.12.0)", "tqdm (>=4.67.0,<5.0.0)", "psutil (>=6.1.0,<7.0.0)", + "pydantic (>=2.8.2,<3.0.0)", + "pydantic-settings (>=2.4.0,<3.0.0)", "fsspec (>=2024.10.0)", "segy (>=0.4.0,<0.5.0)", "rich (>=13.9.4,<14.0.0)", diff --git a/src/mdio/schema/__init__.py b/src/mdio/schema/__init__.py new file mode 100644 index 00000000..acdce84a --- /dev/null +++ b/src/mdio/schema/__init__.py @@ -0,0 +1,18 @@ +"""MDIO schemas for different data types.""" + +from mdio.schema.compressors import ZFP +from mdio.schema.compressors import Blosc +from mdio.schema.dimension import NamedDimension +from mdio.schema.dtype import ScalarType +from mdio.schema.dtype import StructuredField +from mdio.schema.dtype import StructuredType + + +__all__ = [ + "Blosc", + "ZFP", + "NamedDimension", + "ScalarType", + "StructuredField", + "StructuredType", +] diff --git a/src/mdio/schema/base.py b/src/mdio/schema/base.py new file mode 100644 index 00000000..372aebda --- /dev/null +++ b/src/mdio/schema/base.py @@ -0,0 +1,43 @@ +"""Base models to subclass from.""" + +from pydantic import ConfigDict +from pydantic import Field +from pydantic.json_schema import GenerateJsonSchema + +from mdio.schema.compressors import ZFP +from mdio.schema.compressors import Blosc +from mdio.schema.core import CamelCaseStrictModel +from mdio.schema.dimension import NamedDimension +from mdio.schema.dtype import DataTypeModel + + +JSON_SCHEMA_DIALECT = GenerateJsonSchema.schema_dialect + + +class BaseDataset(CamelCaseStrictModel): + """A base class for MDIO datasets. + + We add schema dialect to extend the config of `CamelCaseStrictModel`. + We use the default Pydantic schema generator `GenerateJsonSchema` to + define the JSON schema dialect accurately. 
+ """ + + model_config = ConfigDict(json_schema_extra={"$schema": JSON_SCHEMA_DIALECT}) + + +class BaseArray(DataTypeModel, CamelCaseStrictModel): + """A base array schema.""" + + dimensions: list[NamedDimension] | list[str] = Field( + ..., description="List of Dimension collection or reference to dimension names." + ) + compressor: Blosc | ZFP | None = Field( + default=None, description="Compression settings." + ) + + +class NamedArray(BaseArray): + """An array with a name.""" + + name: str = Field(..., description="Name of the array.") + long_name: str | None = Field(default=None, description="Fully descriptive name.") diff --git a/src/mdio/schema/chunk_grid.py b/src/mdio/schema/chunk_grid.py new file mode 100644 index 00000000..11729b28 --- /dev/null +++ b/src/mdio/schema/chunk_grid.py @@ -0,0 +1,44 @@ +"""This module contains data models for Zarr's chunk grid.""" + +from __future__ import annotations + +from pydantic import Field + +from mdio.schema.core import CamelCaseStrictModel + + +class RegularChunkShape(CamelCaseStrictModel): + """Represents regular chunk sizes along each dimension.""" + + chunk_shape: list[int] = Field( + ..., description="Lengths of the chunk along each dimension of the array." + ) + + +class RectilinearChunkShape(CamelCaseStrictModel): + """Represents irregular chunk sizes along each dimension.""" + + chunk_shape: list[list[int]] = Field( + ..., + description="Lengths of the chunk along each dimension of the array.", + ) + + +class RegularChunkGrid(CamelCaseStrictModel): + """Represents a rectangular and regularly spaced chunk grid.""" + + name: str = Field(default="regular", description="The name of the chunk grid.") + + configuration: RegularChunkShape = Field( + ..., description="Configuration of the regular chunk grid." 
+ ) + + +class RectilinearChunkGrid(CamelCaseStrictModel): + """Represents a rectangular and irregularly spaced chunk grid.""" + + name: str = Field(default="rectilinear", description="The name of the chunk grid.") + + configuration: RectilinearChunkShape = Field( + ..., description="Configuration of the irregular chunk grid." + ) diff --git a/src/mdio/schema/compressors.py b/src/mdio/schema/compressors.py new file mode 100644 index 00000000..45c31d4f --- /dev/null +++ b/src/mdio/schema/compressors.py @@ -0,0 +1,159 @@ +"""This module contains Pydantic models to parameterize compressors. + +Important Objects: + - Blosc: A Pydantic model that represents a Blosc compression setup. + - ZFP: Class that represents the ZFP compression model. +""" + +from __future__ import annotations + +from enum import IntEnum +from enum import StrEnum + +from pydantic import Field +from pydantic import model_validator + +from mdio.schema.core import CamelCaseStrictModel + + +class BloscAlgorithm(StrEnum): + """Enum for Blosc algorithm options.""" + + BLOSCLZ = "blosclz" + LZ4 = "lz4" + LZ4HC = "lz4hc" + ZLIB = "zlib" + ZSTD = "zstd" + + +class BloscShuffle(IntEnum): + """Enum for Blosc shuffle options.""" + + NOSHUFFLE = 0 + SHUFFLE = 1 + BITSHUFFLE = 2 + AUTOSHUFFLE = -1 + + +class Blosc(CamelCaseStrictModel): + """Data Model for Blosc options.""" + + name: str = Field(default="blosc", description="Name of the compressor.") + algorithm: BloscAlgorithm = Field( + default=BloscAlgorithm.LZ4, + description="The Blosc compression algorithm to be used.", + ) + level: int = Field(default=5, ge=0, le=9, description="The compression level.") + shuffle: BloscShuffle = Field( + default=BloscShuffle.SHUFFLE, + description="The shuffle strategy to be applied before compression.", + ) + blocksize: int = Field( + default=0, + description="The size of the block to be used for compression.", + ) + + def make_instance(self): # noqa: ANN201 + """Instantiate the Blosc codec configured by this model.""" + from 
zarr.codecs import Blosc as _Blosc + + return _Blosc( + cname=self.algorithm, + clevel=self.level, + shuffle=self.shuffle, + blocksize=self.blocksize, + ) + + +zfp_mode_map = { + "fixed_rate": 2, + "fixed_precision": 3, + "fixed_accuracy": 4, + "reversible": 5, +} + + +class ZFPMode(StrEnum): + """Enum for ZFP algorithm modes.""" + + FIXED_RATE = "fixed_rate" + FIXED_PRECISION = "fixed_precision" + FIXED_ACCURACY = "fixed_accuracy" + REVERSIBLE = "reversible" + + @property + def int_code(self) -> int: + """Return the integer code of ZFP mode.""" + return zfp_mode_map[self.value] + + +class ZFP(CamelCaseStrictModel): + """Data Model for ZFP options.""" + + name: str = Field(default="zfp", description="Name of the compressor.") + mode: ZFPMode = Field() + + tolerance: float | None = Field( + default=None, + description="Fixed accuracy in terms of absolute error tolerance.", + ) + + rate: float | None = Field( + default=None, + description="Fixed rate in terms of number of compressed bits per value.", + ) + + precision: int | None = Field( + default=None, + description="Fixed precision in terms of number of uncompressed bits per value.", + ) + + write_header: bool = Field( + default=True, + description="Encode array shape, scalar type, and compression parameters.", + ) + + @model_validator(mode="after") + def check_requirements(self) -> ZFP: + """Check if ZFP parameters make sense.""" + mode = self.mode + + # Check if reversible mode is provided without other parameters. 
+ if mode == ZFPMode.REVERSIBLE and any( + getattr(self, key) is not None for key in ["tolerance", "rate", "precision"] + ): + msg = "Other fields must be None in REVERSIBLE mode" + raise ValueError(msg) + + if mode == ZFPMode.FIXED_ACCURACY and self.tolerance is None: + msg = "Tolerance required for FIXED_ACCURACY mode" + raise ValueError(msg) + + if mode == ZFPMode.FIXED_RATE and self.rate is None: + msg = "Rate required for FIXED_RATE mode" + raise ValueError(msg) + + if mode == ZFPMode.FIXED_PRECISION and self.precision is None: + msg = "Precision required for FIXED_PRECISION mode" + raise ValueError(msg) + + return self + + def make_instance(self): # noqa: ANN201 + """Instantiate the ZFPY codec configured by this model.""" + from zarr.codecs import ZFPY as _ZFPY + + return _ZFPY( + mode=self.mode.int_code, + tolerance=self.tolerance, + rate=self.rate, + precision=self.precision, + ) + + +class CompressorModel(CamelCaseStrictModel): + """Model representing compressor configuration.""" + + compressor: Blosc | ZFP | None = Field( + default=None, description="Compression settings." + ) diff --git a/src/mdio/schema/core.py b/src/mdio/schema/core.py new file mode 100644 index 00000000..34a09066 --- /dev/null +++ b/src/mdio/schema/core.py @@ -0,0 +1,49 @@ +"""This module implements the core components of the MDIO schemas.""" + +from __future__ import annotations + +from typing import Any +from typing import get_type_hints + +from pydantic import BaseModel +from pydantic import ConfigDict +from pydantic.alias_generators import to_camel + + +def model_fields(model: type[BaseModel]) -> dict[str, tuple[Any, Any]]: + """Extract Pydantic BaseModel fields. + + Args: + model: (Type) The model object for which the fields will be extracted. + + Returns: + A dictionary mapping each field name to a tuple of its resolved + type hint and its entry from `model.model_fields`. + + Example: + >>> class MyModel(BaseModel): + ... name: str + ... age: int = 0 + ... 
+ >>> model_fields(MyModel)["age"] + (<class 'int'>, FieldInfo(annotation=int, required=False, default=0)) + """ + annotations = get_type_hints(model) + + fields = {} + for field_name, field in model.model_fields.items(): + fields[field_name] = (annotations[field_name], field) + + return fields + + +class StrictModel(BaseModel): + """A model with forbidden extras.""" + + model_config = ConfigDict(extra="forbid", populate_by_name=True) + + +class CamelCaseStrictModel(StrictModel): + """A model with forbidden extras and camel case aliases.""" + + model_config = ConfigDict(alias_generator=to_camel) diff --git a/src/mdio/schema/dimension.py b/src/mdio/schema/dimension.py new file mode 100644 index 00000000..da9c90a3 --- /dev/null +++ b/src/mdio/schema/dimension.py @@ -0,0 +1,12 @@ +"""Dimension schema.""" + +from pydantic import Field + +from mdio.schema.core import CamelCaseStrictModel + + +class NamedDimension(CamelCaseStrictModel): + """Represents a single dimension with a name and size.""" + + name: str = Field(..., description="Unique identifier for the dimension.") + size: int = Field(..., gt=0, description="Total size of the dimension.") diff --git a/src/mdio/schema/dtype.py b/src/mdio/schema/dtype.py new file mode 100644 index 00000000..ba4f7724 --- /dev/null +++ b/src/mdio/schema/dtype.py @@ -0,0 +1,64 @@ +"""Schemas for scalar types. + +We take booleans, unsigned and signed integers, floats, and +complex numbers from numpy data types and allow those. 
+""" + +from __future__ import annotations + +from enum import StrEnum + +import numpy as np +from pydantic import Field + +from mdio.schema.core import CamelCaseStrictModel + + +ALLOWED_TYPES = [ + # Boolean + np.bool_.__name__, + # Signed integers + np.int8.__name__, + np.int16.__name__, + np.int32.__name__, + np.int64.__name__, + # Unsigned integers + np.uint8.__name__, + np.uint16.__name__, + np.uint32.__name__, + np.uint64.__name__, + # Floating point (float128 is a platform-dependent alias; skip it where absent) + np.float16.__name__, + np.float32.__name__, + np.float64.__name__, + *([np.float128.__name__] if hasattr(np, "float128") else []), + # Complex + np.complex64.__name__, + np.complex128.__name__, + np.clongdouble.__name__, +] + + +ScalarType = StrEnum("ScalarType", {t.upper(): t for t in ALLOWED_TYPES}) +ScalarType.__doc__ = """Scalar array data type.""" + + +class StructuredField(CamelCaseStrictModel): + """Structured array field with name, format.""" + + format: ScalarType = Field(...) + name: str = Field(...) + + +class StructuredType(CamelCaseStrictModel): + """Structured array type with packed fields.""" + + fields: list[StructuredField] = Field() + + +class DataTypeModel(CamelCaseStrictModel): + """Model carrying the `data_type` of an array, scalar or structured.""" + + data_type: ScalarType | StructuredType = Field( + ..., description="Type of the array." 
+ ) diff --git a/src/mdio/schema/metadata.py b/src/mdio/schema/metadata.py new file mode 100644 index 00000000..49f13e2e --- /dev/null +++ b/src/mdio/schema/metadata.py @@ -0,0 +1,31 @@ +"""Metadata schemas and conventions.""" + +from typing import Any + +from pydantic import Field + +from mdio.schema.chunk_grid import RectilinearChunkGrid +from mdio.schema.chunk_grid import RegularChunkGrid +from mdio.schema.core import CamelCaseStrictModel + + +class ChunkGridMetadata(CamelCaseStrictModel): + """Definition of chunk grid.""" + + chunk_grid: RegularChunkGrid | RectilinearChunkGrid | None = Field( + default=None, + description="Chunk grid specification for the array.", + ) + + +class VersionedMetadataConvention(CamelCaseStrictModel): + """Data model for versioned metadata convention.""" + + +class UserAttributes(CamelCaseStrictModel): + """User defined attributes as key/value pairs.""" + + attributes: dict[str, Any] | None = Field( + default=None, + description="User defined attributes as key/value pairs.", + ) diff --git a/src/mdio/schema/units.py b/src/mdio/schema/units.py new file mode 100644 index 00000000..95e6943c --- /dev/null +++ b/src/mdio/schema/units.py @@ -0,0 +1,51 @@ +"""Common units for resource assessment data.""" + +from __future__ import annotations + +from enum import Enum +from enum import unique + +from pydantic import Field +from pydantic import create_model + +from mdio.schema.core import CamelCaseStrictModel + + +@unique +class UnitEnum(str, Enum): + """An Enum representing units as strings, from pint.""" + + +def create_unit_model( + unit_enum: type[UnitEnum], + model_name: str, + quantity: str, + module: str, +) -> type[CamelCaseStrictModel]: + """Dynamically creates a pydantic model from a unit Enum. + + Args: + unit_enum: UnitEnum representing the units for a specific quantity. + model_name: The name of the model to be created. + quantity: String representing the quantity for which the unit model is created. 
+ module: Name of the module in which the model is to be created. + This should be the `__name__` attribute of the module. + + Returns: + A Pydantic Model representing the unit model derived from `CamelCaseStrictModel`. + + Example: + unit_enum = UnitEnum + model_name = "LengthUnitModel" + quantity = "length" + create_unit_model(unit_enum, model_name, quantity, __name__) + """ + fields = {quantity: (unit_enum, Field(..., description=f"Unit of {quantity}."))} + + return create_model( + model_name, + **fields, + __base__=CamelCaseStrictModel, + __doc__=f"Model representing units of {quantity}.", + __module__=module, + ) diff --git a/src/mdio/schema/v0/__init__.py b/src/mdio/schema/v0/__init__.py new file mode 100644 index 00000000..9e0aa47c --- /dev/null +++ b/src/mdio/schema/v0/__init__.py @@ -0,0 +1,6 @@ +"""Schema specific to MDIO v0.""" + +from mdio.schema.v0.dataset import DatasetModelV0 + + +__all__ = ["DatasetModelV0"] diff --git a/src/mdio/schema/v0/dataset.py b/src/mdio/schema/v0/dataset.py new file mode 100644 index 00000000..96a9c183 --- /dev/null +++ b/src/mdio/schema/v0/dataset.py @@ -0,0 +1,88 @@ +"""Dataset model for MDIO V0.""" + +from __future__ import annotations + +from pydantic import AwareDatetime +from pydantic import Field + +from mdio.schema.base import BaseArray +from mdio.schema.base import BaseDataset +from mdio.schema.core import CamelCaseStrictModel +from mdio.schema.core import StrictModel + + +class DimensionModelV0(CamelCaseStrictModel): + """Represents dimension schema for MDIO v0.""" + + name: str = Field(..., description="Name of the dimension.") + coords: list[int] = Field(..., description="Coordinate labels (ticks).") + + +class DatasetMetadataModelV0(StrictModel): + """Represents dataset attributes schema for MDIO v0.""" + + api_version: str = Field( + ..., + description="MDIO version.", + ) + + created: AwareDatetime = Field( + ..., + description="Creation time with TZ info.", + ) + + dimension: list[DimensionModelV0] = Field( + ..., + 
description="Dimensions.", + ) + + mean: float | None = Field( + default=None, + description="Mean value of the samples.", + ) + + # Statistical information + std: float | None = Field( + default=None, description="Standard deviation of the samples." + ) + + rms: float | None = Field( + default=None, description="Root mean squared value of the samples." + ) + + min: float | None = Field( + default=None, + description="Minimum value of the samples.", + ) + + max: float | None = Field( + default=None, + description="Maximum value of the samples.", + ) + + trace_count: int | None = Field( + default=None, description="Number of traces in the SEG-Y file." + ) + + +class VariableModelV0(BaseArray): + """Represents an MDIO v0 variable schema.""" + + +class DatasetModelV0(BaseDataset): + """Represents an MDIO v0 dataset schema.""" + + seismic: list[VariableModelV0] = Field( + ..., + description="Variable containing seismic.", + ) + + headers: list[VariableModelV0] = Field( + ..., + description="Variable containing headers.", + ) + + metadata: DatasetMetadataModelV0 = Field( + ..., + description="Dataset metadata.", + ) diff --git a/src/mdio/schema/v1/__init__.py b/src/mdio/schema/v1/__init__.py new file mode 100644 index 00000000..d26e9981 --- /dev/null +++ b/src/mdio/schema/v1/__init__.py @@ -0,0 +1,6 @@ +"""Schema specific to MDIO v1.""" + +from mdio.schema.v1.dataset import Dataset + + +__all__ = ["Dataset"] diff --git a/src/mdio/schema/v1/dataset.py b/src/mdio/schema/v1/dataset.py new file mode 100644 index 00000000..f7f7fb1c --- /dev/null +++ b/src/mdio/schema/v1/dataset.py @@ -0,0 +1,49 @@ +"""Dataset model for MDIO V1.""" + +from pydantic import AwareDatetime +from pydantic import Field +from pydantic import create_model + +from mdio.schema.base import BaseDataset +from mdio.schema.core import CamelCaseStrictModel +from mdio.schema.core import model_fields +from mdio.schema.metadata import UserAttributes +from mdio.schema.v1.variable import Variable + + +class 
DatasetInfo(CamelCaseStrictModel): + """Contains information about a dataset.""" + + name: str = Field(..., description="Name or identifier for the dataset.") + + api_version: str = Field( + ..., + description="The version of the MDIO API that the dataset complies with.", + ) + + created_on: AwareDatetime = Field( + ..., + description=( + "The timestamp indicating when the dataset was first created, " + "including timezone information. Expressed in ISO 8601 format." + ), + ) + + +DatasetMetadata = create_model( + "DatasetMetadata", + **model_fields(DatasetInfo), + **model_fields(UserAttributes), + __base__=CamelCaseStrictModel, +) +DatasetMetadata.__doc__ = "The metadata about the dataset." + + +class Dataset(BaseDataset): + """Represents an MDIO v1 dataset. + + A dataset consists of variables and metadata. + """ + + variables: list[Variable] = Field(..., description="Variables in MDIO dataset") + metadata: DatasetMetadata = Field(..., description="Dataset metadata.") diff --git a/src/mdio/schema/v1/stats.py b/src/mdio/schema/v1/stats.py new file mode 100644 index 00000000..66adf0a8 --- /dev/null +++ b/src/mdio/schema/v1/stats.py @@ -0,0 +1,69 @@ +"""Statistics schema for MDIO v1 arrays. + +This module provides two Histogram classes (CenteredBinHistogram and +EdgeDefinedHistogram),a summary statistics class, and a summary statistics +metadata class. +SummaryStatistics: a class that represents the minimum summary statistics +of an array consisting of count, sum, sum of squares, min, max, and a histogram. +SummaryStatisticsMetadata: represents metadata for statistics, with a field +for v1 of the stats. +CenteredBinHistogram takes the center points of each bin in a histogram, +while EdgeDefinedHistogram takes the left edges and widths of each bin. +Both classes extend from the base class BaseHistogram, which represents +a histogram with count of each bin. 
+""" + +from __future__ import annotations + +from typing import TypeAlias + +from pydantic import Field + +from mdio.schema.core import CamelCaseStrictModel +from mdio.schema.metadata import VersionedMetadataConvention + + +class BaseHistogram(CamelCaseStrictModel): + """Represents a histogram with bin counts.""" + + counts: list[int] = Field(..., description="Count of each each bin.") + + +class CenteredBinHistogram(BaseHistogram): + """Class representing a center bin histogram.""" + + bin_centers: list[float | int] = Field(..., description="List of bin centers.") + + +class EdgeDefinedHistogram(BaseHistogram): + """A class representing an edge-defined histogram.""" + + bin_edges: list[float | int] = Field( + ..., description="The left edges of the histogram bins." + ) + bin_widths: list[float | int] = Field( + ..., description="The widths of the histogram bins." + ) + + +Histogram: TypeAlias = CenteredBinHistogram | EdgeDefinedHistogram + + +class SummaryStatistics(CamelCaseStrictModel): + """Data model for some statistics in MDIO v1 arrays.""" + + count: int = Field(..., description="The number of data points.") + sum: float = Field(..., description="The total of all data values.") + sum_squares: float = Field(..., description="The total of all data values squared.") + min: float = Field(..., description="The smallest value in the variable.") + max: float = Field(..., description="The largest value in the variable.") + histogram: Histogram = Field(..., description="Binned frequency distribution.") + + +class StatisticsMetadata(VersionedMetadataConvention): + """Data Model representing metadata for statistics.""" + + stats_v1: SummaryStatistics | list[SummaryStatistics] | None = Field( + default=None, + description="Minimal summary statistics.", + ) diff --git a/src/mdio/schema/v1/units.py b/src/mdio/schema/v1/units.py new file mode 100644 index 00000000..260d8fe6 --- /dev/null +++ b/src/mdio/schema/v1/units.py @@ -0,0 +1,126 @@ +"""Unit schemas specific to MDIO 
v1.""" + +from __future__ import annotations + +from typing import TypeAlias + +from pint import UnitRegistry +from pydantic import Field + +from mdio.schema.metadata import VersionedMetadataConvention +from mdio.schema.units import UnitEnum +from mdio.schema.units import create_unit_model + + +ureg = UnitRegistry() +ureg.default_format = "~C" # compact, abbreviated (symbol). + + +class LengthUnitEnum(UnitEnum): + """Enum class representing metric and imperial units of length.""" + + MILLIMETER = ureg.millimeter + CENTIMETER = ureg.centimeter + METER = ureg.meter + KILOMETER = ureg.kilometer + + INCH = ureg.inch + FOOT = ureg.foot + YARD = ureg.yard + MILE = ureg.mile + + +LengthUnitModel = create_unit_model( + LengthUnitEnum, "LengthUnitModel", "length", __name__ +) + + +class TimeUnitEnum(UnitEnum): + """Enum class representing units of time.""" + + NANOSECOND = ureg.nanosecond + MICROSECOND = ureg.microsecond + MILLISECOND = ureg.millisecond + SECOND = ureg.second + MINUTE = ureg.minute + HOUR = ureg.hour + DAY = ureg.day + + +TimeUnitModel = create_unit_model(TimeUnitEnum, "TimeUnitModel", "time", __name__) + + +class DensityUnitEnum(UnitEnum): + """Enum class representing units of density.""" + + GRAMS_PER_CC = ureg.gram / ureg.centimeter**3 + KILOGRAMS_PER_M3 = ureg.kilogram / ureg.meter**3 + POUNDS_PER_GAL = ureg.pounds / ureg.gallon + + +DensityUnitModel = create_unit_model( + DensityUnitEnum, "DensityUnitModel", "density", __name__ +) + + +class SpeedUnitEnum(UnitEnum): + """Enum class representing units of speed.""" + + METER_PER_SECOND = ureg.meter / ureg.second + FEET_PER_SECOND = ureg.feet / ureg.second + + +SpeedUnitModel = create_unit_model(SpeedUnitEnum, "SpeedUnitModel", "speed", __name__) + + +class AngleUnitEnum(UnitEnum): + """Enum class representing units of angle.""" + + DEGREES = ureg.degree + RADIANS = ureg.radian + + +AngleUnitModel = create_unit_model(AngleUnitEnum, "AngleUnitModel", "angle", __name__) + + +class FrequencyUnitEnum(UnitEnum): + """Enum 
class representing units of frequency.""" + + HERTZ = ureg.hertz + + +FrequencyUnitModel = create_unit_model( + FrequencyUnitEnum, "FrequencyUnitModel", "frequency", __name__ +) + + +class VoltageUnitEnum(UnitEnum): + """Enum class representing units of voltage.""" + + MICROVOLT = ureg.microvolt + MILLIVOLT = ureg.millivolt + VOLT = ureg.volt + + +VoltageUnitModel = create_unit_model( + VoltageUnitEnum, "VoltageUnitModel", "voltage", __name__ +) + + +# Composite model types +AllUnitModel: TypeAlias = ( + LengthUnitModel + | TimeUnitModel + | AngleUnitModel + | DensityUnitModel + | SpeedUnitModel + | FrequencyUnitModel + | VoltageUnitModel +) + + +# Versioned metadata conventions for units +class AllUnits(VersionedMetadataConvention): + """Versioned metadata holding one unit model or a list of them.""" + + units_v1: AllUnitModel | list[AllUnitModel] | None = Field(default=None) diff --git a/src/mdio/schema/v1/variable.py b/src/mdio/schema/v1/variable.py new file mode 100644 index 00000000..f0d986a1 --- /dev/null +++ b/src/mdio/schema/v1/variable.py @@ -0,0 +1,52 @@ +"""This module defines variables for MDIO v1 schema. + +`NamedArray` is a basic array unit which includes basic properties like +name, dimension, data type, compressor etc. `Coordinate` extends the +`NamedArray` class; it represents the Coordinate array in the MDIO format. +It has dimensions which are fully defined and can hold additional metadata. +`Variable` is another class that extends the `NamedArray`. It represents a +variable in MDIO format. It can have coordinates and can also hold metadata. 
+""" + +from pydantic import Field +from pydantic import create_model + +from mdio.schema.base import NamedArray +from mdio.schema.core import CamelCaseStrictModel +from mdio.schema.core import model_fields +from mdio.schema.dtype import ScalarType +from mdio.schema.metadata import ChunkGridMetadata +from mdio.schema.metadata import UserAttributes +from mdio.schema.v1.stats import StatisticsMetadata +from mdio.schema.v1.units import AllUnits + + +class Coordinate(NamedArray): + """An MDIO coordinate array with metadata.""" + + data_type: ScalarType = Field(..., description="Data type of coordinate.") + metadata: list[AllUnits | UserAttributes] | None = Field( + default=None, description="Coordinate metadata." + ) + + +VariableMetadata = create_model( + "VariableMetadata", + **model_fields(ChunkGridMetadata), + **model_fields(AllUnits), + **model_fields(StatisticsMetadata), + **model_fields(UserAttributes), + __base__=CamelCaseStrictModel, +) + + +class Variable(NamedArray): + """An MDIO variable that has coordinates and metadata.""" + + coordinates: list[Coordinate] | list[str] | None = Field( + default=None, + description="Coordinates of the MDIO variable dimensions.", + ) + metadata: VariableMetadata | None = Field( + default=None, description="Variable metadata." + ) diff --git a/uv.lock b/uv.lock index 58caa725..e191c910 100644 --- a/uv.lock +++ b/uv.lock @@ -1500,7 +1500,7 @@ wheels = [ [[package]] name = "multidimio" -version = "0.8.5" +version = "0.9.0" source = { editable = "." 
} dependencies = [ { name = "click" }, @@ -1508,6 +1508,8 @@ dependencies = [ { name = "dask" }, { name = "fsspec" }, { name = "psutil" }, + { name = "pydantic" }, + { name = "pydantic-settings" }, { name = "rich" }, { name = "segy" }, { name = "tqdm" }, @@ -1569,6 +1571,8 @@ requires-dist = [ { name = "fsspec", specifier = ">=2024.10.0" }, { name = "gcsfs", marker = "extra == 'cloud'", specifier = ">=2024.10.0" }, { name = "psutil", specifier = ">=6.1.0,<7.0.0" }, + { name = "pydantic", specifier = ">=2.8.2,<3.0.0" }, + { name = "pydantic-settings", specifier = ">=2.4.0,<3.0.0" }, { name = "rich", specifier = ">=13.9.4,<14.0.0" }, { name = "s3fs", marker = "extra == 'cloud'", specifier = "==2024.12.0" }, { name = "segy", specifier = ">=0.4.0,<0.5.0" },