From d277d01a37dcbaaf2b1690c6f206f6c7adcb2cf8 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Mon, 28 Jul 2025 09:53:10 +0200 Subject: [PATCH 01/25] add geozarr model --- eopf_geozarr/data_api/geozarr.py | 79 ++++++++++++++++++++++++++++++++ 1 file changed, 79 insertions(+) create mode 100644 eopf_geozarr/data_api/geozarr.py diff --git a/eopf_geozarr/data_api/geozarr.py b/eopf_geozarr/data_api/geozarr.py new file mode 100644 index 0000000..7d3c41b --- /dev/null +++ b/eopf_geozarr/data_api/geozarr.py @@ -0,0 +1,79 @@ +from __future__ import annotations +from typing import Any +from typing_extensions import TypedDict + +from pydantic import BaseModel, Field +from pydantic_zarr.v2 import ArraySpec, GroupSpec + +class GeoZarrDataArrayAttrs(BaseModel): + """ + Attributes for a GeoZarr DataArray. + + Attributes + ---------- + array_dimensions : tuple[str, ...] + Alias for the _ARRAY_DIMENSIONS attribute, which lists the dimension names for this array. + standard_name : str + The CF standard name of the variable. + grid_mapping : object + The grid mapping of the variable, which is a reference to a grid mapping variable that + describes the spatial reference of the variable. + grid_mapping_name : str + The name of the grid mapping, which is a string that describes the type of grid mapping + used for the variable. + """ + + # todo: validate that this names listed here are the names of zarr arrays + # unless the variable is an auxiliary variable + # see https://github.com/zarr-developers/geozarr-spec/blob/main/geozarr-spec.md#geozarr-coordinates + array_dimensions: tuple[str, ...] = Field(alias="_ARRAY_DIMENSIONS") + standard_name: str + grid_mapping: object + grid_mapping_name: str + + +class GeoZarrDataArray(ArraySpec[GeoZarrDataArrayAttrs]): + """ + A GeoZarr DataArray variable. 
+ + + References + ---------- + https://github.com/zarr-developers/geozarr-spec/blob/main/geozarr-spec.md#geozarr-dataarray + """ + + +def check_valid_coordinates(model: GroupSpec[Any, Any]) -> GroupSpec[Any, Any]: + """ + Check if the coordinates of a GeoZarr DataArray are valid. + + Parameters + ---------- + model : GroupSpec[Any, Any] + The GeoZarr DataArray model to check. + + Returns + ------- + GroupSpec[Any, Any] + The validated GeoZarr DataArray model. + """ + if model.members is None: + raise ValueError("Model members cannot be None") + + arrays: dict[str, GeoZarrDataArray] = {k: v for k, v in model.members.items() if isinstance(v, GeoZarrDataArray)} + for key, array in arrays.items(): + for idx, dim in enumerate(array.attributes.array_dimensions): + if dim not in model.members: + raise ValueError(f"Dimension '{dim}' for array '{key}' is not defined in the model members.") + member = model.members[dim] + if isinstance(member, GroupSpec): + raise ValueError(f"Dimension '{dim}' for array '{key}' should be a group. Found an array instead.") + if member.shape[0] != array.shape[idx]: + raise ValueError(f"Dimension '{dim}' for array '{key}' has a shape mismatch: " + f"{member.shape[0]} != {array.shape[idx]}.") + return model + + +class GeoZarrDataset(GroupSpec[Any, GroupSpec[Any, Any] | GeoZarrDataArray]): + ... 
+ From c940bd8842e40e502fbb596bc0db84d58f1b3d53 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Tue, 29 Jul 2025 17:21:47 +0200 Subject: [PATCH 02/25] initial working pydantic models for geozarr --- eopf_geozarr/data_api/geozarr.py | 79 -------- eopf_geozarr/data_api/geozarr/__init__.py | 0 eopf_geozarr/data_api/geozarr/common.py | 79 ++++++++ eopf_geozarr/data_api/geozarr/v2.py | 177 ++++++++++++++++++ eopf_geozarr/data_api/geozarr/v3.py | 0 eopf_geozarr/tests/test_data_api/__init__.py | 0 .../tests/test_data_api/test_geozarr.py | 141 ++++++++++++++ 7 files changed, 397 insertions(+), 79 deletions(-) delete mode 100644 eopf_geozarr/data_api/geozarr.py create mode 100644 eopf_geozarr/data_api/geozarr/__init__.py create mode 100644 eopf_geozarr/data_api/geozarr/common.py create mode 100644 eopf_geozarr/data_api/geozarr/v2.py create mode 100644 eopf_geozarr/data_api/geozarr/v3.py create mode 100644 eopf_geozarr/tests/test_data_api/__init__.py create mode 100644 eopf_geozarr/tests/test_data_api/test_geozarr.py diff --git a/eopf_geozarr/data_api/geozarr.py b/eopf_geozarr/data_api/geozarr.py deleted file mode 100644 index 7d3c41b..0000000 --- a/eopf_geozarr/data_api/geozarr.py +++ /dev/null @@ -1,79 +0,0 @@ -from __future__ import annotations -from typing import Any -from typing_extensions import TypedDict - -from pydantic import BaseModel, Field -from pydantic_zarr.v2 import ArraySpec, GroupSpec - -class GeoZarrDataArrayAttrs(BaseModel): - """ - Attributes for a GeoZarr DataArray. - - Attributes - ---------- - array_dimensions : tuple[str, ...] - Alias for the _ARRAY_DIMENSIONS attribute, which lists the dimension names for this array. - standard_name : str - The CF standard name of the variable. - grid_mapping : object - The grid mapping of the variable, which is a reference to a grid mapping variable that - describes the spatial reference of the variable. 
- grid_mapping_name : str - The name of the grid mapping, which is a string that describes the type of grid mapping - used for the variable. - """ - - # todo: validate that this names listed here are the names of zarr arrays - # unless the variable is an auxiliary variable - # see https://github.com/zarr-developers/geozarr-spec/blob/main/geozarr-spec.md#geozarr-coordinates - array_dimensions: tuple[str, ...] = Field(alias="_ARRAY_DIMENSIONS") - standard_name: str - grid_mapping: object - grid_mapping_name: str - - -class GeoZarrDataArray(ArraySpec[GeoZarrDataArrayAttrs]): - """ - A GeoZarr DataArray variable. - - - References - ---------- - https://github.com/zarr-developers/geozarr-spec/blob/main/geozarr-spec.md#geozarr-dataarray - """ - - -def check_valid_coordinates(model: GroupSpec[Any, Any]) -> GroupSpec[Any, Any]: - """ - Check if the coordinates of a GeoZarr DataArray are valid. - - Parameters - ---------- - model : GroupSpec[Any, Any] - The GeoZarr DataArray model to check. - - Returns - ------- - GroupSpec[Any, Any] - The validated GeoZarr DataArray model. - """ - if model.members is None: - raise ValueError("Model members cannot be None") - - arrays: dict[str, GeoZarrDataArray] = {k: v for k, v in model.members.items() if isinstance(v, GeoZarrDataArray)} - for key, array in arrays.items(): - for idx, dim in enumerate(array.attributes.array_dimensions): - if dim not in model.members: - raise ValueError(f"Dimension '{dim}' for array '{key}' is not defined in the model members.") - member = model.members[dim] - if isinstance(member, GroupSpec): - raise ValueError(f"Dimension '{dim}' for array '{key}' should be a group. Found an array instead.") - if member.shape[0] != array.shape[idx]: - raise ValueError(f"Dimension '{dim}' for array '{key}' has a shape mismatch: " - f"{member.shape[0]} != {array.shape[idx]}.") - return model - - -class GeoZarrDataset(GroupSpec[Any, GroupSpec[Any, Any] | GeoZarrDataArray]): - ... 
- diff --git a/eopf_geozarr/data_api/geozarr/__init__.py b/eopf_geozarr/data_api/geozarr/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/eopf_geozarr/data_api/geozarr/common.py b/eopf_geozarr/data_api/geozarr/common.py new file mode 100644 index 0000000..5e52ae4 --- /dev/null +++ b/eopf_geozarr/data_api/geozarr/common.py @@ -0,0 +1,79 @@ +from cf_xarray.utils import parse_cf_standard_name_table + + +import io +import urllib +import urllib.request + +from pydantic import BaseModel + + +def get_cf_standard_names(url: str) -> tuple[str, ...]: + """ + Retrieve the set of CF standard names and return them as a tuple. + """ + + headers = {"User-Agent": 'eopf_geozarr'} + + req = urllib.request.Request(url, headers=headers) + + try: + with urllib.request.urlopen(req) as response: + content = response.read() # Read the entire response body into memory + content_fobj = io.BytesIO(content) + except urllib.error.URLError as e: + raise e + + _info, table, _aliases = parse_cf_standard_name_table(source=content_fobj) + return tuple(table.keys()) + + +# This is a URL to the CF standard names table. +CF_STANDARD_NAME_URL = ("https://raw.githubusercontent.com/cf-convention/cf-convention.github.io/" + "master/Data/cf-standard-names/current/src/cf-standard-name-table.xml") + + +# this does IO against github. consider locally storing this data instead if fetching every time +# is problematic. +CF_STANDARD_NAMES = get_cf_standard_names(url=CF_STANDARD_NAME_URL) + + +def check_standard_name(name: str) -> str: + """ + Check if the standard name is valid according to the CF conventions. + + Parameters + ---------- + name : str + The standard name to check. + + Returns + ------- + str + The validated standard name. + + Raises + ------ + ValueError + If the standard name is not valid. + """ + + if name in CF_STANDARD_NAMES: + return name + raise ValueError(f"Invalid standard name: {name}. 
This name was not found in the list of CF standard names.") + + +class MultiscaleAttrs(BaseModel): + """ + Attributes for a GeoZarr multiscale dataset. + + Attributes + ---------- + tile_matrix_set : str + The tile matrix set identifier for the multiscale dataset. + resampling_method : str + The resampling method for the multiscale dataset. + """ + + tile_matrix_set: str + resampling_method: str \ No newline at end of file diff --git a/eopf_geozarr/data_api/geozarr/v2.py b/eopf_geozarr/data_api/geozarr/v2.py new file mode 100644 index 0000000..d4e061c --- /dev/null +++ b/eopf_geozarr/data_api/geozarr/v2.py @@ -0,0 +1,177 @@ +from __future__ import annotations +from typing import Annotated, Any, Literal, Self + +from pydantic import BaseModel, AfterValidator, Field, model_serializer, model_validator +from pydantic_zarr.v2 import ArraySpec, GroupSpec, from_flat_group, AnyArraySpec, AnyGroupSpec, TAttr, TItem + +from eopf_geozarr.data_api.geozarr.common import MultiscaleAttrs, check_standard_name + +CFStandardName = Annotated[str, AfterValidator(check_standard_name)] + +class MyGroupSpec(GroupSpec[TAttr, TItem]): + @classmethod + def from_flat(cls, data: dict[str, AnyArraySpec | AnyGroupSpec], *, by_alias: bool = False) -> Self: + """ + Create a `GroupSpec` from a flat hierarchy representation. The flattened hierarchy is a + `dict` with the following constraints: keys must be valid paths; values must + be `ArraySpec` or `GroupSpec` instances. + + Parameters + ---------- + data : Dict[str, ArraySpec | GroupSpec] + A flattened representation of a Zarr hierarchy. + + Returns + ------- + GroupSpec + A `GroupSpec` representation of the hierarchy. 
+ + Examples + -------- + >>> from pydantic_zarr.v2 import GroupSpec, ArraySpec + >>> import numpy as np + >>> flat = {'': GroupSpec(attributes={'foo': 10}, members=None)} + >>> GroupSpec.from_flat(flat) + GroupSpec(zarr_format=2, attributes={'foo': 10}, members={}) + >>> flat = { + '': GroupSpec(attributes={'foo': 10}, members=None), + '/a': ArraySpec.from_array(np.arange(10))} + >>> GroupSpec.from_flat(flat) + GroupSpec(zarr_format=2, attributes={'foo': 10}, members={'a': ArraySpec(zarr_format=2, attributes={}, shape=(10,), chunks=(10,), dtype=' Dataset: + """ + Check if the coordinates of the DataArrays listed in a GeoZarr DataSet are valid. + + For each DataArray in the model, we check the dimensions associated with the DataArray. + For each dimension associated with a data variable, an array with the name of that data variable + must be present in the members of the group. + + Parameters + ---------- + model : GroupSpec[Any, Any] + The GeoZarr DataArray model to check. + + Returns + ------- + GroupSpec[Any, Any] + The validated GeoZarr DataArray model. + """ + if model.members is None: + raise ValueError("Model members cannot be None") + + arrays: dict[str, DataArray] = {k: v for k, v in model.members.items() if isinstance(v, DataArray)} + for key, array in arrays.items(): + for idx, dim in enumerate(array.attributes.array_dimensions): + if dim not in model.members: + raise ValueError(f"Dimension '{dim}' for array '{key}' is not defined in the model members.") + member = model.members[dim] + if isinstance(member, GroupSpec): + raise ValueError(f"Dimension '{dim}' for array '{key}' should be a group. Found an array instead.") + if member.shape[0] != array.shape[idx]: + raise ValueError(f"Dimension '{dim}' for array '{key}' has a shape mismatch: " + f"{member.shape[0]} != {array.shape[idx]}.") + return model + + +class DatasetAttrs(BaseModel): + """ + Attributes for a GeoZarr dataset. 
+ + Attributes + ---------- + multiscales: MultiscaleAttrs + """ + multiscales: MultiscaleAttrs + +class Dataset(GroupSpec[DatasetAttrs, GroupSpec[Any, Any] | DataArray]): + @model_validator(mode="after") + def check_valid_coordinates(self) -> Self: + """ + Validate the coordinates of the GeoZarr DataSet. + + This method checks that all DataArrays in the dataset have valid coordinates + according to the GeoZarr specification. + + Returns + ------- + GroupSpec[Any, Any] + The validated GeoZarr DataSet. + """ + return check_valid_coordinates(self) diff --git a/eopf_geozarr/data_api/geozarr/v3.py b/eopf_geozarr/data_api/geozarr/v3.py new file mode 100644 index 0000000..e69de29 diff --git a/eopf_geozarr/tests/test_data_api/__init__.py b/eopf_geozarr/tests/test_data_api/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/eopf_geozarr/tests/test_data_api/test_geozarr.py b/eopf_geozarr/tests/test_data_api/test_geozarr.py new file mode 100644 index 0000000..08ee876 --- /dev/null +++ b/eopf_geozarr/tests/test_data_api/test_geozarr.py @@ -0,0 +1,141 @@ + +from __future__ import annotations +import re +from eopf_geozarr.data_api.geozarr.v2 import CoordArray, CoordArrayAttrs, MyGroupSpec, check_valid_coordinates +from eopf_geozarr.data_api.geozarr.v2 import ( + DataArray, + DataArrayAttrs) +import pytest +from typing import Any +from eopf_geozarr.data_api.geozarr.common import CF_STANDARD_NAME_URL, check_standard_name, get_cf_standard_names +from pydantic_zarr.v2 import GroupSpec, ArraySpec + +def test_get_cf_standard_names() -> None: + """ + Test the get_cf_standard_names function to ensure it retrieves the CF standard names correctly. 
+ """ + standard_names = get_cf_standard_names(CF_STANDARD_NAME_URL) + assert isinstance(standard_names, tuple) + assert len(standard_names) > 0 + assert all(isinstance(name, str) for name in standard_names) + +@pytest.mark.parametrize("name", ["air_temperature", "sea_surface_temperature", "precipitation_flux"]) +def test_check_standard_name_valid(name: str) -> None: + """ + Test the check_standard_name function with valid standard names. + """ + assert check_standard_name(name) == name + +def test_check_standard_name_invalid() -> None: + """ + Test the check_standard_name function with an invalid standard name. + """ + with pytest.raises(ValueError): + check_standard_name("invalid_standard_name") + +def test_coord_array_attrs_dimensions_length() -> None: + """ + Test that the array_dimensions attribute must have length 1. + """ + msg = ('1 validation error for CoordArrayAttrs\n_ARRAY_DIMENSIONS\n ' + ' Tuple should have at most 1 item after validation, not 2') + with pytest.raises(ValueError, match=re.escape(msg)): + CoordArrayAttrs( + _ARRAY_DIMENSIONS=("time", "lat"), + standard_name="air_temperature", + units="mm", + axis="Y", + ) + +def test_coord_array_dimensionality() -> None: + """ + Test that only 1-dimensional arrays are allowed. 
+ """ + msg = ('1 validation error for CoordArray\nshape\n ' + 'Tuple should have at most 1 item after validation, not 2') + with pytest.raises(ValueError, match=re.escape(msg)): + CoordArray( + shape=(10, 11), + dtype='|u1', + chunks=(10, 11), + attributes=CoordArrayAttrs( + _ARRAY_DIMENSIONS=("time",), + standard_name="air_temperature", + units='s', + axis='Y', + ) + ) + +class TestCheckValidCoordinates: + @pytest.mark.parametrize("example", [ + { + "": GroupSpec(attributes={}, members=None), + "/data_var": DataArray( + shape=(10, 11), + dtype='|u1', + chunks=(10, 11), + attributes=DataArrayAttrs( + array_dimensions=["time", "lat"], + standard_name="air_temperature", + grid_mapping=None, + grid_mapping_name="latitude_longitude" + )), + "/time": CoordArray(shape=(10,), dtype='|u1', chunks=(10,), attributes=CoordArrayAttrs( + array_dimensions=["time"], + standard_name="time", + units="s", + axis="T")), + "/lat": CoordArray( + shape=(11,), + dtype='|u1', + chunks=(11,), + attributes=CoordArrayAttrs( + array_dimensions=["lat"], + standard_name="latitude", + units='m', + axis='Y' + )) + }, + ]) + @staticmethod + def test_valid(example: dict[str, ArraySpec[Any] | GroupSpec[Any, Any]]) -> None: + """ + Test the check_valid_coordinates function to ensure it validates coordinates correctly. 
+ """ + group = MyGroupSpec.from_flat(example, by_alias=True) + assert check_valid_coordinates(group) == group + + @pytest.mark.parametrize("example", [ + { + "": GroupSpec(attributes={}, members=None), + "/data_var": DataArray( + shape=(9, 10), + dtype='|u1', + chunks=(10, 11), + attributes=DataArrayAttrs( + _ARRAY_DIMENSIONS=["time", "lat"], + standard_name="air_temperature", + grid_mapping=None, + grid_mapping_name="latitude_longitude"), + ), + "/time": ArraySpec(shape=(10,), dtype='|u1', chunks=(10,), attributes=CoordArrayAttrs( + _ARRAY_DIMENSIONS=["time"], + standard_name="time", + units="s", + axis="T" + )), + "/lat": ArraySpec(shape=(11,), dtype='|u1', chunks=(11,), attributes=CoordArrayAttrs( + _ARRAY_DIMENSIONS=["lat"], + standard_name="latitude", + units='m', + axis='Y')) + }, + ]) + @staticmethod + def test_invalid(example: dict[str, ArraySpec[Any] | GroupSpec[Any, Any]]) -> None: + """ + Test the check_valid_coordinates function to ensure it validates coordinates correctly. 
+ """ + group = MyGroupSpec[Any, DataArray | CoordArray].from_flat(example, by_alias=True) + with pytest.raises(ValueError): + check_valid_coordinates(group) From 553c7c7e7d93b8845e06b439a17839ae90eea1f5 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Tue, 29 Jul 2025 17:27:19 +0200 Subject: [PATCH 03/25] initial working pydantic models for geozarr --- eopf_geozarr/data_api/geozarr/common.py | 27 +-- eopf_geozarr/data_api/geozarr/v2.py | 63 ++++-- .../tests/test_data_api/test_geozarr.py | 195 +++++++++++------- eopf_geozarr/tests/test_fs_utils.py | 10 +- 4 files changed, 184 insertions(+), 111 deletions(-) diff --git a/eopf_geozarr/data_api/geozarr/common.py b/eopf_geozarr/data_api/geozarr/common.py index 5e52ae4..626ebf8 100644 --- a/eopf_geozarr/data_api/geozarr/common.py +++ b/eopf_geozarr/data_api/geozarr/common.py @@ -1,25 +1,22 @@ -from cf_xarray.utils import parse_cf_standard_name_table - - +"""Common utilities for GeoZarr data API.""" import io import urllib import urllib.request +from cf_xarray.utils import parse_cf_standard_name_table from pydantic import BaseModel def get_cf_standard_names(url: str) -> tuple[str, ...]: - """ - Retrieve the set of CF standard names and return them as a tuple. - """ + """Retrieve the set of CF standard names and return them as a tuple.""" - headers = {"User-Agent": 'eopf_geozarr'} + headers = {"User-Agent": "eopf_geozarr"} req = urllib.request.Request(url, headers=headers) try: with urllib.request.urlopen(req) as response: - content = response.read() # Read the entire response body into memory + content = response.read() # Read the entire response body into memory content_fobj = io.BytesIO(content) except urllib.error.URLError as e: raise e @@ -29,11 +26,13 @@ def get_cf_standard_names(url: str) -> tuple[str, ...]: # This is a URL to the CF standard names table. 
-CF_STANDARD_NAME_URL = ("https://raw.githubusercontent.com/cf-convention/cf-convention.github.io/" - "master/Data/cf-standard-names/current/src/cf-standard-name-table.xml") +CF_STANDARD_NAME_URL = ( + "https://raw.githubusercontent.com/cf-convention/cf-convention.github.io/" + "master/Data/cf-standard-names/current/src/cf-standard-name-table.xml" +) -# this does IO against github. consider locally storing this data instead if fetching every time +# this does IO against github. consider locally storing this data instead if fetching every time # is problematic. CF_STANDARD_NAMES = get_cf_standard_names(url=CF_STANDARD_NAME_URL) @@ -60,7 +59,9 @@ def check_standard_name(name: str) -> str: if name in CF_STANDARD_NAMES: return name - raise ValueError(f"Invalid standard name: {name}. This name was not found in the list of CF standard names.") + raise ValueError( + f"Invalid standard name: {name}. This name was not found in the list of CF standard names." + ) class MultiscaleAttrs(BaseModel): @@ -76,4 +77,4 @@ class MultiscaleAttrs(BaseModel): """ tile_matrix_set: str - resampling_method: str \ No newline at end of file + resampling_method: str diff --git a/eopf_geozarr/data_api/geozarr/v2.py b/eopf_geozarr/data_api/geozarr/v2.py index d4e061c..72408a9 100644 --- a/eopf_geozarr/data_api/geozarr/v2.py +++ b/eopf_geozarr/data_api/geozarr/v2.py @@ -1,16 +1,35 @@ +"""GeoZarr data API for Zarr V2.""" from __future__ import annotations + from typing import Annotated, Any, Literal, Self -from pydantic import BaseModel, AfterValidator, Field, model_serializer, model_validator -from pydantic_zarr.v2 import ArraySpec, GroupSpec, from_flat_group, AnyArraySpec, AnyGroupSpec, TAttr, TItem +from pydantic import AfterValidator, BaseModel, Field, model_serializer, model_validator +from pydantic_zarr.v2 import ( + AnyArraySpec, + AnyGroupSpec, + ArraySpec, + GroupSpec, + TAttr, + TItem, + from_flat_group, +) from eopf_geozarr.data_api.geozarr.common import MultiscaleAttrs, 
check_standard_name CFStandardName = Annotated[str, AfterValidator(check_standard_name)] + class MyGroupSpec(GroupSpec[TAttr, TItem]): + """ + A custom GroupSpec + + We override the from_flat method to ensure that we can pass by_alias=True + when creating a GroupSpec from a flat hierarchy. + """ @classmethod - def from_flat(cls, data: dict[str, AnyArraySpec | AnyGroupSpec], *, by_alias: bool = False) -> Self: + def from_flat( + cls, data: dict[str, AnyArraySpec | AnyGroupSpec], *, by_alias: bool = False + ) -> Self: """ Create a `GroupSpec` from a flat hierarchy representation. The flattened hierarchy is a `dict` with the following constraints: keys must be valid paths; values must @@ -42,6 +61,7 @@ def from_flat(cls, data: dict[str, AnyArraySpec | AnyGroupSpec], *, by_alias: bo from_flated = from_flat_group(data) return cls(**from_flated.model_dump(by_alias=by_alias)) + class CoordArrayAttrs(BaseModel, populate_by_name=True): """ Attributes for a GeoZarr coordinate array. @@ -59,7 +79,7 @@ class CoordArrayAttrs(BaseModel, populate_by_name=True): The name of the grid mapping, which is a string that describes the type of grid mapping used for the variable. """ - + array_dimensions: tuple[str] = Field(alias="_ARRAY_DIMENSIONS") standard_name: CFStandardName long_name: str | None = None @@ -69,9 +89,11 @@ class CoordArrayAttrs(BaseModel, populate_by_name=True): class CoordArray(ArraySpec[CoordArrayAttrs]): """ - A GeoZarr coordinate array variable. It must be 1-dimensional and have a single element in - its array_dimensions attribute. + A GeoZarr coordinate array variable. + + It must be 1-dimensional and have a single element in its array_dimensions attribute. """ + shape: tuple[int] @@ -102,12 +124,9 @@ class DataArrayAttrs(BaseModel, populate_by_name=True): grid_mapping_name: str - - class DataArray(ArraySpec[DataArrayAttrs]): """ A GeoZarr DataArray variable. 
- References ---------- @@ -136,17 +155,25 @@ def check_valid_coordinates(model: GroupSpec[Any, Any]) -> Dataset: if model.members is None: raise ValueError("Model members cannot be None") - arrays: dict[str, DataArray] = {k: v for k, v in model.members.items() if isinstance(v, DataArray)} + arrays: dict[str, DataArray] = { + k: v for k, v in model.members.items() if isinstance(v, DataArray) + } for key, array in arrays.items(): for idx, dim in enumerate(array.attributes.array_dimensions): if dim not in model.members: - raise ValueError(f"Dimension '{dim}' for array '{key}' is not defined in the model members.") + raise ValueError( + f"Dimension '{dim}' for array '{key}' is not defined in the model members." + ) member = model.members[dim] if isinstance(member, GroupSpec): - raise ValueError(f"Dimension '{dim}' for array '{key}' should be a group. Found an array instead.") + raise ValueError( + f"Dimension '{dim}' for array '{key}' should be a group. Found an array instead." + ) if member.shape[0] != array.shape[idx]: - raise ValueError(f"Dimension '{dim}' for array '{key}' has a shape mismatch: " - f"{member.shape[0]} != {array.shape[idx]}.") + raise ValueError( + f"Dimension '{dim}' for array '{key}' has a shape mismatch: " + f"{member.shape[0]} != {array.shape[idx]}." + ) return model @@ -158,9 +185,15 @@ class DatasetAttrs(BaseModel): ---------- multiscales: MultiscaleAttrs """ + multiscales: MultiscaleAttrs + class Dataset(GroupSpec[DatasetAttrs, GroupSpec[Any, Any] | DataArray]): + """ + A GeoZarr Dataset. + """ + @model_validator(mode="after") def check_valid_coordinates(self) -> Self: """ @@ -174,4 +207,4 @@ def check_valid_coordinates(self) -> Self: GroupSpec[Any, Any] The validated GeoZarr DataSet. 
""" - return check_valid_coordinates(self) + return check_valid_coordinates(self) diff --git a/eopf_geozarr/tests/test_data_api/test_geozarr.py b/eopf_geozarr/tests/test_data_api/test_geozarr.py index 08ee876..266f2b6 100644 --- a/eopf_geozarr/tests/test_data_api/test_geozarr.py +++ b/eopf_geozarr/tests/test_data_api/test_geozarr.py @@ -1,14 +1,25 @@ - from __future__ import annotations + import re -from eopf_geozarr.data_api.geozarr.v2 import CoordArray, CoordArrayAttrs, MyGroupSpec, check_valid_coordinates -from eopf_geozarr.data_api.geozarr.v2 import ( - DataArray, - DataArrayAttrs) -import pytest from typing import Any -from eopf_geozarr.data_api.geozarr.common import CF_STANDARD_NAME_URL, check_standard_name, get_cf_standard_names -from pydantic_zarr.v2 import GroupSpec, ArraySpec + +import pytest +from pydantic_zarr.v2 import ArraySpec, GroupSpec + +from eopf_geozarr.data_api.geozarr.common import ( + CF_STANDARD_NAME_URL, + check_standard_name, + get_cf_standard_names, +) +from eopf_geozarr.data_api.geozarr.v2 import ( + CoordArray, + CoordArrayAttrs, + DataArray, + DataArrayAttrs, + MyGroupSpec, + check_valid_coordinates, +) + def test_get_cf_standard_names() -> None: """ @@ -19,13 +30,17 @@ def test_get_cf_standard_names() -> None: assert len(standard_names) > 0 assert all(isinstance(name, str) for name in standard_names) -@pytest.mark.parametrize("name", ["air_temperature", "sea_surface_temperature", "precipitation_flux"]) + +@pytest.mark.parametrize( + "name", ["air_temperature", "sea_surface_temperature", "precipitation_flux"] +) def test_check_standard_name_valid(name: str) -> None: """ Test the check_standard_name function with valid standard names. """ assert check_standard_name(name) == name + def test_check_standard_name_invalid() -> None: """ Test the check_standard_name function with an invalid standard name. 
@@ -33,70 +48,82 @@ def test_check_standard_name_invalid() -> None: with pytest.raises(ValueError): check_standard_name("invalid_standard_name") + def test_coord_array_attrs_dimensions_length() -> None: """ Test that the array_dimensions attribute must have length 1. """ - msg = ('1 validation error for CoordArrayAttrs\n_ARRAY_DIMENSIONS\n ' - ' Tuple should have at most 1 item after validation, not 2') + msg = ( + "1 validation error for CoordArrayAttrs\n_ARRAY_DIMENSIONS\n " + " Tuple should have at most 1 item after validation, not 2" + ) with pytest.raises(ValueError, match=re.escape(msg)): CoordArrayAttrs( - _ARRAY_DIMENSIONS=("time", "lat"), - standard_name="air_temperature", - units="mm", - axis="Y", - ) + _ARRAY_DIMENSIONS=("time", "lat"), + standard_name="air_temperature", + units="mm", + axis="Y", + ) + def test_coord_array_dimensionality() -> None: """ Test that only 1-dimensional arrays are allowed. """ - msg = ('1 validation error for CoordArray\nshape\n ' - 'Tuple should have at most 1 item after validation, not 2') + msg = ( + "1 validation error for CoordArray\nshape\n " + "Tuple should have at most 1 item after validation, not 2" + ) with pytest.raises(ValueError, match=re.escape(msg)): CoordArray( shape=(10, 11), - dtype='|u1', + dtype="|u1", chunks=(10, 11), attributes=CoordArrayAttrs( _ARRAY_DIMENSIONS=("time",), standard_name="air_temperature", - units='s', - axis='Y', - ) + units="s", + axis="Y", + ), ) + class TestCheckValidCoordinates: - @pytest.mark.parametrize("example", [ - { - "": GroupSpec(attributes={}, members=None), - "/data_var": DataArray( - shape=(10, 11), - dtype='|u1', - chunks=(10, 11), - attributes=DataArrayAttrs( - array_dimensions=["time", "lat"], - standard_name="air_temperature", - grid_mapping=None, - grid_mapping_name="latitude_longitude" - )), - "/time": CoordArray(shape=(10,), dtype='|u1', chunks=(10,), attributes=CoordArrayAttrs( - array_dimensions=["time"], - standard_name="time", - units="s", - axis="T")), - "/lat": 
CoordArray( - shape=(11,), - dtype='|u1', - chunks=(11,), - attributes=CoordArrayAttrs( - array_dimensions=["lat"], - standard_name="latitude", - units='m', - axis='Y' - )) - }, - ]) + @pytest.mark.parametrize( + "example", + [ + { + "": GroupSpec(attributes={}, members=None), + "/data_var": DataArray( + shape=(10, 11), + dtype="|u1", + chunks=(10, 11), + attributes=DataArrayAttrs( + array_dimensions=["time", "lat"], + standard_name="air_temperature", + grid_mapping=None, + grid_mapping_name="latitude_longitude", + ), + ), + "/time": CoordArray( + shape=(10,), + dtype="|u1", + chunks=(10,), + attributes=CoordArrayAttrs( + array_dimensions=["time"], standard_name="time", units="s", axis="T" + ), + ), + "/lat": CoordArray( + shape=(11,), + dtype="|u1", + chunks=(11,), + attributes=CoordArrayAttrs( + array_dimensions=["lat"], standard_name="latitude", units="m", axis="Y" + ), + ), + }, + ], + ) @staticmethod def test_valid(example: dict[str, ArraySpec[Any] | GroupSpec[Any, Any]]) -> None: """ @@ -105,36 +132,48 @@ def test_valid(example: dict[str, ArraySpec[Any] | GroupSpec[Any, Any]]) -> None group = MyGroupSpec.from_flat(example, by_alias=True) assert check_valid_coordinates(group) == group - @pytest.mark.parametrize("example", [ - { - "": GroupSpec(attributes={}, members=None), - "/data_var": DataArray( - shape=(9, 10), - dtype='|u1', - chunks=(10, 11), - attributes=DataArrayAttrs( - _ARRAY_DIMENSIONS=["time", "lat"], - standard_name="air_temperature", - grid_mapping=None, - grid_mapping_name="latitude_longitude"), - ), - "/time": ArraySpec(shape=(10,), dtype='|u1', chunks=(10,), attributes=CoordArrayAttrs( - _ARRAY_DIMENSIONS=["time"], - standard_name="time", - units="s", - axis="T" - )), - "/lat": ArraySpec(shape=(11,), dtype='|u1', chunks=(11,), attributes=CoordArrayAttrs( - _ARRAY_DIMENSIONS=["lat"], - standard_name="latitude", - units='m', - axis='Y')) - }, - ]) + @pytest.mark.parametrize( + "example", + [ + { + "": GroupSpec(attributes={}, members=None), + 
"/data_var": DataArray( + shape=(9, 10), + dtype="|u1", + chunks=(10, 11), + attributes=DataArrayAttrs( + _ARRAY_DIMENSIONS=["time", "lat"], + standard_name="air_temperature", + grid_mapping=None, + grid_mapping_name="latitude_longitude", + ), + ), + "/time": ArraySpec( + shape=(10,), + dtype="|u1", + chunks=(10,), + attributes=CoordArrayAttrs( + _ARRAY_DIMENSIONS=["time"], standard_name="time", units="s", axis="T" + ), + ), + "/lat": ArraySpec( + shape=(11,), + dtype="|u1", + chunks=(11,), + attributes=CoordArrayAttrs( + _ARRAY_DIMENSIONS=["lat"], standard_name="latitude", units="m", axis="Y" + ), + ), + }, + ], + ) @staticmethod def test_invalid(example: dict[str, ArraySpec[Any] | GroupSpec[Any, Any]]) -> None: """ Test the check_valid_coordinates function to ensure it validates coordinates correctly. + + This test checks that the function raises a ValueError when the dimensions of the data variable + do not match the dimensions of the coordinate arrays. """ group = MyGroupSpec[Any, DataArray | CoordArray].from_flat(example, by_alias=True) with pytest.raises(ValueError): diff --git a/eopf_geozarr/tests/test_fs_utils.py b/eopf_geozarr/tests/test_fs_utils.py index 2c232fa..9f34430 100644 --- a/eopf_geozarr/tests/test_fs_utils.py +++ b/eopf_geozarr/tests/test_fs_utils.py @@ -13,9 +13,9 @@ normalize_path, parse_s3_path, path_exists, + read_json_metadata, validate_s3_access, write_json_metadata, - read_json_metadata, ) @@ -167,8 +167,8 @@ def test_path_exists(mock_get_filesystem): @patch("eopf_geozarr.conversion.fs_utils.get_filesystem") def test_write_json_metadata(mock_get_filesystem): """Test unified JSON metadata writing.""" - from unittest.mock import mock_open, MagicMock - + from unittest.mock import MagicMock, mock_open + mock_fs = Mock() # Create a proper context manager mock mock_file = mock_open() @@ -190,8 +190,8 @@ def test_write_json_metadata(mock_get_filesystem): @patch("eopf_geozarr.conversion.fs_utils.get_filesystem") def 
test_read_json_metadata(mock_get_filesystem): """Test unified JSON metadata reading.""" - from unittest.mock import mock_open, MagicMock - + from unittest.mock import MagicMock, mock_open + mock_fs = Mock() # Create a proper context manager mock mock_file = mock_open(read_data='{"key": "value", "number": 42}') From 6a88bcbbdc2f7fbdb6e8dbe0f7da1551670cc33a Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Fri, 8 Aug 2025 13:09:07 +0200 Subject: [PATCH 04/25] bump pydantic min version and define serialization by alias per class --- eopf_geozarr/data_api/geozarr/v2.py | 51 +++---------------- .../tests/test_data_api/test_geozarr.py | 5 +- pyproject.toml | 6 +-- 3 files changed, 11 insertions(+), 51 deletions(-) diff --git a/eopf_geozarr/data_api/geozarr/v2.py b/eopf_geozarr/data_api/geozarr/v2.py index 72408a9..547a248 100644 --- a/eopf_geozarr/data_api/geozarr/v2.py +++ b/eopf_geozarr/data_api/geozarr/v2.py @@ -3,7 +3,7 @@ from typing import Annotated, Any, Literal, Self -from pydantic import AfterValidator, BaseModel, Field, model_serializer, model_validator +from pydantic import AfterValidator, BaseModel, ConfigDict, Field, model_serializer, model_validator from pydantic_zarr.v2 import ( AnyArraySpec, AnyGroupSpec, @@ -19,50 +19,7 @@ CFStandardName = Annotated[str, AfterValidator(check_standard_name)] -class MyGroupSpec(GroupSpec[TAttr, TItem]): - """ - A custom GroupSpec - - We override the from_flat method to ensure that we can pass by_alias=True - when creating a GroupSpec from a flat hierarchy. - """ - @classmethod - def from_flat( - cls, data: dict[str, AnyArraySpec | AnyGroupSpec], *, by_alias: bool = False - ) -> Self: - """ - Create a `GroupSpec` from a flat hierarchy representation. The flattened hierarchy is a - `dict` with the following constraints: keys must be valid paths; values must - be `ArraySpec` or `GroupSpec` instances. - - Parameters - ---------- - data : Dict[str, ArraySpec | GroupSpec] - A flattened representation of a Zarr hierarchy. 
- - Returns - ------- - GroupSpec - A `GroupSpec` representation of the hierarchy. - - Examples - -------- - >>> from pydantic_zarr.v2 import GroupSpec, ArraySpec - >>> import numpy as np - >>> flat = {'': GroupSpec(attributes={'foo': 10}, members=None)} - >>> GroupSpec.from_flat(flat) - GroupSpec(zarr_format=2, attributes={'foo': 10}, members={}) - >>> flat = { - '': GroupSpec(attributes={'foo': 10}, members=None), - '/a': ArraySpec.from_array(np.arange(10))} - >>> GroupSpec.from_flat(flat) - GroupSpec(zarr_format=2, attributes={'foo': 10}, members={'a': ArraySpec(zarr_format=2, attributes={}, shape=(10,), chunks=(10,), dtype=' None """ Test the check_valid_coordinates function to ensure it validates coordinates correctly. """ - group = MyGroupSpec.from_flat(example, by_alias=True) + group = GroupSpec.from_flat(example) assert check_valid_coordinates(group) == group @pytest.mark.parametrize( @@ -175,6 +174,6 @@ def test_invalid(example: dict[str, ArraySpec[Any] | GroupSpec[Any, Any]]) -> No This test checks that the function raises a ValueError when the dimensions of the data variable do not match the dimensions of the coordinate arrays. 
""" - group = MyGroupSpec[Any, DataArray | CoordArray].from_flat(example, by_alias=True) + group = GroupSpec[Any, DataArray | CoordArray].from_flat(example) with pytest.raises(ValueError): check_valid_coordinates(group) diff --git a/pyproject.toml b/pyproject.toml index 9acd1b7..582a66c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -29,9 +29,9 @@ classifiers = [ ] requires-python = ">=3.11" dependencies = [ - "pydantic-zarr@git+https://github.com/zarr-developers/pydantic-zarr", - # "zarr>=3.0.10", - "zarr@git+https://github.com/zarr-developers/zarr-python", + "pydantic-zarr>=0.8.0", + "pydantic>=2.11", + "zarr>=3.1.0", "xarray>=2025.7.1", "dask[array,distributed]>=2025.5.1", "numpy>=2.3.1", From bca5f5b68584ab71778071f6a566dc07c9312a50 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Fri, 8 Aug 2025 22:33:55 +0200 Subject: [PATCH 05/25] fix broken test --- eopf_geozarr/data_api/geozarr/v2.py | 2 +- eopf_geozarr/tests/test_data_api/test_geozarr.py | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/eopf_geozarr/data_api/geozarr/v2.py b/eopf_geozarr/data_api/geozarr/v2.py index 547a248..26e7052 100644 --- a/eopf_geozarr/data_api/geozarr/v2.py +++ b/eopf_geozarr/data_api/geozarr/v2.py @@ -95,7 +95,7 @@ class DataArray(ArraySpec[DataArrayAttrs]): """ -def check_valid_coordinates(model: GroupSpec[Any, Any]) -> Dataset: +def check_valid_coordinates(model: GroupSpec[Any, DataArray | CoordArray]) -> Dataset: """ Check if the coordinates of the DataArrays listed in a GeoZarr DataSet are valid. 
diff --git a/eopf_geozarr/tests/test_data_api/test_geozarr.py b/eopf_geozarr/tests/test_data_api/test_geozarr.py index aa88a9e..f8cedd3 100644 --- a/eopf_geozarr/tests/test_data_api/test_geozarr.py +++ b/eopf_geozarr/tests/test_data_api/test_geozarr.py @@ -147,7 +147,7 @@ def test_valid(example: dict[str, ArraySpec[Any] | GroupSpec[Any, Any]]) -> None grid_mapping_name="latitude_longitude", ), ), - "/time": ArraySpec( + "/time": CoordArray( shape=(10,), dtype="|u1", chunks=(10,), @@ -155,7 +155,7 @@ def test_valid(example: dict[str, ArraySpec[Any] | GroupSpec[Any, Any]]) -> None _ARRAY_DIMENSIONS=["time"], standard_name="time", units="s", axis="T" ), ), - "/lat": ArraySpec( + "/lat": CoordArray( shape=(11,), dtype="|u1", chunks=(11,), @@ -167,7 +167,7 @@ def test_valid(example: dict[str, ArraySpec[Any] | GroupSpec[Any, Any]]) -> None ], ) @staticmethod - def test_invalid(example: dict[str, ArraySpec[Any] | GroupSpec[Any, Any]]) -> None: + def test_invalid_coordinates(example: dict[str, ArraySpec[Any] | GroupSpec[Any, Any]]) -> None: """ Test the check_valid_coordinates function to ensure it validates coordinates correctly. 
From 38a721f5571ba2bbf923980009cbdab9a3656e08 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Mon, 11 Aug 2025 12:17:57 +0200 Subject: [PATCH 06/25] wip --- eopf_geozarr/data_api/geozarr/common.py | 43 +++++++++++++++++-- eopf_geozarr/data_api/geozarr/v2.py | 13 +++--- .../tests/test_data_api/test_geozarr.py | 2 +- pyproject.toml | 6 +-- 4 files changed, 51 insertions(+), 13 deletions(-) diff --git a/eopf_geozarr/data_api/geozarr/common.py b/eopf_geozarr/data_api/geozarr/common.py index 626ebf8..1c2a37f 100644 --- a/eopf_geozarr/data_api/geozarr/common.py +++ b/eopf_geozarr/data_api/geozarr/common.py @@ -2,6 +2,7 @@ import io import urllib import urllib.request +from typing import Literal, TypeAlias, TypeVar from cf_xarray.utils import parse_cf_standard_name_table from pydantic import BaseModel @@ -64,6 +65,37 @@ def check_standard_name(name: str) -> str: ) +# todo: narrow to literal type +ResamplingMethod = Literal[ + "nearest", + "average", + "bilinear", + "cubic", + "cubic_spline", + "lanczos", + "mode", + "max", + "min", + "med", + "sum", + "q1", + "q3", + "rms", + "gauss", +] +"""A string literal indicating a resampling method""" + +TileMatrixSet: TypeAlias = str | dict[str, object] +"""Identifier, URI, or inline JSON object compliant with OGC TileMatrixSet v2""" + + +class TileMatrixSetLimits(BaseModel): + min_tile_col: int + min_tile_row: int + max_tile_col: int + max_tile_row: int + + class MultiscaleAttrs(BaseModel): """ Attributes for a GeoZarr multiscale dataset. @@ -72,9 +104,12 @@ class MultiscaleAttrs(BaseModel): ---------- tile_matrix_set : str The tile matrix set identifier for the multiscale dataset. - resampling_method : str - The resampling method for the multiscale dataset. + resampling_method : ResamplingMethod + The name of the resampling method for the multiscale dataset. + tile_matrix_set_limits : dict[str, TileMatrixSetLimits] | None, optional + The tile matrix set limits for the multiscale dataset. 
""" - tile_matrix_set: str - resampling_method: str + tile_matrix_set: TileMatrixSet + resampling_method: ResamplingMethod + tile_matrix_set_limits: dict[str, TileMatrixSetLimits] | None = None diff --git a/eopf_geozarr/data_api/geozarr/v2.py b/eopf_geozarr/data_api/geozarr/v2.py index 72408a9..752e12e 100644 --- a/eopf_geozarr/data_api/geozarr/v2.py +++ b/eopf_geozarr/data_api/geozarr/v2.py @@ -3,7 +3,7 @@ from typing import Annotated, Any, Literal, Self -from pydantic import AfterValidator, BaseModel, Field, model_serializer, model_validator +from pydantic import AfterValidator, BaseModel, ConfigDict, Field, model_serializer, model_validator from pydantic_zarr.v2 import ( AnyArraySpec, AnyGroupSpec, @@ -22,10 +22,11 @@ class MyGroupSpec(GroupSpec[TAttr, TItem]): """ A custom GroupSpec - + We override the from_flat method to ensure that we can pass by_alias=True when creating a GroupSpec from a flat hierarchy. """ + @classmethod def from_flat( cls, data: dict[str, AnyArraySpec | AnyGroupSpec], *, by_alias: bool = False @@ -89,15 +90,15 @@ class CoordArrayAttrs(BaseModel, populate_by_name=True): class CoordArray(ArraySpec[CoordArrayAttrs]): """ - A GeoZarr coordinate array variable. - + A GeoZarr coordinate array variable. + It must be 1-dimensional and have a single element in its array_dimensions attribute. """ shape: tuple[int] -class DataArrayAttrs(BaseModel, populate_by_name=True): +class DataArrayAttrs(BaseModel): """ Attributes for a GeoZarr DataArray. 
@@ -123,6 +124,8 @@ class DataArrayAttrs(BaseModel, populate_by_name=True): grid_mapping: object grid_mapping_name: str + model_config = ConfigDict(validate_by_name=True, serialze_by_alias=True) + class DataArray(ArraySpec[DataArrayAttrs]): """ diff --git a/eopf_geozarr/tests/test_data_api/test_geozarr.py b/eopf_geozarr/tests/test_data_api/test_geozarr.py index 266f2b6..27fda83 100644 --- a/eopf_geozarr/tests/test_data_api/test_geozarr.py +++ b/eopf_geozarr/tests/test_data_api/test_geozarr.py @@ -129,7 +129,7 @@ def test_valid(example: dict[str, ArraySpec[Any] | GroupSpec[Any, Any]]) -> None """ Test the check_valid_coordinates function to ensure it validates coordinates correctly. """ - group = MyGroupSpec.from_flat(example, by_alias=True) + group = GroupSpec.from_flat(example) assert check_valid_coordinates(group) == group @pytest.mark.parametrize( diff --git a/pyproject.toml b/pyproject.toml index 9acd1b7..92185a4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -29,9 +29,9 @@ classifiers = [ ] requires-python = ">=3.11" dependencies = [ - "pydantic-zarr@git+https://github.com/zarr-developers/pydantic-zarr", - # "zarr>=3.0.10", - "zarr@git+https://github.com/zarr-developers/zarr-python", + "pydantic-zarr>= 0.8.0", + "pydantic>=2.11", + "zarr>=3.1.0", "xarray>=2025.7.1", "dask[array,distributed]>=2025.5.1", "numpy>=2.3.1", From dfdeff2914275fc35bc4893861116cfaa112e610 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Mon, 11 Aug 2025 13:38:30 +0200 Subject: [PATCH 07/25] working mini roundtrip --- eopf_geozarr/data_api/geozarr/common.py | 42 +- eopf_geozarr/data_api/geozarr/v2.py | 4 +- eopf_geozarr/tests/test_data_api/conftest.py | 14144 ++++++++++++++++ .../tests/test_data_api/test_geozarr.py | 17 + 4 files changed, 14195 insertions(+), 12 deletions(-) create mode 100644 eopf_geozarr/tests/test_data_api/conftest.py diff --git a/eopf_geozarr/data_api/geozarr/common.py b/eopf_geozarr/data_api/geozarr/common.py index 1c2a37f..dcebf1b 100644 --- 
a/eopf_geozarr/data_api/geozarr/common.py +++ b/eopf_geozarr/data_api/geozarr/common.py @@ -2,7 +2,7 @@ import io import urllib import urllib.request -from typing import Literal, TypeAlias, TypeVar +from typing import Literal, Mapping, TypeAlias, TypeVar from cf_xarray.utils import parse_cf_standard_name_table from pydantic import BaseModel @@ -85,18 +85,38 @@ def check_standard_name(name: str) -> str: ] """A string literal indicating a resampling method""" -TileMatrixSet: TypeAlias = str | dict[str, object] -"""Identifier, URI, or inline JSON object compliant with OGC TileMatrixSet v2""" +class TileMatrixLimit(BaseModel): + """""" -class TileMatrixSetLimits(BaseModel): - min_tile_col: int - min_tile_row: int - max_tile_col: int - max_tile_row: int + tileMatrix: str + minTileCol: int + minTileRow: int + maxTileCol: int + maxTileRow: int -class MultiscaleAttrs(BaseModel): +class TileMatrix(BaseModel): + id: str + scaleDenominator: float + cellSize: float + pointOfOrigin: tuple[float, float] + tileWidth: int + tileHeight: int + matrixWidth: int + matrixHeight: int + + +class TileMatrixSet(BaseModel): + id: str + title: str | None = None + crs: str | None = None + supportedCRS: str | None = None + orderedAxes: tuple[str, str] | None = None + tileMatrices: tuple[TileMatrix, ...] + + +class Multiscales(BaseModel): """ Attributes for a GeoZarr multiscale dataset. 
@@ -112,4 +132,6 @@ class MultiscaleAttrs(BaseModel): tile_matrix_set: TileMatrixSet resampling_method: ResamplingMethod - tile_matrix_set_limits: dict[str, TileMatrixSetLimits] | None = None + # TODO: ensure that the keys match tile_matrix_set.tileMatrices[$index].id + # TODO: ensure that the keys match the tileMatrix attribute + tile_matrix_limits: dict[str, TileMatrixLimit] | None = None diff --git a/eopf_geozarr/data_api/geozarr/v2.py b/eopf_geozarr/data_api/geozarr/v2.py index 18da225..df0caf2 100644 --- a/eopf_geozarr/data_api/geozarr/v2.py +++ b/eopf_geozarr/data_api/geozarr/v2.py @@ -14,7 +14,7 @@ from_flat_group, ) -from eopf_geozarr.data_api.geozarr.common import MultiscaleAttrs, check_standard_name +from eopf_geozarr.data_api.geozarr.common import Multiscales, check_standard_name CFStandardName = Annotated[str, AfterValidator(check_standard_name)] @@ -149,7 +149,7 @@ class DatasetAttrs(BaseModel): multiscales: MultiscaleAttrs """ - multiscales: MultiscaleAttrs + multiscales: Multiscales class Dataset(GroupSpec[DatasetAttrs, GroupSpec[Any, Any] | DataArray]): diff --git a/eopf_geozarr/tests/test_data_api/conftest.py b/eopf_geozarr/tests/test_data_api/conftest.py new file mode 100644 index 0000000..e45650c --- /dev/null +++ b/eopf_geozarr/tests/test_data_api/conftest.py @@ -0,0 +1,14144 @@ +example_zarr_json = r"""{ + "attributes": {}, + "zarr_format": 3, + "consolidated_metadata": { + "kind": "inline", + "must_understand": false, + "metadata": { + "conditions": { + "attributes": {}, + "zarr_format": 3, + "consolidated_metadata": { + "kind": "inline", + "must_understand": false, + "metadata": {} + }, + "node_type": "group" + }, + "conditions/meteorology": { + "attributes": {}, + "zarr_format": 3, + "consolidated_metadata": { + "kind": "inline", + "must_understand": false, + "metadata": {} + }, + "node_type": "group" + }, + "conditions/meteorology/cams": { + "attributes": { + "Conventions": "CF-1.7", + "GRIB_centre": "ecmf", + "GRIB_centreDescription": 
"European Centre for Medium-Range Weather Forecasts", + "GRIB_edition": 1, + "GRIB_subCentre": 0, + "history": "2025-02-27T07:57 GRIB to CDM+CF via cfgrib-0.9.10.4/ecCodes-2.34.1 with {\"source\": \"tmp/S2B_MSIL1C_20250113T103309_N0511_R108_T32TLQ_20250113T122458.SAFE/GRANULE/L1C_T32TLQ_A041032_20250113T103310/AUX_DATA/AUX_CAMSFO\", \"filter_by_keys\": {}, \"encode_cf\": [\"parameter\", \"time\", \"geography\", \"vertical\"]}", + "institution": "European Centre for Medium-Range Weather Forecasts" + }, + "zarr_format": 3, + "consolidated_metadata": { + "kind": "inline", + "must_understand": false, + "metadata": {} + }, + "node_type": "group" + }, + "conditions/meteorology/cams/surface": { + "shape": [], + "data_type": "float64", + "chunk_grid": { + "name": "regular", + "configuration": { + "chunk_shape": [] + } + }, + "chunk_key_encoding": { + "name": "default", + "configuration": { + "separator": "/" + } + }, + "fill_value": 0.0, + "codecs": [ + { + "name": "bytes", + "configuration": { + "endian": "little" + } + } + ], + "attributes": { + "long_name": "original GRIB coordinate for key: level(surface)", + "units": "1", + "_FillValue": "AAAAAAAA+H8=" + }, + "zarr_format": 3, + "node_type": "array", + "storage_transformers": [] + }, + "conditions/meteorology/cams/aod865": { + "shape": [ + 9, + 9 + ], + "data_type": "float32", + "chunk_grid": { + "name": "regular", + "configuration": { + "chunk_shape": [ + 9, + 9 + ] + } + }, + "chunk_key_encoding": { + "name": "default", + "configuration": { + "separator": "/" + } + }, + "fill_value": 0.0, + "codecs": [ + { + "name": "bytes", + "configuration": { + "endian": "little" + } + }, + { + "name": "blosc", + "configuration": { + "typesize": 4, + "cname": "zstd", + "clevel": 3, + "shuffle": "shuffle", + "blocksize": 0 + } + } + ], + "attributes": { + "GRIB_NV": 0, + "GRIB_Nx": 9, + "GRIB_Ny": 9, + "GRIB_cfName": "unknown", + "GRIB_cfVarName": "aod865", + "GRIB_dataType": "fc", + "GRIB_gridDefinitionDescription": 
"Latitude/Longitude Grid", + "GRIB_gridType": "regular_ll", + "GRIB_iDirectionIncrementInDegrees": 0.177, + "GRIB_iScansNegatively": 0, + "GRIB_jDirectionIncrementInDegrees": 0.121, + "GRIB_jPointsAreConsecutive": 0, + "GRIB_jScansPositively": 0, + "GRIB_latitudeOfFirstGridPointInDegrees": 45.126, + "GRIB_latitudeOfLastGridPointInDegrees": 44.16, + "GRIB_longitudeOfFirstGridPointInDegrees": 6.457, + "GRIB_longitudeOfLastGridPointInDegrees": 7.872, + "GRIB_missingValue": 3.4028234663852886e+38, + "GRIB_name": "Total Aerosol Optical Depth at 865nm", + "GRIB_numberOfPoints": 81, + "GRIB_paramId": 210215, + "GRIB_shortName": "aod865", + "GRIB_stepType": "instant", + "GRIB_stepUnits": 0, + "GRIB_totalNumber": 0, + "GRIB_typeOfLevel": "surface", + "GRIB_units": "~", + "_eopf_attrs": { + "coordinates": [ + "number", + "time", + "step", + "surface", + "latitude", + "longitude", + "valid_time", + "isobaricInhPa" + ], + "dimensions": [ + "latitude", + "longitude" + ], + "units": "~" + }, + "long_name": "Total Aerosol Optical Depth at 865nm", + "standard_name": "unknown", + "units": "~", + "coordinates": "isobaricInhPa number step surface time valid_time", + "_FillValue": "AAAAAAAA+H8=" + }, + "dimension_names": [ + "latitude", + "longitude" + ], + "zarr_format": 3, + "node_type": "array", + "storage_transformers": [] + }, + "conditions/meteorology/cams/latitude": { + "shape": [ + 9 + ], + "data_type": "float64", + "chunk_grid": { + "name": "regular", + "configuration": { + "chunk_shape": [ + 9 + ] + } + }, + "chunk_key_encoding": { + "name": "default", + "configuration": { + "separator": "/" + } + }, + "fill_value": 0.0, + "codecs": [ + { + "name": "bytes", + "configuration": { + "endian": "little" + } + } + ], + "attributes": { + "long_name": "latitude", + "standard_name": "latitude", + "stored_direction": "decreasing", + "units": "degrees_north", + "_FillValue": "AAAAAAAA+H8=" + }, + "dimension_names": [ + "latitude" + ], + "zarr_format": 3, + "node_type": "array", + 
"storage_transformers": [] + }, + "conditions/meteorology/cams/number": { + "shape": [], + "data_type": "int64", + "chunk_grid": { + "name": "regular", + "configuration": { + "chunk_shape": [] + } + }, + "chunk_key_encoding": { + "name": "default", + "configuration": { + "separator": "/" + } + }, + "fill_value": 0, + "codecs": [ + { + "name": "bytes", + "configuration": { + "endian": "little" + } + } + ], + "attributes": { + "long_name": "ensemble member numerical id", + "standard_name": "realization", + "units": "1" + }, + "zarr_format": 3, + "node_type": "array", + "storage_transformers": [] + }, + "conditions/meteorology/cams/z": { + "shape": [ + 9, + 9 + ], + "data_type": "float32", + "chunk_grid": { + "name": "regular", + "configuration": { + "chunk_shape": [ + 9, + 9 + ] + } + }, + "chunk_key_encoding": { + "name": "default", + "configuration": { + "separator": "/" + } + }, + "fill_value": 0.0, + "codecs": [ + { + "name": "bytes", + "configuration": { + "endian": "little" + } + }, + { + "name": "blosc", + "configuration": { + "typesize": 4, + "cname": "zstd", + "clevel": 3, + "shuffle": "shuffle", + "blocksize": 0 + } + } + ], + "attributes": { + "GRIB_NV": 0, + "GRIB_Nx": 9, + "GRIB_Ny": 9, + "GRIB_cfName": "geopotential", + "GRIB_cfVarName": "z", + "GRIB_dataType": "fc", + "GRIB_gridDefinitionDescription": "Latitude/Longitude Grid", + "GRIB_gridType": "regular_ll", + "GRIB_iDirectionIncrementInDegrees": 0.177, + "GRIB_iScansNegatively": 0, + "GRIB_jDirectionIncrementInDegrees": 0.121, + "GRIB_jPointsAreConsecutive": 0, + "GRIB_jScansPositively": 0, + "GRIB_latitudeOfFirstGridPointInDegrees": 45.126, + "GRIB_latitudeOfLastGridPointInDegrees": 44.16, + "GRIB_longitudeOfFirstGridPointInDegrees": 6.457, + "GRIB_longitudeOfLastGridPointInDegrees": 7.872, + "GRIB_missingValue": 3.4028234663852886e+38, + "GRIB_name": "Geopotential", + "GRIB_numberOfPoints": 81, + "GRIB_paramId": 129, + "GRIB_shortName": "z", + "GRIB_stepType": "instant", + "GRIB_stepUnits": 0, + 
"GRIB_totalNumber": 0, + "GRIB_typeOfLevel": "surface", + "GRIB_units": "m**2 s**-2", + "_eopf_attrs": { + "coordinates": [ + "number", + "time", + "step", + "surface", + "latitude", + "longitude", + "valid_time", + "isobaricInhPa" + ], + "dimensions": [ + "latitude", + "longitude" + ], + "units": "m**2 s**-2" + }, + "long_name": "Geopotential", + "standard_name": "geopotential", + "units": "m**2 s**-2", + "coordinates": "isobaricInhPa number step surface time valid_time", + "_FillValue": "AAAAAAAA+H8=" + }, + "dimension_names": [ + "latitude", + "longitude" + ], + "zarr_format": 3, + "node_type": "array", + "storage_transformers": [] + }, + "conditions/meteorology/cams/step": { + "shape": [], + "data_type": "int64", + "chunk_grid": { + "name": "regular", + "configuration": { + "chunk_shape": [] + } + }, + "chunk_key_encoding": { + "name": "default", + "configuration": { + "separator": "/" + } + }, + "fill_value": 0, + "codecs": [ + { + "name": "bytes", + "configuration": { + "endian": "little" + } + } + ], + "attributes": { + "long_name": "time since forecast_reference_time", + "standard_name": "forecast_period", + "dtype": "timedelta64[ns]", + "units": "minutes" + }, + "zarr_format": 3, + "node_type": "array", + "storage_transformers": [] + }, + "conditions/meteorology/cams/omaod550": { + "shape": [ + 9, + 9 + ], + "data_type": "float32", + "chunk_grid": { + "name": "regular", + "configuration": { + "chunk_shape": [ + 9, + 9 + ] + } + }, + "chunk_key_encoding": { + "name": "default", + "configuration": { + "separator": "/" + } + }, + "fill_value": 0.0, + "codecs": [ + { + "name": "bytes", + "configuration": { + "endian": "little" + } + }, + { + "name": "blosc", + "configuration": { + "typesize": 4, + "cname": "zstd", + "clevel": 3, + "shuffle": "shuffle", + "blocksize": 0 + } + } + ], + "attributes": { + "GRIB_NV": 0, + "GRIB_Nx": 9, + "GRIB_Ny": 9, + "GRIB_cfName": "unknown", + "GRIB_cfVarName": "omaod550", + "GRIB_dataType": "fc", + 
"GRIB_gridDefinitionDescription": "Latitude/Longitude Grid", + "GRIB_gridType": "regular_ll", + "GRIB_iDirectionIncrementInDegrees": 0.177, + "GRIB_iScansNegatively": 0, + "GRIB_jDirectionIncrementInDegrees": 0.121, + "GRIB_jPointsAreConsecutive": 0, + "GRIB_jScansPositively": 0, + "GRIB_latitudeOfFirstGridPointInDegrees": 45.126, + "GRIB_latitudeOfLastGridPointInDegrees": 44.16, + "GRIB_longitudeOfFirstGridPointInDegrees": 6.457, + "GRIB_longitudeOfLastGridPointInDegrees": 7.872, + "GRIB_missingValue": 3.4028234663852886e+38, + "GRIB_name": "Organic Matter Aerosol Optical Depth at 550nm", + "GRIB_numberOfPoints": 81, + "GRIB_paramId": 210210, + "GRIB_shortName": "omaod550", + "GRIB_stepType": "instant", + "GRIB_stepUnits": 0, + "GRIB_totalNumber": 0, + "GRIB_typeOfLevel": "surface", + "GRIB_units": "~", + "_eopf_attrs": { + "coordinates": [ + "number", + "time", + "step", + "surface", + "latitude", + "longitude", + "valid_time", + "isobaricInhPa" + ], + "dimensions": [ + "latitude", + "longitude" + ], + "units": "~" + }, + "long_name": "Organic Matter Aerosol Optical Depth at 550nm", + "standard_name": "unknown", + "units": "~", + "coordinates": "isobaricInhPa number step surface time valid_time", + "_FillValue": "AAAAAAAA+H8=" + }, + "dimension_names": [ + "latitude", + "longitude" + ], + "zarr_format": 3, + "node_type": "array", + "storage_transformers": [] + }, + "conditions/meteorology/cams/aod469": { + "shape": [ + 9, + 9 + ], + "data_type": "float32", + "chunk_grid": { + "name": "regular", + "configuration": { + "chunk_shape": [ + 9, + 9 + ] + } + }, + "chunk_key_encoding": { + "name": "default", + "configuration": { + "separator": "/" + } + }, + "fill_value": 0.0, + "codecs": [ + { + "name": "bytes", + "configuration": { + "endian": "little" + } + }, + { + "name": "blosc", + "configuration": { + "typesize": 4, + "cname": "zstd", + "clevel": 3, + "shuffle": "shuffle", + "blocksize": 0 + } + } + ], + "attributes": { + "GRIB_NV": 0, + "GRIB_Nx": 9, + 
"GRIB_Ny": 9, + "GRIB_cfName": "unknown", + "GRIB_cfVarName": "aod469", + "GRIB_dataType": "fc", + "GRIB_gridDefinitionDescription": "Latitude/Longitude Grid", + "GRIB_gridType": "regular_ll", + "GRIB_iDirectionIncrementInDegrees": 0.177, + "GRIB_iScansNegatively": 0, + "GRIB_jDirectionIncrementInDegrees": 0.121, + "GRIB_jPointsAreConsecutive": 0, + "GRIB_jScansPositively": 0, + "GRIB_latitudeOfFirstGridPointInDegrees": 45.126, + "GRIB_latitudeOfLastGridPointInDegrees": 44.16, + "GRIB_longitudeOfFirstGridPointInDegrees": 6.457, + "GRIB_longitudeOfLastGridPointInDegrees": 7.872, + "GRIB_missingValue": 3.4028234663852886e+38, + "GRIB_name": "Total Aerosol Optical Depth at 469nm", + "GRIB_numberOfPoints": 81, + "GRIB_paramId": 210213, + "GRIB_shortName": "aod469", + "GRIB_stepType": "instant", + "GRIB_stepUnits": 0, + "GRIB_totalNumber": 0, + "GRIB_typeOfLevel": "surface", + "GRIB_units": "~", + "_eopf_attrs": { + "coordinates": [ + "number", + "time", + "step", + "surface", + "latitude", + "longitude", + "valid_time", + "isobaricInhPa" + ], + "dimensions": [ + "latitude", + "longitude" + ], + "units": "~" + }, + "long_name": "Total Aerosol Optical Depth at 469nm", + "standard_name": "unknown", + "units": "~", + "coordinates": "isobaricInhPa number step surface time valid_time", + "_FillValue": "AAAAAAAA+H8=" + }, + "dimension_names": [ + "latitude", + "longitude" + ], + "zarr_format": 3, + "node_type": "array", + "storage_transformers": [] + }, + "conditions/meteorology/cams/aod670": { + "shape": [ + 9, + 9 + ], + "data_type": "float32", + "chunk_grid": { + "name": "regular", + "configuration": { + "chunk_shape": [ + 9, + 9 + ] + } + }, + "chunk_key_encoding": { + "name": "default", + "configuration": { + "separator": "/" + } + }, + "fill_value": 0.0, + "codecs": [ + { + "name": "bytes", + "configuration": { + "endian": "little" + } + }, + { + "name": "blosc", + "configuration": { + "typesize": 4, + "cname": "zstd", + "clevel": 3, + "shuffle": "shuffle", + 
"blocksize": 0 + } + } + ], + "attributes": { + "GRIB_NV": 0, + "GRIB_Nx": 9, + "GRIB_Ny": 9, + "GRIB_cfName": "unknown", + "GRIB_cfVarName": "aod670", + "GRIB_dataType": "fc", + "GRIB_gridDefinitionDescription": "Latitude/Longitude Grid", + "GRIB_gridType": "regular_ll", + "GRIB_iDirectionIncrementInDegrees": 0.177, + "GRIB_iScansNegatively": 0, + "GRIB_jDirectionIncrementInDegrees": 0.121, + "GRIB_jPointsAreConsecutive": 0, + "GRIB_jScansPositively": 0, + "GRIB_latitudeOfFirstGridPointInDegrees": 45.126, + "GRIB_latitudeOfLastGridPointInDegrees": 44.16, + "GRIB_longitudeOfFirstGridPointInDegrees": 6.457, + "GRIB_longitudeOfLastGridPointInDegrees": 7.872, + "GRIB_missingValue": 3.4028234663852886e+38, + "GRIB_name": "Total Aerosol Optical Depth at 670nm", + "GRIB_numberOfPoints": 81, + "GRIB_paramId": 210214, + "GRIB_shortName": "aod670", + "GRIB_stepType": "instant", + "GRIB_stepUnits": 0, + "GRIB_totalNumber": 0, + "GRIB_typeOfLevel": "surface", + "GRIB_units": "~", + "_eopf_attrs": { + "coordinates": [ + "number", + "time", + "step", + "surface", + "latitude", + "longitude", + "valid_time", + "isobaricInhPa" + ], + "dimensions": [ + "latitude", + "longitude" + ], + "units": "~" + }, + "long_name": "Total Aerosol Optical Depth at 670nm", + "standard_name": "unknown", + "units": "~", + "coordinates": "isobaricInhPa number step surface time valid_time", + "_FillValue": "AAAAAAAA+H8=" + }, + "dimension_names": [ + "latitude", + "longitude" + ], + "zarr_format": 3, + "node_type": "array", + "storage_transformers": [] + }, + "conditions/meteorology/cams/isobaricInhPa": { + "shape": [], + "data_type": "float64", + "chunk_grid": { + "name": "regular", + "configuration": { + "chunk_shape": [] + } + }, + "chunk_key_encoding": { + "name": "default", + "configuration": { + "separator": "/" + } + }, + "fill_value": 0.0, + "codecs": [ + { + "name": "bytes", + "configuration": { + "endian": "little" + } + } + ], + "attributes": { + "long_name": "pressure", + "positive": 
"down", + "standard_name": "air_pressure", + "stored_direction": "decreasing", + "units": "hPa", + "_FillValue": "AAAAAAAA+H8=" + }, + "zarr_format": 3, + "node_type": "array", + "storage_transformers": [] + }, + "conditions/meteorology/cams/duaod550": { + "shape": [ + 9, + 9 + ], + "data_type": "float32", + "chunk_grid": { + "name": "regular", + "configuration": { + "chunk_shape": [ + 9, + 9 + ] + } + }, + "chunk_key_encoding": { + "name": "default", + "configuration": { + "separator": "/" + } + }, + "fill_value": 0.0, + "codecs": [ + { + "name": "bytes", + "configuration": { + "endian": "little" + } + }, + { + "name": "blosc", + "configuration": { + "typesize": 4, + "cname": "zstd", + "clevel": 3, + "shuffle": "shuffle", + "blocksize": 0 + } + } + ], + "attributes": { + "GRIB_NV": 0, + "GRIB_Nx": 9, + "GRIB_Ny": 9, + "GRIB_cfName": "unknown", + "GRIB_cfVarName": "duaod550", + "GRIB_dataType": "fc", + "GRIB_gridDefinitionDescription": "Latitude/Longitude Grid", + "GRIB_gridType": "regular_ll", + "GRIB_iDirectionIncrementInDegrees": 0.177, + "GRIB_iScansNegatively": 0, + "GRIB_jDirectionIncrementInDegrees": 0.121, + "GRIB_jPointsAreConsecutive": 0, + "GRIB_jScansPositively": 0, + "GRIB_latitudeOfFirstGridPointInDegrees": 45.126, + "GRIB_latitudeOfLastGridPointInDegrees": 44.16, + "GRIB_longitudeOfFirstGridPointInDegrees": 6.457, + "GRIB_longitudeOfLastGridPointInDegrees": 7.872, + "GRIB_missingValue": 3.4028234663852886e+38, + "GRIB_name": "Dust Aerosol Optical Depth at 550nm", + "GRIB_numberOfPoints": 81, + "GRIB_paramId": 210209, + "GRIB_shortName": "duaod550", + "GRIB_stepType": "instant", + "GRIB_stepUnits": 0, + "GRIB_totalNumber": 0, + "GRIB_typeOfLevel": "isobaricInhPa", + "GRIB_units": "~", + "_eopf_attrs": { + "coordinates": [ + "number", + "time", + "step", + "surface", + "latitude", + "longitude", + "valid_time", + "isobaricInhPa" + ], + "dimensions": [ + "latitude", + "longitude" + ], + "units": "~" + }, + "long_name": "Dust Aerosol Optical Depth at 
550nm", + "standard_name": "unknown", + "units": "~", + "coordinates": "isobaricInhPa number step surface time valid_time", + "_FillValue": "AAAAAAAA+H8=" + }, + "dimension_names": [ + "latitude", + "longitude" + ], + "zarr_format": 3, + "node_type": "array", + "storage_transformers": [] + }, + "conditions/meteorology/cams/ssaod550": { + "shape": [ + 9, + 9 + ], + "data_type": "float32", + "chunk_grid": { + "name": "regular", + "configuration": { + "chunk_shape": [ + 9, + 9 + ] + } + }, + "chunk_key_encoding": { + "name": "default", + "configuration": { + "separator": "/" + } + }, + "fill_value": 0.0, + "codecs": [ + { + "name": "bytes", + "configuration": { + "endian": "little" + } + }, + { + "name": "blosc", + "configuration": { + "typesize": 4, + "cname": "zstd", + "clevel": 3, + "shuffle": "shuffle", + "blocksize": 0 + } + } + ], + "attributes": { + "GRIB_NV": 0, + "GRIB_Nx": 9, + "GRIB_Ny": 9, + "GRIB_cfName": "unknown", + "GRIB_cfVarName": "ssaod550", + "GRIB_dataType": "fc", + "GRIB_gridDefinitionDescription": "Latitude/Longitude Grid", + "GRIB_gridType": "regular_ll", + "GRIB_iDirectionIncrementInDegrees": 0.177, + "GRIB_iScansNegatively": 0, + "GRIB_jDirectionIncrementInDegrees": 0.121, + "GRIB_jPointsAreConsecutive": 0, + "GRIB_jScansPositively": 0, + "GRIB_latitudeOfFirstGridPointInDegrees": 45.126, + "GRIB_latitudeOfLastGridPointInDegrees": 44.16, + "GRIB_longitudeOfFirstGridPointInDegrees": 6.457, + "GRIB_longitudeOfLastGridPointInDegrees": 7.872, + "GRIB_missingValue": 3.4028234663852886e+38, + "GRIB_name": "Sea Salt Aerosol Optical Depth at 550nm", + "GRIB_numberOfPoints": 81, + "GRIB_paramId": 210208, + "GRIB_shortName": "ssaod550", + "GRIB_stepType": "instant", + "GRIB_stepUnits": 0, + "GRIB_totalNumber": 0, + "GRIB_typeOfLevel": "surface", + "GRIB_units": "~", + "_eopf_attrs": { + "coordinates": [ + "number", + "time", + "step", + "surface", + "latitude", + "longitude", + "valid_time", + "isobaricInhPa" + ], + "dimensions": [ + "latitude", + 
"longitude" + ], + "units": "~" + }, + "long_name": "Sea Salt Aerosol Optical Depth at 550nm", + "standard_name": "unknown", + "units": "~", + "coordinates": "isobaricInhPa number step surface time valid_time", + "_FillValue": "AAAAAAAA+H8=" + }, + "dimension_names": [ + "latitude", + "longitude" + ], + "zarr_format": 3, + "node_type": "array", + "storage_transformers": [] + }, + "conditions/meteorology/cams/time": { + "shape": [], + "data_type": "int64", + "chunk_grid": { + "name": "regular", + "configuration": { + "chunk_shape": [] + } + }, + "chunk_key_encoding": { + "name": "default", + "configuration": { + "separator": "/" + } + }, + "fill_value": 0, + "codecs": [ + { + "name": "bytes", + "configuration": { + "endian": "little" + } + } + ], + "attributes": { + "_eopf_attrs": { + "_eopf_decode_datetime64": "datetime64[ns]" + }, + "long_name": "initial time of forecast", + "standard_name": "forecast_reference_time", + "units": "days since 2025-01-13 00:00:00", + "calendar": "proleptic_gregorian" + }, + "zarr_format": 3, + "node_type": "array", + "storage_transformers": [] + }, + "conditions/meteorology/cams/valid_time": { + "shape": [], + "data_type": "int64", + "chunk_grid": { + "name": "regular", + "configuration": { + "chunk_shape": [] + } + }, + "chunk_key_encoding": { + "name": "default", + "configuration": { + "separator": "/" + } + }, + "fill_value": 0, + "codecs": [ + { + "name": "bytes", + "configuration": { + "endian": "little" + } + } + ], + "attributes": { + "_eopf_attrs": { + "_eopf_decode_datetime64": "datetime64[ns]" + }, + "long_name": "time", + "standard_name": "time", + "units": "days since 2025-01-13 10:33:00", + "calendar": "proleptic_gregorian" + }, + "zarr_format": 3, + "node_type": "array", + "storage_transformers": [] + }, + "conditions/meteorology/cams/bcaod550": { + "shape": [ + 9, + 9 + ], + "data_type": "float32", + "chunk_grid": { + "name": "regular", + "configuration": { + "chunk_shape": [ + 9, + 9 + ] + } + }, + 
"chunk_key_encoding": { + "name": "default", + "configuration": { + "separator": "/" + } + }, + "fill_value": 0.0, + "codecs": [ + { + "name": "bytes", + "configuration": { + "endian": "little" + } + }, + { + "name": "blosc", + "configuration": { + "typesize": 4, + "cname": "zstd", + "clevel": 3, + "shuffle": "shuffle", + "blocksize": 0 + } + } + ], + "attributes": { + "GRIB_NV": 0, + "GRIB_Nx": 9, + "GRIB_Ny": 9, + "GRIB_cfName": "unknown", + "GRIB_cfVarName": "bcaod550", + "GRIB_dataType": "fc", + "GRIB_gridDefinitionDescription": "Latitude/Longitude Grid", + "GRIB_gridType": "regular_ll", + "GRIB_iDirectionIncrementInDegrees": 0.177, + "GRIB_iScansNegatively": 0, + "GRIB_jDirectionIncrementInDegrees": 0.121, + "GRIB_jPointsAreConsecutive": 0, + "GRIB_jScansPositively": 0, + "GRIB_latitudeOfFirstGridPointInDegrees": 45.126, + "GRIB_latitudeOfLastGridPointInDegrees": 44.16, + "GRIB_longitudeOfFirstGridPointInDegrees": 6.457, + "GRIB_longitudeOfLastGridPointInDegrees": 7.872, + "GRIB_missingValue": 3.4028234663852886e+38, + "GRIB_name": "Black Carbon Aerosol Optical Depth at 550nm", + "GRIB_numberOfPoints": 81, + "GRIB_paramId": 210211, + "GRIB_shortName": "bcaod550", + "GRIB_stepType": "instant", + "GRIB_stepUnits": 0, + "GRIB_totalNumber": 0, + "GRIB_typeOfLevel": "surface", + "GRIB_units": "~", + "_eopf_attrs": { + "coordinates": [ + "number", + "time", + "step", + "surface", + "latitude", + "longitude", + "valid_time", + "isobaricInhPa" + ], + "dimensions": [ + "latitude", + "longitude" + ], + "units": "~" + }, + "long_name": "Black Carbon Aerosol Optical Depth at 550nm", + "standard_name": "unknown", + "units": "~", + "coordinates": "isobaricInhPa number step surface time valid_time", + "_FillValue": "AAAAAAAA+H8=" + }, + "dimension_names": [ + "latitude", + "longitude" + ], + "zarr_format": 3, + "node_type": "array", + "storage_transformers": [] + }, + "conditions/meteorology/cams/aod550": { + "shape": [ + 9, + 9 + ], + "data_type": "float32", + "chunk_grid": 
{ + "name": "regular", + "configuration": { + "chunk_shape": [ + 9, + 9 + ] + } + }, + "chunk_key_encoding": { + "name": "default", + "configuration": { + "separator": "/" + } + }, + "fill_value": 0.0, + "codecs": [ + { + "name": "bytes", + "configuration": { + "endian": "little" + } + }, + { + "name": "blosc", + "configuration": { + "typesize": 4, + "cname": "zstd", + "clevel": 3, + "shuffle": "shuffle", + "blocksize": 0 + } + } + ], + "attributes": { + "GRIB_NV": 0, + "GRIB_Nx": 9, + "GRIB_Ny": 9, + "GRIB_cfName": "unknown", + "GRIB_cfVarName": "aod550", + "GRIB_dataType": "fc", + "GRIB_gridDefinitionDescription": "Latitude/Longitude Grid", + "GRIB_gridType": "regular_ll", + "GRIB_iDirectionIncrementInDegrees": 0.177, + "GRIB_iScansNegatively": 0, + "GRIB_jDirectionIncrementInDegrees": 0.121, + "GRIB_jPointsAreConsecutive": 0, + "GRIB_jScansPositively": 0, + "GRIB_latitudeOfFirstGridPointInDegrees": 45.126, + "GRIB_latitudeOfLastGridPointInDegrees": 44.16, + "GRIB_longitudeOfFirstGridPointInDegrees": 6.457, + "GRIB_longitudeOfLastGridPointInDegrees": 7.872, + "GRIB_missingValue": 3.4028234663852886e+38, + "GRIB_name": "Total Aerosol Optical Depth at 550nm", + "GRIB_numberOfPoints": 81, + "GRIB_paramId": 210207, + "GRIB_shortName": "aod550", + "GRIB_stepType": "instant", + "GRIB_stepUnits": 0, + "GRIB_totalNumber": 0, + "GRIB_typeOfLevel": "surface", + "GRIB_units": "~", + "_eopf_attrs": { + "coordinates": [ + "number", + "time", + "step", + "surface", + "latitude", + "longitude", + "valid_time", + "isobaricInhPa" + ], + "dimensions": [ + "latitude", + "longitude" + ], + "units": "~" + }, + "long_name": "Total Aerosol Optical Depth at 550nm", + "standard_name": "unknown", + "units": "~", + "coordinates": "isobaricInhPa number step surface time valid_time", + "_FillValue": "AAAAAAAA+H8=" + }, + "dimension_names": [ + "latitude", + "longitude" + ], + "zarr_format": 3, + "node_type": "array", + "storage_transformers": [] + }, + 
"conditions/meteorology/cams/longitude": { + "shape": [ + 9 + ], + "data_type": "float64", + "chunk_grid": { + "name": "regular", + "configuration": { + "chunk_shape": [ + 9 + ] + } + }, + "chunk_key_encoding": { + "name": "default", + "configuration": { + "separator": "/" + } + }, + "fill_value": 0.0, + "codecs": [ + { + "name": "bytes", + "configuration": { + "endian": "little" + } + } + ], + "attributes": { + "long_name": "longitude", + "standard_name": "longitude", + "units": "degrees_east", + "_FillValue": "AAAAAAAA+H8=" + }, + "dimension_names": [ + "longitude" + ], + "zarr_format": 3, + "node_type": "array", + "storage_transformers": [] + }, + "conditions/meteorology/cams/aod1240": { + "shape": [ + 9, + 9 + ], + "data_type": "float32", + "chunk_grid": { + "name": "regular", + "configuration": { + "chunk_shape": [ + 9, + 9 + ] + } + }, + "chunk_key_encoding": { + "name": "default", + "configuration": { + "separator": "/" + } + }, + "fill_value": 0.0, + "codecs": [ + { + "name": "bytes", + "configuration": { + "endian": "little" + } + }, + { + "name": "blosc", + "configuration": { + "typesize": 4, + "cname": "zstd", + "clevel": 3, + "shuffle": "shuffle", + "blocksize": 0 + } + } + ], + "attributes": { + "GRIB_NV": 0, + "GRIB_Nx": 9, + "GRIB_Ny": 9, + "GRIB_cfName": "unknown", + "GRIB_cfVarName": "aod1240", + "GRIB_dataType": "fc", + "GRIB_gridDefinitionDescription": "Latitude/Longitude Grid", + "GRIB_gridType": "regular_ll", + "GRIB_iDirectionIncrementInDegrees": 0.177, + "GRIB_iScansNegatively": 0, + "GRIB_jDirectionIncrementInDegrees": 0.121, + "GRIB_jPointsAreConsecutive": 0, + "GRIB_jScansPositively": 0, + "GRIB_latitudeOfFirstGridPointInDegrees": 45.126, + "GRIB_latitudeOfLastGridPointInDegrees": 44.16, + "GRIB_longitudeOfFirstGridPointInDegrees": 6.457, + "GRIB_longitudeOfLastGridPointInDegrees": 7.872, + "GRIB_missingValue": 3.4028234663852886e+38, + "GRIB_name": "Total Aerosol Optical Depth at 1240nm", + "GRIB_numberOfPoints": 81, + "GRIB_paramId": 
210216, + "GRIB_shortName": "aod1240", + "GRIB_stepType": "instant", + "GRIB_stepUnits": 0, + "GRIB_totalNumber": 0, + "GRIB_typeOfLevel": "surface", + "GRIB_units": "~", + "_eopf_attrs": { + "coordinates": [ + "number", + "time", + "step", + "surface", + "latitude", + "longitude", + "valid_time", + "isobaricInhPa" + ], + "dimensions": [ + "latitude", + "longitude" + ], + "units": "~" + }, + "long_name": "Total Aerosol Optical Depth at 1240nm", + "standard_name": "unknown", + "units": "~", + "coordinates": "isobaricInhPa number step surface time valid_time", + "_FillValue": "AAAAAAAA+H8=" + }, + "dimension_names": [ + "latitude", + "longitude" + ], + "zarr_format": 3, + "node_type": "array", + "storage_transformers": [] + }, + "conditions/meteorology/cams/suaod550": { + "shape": [ + 9, + 9 + ], + "data_type": "float32", + "chunk_grid": { + "name": "regular", + "configuration": { + "chunk_shape": [ + 9, + 9 + ] + } + }, + "chunk_key_encoding": { + "name": "default", + "configuration": { + "separator": "/" + } + }, + "fill_value": 0.0, + "codecs": [ + { + "name": "bytes", + "configuration": { + "endian": "little" + } + }, + { + "name": "blosc", + "configuration": { + "typesize": 4, + "cname": "zstd", + "clevel": 3, + "shuffle": "shuffle", + "blocksize": 0 + } + } + ], + "attributes": { + "GRIB_NV": 0, + "GRIB_Nx": 9, + "GRIB_Ny": 9, + "GRIB_cfName": "unknown", + "GRIB_cfVarName": "suaod550", + "GRIB_dataType": "fc", + "GRIB_gridDefinitionDescription": "Latitude/Longitude Grid", + "GRIB_gridType": "regular_ll", + "GRIB_iDirectionIncrementInDegrees": 0.177, + "GRIB_iScansNegatively": 0, + "GRIB_jDirectionIncrementInDegrees": 0.121, + "GRIB_jPointsAreConsecutive": 0, + "GRIB_jScansPositively": 0, + "GRIB_latitudeOfFirstGridPointInDegrees": 45.126, + "GRIB_latitudeOfLastGridPointInDegrees": 44.16, + "GRIB_longitudeOfFirstGridPointInDegrees": 6.457, + "GRIB_longitudeOfLastGridPointInDegrees": 7.872, + "GRIB_missingValue": 3.4028234663852886e+38, + "GRIB_name": "Sulphate 
Aerosol Optical Depth at 550nm", + "GRIB_numberOfPoints": 81, + "GRIB_paramId": 210212, + "GRIB_shortName": "suaod550", + "GRIB_stepType": "instant", + "GRIB_stepUnits": 0, + "GRIB_totalNumber": 0, + "GRIB_typeOfLevel": "surface", + "GRIB_units": "~", + "_eopf_attrs": { + "coordinates": [ + "number", + "time", + "step", + "surface", + "latitude", + "longitude", + "valid_time", + "isobaricInhPa" + ], + "dimensions": [ + "latitude", + "longitude" + ], + "units": "~" + }, + "long_name": "Sulphate Aerosol Optical Depth at 550nm", + "standard_name": "unknown", + "units": "~", + "coordinates": "isobaricInhPa number step surface time valid_time", + "_FillValue": "AAAAAAAA+H8=" + }, + "dimension_names": [ + "latitude", + "longitude" + ], + "zarr_format": 3, + "node_type": "array", + "storage_transformers": [] + }, + "conditions/meteorology/ecmwf": { + "attributes": { + "Conventions": "CF-1.7", + "GRIB_centre": "ecmf", + "GRIB_centreDescription": "European Centre for Medium-Range Weather Forecasts", + "GRIB_edition": 1, + "GRIB_subCentre": 0, + "history": "2025-02-27T07:57 GRIB to CDM+CF via cfgrib-0.9.10.4/ecCodes-2.34.1 with {\"source\": \"tmp/S2B_MSIL1C_20250113T103309_N0511_R108_T32TLQ_20250113T122458.SAFE/GRANULE/L1C_T32TLQ_A041032_20250113T103310/AUX_DATA/AUX_ECMWFT\", \"filter_by_keys\": {}, \"encode_cf\": [\"parameter\", \"time\", \"geography\", \"vertical\"]}", + "institution": "European Centre for Medium-Range Weather Forecasts" + }, + "zarr_format": 3, + "consolidated_metadata": { + "kind": "inline", + "must_understand": false, + "metadata": {} + }, + "node_type": "group" + }, + "conditions/meteorology/ecmwf/surface": { + "shape": [], + "data_type": "float64", + "chunk_grid": { + "name": "regular", + "configuration": { + "chunk_shape": [] + } + }, + "chunk_key_encoding": { + "name": "default", + "configuration": { + "separator": "/" + } + }, + "fill_value": 0.0, + "codecs": [ + { + "name": "bytes", + "configuration": { + "endian": "little" + } + } + ], + 
"attributes": { + "long_name": "original GRIB coordinate for key: level(surface)", + "units": "1", + "_FillValue": "AAAAAAAA+H8=" + }, + "zarr_format": 3, + "node_type": "array", + "storage_transformers": [] + }, + "conditions/meteorology/ecmwf/v10": { + "shape": [ + 9, + 9 + ], + "data_type": "float32", + "chunk_grid": { + "name": "regular", + "configuration": { + "chunk_shape": [ + 9, + 9 + ] + } + }, + "chunk_key_encoding": { + "name": "default", + "configuration": { + "separator": "/" + } + }, + "fill_value": 0.0, + "codecs": [ + { + "name": "bytes", + "configuration": { + "endian": "little" + } + }, + { + "name": "blosc", + "configuration": { + "typesize": 4, + "cname": "zstd", + "clevel": 3, + "shuffle": "shuffle", + "blocksize": 0 + } + } + ], + "attributes": { + "GRIB_NV": 0, + "GRIB_Nx": 9, + "GRIB_Ny": 9, + "GRIB_cfName": "unknown", + "GRIB_cfVarName": "v10", + "GRIB_dataType": "fc", + "GRIB_gridDefinitionDescription": "Latitude/Longitude Grid", + "GRIB_gridType": "regular_ll", + "GRIB_iDirectionIncrementInDegrees": 0.177, + "GRIB_iScansNegatively": 0, + "GRIB_jDirectionIncrementInDegrees": 0.121, + "GRIB_jPointsAreConsecutive": 0, + "GRIB_jScansPositively": 0, + "GRIB_latitudeOfFirstGridPointInDegrees": 45.126, + "GRIB_latitudeOfLastGridPointInDegrees": 44.16, + "GRIB_longitudeOfFirstGridPointInDegrees": 6.457, + "GRIB_longitudeOfLastGridPointInDegrees": 7.872, + "GRIB_missingValue": 3.4028234663852886e+38, + "GRIB_name": "10 metre V wind component", + "GRIB_numberOfPoints": 81, + "GRIB_paramId": 166, + "GRIB_shortName": "10v", + "GRIB_stepType": "instant", + "GRIB_stepUnits": 0, + "GRIB_totalNumber": 0, + "GRIB_typeOfLevel": "surface", + "GRIB_units": "m s**-1", + "_eopf_attrs": { + "coordinates": [ + "number", + "time", + "step", + "surface", + "latitude", + "longitude", + "valid_time", + "isobaricInhPa" + ], + "dimensions": [ + "latitude", + "longitude" + ], + "units": "m s**-1" + }, + "long_name": "10 metre V wind component", + "standard_name": 
"unknown", + "units": "m s**-1", + "coordinates": "isobaricInhPa number step surface time valid_time", + "_FillValue": "AAAAAAAA+H8=" + }, + "dimension_names": [ + "latitude", + "longitude" + ], + "zarr_format": 3, + "node_type": "array", + "storage_transformers": [] + }, + "conditions/meteorology/ecmwf/latitude": { + "shape": [ + 9 + ], + "data_type": "float64", + "chunk_grid": { + "name": "regular", + "configuration": { + "chunk_shape": [ + 9 + ] + } + }, + "chunk_key_encoding": { + "name": "default", + "configuration": { + "separator": "/" + } + }, + "fill_value": 0.0, + "codecs": [ + { + "name": "bytes", + "configuration": { + "endian": "little" + } + } + ], + "attributes": { + "long_name": "latitude", + "standard_name": "latitude", + "stored_direction": "decreasing", + "units": "degrees_north", + "_FillValue": "AAAAAAAA+H8=" + }, + "dimension_names": [ + "latitude" + ], + "zarr_format": 3, + "node_type": "array", + "storage_transformers": [] + }, + "conditions/meteorology/ecmwf/number": { + "shape": [], + "data_type": "int64", + "chunk_grid": { + "name": "regular", + "configuration": { + "chunk_shape": [] + } + }, + "chunk_key_encoding": { + "name": "default", + "configuration": { + "separator": "/" + } + }, + "fill_value": 0, + "codecs": [ + { + "name": "bytes", + "configuration": { + "endian": "little" + } + } + ], + "attributes": { + "long_name": "ensemble member numerical id", + "standard_name": "realization", + "units": "1" + }, + "zarr_format": 3, + "node_type": "array", + "storage_transformers": [] + }, + "conditions/meteorology/ecmwf/step": { + "shape": [], + "data_type": "int64", + "chunk_grid": { + "name": "regular", + "configuration": { + "chunk_shape": [] + } + }, + "chunk_key_encoding": { + "name": "default", + "configuration": { + "separator": "/" + } + }, + "fill_value": 0, + "codecs": [ + { + "name": "bytes", + "configuration": { + "endian": "little" + } + } + ], + "attributes": { + "long_name": "time since forecast_reference_time", + 
"standard_name": "forecast_period", + "dtype": "timedelta64[ns]", + "units": "minutes" + }, + "zarr_format": 3, + "node_type": "array", + "storage_transformers": [] + }, + "conditions/meteorology/ecmwf/r": { + "shape": [ + 9, + 9 + ], + "data_type": "float32", + "chunk_grid": { + "name": "regular", + "configuration": { + "chunk_shape": [ + 9, + 9 + ] + } + }, + "chunk_key_encoding": { + "name": "default", + "configuration": { + "separator": "/" + } + }, + "fill_value": 0.0, + "codecs": [ + { + "name": "bytes", + "configuration": { + "endian": "little" + } + }, + { + "name": "blosc", + "configuration": { + "typesize": 4, + "cname": "zstd", + "clevel": 3, + "shuffle": "shuffle", + "blocksize": 0 + } + } + ], + "attributes": { + "GRIB_NV": 0, + "GRIB_Nx": 9, + "GRIB_Ny": 9, + "GRIB_cfName": "relative_humidity", + "GRIB_cfVarName": "r", + "GRIB_dataType": "fc", + "GRIB_gridDefinitionDescription": "Latitude/Longitude Grid", + "GRIB_gridType": "regular_ll", + "GRIB_iDirectionIncrementInDegrees": 0.177, + "GRIB_iScansNegatively": 0, + "GRIB_jDirectionIncrementInDegrees": 0.121, + "GRIB_jPointsAreConsecutive": 0, + "GRIB_jScansPositively": 0, + "GRIB_latitudeOfFirstGridPointInDegrees": 45.126, + "GRIB_latitudeOfLastGridPointInDegrees": 44.16, + "GRIB_longitudeOfFirstGridPointInDegrees": 6.457, + "GRIB_longitudeOfLastGridPointInDegrees": 7.872, + "GRIB_missingValue": 3.4028234663852886e+38, + "GRIB_name": "Relative humidity", + "GRIB_numberOfPoints": 81, + "GRIB_paramId": 157, + "GRIB_shortName": "r", + "GRIB_stepType": "instant", + "GRIB_stepUnits": 0, + "GRIB_totalNumber": 0, + "GRIB_typeOfLevel": "isobaricInhPa", + "GRIB_units": "%", + "_eopf_attrs": { + "coordinates": [ + "number", + "time", + "step", + "surface", + "latitude", + "longitude", + "valid_time", + "isobaricInhPa" + ], + "dimensions": [ + "latitude", + "longitude" + ], + "units": "%" + }, + "long_name": "Relative humidity", + "standard_name": "relative_humidity", + "units": "%", + "coordinates": 
"isobaricInhPa number step surface time valid_time", + "_FillValue": "AAAAAAAA+H8=" + }, + "dimension_names": [ + "latitude", + "longitude" + ], + "zarr_format": 3, + "node_type": "array", + "storage_transformers": [] + }, + "conditions/meteorology/ecmwf/isobaricInhPa": { + "shape": [], + "data_type": "float64", + "chunk_grid": { + "name": "regular", + "configuration": { + "chunk_shape": [] + } + }, + "chunk_key_encoding": { + "name": "default", + "configuration": { + "separator": "/" + } + }, + "fill_value": 0.0, + "codecs": [ + { + "name": "bytes", + "configuration": { + "endian": "little" + } + } + ], + "attributes": { + "long_name": "pressure", + "positive": "down", + "standard_name": "air_pressure", + "stored_direction": "decreasing", + "units": "hPa", + "_FillValue": "AAAAAAAA+H8=" + }, + "zarr_format": 3, + "node_type": "array", + "storage_transformers": [] + }, + "conditions/meteorology/ecmwf/tcwv": { + "shape": [ + 9, + 9 + ], + "data_type": "float32", + "chunk_grid": { + "name": "regular", + "configuration": { + "chunk_shape": [ + 9, + 9 + ] + } + }, + "chunk_key_encoding": { + "name": "default", + "configuration": { + "separator": "/" + } + }, + "fill_value": 0.0, + "codecs": [ + { + "name": "bytes", + "configuration": { + "endian": "little" + } + }, + { + "name": "blosc", + "configuration": { + "typesize": 4, + "cname": "zstd", + "clevel": 3, + "shuffle": "shuffle", + "blocksize": 0 + } + } + ], + "attributes": { + "GRIB_NV": 0, + "GRIB_Nx": 9, + "GRIB_Ny": 9, + "GRIB_cfName": "lwe_thickness_of_atmosphere_mass_content_of_water_vapor", + "GRIB_cfVarName": "tcwv", + "GRIB_dataType": "fc", + "GRIB_gridDefinitionDescription": "Latitude/Longitude Grid", + "GRIB_gridType": "regular_ll", + "GRIB_iDirectionIncrementInDegrees": 0.177, + "GRIB_iScansNegatively": 0, + "GRIB_jDirectionIncrementInDegrees": 0.121, + "GRIB_jPointsAreConsecutive": 0, + "GRIB_jScansPositively": 0, + "GRIB_latitudeOfFirstGridPointInDegrees": 45.126, + 
"GRIB_latitudeOfLastGridPointInDegrees": 44.16, + "GRIB_longitudeOfFirstGridPointInDegrees": 6.457, + "GRIB_longitudeOfLastGridPointInDegrees": 7.872, + "GRIB_missingValue": 3.4028234663852886e+38, + "GRIB_name": "Total column vertically-integrated water vapour", + "GRIB_numberOfPoints": 81, + "GRIB_paramId": 137, + "GRIB_shortName": "tcwv", + "GRIB_stepType": "instant", + "GRIB_stepUnits": 0, + "GRIB_totalNumber": 0, + "GRIB_typeOfLevel": "surface", + "GRIB_units": "kg m**-2", + "_eopf_attrs": { + "coordinates": [ + "number", + "time", + "step", + "surface", + "latitude", + "longitude", + "valid_time", + "isobaricInhPa" + ], + "dimensions": [ + "latitude", + "longitude" + ], + "units": "kg m**-2" + }, + "long_name": "Total column vertically-integrated water vapour", + "standard_name": "lwe_thickness_of_atmosphere_mass_content_of_water_vapor", + "units": "kg m**-2", + "coordinates": "isobaricInhPa number step surface time valid_time", + "_FillValue": "AAAAAAAA+H8=" + }, + "dimension_names": [ + "latitude", + "longitude" + ], + "zarr_format": 3, + "node_type": "array", + "storage_transformers": [] + }, + "conditions/meteorology/ecmwf/u10": { + "shape": [ + 9, + 9 + ], + "data_type": "float32", + "chunk_grid": { + "name": "regular", + "configuration": { + "chunk_shape": [ + 9, + 9 + ] + } + }, + "chunk_key_encoding": { + "name": "default", + "configuration": { + "separator": "/" + } + }, + "fill_value": 0.0, + "codecs": [ + { + "name": "bytes", + "configuration": { + "endian": "little" + } + }, + { + "name": "blosc", + "configuration": { + "typesize": 4, + "cname": "zstd", + "clevel": 3, + "shuffle": "shuffle", + "blocksize": 0 + } + } + ], + "attributes": { + "GRIB_NV": 0, + "GRIB_Nx": 9, + "GRIB_Ny": 9, + "GRIB_cfName": "unknown", + "GRIB_cfVarName": "u10", + "GRIB_dataType": "fc", + "GRIB_gridDefinitionDescription": "Latitude/Longitude Grid", + "GRIB_gridType": "regular_ll", + "GRIB_iDirectionIncrementInDegrees": 0.177, + "GRIB_iScansNegatively": 0, + 
"GRIB_jDirectionIncrementInDegrees": 0.121, + "GRIB_jPointsAreConsecutive": 0, + "GRIB_jScansPositively": 0, + "GRIB_latitudeOfFirstGridPointInDegrees": 45.126, + "GRIB_latitudeOfLastGridPointInDegrees": 44.16, + "GRIB_longitudeOfFirstGridPointInDegrees": 6.457, + "GRIB_longitudeOfLastGridPointInDegrees": 7.872, + "GRIB_missingValue": 3.4028234663852886e+38, + "GRIB_name": "10 metre U wind component", + "GRIB_numberOfPoints": 81, + "GRIB_paramId": 165, + "GRIB_shortName": "10u", + "GRIB_stepType": "instant", + "GRIB_stepUnits": 0, + "GRIB_totalNumber": 0, + "GRIB_typeOfLevel": "surface", + "GRIB_units": "m s**-1", + "_eopf_attrs": { + "coordinates": [ + "number", + "time", + "step", + "surface", + "latitude", + "longitude", + "valid_time", + "isobaricInhPa" + ], + "dimensions": [ + "latitude", + "longitude" + ], + "units": "m s**-1" + }, + "long_name": "10 metre U wind component", + "standard_name": "unknown", + "units": "m s**-1", + "coordinates": "isobaricInhPa number step surface time valid_time", + "_FillValue": "AAAAAAAA+H8=" + }, + "dimension_names": [ + "latitude", + "longitude" + ], + "zarr_format": 3, + "node_type": "array", + "storage_transformers": [] + }, + "conditions/meteorology/ecmwf/time": { + "shape": [], + "data_type": "int64", + "chunk_grid": { + "name": "regular", + "configuration": { + "chunk_shape": [] + } + }, + "chunk_key_encoding": { + "name": "default", + "configuration": { + "separator": "/" + } + }, + "fill_value": 0, + "codecs": [ + { + "name": "bytes", + "configuration": { + "endian": "little" + } + } + ], + "attributes": { + "_eopf_attrs": { + "_eopf_decode_datetime64": "datetime64[ns]" + }, + "long_name": "initial time of forecast", + "standard_name": "forecast_reference_time", + "units": "days since 2025-01-13 00:00:00", + "calendar": "proleptic_gregorian" + }, + "zarr_format": 3, + "node_type": "array", + "storage_transformers": [] + }, + "conditions/meteorology/ecmwf/valid_time": { + "shape": [], + "data_type": "int64", + 
"chunk_grid": { + "name": "regular", + "configuration": { + "chunk_shape": [] + } + }, + "chunk_key_encoding": { + "name": "default", + "configuration": { + "separator": "/" + } + }, + "fill_value": 0, + "codecs": [ + { + "name": "bytes", + "configuration": { + "endian": "little" + } + } + ], + "attributes": { + "_eopf_attrs": { + "_eopf_decode_datetime64": "datetime64[ns]" + }, + "long_name": "time", + "standard_name": "time", + "units": "days since 2025-01-13 10:33:00", + "calendar": "proleptic_gregorian" + }, + "zarr_format": 3, + "node_type": "array", + "storage_transformers": [] + }, + "conditions/meteorology/ecmwf/tco3": { + "shape": [ + 9, + 9 + ], + "data_type": "float32", + "chunk_grid": { + "name": "regular", + "configuration": { + "chunk_shape": [ + 9, + 9 + ] + } + }, + "chunk_key_encoding": { + "name": "default", + "configuration": { + "separator": "/" + } + }, + "fill_value": 0.0, + "codecs": [ + { + "name": "bytes", + "configuration": { + "endian": "little" + } + }, + { + "name": "blosc", + "configuration": { + "typesize": 4, + "cname": "zstd", + "clevel": 3, + "shuffle": "shuffle", + "blocksize": 0 + } + } + ], + "attributes": { + "GRIB_NV": 0, + "GRIB_Nx": 9, + "GRIB_Ny": 9, + "GRIB_cfName": "atmosphere_mass_content_of_ozone", + "GRIB_cfVarName": "tco3", + "GRIB_dataType": "fc", + "GRIB_gridDefinitionDescription": "Latitude/Longitude Grid", + "GRIB_gridType": "regular_ll", + "GRIB_iDirectionIncrementInDegrees": 0.177, + "GRIB_iScansNegatively": 0, + "GRIB_jDirectionIncrementInDegrees": 0.121, + "GRIB_jPointsAreConsecutive": 0, + "GRIB_jScansPositively": 0, + "GRIB_latitudeOfFirstGridPointInDegrees": 45.126, + "GRIB_latitudeOfLastGridPointInDegrees": 44.16, + "GRIB_longitudeOfFirstGridPointInDegrees": 6.457, + "GRIB_longitudeOfLastGridPointInDegrees": 7.872, + "GRIB_missingValue": 3.4028234663852886e+38, + "GRIB_name": "Total column ozone", + "GRIB_numberOfPoints": 81, + "GRIB_paramId": 206, + "GRIB_shortName": "tco3", + "GRIB_stepType": "instant", 
+ "GRIB_stepUnits": 0, + "GRIB_totalNumber": 0, + "GRIB_typeOfLevel": "surface", + "GRIB_units": "kg m**-2", + "_eopf_attrs": { + "coordinates": [ + "number", + "time", + "step", + "surface", + "latitude", + "longitude", + "valid_time", + "isobaricInhPa" + ], + "dimensions": [ + "latitude", + "longitude" + ], + "units": "kg m**-2" + }, + "long_name": "Total column ozone", + "standard_name": "atmosphere_mass_content_of_ozone", + "units": "kg m**-2", + "coordinates": "isobaricInhPa number step surface time valid_time", + "_FillValue": "AAAAAAAA+H8=" + }, + "dimension_names": [ + "latitude", + "longitude" + ], + "zarr_format": 3, + "node_type": "array", + "storage_transformers": [] + }, + "conditions/meteorology/ecmwf/msl": { + "shape": [ + 9, + 9 + ], + "data_type": "float32", + "chunk_grid": { + "name": "regular", + "configuration": { + "chunk_shape": [ + 9, + 9 + ] + } + }, + "chunk_key_encoding": { + "name": "default", + "configuration": { + "separator": "/" + } + }, + "fill_value": 0.0, + "codecs": [ + { + "name": "bytes", + "configuration": { + "endian": "little" + } + }, + { + "name": "blosc", + "configuration": { + "typesize": 4, + "cname": "zstd", + "clevel": 3, + "shuffle": "shuffle", + "blocksize": 0 + } + } + ], + "attributes": { + "GRIB_NV": 0, + "GRIB_Nx": 9, + "GRIB_Ny": 9, + "GRIB_cfName": "air_pressure_at_mean_sea_level", + "GRIB_cfVarName": "msl", + "GRIB_dataType": "fc", + "GRIB_gridDefinitionDescription": "Latitude/Longitude Grid", + "GRIB_gridType": "regular_ll", + "GRIB_iDirectionIncrementInDegrees": 0.177, + "GRIB_iScansNegatively": 0, + "GRIB_jDirectionIncrementInDegrees": 0.121, + "GRIB_jPointsAreConsecutive": 0, + "GRIB_jScansPositively": 0, + "GRIB_latitudeOfFirstGridPointInDegrees": 45.126, + "GRIB_latitudeOfLastGridPointInDegrees": 44.16, + "GRIB_longitudeOfFirstGridPointInDegrees": 6.457, + "GRIB_longitudeOfLastGridPointInDegrees": 7.872, + "GRIB_missingValue": 3.4028234663852886e+38, + "GRIB_name": "Mean sea level pressure", + 
"GRIB_numberOfPoints": 81, + "GRIB_paramId": 151, + "GRIB_shortName": "msl", + "GRIB_stepType": "instant", + "GRIB_stepUnits": 0, + "GRIB_totalNumber": 0, + "GRIB_typeOfLevel": "surface", + "GRIB_units": "Pa", + "_eopf_attrs": { + "coordinates": [ + "number", + "time", + "step", + "surface", + "latitude", + "longitude", + "valid_time", + "isobaricInhPa" + ], + "dimensions": [ + "latitude", + "longitude" + ], + "units": "Pa" + }, + "long_name": "Mean sea level pressure", + "standard_name": "air_pressure_at_mean_sea_level", + "units": "Pa", + "coordinates": "isobaricInhPa number step surface time valid_time", + "_FillValue": "AAAAAAAA+H8=" + }, + "dimension_names": [ + "latitude", + "longitude" + ], + "zarr_format": 3, + "node_type": "array", + "storage_transformers": [] + }, + "conditions/meteorology/ecmwf/longitude": { + "shape": [ + 9 + ], + "data_type": "float64", + "chunk_grid": { + "name": "regular", + "configuration": { + "chunk_shape": [ + 9 + ] + } + }, + "chunk_key_encoding": { + "name": "default", + "configuration": { + "separator": "/" + } + }, + "fill_value": 0.0, + "codecs": [ + { + "name": "bytes", + "configuration": { + "endian": "little" + } + } + ], + "attributes": { + "long_name": "longitude", + "standard_name": "longitude", + "units": "degrees_east", + "_FillValue": "AAAAAAAA+H8=" + }, + "dimension_names": [ + "longitude" + ], + "zarr_format": 3, + "node_type": "array", + "storage_transformers": [] + }, + "conditions/mask": { + "attributes": {}, + "zarr_format": 3, + "consolidated_metadata": { + "kind": "inline", + "must_understand": false, + "metadata": {} + }, + "node_type": "group" + }, + "conditions/mask/l1c_classification": { + "attributes": {}, + "zarr_format": 3, + "consolidated_metadata": { + "kind": "inline", + "must_understand": false, + "metadata": {} + }, + "node_type": "group" + }, + "conditions/mask/l1c_classification/r60m": { + "attributes": {}, + "zarr_format": 3, + "consolidated_metadata": { + "kind": "inline", + 
"must_understand": false, + "metadata": {} + }, + "node_type": "group" + }, + "conditions/mask/l1c_classification/r60m/b00": { + "shape": [ + 1830, + 1830 + ], + "data_type": "uint8", + "chunk_grid": { + "name": "regular", + "configuration": { + "chunk_shape": [ + 1830, + 1830 + ] + } + }, + "chunk_key_encoding": { + "name": "default", + "configuration": { + "separator": "/" + } + }, + "fill_value": 0, + "codecs": [ + { + "name": "bytes" + }, + { + "name": "blosc", + "configuration": { + "typesize": 1, + "cname": "zstd", + "clevel": 3, + "shuffle": "shuffle", + "blocksize": 0 + } + } + ], + "attributes": { + "_eopf_attrs": { + "coordinates": [ + "x", + "y" + ], + "dimensions": [ + "y", + "x" + ], + "flag_masks": [ + 1, + 2, + 4 + ], + "flag_meanings": [ + "OPAQUE", + "CIRRUS", + "SNOW_ICE" + ] + }, + "dtype": " None: """ @@ -177,3 +180,17 @@ def test_invalid_coordinates(example: dict[str, ArraySpec[Any] | GroupSpec[Any, group = GroupSpec[Any, DataArray | CoordArray].from_flat(example) with pytest.raises(ValueError): check_valid_coordinates(group) + + +def test_round_trip() -> None: + from pydantic_zarr.core import tuplify_json + from pydantic_zarr.v3 import GroupSpec + from zarr.core.buffer import default_buffer_prototype + + from eopf_geozarr.data_api.geozarr.common import Multiscales + + source_store = {"zarr.json": default_buffer_prototype().buffer.from_bytes(example_zarr_json)} + source_untyped = GroupSpec.from_zarr(zarr.open_group(source_store, mode="r")) + flat = source_untyped.to_flat() + meta = flat["/measurements/reflectance/r60m"].attributes["multiscales"] + assert Multiscales(**meta).model_dump() == tuplify_json(meta) From 2088e33713b7adaddc9738bd868af52b64e2bf5e Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Mon, 11 Aug 2025 18:47:25 +0200 Subject: [PATCH 08/25] refactor test layout --- .../tests/test_data_api/test_common.py | 67 ++++++ .../tests/test_data_api/test_geozarr.py | 190 +----------------- eopf_geozarr/tests/test_data_api/test_v2.py 
| 124 ++++++++++++ eopf_geozarr/tests/test_data_api/test_v3.py | 0 4 files changed, 193 insertions(+), 188 deletions(-) create mode 100644 eopf_geozarr/tests/test_data_api/test_common.py create mode 100644 eopf_geozarr/tests/test_data_api/test_v2.py create mode 100644 eopf_geozarr/tests/test_data_api/test_v3.py diff --git a/eopf_geozarr/tests/test_data_api/test_common.py b/eopf_geozarr/tests/test_data_api/test_common.py new file mode 100644 index 0000000..74de043 --- /dev/null +++ b/eopf_geozarr/tests/test_data_api/test_common.py @@ -0,0 +1,67 @@ +from __future__ import annotations +import re + +import zarr +import pytest +from pydantic_zarr.core import tuplify_json +from pydantic_zarr.v3 import GroupSpec +from zarr.core.buffer import default_buffer_prototype +from eopf_geozarr.data_api.geozarr.common import CF_STANDARD_NAME_URL, CoordArrayAttrs, check_standard_name, get_cf_standard_names +from eopf_geozarr.tests.test_data_api.conftest import example_zarr_json + + +def test_get_cf_standard_names() -> None: + """ + Test the get_cf_standard_names function to ensure it retrieves the CF standard names correctly. + """ + standard_names = get_cf_standard_names(CF_STANDARD_NAME_URL) + assert isinstance(standard_names, tuple) + assert len(standard_names) > 0 + assert all(isinstance(name, str) for name in standard_names) + + +@pytest.mark.parametrize( + "name", ["air_temperature", "sea_surface_temperature", "precipitation_flux"] +) +def test_check_standard_name_valid(name: str) -> None: + """ + Test the check_standard_name function with valid standard names. + """ + assert check_standard_name(name) == name + + +def test_check_standard_name_invalid() -> None: + """ + Test the check_standard_name function with an invalid standard name. + """ + with pytest.raises(ValueError): + check_standard_name("invalid_standard_name") + + +def test_coord_array_attrs_dimensions_length() -> None: + """ + Test that the array_dimensions attribute must have length 1. 
+ """ + msg = ( + "1 validation error for CoordArrayAttrs\n_ARRAY_DIMENSIONS\n " + " Tuple should have at most 1 item after validation, not 2" + ) + with pytest.raises(ValueError, match=re.escape(msg)): + CoordArrayAttrs( + _ARRAY_DIMENSIONS=("time", "lat"), + standard_name="air_temperature", + units="mm", + axis="Y", + ) + + +def test_multiscales_round_trip() -> None: + + + from eopf_geozarr.data_api.geozarr.common import Multiscales + + source_store = {"zarr.json": default_buffer_prototype().buffer.from_bytes(example_zarr_json)} + source_untyped = GroupSpec.from_zarr(zarr.open_group(source_store, mode="r")) + flat = source_untyped.to_flat() + meta = flat["/measurements/reflectance/r60m"].attributes["multiscales"] + assert Multiscales(**meta).model_dump() == tuplify_json(meta) \ No newline at end of file diff --git a/eopf_geozarr/tests/test_data_api/test_geozarr.py b/eopf_geozarr/tests/test_data_api/test_geozarr.py index a6a9aa2..b6aeec5 100644 --- a/eopf_geozarr/tests/test_data_api/test_geozarr.py +++ b/eopf_geozarr/tests/test_data_api/test_geozarr.py @@ -1,196 +1,10 @@ from __future__ import annotations -import re -from typing import Any -import pytest -import zarr -from pydantic_zarr.v2 import ArraySpec, GroupSpec -from eopf_geozarr.data_api.geozarr.common import ( - CF_STANDARD_NAME_URL, - check_standard_name, - get_cf_standard_names, -) -from eopf_geozarr.data_api.geozarr.v2 import ( - CoordArray, - CoordArrayAttrs, - DataArray, - DataArrayAttrs, - check_valid_coordinates, -) -from .conftest import example_zarr_json -def test_get_cf_standard_names() -> None: - """ - Test the get_cf_standard_names function to ensure it retrieves the CF standard names correctly. 
- """ - standard_names = get_cf_standard_names(CF_STANDARD_NAME_URL) - assert isinstance(standard_names, tuple) - assert len(standard_names) > 0 - assert all(isinstance(name, str) for name in standard_names) - -@pytest.mark.parametrize( - "name", ["air_temperature", "sea_surface_temperature", "precipitation_flux"] -) -def test_check_standard_name_valid(name: str) -> None: - """ - Test the check_standard_name function with valid standard names. - """ - assert check_standard_name(name) == name - - -def test_check_standard_name_invalid() -> None: - """ - Test the check_standard_name function with an invalid standard name. - """ - with pytest.raises(ValueError): - check_standard_name("invalid_standard_name") - - -def test_coord_array_attrs_dimensions_length() -> None: - """ - Test that the array_dimensions attribute must have length 1. - """ - msg = ( - "1 validation error for CoordArrayAttrs\n_ARRAY_DIMENSIONS\n " - " Tuple should have at most 1 item after validation, not 2" - ) - with pytest.raises(ValueError, match=re.escape(msg)): - CoordArrayAttrs( - _ARRAY_DIMENSIONS=("time", "lat"), - standard_name="air_temperature", - units="mm", - axis="Y", - ) - - -def test_coord_array_dimensionality() -> None: - """ - Test that only 1-dimensional arrays are allowed. 
- """ - msg = ( - "1 validation error for CoordArray\nshape\n " - "Tuple should have at most 1 item after validation, not 2" - ) - with pytest.raises(ValueError, match=re.escape(msg)): - CoordArray( - shape=(10, 11), - dtype="|u1", - chunks=(10, 11), - attributes=CoordArrayAttrs( - _ARRAY_DIMENSIONS=("time",), - standard_name="air_temperature", - units="s", - axis="Y", - ), - ) - - -class TestCheckValidCoordinates: - @pytest.mark.parametrize( - "example", - [ - { - "": GroupSpec(attributes={}, members=None), - "/data_var": DataArray( - shape=(10, 11), - dtype="|u1", - chunks=(10, 11), - attributes=DataArrayAttrs( - array_dimensions=["time", "lat"], - standard_name="air_temperature", - grid_mapping=None, - grid_mapping_name="latitude_longitude", - ), - ), - "/time": CoordArray( - shape=(10,), - dtype="|u1", - chunks=(10,), - attributes=CoordArrayAttrs( - array_dimensions=["time"], standard_name="time", units="s", axis="T" - ), - ), - "/lat": CoordArray( - shape=(11,), - dtype="|u1", - chunks=(11,), - attributes=CoordArrayAttrs( - array_dimensions=["lat"], standard_name="latitude", units="m", axis="Y" - ), - ), - }, - ], - ) - @staticmethod - def test_valid(example: dict[str, ArraySpec[Any] | GroupSpec[Any, Any]]) -> None: - """ - Test the check_valid_coordinates function to ensure it validates coordinates correctly. 
- """ - group = GroupSpec.from_flat(example) - assert check_valid_coordinates(group) == group - - @pytest.mark.parametrize( - "example", - [ - { - "": GroupSpec(attributes={}, members=None), - "/data_var": DataArray( - shape=(9, 10), - dtype="|u1", - chunks=(10, 11), - attributes=DataArrayAttrs( - _ARRAY_DIMENSIONS=["time", "lat"], - standard_name="air_temperature", - grid_mapping=None, - grid_mapping_name="latitude_longitude", - ), - ), - "/time": CoordArray( - shape=(10,), - dtype="|u1", - chunks=(10,), - attributes=CoordArrayAttrs( - _ARRAY_DIMENSIONS=["time"], standard_name="time", units="s", axis="T" - ), - ), - "/lat": CoordArray( - shape=(11,), - dtype="|u1", - chunks=(11,), - attributes=CoordArrayAttrs( - _ARRAY_DIMENSIONS=["lat"], standard_name="latitude", units="m", axis="Y" - ), - ), - }, - ], - ) - @staticmethod - def test_invalid_coordinates(example: dict[str, ArraySpec[Any] | GroupSpec[Any, Any]]) -> None: - """ - Test the check_valid_coordinates function to ensure it validates coordinates correctly. - - This test checks that the function raises a ValueError when the dimensions of the data variable - do not match the dimensions of the coordinate arrays. 
- """ - group = GroupSpec[Any, DataArray | CoordArray].from_flat(example) - with pytest.raises(ValueError): - check_valid_coordinates(group) - - -def test_round_trip() -> None: - from pydantic_zarr.core import tuplify_json - from pydantic_zarr.v3 import GroupSpec - from zarr.core.buffer import default_buffer_prototype - - from eopf_geozarr.data_api.geozarr.common import Multiscales - - source_store = {"zarr.json": default_buffer_prototype().buffer.from_bytes(example_zarr_json)} - source_untyped = GroupSpec.from_zarr(zarr.open_group(source_store, mode="r")) - flat = source_untyped.to_flat() - meta = flat["/measurements/reflectance/r60m"].attributes["multiscales"] - assert Multiscales(**meta).model_dump() == tuplify_json(meta) +def test_dataarray_round_trip() -> None: + pass \ No newline at end of file diff --git a/eopf_geozarr/tests/test_data_api/test_v2.py b/eopf_geozarr/tests/test_data_api/test_v2.py new file mode 100644 index 0000000..19a799f --- /dev/null +++ b/eopf_geozarr/tests/test_data_api/test_v2.py @@ -0,0 +1,124 @@ +from __future__ import annotations +import pytest +from pydantic_zarr.v2 import ArraySpec, GroupSpec +from typing import Any +from eopf_geozarr.data_api.geozarr.common import CoordArrayAttrs, DataArrayAttrs + + +import re + +from eopf_geozarr.data_api.geozarr.v2 import CoordArray, DataArray, check_valid_coordinates + + +def test_coord_array_dimensionality() -> None: + """ + Test that only 1-dimensional arrays are allowed. 
+ """ + msg = ( + "1 validation error for CoordArray\nshape\n " + "Tuple should have at most 1 item after validation, not 2" + ) + with pytest.raises(ValueError, match=re.escape(msg)): + CoordArray( + shape=(10, 11), + dtype="|u1", + chunks=(10, 11), + attributes=CoordArrayAttrs( + _ARRAY_DIMENSIONS=("time",), + standard_name="air_temperature", + units="s", + axis="Y", + ), + ) + + +class TestCheckValidCoordinates: + @pytest.mark.parametrize( + "example", + [ + { + "": GroupSpec(attributes={}, members=None), + "/data_var": DataArray( + shape=(10, 11), + dtype="|u1", + chunks=(10, 11), + attributes=DataArrayAttrs( + array_dimensions=["time", "lat"], + standard_name="air_temperature", + grid_mapping=None, + grid_mapping_name="latitude_longitude", + ), + ), + "/time": CoordArray( + shape=(10,), + dtype="|u1", + chunks=(10,), + attributes=CoordArrayAttrs( + array_dimensions=["time"], standard_name="time", units="s", axis="T" + ), + ), + "/lat": CoordArray( + shape=(11,), + dtype="|u1", + chunks=(11,), + attributes=CoordArrayAttrs( + array_dimensions=["lat"], standard_name="latitude", units="m", axis="Y" + ), + ), + }, + ], + ) + @staticmethod + def test_valid(example: dict[str, ArraySpec[Any] | GroupSpec[Any, Any]]) -> None: + """ + Test the check_valid_coordinates function to ensure it validates coordinates correctly. 
+ """ + group = GroupSpec.from_flat(example) + assert check_valid_coordinates(group) == group + + @pytest.mark.parametrize( + "example", + [ + { + "": GroupSpec(attributes={}, members=None), + "/data_var": DataArray( + shape=(9, 10), + dtype="|u1", + chunks=(10, 11), + attributes=DataArrayAttrs( + _ARRAY_DIMENSIONS=["time", "lat"], + standard_name="air_temperature", + grid_mapping=None, + grid_mapping_name="latitude_longitude", + ), + ), + "/time": CoordArray( + shape=(10,), + dtype="|u1", + chunks=(10,), + attributes=CoordArrayAttrs( + _ARRAY_DIMENSIONS=["time"], standard_name="time", units="s", axis="T" + ), + ), + "/lat": CoordArray( + shape=(11,), + dtype="|u1", + chunks=(11,), + attributes=CoordArrayAttrs( + _ARRAY_DIMENSIONS=["lat"], standard_name="latitude", units="m", axis="Y" + ), + ), + }, + ], + ) + @staticmethod + def test_invalid_coordinates(example: dict[str, ArraySpec[Any] | GroupSpec[Any, Any]]) -> None: + """ + Test the check_valid_coordinates function to ensure it validates coordinates correctly. + + This test checks that the function raises a ValueError when the dimensions of the data variable + do not match the dimensions of the coordinate arrays. 
+ """ + group = GroupSpec[Any, DataArray | CoordArray].from_flat(example) + with pytest.raises(ValueError): + check_valid_coordinates(group) \ No newline at end of file diff --git a/eopf_geozarr/tests/test_data_api/test_v3.py b/eopf_geozarr/tests/test_data_api/test_v3.py new file mode 100644 index 0000000..e69de29 From cfcf7e10ee5a697b45eb206825b5b274b124f45a Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Tue, 12 Aug 2025 12:23:28 +0200 Subject: [PATCH 09/25] refactor v2 and v3 data structures --- eopf_geozarr/data_api/geozarr/common.py | 57 ++++++++++++-- eopf_geozarr/data_api/geozarr/v2.py | 65 ++++------------ eopf_geozarr/data_api/geozarr/v3.py | 78 +++++++++++++++++++ .../tests/test_data_api/test_common.py | 3 +- eopf_geozarr/tests/test_data_api/test_v2.py | 5 +- 5 files changed, 152 insertions(+), 56 deletions(-) diff --git a/eopf_geozarr/data_api/geozarr/common.py b/eopf_geozarr/data_api/geozarr/common.py index dcebf1b..2ad2855 100644 --- a/eopf_geozarr/data_api/geozarr/common.py +++ b/eopf_geozarr/data_api/geozarr/common.py @@ -2,10 +2,12 @@ import io import urllib import urllib.request -from typing import Literal, Mapping, TypeAlias, TypeVar - +from typing import Annotated, Literal, Mapping, TypeAlias, TypeVar +import pydantic_zarr +import pydantic_zarr.v2 +import pydantic_zarr.v3 from cf_xarray.utils import parse_cf_standard_name_table -from pydantic import BaseModel +from pydantic import AfterValidator, BaseModel, ConfigDict, Field def get_cf_standard_names(url: str) -> tuple[str, ...]: @@ -25,6 +27,11 @@ def get_cf_standard_names(url: str) -> tuple[str, ...]: _info, table, _aliases = parse_cf_standard_name_table(source=content_fobj) return tuple(table.keys()) +def get_array_dimensions(array: pydantic_zarr.v2.ArraySpec | pydantic_zarr.v3.ArraySpec) -> tuple[str, ...] 
| None: + if isinstance(array, pydantic_zarr.v2.ArraySpec): + return array.model_dump()["attributes"].get("array_dimensions") + else: + return array.model_dump()["dimension_names"] # This is a URL to the CF standard names table. CF_STANDARD_NAME_URL = ( @@ -32,7 +39,6 @@ def get_cf_standard_names(url: str) -> tuple[str, ...]: "master/Data/cf-standard-names/current/src/cf-standard-name-table.xml" ) - # this does IO against github. consider locally storing this data instead if fetching every time # is problematic. CF_STANDARD_NAMES = get_cf_standard_names(url=CF_STANDARD_NAME_URL) @@ -118,7 +124,7 @@ class TileMatrixSet(BaseModel): class Multiscales(BaseModel): """ - Attributes for a GeoZarr multiscale dataset. + Multiscale metadata for a GeoZarr dataset. Attributes ---------- @@ -135,3 +141,44 @@ class Multiscales(BaseModel): # TODO: ensure that the keys match tile_matrix_set.tileMatrices[$index].id # TODO: ensure that the keys match the tileMatrix attribute tile_matrix_limits: dict[str, TileMatrixLimit] | None = None + + +CFStandardName = Annotated[str, AfterValidator(check_standard_name)] + + +class DatasetAttrs(BaseModel): + """ + Attributes for a GeoZarr dataset. + + Attributes + ---------- + multiscales: MultiscaleAttrs + """ + + multiscales: Multiscales + + +class BaseDataArrayAttrs(BaseModel): + """ + Base attributes for a GeoZarr DataArray. + + Attributes + ---------- + standard_name : str + The CF standard name of the variable. + grid_mapping : object + The grid mapping of the variable, which is a reference to a grid mapping variable that + describes the spatial reference of the variable. + grid_mapping_name : str + The name of the grid mapping, which is a string that describes the type of grid mapping + used for the variable. 
+ """ + + # todo: validate that this names listed here are the names of zarr arrays + # unless the variable is an auxiliary variable + # see https://github.com/zarr-developers/geozarr-spec/blob/main/geozarr-spec.md#geozarr-coordinates + array_dimensions: tuple[str, ...] = Field(alias="_ARRAY_DIMENSIONS") + standard_name: CFStandardName + grid_mapping: object + grid_mapping_name: str + diff --git a/eopf_geozarr/data_api/geozarr/v2.py b/eopf_geozarr/data_api/geozarr/v2.py index df0caf2..2099731 100644 --- a/eopf_geozarr/data_api/geozarr/v2.py +++ b/eopf_geozarr/data_api/geozarr/v2.py @@ -1,9 +1,9 @@ """GeoZarr data API for Zarr V2.""" from __future__ import annotations -from typing import Annotated, Any, Literal, Self +from typing import Any, Literal, Self -from pydantic import AfterValidator, BaseModel, ConfigDict, Field, model_serializer, model_validator +from pydantic import BaseModel, ConfigDict, Field, model_serializer, model_validator from pydantic_zarr.v2 import ( AnyArraySpec, AnyGroupSpec, @@ -14,9 +14,7 @@ from_flat_group, ) -from eopf_geozarr.data_api.geozarr.common import Multiscales, check_standard_name - -CFStandardName = Annotated[str, AfterValidator(check_standard_name)] +from eopf_geozarr.data_api.geozarr.common import BaseDataArrayAttrs, CFStandardName, DatasetAttrs, get_array_dimensions class CoordArrayAttrs(BaseModel): @@ -47,7 +45,6 @@ class CoordArrayAttrs(BaseModel): units: str axis: str - class CoordArray(ArraySpec[CoordArrayAttrs]): """ A GeoZarr coordinate array variable. @@ -57,35 +54,19 @@ class CoordArray(ArraySpec[CoordArrayAttrs]): shape: tuple[int] +class DataArrayAttrs(BaseDataArrayAttrs): + """ + DataArrayAttrs for DataArrays using Zarr V2 -class DataArrayAttrs(BaseModel): - """ - Attributes for a GeoZarr DataArray. - - Attributes - ---------- - array_dimensions : tuple[str, ...] - Alias for the _ARRAY_DIMENSIONS attribute, which lists the dimension names for this array. - standard_name : str - The CF standard name of the variable. 
- grid_mapping : object - The grid mapping of the variable, which is a reference to a grid mapping variable that - describes the spatial reference of the variable. - grid_mapping_name : str - The name of the grid mapping, which is a string that describes the type of grid mapping - used for the variable. - """ - - # todo: validate that this names listed here are the names of zarr arrays - # unless the variable is an auxiliary variable - # see https://github.com/zarr-developers/geozarr-spec/blob/main/geozarr-spec.md#geozarr-coordinates - array_dimensions: tuple[str, ...] = Field(alias="_ARRAY_DIMENSIONS") - standard_name: CFStandardName - grid_mapping: object - grid_mapping_name: str - - model_config = ConfigDict(populate_by_name=True, serialize_by_alias=True) - + Attributes + ---------- + array_dimensions : tuple[str, ...] + The dimensions of the array. Aliased from _ARRAY_DIMENSIONS. + """ + # necessary for ensuring that the array_dimensions are serialized as _ARRAY_DIMENSIONS + model_config = ConfigDict(populate_by_name=True, serialize_by_alias=True) + array_dimensions : tuple[str, ...] + class DataArray(ArraySpec[DataArrayAttrs]): """ @@ -97,7 +78,7 @@ class DataArray(ArraySpec[DataArrayAttrs]): """ -def check_valid_coordinates(model: GroupSpec[Any, DataArray | CoordArray]) -> Dataset: +def check_valid_coordinates(model: GroupSpec[Any, Any]) -> Dataset: """ Check if the coordinates of the DataArrays listed in a GeoZarr DataSet are valid. @@ -122,7 +103,7 @@ def check_valid_coordinates(model: GroupSpec[Any, DataArray | CoordArray]) -> Da k: v for k, v in model.members.items() if isinstance(v, DataArray) } for key, array in arrays.items(): - for idx, dim in enumerate(array.attributes.array_dimensions): + for idx, dim in enumerate(get_array_dimensions(array)): # type: ignore[arg-type] if dim not in model.members: raise ValueError( f"Dimension '{dim}' for array '{key}' is not defined in the model members." 
@@ -140,18 +121,6 @@ def check_valid_coordinates(model: GroupSpec[Any, DataArray | CoordArray]) -> Da return model -class DatasetAttrs(BaseModel): - """ - Attributes for a GeoZarr dataset. - - Attributes - ---------- - multiscales: MultiscaleAttrs - """ - - multiscales: Multiscales - - class Dataset(GroupSpec[DatasetAttrs, GroupSpec[Any, Any] | DataArray]): """ A GeoZarr Dataset. diff --git a/eopf_geozarr/data_api/geozarr/v3.py b/eopf_geozarr/data_api/geozarr/v3.py index e69de29..09ee973 100644 --- a/eopf_geozarr/data_api/geozarr/v3.py +++ b/eopf_geozarr/data_api/geozarr/v3.py @@ -0,0 +1,78 @@ +from __future__ import annotations + +from typing import Any, Self +from pydantic import model_validator +from pydantic_zarr.v3 import ArraySpec, GroupSpec + +from eopf_geozarr.data_api.geozarr.common import BaseDataArrayAttrs, DatasetAttrs + +class DataArray(ArraySpec[BaseDataArrayAttrs]): + """ + A GeoZarr DataArray variable. + + References + ---------- + https://github.com/zarr-developers/geozarr-spec/blob/main/geozarr-spec.md#geozarr-dataarray + """ + +def check_valid_coordinates(model: GroupSpec[Any, Any]) -> Dataset: + """ + Check if the coordinates of the DataArrays listed in a GeoZarr DataSet are valid. + + For each DataArray in the model, we check the dimensions associated with the DataArray. + For each dimension associated with a data variable, an array with the name of that data variable + must be present in the members of the group. + + Parameters + ---------- + model : GroupSpec[Any, Any] + The GeoZarr DataArray model to check. + + Returns + ------- + GroupSpec[Any, Any] + The validated GeoZarr DataArray model. 
+ """ + if model.members is None: + raise ValueError("Model members cannot be None") + + arrays: dict[str, DataArray] = { + k: v for k, v in model.members.items() if isinstance(v, DataArray) + } + for key, array in arrays.items(): + for idx, dim in enumerate(get_array_dimensions(array)): # type: ignore[arg-type] + if dim not in model.members: + raise ValueError( + f"Dimension '{dim}' for array '{key}' is not defined in the model members." + ) + member = model.members[dim] + if isinstance(member, GroupSpec): + raise ValueError( + f"Dimension '{dim}' for array '{key}' should be a group. Found an array instead." + ) + if member.shape[0] != array.shape[idx]: + raise ValueError( + f"Dimension '{dim}' for array '{key}' has a shape mismatch: " + f"{member.shape[0]} != {array.shape[idx]}." + ) + return model + +class Dataset(GroupSpec[DatasetAttrs, GroupSpec[Any, Any] | DataArray]): + """ + A GeoZarr Dataset. + """ + + @model_validator(mode="after") + def check_valid_coordinates(self) -> Self: + """ + Validate the coordinates of the GeoZarr DataSet. + + This method checks that all DataArrays in the dataset have valid coordinates + according to the GeoZarr specification. + + Returns + ------- + GroupSpec[Any, Any] + The validated GeoZarr DataSet. 
+ """ + return check_valid_coordinates(self) diff --git a/eopf_geozarr/tests/test_data_api/test_common.py b/eopf_geozarr/tests/test_data_api/test_common.py index 74de043..a081aee 100644 --- a/eopf_geozarr/tests/test_data_api/test_common.py +++ b/eopf_geozarr/tests/test_data_api/test_common.py @@ -2,11 +2,12 @@ import re import zarr +from eopf_geozarr.data_api.geozarr.v2 import CoordArrayAttrs import pytest from pydantic_zarr.core import tuplify_json from pydantic_zarr.v3 import GroupSpec from zarr.core.buffer import default_buffer_prototype -from eopf_geozarr.data_api.geozarr.common import CF_STANDARD_NAME_URL, CoordArrayAttrs, check_standard_name, get_cf_standard_names +from eopf_geozarr.data_api.geozarr.common import CF_STANDARD_NAME_URL, check_standard_name, get_cf_standard_names from eopf_geozarr.tests.test_data_api.conftest import example_zarr_json diff --git a/eopf_geozarr/tests/test_data_api/test_v2.py b/eopf_geozarr/tests/test_data_api/test_v2.py index 19a799f..931bfc5 100644 --- a/eopf_geozarr/tests/test_data_api/test_v2.py +++ b/eopf_geozarr/tests/test_data_api/test_v2.py @@ -1,8 +1,9 @@ from __future__ import annotations +from eopf_geozarr.data_api.geozarr.common import BaseDataArrayAttrs import pytest from pydantic_zarr.v2 import ArraySpec, GroupSpec from typing import Any -from eopf_geozarr.data_api.geozarr.common import CoordArrayAttrs, DataArrayAttrs +from eopf_geozarr.data_api.geozarr.v2 import CoordArrayAttrs, DataArrayAttrs import re @@ -86,7 +87,7 @@ def test_valid(example: dict[str, ArraySpec[Any] | GroupSpec[Any, Any]]) -> None dtype="|u1", chunks=(10, 11), attributes=DataArrayAttrs( - _ARRAY_DIMENSIONS=["time", "lat"], + array_dimensions=["time", "lat"], standard_name="air_temperature", grid_mapping=None, grid_mapping_name="latitude_longitude", From f8c57221bda23b1d659cd518ec1d05711c49ebd3 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Fri, 15 Aug 2025 09:23:49 +0200 Subject: [PATCH 10/25] adapt to src layout, relax cf requirement --- 
.../tests/test_data_api/test_geozarr.py | 10 -- eopf_geozarr/tests/test_data_api/test_v3.py | 0 .../data_api/geozarr/__init__.py | 0 .../eopf_geozarr}/data_api/geozarr/common.py | 45 ++------ .../eopf_geozarr}/data_api/geozarr/v2.py | 89 ++++++--------- .../eopf_geozarr}/data_api/geozarr/v3.py | 42 ++++++- .../tests => tests}/test_data_api/__init__.py | 0 .../tests => tests}/test_data_api/conftest.py | 13 ++- .../test_data_api/test_common.py | 41 +++---- .../tests => tests}/test_data_api/test_v2.py | 100 +++++++++-------- tests/test_data_api/test_v3.py | 104 ++++++++++++++++++ 11 files changed, 266 insertions(+), 178 deletions(-) delete mode 100644 eopf_geozarr/tests/test_data_api/test_geozarr.py delete mode 100644 eopf_geozarr/tests/test_data_api/test_v3.py rename {eopf_geozarr => src/eopf_geozarr}/data_api/geozarr/__init__.py (100%) rename {eopf_geozarr => src/eopf_geozarr}/data_api/geozarr/common.py (71%) rename {eopf_geozarr => src/eopf_geozarr}/data_api/geozarr/v2.py (55%) rename {eopf_geozarr => src/eopf_geozarr}/data_api/geozarr/v3.py (65%) rename {eopf_geozarr/tests => tests}/test_data_api/__init__.py (100%) rename {eopf_geozarr/tests => tests}/test_data_api/conftest.py (99%) rename {eopf_geozarr/tests => tests}/test_data_api/test_common.py (55%) rename {eopf_geozarr/tests => tests}/test_data_api/test_v2.py (54%) create mode 100644 tests/test_data_api/test_v3.py diff --git a/eopf_geozarr/tests/test_data_api/test_geozarr.py b/eopf_geozarr/tests/test_data_api/test_geozarr.py deleted file mode 100644 index b6aeec5..0000000 --- a/eopf_geozarr/tests/test_data_api/test_geozarr.py +++ /dev/null @@ -1,10 +0,0 @@ -from __future__ import annotations - - - - - - - -def test_dataarray_round_trip() -> None: - pass \ No newline at end of file diff --git a/eopf_geozarr/tests/test_data_api/test_v3.py b/eopf_geozarr/tests/test_data_api/test_v3.py deleted file mode 100644 index e69de29..0000000 diff --git a/eopf_geozarr/data_api/geozarr/__init__.py 
b/src/eopf_geozarr/data_api/geozarr/__init__.py similarity index 100% rename from eopf_geozarr/data_api/geozarr/__init__.py rename to src/eopf_geozarr/data_api/geozarr/__init__.py diff --git a/eopf_geozarr/data_api/geozarr/common.py b/src/eopf_geozarr/data_api/geozarr/common.py similarity index 71% rename from eopf_geozarr/data_api/geozarr/common.py rename to src/eopf_geozarr/data_api/geozarr/common.py index 2ad2855..88f4a41 100644 --- a/eopf_geozarr/data_api/geozarr/common.py +++ b/src/eopf_geozarr/data_api/geozarr/common.py @@ -1,13 +1,14 @@ """Common utilities for GeoZarr data API.""" + import io import urllib import urllib.request -from typing import Annotated, Literal, Mapping, TypeAlias, TypeVar -import pydantic_zarr -import pydantic_zarr.v2 -import pydantic_zarr.v3 +from typing import Annotated, Final, Literal + from cf_xarray.utils import parse_cf_standard_name_table -from pydantic import AfterValidator, BaseModel, ConfigDict, Field +from pydantic import AfterValidator, BaseModel + +xarray_dims_key: Final = "_ARRAY_DIMENSIONS" def get_cf_standard_names(url: str) -> tuple[str, ...]: @@ -27,11 +28,6 @@ def get_cf_standard_names(url: str) -> tuple[str, ...]: _info, table, _aliases = parse_cf_standard_name_table(source=content_fobj) return tuple(table.keys()) -def get_array_dimensions(array: pydantic_zarr.v2.ArraySpec | pydantic_zarr.v3.ArraySpec) -> tuple[str, ...] | None: - if isinstance(array, pydantic_zarr.v2.ArraySpec): - return array.model_dump()["attributes"].get("array_dimensions") - else: - return array.model_dump()["dimension_names"] # This is a URL to the CF standard names table. CF_STANDARD_NAME_URL = ( @@ -71,7 +67,8 @@ def check_standard_name(name: str) -> str: ) -# todo: narrow to literal type +CFStandardName = Annotated[str, AfterValidator(check_standard_name)] + ResamplingMethod = Literal[ "nearest", "average", @@ -122,7 +119,7 @@ class TileMatrixSet(BaseModel): tileMatrices: tuple[TileMatrix, ...] 
-class Multiscales(BaseModel): +class Multiscales(BaseModel, extra="allow"): """ Multiscale metadata for a GeoZarr dataset. @@ -143,10 +140,7 @@ class Multiscales(BaseModel): tile_matrix_limits: dict[str, TileMatrixLimit] | None = None -CFStandardName = Annotated[str, AfterValidator(check_standard_name)] - - -class DatasetAttrs(BaseModel): +class DatasetAttrs(BaseModel, extra="allow"): """ Attributes for a GeoZarr dataset. @@ -158,27 +152,12 @@ class DatasetAttrs(BaseModel): multiscales: Multiscales -class BaseDataArrayAttrs(BaseModel): +class BaseDataArrayAttrs(BaseModel, extra="allow"): """ Base attributes for a GeoZarr DataArray. Attributes ---------- - standard_name : str - The CF standard name of the variable. - grid_mapping : object - The grid mapping of the variable, which is a reference to a grid mapping variable that - describes the spatial reference of the variable. - grid_mapping_name : str - The name of the grid mapping, which is a string that describes the type of grid mapping - used for the variable. """ - # todo: validate that this names listed here are the names of zarr arrays - # unless the variable is an auxiliary variable - # see https://github.com/zarr-developers/geozarr-spec/blob/main/geozarr-spec.md#geozarr-coordinates - array_dimensions: tuple[str, ...] 
= Field(alias="_ARRAY_DIMENSIONS") - standard_name: CFStandardName - grid_mapping: object - grid_mapping_name: str - + coordinates: str diff --git a/eopf_geozarr/data_api/geozarr/v2.py b/src/eopf_geozarr/data_api/geozarr/v2.py similarity index 55% rename from eopf_geozarr/data_api/geozarr/v2.py rename to src/eopf_geozarr/data_api/geozarr/v2.py index 2099731..c329133 100644 --- a/eopf_geozarr/data_api/geozarr/v2.py +++ b/src/eopf_geozarr/data_api/geozarr/v2.py @@ -1,72 +1,32 @@ """GeoZarr data API for Zarr V2.""" + from __future__ import annotations -from typing import Any, Literal, Self +from typing import Any, Self, TypeVar -from pydantic import BaseModel, ConfigDict, Field, model_serializer, model_validator -from pydantic_zarr.v2 import ( - AnyArraySpec, - AnyGroupSpec, - ArraySpec, - GroupSpec, - TAttr, - TItem, - from_flat_group, -) +from pydantic import BaseModel, ConfigDict, Field, model_validator +from pydantic_zarr.v2 import ArraySpec, GroupSpec -from eopf_geozarr.data_api.geozarr.common import BaseDataArrayAttrs, CFStandardName, DatasetAttrs, get_array_dimensions +from eopf_geozarr.data_api.geozarr.common import BaseDataArrayAttrs, Multiscales -class CoordArrayAttrs(BaseModel): +class DataArrayAttrs(BaseDataArrayAttrs): """ - Attributes for a GeoZarr coordinate array. + Attributes for a GeoZarr DataArray. Attributes ---------- array_dimensions : tuple[str, ...] - The dimensions of the array. - standard_name : str - The CF standard name of the variable. - grid_mapping : object - The grid mapping of the variable, which is a reference to a grid mapping variable that - describes the spatial reference of the variable. - grid_mapping_name : str - The name of the grid mapping, which is a string that describes the type of grid mapping - used for the variable. + Alias for the _ARRAY_DIMENSIONS attribute, which lists the dimension names for this array. 
""" - # model_config is necessary to ensure that this model dictifies with the key - # `"_ARRAY_DIMENSIONS"` instead of "array_dimensions" - model_config = ConfigDict(populate_by_name=True, serialize_by_alias=True) - - array_dimensions: tuple[str] = Field(alias="_ARRAY_DIMENSIONS") - standard_name: CFStandardName - long_name: str | None = None - units: str - axis: str + # todo: validate that this names listed here are the names of zarr arrays + # unless the variable is an auxiliary variable + # see https://github.com/zarr-developers/geozarr-spec/blob/main/geozarr-spec.md#geozarr-coordinates + array_dimensions: tuple[str, ...] = Field(alias="_ARRAY_DIMENSIONS") -class CoordArray(ArraySpec[CoordArrayAttrs]): - """ - A GeoZarr coordinate array variable. - - It must be 1-dimensional and have a single element in its array_dimensions attribute. - """ + model_config = ConfigDict(populate_by_alias=True, serialize_by_alias=True) - shape: tuple[int] - -class DataArrayAttrs(BaseDataArrayAttrs): - """ - DataArrayAttrs for DataArrays using Zarr V2 - - Attributes - ---------- - array_dimensions : tuple[str, ...] - The dimensions of the array. Aliased from _ARRAY_DIMENSIONS. - """ - # necessary for ensuring that the array_dimensions are serialized as _ARRAY_DIMENSIONS - model_config = ConfigDict(populate_by_name=True, serialize_by_alias=True) - array_dimensions : tuple[str, ...] - class DataArray(ArraySpec[DataArrayAttrs]): """ @@ -77,8 +37,15 @@ class DataArray(ArraySpec[DataArrayAttrs]): https://github.com/zarr-developers/geozarr-spec/blob/main/geozarr-spec.md#geozarr-dataarray """ + @property + def array_dimensions(self) -> tuple[str, ...]: + return self.attributes.array_dimensions -def check_valid_coordinates(model: GroupSpec[Any, Any]) -> Dataset: + +T = TypeVar("T", bound=GroupSpec[Any, Any]) + + +def check_valid_coordinates(model: T) -> T: """ Check if the coordinates of the DataArrays listed in a GeoZarr DataSet are valid. 
@@ -103,7 +70,7 @@ def check_valid_coordinates(model: GroupSpec[Any, Any]) -> Dataset: k: v for k, v in model.members.items() if isinstance(v, DataArray) } for key, array in arrays.items(): - for idx, dim in enumerate(get_array_dimensions(array)): # type: ignore[arg-type] + for idx, dim in enumerate(array.array_dimensions): if dim not in model.members: raise ValueError( f"Dimension '{dim}' for array '{key}' is not defined in the model members." @@ -121,6 +88,18 @@ def check_valid_coordinates(model: GroupSpec[Any, Any]) -> Dataset: return model +class DatasetAttrs(BaseModel): + """ + Attributes for a GeoZarr dataset. + + Attributes + ---------- + multiscales: MultiscaleAttrs + """ + + multiscales: Multiscales + + class Dataset(GroupSpec[DatasetAttrs, GroupSpec[Any, Any] | DataArray]): """ A GeoZarr Dataset. diff --git a/eopf_geozarr/data_api/geozarr/v3.py b/src/eopf_geozarr/data_api/geozarr/v3.py similarity index 65% rename from eopf_geozarr/data_api/geozarr/v3.py rename to src/eopf_geozarr/data_api/geozarr/v3.py index 09ee973..b07b432 100644 --- a/eopf_geozarr/data_api/geozarr/v3.py +++ b/src/eopf_geozarr/data_api/geozarr/v3.py @@ -1,27 +1,58 @@ from __future__ import annotations -from typing import Any, Self +from typing import Any, Self, TypeVar + from pydantic import model_validator from pydantic_zarr.v3 import ArraySpec, GroupSpec from eopf_geozarr.data_api.geozarr.common import BaseDataArrayAttrs, DatasetAttrs + class DataArray(ArraySpec[BaseDataArrayAttrs]): """ - A GeoZarr DataArray variable. + A Zarr array that represents as GeoZarr DataArray variable. + + The attributes of this array are defined in `BaseDataArrayAttrs`. + + This array has an additional constraint: the dimension_names field must be a tuple of strings. References ---------- https://github.com/zarr-developers/geozarr-spec/blob/main/geozarr-spec.md#geozarr-dataarray """ -def check_valid_coordinates(model: GroupSpec[Any, Any]) -> Dataset: + dimension_names: tuple[str, ...] 
+ + @model_validator(mode="after") + def check_coordinates_dimensionality(self) -> Self: + # split coordinates on whitespace + coords_split = self.attributes.coordinates.split() + missing_dims = set(coords_split) - set(self.dimension_names) + if len(missing_dims) > 0: + msg = ( + f"The coordinates {coords_split} are inconsistent with the dimension names " + f"{self.dimension_names}. The following dimensions are missing: {missing_dims}" + ) + raise ValueError(msg) + return self + + @property + def array_dimensions(self) -> tuple[str, ...]: + return self.dimension_names + + +T = TypeVar("T", bound=GroupSpec[Any, Any]) + + +def check_valid_coordinates(model: T) -> T: """ Check if the coordinates of the DataArrays listed in a GeoZarr DataSet are valid. For each DataArray in the model, we check the dimensions associated with the DataArray. For each dimension associated with a data variable, an array with the name of that data variable - must be present in the members of the group. + must be present in the members of the group, and the shape of that array must align with the + DataArray shape. + Parameters ---------- @@ -40,7 +71,7 @@ def check_valid_coordinates(model: GroupSpec[Any, Any]) -> Dataset: k: v for k, v in model.members.items() if isinstance(v, DataArray) } for key, array in arrays.items(): - for idx, dim in enumerate(get_array_dimensions(array)): # type: ignore[arg-type] + for idx, dim in enumerate(array.array_dimensions): if dim not in model.members: raise ValueError( f"Dimension '{dim}' for array '{key}' is not defined in the model members." @@ -57,6 +88,7 @@ def check_valid_coordinates(model: GroupSpec[Any, Any]) -> Dataset: ) return model + class Dataset(GroupSpec[DatasetAttrs, GroupSpec[Any, Any] | DataArray]): """ A GeoZarr Dataset. 
diff --git a/eopf_geozarr/tests/test_data_api/__init__.py b/tests/test_data_api/__init__.py similarity index 100% rename from eopf_geozarr/tests/test_data_api/__init__.py rename to tests/test_data_api/__init__.py diff --git a/eopf_geozarr/tests/test_data_api/conftest.py b/tests/test_data_api/conftest.py similarity index 99% rename from eopf_geozarr/tests/test_data_api/conftest.py rename to tests/test_data_api/conftest.py index e45650c..486a4ca 100644 --- a/eopf_geozarr/tests/test_data_api/conftest.py +++ b/tests/test_data_api/conftest.py @@ -1,3 +1,8 @@ +from __future__ import annotations + +from zarr import open_group +from zarr.core.buffer import default_buffer_prototype + example_zarr_json = r"""{ "attributes": {}, "zarr_format": 3, @@ -14139,6 +14144,10 @@ } }, "node_type": "group" -}""".encode( - "utf-8" +}""".encode("utf-8") +example_group = open_group( + store={ + "zarr.json": default_buffer_prototype().buffer.from_bytes(example_zarr_json) + }, + mode="r", ) diff --git a/eopf_geozarr/tests/test_data_api/test_common.py b/tests/test_data_api/test_common.py similarity index 55% rename from eopf_geozarr/tests/test_data_api/test_common.py rename to tests/test_data_api/test_common.py index a081aee..c911e90 100644 --- a/eopf_geozarr/tests/test_data_api/test_common.py +++ b/tests/test_data_api/test_common.py @@ -1,14 +1,16 @@ from __future__ import annotations -import re -import zarr -from eopf_geozarr.data_api.geozarr.v2 import CoordArrayAttrs import pytest from pydantic_zarr.core import tuplify_json -from pydantic_zarr.v3 import GroupSpec -from zarr.core.buffer import default_buffer_prototype -from eopf_geozarr.data_api.geozarr.common import CF_STANDARD_NAME_URL, check_standard_name, get_cf_standard_names -from eopf_geozarr.tests.test_data_api.conftest import example_zarr_json +from pydantic_zarr.v3 import GroupSpec as GroupSpec_V3 + +from eopf_geozarr.data_api.geozarr.common import ( + CF_STANDARD_NAME_URL, + check_standard_name, + get_cf_standard_names, +) + 
+from .conftest import example_group def test_get_cf_standard_names() -> None: @@ -39,30 +41,13 @@ def test_check_standard_name_invalid() -> None: check_standard_name("invalid_standard_name") -def test_coord_array_attrs_dimensions_length() -> None: +def test_multiscales_round_trip() -> None: """ - Test that the array_dimensions attribute must have length 1. + Ensure that we can round-trip multiscale metadata through the `Multiscales` model. """ - msg = ( - "1 validation error for CoordArrayAttrs\n_ARRAY_DIMENSIONS\n " - " Tuple should have at most 1 item after validation, not 2" - ) - with pytest.raises(ValueError, match=re.escape(msg)): - CoordArrayAttrs( - _ARRAY_DIMENSIONS=("time", "lat"), - standard_name="air_temperature", - units="mm", - axis="Y", - ) - - -def test_multiscales_round_trip() -> None: - - from eopf_geozarr.data_api.geozarr.common import Multiscales - source_store = {"zarr.json": default_buffer_prototype().buffer.from_bytes(example_zarr_json)} - source_untyped = GroupSpec.from_zarr(zarr.open_group(source_store, mode="r")) + source_untyped = GroupSpec_V3.from_zarr(example_group) flat = source_untyped.to_flat() meta = flat["/measurements/reflectance/r60m"].attributes["multiscales"] - assert Multiscales(**meta).model_dump() == tuplify_json(meta) \ No newline at end of file + assert Multiscales(**meta).model_dump() == tuplify_json(meta) diff --git a/eopf_geozarr/tests/test_data_api/test_v2.py b/tests/test_data_api/test_v2.py similarity index 54% rename from eopf_geozarr/tests/test_data_api/test_v2.py rename to tests/test_data_api/test_v2.py index 931bfc5..c41c426 100644 --- a/eopf_geozarr/tests/test_data_api/test_v2.py +++ b/tests/test_data_api/test_v2.py @@ -1,36 +1,17 @@ from __future__ import annotations -from eopf_geozarr.data_api.geozarr.common import BaseDataArrayAttrs -import pytest -from pydantic_zarr.v2 import ArraySpec, GroupSpec -from typing import Any -from eopf_geozarr.data_api.geozarr.v2 import CoordArrayAttrs, DataArrayAttrs +from 
typing import Any -import re - -from eopf_geozarr.data_api.geozarr.v2 import CoordArray, DataArray, check_valid_coordinates +import pytest +from pydantic_zarr.v2 import ArraySpec, GroupSpec +from eopf_geozarr.data_api.geozarr.v2 import ( + DataArray, + DataArrayAttrs, + check_valid_coordinates, +) -def test_coord_array_dimensionality() -> None: - """ - Test that only 1-dimensional arrays are allowed. - """ - msg = ( - "1 validation error for CoordArray\nshape\n " - "Tuple should have at most 1 item after validation, not 2" - ) - with pytest.raises(ValueError, match=re.escape(msg)): - CoordArray( - shape=(10, 11), - dtype="|u1", - chunks=(10, 11), - attributes=CoordArrayAttrs( - _ARRAY_DIMENSIONS=("time",), - standard_name="air_temperature", - units="s", - axis="Y", - ), - ) +from .conftest import example_group class TestCheckValidCoordinates: @@ -44,26 +25,33 @@ class TestCheckValidCoordinates: dtype="|u1", chunks=(10, 11), attributes=DataArrayAttrs( - array_dimensions=["time", "lat"], + _FILL_VALUE="AAAAAAAA+H8=", + _ARRAY_DIMENSIONS=["time", "lat"], standard_name="air_temperature", grid_mapping=None, grid_mapping_name="latitude_longitude", ), ), - "/time": CoordArray( + "/time": DataArray( shape=(10,), dtype="|u1", chunks=(10,), - attributes=CoordArrayAttrs( - array_dimensions=["time"], standard_name="time", units="s", axis="T" + attributes=DataArrayAttrs( + _ARRAY_DIMENSIONS=["time"], + standard_name="time", + units="s", + axis="T", ), ), - "/lat": CoordArray( + "/lat": DataArray( shape=(11,), dtype="|u1", chunks=(11,), - attributes=CoordArrayAttrs( - array_dimensions=["lat"], standard_name="latitude", units="m", axis="Y" + attributes=DataArrayAttrs( + _ARRAY_DIMENSIONS=["lat"], + standard_name="latitude", + units="m", + axis="Y", ), ), }, @@ -87,39 +75,61 @@ def test_valid(example: dict[str, ArraySpec[Any] | GroupSpec[Any, Any]]) -> None dtype="|u1", chunks=(10, 11), attributes=DataArrayAttrs( - array_dimensions=["time", "lat"], + _ARRAY_DIMENSIONS=["time", 
"lat"], + _FILL_VALUE="AAAAAAAA+H8=", standard_name="air_temperature", grid_mapping=None, grid_mapping_name="latitude_longitude", ), ), - "/time": CoordArray( + "/time": DataArray( shape=(10,), dtype="|u1", chunks=(10,), - attributes=CoordArrayAttrs( - _ARRAY_DIMENSIONS=["time"], standard_name="time", units="s", axis="T" + attributes=DataArrayAttrs( + _ARRAY_DIMENSIONS=["time"], + standard_name="time", + units="s", + axis="T", ), ), - "/lat": CoordArray( + "/lat": DataArray( shape=(11,), dtype="|u1", chunks=(11,), - attributes=CoordArrayAttrs( - _ARRAY_DIMENSIONS=["lat"], standard_name="latitude", units="m", axis="Y" + attributes=DataArrayAttrs( + _ARRAY_DIMENSIONS=["lat"], + standard_name="latitude", + units="m", + axis="Y", ), ), }, ], ) @staticmethod - def test_invalid_coordinates(example: dict[str, ArraySpec[Any] | GroupSpec[Any, Any]]) -> None: + def test_invalid_coordinates( + example: dict[str, ArraySpec[Any] | GroupSpec[Any, Any]], + ) -> None: """ Test the check_valid_coordinates function to ensure it validates coordinates correctly. This test checks that the function raises a ValueError when the dimensions of the data variable do not match the dimensions of the coordinate arrays. """ - group = GroupSpec[Any, DataArray | CoordArray].from_flat(example) + group = GroupSpec[Any, DataArray].from_flat(example) with pytest.raises(ValueError): - check_valid_coordinates(group) \ No newline at end of file + check_valid_coordinates(group) + + +@pytest.mark.skip(reason="We don't have a v2 example group yet") +def test_dataarray_attrs_round_trip() -> None: + """ + Ensure that we can round-trip dataarray attributes through the `Multiscales` model. 
+ """ + source_untyped = GroupSpec.from_zarr(example_group) + flat = source_untyped.to_flat() + for key, val in flat.items(): + if isinstance(val, ArraySpec): + model_json = val.model_dump()["attributes"] + assert DataArrayAttrs(**model_json).model_dump() == model_json diff --git a/tests/test_data_api/test_v3.py b/tests/test_data_api/test_v3.py new file mode 100644 index 0000000..a08fd4b --- /dev/null +++ b/tests/test_data_api/test_v3.py @@ -0,0 +1,104 @@ +from typing import Any + +import pytest +import zarr +from pydantic_zarr.core import tuplify_json +from pydantic_zarr.v3 import ArraySpec, GroupSpec + +from eopf_geozarr.data_api.geozarr.v3 import DataArray, Dataset, check_valid_coordinates + +from .conftest import example_group + + +class TestCheckValidCoordinates: + @pytest.mark.parametrize("data_shape", [(10,), (10, 12)]) + def test_valid(data_shape: tuple[int, ...]) -> None: + """ + Test the check_valid_coordinates function to ensure it validates coordinates correctly. + """ + + # create a group containing "coordinated" arrays, with 1 n-dimensional data array, and + # N 1-dimensional coordinate arrays + + group = GroupSpec.from_flat(example) + assert check_valid_coordinates(group) == group + + @pytest.mark.parametrize( + "example", + [ + { + "": GroupSpec(attributes={}, members=None), + "/data_var": DataArray( + shape=(9, 10), + data_type="uint8", + chunks=(10, 11), + attributes=DataArrayAttrs( + _ARRAY_DIMENSIONS=["time", "lat"], + _FILL_VALUE="AAAAAAAA+H8=", + standard_name="air_temperature", + grid_mapping=None, + grid_mapping_name="latitude_longitude", + ), + ), + "/time": DataArray( + shape=(10,), + data_type="uint8", + chunks=(10,), + attributes=DataArrayAttrs( + _ARRAY_DIMENSIONS=["time"], + standard_name="time", + units="s", + axis="T", + ), + ), + "/lat": DataArray( + shape=(11,), + data_type="uint8", + chunks=(11,), + attributes=DataArrayAttrs( + _ARRAY_DIMENSIONS=["lat"], + standard_name="latitude", + units="m", + axis="Y", + ), + ), + }, + ], + ) 
+ @staticmethod + def test_invalid_coordinates( + example: dict[str, ArraySpec[Any] | GroupSpec[Any, Any]], + ) -> None: + """ + Test the check_valid_coordinates function to ensure it validates coordinates correctly. + + This test checks that the function raises a ValueError when the dimensions of the data variable + do not match the dimensions of the coordinate arrays. + """ + group = GroupSpec[Any, DataArray].from_flat(example) + with pytest.raises(ValueError): + check_valid_coordinates(group) + + +def test_dataarray_round_trip() -> None: + """ + Ensure that we can round-trip dataarray attributes through the `Multiscales` model. + """ + source_untyped = GroupSpec.from_zarr(example_group) + flat = source_untyped.to_flat() + for key, val in flat.items(): + if isinstance(val, ArraySpec): + model_json = val.model_dump() + assert DataArray(**model_json).model_dump() == model_json + + +def test_multiscale_attrs_round_trip() -> None: + """ + Test that multiscale datasets round-trip through the `Multiscales` model + """ + source_group_members = dict(example_group.members(max_depth=None)) + for key, val in source_group_members.items(): + if isinstance(val, zarr.Group): + if "multiscales" in val.attrs.asdict(): + model_json = GroupSpec.from_zarr(val).model_dump() + assert Dataset(**model_json).model_dump() == tuplify_json(model_json) From d1a2e2dc59ec91df516bd89fa6c453ef2faa79a6 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Sat, 16 Aug 2025 22:21:15 +0200 Subject: [PATCH 11/25] add array_dimensions kwarg to from_array --- src/eopf_geozarr/data_api/geozarr/common.py | 6 +- src/eopf_geozarr/data_api/geozarr/v2.py | 48 +++++++- src/eopf_geozarr/data_api/geozarr/v3.py | 13 --- tests/test_data_api/test_v2.py | 123 ++++++-------------- tests/test_data_api/test_v3.py | 70 +++-------- 5 files changed, 99 insertions(+), 161 deletions(-) diff --git a/src/eopf_geozarr/data_api/geozarr/common.py b/src/eopf_geozarr/data_api/geozarr/common.py index 88f4a41..fc69c24 100644 --- 
a/src/eopf_geozarr/data_api/geozarr/common.py +++ b/src/eopf_geozarr/data_api/geozarr/common.py @@ -8,7 +8,7 @@ from cf_xarray.utils import parse_cf_standard_name_table from pydantic import AfterValidator, BaseModel -xarray_dims_key: Final = "_ARRAY_DIMENSIONS" +XARRAY_DIMS_KEY: Final = "_ARRAY_DIMENSIONS" def get_cf_standard_names(url: str) -> tuple[str, ...]: @@ -158,6 +158,4 @@ class BaseDataArrayAttrs(BaseModel, extra="allow"): Attributes ---------- - """ - - coordinates: str + """ \ No newline at end of file diff --git a/src/eopf_geozarr/data_api/geozarr/v2.py b/src/eopf_geozarr/data_api/geozarr/v2.py index c329133..8c9c260 100644 --- a/src/eopf_geozarr/data_api/geozarr/v2.py +++ b/src/eopf_geozarr/data_api/geozarr/v2.py @@ -2,12 +2,13 @@ from __future__ import annotations -from typing import Any, Self, TypeVar +from collections.abc import Mapping +from typing import Any, Iterable, Literal, Self, TypeVar from pydantic import BaseModel, ConfigDict, Field, model_validator from pydantic_zarr.v2 import ArraySpec, GroupSpec - -from eopf_geozarr.data_api.geozarr.common import BaseDataArrayAttrs, Multiscales +from pydantic_zarr.v2 import auto_attributes +from eopf_geozarr.data_api.geozarr.common import XARRAY_DIMS_KEY, BaseDataArrayAttrs, Multiscales class DataArrayAttrs(BaseDataArrayAttrs): @@ -30,12 +31,51 @@ class DataArrayAttrs(BaseDataArrayAttrs): class DataArray(ArraySpec[DataArrayAttrs]): """ - A GeoZarr DataArray variable. + A GeoZarr DataArray variable. It must have attributes that contain an `"_ARRAY_DIMENSIONS"` + key, with a length that matches the dimensionality of the array. References ---------- https://github.com/zarr-developers/geozarr-spec/blob/main/geozarr-spec.md#geozarr-dataarray """ + @classmethod + def from_array( + cls, + array: Any, + chunks: tuple[int, ...] 
| Literal["auto"] = "auto", + attributes: Mapping[str, object] | Literal["auto"] = "auto", + fill_value: object | Literal["auto"] = "auto", + order: Literal["C", "F"] | Literal["auto"] ="auto", + filters: tuple[Any, ...] | Literal["auto"]="auto", + dimension_separator: Literal[".", "/"] | Literal["auto"]="auto", + compressor: Any | Literal["auto"] = "auto", + dimension_names: Iterable[str] | Literal["auto"] = "auto") -> Self: + if attributes == "auto": + auto_attrs = dict(auto_attributes(array)) + else: + auto_attrs = dict(attributes) + if dimension_names != "auto": + auto_attrs = auto_attrs | {XARRAY_DIMS_KEY: tuple(dimension_names)} + model = super().from_array( + array=array, + chunks=chunks, + attributes=auto_attrs, + fill_value=fill_value, + order=order, + filters=filters, + dimension_separator=dimension_separator, + compressor=compressor, + ) + return model + + @model_validator(mode="after") + def check_array_dimensions(self) -> Self: + if (len_dim := len(self.attributes.array_dimensions)) != (ndim:=len(self.shape)): + msg = ( + f'The {XARRAY_DIMS_KEY} attribute has length {len_dim}, which does not ' + f'match the number of dimensions for this array (got {ndim}).') + raise ValueError(msg) + return self @property def array_dimensions(self) -> tuple[str, ...]: diff --git a/src/eopf_geozarr/data_api/geozarr/v3.py b/src/eopf_geozarr/data_api/geozarr/v3.py index b07b432..813489b 100644 --- a/src/eopf_geozarr/data_api/geozarr/v3.py +++ b/src/eopf_geozarr/data_api/geozarr/v3.py @@ -23,19 +23,6 @@ class DataArray(ArraySpec[BaseDataArrayAttrs]): dimension_names: tuple[str, ...] - @model_validator(mode="after") - def check_coordinates_dimensionality(self) -> Self: - # split coordinates on whitespace - coords_split = self.attributes.coordinates.split() - missing_dims = set(coords_split) - set(self.dimension_names) - if len(missing_dims) > 0: - msg = ( - f"The coordinates {coords_split} are inconsistent with the dimension names " - f"{self.dimension_names}. 
The following dimensions are missing: {missing_dims}" - ) - raise ValueError(msg) - return self - @property def array_dimensions(self) -> tuple[str, ...]: return self.dimension_names diff --git a/tests/test_data_api/test_v2.py b/tests/test_data_api/test_v2.py index c41c426..90fd036 100644 --- a/tests/test_data_api/test_v2.py +++ b/tests/test_data_api/test_v2.py @@ -2,6 +2,7 @@ from typing import Any +from pydantic import ValidationError import pytest from pydantic_zarr.v2 import ArraySpec, GroupSpec @@ -12,104 +13,37 @@ ) from .conftest import example_group +import numpy as np +def test_invalid_dimension_names() -> None: + msg = r'The _ARRAY_DIMENSIONS attribute has length 3, which does not match the number of dimensions for this array \(got 2\)' + with pytest.raises(ValidationError, match=msg): + DataArray.from_array(np.zeros((10, 10)), dimension_names=["x", "y", "z"]) class TestCheckValidCoordinates: - @pytest.mark.parametrize( - "example", - [ - { - "": GroupSpec(attributes={}, members=None), - "/data_var": DataArray( - shape=(10, 11), - dtype="|u1", - chunks=(10, 11), - attributes=DataArrayAttrs( - _FILL_VALUE="AAAAAAAA+H8=", - _ARRAY_DIMENSIONS=["time", "lat"], - standard_name="air_temperature", - grid_mapping=None, - grid_mapping_name="latitude_longitude", - ), - ), - "/time": DataArray( - shape=(10,), - dtype="|u1", - chunks=(10,), - attributes=DataArrayAttrs( - _ARRAY_DIMENSIONS=["time"], - standard_name="time", - units="s", - axis="T", - ), - ), - "/lat": DataArray( - shape=(11,), - dtype="|u1", - chunks=(11,), - attributes=DataArrayAttrs( - _ARRAY_DIMENSIONS=["lat"], - standard_name="latitude", - units="m", - axis="Y", - ), - ), - }, - ], - ) @staticmethod - def test_valid(example: dict[str, ArraySpec[Any] | GroupSpec[Any, Any]]) -> None: + @pytest.mark.parametrize("data_shape", [(10,), (10, 12)]) + def test_valid(data_shape: tuple[int, ...]) -> None: """ Test the check_valid_coordinates function to ensure it validates coordinates correctly. 
""" - group = GroupSpec.from_flat(example) + + base_array = DataArray.from_array( + np.zeros((data_shape), dtype='uint8'), + dimension_names=[f'dim_{s}' for s in range(len(data_shape))] + ) + coords_arrays = { + f'dim_{idx}' : DataArray.from_array( + np.arange(s), + dimension_names=(f'dim_{idx}',)) for idx,s in enumerate(data_shape) + } + group = GroupSpec[Any, DataArray](members={"base": base_array, **coords_arrays}) assert check_valid_coordinates(group) == group - @pytest.mark.parametrize( - "example", - [ - { - "": GroupSpec(attributes={}, members=None), - "/data_var": DataArray( - shape=(9, 10), - dtype="|u1", - chunks=(10, 11), - attributes=DataArrayAttrs( - _ARRAY_DIMENSIONS=["time", "lat"], - _FILL_VALUE="AAAAAAAA+H8=", - standard_name="air_temperature", - grid_mapping=None, - grid_mapping_name="latitude_longitude", - ), - ), - "/time": DataArray( - shape=(10,), - dtype="|u1", - chunks=(10,), - attributes=DataArrayAttrs( - _ARRAY_DIMENSIONS=["time"], - standard_name="time", - units="s", - axis="T", - ), - ), - "/lat": DataArray( - shape=(11,), - dtype="|u1", - chunks=(11,), - attributes=DataArrayAttrs( - _ARRAY_DIMENSIONS=["lat"], - standard_name="latitude", - units="m", - axis="Y", - ), - ), - }, - ], - ) @staticmethod + @pytest.mark.parametrize("data_shape", [(10,), (10, 12)]) def test_invalid_coordinates( - example: dict[str, ArraySpec[Any] | GroupSpec[Any, Any]], + data_shape: tuple[int, ...], ) -> None: """ Test the check_valid_coordinates function to ensure it validates coordinates correctly. @@ -117,11 +51,22 @@ def test_invalid_coordinates( This test checks that the function raises a ValueError when the dimensions of the data variable do not match the dimensions of the coordinate arrays. 
""" - group = GroupSpec[Any, DataArray].from_flat(example) - with pytest.raises(ValueError): + base_array = DataArray.from_array( + np.zeros((data_shape), dtype='uint8'), + dimension_names=[f'dim_{s}' for s in range(len(data_shape))] + ) + coords_arrays = { + f'dim_{idx}' : DataArray.from_array( + np.arange(s + 1), + dimension_names=(f'dim_{idx}',)) for idx, s in enumerate(data_shape) + } + group = GroupSpec[Any, DataArray](members={"base": base_array, **coords_arrays}) + msg = "Dimension .* for array 'base' has a shape mismatch:" + with pytest.raises(ValueError, match=msg): check_valid_coordinates(group) + @pytest.mark.skip(reason="We don't have a v2 example group yet") def test_dataarray_attrs_round_trip() -> None: """ diff --git a/tests/test_data_api/test_v3.py b/tests/test_data_api/test_v3.py index a08fd4b..1781eba 100644 --- a/tests/test_data_api/test_v3.py +++ b/tests/test_data_api/test_v3.py @@ -4,70 +4,33 @@ import zarr from pydantic_zarr.core import tuplify_json from pydantic_zarr.v3 import ArraySpec, GroupSpec - +import numpy as np from eopf_geozarr.data_api.geozarr.v3 import DataArray, Dataset, check_valid_coordinates from .conftest import example_group class TestCheckValidCoordinates: + @staticmethod @pytest.mark.parametrize("data_shape", [(10,), (10, 12)]) def test_valid(data_shape: tuple[int, ...]) -> None: """ Test the check_valid_coordinates function to ensure it validates coordinates correctly. 
""" - # create a group containing "coordinated" arrays, with 1 n-dimensional data array, and - # N 1-dimensional coordinate arrays - - group = GroupSpec.from_flat(example) + base_array = DataArray.from_array( + np.zeros((data_shape), dtype='uint8'), + dimension_names=[f'dim_{s}' for s in range(len(data_shape))]) + coords_arrays = { + f'dim_{idx}' : DataArray.from_array(np.arange(s), dimension_names=(f'dim_{idx}',)) for idx,s in enumerate(data_shape) + } + group = GroupSpec[Any, DataArray](members={"base": base_array, **coords_arrays}) assert check_valid_coordinates(group) == group - @pytest.mark.parametrize( - "example", - [ - { - "": GroupSpec(attributes={}, members=None), - "/data_var": DataArray( - shape=(9, 10), - data_type="uint8", - chunks=(10, 11), - attributes=DataArrayAttrs( - _ARRAY_DIMENSIONS=["time", "lat"], - _FILL_VALUE="AAAAAAAA+H8=", - standard_name="air_temperature", - grid_mapping=None, - grid_mapping_name="latitude_longitude", - ), - ), - "/time": DataArray( - shape=(10,), - data_type="uint8", - chunks=(10,), - attributes=DataArrayAttrs( - _ARRAY_DIMENSIONS=["time"], - standard_name="time", - units="s", - axis="T", - ), - ), - "/lat": DataArray( - shape=(11,), - data_type="uint8", - chunks=(11,), - attributes=DataArrayAttrs( - _ARRAY_DIMENSIONS=["lat"], - standard_name="latitude", - units="m", - axis="Y", - ), - ), - }, - ], - ) @staticmethod + @pytest.mark.parametrize("data_shape", [(10,), (10, 12)]) def test_invalid_coordinates( - example: dict[str, ArraySpec[Any] | GroupSpec[Any, Any]], + data_shape: tuple[int, ...], ) -> None: """ Test the check_valid_coordinates function to ensure it validates coordinates correctly. @@ -75,8 +38,13 @@ def test_invalid_coordinates( This test checks that the function raises a ValueError when the dimensions of the data variable do not match the dimensions of the coordinate arrays. 
""" - group = GroupSpec[Any, DataArray].from_flat(example) - with pytest.raises(ValueError): + base_array = DataArray.from_array(np.zeros((data_shape), dtype='uint8'), dimension_names=[f'dim_{s}' for s in range(len(data_shape))]) + coords_arrays = { + f'dim_{idx}' : DataArray.from_array(np.arange(s + 1), dimension_names=(f'dim_{idx}',)) for idx, s in enumerate(data_shape) + } + group = GroupSpec[Any, DataArray](members={"base": base_array, **coords_arrays}) + msg = "Dimension .* for array 'base' has a shape mismatch:" + with pytest.raises(ValueError, match=msg): check_valid_coordinates(group) @@ -87,7 +55,7 @@ def test_dataarray_round_trip() -> None: source_untyped = GroupSpec.from_zarr(example_group) flat = source_untyped.to_flat() for key, val in flat.items(): - if isinstance(val, ArraySpec): + if isinstance(val, ArraySpec) and val.dimension_names is not None: model_json = val.model_dump() assert DataArray(**model_json).model_dump() == model_json From ad585a0abd14c36ac5703e8d3d7b1f403b6305b3 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Mon, 18 Aug 2025 13:06:46 +0200 Subject: [PATCH 12/25] bump mypy python version --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index ab591d3..795181b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -112,7 +112,7 @@ use_parentheses = true ensure_newline_before_comments = true [tool.mypy] -python_version = "3.10" +python_version = "3.11" warn_return_any = true warn_unused_configs = true disallow_untyped_defs = true From 3d11af412e460993f8e603dcff0555c5342c4e8f Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Mon, 18 Aug 2025 13:16:20 +0200 Subject: [PATCH 13/25] lint --- .pre-commit-config.yaml | 2 +- src/eopf_geozarr/data_api/geozarr/common.py | 2 +- src/eopf_geozarr/data_api/geozarr/v2.py | 41 +++++++++++++-------- tests/test_data_api/test_v2.py | 39 +++++++++++--------- tests/test_data_api/test_v3.py | 27 ++++++++++---- 5 files changed, 67 
insertions(+), 44 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index d544d9a..6364472 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -26,4 +26,4 @@ repos: additional_dependencies: - types-simplejson - types-attrs - - pydantic~=2.0 \ No newline at end of file + - pydantic>=2.11 \ No newline at end of file diff --git a/src/eopf_geozarr/data_api/geozarr/common.py b/src/eopf_geozarr/data_api/geozarr/common.py index fc69c24..52a2db0 100644 --- a/src/eopf_geozarr/data_api/geozarr/common.py +++ b/src/eopf_geozarr/data_api/geozarr/common.py @@ -158,4 +158,4 @@ class BaseDataArrayAttrs(BaseModel, extra="allow"): Attributes ---------- - """ \ No newline at end of file + """ diff --git a/src/eopf_geozarr/data_api/geozarr/v2.py b/src/eopf_geozarr/data_api/geozarr/v2.py index 8c9c260..1435b6b 100644 --- a/src/eopf_geozarr/data_api/geozarr/v2.py +++ b/src/eopf_geozarr/data_api/geozarr/v2.py @@ -6,9 +6,13 @@ from typing import Any, Iterable, Literal, Self, TypeVar from pydantic import BaseModel, ConfigDict, Field, model_validator -from pydantic_zarr.v2 import ArraySpec, GroupSpec -from pydantic_zarr.v2 import auto_attributes -from eopf_geozarr.data_api.geozarr.common import XARRAY_DIMS_KEY, BaseDataArrayAttrs, Multiscales +from pydantic_zarr.v2 import ArraySpec, GroupSpec, auto_attributes + +from eopf_geozarr.data_api.geozarr.common import ( + XARRAY_DIMS_KEY, + BaseDataArrayAttrs, + Multiscales, +) class DataArrayAttrs(BaseDataArrayAttrs): @@ -26,7 +30,7 @@ class DataArrayAttrs(BaseDataArrayAttrs): # see https://github.com/zarr-developers/geozarr-spec/blob/main/geozarr-spec.md#geozarr-coordinates array_dimensions: tuple[str, ...] 
= Field(alias="_ARRAY_DIMENSIONS") - model_config = ConfigDict(populate_by_alias=True, serialize_by_alias=True) + model_config = ConfigDict(serialize_by_alias=True) class DataArray(ArraySpec[DataArrayAttrs]): @@ -38,18 +42,20 @@ class DataArray(ArraySpec[DataArrayAttrs]): ---------- https://github.com/zarr-developers/geozarr-spec/blob/main/geozarr-spec.md#geozarr-dataarray """ + @classmethod def from_array( - cls, + cls, array: Any, chunks: tuple[int, ...] | Literal["auto"] = "auto", attributes: Mapping[str, object] | Literal["auto"] = "auto", fill_value: object | Literal["auto"] = "auto", - order: Literal["C", "F"] | Literal["auto"] ="auto", - filters: tuple[Any, ...] | Literal["auto"]="auto", - dimension_separator: Literal[".", "/"] | Literal["auto"]="auto", + order: Literal["C", "F"] | Literal["auto"] = "auto", + filters: tuple[Any, ...] | Literal["auto"] = "auto", + dimension_separator: Literal[".", "/"] | Literal["auto"] = "auto", compressor: Any | Literal["auto"] = "auto", - dimension_names: Iterable[str] | Literal["auto"] = "auto") -> Self: + dimension_names: Iterable[str] | Literal["auto"] = "auto", + ) -> Self: if attributes == "auto": auto_attrs = dict(auto_attributes(array)) else: @@ -57,8 +63,8 @@ def from_array( if dimension_names != "auto": auto_attrs = auto_attrs | {XARRAY_DIMS_KEY: tuple(dimension_names)} model = super().from_array( - array=array, - chunks=chunks, + array=array, + chunks=chunks, attributes=auto_attrs, fill_value=fill_value, order=order, @@ -66,20 +72,23 @@ def from_array( dimension_separator=dimension_separator, compressor=compressor, ) - return model + return model # type: ignore[no-any-return] @model_validator(mode="after") def check_array_dimensions(self) -> Self: - if (len_dim := len(self.attributes.array_dimensions)) != (ndim:=len(self.shape)): + if (len_dim := len(self.attributes.array_dimensions)) != ( + ndim := len(self.shape) + ): msg = ( - f'The {XARRAY_DIMS_KEY} attribute has length {len_dim}, which does not ' - f'match 
the number of dimensions for this array (got {ndim}).') + f"The {XARRAY_DIMS_KEY} attribute has length {len_dim}, which does not " + f"match the number of dimensions for this array (got {ndim})." + ) raise ValueError(msg) return self @property def array_dimensions(self) -> tuple[str, ...]: - return self.attributes.array_dimensions + return self.attributes.array_dimensions # type: ignore[no-any-return] T = TypeVar("T", bound=GroupSpec[Any, Any]) diff --git a/tests/test_data_api/test_v2.py b/tests/test_data_api/test_v2.py index 90fd036..bacfbb9 100644 --- a/tests/test_data_api/test_v2.py +++ b/tests/test_data_api/test_v2.py @@ -2,8 +2,9 @@ from typing import Any -from pydantic import ValidationError +import numpy as np import pytest +from pydantic import ValidationError from pydantic_zarr.v2 import ArraySpec, GroupSpec from eopf_geozarr.data_api.geozarr.v2 import ( @@ -13,13 +14,14 @@ ) from .conftest import example_group -import numpy as np + def test_invalid_dimension_names() -> None: - msg = r'The _ARRAY_DIMENSIONS attribute has length 3, which does not match the number of dimensions for this array \(got 2\)' + msg = r"The _ARRAY_DIMENSIONS attribute has length 3, which does not match the number of dimensions for this array \(got 2\)" with pytest.raises(ValidationError, match=msg): DataArray.from_array(np.zeros((10, 10)), dimension_names=["x", "y", "z"]) + class TestCheckValidCoordinates: @staticmethod @pytest.mark.parametrize("data_shape", [(10,), (10, 12)]) @@ -29,14 +31,15 @@ def test_valid(data_shape: tuple[int, ...]) -> None: """ base_array = DataArray.from_array( - np.zeros((data_shape), dtype='uint8'), - dimension_names=[f'dim_{s}' for s in range(len(data_shape))] - ) + np.zeros((data_shape), dtype="uint8"), + dimension_names=[f"dim_{s}" for s in range(len(data_shape))], + ) coords_arrays = { - f'dim_{idx}' : DataArray.from_array( - np.arange(s), - dimension_names=(f'dim_{idx}',)) for idx,s in enumerate(data_shape) - } + f"dim_{idx}": DataArray.from_array( 
+ np.arange(s), dimension_names=(f"dim_{idx}",) + ) + for idx, s in enumerate(data_shape) + } group = GroupSpec[Any, DataArray](members={"base": base_array, **coords_arrays}) assert check_valid_coordinates(group) == group @@ -52,21 +55,21 @@ def test_invalid_coordinates( do not match the dimensions of the coordinate arrays. """ base_array = DataArray.from_array( - np.zeros((data_shape), dtype='uint8'), - dimension_names=[f'dim_{s}' for s in range(len(data_shape))] - ) + np.zeros((data_shape), dtype="uint8"), + dimension_names=[f"dim_{s}" for s in range(len(data_shape))], + ) coords_arrays = { - f'dim_{idx}' : DataArray.from_array( - np.arange(s + 1), - dimension_names=(f'dim_{idx}',)) for idx, s in enumerate(data_shape) - } + f"dim_{idx}": DataArray.from_array( + np.arange(s + 1), dimension_names=(f"dim_{idx}",) + ) + for idx, s in enumerate(data_shape) + } group = GroupSpec[Any, DataArray](members={"base": base_array, **coords_arrays}) msg = "Dimension .* for array 'base' has a shape mismatch:" with pytest.raises(ValueError, match=msg): check_valid_coordinates(group) - @pytest.mark.skip(reason="We don't have a v2 example group yet") def test_dataarray_attrs_round_trip() -> None: """ diff --git a/tests/test_data_api/test_v3.py b/tests/test_data_api/test_v3.py index 1781eba..e9d9887 100644 --- a/tests/test_data_api/test_v3.py +++ b/tests/test_data_api/test_v3.py @@ -1,10 +1,11 @@ from typing import Any +import numpy as np import pytest import zarr from pydantic_zarr.core import tuplify_json from pydantic_zarr.v3 import ArraySpec, GroupSpec -import numpy as np + from eopf_geozarr.data_api.geozarr.v3 import DataArray, Dataset, check_valid_coordinates from .conftest import example_group @@ -19,11 +20,15 @@ def test_valid(data_shape: tuple[int, ...]) -> None: """ base_array = DataArray.from_array( - np.zeros((data_shape), dtype='uint8'), - dimension_names=[f'dim_{s}' for s in range(len(data_shape))]) + np.zeros((data_shape), dtype="uint8"), + dimension_names=[f"dim_{s}" 
for s in range(len(data_shape))], + ) coords_arrays = { - f'dim_{idx}' : DataArray.from_array(np.arange(s), dimension_names=(f'dim_{idx}',)) for idx,s in enumerate(data_shape) - } + f"dim_{idx}": DataArray.from_array( + np.arange(s), dimension_names=(f"dim_{idx}",) + ) + for idx, s in enumerate(data_shape) + } group = GroupSpec[Any, DataArray](members={"base": base_array, **coords_arrays}) assert check_valid_coordinates(group) == group @@ -38,10 +43,16 @@ def test_invalid_coordinates( This test checks that the function raises a ValueError when the dimensions of the data variable do not match the dimensions of the coordinate arrays. """ - base_array = DataArray.from_array(np.zeros((data_shape), dtype='uint8'), dimension_names=[f'dim_{s}' for s in range(len(data_shape))]) + base_array = DataArray.from_array( + np.zeros((data_shape), dtype="uint8"), + dimension_names=[f"dim_{s}" for s in range(len(data_shape))], + ) coords_arrays = { - f'dim_{idx}' : DataArray.from_array(np.arange(s + 1), dimension_names=(f'dim_{idx}',)) for idx, s in enumerate(data_shape) - } + f"dim_{idx}": DataArray.from_array( + np.arange(s + 1), dimension_names=(f"dim_{idx}",) + ) + for idx, s in enumerate(data_shape) + } group = GroupSpec[Any, DataArray](members={"base": base_array, **coords_arrays}) msg = "Dimension .* for array 'base' has a shape mismatch:" with pytest.raises(ValueError, match=msg): From 667de5dc17ac7e10f09d93fd626c65987dedb803 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Thu, 4 Sep 2025 14:41:16 +0200 Subject: [PATCH 14/25] add grid mapping --- src/eopf_geozarr/data_api/geozarr/common.py | 32 +++++++++++++++++++++ src/eopf_geozarr/data_api/geozarr/v3.py | 11 +++++++ 2 files changed, 43 insertions(+) diff --git a/src/eopf_geozarr/data_api/geozarr/common.py b/src/eopf_geozarr/data_api/geozarr/common.py index 52a2db0..8aa245a 100644 --- a/src/eopf_geozarr/data_api/geozarr/common.py +++ b/src/eopf_geozarr/data_api/geozarr/common.py @@ -89,6 +89,21 @@ def 
check_standard_name(name: str) -> str: """A string literal indicating a resampling method""" +class GridMappingAttrs(BaseModel, extra="allow"): + """ + Grid mapping attributes for a GeoZarr grid mapping variable. + + Attributes + ---------- + grid_mapping_name : str + The name of the grid mapping. + + Additional attributes might be present depending on the type of grid mapping. + """ + + grid_mapping_name: str + + class TileMatrixLimit(BaseModel): """""" @@ -144,6 +159,23 @@ class DatasetAttrs(BaseModel, extra="allow"): """ Attributes for a GeoZarr dataset. + A dataset is a collection of DataArrays. + + Attributes + ---------- + grid_mapping: str + The name of the grid mapping variable for this dataset. + """ + + grid_mapping: str + + +class MultiscaleDatasetAttrs(BaseModel, extra="allow"): + """ + Attributes for Multiscale GeoZarr dataset. + + A Multiscale dataset is a collection of Datasets. + Attributes ---------- multiscales: MultiscaleAttrs diff --git a/src/eopf_geozarr/data_api/geozarr/v3.py b/src/eopf_geozarr/data_api/geozarr/v3.py index 813489b..536e43f 100644 --- a/src/eopf_geozarr/data_api/geozarr/v3.py +++ b/src/eopf_geozarr/data_api/geozarr/v3.py @@ -95,3 +95,14 @@ def check_valid_coordinates(self) -> Self: The validated GeoZarr DataSet. """ return check_valid_coordinates(self) + + @model_validator(mode="after") + def validate_grid_mapping(self) -> Self: + if ( + self.members is not None + and self.attributes.grid_mapping not in self.members + ): + raise ValueError( + f"Grid mapping variable '{self.attributes.grid_mapping}' not found in dataset members." 
+ ) + return self From 9802aba524b1daee86d1fb5c501380bc78d9b56f Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Fri, 5 Sep 2025 12:42:01 +0200 Subject: [PATCH 15/25] update multiscale models --- src/eopf_geozarr/data_api/geozarr/common.py | 2 +- src/eopf_geozarr/data_api/geozarr/v2.py | 73 ++++++++++++++++----- src/eopf_geozarr/data_api/geozarr/v3.py | 33 +++++++++- 3 files changed, 88 insertions(+), 20 deletions(-) diff --git a/src/eopf_geozarr/data_api/geozarr/common.py b/src/eopf_geozarr/data_api/geozarr/common.py index 8aa245a..8e6bf08 100644 --- a/src/eopf_geozarr/data_api/geozarr/common.py +++ b/src/eopf_geozarr/data_api/geozarr/common.py @@ -170,7 +170,7 @@ class DatasetAttrs(BaseModel, extra="allow"): grid_mapping: str -class MultiscaleDatasetAttrs(BaseModel, extra="allow"): +class MultiscaleAttrs(BaseModel, extra="allow"): """ Attributes for Multiscale GeoZarr dataset. diff --git a/src/eopf_geozarr/data_api/geozarr/v2.py b/src/eopf_geozarr/data_api/geozarr/v2.py index 1435b6b..195014e 100644 --- a/src/eopf_geozarr/data_api/geozarr/v2.py +++ b/src/eopf_geozarr/data_api/geozarr/v2.py @@ -1,17 +1,19 @@ -"""GeoZarr data API for Zarr V2.""" +"""Zarr V2 Models for the GeoZarr Zarr Hierarchy.""" from __future__ import annotations from collections.abc import Mapping from typing import Any, Iterable, Literal, Self, TypeVar -from pydantic import BaseModel, ConfigDict, Field, model_validator +from pydantic import ConfigDict, Field, model_validator from pydantic_zarr.v2 import ArraySpec, GroupSpec, auto_attributes from eopf_geozarr.data_api.geozarr.common import ( XARRAY_DIMS_KEY, BaseDataArrayAttrs, - Multiscales, + DatasetAttrs, + GridMappingAttrs, + MultiscaleAttrs, ) @@ -29,7 +31,8 @@ class DataArrayAttrs(BaseDataArrayAttrs): # unless the variable is an auxiliary variable # see https://github.com/zarr-developers/geozarr-spec/blob/main/geozarr-spec.md#geozarr-coordinates array_dimensions: tuple[str, ...] 
= Field(alias="_ARRAY_DIMENSIONS") - + + # this is necessary to serialize the `array_dimensions` attribute as `_ARRAY_DIMENSIONS` model_config = ConfigDict(serialize_by_alias=True) @@ -56,6 +59,9 @@ def from_array( compressor: Any | Literal["auto"] = "auto", dimension_names: Iterable[str] | Literal["auto"] = "auto", ) -> Self: + """ + Override the default from_array method to include a dimension_names parameter. + """ if attributes == "auto": auto_attrs = dict(auto_attributes(array)) else: @@ -90,6 +96,17 @@ def check_array_dimensions(self) -> Self: def array_dimensions(self) -> tuple[str, ...]: return self.attributes.array_dimensions # type: ignore[no-any-return] +class GridMappingVariable(ArraySpec[GridMappingAttrs]): + """ + A Zarr array that represents a GeoZarr grid mapping variable. + + The attributes of this array are defined in `GridMappingAttrs`. + + References + ---------- + """ + ... + T = TypeVar("T", bound=GroupSpec[Any, Any]) @@ -137,19 +154,7 @@ def check_valid_coordinates(model: T) -> T: return model -class DatasetAttrs(BaseModel): - """ - Attributes for a GeoZarr dataset. - - Attributes - ---------- - multiscales: MultiscaleAttrs - """ - - multiscales: Multiscales - - -class Dataset(GroupSpec[DatasetAttrs, GroupSpec[Any, Any] | DataArray]): +class Dataset(GroupSpec[DatasetAttrs, DataArray | GridMappingVariable]): """ A GeoZarr Dataset. """ @@ -168,3 +173,37 @@ def check_valid_coordinates(self) -> Self: The validated GeoZarr DataSet. """ return check_valid_coordinates(self) + + @model_validator(mode="after") + def validate_grid_mapping(self) -> Self: + if ( + self.members is not None + ): + missing_key = self.attributes.grid_mapping not in self.members + if missing_key: + raise ValueError( + f"Grid mapping variable '{self.attributes.grid_mapping}' not found in dataset members." 
+ ) + if not(isinstance(self.members[self.attributes.grid_mapping], GridMappingVariable)): + raise ValueError( + f"Grid mapping variable '{self.attributes.grid_mapping}' is not of type GridMappingVariable. " + "Found {type(self.members[self.attributes.grid_mapping])} instead." + ) + + return self + +class MultiscaleGroup(GroupSpec[MultiscaleAttrs, Dataset]): + """ + A GeoZarr Multiscale group. + + Attributes + ---------- + attributes: MultiscaleAttrs + Attributes for a multiscale GeoZarr group. + members: Mapping[str, Dataset] + A mapping of dataset names to GeoZarr Datasets. + ---------- + """ + # todo: define a validation routine that ensures the referential integrity between + # multiscale attributes and the actual datasets + ... \ No newline at end of file diff --git a/src/eopf_geozarr/data_api/geozarr/v3.py b/src/eopf_geozarr/data_api/geozarr/v3.py index 536e43f..114f440 100644 --- a/src/eopf_geozarr/data_api/geozarr/v3.py +++ b/src/eopf_geozarr/data_api/geozarr/v3.py @@ -1,3 +1,4 @@ +"""Zarr V3 Models for the GeoZarr Zarr Hierarchy.""" from __future__ import annotations from typing import Any, Self, TypeVar @@ -5,7 +6,7 @@ from pydantic import model_validator from pydantic_zarr.v3 import ArraySpec, GroupSpec -from eopf_geozarr.data_api.geozarr.common import BaseDataArrayAttrs, DatasetAttrs +from eopf_geozarr.data_api.geozarr.common import BaseDataArrayAttrs, DatasetAttrs, GridMappingAttrs, MultiscaleAttrs class DataArray(ArraySpec[BaseDataArrayAttrs]): @@ -21,12 +22,23 @@ class DataArray(ArraySpec[BaseDataArrayAttrs]): https://github.com/zarr-developers/geozarr-spec/blob/main/geozarr-spec.md#geozarr-dataarray """ + # The dimension names must be a tuple of strings dimension_names: tuple[str, ...] @property def array_dimensions(self) -> tuple[str, ...]: return self.dimension_names +class GridMappingVariable(ArraySpec[GridMappingAttrs]): + """ + A Zarr array that represents a GeoZarr grid mapping variable. 
+ + The attributes of this array are defined in `GridMappingAttrs`. + + References + ---------- + """ + ... T = TypeVar("T", bound=GroupSpec[Any, Any]) @@ -76,7 +88,7 @@ def check_valid_coordinates(model: T) -> T: return model -class Dataset(GroupSpec[DatasetAttrs, GroupSpec[Any, Any] | DataArray]): +class Dataset(GroupSpec[DatasetAttrs, DataArray]): """ A GeoZarr Dataset. """ @@ -101,8 +113,25 @@ def validate_grid_mapping(self) -> Self: if ( self.members is not None and self.attributes.grid_mapping not in self.members + and not isinstance(self.members[self.attributes.grid_mapping], GridMappingDataArray) ): raise ValueError( f"Grid mapping variable '{self.attributes.grid_mapping}' not found in dataset members." ) return self + +class MultiscaleGroup(GroupSpec[MultiscaleAttrs, Dataset]): + """ + A GeoZarr Multiscale group. + + Attributes + ---------- + attributes: MultiscaleAttrs + Attributes for a multiscale GeoZarr group. + members: Mapping[str, Dataset] + A mapping of dataset names to GeoZarr Datasets. + ---------- + """ + # todo: define a validation routine that ensures the referential integrity between + # multiscale attributes and the actual datasets + ... 
\ No newline at end of file From 9545a3841e4fd7335577b6e66bd99c48f3c3fffc Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Fri, 5 Sep 2025 17:37:06 +0200 Subject: [PATCH 16/25] use GridMappingVariable class, and pydantic experimental missing sentinel --- pyproject.toml | 2 +- src/eopf_geozarr/data_api/geozarr/common.py | 18 +++++++-------- src/eopf_geozarr/data_api/geozarr/v2.py | 25 ++++++++++++--------- src/eopf_geozarr/data_api/geozarr/v3.py | 22 ++++++++++++------ tests/test_data_api/test_v3.py | 6 ++--- 5 files changed, 42 insertions(+), 31 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 795181b..5dd503c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -28,7 +28,7 @@ classifiers = [ requires-python = ">=3.11" dependencies = [ "pydantic-zarr>=0.8.0", - "pydantic>=2.11", + "pydantic>=2.12.0a1", "zarr>=3.1.1", "xarray>=2025.7.1", "dask[array,distributed]>=2025.5.1", diff --git a/src/eopf_geozarr/data_api/geozarr/common.py b/src/eopf_geozarr/data_api/geozarr/common.py index 8e6bf08..917b8ba 100644 --- a/src/eopf_geozarr/data_api/geozarr/common.py +++ b/src/eopf_geozarr/data_api/geozarr/common.py @@ -7,7 +7,7 @@ from cf_xarray.utils import parse_cf_standard_name_table from pydantic import AfterValidator, BaseModel - +from pydantic.experimental.missing_sentinel import MISSING XARRAY_DIMS_KEY: Final = "_ARRAY_DIMENSIONS" @@ -98,7 +98,13 @@ class GridMappingAttrs(BaseModel, extra="allow"): grid_mapping_name : str The name of the grid mapping. + Extra fields are permitted. + Additional attributes might be present depending on the type of grid mapping. + + References + ---------- + https://cfconventions.org/Data/cf-conventions/cf-conventions-1.12/cf-conventions.html#grid-mappings-and-projections """ grid_mapping_name: str @@ -160,15 +166,8 @@ class DatasetAttrs(BaseModel, extra="allow"): Attributes for a GeoZarr dataset. A dataset is a collection of DataArrays. 
- - Attributes - ---------- - grid_mapping: str - The name of the grid mapping variable for this dataset. """ - - grid_mapping: str - + ... class MultiscaleAttrs(BaseModel, extra="allow"): """ @@ -191,3 +190,4 @@ class BaseDataArrayAttrs(BaseModel, extra="allow"): Attributes ---------- """ + grid_mapping: str | MISSING = MISSING diff --git a/src/eopf_geozarr/data_api/geozarr/v2.py b/src/eopf_geozarr/data_api/geozarr/v2.py index 195014e..e663d44 100644 --- a/src/eopf_geozarr/data_api/geozarr/v2.py +++ b/src/eopf_geozarr/data_api/geozarr/v2.py @@ -104,6 +104,7 @@ class GridMappingVariable(ArraySpec[GridMappingAttrs]): References ---------- + https://cfconventions.org/Data/cf-conventions/cf-conventions-1.12/cf-conventions.html#grid-mappings-and-projections """ ... @@ -178,17 +179,19 @@ def check_valid_coordinates(self) -> Self: def validate_grid_mapping(self) -> Self: if ( self.members is not None - ): - missing_key = self.attributes.grid_mapping not in self.members - if missing_key: - raise ValueError( - f"Grid mapping variable '{self.attributes.grid_mapping}' not found in dataset members." - ) - if not(isinstance(self.members[self.attributes.grid_mapping], GridMappingVariable)): - raise ValueError( - f"Grid mapping variable '{self.attributes.grid_mapping}' is not of type GridMappingVariable. " - "Found {type(self.members[self.attributes.grid_mapping])} instead." - ) + ): + for key, val in self.members.items(): + if hasattr(val.attributes, "grid_mapping") and val.attributes.grid_mapping is not None: + grid_mapping_var: str = val.attributes.grid_mapping + missing_key = grid_mapping_var not in self.members + if missing_key: + msg = f"Grid mapping variable {grid_mapping_var} declared by {key} was not found in dataset members." + raise ValueError(msg) + if not(isinstance(self.members[grid_mapping_var], GridMappingVariable)): + raise ValueError( + f"Grid mapping variable '{grid_mapping_var}' is not of type GridMappingVariable. 
" + f"Found {type(self.members[grid_mapping_var])} instead." + ) return self diff --git a/src/eopf_geozarr/data_api/geozarr/v3.py b/src/eopf_geozarr/data_api/geozarr/v3.py index 114f440..b06e445 100644 --- a/src/eopf_geozarr/data_api/geozarr/v3.py +++ b/src/eopf_geozarr/data_api/geozarr/v3.py @@ -7,7 +7,7 @@ from pydantic_zarr.v3 import ArraySpec, GroupSpec from eopf_geozarr.data_api.geozarr.common import BaseDataArrayAttrs, DatasetAttrs, GridMappingAttrs, MultiscaleAttrs - +from pydantic.experimental.missing_sentinel import MISSING class DataArray(ArraySpec[BaseDataArrayAttrs]): """ @@ -37,6 +37,7 @@ class GridMappingVariable(ArraySpec[GridMappingAttrs]): References ---------- + https://cfconventions.org/Data/cf-conventions/cf-conventions-1.12/cf-conventions.html#grid-mappings-and-projections """ ... @@ -88,7 +89,7 @@ def check_valid_coordinates(model: T) -> T: return model -class Dataset(GroupSpec[DatasetAttrs, DataArray]): +class Dataset(GroupSpec[DatasetAttrs, DataArray | GridMappingVariable]): """ A GeoZarr Dataset. """ @@ -112,12 +113,19 @@ def check_valid_coordinates(self) -> Self: def validate_grid_mapping(self) -> Self: if ( self.members is not None - and self.attributes.grid_mapping not in self.members - and not isinstance(self.members[self.attributes.grid_mapping], GridMappingDataArray) ): - raise ValueError( - f"Grid mapping variable '{self.attributes.grid_mapping}' not found in dataset members." - ) + for key, val in self.members.items(): + if hasattr(val.attributes, "grid_mapping") and val.attributes.grid_mapping is not MISSING: + grid_mapping_var: str = val.attributes.grid_mapping + missing_key = grid_mapping_var not in self.members + if missing_key: + msg = f"Grid mapping variable {grid_mapping_var} declared by {key} was not found in dataset members." + raise ValueError(msg) + if not(isinstance(self.members[grid_mapping_var], GridMappingVariable)): + raise ValueError( + f"Grid mapping variable '{grid_mapping_var}' is not of type GridMappingVariable. 
" + f"Found {type(self.members[grid_mapping_var])} instead." + ) return self class MultiscaleGroup(GroupSpec[MultiscaleAttrs, Dataset]): diff --git a/tests/test_data_api/test_v3.py b/tests/test_data_api/test_v3.py index e9d9887..d464156 100644 --- a/tests/test_data_api/test_v3.py +++ b/tests/test_data_api/test_v3.py @@ -6,7 +6,7 @@ from pydantic_zarr.core import tuplify_json from pydantic_zarr.v3 import ArraySpec, GroupSpec -from eopf_geozarr.data_api.geozarr.v3 import DataArray, Dataset, check_valid_coordinates +from eopf_geozarr.data_api.geozarr.v3 import DataArray, Dataset, MultiscaleGroup, check_valid_coordinates from .conftest import example_group @@ -79,5 +79,5 @@ def test_multiscale_attrs_round_trip() -> None: for key, val in source_group_members.items(): if isinstance(val, zarr.Group): if "multiscales" in val.attrs.asdict(): - model_json = GroupSpec.from_zarr(val).model_dump() - assert Dataset(**model_json).model_dump() == tuplify_json(model_json) + model_json = MultiscaleGroup.from_zarr(val).model_dump() + assert MultiscaleGroup(**model_json).model_dump() == tuplify_json(model_json) From 9a79771e171c8a3824e2b11f7c188f392809144d Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Fri, 5 Sep 2025 17:46:29 +0200 Subject: [PATCH 17/25] lint --- src/eopf_geozarr/data_api/geozarr/common.py | 4 +++ src/eopf_geozarr/data_api/geozarr/v2.py | 25 ++++++++++------ src/eopf_geozarr/data_api/geozarr/v3.py | 33 +++++++++++++++------ tests/test_data_api/test_v3.py | 10 +++++-- 4 files changed, 52 insertions(+), 20 deletions(-) diff --git a/src/eopf_geozarr/data_api/geozarr/common.py b/src/eopf_geozarr/data_api/geozarr/common.py index 917b8ba..df8610c 100644 --- a/src/eopf_geozarr/data_api/geozarr/common.py +++ b/src/eopf_geozarr/data_api/geozarr/common.py @@ -8,6 +8,7 @@ from cf_xarray.utils import parse_cf_standard_name_table from pydantic import AfterValidator, BaseModel from pydantic.experimental.missing_sentinel import MISSING + XARRAY_DIMS_KEY: Final = 
"_ARRAY_DIMENSIONS" @@ -167,8 +168,10 @@ class DatasetAttrs(BaseModel, extra="allow"): A dataset is a collection of DataArrays. """ + ... + class MultiscaleAttrs(BaseModel, extra="allow"): """ Attributes for Multiscale GeoZarr dataset. @@ -190,4 +193,5 @@ class BaseDataArrayAttrs(BaseModel, extra="allow"): Attributes ---------- """ + grid_mapping: str | MISSING = MISSING diff --git a/src/eopf_geozarr/data_api/geozarr/v2.py b/src/eopf_geozarr/data_api/geozarr/v2.py index e663d44..20f29be 100644 --- a/src/eopf_geozarr/data_api/geozarr/v2.py +++ b/src/eopf_geozarr/data_api/geozarr/v2.py @@ -31,7 +31,7 @@ class DataArrayAttrs(BaseDataArrayAttrs): # unless the variable is an auxiliary variable # see https://github.com/zarr-developers/geozarr-spec/blob/main/geozarr-spec.md#geozarr-coordinates array_dimensions: tuple[str, ...] = Field(alias="_ARRAY_DIMENSIONS") - + # this is necessary to serialize the `array_dimensions` attribute as `_ARRAY_DIMENSIONS` model_config = ConfigDict(serialize_by_alias=True) @@ -96,6 +96,7 @@ def check_array_dimensions(self) -> Self: def array_dimensions(self) -> tuple[str, ...]: return self.attributes.array_dimensions # type: ignore[no-any-return] + class GridMappingVariable(ArraySpec[GridMappingAttrs]): """ A Zarr array that represents a GeoZarr grid mapping variable. @@ -106,6 +107,7 @@ class GridMappingVariable(ArraySpec[GridMappingAttrs]): ---------- https://cfconventions.org/Data/cf-conventions/cf-conventions-1.12/cf-conventions.html#grid-mappings-and-projections """ + ... 
@@ -177,24 +179,28 @@ def check_valid_coordinates(self) -> Self: @model_validator(mode="after") def validate_grid_mapping(self) -> Self: - if ( - self.members is not None - ): + if self.members is not None: for key, val in self.members.items(): - if hasattr(val.attributes, "grid_mapping") and val.attributes.grid_mapping is not None: + if ( + hasattr(val.attributes, "grid_mapping") + and val.attributes.grid_mapping is not None + ): grid_mapping_var: str = val.attributes.grid_mapping missing_key = grid_mapping_var not in self.members if missing_key: msg = f"Grid mapping variable {grid_mapping_var} declared by {key} was not found in dataset members." raise ValueError(msg) - if not(isinstance(self.members[grid_mapping_var], GridMappingVariable)): + if not ( + isinstance(self.members[grid_mapping_var], GridMappingVariable) + ): raise ValueError( f"Grid mapping variable '{grid_mapping_var}' is not of type GridMappingVariable. " f"Found {type(self.members[grid_mapping_var])} instead." ) - + return self + class MultiscaleGroup(GroupSpec[MultiscaleAttrs, Dataset]): """ A GeoZarr Multiscale group. @@ -207,6 +213,7 @@ class MultiscaleGroup(GroupSpec[MultiscaleAttrs, Dataset]): A mapping of dataset names to GeoZarr Datasets. ---------- """ - # todo: define a validation routine that ensures the referential integrity between + + # todo: define a validation routine that ensures the referential integrity between # multiscale attributes and the actual datasets - ... \ No newline at end of file + ... 
diff --git a/src/eopf_geozarr/data_api/geozarr/v3.py b/src/eopf_geozarr/data_api/geozarr/v3.py index b06e445..c5442d8 100644 --- a/src/eopf_geozarr/data_api/geozarr/v3.py +++ b/src/eopf_geozarr/data_api/geozarr/v3.py @@ -1,13 +1,20 @@ """Zarr V3 Models for the GeoZarr Zarr Hierarchy.""" + from __future__ import annotations from typing import Any, Self, TypeVar from pydantic import model_validator +from pydantic.experimental.missing_sentinel import MISSING from pydantic_zarr.v3 import ArraySpec, GroupSpec -from eopf_geozarr.data_api.geozarr.common import BaseDataArrayAttrs, DatasetAttrs, GridMappingAttrs, MultiscaleAttrs -from pydantic.experimental.missing_sentinel import MISSING +from eopf_geozarr.data_api.geozarr.common import ( + BaseDataArrayAttrs, + DatasetAttrs, + GridMappingAttrs, + MultiscaleAttrs, +) + class DataArray(ArraySpec[BaseDataArrayAttrs]): """ @@ -29,6 +36,7 @@ class DataArray(ArraySpec[BaseDataArrayAttrs]): def array_dimensions(self) -> tuple[str, ...]: return self.dimension_names + class GridMappingVariable(ArraySpec[GridMappingAttrs]): """ A Zarr array that represents a GeoZarr grid mapping variable. @@ -39,8 +47,10 @@ class GridMappingVariable(ArraySpec[GridMappingAttrs]): ---------- https://cfconventions.org/Data/cf-conventions/cf-conventions-1.12/cf-conventions.html#grid-mappings-and-projections """ + ... 
+ T = TypeVar("T", bound=GroupSpec[Any, Any]) @@ -111,23 +121,27 @@ def check_valid_coordinates(self) -> Self: @model_validator(mode="after") def validate_grid_mapping(self) -> Self: - if ( - self.members is not None - ): + if self.members is not None: for key, val in self.members.items(): - if hasattr(val.attributes, "grid_mapping") and val.attributes.grid_mapping is not MISSING: + if ( + hasattr(val.attributes, "grid_mapping") + and val.attributes.grid_mapping is not MISSING + ): grid_mapping_var: str = val.attributes.grid_mapping missing_key = grid_mapping_var not in self.members if missing_key: msg = f"Grid mapping variable {grid_mapping_var} declared by {key} was not found in dataset members." raise ValueError(msg) - if not(isinstance(self.members[grid_mapping_var], GridMappingVariable)): + if not ( + isinstance(self.members[grid_mapping_var], GridMappingVariable) + ): raise ValueError( f"Grid mapping variable '{grid_mapping_var}' is not of type GridMappingVariable. " f"Found {type(self.members[grid_mapping_var])} instead." ) return self + class MultiscaleGroup(GroupSpec[MultiscaleAttrs, Dataset]): """ A GeoZarr Multiscale group. @@ -140,6 +154,7 @@ class MultiscaleGroup(GroupSpec[MultiscaleAttrs, Dataset]): A mapping of dataset names to GeoZarr Datasets. ---------- """ - # todo: define a validation routine that ensures the referential integrity between + + # todo: define a validation routine that ensures the referential integrity between # multiscale attributes and the actual datasets - ... \ No newline at end of file + ... 
diff --git a/tests/test_data_api/test_v3.py b/tests/test_data_api/test_v3.py index d464156..6ca049f 100644 --- a/tests/test_data_api/test_v3.py +++ b/tests/test_data_api/test_v3.py @@ -6,7 +6,11 @@ from pydantic_zarr.core import tuplify_json from pydantic_zarr.v3 import ArraySpec, GroupSpec -from eopf_geozarr.data_api.geozarr.v3 import DataArray, Dataset, MultiscaleGroup, check_valid_coordinates +from eopf_geozarr.data_api.geozarr.v3 import ( + DataArray, + MultiscaleGroup, + check_valid_coordinates, +) from .conftest import example_group @@ -80,4 +84,6 @@ def test_multiscale_attrs_round_trip() -> None: if isinstance(val, zarr.Group): if "multiscales" in val.attrs.asdict(): model_json = MultiscaleGroup.from_zarr(val).model_dump() - assert MultiscaleGroup(**model_json).model_dump() == tuplify_json(model_json) + assert MultiscaleGroup(**model_json).model_dump() == tuplify_json( + model_json + ) From 655b295d4be12da94c506324b8bd67fc173e8f1e Mon Sep 17 00:00:00 2001 From: Emmanuel Mathot Date: Tue, 9 Sep 2025 13:44:45 +0200 Subject: [PATCH 18/25] remove unnecessary dependency on types-simplejson in pre-commit config --- .pre-commit-config.yaml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 6364472..2639583 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -24,6 +24,5 @@ repos: language_version: python exclude: tests/.* additional_dependencies: - - types-simplejson - types-attrs - - pydantic>=2.11 \ No newline at end of file + - pydantic>=2.11 From f89d6ab886962f34fe279c1276b0d02bdce18141 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Wed, 17 Sep 2025 14:11:45 +0200 Subject: [PATCH 19/25] simplify models via protocols --- src/eopf_geozarr/data_api/geozarr/common.py | 87 ++++++++++++++++++++- src/eopf_geozarr/data_api/geozarr/v2.py | 54 ++----------- src/eopf_geozarr/data_api/geozarr/v3.py | 66 ++-------------- tests/test_data_api/test_common.py | 32 ++++++++ 4 files 
changed, 133 insertions(+), 106 deletions(-) diff --git a/src/eopf_geozarr/data_api/geozarr/common.py b/src/eopf_geozarr/data_api/geozarr/common.py index 8aa245a..b6e1f7a 100644 --- a/src/eopf_geozarr/data_api/geozarr/common.py +++ b/src/eopf_geozarr/data_api/geozarr/common.py @@ -3,10 +3,11 @@ import io import urllib import urllib.request -from typing import Annotated, Final, Literal +from typing import Annotated, Any, Final, Literal, Mapping, TypeVar from cf_xarray.utils import parse_cf_standard_name_table from pydantic import AfterValidator, BaseModel +from typing_extensions import Protocol, runtime_checkable XARRAY_DIMS_KEY: Final = "_ARRAY_DIMENSIONS" @@ -40,6 +41,70 @@ def get_cf_standard_names(url: str) -> tuple[str, ...]: CF_STANDARD_NAMES = get_cf_standard_names(url=CF_STANDARD_NAME_URL) +@runtime_checkable +class DataArrayLike(Protocol): + """ + This is a protocol that models the relevant properties of Zarr V2 and Zarr V3 DataArrays. + """ + + @property + def array_dimensions(self) -> tuple[str, ...]: ... + + shape: tuple[int, ...] + + +@runtime_checkable +class GroupLike(Protocol): + members: Mapping[str, Any] | None + attributes: Any + + +TGroupLike = TypeVar("TGroupLike", bound=GroupLike) + + +def check_valid_coordinates(model: TGroupLike) -> TGroupLike: + """ + Check if the coordinates of the DataArrayLike objects listed in GroupLike objects are valid. + + For each DataArrayLike in the model, we check the dimensions associated with the DataArrayLike. + For each dimension associated with a data variable, a DataArrayLike with the name of that data + variable must be present in the members of the group. + + Parameters + ---------- + model : GroupLike + An object that implements the GroupLike protocol. + + Returns + ------- + GroupLike + A GroupLike object with referentially valid coordinates. 
+ """ + if model.members is None: + raise ValueError("Model members cannot be None") + + arrays: dict[str, DataArrayLike] = { + k: v for k, v in model.members.items() if isinstance(v, DataArrayLike) + } + for key, array in arrays.items(): + for idx, dim in enumerate(array.array_dimensions): + if dim not in model.members: + raise ValueError( + f"Dimension '{dim}' for array '{key}' is not defined in the model members." + ) + member = model.members[dim] + if isinstance(member, GroupLike): + raise ValueError( + f"Dimension '{dim}' for array '{key}' should be a group. Found an array instead." + ) + if member.shape[0] != array.shape[idx]: + raise ValueError( + f"Dimension '{dim}' for array '{key}' has a shape mismatch: " + f"{member.shape[0]} != {array.shape[idx]}." + ) + return model + + def check_standard_name(name: str) -> str: """ Check if the standard name is valid according to the CF conventions. @@ -170,6 +235,26 @@ class DatasetAttrs(BaseModel, extra="allow"): grid_mapping: str +@runtime_checkable +class DatasetLike(Protocol): + attributes: DatasetAttrs + members: Mapping[str, DataArrayLike] | None + + +TDataSetLike = TypeVar("TDataSetLike", bound=DatasetLike) + + +def check_grid_mapping(model: TDataSetLike) -> TDataSetLike: + """ + Ensure that a grid mapping variable is present, and that it refers to a member of the model. + """ + if model.members is not None and model.attributes.grid_mapping not in model.members: + raise ValueError( + f"Grid mapping variable '{model.attributes.grid_mapping}' not found in dataset members." + ) + return model + + class MultiscaleDatasetAttrs(BaseModel, extra="allow"): """ Attributes for Multiscale GeoZarr dataset. 
diff --git a/src/eopf_geozarr/data_api/geozarr/v2.py b/src/eopf_geozarr/data_api/geozarr/v2.py index 1435b6b..8d21d92 100644 --- a/src/eopf_geozarr/data_api/geozarr/v2.py +++ b/src/eopf_geozarr/data_api/geozarr/v2.py @@ -3,7 +3,7 @@ from __future__ import annotations from collections.abc import Mapping -from typing import Any, Iterable, Literal, Self, TypeVar +from typing import Any, Iterable, Literal, Self from pydantic import BaseModel, ConfigDict, Field, model_validator from pydantic_zarr.v2 import ArraySpec, GroupSpec, auto_attributes @@ -12,6 +12,8 @@ XARRAY_DIMS_KEY, BaseDataArrayAttrs, Multiscales, + check_grid_mapping, + check_valid_coordinates, ) @@ -91,52 +93,6 @@ def array_dimensions(self) -> tuple[str, ...]: return self.attributes.array_dimensions # type: ignore[no-any-return] -T = TypeVar("T", bound=GroupSpec[Any, Any]) - - -def check_valid_coordinates(model: T) -> T: - """ - Check if the coordinates of the DataArrays listed in a GeoZarr DataSet are valid. - - For each DataArray in the model, we check the dimensions associated with the DataArray. - For each dimension associated with a data variable, an array with the name of that data variable - must be present in the members of the group. - - Parameters - ---------- - model : GroupSpec[Any, Any] - The GeoZarr DataArray model to check. - - Returns - ------- - GroupSpec[Any, Any] - The validated GeoZarr DataArray model. - """ - if model.members is None: - raise ValueError("Model members cannot be None") - - arrays: dict[str, DataArray] = { - k: v for k, v in model.members.items() if isinstance(v, DataArray) - } - for key, array in arrays.items(): - for idx, dim in enumerate(array.array_dimensions): - if dim not in model.members: - raise ValueError( - f"Dimension '{dim}' for array '{key}' is not defined in the model members." - ) - member = model.members[dim] - if isinstance(member, GroupSpec): - raise ValueError( - f"Dimension '{dim}' for array '{key}' should be a group. Found an array instead." 
- ) - if member.shape[0] != array.shape[idx]: - raise ValueError( - f"Dimension '{dim}' for array '{key}' has a shape mismatch: " - f"{member.shape[0]} != {array.shape[idx]}." - ) - return model - - class DatasetAttrs(BaseModel): """ Attributes for a GeoZarr dataset. @@ -168,3 +124,7 @@ def check_valid_coordinates(self) -> Self: The validated GeoZarr DataSet. """ return check_valid_coordinates(self) + + @model_validator(mode="after") + def check_grid_mapping(self) -> Self: + return check_grid_mapping(self) diff --git a/src/eopf_geozarr/data_api/geozarr/v3.py b/src/eopf_geozarr/data_api/geozarr/v3.py index 536e43f..837a8a8 100644 --- a/src/eopf_geozarr/data_api/geozarr/v3.py +++ b/src/eopf_geozarr/data_api/geozarr/v3.py @@ -1,11 +1,16 @@ from __future__ import annotations -from typing import Any, Self, TypeVar +from typing import Any, Self from pydantic import model_validator from pydantic_zarr.v3 import ArraySpec, GroupSpec -from eopf_geozarr.data_api.geozarr.common import BaseDataArrayAttrs, DatasetAttrs +from eopf_geozarr.data_api.geozarr.common import ( + BaseDataArrayAttrs, + DatasetAttrs, + check_grid_mapping, + check_valid_coordinates, +) class DataArray(ArraySpec[BaseDataArrayAttrs]): @@ -28,54 +33,6 @@ def array_dimensions(self) -> tuple[str, ...]: return self.dimension_names -T = TypeVar("T", bound=GroupSpec[Any, Any]) - - -def check_valid_coordinates(model: T) -> T: - """ - Check if the coordinates of the DataArrays listed in a GeoZarr DataSet are valid. - - For each DataArray in the model, we check the dimensions associated with the DataArray. - For each dimension associated with a data variable, an array with the name of that data variable - must be present in the members of the group, and the shape of that array must align with the - DataArray shape. - - - Parameters - ---------- - model : GroupSpec[Any, Any] - The GeoZarr DataArray model to check. - - Returns - ------- - GroupSpec[Any, Any] - The validated GeoZarr DataArray model. 
- """ - if model.members is None: - raise ValueError("Model members cannot be None") - - arrays: dict[str, DataArray] = { - k: v for k, v in model.members.items() if isinstance(v, DataArray) - } - for key, array in arrays.items(): - for idx, dim in enumerate(array.array_dimensions): - if dim not in model.members: - raise ValueError( - f"Dimension '{dim}' for array '{key}' is not defined in the model members." - ) - member = model.members[dim] - if isinstance(member, GroupSpec): - raise ValueError( - f"Dimension '{dim}' for array '{key}' should be a group. Found an array instead." - ) - if member.shape[0] != array.shape[idx]: - raise ValueError( - f"Dimension '{dim}' for array '{key}' has a shape mismatch: " - f"{member.shape[0]} != {array.shape[idx]}." - ) - return model - - class Dataset(GroupSpec[DatasetAttrs, GroupSpec[Any, Any] | DataArray]): """ A GeoZarr Dataset. @@ -98,11 +55,4 @@ def check_valid_coordinates(self) -> Self: @model_validator(mode="after") def validate_grid_mapping(self) -> Self: - if ( - self.members is not None - and self.attributes.grid_mapping not in self.members - ): - raise ValueError( - f"Grid mapping variable '{self.attributes.grid_mapping}' not found in dataset members." 
- ) - return self + return check_grid_mapping(self) diff --git a/tests/test_data_api/test_common.py b/tests/test_data_api/test_common.py index c911e90..a019969 100644 --- a/tests/test_data_api/test_common.py +++ b/tests/test_data_api/test_common.py @@ -1,18 +1,50 @@ from __future__ import annotations +from typing import Any + +import numpy as np import pytest from pydantic_zarr.core import tuplify_json +from pydantic_zarr.v2 import GroupSpec as GroupSpec_V2 from pydantic_zarr.v3 import GroupSpec as GroupSpec_V3 from eopf_geozarr.data_api.geozarr.common import ( CF_STANDARD_NAME_URL, + DataArrayLike, + GroupLike, check_standard_name, get_cf_standard_names, ) +from eopf_geozarr.data_api.geozarr.v2 import DataArray as DataArray_V2 +from eopf_geozarr.data_api.geozarr.v3 import DataArray as DataArray_V3 from .conftest import example_group +@pytest.mark.parametrize( + "obj", + [ + DataArray_V2.from_array( + np.arange(10), attributes={"_ARRAY_DIMENSIONS": ("time",)} + ), + DataArray_V3.from_array(np.arange(10), dimension_names=("time",)), + ], +) +def test_dataarraylike(obj: DataArray_V2 | DataArray_V3) -> None: + """ + Test that the DataArrayLike protocol works correctly + """ + assert isinstance(obj, DataArrayLike) + + +@pytest.mark.parametrize("obj", [GroupSpec_V2(), GroupSpec_V3()]) +def test_grouplike(obj: GroupSpec_V3[Any, Any] | GroupSpec_V2[Any, Any]) -> None: + """ + Test that the GroupLike protocol works correctly + """ + assert isinstance(obj, GroupLike) + + def test_get_cf_standard_names() -> None: """ Test the get_cf_standard_names function to ensure it retrieves the CF standard names correctly. 
From 986d5dc3ca5ead4225d4c600998ff3d187e51b44 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Wed, 17 Sep 2025 14:42:44 +0200 Subject: [PATCH 20/25] fix failing tests --- src/eopf_geozarr/data_api/geozarr/common.py | 2 +- src/eopf_geozarr/data_api/geozarr/v2.py | 9 +++++++++ src/eopf_geozarr/data_api/geozarr/v3.py | 9 +++++++++ 3 files changed, 19 insertions(+), 1 deletion(-) diff --git a/src/eopf_geozarr/data_api/geozarr/common.py b/src/eopf_geozarr/data_api/geozarr/common.py index 04fc04f..9667b57 100644 --- a/src/eopf_geozarr/data_api/geozarr/common.py +++ b/src/eopf_geozarr/data_api/geozarr/common.py @@ -269,7 +269,7 @@ def check_grid_mapping(model: TDataSetLike) -> TDataSetLike: return model -class MultiscaleDatasetAttrs(BaseModel, extra="allow"): +class MultiscaleGroupAttrs(BaseModel, extra="allow"): """ Attributes for Multiscale GeoZarr dataset. diff --git a/src/eopf_geozarr/data_api/geozarr/v2.py b/src/eopf_geozarr/data_api/geozarr/v2.py index 480b490..2f9606c 100644 --- a/src/eopf_geozarr/data_api/geozarr/v2.py +++ b/src/eopf_geozarr/data_api/geozarr/v2.py @@ -13,6 +13,7 @@ BaseDataArrayAttrs, DatasetAttrs, GridMappingAttrs, + MultiscaleGroupAttrs, check_grid_mapping, check_valid_coordinates, ) @@ -135,3 +136,11 @@ def check_valid_coordinates(self) -> Self: @model_validator(mode="after") def check_grid_mapping(self) -> Self: return check_grid_mapping(self) + + +class MultiscaleGroup(GroupSpec[MultiscaleGroupAttrs, DataArray | GroupSpec[Any, Any]]): + """ + A GeoZarr Multiscale Group. + """ + + ... 
diff --git a/src/eopf_geozarr/data_api/geozarr/v3.py b/src/eopf_geozarr/data_api/geozarr/v3.py index 58b0fce..20649c1 100644 --- a/src/eopf_geozarr/data_api/geozarr/v3.py +++ b/src/eopf_geozarr/data_api/geozarr/v3.py @@ -10,6 +10,7 @@ from eopf_geozarr.data_api.geozarr.common import ( BaseDataArrayAttrs, DatasetAttrs, + MultiscaleGroupAttrs, check_grid_mapping, check_valid_coordinates, ) @@ -59,3 +60,11 @@ def check_valid_coordinates(self) -> Self: @model_validator(mode="after") def validate_grid_mapping(self) -> Self: return check_grid_mapping(self) + + +class MultiscaleGroup(GroupSpec[MultiscaleGroupAttrs, DataArray | GroupSpec[Any, Any]]): + """ + A GeoZarr Multiscale Group. + """ + + ... From ad2dcad1b47f4b0a60b0dbb88a867a64e631ee4e Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Wed, 17 Sep 2025 15:12:05 +0200 Subject: [PATCH 21/25] add tile matrix limit json type --- src/eopf_geozarr/conversion/geozarr.py | 7 +++++-- src/eopf_geozarr/data_api/geozarr/common.py | 10 +++++++++- 2 files changed, 14 insertions(+), 3 deletions(-) diff --git a/src/eopf_geozarr/conversion/geozarr.py b/src/eopf_geozarr/conversion/geozarr.py index 7becea3..fd15467 100644 --- a/src/eopf_geozarr/conversion/geozarr.py +++ b/src/eopf_geozarr/conversion/geozarr.py @@ -26,6 +26,8 @@ from zarr.storage import StoreLike from zarr.storage._common import make_store_path +from eopf_geozarr.data_api.geozarr.common import TileMatrixLimitJSON + from . 
import fs_utils, utils @@ -1305,9 +1307,9 @@ def _load_existing_dataset(path: str) -> Optional[xr.Dataset]: def _create_tile_matrix_limits( overview_levels: List[Dict[str, Any]], tile_width: int -) -> Dict[str, Any]: +) -> dict[str, TileMatrixLimitJSON]: """Create tile matrix limits for overview levels.""" - tile_matrix_limits = {} + tile_matrix_limits: dict[str, TileMatrixLimitJSON] = {} for ol in overview_levels: level_str = str(ol["level"]) max_tile_col = int(np.ceil(ol["width"] / tile_width)) - 1 @@ -1320,6 +1322,7 @@ def _create_tile_matrix_limits( "minTileRow": 0, "maxTileRow": max_tile_row, } + return tile_matrix_limits diff --git a/src/eopf_geozarr/data_api/geozarr/common.py b/src/eopf_geozarr/data_api/geozarr/common.py index 9667b57..9ba2a69 100644 --- a/src/eopf_geozarr/data_api/geozarr/common.py +++ b/src/eopf_geozarr/data_api/geozarr/common.py @@ -3,7 +3,7 @@ import io import urllib import urllib.request -from typing import Annotated, Any, Final, Literal, Mapping, TypeVar +from typing import Annotated, Any, Final, Literal, Mapping, TypedDict, TypeVar from cf_xarray.utils import parse_cf_standard_name_table from pydantic import AfterValidator, BaseModel @@ -188,6 +188,14 @@ def array_dimensions(self) -> tuple[str, ...]: ... 
attributes: BaseDataArrayAttrs +class TileMatrixLimitJSON(TypedDict): + tileMatrix: str + minTileCol: int + minTileRow: int + maxTileCol: int + maxTileRow: int + + class TileMatrixLimit(BaseModel): """""" From f044808dcb33fa4c71d7d15d5f0b960ac8ec4dbf Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Wed, 17 Sep 2025 15:39:08 +0200 Subject: [PATCH 22/25] add json types --- src/eopf_geozarr/conversion/geozarr.py | 48 ++++++++----- src/eopf_geozarr/data_api/geozarr/common.py | 31 +------- src/eopf_geozarr/data_api/geozarr/types.py | 79 +++++++++++++++++++++ src/eopf_geozarr/data_api/geozarr/v2.py | 2 +- 4 files changed, 111 insertions(+), 49 deletions(-) create mode 100644 src/eopf_geozarr/data_api/geozarr/types.py diff --git a/src/eopf_geozarr/conversion/geozarr.py b/src/eopf_geozarr/conversion/geozarr.py index fd15467..809ac92 100644 --- a/src/eopf_geozarr/conversion/geozarr.py +++ b/src/eopf_geozarr/conversion/geozarr.py @@ -17,6 +17,7 @@ import os import shutil import time +from collections.abc import Hashable, Iterable, Sequence from typing import Any, Dict, List, Optional, Tuple import numpy as np @@ -26,7 +27,15 @@ from zarr.storage import StoreLike from zarr.storage._common import make_store_path -from eopf_geozarr.data_api.geozarr.common import TileMatrixLimitJSON +from eopf_geozarr.data_api.geozarr.types import ( + OverviewLevelJSON, + StandardXCoordAttrsJSON, + StandardYCoordAttrsJSON, + TileMatrixJSON, + TileMatrixLimitJSON, + TileMatrixSetJSON, + XarrayEncodingJSON, +) from . import fs_utils, utils @@ -612,7 +621,7 @@ def calculate_overview_levels( native_height: int, min_dimension: int = 256, tile_width: int = 256, -) -> List[Dict[str, Any]]: +) -> list[OverviewLevelJSON]: """ Calculate overview levels following COG /2 downsampling logic. 
@@ -632,7 +641,7 @@ def calculate_overview_levels( list List of overview level dictionaries """ - overview_levels = [] + overview_levels: list[OverviewLevelJSON] = [] level = 0 current_width = native_width current_height = native_height @@ -662,10 +671,10 @@ def calculate_overview_levels( def create_native_crs_tile_matrix_set( native_crs: Any, - native_bounds: Tuple[float, float, float, float], - overview_levels: List[Dict[str, Any]], + native_bounds: tuple[float, float, float, float], + overview_levels: Iterable[OverviewLevelJSON], group_prefix: Optional[str] = "", -) -> Dict[str, Any]: +) -> TileMatrixSetJSON: """ Create a custom Tile Matrix Set for the native CRS following GeoZarr spec. @@ -686,7 +695,7 @@ def create_native_crs_tile_matrix_set( Tile Matrix Set definition following OGC standard """ left, bottom, right, top = native_bounds - tile_matrices = [] + tile_matrices: list[TileMatrixJSON] = [] for overview in overview_levels: level = overview["level"] @@ -844,13 +853,13 @@ def create_overview_dataset_all_vars( def write_dataset_band_by_band_with_validation( ds: xr.Dataset, - existing_dataset: Optional[xr.Dataset], + existing_dataset: xr.Dataset | None, output_path: str, - encoding: Dict[str, Any], + encoding: dict[Hashable, XarrayEncodingJSON], max_retries: int, group_name: str, force_overwrite: bool = False, -) -> Tuple[bool, xr.Dataset]: +) -> tuple[bool, xr.Dataset]: """ Write dataset band by band with individual band validation. @@ -1215,9 +1224,10 @@ def _find_reference_crs(geozarr_groups: Dict[str, xr.Dataset]) -> Optional[str]: def _create_encoding( ds: xr.Dataset, compressor: Any, spatial_chunk: int -) -> Dict[str, Any]: +) -> dict[Hashable, XarrayEncodingJSON]: """Create encoding for dataset variables.""" - encoding: Dict[str, Any] = {} + encoding: dict[Hashable, XarrayEncodingJSON] = {} + chunking: tuple[int, ...] 
for var in ds.data_vars: if hasattr(ds[var].data, "chunks"): current_chunks = ds[var].chunks @@ -1257,9 +1267,9 @@ def _create_encoding( def _create_geozarr_encoding( ds: xr.Dataset, compressor: Any, spatial_chunk: int -) -> Dict[str, Any]: +) -> dict[Hashable, XarrayEncodingJSON]: """Create encoding for GeoZarr dataset variables.""" - encoding: Dict[str, Any] = {} + encoding: dict[Hashable, XarrayEncodingJSON] = {} for var in ds.data_vars: if utils.is_grid_mapping_variable(ds, var): encoding[var] = {"compressors": None} @@ -1287,7 +1297,7 @@ def _create_geozarr_encoding( return encoding -def _load_existing_dataset(path: str) -> Optional[xr.Dataset]: +def _load_existing_dataset(path: str) -> xr.Dataset | None: """Load existing dataset if it exists.""" try: if fs_utils.path_exists(path): @@ -1306,7 +1316,7 @@ def _load_existing_dataset(path: str) -> Optional[xr.Dataset]: def _create_tile_matrix_limits( - overview_levels: List[Dict[str, Any]], tile_width: int + overview_levels: Iterable[OverviewLevelJSON], tile_width: int ) -> dict[str, TileMatrixLimitJSON]: """Create tile matrix limits for overview levels.""" tile_matrix_limits: dict[str, TileMatrixLimitJSON] = {} @@ -1326,7 +1336,7 @@ def _create_tile_matrix_limits( return tile_matrix_limits -def _get_x_coord_attrs() -> Dict[str, Any]: +def _get_x_coord_attrs() -> StandardXCoordAttrsJSON: """Get standard attributes for x coordinate.""" return { "units": "m", @@ -1336,7 +1346,7 @@ def _get_x_coord_attrs() -> Dict[str, Any]: } -def _get_y_coord_attrs() -> Dict[str, Any]: +def _get_y_coord_attrs() -> StandardYCoordAttrsJSON: """Get standard attributes for y coordinate.""" return { "units": "m", @@ -1346,7 +1356,7 @@ def _get_y_coord_attrs() -> Dict[str, Any]: } -def _find_grid_mapping_var_name(ds: xr.Dataset, data_vars: List[str]) -> str: +def _find_grid_mapping_var_name(ds: xr.Dataset, data_vars: Sequence[str]) -> str: """Find the grid_mapping variable name from the dataset.""" grid_mapping_var_name = 
ds.attrs.get("grid_mapping", None) if not grid_mapping_var_name and data_vars: diff --git a/src/eopf_geozarr/data_api/geozarr/common.py b/src/eopf_geozarr/data_api/geozarr/common.py index 9ba2a69..ee21ee3 100644 --- a/src/eopf_geozarr/data_api/geozarr/common.py +++ b/src/eopf_geozarr/data_api/geozarr/common.py @@ -3,14 +3,14 @@ import io import urllib import urllib.request -from typing import Annotated, Any, Final, Literal, Mapping, TypedDict, TypeVar +from typing import Annotated, Any, Mapping, TypeVar from cf_xarray.utils import parse_cf_standard_name_table from pydantic import AfterValidator, BaseModel from pydantic.experimental.missing_sentinel import MISSING from typing_extensions import Protocol, runtime_checkable -XARRAY_DIMS_KEY: Final = "_ARRAY_DIMENSIONS" +from eopf_geozarr.data_api.geozarr.types import ResamplingMethod class BaseDataArrayAttrs(BaseModel, extra="allow"): @@ -103,25 +103,6 @@ def check_standard_name(name: str) -> str: CFStandardName = Annotated[str, AfterValidator(check_standard_name)] -ResamplingMethod = Literal[ - "nearest", - "average", - "bilinear", - "cubic", - "cubic_spline", - "lanczos", - "mode", - "max", - "min", - "med", - "sum", - "q1", - "q3", - "rms", - "gauss", -] -"""A string literal indicating a resampling method""" - @runtime_checkable class GroupLike(Protocol): @@ -188,14 +169,6 @@ def array_dimensions(self) -> tuple[str, ...]: ... 
attributes: BaseDataArrayAttrs -class TileMatrixLimitJSON(TypedDict): - tileMatrix: str - minTileCol: int - minTileRow: int - maxTileCol: int - maxTileRow: int - - class TileMatrixLimit(BaseModel): """""" diff --git a/src/eopf_geozarr/data_api/geozarr/types.py b/src/eopf_geozarr/data_api/geozarr/types.py new file mode 100644 index 0000000..0e0b7dd --- /dev/null +++ b/src/eopf_geozarr/data_api/geozarr/types.py @@ -0,0 +1,79 @@ +"""Types and constants for the GeoZarr data API.""" + +from typing import Any, Final, Literal, NotRequired, TypedDict + + +class TileMatrixLimitJSON(TypedDict): + tileMatrix: str + minTileCol: int + minTileRow: int + maxTileCol: int + maxTileRow: int + + +class XarrayEncodingJSON(TypedDict): + chunks: NotRequired[tuple[int, ...]] + compressors: Any + + +class StandardXCoordAttrsJSON(TypedDict): + units: Literal["m"] + long_name: Literal["x coordinate of projection"] + standard_name: Literal["projection_x_coordinate"] + _ARRAY_DIMENSIONS: list[Literal["x"]] + + +class StandardYCoordAttrsJSON(TypedDict): + units: Literal["m"] + long_name: Literal["y coordinate of projection"] + standard_name: Literal["projection_y_coordinate"] + _ARRAY_DIMENSIONS: list[Literal["y"]] + + +class OverviewLevelJSON(TypedDict): + level: int + zoom: int + width: int + height: int + scale_factor: int + + +class TileMatrixJSON(TypedDict): + id: str + scaleDenominator: float + cellSize: float + pointOfOrigin: tuple[float, float] | list[float] + tileWidth: int + tileHeight: int + matrixWidth: int + matrixHeight: int + + +class TileMatrixSetJSON(TypedDict): + id: str + title: str | None + crs: str | None + supportedCRS: str | None + orderedAxes: tuple[str, str] | None | list[str] + tileMatrices: tuple[TileMatrixJSON, ...] 
| list[TileMatrixJSON] + + +ResamplingMethod = Literal[ + "nearest", + "average", + "bilinear", + "cubic", + "cubic_spline", + "lanczos", + "mode", + "max", + "min", + "med", + "sum", + "q1", + "q3", + "rms", + "gauss", +] +"""A string literal indicating a resampling method""" +XARRAY_DIMS_KEY: Final = "_ARRAY_DIMENSIONS" diff --git a/src/eopf_geozarr/data_api/geozarr/v2.py b/src/eopf_geozarr/data_api/geozarr/v2.py index 2f9606c..c9f316e 100644 --- a/src/eopf_geozarr/data_api/geozarr/v2.py +++ b/src/eopf_geozarr/data_api/geozarr/v2.py @@ -9,7 +9,6 @@ from pydantic_zarr.v2 import ArraySpec, GroupSpec, auto_attributes from eopf_geozarr.data_api.geozarr.common import ( - XARRAY_DIMS_KEY, BaseDataArrayAttrs, DatasetAttrs, GridMappingAttrs, @@ -17,6 +16,7 @@ check_grid_mapping, check_valid_coordinates, ) +from eopf_geozarr.data_api.geozarr.types import XARRAY_DIMS_KEY class DataArrayAttrs(BaseDataArrayAttrs): From 9ddacf2b22c8220bec2343949665b8669b5cc224 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Wed, 17 Sep 2025 15:42:43 +0200 Subject: [PATCH 23/25] clean up types --- src/eopf_geozarr/conversion/geozarr.py | 34 +++++++++++++------------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/src/eopf_geozarr/conversion/geozarr.py b/src/eopf_geozarr/conversion/geozarr.py index 809ac92..e7330e2 100644 --- a/src/eopf_geozarr/conversion/geozarr.py +++ b/src/eopf_geozarr/conversion/geozarr.py @@ -17,8 +17,8 @@ import os import shutil import time -from collections.abc import Hashable, Iterable, Sequence -from typing import Any, Dict, List, Optional, Tuple +from collections.abc import Hashable, Iterable, Mapping, Sequence +from typing import Any, Dict, List, Tuple import numpy as np import xarray as xr @@ -42,13 +42,13 @@ def create_geozarr_dataset( dt_input: xr.DataTree, - groups: List[str], + groups: Iterable[str], output_path: str, spatial_chunk: int = 4096, min_dimension: int = 256, tile_width: int = 256, max_retries: int = 3, - crs_groups: 
Optional[List[str]] = None, + crs_groups: list[str] | None = None, ) -> xr.DataTree: """ Create a GeoZarr-spec 0.4 compliant dataset from EOPF data. @@ -111,8 +111,8 @@ def create_geozarr_dataset( def setup_datatree_metadata_geozarr_spec_compliant( - dt: xr.DataTree, groups: List[str] -) -> Dict[str, xr.Dataset]: + dt: xr.DataTree, groups: Iterable[str] +) -> dict[str, xr.Dataset]: """ Set up GeoZarr-spec compliant CF standard names and CRS information. @@ -128,7 +128,7 @@ def setup_datatree_metadata_geozarr_spec_compliant( dict[str, xr.Dataset] Dictionary of datasets with GeoZarr compliance applied """ - geozarr_groups = {} + geozarr_groups: dict[str, xr.Dataset] = {} grid_mapping_var_name = "spatial_ref" for key in groups: @@ -167,14 +167,14 @@ def setup_datatree_metadata_geozarr_spec_compliant( def iterative_copy( dt_input: xr.DataTree, - geozarr_groups: Dict[str, xr.Dataset], + geozarr_groups: dict[str, xr.Dataset], output_path: str, compressor: Any, spatial_chunk: int = 4096, min_dimension: int = 256, tile_width: int = 256, max_retries: int = 3, - crs_groups: Optional[List[str]] = None, + crs_groups: list[str] | None = None, ) -> xr.DataTree: """ Iteratively copy groups from original DataTree to GeoZarr DataTree. @@ -283,7 +283,7 @@ def iterative_copy( def prepare_dataset_with_crs_info( - ds: xr.Dataset, reference_crs: Optional[str] = None + ds: xr.Dataset, reference_crs: str | None = None ) -> xr.Dataset: """ Prepare a dataset with CRS information without writing it to disk. @@ -673,7 +673,7 @@ def create_native_crs_tile_matrix_set( native_crs: Any, native_bounds: tuple[float, float, float, float], overview_levels: Iterable[OverviewLevelJSON], - group_prefix: Optional[str] = "", + group_prefix: str | None = "", ) -> TileMatrixSetJSON: """ Create a custom Tile Matrix Set for the native CRS following GeoZarr spec. 
@@ -1038,8 +1038,8 @@ def write_dataset_band_by_band_with_validation( def consolidate_metadata( store: StoreLike, - path: Optional[str] = None, - zarr_format: Optional[zarr.core.common.ZarrFormat] = None, + path: str | None = None, + zarr_format: zarr.core.common.ZarrFormat | None = None, ) -> zarr.Group: """ Consolidate metadata of all nodes in a hierarchy. @@ -1065,8 +1065,8 @@ def consolidate_metadata( async def async_consolidate_metadata( store: StoreLike, - path: Optional[str] = None, - zarr_format: Optional[zarr.core.common.ZarrFormat] = None, + path: str | None = None, + zarr_format: zarr.core.common.ZarrFormat | None = None, ) -> zarr.core.group.AsyncGroup: """ Consolidate metadata of all nodes in a hierarchy asynchronously. @@ -1213,9 +1213,9 @@ def _add_geotransform(ds: xr.Dataset, grid_mapping_var: str) -> None: ds[grid_mapping_var].attrs["GeoTransform"] = transform_str -def _find_reference_crs(geozarr_groups: Dict[str, xr.Dataset]) -> Optional[str]: +def _find_reference_crs(geozarr_groups: Mapping[str, xr.Dataset]) -> str | None: """Find the reference CRS in the geozarr groups.""" - for key, group in geozarr_groups.items(): + for group in geozarr_groups.values(): if group.rio.crs: crs_string: str = group.rio.crs.to_string() return crs_string From 770bfc15e9af624a16b5428ac398bc8b4177ebfc Mon Sep 17 00:00:00 2001 From: Emmanuel Mathot Date: Thu, 25 Sep 2025 14:54:58 +0200 Subject: [PATCH 24/25] update pre-commit configuration and improve code formatting in tests and notebook --- .pre-commit-config.yaml | 4 +- src/eopf_geozarr/cli.py | 6 +- src/eopf_geozarr/tests/__init__.py | 96 +++---- src/eopf_geozarr/tests/test_cli_e2e.py | 48 ++-- .../tests/test_integration_sentinel1.py | 24 +- .../tests/test_integration_sentinel2.py | 6 +- .../tests/test_reprojection_validation.py | 18 +- src/notebooks/sentinel2-l2a-analysis.ipynb | 244 ++++++++++++++++++ 8 files changed, 345 insertions(+), 101 deletions(-) create mode 100644 
src/notebooks/sentinel2-l2a-analysis.ipynb diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 2639583..d2e8f08 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -5,13 +5,13 @@ repos: - id: validate-pyproject - repo: https://github.com/PyCQA/isort - rev: 5.12.0 + rev: 6.0.1 hooks: - id: isort language_version: python - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.8.4 + rev: v0.13.1 hooks: - id: ruff args: ["--fix"] diff --git a/src/eopf_geozarr/cli.py b/src/eopf_geozarr/cli.py index 311437a..5ff9b81 100644 --- a/src/eopf_geozarr/cli.py +++ b/src/eopf_geozarr/cli.py @@ -410,9 +410,9 @@ def render_node(node: Any, path: str = "", level: int = 0) -> str: # Generate HTML for this node node_html = f"""
-
+
- {'šŸ“' if children_count > 0 else 'šŸ“„'} + {"šŸ“" if children_count > 0 else "šŸ“„"} {node_name} ({summary}) @@ -882,7 +882,7 @@ def _generate_html_output(
Generated
-
{__import__('datetime').datetime.now().strftime('%Y-%m-%d %H:%M:%S')}
+
{__import__("datetime").datetime.now().strftime("%Y-%m-%d %H:%M:%S")}
diff --git a/src/eopf_geozarr/tests/__init__.py b/src/eopf_geozarr/tests/__init__.py index 22bc7fa..21c7784 100644 --- a/src/eopf_geozarr/tests/__init__.py +++ b/src/eopf_geozarr/tests/__init__.py @@ -21,9 +21,9 @@ def _verify_basic_structure(output_path: pathlib.Path, groups: list[str]) -> Non # Check that level 0 (native resolution) exists level_0_path = group_path / "0" assert level_0_path.exists(), f"Level 0 not found for {group}" - assert ( - level_0_path / "zarr.json" - ).exists(), f"Level 0 missing zarr.json for {group}" + assert (level_0_path / "zarr.json").exists(), ( + f"Level 0 missing zarr.json for {group}" + ) def _verify_geozarr_spec_compliance(output_path: pathlib.Path, group: str) -> None: @@ -49,12 +49,12 @@ def _verify_geozarr_spec_compliance(output_path: pathlib.Path, group: str) -> No # Check 1: _ARRAY_DIMENSIONS attributes (required by GeoZarr spec) for var_name in ds.data_vars: if var_name != "spatial_ref": # Skip grid_mapping variable - assert ( - "_ARRAY_DIMENSIONS" in ds[var_name].attrs - ), f"Missing _ARRAY_DIMENSIONS for {var_name} in {group}" - assert ds[var_name].attrs["_ARRAY_DIMENSIONS"] == list( - ds[var_name].dims - ), f"Incorrect _ARRAY_DIMENSIONS for {var_name} in {group}" + assert "_ARRAY_DIMENSIONS" in ds[var_name].attrs, ( + f"Missing _ARRAY_DIMENSIONS for {var_name} in {group}" + ) + assert ds[var_name].attrs["_ARRAY_DIMENSIONS"] == list(ds[var_name].dims), ( + f"Incorrect _ARRAY_DIMENSIONS for {var_name} in {group}" + ) print( f" āœ… _ARRAY_DIMENSIONS: {ds[var_name].attrs['_ARRAY_DIMENSIONS']}" ) @@ -62,9 +62,9 @@ def _verify_geozarr_spec_compliance(output_path: pathlib.Path, group: str) -> No # Check coordinates for coord_name in ds.coords: if coord_name not in ["spatial_ref"]: # Skip CRS coordinate - assert ( - "_ARRAY_DIMENSIONS" in ds[coord_name].attrs - ), f"Missing _ARRAY_DIMENSIONS for coordinate {coord_name} in {group}" + assert "_ARRAY_DIMENSIONS" in ds[coord_name].attrs, ( + f"Missing _ARRAY_DIMENSIONS for 
coordinate {coord_name} in {group}" + ) print( f" āœ… {coord_name} _ARRAY_DIMENSIONS: {ds[coord_name].attrs['_ARRAY_DIMENSIONS']}" ) @@ -72,9 +72,9 @@ def _verify_geozarr_spec_compliance(output_path: pathlib.Path, group: str) -> No # Check 2: CF standard names (required by GeoZarr spec) for var_name in ds.data_vars: if var_name != "spatial_ref": - assert ( - "standard_name" in ds[var_name].attrs - ), f"Missing standard_name for {var_name} in {group}" + assert "standard_name" in ds[var_name].attrs, ( + f"Missing standard_name for {var_name} in {group}" + ) assert ( ds[var_name].attrs["standard_name"] == "toa_bidirectional_reflectance" ), f"Incorrect standard_name for {var_name} in {group}" @@ -83,22 +83,22 @@ def _verify_geozarr_spec_compliance(output_path: pathlib.Path, group: str) -> No # Check 3: Grid mapping attributes (required by GeoZarr spec) for var_name in ds.data_vars: if var_name != "spatial_ref": - assert ( - "grid_mapping" in ds[var_name].attrs - ), f"Missing grid_mapping for {var_name} in {group}" - assert ( - ds[var_name].attrs["grid_mapping"] == "spatial_ref" - ), f"Incorrect grid_mapping for {var_name} in {group}" + assert "grid_mapping" in ds[var_name].attrs, ( + f"Missing grid_mapping for {var_name} in {group}" + ) + assert ds[var_name].attrs["grid_mapping"] == "spatial_ref", ( + f"Incorrect grid_mapping for {var_name} in {group}" + ) print(f" āœ… grid_mapping: {ds[var_name].attrs['grid_mapping']}") # Check 4: Spatial reference variable (as in notebook) assert "spatial_ref" in ds, f"Missing spatial_ref variable in {group}" - assert ( - "_ARRAY_DIMENSIONS" in ds["spatial_ref"].attrs - ), f"Missing _ARRAY_DIMENSIONS for spatial_ref in {group}" - assert ( - ds["spatial_ref"].attrs["_ARRAY_DIMENSIONS"] == [] - ), f"Incorrect _ARRAY_DIMENSIONS for spatial_ref in {group}" + assert "_ARRAY_DIMENSIONS" in ds["spatial_ref"].attrs, ( + f"Missing _ARRAY_DIMENSIONS for spatial_ref in {group}" + ) + assert ds["spatial_ref"].attrs["_ARRAY_DIMENSIONS"] == [], ( 
+ f"Incorrect _ARRAY_DIMENSIONS for spatial_ref in {group}" + ) print( f" āœ… spatial_ref _ARRAY_DIMENSIONS: {ds['spatial_ref'].attrs['_ARRAY_DIMENSIONS']}" ) @@ -124,9 +124,9 @@ def _verify_geozarr_spec_compliance(output_path: pathlib.Path, group: str) -> No if coord == "x" else "projection_y_coordinate" ) - assert ( - ds[coord].attrs["standard_name"] == expected_name - ), f"Incorrect standard_name for {coord} coordinate in {group}" + assert ds[coord].attrs["standard_name"] == expected_name, ( + f"Incorrect standard_name for {coord} coordinate in {group}" + ) print( f" āœ… {coord} standard_name: {ds[coord].attrs['standard_name']}" ) @@ -142,9 +142,9 @@ def _verify_multiscale_structure(output_path: pathlib.Path, group: str) -> None: # Check that at least one level exists (level 0 is always created) level_dirs = [d for d in group_path.iterdir() if d.is_dir() and d.name.isdigit()] - assert ( - len(level_dirs) >= 1 - ), f"Expected at least 1 overview level for {group}, found {len(level_dirs)}" + assert len(level_dirs) >= 1, ( + f"Expected at least 1 overview level for {group}, found {len(level_dirs)}" + ) print( f" Found {len(level_dirs)} overview levels: {sorted([d.name for d in level_dirs])}" ) @@ -156,9 +156,9 @@ def _verify_multiscale_structure(output_path: pathlib.Path, group: str) -> None: ds_0.close() if native_size >= 512: # Larger datasets should have multiple levels - assert ( - len(level_dirs) >= 2 - ), f"Expected multiple overview levels for large dataset {group} (size {native_size}), found {len(level_dirs)}" + assert len(level_dirs) >= 2, ( + f"Expected multiple overview levels for large dataset {group} (size {native_size}), found {len(level_dirs)}" + ) else: print(f" Small dataset (size {native_size}), single level is acceptable") @@ -176,9 +176,9 @@ def _verify_multiscale_structure(output_path: pathlib.Path, group: str) -> None: assert len(ds.data_vars) > 0, f"No data variables in {level_path}" # Verify that spatial dimensions exist - assert ( - "x" in 
ds.dims and "y" in ds.dims - ), f"Missing spatial dimensions in {level_path}" + assert "x" in ds.dims and "y" in ds.dims, ( + f"Missing spatial dimensions in {level_path}" + ) # Store shape for progression verification level_shapes[level_num] = (ds.dims["y"], ds.dims["x"]) @@ -198,12 +198,12 @@ def _verify_multiscale_structure(output_path: pathlib.Path, group: str) -> None: height_ratio = prev_height / curr_height width_ratio = prev_width / curr_width - assert ( - 1.8 <= height_ratio <= 2.2 - ), f"Height ratio between level {prev_level} and {level} should be ~2, got {height_ratio:.2f}" - assert ( - 1.8 <= width_ratio <= 2.2 - ), f"Width ratio between level {prev_level} and {level} should be ~2, got {width_ratio:.2f}" + assert 1.8 <= height_ratio <= 2.2, ( + f"Height ratio between level {prev_level} and {level} should be ~2, got {height_ratio:.2f}" + ) + assert 1.8 <= width_ratio <= 2.2, ( + f"Width ratio between level {prev_level} and {level} should be ~2, got {width_ratio:.2f}" + ) print( f" Level {prev_level}→{level} downsampling ratio: {height_ratio:.2f}x{width_ratio:.2f}" @@ -253,9 +253,9 @@ def _verify_rgb_data_access(output_path: pathlib.Path, groups: list[str]) -> Non blue_data = ds["b02"].values # Verify data shapes match - assert ( - red_data.shape == green_data.shape == blue_data.shape - ), f"RGB band shapes don't match in {group} level {level_num}" + assert red_data.shape == green_data.shape == blue_data.shape, ( + f"RGB band shapes don't match in {group} level {level_num}" + ) # Verify data is not empty assert red_data.size > 0, f"Empty red data in {group} level {level_num}" diff --git a/src/eopf_geozarr/tests/test_cli_e2e.py b/src/eopf_geozarr/tests/test_cli_e2e.py index 86148f3..96910b3 100644 --- a/src/eopf_geozarr/tests/test_cli_e2e.py +++ b/src/eopf_geozarr/tests/test_cli_e2e.py @@ -115,9 +115,9 @@ def test_cli_convert_real_sentinel2_data(self, temp_output_dir: str) -> None: cmd_info, capture_output=True, text=True, timeout=60 ) - assert ( - 
result_info.returncode == 0 - ), f"CLI info command failed: {result_info.stderr}" + assert result_info.returncode == 0, ( + f"CLI info command failed: {result_info.stderr}" + ) print("āœ… CLI info command succeeded") print(f"Info output: {result_info.stdout}") @@ -142,9 +142,9 @@ def test_cli_convert_real_sentinel2_data(self, temp_output_dir: str) -> None: cmd_validate, capture_output=True, text=True, timeout=60 ) - assert ( - result_validate.returncode == 0 - ), f"CLI validate command failed: {result_validate.stderr}" + assert result_validate.returncode == 0, ( + f"CLI validate command failed: {result_validate.stderr}" + ) print("āœ… CLI validate command succeeded") print(f"Validation output: {result_validate.stdout}") @@ -189,27 +189,27 @@ def _verify_converted_data_structure( first_var = data_vars[0] # Check _ARRAY_DIMENSIONS - assert ( - "_ARRAY_DIMENSIONS" in ds[first_var].attrs - ), f"Missing _ARRAY_DIMENSIONS in {first_var} for {group}" + assert "_ARRAY_DIMENSIONS" in ds[first_var].attrs, ( + f"Missing _ARRAY_DIMENSIONS in {first_var} for {group}" + ) # Check standard_name - assert ( - "standard_name" in ds[first_var].attrs - ), f"Missing standard_name in {first_var} for {group}" + assert "standard_name" in ds[first_var].attrs, ( + f"Missing standard_name in {first_var} for {group}" + ) # Check grid_mapping - assert ( - "grid_mapping" in ds[first_var].attrs - ), f"Missing grid_mapping in {first_var} for {group}" + assert "grid_mapping" in ds[first_var].attrs, ( + f"Missing grid_mapping in {first_var} for {group}" + ) print(f" āœ… GeoZarr compliance verified for {first_var}") # Check spatial_ref exists if "spatial_ref" in ds: - assert ( - "_ARRAY_DIMENSIONS" in ds["spatial_ref"].attrs - ), f"Missing _ARRAY_DIMENSIONS in spatial_ref for {group}" + assert "_ARRAY_DIMENSIONS" in ds["spatial_ref"].attrs, ( + f"Missing _ARRAY_DIMENSIONS in spatial_ref for {group}" + ) print(" āœ… spatial_ref variable verified") ds.close() @@ -282,9 +282,9 @@ def 
test_cli_crs_groups_option(self) -> None: ) assert result.returncode == 0, "Convert help command failed" assert "--crs-groups" in result.stdout, "--crs-groups option should be in help" - assert ( - "Groups that need CRS information added" in result.stdout - ), "Help text should be present" + assert "Groups that need CRS information added" in result.stdout, ( + "Help text should be present" + ) print("āœ… --crs-groups option appears in CLI help") @pytest.mark.slow @@ -434,9 +434,9 @@ def test_cli_crs_groups_empty_list(self, temp_output_dir: str) -> None: result = subprocess.run(cmd, capture_output=True, text=True, timeout=60) # Should succeed (empty crs_groups list is valid) - assert ( - result.returncode == 0 - ), f"CLI with empty --crs-groups failed: {result.stderr}" + assert result.returncode == 0, ( + f"CLI with empty --crs-groups failed: {result.stderr}" + ) assert "CRS groups: []" in result.stdout, "Should show empty CRS groups list" print("āœ… CLI with empty --crs-groups list works correctly") diff --git a/src/eopf_geozarr/tests/test_integration_sentinel1.py b/src/eopf_geozarr/tests/test_integration_sentinel1.py index 1987906..ea75547 100644 --- a/src/eopf_geozarr/tests/test_integration_sentinel1.py +++ b/src/eopf_geozarr/tests/test_integration_sentinel1.py @@ -284,18 +284,18 @@ def test_sentinel1_gcp_conversion( y_bounds = (ds_measurements.y.min().values, ds_measurements.y.max().values) # Should be within the original GCP bounds (15-18 lon, 39-41 lat) - assert ( - 14.5 <= x_bounds[0] <= 15.5 - ), f"X min bound {x_bounds[0]} outside expected range" - assert ( - 17.5 <= x_bounds[1] <= 18.5 - ), f"X max bound {x_bounds[1]} outside expected range" - assert ( - 38.5 <= y_bounds[0] <= 39.5 - ), f"Y min bound {y_bounds[0]} outside expected range" - assert ( - 40.5 <= y_bounds[1] <= 41.5 - ), f"Y max bound {y_bounds[1]} outside expected range" + assert 14.5 <= x_bounds[0] <= 15.5, ( + f"X min bound {x_bounds[0]} outside expected range" + ) + assert 17.5 <= 
x_bounds[1] <= 18.5, ( + f"X max bound {x_bounds[1]} outside expected range" + ) + assert 38.5 <= y_bounds[0] <= 39.5, ( + f"Y min bound {y_bounds[0]} outside expected range" + ) + assert 40.5 <= y_bounds[1] <= 41.5, ( + f"Y max bound {y_bounds[1]} outside expected range" + ) # Check multiscales 2 levels created: 0 (native, checked above) and 1 assert "1" in dt["measurements"] diff --git a/src/eopf_geozarr/tests/test_integration_sentinel2.py b/src/eopf_geozarr/tests/test_integration_sentinel2.py index 1ab21ba..074fb41 100644 --- a/src/eopf_geozarr/tests/test_integration_sentinel2.py +++ b/src/eopf_geozarr/tests/test_integration_sentinel2.py @@ -368,9 +368,9 @@ def test_performance_characteristics( prev_pixels = timing_data[i - 1]["pixels"] # Allow some flexibility, but generally expect fewer pixels at higher levels - assert ( - curr_pixels <= prev_pixels * 1.1 - ), f"Level {timing_data[i]['level']} has more pixels than level {timing_data[i-1]['level']}" + assert curr_pixels <= prev_pixels * 1.1, ( + f"Level {timing_data[i]['level']} has more pixels than level {timing_data[i - 1]['level']}" + ) print("āœ… Performance characteristics verified!") diff --git a/src/eopf_geozarr/tests/test_reprojection_validation.py b/src/eopf_geozarr/tests/test_reprojection_validation.py index 4ae65b7..dc356d0 100644 --- a/src/eopf_geozarr/tests/test_reprojection_validation.py +++ b/src/eopf_geozarr/tests/test_reprojection_validation.py @@ -193,9 +193,9 @@ def test_titiler_compatibility(): elif "spatial_ref" in ds_measurements: # CRS info should be in spatial_ref attributes spatial_ref = ds_measurements.spatial_ref - assert ( - "crs_wkt" in spatial_ref.attrs - ), "Missing CRS information in spatial_ref" + assert "crs_wkt" in spatial_ref.attrs, ( + "Missing CRS information in spatial_ref" + ) print( f" - CRS info found in spatial_ref: {spatial_ref.attrs.get('crs_wkt', 'N/A')[:50]}..." 
) @@ -266,14 +266,14 @@ def test_titiler_compatibility(): # Check CRS for overview (may be in spatial_ref variable) if ds_overview.rio.crs is not None: - assert ( - ds_overview.rio.crs.to_epsg() == 4326 - ), "Expected EPSG:4326 CRS for overview" + assert ds_overview.rio.crs.to_epsg() == 4326, ( + "Expected EPSG:4326 CRS for overview" + ) elif "spatial_ref" in ds_overview: spatial_ref_overview = ds_overview.spatial_ref - assert ( - "crs_wkt" in spatial_ref_overview.attrs - ), "Missing CRS information in overview spatial_ref" + assert "crs_wkt" in spatial_ref_overview.attrs, ( + "Missing CRS information in overview spatial_ref" + ) print(" - Overview CRS info found in spatial_ref") else: print(" - Warning: Overview CRS information not directly accessible") diff --git a/src/notebooks/sentinel2-l2a-analysis.ipynb b/src/notebooks/sentinel2-l2a-analysis.ipynb new file mode 100644 index 0000000..b445765 --- /dev/null +++ b/src/notebooks/sentinel2-l2a-analysis.ipynb @@ -0,0 +1,244 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# EOPF Zarr Explorer Sentinel-2 L2A Data Structure Analysis\n", + "\n", + "This notebook analyzes the EOPF (Earth Observation Processing Framework) Sentinel-2 L2A Zarr dataset structure\n", + "\n", + "## Objectives\n", + "\n", + "1. **Data Structure Analysis**: Complete inventory of EOPF Sentinel-2 L2A dataset structure, chunking, and compression\n", + "2. **Hierarchy Size Analysis**: Display the size of the hierarchy data structure with sums at group level\n", + "3. **Metadata Analysis**: Analyze current metadata conventions and CRS handling\n", + "4. **Performance Analysis**: Identify bottlenecks for web access patterns\n", + "5. 
**Optimization Recommendations**: Document findings with recommendations for optimization\n", + "\n", + "## Dataset\n", + "\n", + "**Target Dataset**: `s2l2_test.zarr`\n", + "- **Product Type**: Sentinel-2 Level 2A (Bottom-of-Atmosphere reflectance)\n", + "- **Processing Level**: L2A (atmospherically corrected)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Setup and Data Loading" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Import required libraries\n", + "import json\n", + "import warnings\n", + "import requests\n", + "from pathlib import Path\n", + "\n", + "# Import our analysis utilities\n", + "from eopf_analysis_utils import (\n", + " load_eopf_dataset,\n", + " analyze_hierarchy_sizes,\n", + " print_hierarchy_sizes\n", + ")\n", + "\n", + "# Suppress warnings for cleaner output\n", + "warnings.filterwarnings('ignore', category=UserWarning)\n", + "\n", + "print(\"āœ… Libraries and utilities imported successfully\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Dataset configuration\n", + "DATASET_URL = \"/home/emathot/Workspace/eopf-explorer/data-model/tests-output/eopf_geozarr/s2l2_test.zarr\"\n", + "\n", + "print(f\"🌐 Dataset URL: {DATASET_URL}\")\n", + "\n", + "from xarray.namedarray.parallelcompat import list_chunkmanagers\n", + "chunk_managers = list_chunkmanagers()\n", + "for cm in chunk_managers:\n", + " print(f\"Chunk manager: {cm}\")\n", + "\n", + "from dask.distributed import Client\n", + "client = Client() # set up local cluster on your laptop\n", + "client" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Load the EOPF dataset\n", + "zarr_store, datatree = load_eopf_dataset(DATASET_URL)\n", + "\n", + "print(f\"\\nšŸ“ Store keys: {list(zarr_store.keys())}\")\n", + "print(f\"🌳 Datatree groups: 
{list(datatree.groups)}\")\n", + "print(f\"šŸ“Š Datatree variables: {list(datatree.variables)}\")\n", + "\n", + "# Display basic structure\n", + "print(\"\\n=== Zarr Store Structure ===\")\n", + "print(zarr_store.tree())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Hierarchy Size Analysis\n", + "\n", + "This section analyzes the size of the hierarchy data structure with sums at group level, providing insights into data distribution and storage requirements." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Analyze hierarchy sizes with group-level sums using xarray\n", + "size_analysis = analyze_hierarchy_sizes(datatree, zarr_store)\n", + "\n", + "# Display hierarchy sizes in tree format\n", + "print_hierarchy_sizes(size_analysis, max_depth=4)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Additional size statistics\n", + "print(f\"\\nšŸ“ˆ DETAILED SIZE STATISTICS:\")\n", + "print(f\" Total arrays: {len(size_analysis['array_sizes'])}\")\n", + "print(f\" Total groups: {len(size_analysis['group_sizes'])}\")\n", + "print(f\" Hierarchy levels: {len(size_analysis['summary_by_level'])}\")\n", + "\n", + "# Show largest arrays\n", + "print(f\"\\nšŸ” LARGEST ARRAYS (Top 10):\")\n", + "sorted_arrays = sorted(size_analysis['array_sizes'].items(), \n", + " key=lambda x: x[1]['size_bytes'], reverse=True)\n", + "\n", + "for i, (array_path, array_info) in enumerate(sorted_arrays[:10]):\n", + " shape_str = \"x\".join(map(str, array_info['shape']))\n", + " print(f\" {i+1:2d}. 
{array_path}: {array_info['size_formatted']} ({shape_str}, {array_info['dtype']})\")\n", + "\n", + "# Show group size distribution\n", + "print(f\"\\nšŸ“Š GROUP SIZE DISTRIBUTION:\")\n", + "for level, level_info in sorted(size_analysis['summary_by_level'].items()):\n", + " avg_group_size = level_info['total_size_bytes'] / level_info['group_count'] if level_info['group_count'] > 0 else 0\n", + " from eopf_analysis_utils import format_size\n", + " print(f\" Level {level}: {level_info['group_count']} groups, avg size: {format_size(int(avg_group_size))}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Group-Level Size Summary\n", + "\n", + "Detailed breakdown of sizes by major groups in the EOPF hierarchy." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Analyze major group categories\n", + "print(\"\\nšŸ” MAJOR GROUP ANALYSIS:\")\n", + "\n", + "major_groups = ['measurements', 'quality', 'conditions']\n", + "for group_name in major_groups:\n", + " if group_name in size_analysis['group_sizes']:\n", + " group_info = size_analysis['group_sizes'][group_name]\n", + " print(f\"\\nšŸ“ {group_name.upper()} GROUP:\")\n", + " print(f\" Total size: {group_info['size_formatted']}\")\n", + " print(f\" Arrays: {group_info['array_count']}\")\n", + " print(f\" Subgroups: {group_info['subgroup_count']}\")\n", + " \n", + " # Show percentage of total dataset\n", + " percentage = (group_info['size_bytes'] / size_analysis['total_size_bytes']) * 100\n", + " print(f\" Percentage of total: {percentage:.1f}%\")\n", + "\n", + "# Show resolution group breakdown for measurements\n", + "print(f\"\\nšŸ“Š RESOLUTION GROUP BREAKDOWN:\")\n", + "resolution_groups = {}\n", + "for group_path, group_info in size_analysis['group_sizes'].items():\n", + " if 'measurements/reflectance/' in group_path:\n", + " parts = group_path.split('/')\n", + " if len(parts) >= 3 and parts[2].startswith('r') and 
parts[2].endswith('m'):\n", + " res_group = parts[2]\n", + " if res_group not in resolution_groups:\n", + " resolution_groups[res_group] = {\n", + " 'size_bytes': 0,\n", + " 'array_count': 0\n", + " }\n", + " resolution_groups[res_group]['size_bytes'] += group_info['size_bytes']\n", + " resolution_groups[res_group]['array_count'] += group_info['array_count']\n", + "\n", + "for res_group, info in sorted(resolution_groups.items()):\n", + " from eopf_analysis_utils import format_size\n", + " print(f\" {res_group}: {format_size(info['size_bytes'])} ({info['array_count']} arrays)\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Summary\n", + "\n", + "This analysis provides comprehensive insights into:\n", + "\n", + "1. **Hierarchy Structure Sizes**: Complete breakdown of data sizes at each level with group-level sums\n", + "2. **Data Distribution**: Understanding of how data is distributed across the hierarchy\n", + "3. **Storage Requirements**: Detailed size information for capacity planning\n", + "4. **Resolution Analysis**: Breakdown by spatial resolution groups (r10m, r20m, r60m)\n", + "5. 
**Group Categories**: Analysis of measurements, quality, and conditions groups\n", + "\n", + "The hierarchy size analysis is particularly useful for:\n", + "- Understanding data volume distribution across groups\n", + "- Identifying the largest data components\n", + "- Planning storage and bandwidth requirements\n", + "- Optimizing data access patterns\n", + "- Comparing sizes across different resolution levels" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": ".venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.0" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} From 6c7489ffa3099c6e70c2e5aee96b3008416d9f66 Mon Sep 17 00:00:00 2001 From: Emmanuel Mathot Date: Thu, 25 Sep 2025 14:56:40 +0200 Subject: [PATCH 25/25] remove obsolete Sentinel-2 L2A data structure analysis notebook --- src/notebooks/sentinel2-l2a-analysis.ipynb | 244 --------------------- 1 file changed, 244 deletions(-) delete mode 100644 src/notebooks/sentinel2-l2a-analysis.ipynb diff --git a/src/notebooks/sentinel2-l2a-analysis.ipynb b/src/notebooks/sentinel2-l2a-analysis.ipynb deleted file mode 100644 index b445765..0000000 --- a/src/notebooks/sentinel2-l2a-analysis.ipynb +++ /dev/null @@ -1,244 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# EOPF Zarr Explorer Sentinel-2 L2A Data Structure Analysis\n", - "\n", - "This notebook analyzes the EOPF (Earth Observation Processing Framework) Sentinel-2 L2A Zarr dataset structure\n", - "\n", - "## Objectives\n", - "\n", - "1. **Data Structure Analysis**: Complete inventory of EOPF Sentinel-2 L2A dataset structure, chunking, and compression\n", - "2. 
**Hierarchy Size Analysis**: Display the size of the hierarchy data structure with sums at group level\n", - "3. **Metadata Analysis**: Analyze current metadata conventions and CRS handling\n", - "4. **Performance Analysis**: Identify bottlenecks for web access patterns\n", - "5. **Optimization Recommendations**: Document findings with recommendations for optimization\n", - "\n", - "## Dataset\n", - "\n", - "**Target Dataset**: `s2l2_test.zarr`\n", - "- **Product Type**: Sentinel-2 Level 2A (Bottom-of-Atmosphere reflectance)\n", - "- **Processing Level**: L2A (atmospherically corrected)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Setup and Data Loading" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Import required libraries\n", - "import json\n", - "import warnings\n", - "import requests\n", - "from pathlib import Path\n", - "\n", - "# Import our analysis utilities\n", - "from eopf_analysis_utils import (\n", - " load_eopf_dataset,\n", - " analyze_hierarchy_sizes,\n", - " print_hierarchy_sizes\n", - ")\n", - "\n", - "# Suppress warnings for cleaner output\n", - "warnings.filterwarnings('ignore', category=UserWarning)\n", - "\n", - "print(\"āœ… Libraries and utilities imported successfully\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Dataset configuration\n", - "DATASET_URL = \"/home/emathot/Workspace/eopf-explorer/data-model/tests-output/eopf_geozarr/s2l2_test.zarr\"\n", - "\n", - "print(f\"🌐 Dataset URL: {DATASET_URL}\")\n", - "\n", - "from xarray.namedarray.parallelcompat import list_chunkmanagers\n", - "chunk_managers = list_chunkmanagers()\n", - "for cm in chunk_managers:\n", - " print(f\"Chunk manager: {cm}\")\n", - "\n", - "from dask.distributed import Client\n", - "client = Client() # set up local cluster on your laptop\n", - "client" - ] - }, - { - "cell_type": "code", - 
"execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Load the EOPF dataset\n", - "zarr_store, datatree = load_eopf_dataset(DATASET_URL)\n", - "\n", - "print(f\"\\nšŸ“ Store keys: {list(zarr_store.keys())}\")\n", - "print(f\"🌳 Datatree groups: {list(datatree.groups)}\")\n", - "print(f\"šŸ“Š Datatree variables: {list(datatree.variables)}\")\n", - "\n", - "# Display basic structure\n", - "print(\"\\n=== Zarr Store Structure ===\")\n", - "print(zarr_store.tree())" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Hierarchy Size Analysis\n", - "\n", - "This section analyzes the size of the hierarchy data structure with sums at group level, providing insights into data distribution and storage requirements." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Analyze hierarchy sizes with group-level sums using xarray\n", - "size_analysis = analyze_hierarchy_sizes(datatree, zarr_store)\n", - "\n", - "# Display hierarchy sizes in tree format\n", - "print_hierarchy_sizes(size_analysis, max_depth=4)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Additional size statistics\n", - "print(f\"\\nšŸ“ˆ DETAILED SIZE STATISTICS:\")\n", - "print(f\" Total arrays: {len(size_analysis['array_sizes'])}\")\n", - "print(f\" Total groups: {len(size_analysis['group_sizes'])}\")\n", - "print(f\" Hierarchy levels: {len(size_analysis['summary_by_level'])}\")\n", - "\n", - "# Show largest arrays\n", - "print(f\"\\nšŸ” LARGEST ARRAYS (Top 10):\")\n", - "sorted_arrays = sorted(size_analysis['array_sizes'].items(), \n", - " key=lambda x: x[1]['size_bytes'], reverse=True)\n", - "\n", - "for i, (array_path, array_info) in enumerate(sorted_arrays[:10]):\n", - " shape_str = \"x\".join(map(str, array_info['shape']))\n", - " print(f\" {i+1:2d}. 
{array_path}: {array_info['size_formatted']} ({shape_str}, {array_info['dtype']})\")\n", - "\n", - "# Show group size distribution\n", - "print(f\"\\nšŸ“Š GROUP SIZE DISTRIBUTION:\")\n", - "for level, level_info in sorted(size_analysis['summary_by_level'].items()):\n", - " avg_group_size = level_info['total_size_bytes'] / level_info['group_count'] if level_info['group_count'] > 0 else 0\n", - " from eopf_analysis_utils import format_size\n", - " print(f\" Level {level}: {level_info['group_count']} groups, avg size: {format_size(int(avg_group_size))}\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Group-Level Size Summary\n", - "\n", - "Detailed breakdown of sizes by major groups in the EOPF hierarchy." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Analyze major group categories\n", - "print(\"\\nšŸ” MAJOR GROUP ANALYSIS:\")\n", - "\n", - "major_groups = ['measurements', 'quality', 'conditions']\n", - "for group_name in major_groups:\n", - " if group_name in size_analysis['group_sizes']:\n", - " group_info = size_analysis['group_sizes'][group_name]\n", - " print(f\"\\nšŸ“ {group_name.upper()} GROUP:\")\n", - " print(f\" Total size: {group_info['size_formatted']}\")\n", - " print(f\" Arrays: {group_info['array_count']}\")\n", - " print(f\" Subgroups: {group_info['subgroup_count']}\")\n", - " \n", - " # Show percentage of total dataset\n", - " percentage = (group_info['size_bytes'] / size_analysis['total_size_bytes']) * 100\n", - " print(f\" Percentage of total: {percentage:.1f}%\")\n", - "\n", - "# Show resolution group breakdown for measurements\n", - "print(f\"\\nšŸ“Š RESOLUTION GROUP BREAKDOWN:\")\n", - "resolution_groups = {}\n", - "for group_path, group_info in size_analysis['group_sizes'].items():\n", - " if 'measurements/reflectance/' in group_path:\n", - " parts = group_path.split('/')\n", - " if len(parts) >= 3 and parts[2].startswith('r') and 
parts[2].endswith('m'):\n", - " res_group = parts[2]\n", - " if res_group not in resolution_groups:\n", - " resolution_groups[res_group] = {\n", - " 'size_bytes': 0,\n", - " 'array_count': 0\n", - " }\n", - " resolution_groups[res_group]['size_bytes'] += group_info['size_bytes']\n", - " resolution_groups[res_group]['array_count'] += group_info['array_count']\n", - "\n", - "for res_group, info in sorted(resolution_groups.items()):\n", - " from eopf_analysis_utils import format_size\n", - " print(f\" {res_group}: {format_size(info['size_bytes'])} ({info['array_count']} arrays)\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Summary\n", - "\n", - "This analysis provides comprehensive insights into:\n", - "\n", - "1. **Hierarchy Structure Sizes**: Complete breakdown of data sizes at each level with group-level sums\n", - "2. **Data Distribution**: Understanding of how data is distributed across the hierarchy\n", - "3. **Storage Requirements**: Detailed size information for capacity planning\n", - "4. **Resolution Analysis**: Breakdown by spatial resolution groups (r10m, r20m, r60m)\n", - "5. **Group Categories**: Analysis of measurements, quality, and conditions groups\n", - "\n", - "The hierarchy size analysis is particularly useful for:\n", - "- Understanding data volume distribution across groups\n", - "- Identifying the largest data components\n", - "- Planning storage and bandwidth requirements\n", - "- Optimizing data access patterns\n", - "- Comparing sizes across different resolution levels" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": ".venv", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.11.0" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -}