Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
34 commits
Select commit Hold shift + click to select a range
d277d01
add geozarr model
d-v-b Jul 28, 2025
e798d98
Merge branch 'main' of https://github.com/EOPF-Explorer/data-model in…
d-v-b Jul 29, 2025
c940bd8
initial working pydantic models for geozarr
d-v-b Jul 29, 2025
553c7c7
initial working pydantic models for geozarr
d-v-b Jul 29, 2025
de36bf6
Merge branch 'main' of https://github.com/EOPF-Explorer/data-model in…
d-v-b Aug 7, 2025
105a3a5
Merge branch 'main' of https://github.com/eopf-explorer/data-model in…
d-v-b Aug 8, 2025
6a88bcb
bump pydantic min version and define serialization by alias per class
d-v-b Aug 8, 2025
bca5f5b
fix broken test
d-v-b Aug 8, 2025
38a721f
wip
d-v-b Aug 11, 2025
ad084c8
Merge branch 'feat/geozarr-model' of https://github.com/d-v-b/data-mo…
d-v-b Aug 11, 2025
dfdeff2
working mini roundtrip
d-v-b Aug 11, 2025
2088e33
refactor test layout
d-v-b Aug 11, 2025
cfcf7e1
refactor v2 and v3 data structures
d-v-b Aug 12, 2025
0faa082
Merge branch 'main' of https://github.com/EOPF-Explorer/data-model in…
d-v-b Aug 14, 2025
f8c5722
adapt to src layout, relax cf requirement
d-v-b Aug 15, 2025
d1a2e2d
add array_dimensions kwarg to from_array
d-v-b Aug 16, 2025
ad585a0
bump mypy python version
d-v-b Aug 18, 2025
3d11af4
lint
d-v-b Aug 18, 2025
6ef0a70
Merge branch 'main' of https://github.com/EOPF-Explorer/data-model in…
d-v-b Sep 3, 2025
667de5d
add grid mapping
d-v-b Sep 4, 2025
9802aba
update multiscale models
d-v-b Sep 5, 2025
9545a38
use GridMappingVariable class, and pydantic experimental missing sent…
d-v-b Sep 5, 2025
9a79771
lint
d-v-b Sep 5, 2025
655b295
remove unnecessary dependency on types-simplejson in pre-commit config
emmanuelmathot Sep 9, 2025
d250b8e
Merge branch 'main' of https://github.com/EOPF-Explorer/data-model in…
d-v-b Sep 17, 2025
f89d6ab
simplify models via protocols
d-v-b Sep 17, 2025
bcba9c0
handle changes to grid mapping semantics
d-v-b Sep 17, 2025
986d5dc
fix failing tests
d-v-b Sep 17, 2025
ad2dcad
add tile matrix limit json type
d-v-b Sep 17, 2025
f044808
add json types
d-v-b Sep 17, 2025
9ddacf2
clean up types
d-v-b Sep 17, 2025
aa244a9
Merge branch 'main' into pr/d-v-b/10
emmanuelmathot Sep 25, 2025
770bfc1
update pre-commit configuration and improve code formatting in tests …
emmanuelmathot Sep 25, 2025
6c7489f
remove obsolete Sentinel-2 L2A data structure analysis notebook
emmanuelmathot Sep 25, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 3 additions & 4 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -5,13 +5,13 @@ repos:
- id: validate-pyproject

- repo: https://github.com/PyCQA/isort
rev: 5.12.0
rev: 6.0.1
hooks:
- id: isort
language_version: python

- repo: https://github.com/astral-sh/ruff-pre-commit
rev: v0.8.4
rev: v0.13.1
hooks:
- id: ruff
args: ["--fix"]
Expand All @@ -24,6 +24,5 @@ repos:
language_version: python
exclude: tests/.*
additional_dependencies:
- types-simplejson
- types-attrs
- pydantic~=2.0
- pydantic>=2.11
3 changes: 2 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ classifiers = [
requires-python = ">=3.11"
dependencies = [
"pydantic-zarr>=0.8.0",
"pydantic>=2.12.0a1",
"zarr>=3.1.1",
"xarray>=2025.7.1",
"dask[array,distributed]>=2025.5.1",
Expand Down Expand Up @@ -111,7 +112,7 @@ use_parentheses = true
ensure_newline_before_comments = true

[tool.mypy]
python_version = "3.10"
python_version = "3.11"
warn_return_any = true
warn_unused_configs = true
disallow_untyped_defs = true
Expand Down
6 changes: 3 additions & 3 deletions src/eopf_geozarr/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -410,9 +410,9 @@ def render_node(node: Any, path: str = "", level: int = 0) -> str:
# Generate HTML for this node
node_html = f"""
<div class="tree-node" style="margin-left: {level * 20}px;">
<details class="tree-details" {'open' if level < 2 else ''}>
<details class="tree-details" {"open" if level < 2 else ""}>
<summary class="tree-summary">
<span class="tree-icon">{'📁' if children_count > 0 else '📄'}</span>
<span class="tree-icon">{"📁" if children_count > 0 else "📄"}</span>
<span class="tree-name">{node_name}</span>
<span class="tree-info">({summary})</span>
</summary>
Expand Down Expand Up @@ -882,7 +882,7 @@ def _generate_html_output(
</div>
<div class="header-info-item">
<div class="header-info-label">Generated</div>
<div class="header-info-value">{__import__('datetime').datetime.now().strftime('%Y-%m-%d %H:%M:%S')}</div>
<div class="header-info-value">{__import__("datetime").datetime.now().strftime("%Y-%m-%d %H:%M:%S")}</div>
</div>
</div>
</div>
Expand Down
Empty file.
264 changes: 264 additions & 0 deletions src/eopf_geozarr/data_api/geozarr/common.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,264 @@
"""Common utilities for GeoZarr data API."""

import io
import urllib
import urllib.request
from typing import Annotated, Any, Mapping, TypeVar

from cf_xarray.utils import parse_cf_standard_name_table
from pydantic import AfterValidator, BaseModel
from pydantic.experimental.missing_sentinel import MISSING
from typing_extensions import Protocol, runtime_checkable

from eopf_geozarr.data_api.geozarr.types import ResamplingMethod


class BaseDataArrayAttrs(BaseModel, extra="allow"):
    """
    Base attributes for a GeoZarr DataArray.

    Extra fields are permitted.

    Attributes
    ----------
    grid_mapping : str, optional
        Name of the grid mapping variable associated with this array.
        Defaults to the MISSING sentinel, i.e. no grid mapping is declared.
    """

    # MISSING (pydantic experimental sentinel) marks the attribute as absent,
    # rather than serializing an explicit null for undeclared grid mappings.
    grid_mapping: str | MISSING = MISSING


class GridMappingAttrs(BaseModel, extra="allow"):
    """
    Grid mapping attributes for a GeoZarr grid mapping variable.

    Attributes
    ----------
    grid_mapping_name : str
        The name of the grid mapping.

    Extra fields are permitted.

    Additional attributes might be present depending on the type of grid mapping,
    e.g. projection parameters for a specific CRS.

    References
    ----------
    https://cfconventions.org/Data/cf-conventions/cf-conventions-1.12/cf-conventions.html#grid-mappings-and-projections
    """

    # Required by CF conventions for any grid mapping variable.
    grid_mapping_name: str


def get_cf_standard_names(url: str) -> tuple[str, ...]:
    """
    Retrieve the CF standard name table and return the standard names as a tuple.

    Parameters
    ----------
    url : str
        URL of the CF standard name table XML document.

    Returns
    -------
    tuple[str, ...]
        The CF standard names defined in the table.

    Raises
    ------
    urllib.error.URLError
        If the table cannot be fetched from ``url``.
    """
    # Some servers reject requests that lack a User-Agent header.
    headers = {"User-Agent": "eopf_geozarr"}
    req = urllib.request.Request(url, headers=headers)

    # Read the entire response into memory; parse_cf_standard_name_table
    # needs a seekable file-like object. URLError propagates naturally —
    # the previous `except URLError as e: raise e` was a no-op re-raise.
    with urllib.request.urlopen(req) as response:
        content_fobj = io.BytesIO(response.read())

    _info, table, _aliases = parse_cf_standard_name_table(source=content_fobj)
    return tuple(table.keys())


# This is a URL to the CF standard names table.
CF_STANDARD_NAME_URL = (
    "https://raw.githubusercontent.com/cf-convention/cf-convention.github.io/"
    "master/Data/cf-standard-names/current/src/cf-standard-name-table.xml"
)

# this does IO against github. consider locally storing this data instead if fetching every time
# is problematic.
# NOTE(review): this fetch runs at import time, so importing this module requires
# network access and will fail offline — consider lazy loading or vendoring the table.
CF_STANDARD_NAMES = get_cf_standard_names(url=CF_STANDARD_NAME_URL)


def check_standard_name(name: str) -> str:
    """
    Check if the standard name is valid according to the CF conventions.

    Used as a pydantic ``AfterValidator``: it runs after the input has been
    validated as a string.

    Parameters
    ----------
    name : str
        The standard name to check.

    Returns
    -------
    str
        The validated standard name.

    Raises
    ------
    ValueError
        If the standard name is not valid.
    """
    # Guard-clause form: reject unknown names up front, then return.
    if name not in CF_STANDARD_NAMES:
        raise ValueError(
            f"Invalid standard name: {name}. This name was not found in the list of CF standard names."
        )
    return name


# String type whose values are validated against the CF standard name table.
CFStandardName = Annotated[str, AfterValidator(check_standard_name)]
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think we miss the grid_mapping attribute verification defaulted to spatial_ref scalar with the EPSG code. https://zarr.dev/geozarr-spec/documents/standard/template/geozarr-spec.html#_e15d59bd-f2ec-28e8-8016-4e541c95e10f

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I can add these, and if they are required we should make that more clear in the spec. right now the spec says

CF Conventions – Including attributes such as standard_name, units, axis, and grid_mapping to express spatiotemporal semantics and coordinate system properties.

but it isn't clear which CF attributes are required, optional, etc



@runtime_checkable
class GroupLike(Protocol):
    """
    Structural type for Zarr group models: named members plus attributes.

    Models the properties shared by Zarr V2 and V3 group representations.
    """

    # Mapping from member name to an array-like or group-like model;
    # None when the members are unknown / not loaded.
    members: Mapping[str, Any] | None
    attributes: Any


# Bound TypeVar so validators can accept and return the concrete input type.
TGroupLike = TypeVar("TGroupLike", bound=GroupLike)


def check_valid_coordinates(model: TGroupLike) -> TGroupLike:
    """
    Check if the coordinates of the DataArrayLike objects listed in GroupLike objects are valid.

    For each DataArrayLike in the model, we check the dimensions associated with the
    DataArrayLike. For each dimension associated with a data variable, a DataArrayLike
    with the name of that dimension must be present in the members of the group, and its
    length must match the corresponding axis of the referencing array.

    Parameters
    ----------
    model : GroupLike
        An object that implements the GroupLike protocol.

    Returns
    -------
    GroupLike
        A GroupLike object with referentially valid coordinates.

    Raises
    ------
    ValueError
        If the model has no members, a dimension name is not a member, a dimension
        resolves to a group, or a dimension's length mismatches the array's axis.
    """
    if model.members is None:
        raise ValueError("Model members cannot be None")

    arrays: dict[str, DataArrayLike] = {
        k: v for k, v in model.members.items() if isinstance(v, DataArrayLike)
    }
    for key, array in arrays.items():
        for idx, dim in enumerate(array.array_dimensions):
            if dim not in model.members:
                raise ValueError(
                    f"Dimension '{dim}' for array '{key}' is not defined in the model members."
                )
            member = model.members[dim]
            if isinstance(member, GroupLike):
                # Dimension coordinates must be arrays; groups have no shape.
                # (Previous message had the array/group roles inverted.)
                raise ValueError(
                    f"Dimension '{dim}' for array '{key}' should be an array. Found a group instead."
                )
            if member.shape[0] != array.shape[idx]:
                raise ValueError(
                    f"Dimension '{dim}' for array '{key}' has a shape mismatch: "
                    f"{member.shape[0]} != {array.shape[idx]}."
                )
    return model


@runtime_checkable
class DataArrayLike(Protocol):
    """
    This is a protocol that models the relevant properties of Zarr V2 and Zarr V3 DataArrays.
    """

    @property
    def array_dimensions(self) -> tuple[str, ...]:
        """Names of the array's dimensions, one per axis, in axis order."""
        ...

    # Length of each axis; parallel to array_dimensions.
    shape: tuple[int, ...]
    # Validated DataArray attributes (e.g. grid_mapping).
    attributes: BaseDataArrayAttrs


class TileMatrixLimit(BaseModel):
    """
    The range of tile indices actually populated for one tile matrix (zoom level),
    per OGC Two Dimensional Tile Matrix Set "tile matrix limits".
    """

    # Identifier of the tile matrix these limits apply to.
    tileMatrix: str
    minTileCol: int
    minTileRow: int
    maxTileCol: int
    maxTileRow: int


class TileMatrix(BaseModel):
    """
    A single tile matrix (zoom level) of a tile matrix set, per the OGC
    Two Dimensional Tile Matrix Set specification.
    """

    id: str
    scaleDenominator: float
    cellSize: float
    # Coordinates of the tiling origin in CRS units.
    pointOfOrigin: tuple[float, float]
    # Tile size in pixels.
    tileWidth: int
    tileHeight: int
    # Number of tiles along each axis of this matrix.
    matrixWidth: int
    matrixHeight: int


class TileMatrixSet(BaseModel):
    """
    A tile matrix set: a collection of tile matrices (zoom levels) defined in a
    common CRS, per the OGC Two Dimensional Tile Matrix Set specification.
    """

    id: str
    title: str | None = None
    crs: str | None = None
    # NOTE(review): crs and supportedCRS look like alternative spellings of the
    # same concept (different TMS spec versions) — confirm which one is canonical.
    supportedCRS: str | None = None
    orderedAxes: tuple[str, str] | None = None
    tileMatrices: tuple[TileMatrix, ...]


class Multiscales(BaseModel, extra="allow"):
    """
    Multiscale metadata for a GeoZarr dataset.

    Attributes
    ----------
    tile_matrix_set : TileMatrixSet
        The tile matrix set for the multiscale dataset.
    resampling_method : ResamplingMethod
        The name of the resampling method for the multiscale dataset.
    tile_matrix_limits : dict[str, TileMatrixLimit] | None, optional
        The tile matrix limits for the multiscale dataset, keyed by tile matrix id.
    """

    tile_matrix_set: TileMatrixSet
    resampling_method: ResamplingMethod
    # TODO: ensure that the keys match tile_matrix_set.tileMatrices[$index].id
    # TODO: ensure that the keys match the tileMatrix attribute
    tile_matrix_limits: dict[str, TileMatrixLimit] | None = None


class DatasetAttrs(BaseModel, extra="allow"):
    """
    Attributes for a GeoZarr dataset.

    A dataset is a collection of DataArrays. This class models the attributes of a dataset.

    No required attributes are defined; extra fields are permitted.
    """

    ...


@runtime_checkable
class DatasetLike(Protocol):
    """Structural type for a dataset: a named collection of DataArrayLike members."""

    # Mapping from variable name to array model; None when members are unknown.
    members: Mapping[str, DataArrayLike] | None


# Bound TypeVar so validators can accept and return the concrete input type.
TDataSetLike = TypeVar("TDataSetLike", bound=DatasetLike)


def check_grid_mapping(model: TDataSetLike) -> TDataSetLike:
    """
    Ensure that every declared grid mapping refers to a member of the model.

    Members whose ``grid_mapping`` attribute is the MISSING sentinel (i.e. no
    grid mapping declared) are skipped; previously the MISSING sentinel was
    compared against the member names directly, which made every member
    without a declared grid mapping fail validation.

    Parameters
    ----------
    model : DatasetLike
        An object that implements the DatasetLike protocol.

    Returns
    -------
    DatasetLike
        The validated model.

    Raises
    ------
    ValueError
        If a member declares a grid mapping variable that is not present
        in the dataset members.
    """
    if model.members is not None:
        for name, member in model.members.items():
            grid_mapping = member.attributes.grid_mapping
            # grid_mapping is optional; only validate the reference when declared.
            if grid_mapping is MISSING:
                continue
            if grid_mapping not in model.members:
                msg = f"Grid mapping variable '{grid_mapping}' declared by {name} was not found in dataset members"
                raise ValueError(msg)
    return model


class MultiscaleGroupAttrs(BaseModel, extra="allow"):
    """
    Attributes for a Multiscale GeoZarr dataset.

    A Multiscale dataset is a collection of Datasets, one per resolution level.

    Attributes
    ----------
    multiscales : Multiscales
        The multiscale metadata (tile matrix set, resampling method, limits).
    """

    multiscales: Multiscales
Loading