ENH: support exporting individual variables in DataCatalog.export_data #302

Merged (5 commits, Apr 3, 2023)
2 changes: 2 additions & 0 deletions docs/changelog.rst
@@ -15,6 +15,8 @@ Changed
 
 Added
 -----
+- Option to export individual variables from a data source and append to an existing data catalog in DataCatalog.export_data. PR #302
+
 
 Fixed
 -----
5 changes: 3 additions & 2 deletions examples/export_data.ipynb
@@ -160,7 +160,8 @@
     "outputs": [],
     "source": [
      "# List of data sources to export\n",
-     "source_list = [\"merit_hydro\", \"era5\"]\n",
+     "# NOTE that for era5 we only export the precip variable and for merit_hydro the elevtn and flwdir variables\n",
+     "source_list = [\"merit_hydro[elevtn,flwdir]\", \"era5[precip]\", \"vito\"]\n",
      "# Geographic extent\n",
      "bbox = [12.0, 46.0, 13.0, 46.5]\n",
      "# Time extent\n",
@@ -325,7 +326,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.10.5"
+   "version": "3.10.9"
   },
   "vscode": {
    "interpreter": {
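For reference, the notebook cell above exercises the new bracket syntax roughly as follows. This is a minimal sketch, assuming the default artifact data catalog used throughout the examples; the time_tuple value is illustrative, not taken from the notebook.

from hydromt import DataCatalog

data_catalog = DataCatalog()  # read artifacts by default

# Square brackets select individual variables per source (new in this PR);
# a plain name such as "vito" still exports all variables of that source.
data_catalog.export_data(
    data_root="./tmp_data_export",
    bbox=[12.0, 46.0, 13.0, 46.5],  # W, S, E, N
    time_tuple=("2010-02-01", "2010-02-10"),  # illustrative time extent
    source_names=["merit_hydro[elevtn,flwdir]", "era5[precip]", "vito"],
)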
20 changes: 16 additions & 4 deletions hydromt/data_adapter/geodataset.py
@@ -132,18 +132,30 @@ def to_file(
             Name of driver to read data with, see :py:func:`~hydromt.data_catalog.DataCatalog.get_geodataset`
         """
         obj = self.get_data(
-            bbox=bbox, time_tuple=time_tuple, variables=variables, logger=logger
+            bbox=bbox,
+            time_tuple=time_tuple,
+            variables=variables,
+            logger=logger,
+            single_var_as_array=variables is None,
         )
         if obj.vector.index.size == 0 or ("time" in obj.coords and obj.time.size == 0):
             return None, None
 
         if driver is None or driver == "netcdf":
             # always write netcdf
             driver = "netcdf"
-            fn_out = join(data_root, f"{data_name}.nc")
             dvars = [obj.name] if isinstance(obj, xr.DataArray) else obj.vector.vars
-            encoding = {k: {"zlib": True} for k in dvars}
-            obj.to_netcdf(fn_out, encoding=encoding)
+            if variables is None:
+                encoding = {k: {"zlib": True} for k in dvars}
+                fn_out = join(data_root, f"{data_name}.nc")
+                obj.to_netcdf(fn_out, encoding=encoding)
+            else:  # save per variable
+                if not os.path.isdir(join(data_root, data_name)):
+                    os.makedirs(join(data_root, data_name))
+                for var in dvars:
+                    fn_out = join(data_root, data_name, f"{var}.nc")
+                    obj[var].to_netcdf(fn_out, encoding={var: {"zlib": True}})
+                fn_out = join(data_root, data_name, "{variable}.nc")

(Contributor comment on lines +148 to +158: This section is not covered by tests yet. Check the coverage report.)

         elif driver == "zarr":
             fn_out = join(data_root, f"{data_name}.zarr")
             obj.to_zarr(fn_out, **kwargs)
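Read in isolation, the per-variable branch added above amounts to the following pattern. This is a standalone sketch with a toy dataset, not the adapter code itself; the same pattern returns in rasterdataset.py below.

import os
from os.path import join

import numpy as np
import xarray as xr

# Toy stand-in for the sliced source data.
ds = xr.Dataset(
    {"elevtn": ("x", np.random.rand(4)), "flwdir": ("x", np.random.rand(4))}
)
data_root, data_name = "export", "merit_hydro"

# One compressed NetCDF file per variable, in a folder named after the source...
os.makedirs(join(data_root, data_name), exist_ok=True)
for var in ds.data_vars:
    fn_out = join(data_root, data_name, f"{var}.nc")
    ds[var].to_netcdf(fn_out, encoding={var: {"zlib": True}})

# ...while the path recorded for the exported catalog entry is a
# "{variable}.nc" template that resolves back to the individual files.
fn_out = join(data_root, data_name, "{variable}.nc")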
29 changes: 19 additions & 10 deletions hydromt/data_adapter/rasterdataset.py
@@ -139,24 +139,33 @@ def to_file(
 
         try:
             obj = self.get_data(
-                bbox=bbox, time_tuple=time_tuple, variables=variables, logger=logger
+                bbox=bbox,
+                time_tuple=time_tuple,
+                variables=variables,
+                logger=logger,
+                single_var_as_array=variables is None,
             )
         except IndexError as err:  # out of bounds
             logger.warning(str(err))
             return None, None
 
         if driver is None:
-            driver = self.driver
-            if driver in ["raster_tindex", "raster"]:
-                # by default write 2D raster data to GeoTiff and 3D raster data to netcdf
-                driver = "netcdf" if len(obj.dims) == 3 else "GTiff"
+            # by default write 2D raster data to GeoTiff and 3D raster data to netcdf
+            driver = "netcdf" if len(obj.dims) == 3 else "GTiff"
         # write using various writers
         if driver in ["netcdf"]:  # TODO complete list
-            fn_out = join(data_root, f"{data_name}.nc")
-            if "encoding" not in kwargs:
-                dvars = [obj.name] if isinstance(obj, xr.DataArray) else obj.raster.vars
-                kwargs.update(encoding={k: {"zlib": True} for k in dvars})
-            obj.to_netcdf(fn_out, **kwargs)
+            dvars = [obj.name] if isinstance(obj, xr.DataArray) else obj.raster.vars
+            if variables is None:
+                encoding = {k: {"zlib": True} for k in dvars}
+                fn_out = join(data_root, f"{data_name}.nc")
+                obj.to_netcdf(fn_out, encoding=encoding, **kwargs)
+            else:  # save per variable
+                if not os.path.isdir(join(data_root, data_name)):
+                    os.makedirs(join(data_root, data_name))
+                for var in dvars:
+                    fn_out = join(data_root, data_name, f"{var}.nc")
+                    obj[var].to_netcdf(fn_out, encoding={var: {"zlib": True}}, **kwargs)
+                fn_out = join(data_root, data_name, "{variable}.nc")
         elif driver == "zarr":
             fn_out = join(data_root, f"{data_name}.zarr")
             obj.to_zarr(fn_out, **kwargs)
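The simplified driver default above picks the writer from the data's dimensionality alone. In sketch form, with toy xarray objects standing in for the sliced data:

import numpy as np
import xarray as xr

obj2d = xr.DataArray(np.zeros((2, 2)), dims=("y", "x"))
obj3d = xr.DataArray(np.zeros((3, 2, 2)), dims=("time", "y", "x"))

for obj in (obj2d, obj3d):
    # 3D raster data (e.g. with a time dimension) goes to NetCDF,
    # plain 2D rasters to GeoTIFF.
    driver = "netcdf" if len(obj.dims) == 3 else "GTiff"
    print(driver)  # prints GTiff, then netcdf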
36 changes: 31 additions & 5 deletions hydromt/data_catalog.py
@@ -468,6 +468,7 @@ def export_data(
         source_names: List = [],
         unit_conversion: bool = True,
         meta: Dict = {},
+        append: bool = False,
     ) -> None:
         """Export a data slice of each dataset and a data_catalog.yml file to disk.
 
@@ -481,23 +482,44 @@
             Start and end date of period of interest. By default the entire time period
             of the dataset is returned.
         source_names: list, optional
-            List of source names to export
+            List of source names to export; by default all sources are exported.
+            Specific variables can be selected by appending them to the source name in square brackets.
+            For example, to export all variables of 'source_name1' and only 'var1' and 'var2'
+            of 'source_name2', use source_names=['source_name1', 'source_name2[var1,var2]'].
         unit_conversion: boolean, optional
             If False skip unit conversion when parsing data from file, by default True.
         meta: dict, optional
             key-value pairs to add to the data catalog meta section, such as 'version', by default empty.
+        append: bool, optional
+            If True, append to an existing data catalog, by default False.
         """
         data_root = abspath(data_root)
         if not os.path.isdir(data_root):
             os.makedirs(data_root)
 
         # create copy of data with selected source names
-        sources = copy.deepcopy(self.sources)
+        source_vars = {}
         if len(source_names) > 0:
-            sources = {n: sources[n] for n in source_names}
+            sources = {}
+            for name in source_names:
+                # deduce variables from name
+                if "[" in name:
+                    variables = name.split("[")[-1].split("]")[0].split(",")
+                    name = name.split("[")[0]
+                    source_vars[name] = variables
+                sources[name] = copy.deepcopy(self.sources[name])
+        else:
+            sources = copy.deepcopy(self.sources)
+
+        # read existing data catalog if it exists
+        fn = join(data_root, "data_catalog.yml")
+        if isfile(fn) and append:
+            self.logger.info(f"Appending existing data catalog {fn}")
+            sources_out = DataCatalog(fn).sources
+        else:
+            sources_out = {}
 
         # export data and update sources
-        sources_out = {}
         for key, source in sources.items():
             try:
                 # read slice of source and write to file
@@ -510,6 +532,7 @@
                 fn_out, driver = source.to_file(
                     data_root=data_root,
                     data_name=key,
+                    variables=source_vars.get(key, None),
                     bbox=bbox,
                     time_tuple=time_tuple,
                     logger=self.logger,
@@ -529,14 +552,17 @@
                 source.filesystem = "local"
                 source.kwargs = {}
                 source.rename = {}
+                if key in sources_out:
+                    self.logger.warning(
+                        f"{key} already exists in data catalog and is overwritten."
+                    )
                 sources_out[key] = source
             except FileNotFoundError:
                 self.logger.warning(f"{key} file not found at {source.path}")
 
         # write data catalog to yml
         data_catalog_out = DataCatalog()
         data_catalog_out._sources = sources_out
-        fn = join(data_root, "data_catalog.yml")
         data_catalog_out.to_yml(fn, root="auto", meta=meta)
 
     def get_rasterdataset(
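The name parsing added above can be read on its own as the following sketch; parse_source_name is a hypothetical helper for illustration, not a hydromt function:

from typing import List, Optional, Tuple

def parse_source_name(name: str) -> Tuple[str, Optional[List[str]]]:
    """Split 'source[var1,var2]' into ('source', ['var1', 'var2']).

    A plain 'source' yields ('source', None), i.e. export all variables.
    """
    if "[" in name:
        variables = name.split("[")[-1].split("]")[0].split(",")
        return name.split("[")[0], variables
    return name, None

assert parse_source_name("era5[precip,temp]") == ("era5", ["precip", "temp"])
assert parse_source_name("vito") == ("vito", None)

With append=True, export_data now starts from the sources of an existing data_catalog.yml in data_root rather than from an empty dict, and warns when a newly exported source overwrites an existing key; the updated test below relies on this when it asserts "version: 2" after the second export.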
12 changes: 10 additions & 2 deletions tests/test_data_catalog.py
@@ -160,7 +160,7 @@ def test_export_global_datasets(tmpdir):
     time_tuple = ("2010-02-10", "2010-02-15")
     data_catalog = DataCatalog()  # read artifacts by default
     source_names = [
-        "era5",
+        "era5[precip,temp]",
         "grwl_mask",
         "modis_lai",
         "osm_coastlines",
@@ -175,12 +175,20 @@
         source_names=source_names,
         meta={"version": 1},
     )
+    # test append and overwrite source
+    data_catalog.export_data(
+        tmpdir,
+        bbox=bbox,
+        source_names=["corine"],
+        append=True,
+        meta={"version": 2},
+    )
     data_lib_fn = join(tmpdir, "data_catalog.yml")
     # check if meta is written
     with open(data_lib_fn, "r") as f:
         yml_list = f.readlines()
     assert yml_list[0].strip() == "meta:"
-    assert yml_list[1].strip() == "version: 1"
+    assert yml_list[1].strip() == "version: 2"
     assert yml_list[2].strip().startswith("root:")
     # check if data is parsed correctly
     data_catalog1 = DataCatalog(data_lib_fn)