ENH: support exporting individual variables in DataCatalog.export_data #302

Merged (5 commits, Apr 3, 2023)
2 changes: 2 additions & 0 deletions docs/changelog.rst
@@ -15,6 +15,8 @@ Changed
 
 Added
 -----
+- Option to export individual variables from a data source and append to an existing data catalog in DataCatalog.export_data. PR #302
+
 
 Fixed
 -----
5 changes: 3 additions & 2 deletions examples/export_data.ipynb
@@ -160,7 +160,8 @@
     "outputs": [],
     "source": [
      "# List of data sources to export\n",
-     "source_list = [\"merit_hydro\", \"era5\"]\n",
+     "# NOTE that for era5 we only export the precip variable and for merit_hydro the elevtn and flwdir variables\n",
+     "source_list = [\"merit_hydro[elevtn,flwdir]\", \"era5[precip]\", \"vito\"]\n",
      "# Geographic extent\n",
      "bbox = [12.0, 46.0, 13.0, 46.5]\n",
      "# Time extent\n",
@@ -325,7 +326,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.10.5"
+   "version": "3.10.9"
   },
   "vscode": {
    "interpreter": {
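For reference, the notebook cell above exercises the new bracket syntax roughly as follows. This is a minimal sketch, assuming the default artifact data catalog used throughout the examples; the time_tuple value is illustrative, not taken from the notebook.

from hydromt import DataCatalog

data_catalog = DataCatalog()  # read artifacts by default

# Square brackets select individual variables per source (new in this PR);
# a plain name such as "vito" still exports all variables of that source.
data_catalog.export_data(
    data_root="./tmp_data_export",
    bbox=[12.0, 46.0, 13.0, 46.5],  # W, S, E, N
    time_tuple=("2010-02-01", "2010-02-10"),  # illustrative time extent
    source_names=["merit_hydro[elevtn,flwdir]", "era5[precip]", "vito"],
)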
20 changes: 16 additions & 4 deletions hydromt/data_adapter/geodataset.py
@@ -132,18 +132,30 @@ def to_file(
             Name of driver to read data with, see :py:func:`~hydromt.data_catalog.DataCatalog.get_geodataset`
         """
         obj = self.get_data(
-            bbox=bbox, time_tuple=time_tuple, variables=variables, logger=logger
+            bbox=bbox,
+            time_tuple=time_tuple,
+            variables=variables,
+            logger=logger,
+            single_var_as_array=variables is None,
         )
         if obj.vector.index.size == 0 or ("time" in obj.coords and obj.time.size == 0):
             return None, None
 
         if driver is None or driver == "netcdf":
             # always write netcdf
             driver = "netcdf"
-            fn_out = join(data_root, f"{data_name}.nc")
             dvars = [obj.name] if isinstance(obj, xr.DataArray) else obj.vector.vars
-            encoding = {k: {"zlib": True} for k in dvars}
-            obj.to_netcdf(fn_out, encoding=encoding)
+            if variables is None:
+                encoding = {k: {"zlib": True} for k in dvars}
+                fn_out = join(data_root, f"{data_name}.nc")
+                obj.to_netcdf(fn_out, encoding=encoding)
+            else:  # save per variable
+                if not os.path.isdir(join(data_root, data_name)):
+                    os.makedirs(join(data_root, data_name))
+                for var in dvars:
+                    fn_out = join(data_root, data_name, f"{var}.nc")
+                    obj[var].to_netcdf(fn_out, encoding={var: {"zlib": True}})
+                fn_out = join(data_root, data_name, "{variable}.nc")

(Contributor comment on lines +148 to +158: This section is not covered by tests yet. Check the coverage report.)

         elif driver == "zarr":
             fn_out = join(data_root, f"{data_name}.zarr")
             obj.to_zarr(fn_out, **kwargs)
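Read in isolation, the per-variable branch added above amounts to the following pattern. This is a standalone sketch with a toy dataset, not the adapter code itself; the same pattern returns in rasterdataset.py below.

import os
from os.path import join

import numpy as np
import xarray as xr

# Toy stand-in for the sliced source data.
ds = xr.Dataset(
    {"elevtn": ("x", np.random.rand(4)), "flwdir": ("x", np.random.rand(4))}
)
data_root, data_name = "export", "merit_hydro"

# One compressed NetCDF file per variable, in a folder named after the source...
os.makedirs(join(data_root, data_name), exist_ok=True)
for var in ds.data_vars:
    fn_out = join(data_root, data_name, f"{var}.nc")
    ds[var].to_netcdf(fn_out, encoding={var: {"zlib": True}})

# ...while the path recorded for the exported catalog entry is a
# "{variable}.nc" template that resolves back to the individual files.
fn_out = join(data_root, data_name, "{variable}.nc")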
29 changes: 19 additions & 10 deletions hydromt/data_adapter/rasterdataset.py
@@ -139,24 +139,33 @@ def to_file(
 
         try:
             obj = self.get_data(
-                bbox=bbox, time_tuple=time_tuple, variables=variables, logger=logger
+                bbox=bbox,
+                time_tuple=time_tuple,
+                variables=variables,
+                logger=logger,
+                single_var_as_array=variables is None,
             )
         except IndexError as err:  # out of bounds
             logger.warning(str(err))
             return None, None
 
         if driver is None:
-            driver = self.driver
-            if driver in ["raster_tindex", "raster"]:
-                # by default write 2D raster data to GeoTiff and 3D raster data to netcdf
-                driver = "netcdf" if len(obj.dims) == 3 else "GTiff"
+            # by default write 2D raster data to GeoTiff and 3D raster data to netcdf
+            driver = "netcdf" if len(obj.dims) == 3 else "GTiff"
         # write using various writers
         if driver in ["netcdf"]:  # TODO complete list
-            fn_out = join(data_root, f"{data_name}.nc")
-            if "encoding" not in kwargs:
-                dvars = [obj.name] if isinstance(obj, xr.DataArray) else obj.raster.vars
-                kwargs.update(encoding={k: {"zlib": True} for k in dvars})
-            obj.to_netcdf(fn_out, **kwargs)
+            dvars = [obj.name] if isinstance(obj, xr.DataArray) else obj.raster.vars
+            if variables is None:
+                encoding = {k: {"zlib": True} for k in dvars}
+                fn_out = join(data_root, f"{data_name}.nc")
+                obj.to_netcdf(fn_out, encoding=encoding, **kwargs)
+            else:  # save per variable
+                if not os.path.isdir(join(data_root, data_name)):
+                    os.makedirs(join(data_root, data_name))
+                for var in dvars:
+                    fn_out = join(data_root, data_name, f"{var}.nc")
+                    obj[var].to_netcdf(fn_out, encoding={var: {"zlib": True}}, **kwargs)
+                fn_out = join(data_root, data_name, "{variable}.nc")
         elif driver == "zarr":
             fn_out = join(data_root, f"{data_name}.zarr")
             obj.to_zarr(fn_out, **kwargs)
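The simplified driver default above picks the writer from the data's dimensionality alone. In sketch form, with toy xarray objects standing in for the sliced data:

import numpy as np
import xarray as xr

obj2d = xr.DataArray(np.zeros((2, 2)), dims=("y", "x"))
obj3d = xr.DataArray(np.zeros((3, 2, 2)), dims=("time", "y", "x"))

for obj in (obj2d, obj3d):
    # 3D raster data (e.g. with a time dimension) goes to NetCDF,
    # plain 2D rasters to GeoTIFF.
    driver = "netcdf" if len(obj.dims) == 3 else "GTiff"
    print(driver)  # prints GTiff, then netcdf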
36 changes: 31 additions & 5 deletions hydromt/data_catalog.py
@@ -468,6 +468,7 @@ def export_data(
         source_names: List = [],
         unit_conversion: bool = True,
         meta: Dict = {},
+        append: bool = False,
     ) -> None:
         """Export a data slice of each dataset and a data_catalog.yml file to disk.
 
@@ -481,23 +482,44 @@
             Start and end date of period of interest. By default the entire time period
             of the dataset is returned.
         source_names: list, optional
-            List of source names to export
+            List of source names to export; by default all sources are exported.
+            Specific variables can be selected by appending them to the source name in square brackets.
+            For example, to export all variables of 'source_name1' and only 'var1' and 'var2'
+            of 'source_name2', use source_names=['source_name1', 'source_name2[var1,var2]'].
         unit_conversion: boolean, optional
             If False skip unit conversion when parsing data from file, by default True.
         meta: dict, optional
             key-value pairs to add to the data catalog meta section, such as 'version', by default empty.
+        append: bool, optional
+            If True, append to an existing data catalog, by default False.
         """
         data_root = abspath(data_root)
         if not os.path.isdir(data_root):
             os.makedirs(data_root)
 
         # create copy of data with selected source names
-        sources = copy.deepcopy(self.sources)
+        source_vars = {}
         if len(source_names) > 0:
-            sources = {n: sources[n] for n in source_names}
+            sources = {}
+            for name in source_names:
+                # deduce variables from name
+                if "[" in name:
+                    variables = name.split("[")[-1].split("]")[0].split(",")
+                    name = name.split("[")[0]
+                    source_vars[name] = variables
+                sources[name] = copy.deepcopy(self.sources[name])
+        else:
+            sources = copy.deepcopy(self.sources)
+
+        # read existing data catalog if it exists
+        fn = join(data_root, "data_catalog.yml")
+        if isfile(fn) and append:
+            self.logger.info(f"Appending existing data catalog {fn}")
+            sources_out = DataCatalog(fn).sources
+        else:
+            sources_out = {}
 
         # export data and update sources
-        sources_out = {}
         for key, source in sources.items():
             try:
                 # read slice of source and write to file
@@ -510,6 +532,7 @@
                 fn_out, driver = source.to_file(
                     data_root=data_root,
                     data_name=key,
+                    variables=source_vars.get(key, None),
                     bbox=bbox,
                     time_tuple=time_tuple,
                     logger=self.logger,
@@ -529,14 +552,17 @@
                 source.filesystem = "local"
                 source.kwargs = {}
                 source.rename = {}
+                if key in sources_out:
+                    self.logger.warning(
+                        f"{key} already exists in data catalog and is overwritten."
+                    )
                 sources_out[key] = source
             except FileNotFoundError:
                 self.logger.warning(f"{key} file not found at {source.path}")
 
         # write data catalog to yml
         data_catalog_out = DataCatalog()
         data_catalog_out._sources = sources_out
-        fn = join(data_root, "data_catalog.yml")
         data_catalog_out.to_yml(fn, root="auto", meta=meta)
 
     def get_rasterdataset(
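The name parsing added above can be read on its own as the following sketch; parse_source_name is a hypothetical helper for illustration, not a hydromt function:

from typing import List, Optional, Tuple

def parse_source_name(name: str) -> Tuple[str, Optional[List[str]]]:
    """Split 'source[var1,var2]' into ('source', ['var1', 'var2']).

    A plain 'source' yields ('source', None), i.e. export all variables.
    """
    if "[" in name:
        variables = name.split("[")[-1].split("]")[0].split(",")
        return name.split("[")[0], variables
    return name, None

assert parse_source_name("era5[precip,temp]") == ("era5", ["precip", "temp"])
assert parse_source_name("vito") == ("vito", None)

With append=True, export_data now starts from the sources of an existing data_catalog.yml in data_root rather than from an empty dict, and warns when a newly exported source overwrites an existing key; the updated test below relies on this when it asserts "version: 2" after the second export.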
12 changes: 10 additions & 2 deletions tests/test_data_catalog.py
@@ -160,7 +160,7 @@ def test_export_global_datasets(tmpdir):
     time_tuple = ("2010-02-10", "2010-02-15")
     data_catalog = DataCatalog()  # read artifacts by default
     source_names = [
-        "era5",
+        "era5[precip,temp]",
         "grwl_mask",
         "modis_lai",
         "osm_coastlines",
@@ -175,12 +175,20 @@
         source_names=source_names,
         meta={"version": 1},
     )
+    # test append and overwrite source
+    data_catalog.export_data(
+        tmpdir,
+        bbox=bbox,
+        source_names=["corine"],
+        append=True,
+        meta={"version": 2},
+    )
     data_lib_fn = join(tmpdir, "data_catalog.yml")
     # check if meta is written
     with open(data_lib_fn, "r") as f:
         yml_list = f.readlines()
     assert yml_list[0].strip() == "meta:"
-    assert yml_list[1].strip() == "version: 1"
+    assert yml_list[1].strip() == "version: 2"
     assert yml_list[2].strip().startswith("root:")
     # check if data is parsed correctly
     data_catalog1 = DataCatalog(data_lib_fn)