In [1]:
import valenspy as vp
from pathlib import Path
from valenspy._utilities import load_yml
#Import re
import re
import os

## The Input Manager
The input manager makes accessing shared standard datasets easy.
The input manager is HPC cluster specific - see [dataset_PATHS.yml](../src/valenspy/ancilliary_data/dataset_PATHS.yml) for the available HPC cluster datasets and paths.

In [3]:
# Create the input manager for the 'hortense' machine; it is reused by the cells below.
manager = vp.InputManager(machine='hortense')

In [3]:
def _get_file_paths_walk(
    dataset_name,
    variables=["tas"],
    period=None,
    freq=None,
    region=None,
    path_identifiers=[],
):
    """Get the file paths for the specified dataset, variables, period and frequency.

    ``os.walk``-based variant of ``_get_file_paths``. A ``.nc`` file is selected
    when, for at least one requested variable, its full path matches the
    variable pattern and every additional filter.

    Parameters
    ----------
    dataset_name : str
        Selects the lookup table ``"<dataset_name>_lookup"``; "ERA5-Land" reuses
        the "ERA5" table. NOTE(review): the search root below is hard-coded and
        does not depend on this argument.
    variables : str or list of str
        Variable name(s). A variable present in the lookup table is matched by
        its raw long name, ``<raw_name>_`` or ``<variable>_``; any other
        variable is matched by its own name.
    period : int or sequence of int, optional
        A single year, or ``[start_year, end_year]`` (inclusive). A one-element
        sequence such as ``[2000]`` is treated as that single year.
    freq : str, optional
        Regex fragment that must occur in the path (e.g. "daily").
    region : str, optional
        Regex fragment that must occur in the path (e.g. "europe").
    path_identifiers : list of str
        Further regex fragments that must all occur in the path.

    Returns
    -------
    list of pathlib.Path
        Unique matching files, in no particular order.
    """
    # ERA5-Land has the same lookup table as ERA5.
    dataset_name_lookup = "ERA5" if dataset_name == "ERA5-Land" else dataset_name
    raw_LOOKUP = load_yml(f"{dataset_name_lookup}_lookup")

    dataset_path = Path("/dodrio/scratch/projects/2022_200/project_output/rcs/CORDEXBE2/postprocessing/")
    # Escaped so the root is always matched literally, whatever characters it contains.
    path_prefix = re.escape(str(dataset_path))

    # A single variable given as a string is converted to a list.
    if isinstance(variables, str):
        variables = [variables]

    # Filters that do not depend on the variable are built once, not per file.
    shared_components = list(path_identifiers)
    if period:
        if isinstance(period, int):
            year_regex = f"({period})"
        else:
            first_year = period[0]
            # A one-element period means "just that year" instead of an IndexError.
            last_year = period[1] if len(period) > 1 else first_year
            years = "|".join(str(year) for year in range(first_year, last_year + 1))
            year_regex = f"({years})"
        shared_components.append(year_regex)
    if freq:
        shared_components.append(freq)
    if region:
        shared_components.append(region)

    # One list of compiled patterns per variable; a file must satisfy all
    # patterns of at least one variable.
    patterns_per_variable = []
    for variable in variables:
        if variable not in raw_LOOKUP:
            var_regex = f"{variable}"
        else:
            raw_long_name = raw_LOOKUP[variable]["raw_long_name"]
            raw_name = raw_LOOKUP[variable]["raw_name"]
            var_regex = f"({raw_long_name}|{raw_name}_|{variable}_)"
        patterns_per_variable.append(
            [
                re.compile(f"{path_prefix}/.*{component}.*")
                for component in [var_regex] + shared_components
            ]
        )

    file_paths = set()
    for dirpath, _dirnames, filenames in os.walk(dataset_path):
        for filename in filenames:
            # Only NetCDF files are of interest, as in the glob-based variants.
            if not filename.endswith(".nc"):
                continue
            # os.walk yields bare file names; the directory must be joined back
            # on because the patterns are anchored at the search root.
            full_path = Path(dirpath) / filename
            candidate = str(full_path)
            if any(
                all(pattern.search(candidate) for pattern in patterns)
                for patterns in patterns_per_variable
            ):
                file_paths.add(full_path)
    return list(file_paths)

In [4]:
def _get_file_paths(
    dataset_name,
    variables=["tas"],
    period=None,
    freq=None,
    region=None,
    path_identifiers=[],
):
    """Get the file paths for the specified dataset, variables, period and frequency.

    Recursively globs the search root for ``.nc`` files and keeps a file when,
    for at least one requested variable, its path matches the variable pattern
    and every additional filter.

    Parameters
    ----------
    dataset_name : str
        Selects the lookup table ``"<dataset_name>_lookup"``; "ERA5-Land" reuses
        the "ERA5" table. NOTE(review): the search root below is hard-coded and
        does not depend on this argument.
    variables : str or list of str
        Variable name(s). A variable present in the lookup table is matched by
        its raw long name, ``<raw_name>_`` or ``<variable>_``; any other
        variable is matched by its own name.
    period : int or sequence of int, optional
        A single year, or ``[start_year, end_year]`` (inclusive). A one-element
        sequence such as ``[2000]`` is treated as that single year.
    freq : str, optional
        Regex fragment that must occur in the path (e.g. "daily").
    region : str, optional
        Regex fragment that must occur in the path (e.g. "europe").
    path_identifiers : list of str
        Further regex fragments that must all occur in the path.

    Returns
    -------
    list of pathlib.Path
        Unique matching files, in no particular order.
    """
    # ERA5-Land has the same lookup table as ERA5.
    dataset_name_lookup = "ERA5" if dataset_name == "ERA5-Land" else dataset_name
    raw_LOOKUP = load_yml(f"{dataset_name_lookup}_lookup")

    dataset_path = Path("/dodrio/scratch/projects/2022_200/project_output/rcs/CORDEXBE2/postprocessing/")
    # Escaped so the root is always matched literally, whatever characters it contains.
    path_prefix = re.escape(str(dataset_path))

    # A single variable given as a string is converted to a list.
    if isinstance(variables, str):
        variables = [variables]

    # Filters that do not depend on the variable are built once, not per file.
    shared_components = list(path_identifiers)
    if period:
        if isinstance(period, int):
            year_regex = f"({period})"
        else:
            first_year = period[0]
            # A one-element period means "just that year" instead of an IndexError.
            last_year = period[1] if len(period) > 1 else first_year
            years = "|".join(str(year) for year in range(first_year, last_year + 1))
            year_regex = f"({years})"
        shared_components.append(year_regex)
    if freq:
        shared_components.append(freq)
    if region:
        shared_components.append(region)

    # Compile the patterns once per variable instead of once per file and variable.
    patterns_per_variable = []
    for variable in variables:
        if variable not in raw_LOOKUP:
            var_regex = f"{variable}"
        else:
            raw_long_name = raw_LOOKUP[variable]["raw_long_name"]
            raw_name = raw_LOOKUP[variable]["raw_name"]
            var_regex = f"({raw_long_name}|{raw_name}_|{variable}_)"
        patterns_per_variable.append(
            [
                re.compile(f"{path_prefix}/.*{component}.*")
                for component in [var_regex] + shared_components
            ]
        )

    file_paths = set()
    for f in dataset_path.glob("**/*.nc"):
        candidate = str(f)
        # Keep the file as soon as one variable's full pattern set matches.
        if any(
            all(pattern.search(candidate) for pattern in patterns)
            for patterns in patterns_per_variable
        ):
            file_paths.add(f)
    return list(file_paths)

In [5]:
def _get_file_paths_old(
    dataset_name,
    variables=["tas"],
    period=None,
    freq=None,
    region=None,
    path_identifiers=[],
):
    """Collect the unique .nc files whose path matches all requested filters.

    For every variable, the search root is globbed recursively and a file is
    kept when its path matches the variable pattern together with each of the
    path identifiers, the year pattern, the frequency and the region (where
    given). "ERA5-Land" uses the "ERA5" lookup table. The result is
    de-duplicated and returned as a list in no particular order.
    """
    # ERA5-Land reuses the ERA5 lookup table.
    lookup_key = "ERA5" if dataset_name == "ERA5-Land" else dataset_name
    lookup_table = load_yml(f"{lookup_key}_lookup")

    search_root = Path("/dodrio/scratch/projects/2022_200/project_output/rcs/CORDEXBE2/postprocessing/")

    # A bare string is treated as a single variable.
    if isinstance(variables, str):
        variables = [variables]

    collected = []
    for variable in variables:
        if variable in lookup_table:
            raw_long_name = lookup_table[variable]["raw_long_name"]
            raw_name = lookup_table[variable]["raw_name"]
            var_regex = f"({raw_long_name}|{raw_name}_|{variable}_)"
        else:
            var_regex = f"{variable}"

        # Every entry of `filters` must be found in the path for a file to be kept.
        filters = [var_regex] + path_identifiers
        if period:
            if isinstance(period, int):
                filters.append(f"({period})")
            else:
                # Inclusive range of years, joined as regex alternatives.
                years = [str(year) for year in range(period[0], period[1] + 1)]
                filters.append(f"({'|'.join(years)})")
        if freq:
            filters.append(freq)
        if region:
            filters.append(region)

        for candidate in search_root.glob("**/*.nc"):
            candidate_str = str(candidate)
            if all(
                re.search(f"{search_root}/.*{flt}.*", candidate_str)
                for flt in filters
            ):
                collected.append(candidate)

    # The same file can match several variables; the set removes duplicates.
    return list(set(collected))

## Time testing the input manager


In [None]:
%%time
# Time a full load of ERA5 'tas' for 2000 (daily, europe), restricted to paths containing "min".
ds = manager.load_data("ERA5",["tas"], period=[2000],freq="daily",region="europe", path_identifiers=["min"])

File paths found:
/dodrio/scratch/projects/2022_200/external/era5/europe/2m_temperature/daily/era5-daily-europe-2m_temperature_min-2000.nc
The file is ValEnsPy CF compliant.
50.00% of the variables are ValEnsPy CF compliant
ValEnsPy CF compliant: ['tas']
Unknown to ValEnsPy: ['time_bnds']
CPU times: user 257 ms, sys: 620 ms, total: 878 ms
Wall time: 13.8 s


In [4]:
%%timeit -n 10
# Time only the file-path search; len() just consumes the returned list.
len(manager._get_file_paths("ERA5",["tas"],freq="daily",region="europe", path_identifiers=["min"]))

164 ms ± 8.18 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [5]:
%%timeit -n 10
# Same search through the `_get_file_paths_walk` variant, for comparison with the previous timing.
len(manager._get_file_paths_walk("ERA5",["tas"],freq="daily",region="europe", path_identifiers=["min"]))

102 ms ± 6.74 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


### Usage

Access the data using load_data function. 
Variables are **always** accessed through their CORDEX variable name - not the original dataset variable name.

E.g., in ERA5 the 2m temperature is called 't2m', but it is accessed here through the CORDEX variable name 'tas'.

The files that are found and used to load the data are printed and the CF_status of the ds is checked to help debug if the data is not loaded as expected.


In [4]:
# Request precipitation through its CORDEX name 'pr'; the trailing `ds` displays the loaded dataset.
ds = manager.load_data("ERA5",["pr"], period=[2000],freq="daily",region="europe", path_identifiers=["min"])
ds

File paths found:
/dodrio/scratch/projects/2022_200/project_input/External/observations/era5/europe/daily/total_precipitation/era5-daily_min-europe-total_precipitation-2000.nc
The file is ValEnsPy CF compliant.
50.00% of the variables are ValEnsPy CF compliant
ValEnsPy CF compliant: ['pr']
Unknown to ValEnsPy: ['time_bnds']


Unnamed: 0,Array,Chunk
Bytes,5.72 kiB,5.72 kiB
Shape,"(366, 2)","(366, 2)"
Dask graph,1 chunks in 2 graph layers,1 chunks in 2 graph layers
Data type,datetime64[ns] numpy.ndarray,datetime64[ns] numpy.ndarray
"Array Chunk Bytes 5.72 kiB 5.72 kiB Shape (366, 2) (366, 2) Dask graph 1 chunks in 2 graph layers Data type datetime64[ns] numpy.ndarray",2  366,

Unnamed: 0,Array,Chunk
Bytes,5.72 kiB,5.72 kiB
Shape,"(366, 2)","(366, 2)"
Dask graph,1 chunks in 2 graph layers,1 chunks in 2 graph layers
Data type,datetime64[ns] numpy.ndarray,datetime64[ns] numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,131.54 MiB,32.44 MiB
Shape,"(366, 163, 289)","(362, 81, 145)"
Dask graph,12 chunks in 6 graph layers,12 chunks in 6 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray
"Array Chunk Bytes 131.54 MiB 32.44 MiB Shape (366, 163, 289) (362, 81, 145) Dask graph 12 chunks in 6 graph layers Data type float64 numpy.ndarray",289  163  366,

Unnamed: 0,Array,Chunk
Bytes,131.54 MiB,32.44 MiB
Shape,"(366, 163, 289)","(362, 81, 145)"
Dask graph,12 chunks in 6 graph layers,12 chunks in 6 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray


Depending on your search criteria, metadata is added to the dataset.

With the load_data functionality you can also:
- load multiple variables simultaneously and/or
- **not** convert the ds to CF-compliant format (``cf_convert=False``) and/or
- add additional metadata to the ds by using the ``metadata_info`` dictionary

In [5]:
# Load two variables at once and attach custom metadata through metadata_info.
ds = manager.load_data("EOBS",["tas","pr"], path_identifiers=["mean"], cf_convert=True, metadata_info={"creator":"ME"})
ds

File paths found:
/dodrio/scratch/projects/2022_200/project_input/External/observations/EOBS/0.1deg/tg_ens_mean_0.1deg_reg_v29.0e.nc
/dodrio/scratch/projects/2022_200/project_input/External/observations/EOBS/0.1deg/rr_ens_mean_0.1deg_reg_v29.0e.nc
The file is ValEnsPy CF compliant.
100.00% of the variables are ValEnsPy CF compliant
ValEnsPy CF compliant: ['pr', 'tas']


Unnamed: 0,Array,Chunk
Bytes,263.87 GiB,255.11 MiB
Shape,"(27028, 930, 1409)","(102, 465, 705)"
Dask graph,1060 chunks in 17 graph layers,1060 chunks in 17 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray
"Array Chunk Bytes 263.87 GiB 255.11 MiB Shape (27028, 930, 1409) (102, 465, 705) Dask graph 1060 chunks in 17 graph layers Data type float64 numpy.ndarray",1409  930  27028,

Unnamed: 0,Array,Chunk
Bytes,263.87 GiB,255.11 MiB
Shape,"(27028, 930, 1409)","(102, 465, 705)"
Dask graph,1060 chunks in 17 graph layers,1060 chunks in 17 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,131.94 GiB,127.56 MiB
Shape,"(27028, 930, 1409)","(102, 465, 705)"
Dask graph,1060 chunks in 17 graph layers,1060 chunks in 17 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 131.94 GiB 127.56 MiB Shape (27028, 930, 1409) (102, 465, 705) Dask graph 1060 chunks in 17 graph layers Data type float32 numpy.ndarray",1409  930  27028,

Unnamed: 0,Array,Chunk
Bytes,131.94 GiB,127.56 MiB
Shape,"(27028, 930, 1409)","(102, 465, 705)"
Dask graph,1060 chunks in 17 graph layers,1060 chunks in 17 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray


## A peek inside the manager

The input_manager uses the path specified in the dataset_PATHS.yml for the given dataset and machine to search all .nc files and file paths that match the filtering requested. The following function is doing all the "magic":

In [6]:
manager._get_file_paths("EOBS",["tas","pr"], path_identifiers=["mean"]) # The magic happens here: the matching file paths are returned

[PosixPath('/dodrio/scratch/projects/2022_200/project_input/External/observations/EOBS/0.1deg/tg_ens_mean_0.1deg_reg_v29.0e.nc'),
 PosixPath('/dodrio/scratch/projects/2022_200/project_input/External/observations/EOBS/0.1deg/rr_ens_mean_0.1deg_reg_v29.0e.nc')]

Above, all paths that start with '/dodrio/scratch/projects/2022_200/project_input/External/observations/EOBS/' and contain both the original name (long or short) for 'tas' or 'pr' (in this case tg and rr) and "mean" are selected. Other options are:
- region: e.g. europe, belgium
- period: [start_year, end_year] possibly more is covered (note some datasets are not stored by year)!
- frequency: eg. yearly, daily, monthly
- other: Any other keywords to filter by are specified in the path_identifier. E.g. 'mean' for monthly mean data or "min" for minimum daily temperatures

For more information see the documentation on the input_manager and the load_data function.

## Tests of input manager 
Finding exceptional and rare cases

### 1. EOBS finding mean and spread files

In [7]:
manager._get_file_paths("EOBS",["tas"], path_identifiers=[],) # No path identifiers: both the ens_mean and ens_spread files match

[PosixPath('/dodrio/scratch/projects/2022_200/project_input/External/observations/EOBS/0.1deg/tg_ens_spread_0.1deg_reg_v29.0e.nc'),
 PosixPath('/dodrio/scratch/projects/2022_200/project_input/External/observations/EOBS/0.1deg/tg_ens_mean_0.1deg_reg_v29.0e.nc')]

### 2. ERA5 not giving only "mean" values when min and max also exist (due to the naming of the files!)
-> I don't think we need to resolve this here? But rather when we give the ERA5 data a new structure

In [8]:
manager._get_file_paths("ERA5",["tas"], period=[2000,2001],freq="daily",region="europe") # No path identifiers: the min and max files match alongside the plain daily files

[PosixPath('/dodrio/scratch/projects/2022_200/project_input/External/observations/era5/europe/daily/2m_temperature/era5-daily_min-europe-2m_temperature-2001.nc'),
 PosixPath('/dodrio/scratch/projects/2022_200/project_input/External/observations/era5/europe/2m_temperature/daily/era5-daily-europe-2m_temperature_max-2001.nc'),
 PosixPath('/dodrio/scratch/projects/2022_200/project_input/External/observations/era5/europe/2m_temperature/daily/era5-daily-europe-2m_temperature-2000.nc'),
 PosixPath('/dodrio/scratch/projects/2022_200/project_input/External/observations/era5/europe/2m_temperature/daily/era5-daily-europe-2m_temperature-2001.nc'),
 PosixPath('/dodrio/scratch/projects/2022_200/project_input/External/observations/era5/europe/2m_temperature/daily/era5-daily-europe-2m_temperature_min-2000.nc'),
 PosixPath('/dodrio/scratch/projects/2022_200/project_input/External/observations/era5/europe/2m_temperature/daily/era5-daily-europe-2m_temperature_min-2001.nc'),
 PosixPath('/dodrio/scratch/pr

Same is true for ERA5-Land, here different values for pr (mean, min and max) are loaded. 

In [9]:
# No path identifiers: the daily_min and daily_max files are picked up as well (see the printed paths).
ds = manager.load_data("ERA5-Land",[ "pr","hfls"], period=[2000,2001], freq="daily", region="belgium", path_identifiers=[])

File paths found:
/dodrio/scratch/projects/2022_200/project_input/External/observations/era5-land/belgium/daily/surface_latent_heat_flux/era5-land-daily_max-belgium-surface_latent_heat_flux-2000.nc
/dodrio/scratch/projects/2022_200/project_input/External/observations/era5-land/belgium/daily/total_precipitation/era5-land-daily_max-belgium-total_precipitation-2000.nc
/dodrio/scratch/projects/2022_200/project_input/External/observations/era5-land/belgium/daily/surface_latent_heat_flux/era5-land-daily_min-belgium-surface_latent_heat_flux-2000.nc
/dodrio/scratch/projects/2022_200/project_input/External/observations/era5-land/belgium/daily/total_precipitation/era5-land-daily_max-belgium-total_precipitation-2001.nc
/dodrio/scratch/projects/2022_200/project_input/External/observations/era5-land/belgium/daily/total_precipitation/era5-land-daily_min-belgium-total_precipitation-2000.nc
/dodrio/scratch/projects/2022_200/project_input/External/observations/era5-land/belgium/daily/surface_latent_hea