Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions changelog/28.feature.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
Deduplicate datasets if multiple timespans are specified.
The superset of the timespans is used to create the output files.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
7 changes: 1 addition & 6 deletions registry.txt
Original file line number Diff line number Diff line change
Expand Up @@ -8,22 +8,19 @@ CMIP6/C4MIP/MPI-M/MPI-ESM1-2-LR/esm-1pctCO2/r1i1p1f1/Amon/tas/gn/v20190815/tas_A
CMIP6/C4MIP/MPI-M/MPI-ESM1-2-LR/esm-1pctCO2/r1i1p1f1/Amon/tas/gn/v20190815/tas_Amon_MPI-ESM1-2-LR_esm-1pctCO2_r1i1p1f1_gn_187001-188912.nc d7086d4bd6d7934f8f194da01ad668840e8be1e5c1fa5056e81be410d3571856
CMIP6/C4MIP/MPI-M/MPI-ESM1-2-LR/esm-1pctCO2/r1i1p1f1/Amon/tas/gn/v20190815/tas_Amon_MPI-ESM1-2-LR_esm-1pctCO2_r1i1p1f1_gn_189001-190912.nc b0556ce505d39d62d01d4a62933476dbc001715e2a331edcba3a6170db075d89
CMIP6/C4MIP/MPI-M/MPI-ESM1-2-LR/esm-1pctCO2/r1i1p1f1/Amon/tas/gn/v20190815/tas_Amon_MPI-ESM1-2-LR_esm-1pctCO2_r1i1p1f1_gn_191001-191412.nc 46e8f4e5b5f2ff9cc018d6713a0e8d4666925e803a0e17886ff7137006791c04
CMIP6/CMIP/CSIRO/ACCESS-ESM1-5/1pctCO2/r1i1p1f1/Amon/tas/gn/v20191115/tas_Amon_ACCESS-ESM1-5_1pctCO2_r1i1p1f1_gn_010101-018012.nc d1e216bbe42192813609adbb2ee11920a1e321b81b06c4555ec90ed0e410bc42
CMIP6/CMIP/CSIRO/ACCESS-ESM1-5/1pctCO2/r1i1p1f1/Amon/tas/gn/v20191115/tas_Amon_ACCESS-ESM1-5_1pctCO2_r1i1p1f1_gn_015801-025012.nc 4006bcca6390aaab329fee31446d009cc05e92e17c40c89c974eb15504a27693
CMIP6/CMIP/CSIRO/ACCESS-ESM1-5/1pctCO2/r1i1p1f1/Amon/tas/gn/v20191115/tas_Amon_ACCESS-ESM1-5_1pctCO2_r1i1p1f1_gn_010101-025012.nc d29c1e0651d6c179ad5dc8ac8961ade5cd9456f506d9743255664e44360bac62
CMIP6/CMIP/CSIRO/ACCESS-ESM1-5/1pctCO2/r1i1p1f1/fx/areacella/gn/v20191115/areacella_fx_ACCESS-ESM1-5_1pctCO2_r1i1p1f1_gn.nc 07ae2f59188889030a7c453bca5f8c6a19f22f1b544b3987ba50a7a4f306c82d
CMIP6/CMIP/CSIRO/ACCESS-ESM1-5/abrupt-4xCO2/r1i1p1f1/Amon/rlut/gn/v20191115/rlut_Amon_ACCESS-ESM1-5_abrupt-4xCO2_r1i1p1f1_gn_010101-012512.nc 0da7114197033589e61a7ed6f53412e0727b540e5da9d1b7ed6a51ee2a4629c6
CMIP6/CMIP/CSIRO/ACCESS-ESM1-5/abrupt-4xCO2/r1i1p1f1/Amon/rsdt/gn/v20191115/rsdt_Amon_ACCESS-ESM1-5_abrupt-4xCO2_r1i1p1f1_gn_010101-012512.nc 3b00f242368a30fabe0db6a8789cf06cacaa0a3ff3726ade731f2ee488a751c5
CMIP6/CMIP/CSIRO/ACCESS-ESM1-5/abrupt-4xCO2/r1i1p1f1/Amon/rsut/gn/v20191115/rsut_Amon_ACCESS-ESM1-5_abrupt-4xCO2_r1i1p1f1_gn_010101-012512.nc 4303d70390ef0e1dac94cbe4cf6354e452cda9a7892eb06dd6c6b834ff09bd86
CMIP6/CMIP/CSIRO/ACCESS-ESM1-5/abrupt-4xCO2/r1i1p1f1/Amon/tas/gn/v20191115/tas_Amon_ACCESS-ESM1-5_abrupt-4xCO2_r1i1p1f1_gn_010101-012512.nc 0c71cbeb2667a00a452cd7acb2380a162b90abbc5413d31c103941b3bde1882a
CMIP6/CMIP/CSIRO/ACCESS-ESM1-5/abrupt-4xCO2/r1i1p1f1/fx/areacella/gn/v20191115/areacella_fx_ACCESS-ESM1-5_abrupt-4xCO2_r1i1p1f1_gn.nc 3a1846b06105c44c93d4612518fc7f068e67a115f69b21b6cd81225fe82e4f60
CMIP6/CMIP/CSIRO/ACCESS-ESM1-5/historical/r1i1p1f1/Amon/pr/gn/v20191115/pr_Amon_ACCESS-ESM1-5_historical_r1i1p1f1_gn_185001-201412.nc acc821dc400f53166379d2e23095bc2690d7ca7db6c7a6f88ae29a8771b3c65a
CMIP6/CMIP/CSIRO/ACCESS-ESM1-5/historical/r1i1p1f1/Amon/pr/gn/v20191115/pr_Amon_ACCESS-ESM1-5_historical_r1i1p1f1_gn_200001-201412.nc e78a8899fec1c9ce6c640ac7923a7fa7aa05e23aa245a22632138e63fcae2d6a
CMIP6/CMIP/CSIRO/ACCESS-ESM1-5/historical/r1i1p1f1/Amon/psl/gn/v20191115/psl_Amon_ACCESS-ESM1-5_historical_r1i1p1f1_gn_200001-201412.nc b63a3d4051cf17568df808836b189826da580ca8e1db949b1e93a71c80756c8d
CMIP6/CMIP/CSIRO/ACCESS-ESM1-5/historical/r1i1p1f1/Amon/rlut/gn/v20191115/rlut_Amon_ACCESS-ESM1-5_historical_r1i1p1f1_gn_200001-201412.nc 44a3c90a41744101afb00344f50947fe46444fe5d6bd3623c0c19aa02a378c86
CMIP6/CMIP/CSIRO/ACCESS-ESM1-5/historical/r1i1p1f1/Amon/rsdt/gn/v20191115/rsdt_Amon_ACCESS-ESM1-5_historical_r1i1p1f1_gn_200001-201412.nc a4e1fc3a4a4d00c2fa18ec616338426eb3d91165db3bc57e565ffdc8d6bd9d34
CMIP6/CMIP/CSIRO/ACCESS-ESM1-5/historical/r1i1p1f1/Amon/rsut/gn/v20191115/rsut_Amon_ACCESS-ESM1-5_historical_r1i1p1f1_gn_200001-201412.nc 8d492ef1f2bb654220fe64977d9942a33af0962ee9afa4017dcc75b6f0103015
CMIP6/CMIP/CSIRO/ACCESS-ESM1-5/historical/r1i1p1f1/Amon/tas/gn/v20191115/tas_Amon_ACCESS-ESM1-5_historical_r1i1p1f1_gn_185001-201412.nc 38e055e57aea5a9ae76ed3fc5325be6783b5694a9edc28aafd24dd462b32e5ce
CMIP6/CMIP/CSIRO/ACCESS-ESM1-5/historical/r1i1p1f1/Amon/tas/gn/v20191115/tas_Amon_ACCESS-ESM1-5_historical_r1i1p1f1_gn_200001-201412.nc e944208c4aeb9d8212089564c110f80c6aad28834fa326f79071d4fa2c73cc11
CMIP6/CMIP/CSIRO/ACCESS-ESM1-5/historical/r1i1p1f1/Amon/ts/gn/v20191115/ts_Amon_ACCESS-ESM1-5_historical_r1i1p1f1_gn_200001-201412.nc e02530449c92e0ffc72e9edeba57f5d38ab8652a28486c1c2b9ddada1f38fbd9
CMIP6/CMIP/CSIRO/ACCESS-ESM1-5/historical/r1i1p1f1/Lmon/gpp/gn/v20191115/gpp_Lmon_ACCESS-ESM1-5_historical_r1i1p1f1_gn_200001-201412.nc da36ed1653f7aafe40a4fc9b99004a46cb45231697ce6b3413dfc171980c37df
CMIP6/CMIP/CSIRO/ACCESS-ESM1-5/historical/r1i1p1f1/Ofx/areacello/gn/v20191115/areacello_Ofx_ACCESS-ESM1-5_historical_r1i1p1f1_gn.nc 6808b64c7328bd118537bfb7cfd35748b4e84cae3f6a5586403aa9d8040e4d0b
Expand All @@ -37,7 +34,6 @@ CMIP6/CMIP/CSIRO/ACCESS-ESM1-5/historical/r2i1p1f1/fx/areacella/gn/v20191128/are
CMIP6/CMIP/CSIRO/ACCESS-ESM1-5/piControl/r1i1p1f1/Amon/rlut/gn/v20210316/rlut_Amon_ACCESS-ESM1-5_piControl_r1i1p1f1_gn_010101-012512.nc 375990b89a38ab390826d3c3efeef4e9295299164eba119e4545165079b86942
CMIP6/CMIP/CSIRO/ACCESS-ESM1-5/piControl/r1i1p1f1/Amon/rsdt/gn/v20210316/rsdt_Amon_ACCESS-ESM1-5_piControl_r1i1p1f1_gn_010101-012512.nc e647cd4f0cb0ff9e2727f1a5f8a636ddad6c62bded06c415d28f6d1c0632c471
CMIP6/CMIP/CSIRO/ACCESS-ESM1-5/piControl/r1i1p1f1/Amon/rsut/gn/v20210316/rsut_Amon_ACCESS-ESM1-5_piControl_r1i1p1f1_gn_010101-012512.nc 63cc0aa1927ded178e79f836ac9f2a058ca96b4cf901339754440bf9a0c55d04
CMIP6/CMIP/CSIRO/ACCESS-ESM1-5/piControl/r1i1p1f1/Amon/tas/gn/v20210316/tas_Amon_ACCESS-ESM1-5_piControl_r1i1p1f1_gn_010101-012512.nc fc7d161d8aadf6b679d76515868bec049891039f5e5455ac6711bdea6cd1f6d4
CMIP6/CMIP/CSIRO/ACCESS-ESM1-5/piControl/r1i1p1f1/Amon/tas/gn/v20210316/tas_Amon_ACCESS-ESM1-5_piControl_r1i1p1f1_gn_010101-018012.nc 28267d35d304d3f3d4bb222eb2a0631a951ed3aaa626e3d0364f83e9ad6e0554
CMIP6/CMIP/CSIRO/ACCESS-ESM1-5/piControl/r1i1p1f1/fx/areacella/gn/v20210316/areacella_fx_ACCESS-ESM1-5_piControl_r1i1p1f1_gn.nc 0eeabbcf35b548cb943e3f45befadf8c4c605e1ad097996cd04cf95ea073b706
CMIP6/CMIP/MPI-M/MPI-ESM1-2-LR/esm-piControl/r1i1p1f1/Amon/tas/gn/v20190815/tas_Amon_MPI-ESM1-2-LR_esm-piControl_r1i1p1f1_gn_185001-186912.nc 98dc5c8453e98e008b63b73a3004d984644d45ceaad9776534693f209e96deed
Expand All @@ -54,7 +50,6 @@ CMIP6/DAMIP/CSIRO/ACCESS-ESM1-5/hist-GHG/r2i1p1f1/fx/areacella/gn/v20200615/area
CMIP6/ScenarioMIP/CSIRO/ACCESS-ESM1-5/ssp126/r1i1p1f1/Amon/pr/gn/v20210318/pr_Amon_ACCESS-ESM1-5_ssp126_r1i1p1f1_gn_201501-210012.nc 0fe1c4b7c49ce1d7e7213c5bb5ea7b2597f68aef50f1795deefe07a5bafbc67c
CMIP6/ScenarioMIP/CSIRO/ACCESS-ESM1-5/ssp126/r1i1p1f1/Amon/rsdt/gn/v20210318/rsdt_Amon_ACCESS-ESM1-5_ssp126_r1i1p1f1_gn_201501-202512.nc 3afba9008a6b334d2bc44b4038b012ae1eca95ab1c886936a7d07bbb2070a9c8
CMIP6/ScenarioMIP/CSIRO/ACCESS-ESM1-5/ssp126/r1i1p1f1/Amon/rsut/gn/v20210318/rsut_Amon_ACCESS-ESM1-5_ssp126_r1i1p1f1_gn_201501-202512.nc b6f624150e1bfe987d10ef750b9ae72e2486927496285defc2a686ffaa5387bc
CMIP6/ScenarioMIP/CSIRO/ACCESS-ESM1-5/ssp126/r1i1p1f1/Amon/tas/gn/v20210318/tas_Amon_ACCESS-ESM1-5_ssp126_r1i1p1f1_gn_201501-202512.nc fb5a034a92de6855258c790f3815b9ee5909dd9c1fad210b9de16cc981a5fe1c
CMIP6/ScenarioMIP/CSIRO/ACCESS-ESM1-5/ssp126/r1i1p1f1/Amon/tas/gn/v20210318/tas_Amon_ACCESS-ESM1-5_ssp126_r1i1p1f1_gn_201501-210012.nc 640678c83d60c562651fa409f09df8bb7ce560576938fdfd7c932ea10e585db6
CMIP6/ScenarioMIP/CSIRO/ACCESS-ESM1-5/ssp126/r1i1p1f1/Omon/tos/gn/v20210318/tos_Omon_ACCESS-ESM1-5_ssp126_r1i1p1f1_gn_201501-202512.nc 31a85fade7f921d2650fbcd43f3886f7111d64e65d9c9b32d61e184efdd042bc
CMIP6/ScenarioMIP/CSIRO/ACCESS-ESM1-5/ssp126/r1i1p1f1/fx/areacella/gn/v20210318/areacella_fx_ACCESS-ESM1-5_ssp126_r1i1p1f1_gn.nc b5ed05309c4a3000b551b1548d88cf1b910ad23347bc39f0094a935e26d3afe6
Expand Down
107 changes: 98 additions & 9 deletions scripts/fetch_test_data.py
Original file line number Diff line number Diff line change
@@ -1,25 +1,57 @@
import pathlib
from pathlib import Path
from typing import Annotated

import pandas as pd
import pooch
import typer
import xarray as xr
from loguru import logger

from ref_sample_data import CMIP6Request, DataRequest, Obs4MIPsRequest, Obs4REFRequest

OUTPUT_PATH = Path("data")
app = typer.Typer()


def _get_match(dataset: pd.DataFrame, source_type: str, key: str) -> pd.Series | None:
"""
Get the matching dataset from the processed datasets

Parameters
----------
dataset
The dataset to match against
source_type
The source type to match against
key
The key to match against

Returns
-------
The matching dataset
"""
matches = dataset.loc[(dataset.source_type == source_type) & (dataset.key == key)]
if len(matches) > 1:
raise ValueError(f"Found multiple datasets with the same key: {key}")

if len(matches) == 0:
return None
return matches.iloc[0]


def process_sample_data_request(
request: DataRequest, decimate: bool, output_directory: Path, quiet: bool
) -> None:
processed_datasets: pd.DataFrame,
request: DataRequest,
decimate: bool,
output_directory: Path,
) -> pd.DataFrame:
"""
Fetch and create sample datasets

Parameters
----------
processed_datasets
The datasets that have already been processed
request
The request to execute

Expand All @@ -28,12 +60,45 @@ def process_sample_data_request(
Whether to decimate the datasets
output_directory
The directory to write the output to
quiet
Whether to suppress progress messages

Returns
-------
The processed datasets from this request
"""
datasets = request.fetch_datasets()
items = []

for _, dataset in datasets.iterrows():
match = _get_match(processed_datasets, request.source_type, dataset.key)

# Check if the dataset has already been processed and can be skipped
if match is not None and request.time_span is not None:
# Dataset has already been processed and a time span was specified
# Check if the dataset already covers the requested time span
if int(match.time_start) <= int(dataset["time_start"]) and int(match.time_end) >= int(
dataset["time_end"]
):
# Already have a dataset that covers the requested time span
logger.info(
f"Skipping regenerating {dataset.key} as it already covers the requested time span"
)
continue

# Update the request to match the superset of the time spans
time_start = (
dataset["time_start"] if dataset["time_start"] < match.time_start else match.time_start
)
time_end = dataset["time_end"] if dataset["time_end"] > match.time_end else match.time_end
request.time_span = (str(time_start), str(time_end))

logger.info(f"Regenerating dataset with new time span: {dataset.key} {request.time_span}")
for file in match.files:
file_path = pathlib.Path(file)
if file_path.exists():
logger.info(f"Removing existing file: {file}")
file_path.unlink()

output_filenames = []
for ds_filename in dataset["files"]:
ds_orig = xr.open_dataset(ds_filename)

Expand All @@ -47,9 +112,22 @@ def process_sample_data_request(
output_filename = output_directory / request.generate_filename(dataset, ds_decimated, ds_filename)
output_filename.parent.mkdir(parents=True, exist_ok=True)
ds_decimated.to_netcdf(output_filename)
output_filenames.append(output_filename)

item = {
"source_type": request.source_type,
"key": dataset.key,
"files": output_filenames,
}
if request.time_span is not None:
item["time_start"] = request.time_span[0]
item["time_end"] = request.time_span[1]

items.append(item)

# Regenerate the registry.txt file
pooch.make_registry(str(OUTPUT_PATH), "registry.txt")
return pd.DataFrame(items)


DATASETS_TO_FETCH = [
Expand All @@ -64,7 +142,7 @@ def process_sample_data_request(
remove_ensembles=True,
time_span=("2000", "2025"),
),
# Climate at global warming levels data
# ESMValTool Climate at global warming levels data
CMIP6Request(
facets=dict(
source_id="ACCESS-ESM1-5",
Expand Down Expand Up @@ -184,12 +262,23 @@ def process_sample_data_request(
def create_sample_data(
decimate: bool = True,
output: Path = OUTPUT_PATH,
quiet: Annotated[bool, typer.Argument(envvar="QUIET")] = False,
) -> None:
"""Fetch and create sample datasets"""
processed_datasets = pd.DataFrame(columns=["source_type", "key", "files", "time_start", "time_end"])

for dataset_requested in DATASETS_TO_FETCH:
process_sample_data_request(
dataset_requested, decimate=decimate, output_directory=pathlib.Path(output), quiet=quiet
# Process the request
new_datasets = process_sample_data_request(
processed_datasets,
dataset_requested,
decimate=decimate,
output_directory=pathlib.Path(output),
)
# Remove duplicate source_type and key values, but keep the latest one
processed_datasets = (
pd.concat([processed_datasets, new_datasets], ignore_index=True)
.drop_duplicates(subset=["source_type", "key"], keep="last")
.reset_index(drop=True)
)


Expand Down
7 changes: 6 additions & 1 deletion src/ref_sample_data/data_request/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,11 +15,17 @@ class DataRequest(Protocol):
differently to generate the sample data.
"""

source_type: str
time_span: tuple[str, str] | None = None

def fetch_datasets(self) -> pd.DataFrame:
"""
Fetch the datasets from the source

Returns a dataframe of the metadata and paths to the fetched datasets.
This dataframe must contain at minimum the following columns:
* key: A unique identifier for the dataset
* files: A list of files for the dataset
"""
...

Expand Down Expand Up @@ -69,7 +75,6 @@ class IntakeESGFDataRequest(DataRequest):

facets: dict[str, str | tuple[str, ...]]
remove_ensembles: bool
time_span: tuple[str, str]

def fetch_datasets(self) -> pd.DataFrame:
"""Fetch the datasets from the ESGF."""
Expand Down
3 changes: 3 additions & 0 deletions src/ref_sample_data/data_request/cmip6.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,8 +41,11 @@ class CMIP6Request(IntakeESGFDataRequest):
"""
Represents a CMIP6 dataset request

These data are fetched from ESGF and decimated according to their grid type
"""

source_type = "CMIP6"

cmip6_path_items = (
"mip_era",
"activity_drs",
Expand Down
2 changes: 2 additions & 0 deletions src/ref_sample_data/data_request/obs4mips.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,8 @@ class Obs4MIPsRequest(IntakeESGFDataRequest):
Represents an Obs4MIPs dataset request
"""

source_type = "obs4MIPs"

obs4mips_path_items = (
"activity_id",
"institution_id",
Expand Down
2 changes: 2 additions & 0 deletions src/ref_sample_data/data_request/obs4ref.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,8 @@ class Obs4REFRequest(DataRequest):
```
"""

source_type = "obs4REF"

def fetch_datasets(self) -> pd.DataFrame:
"""
Fetch the datasets from the source
Expand Down
Loading