diff --git a/changelog/28.feature.md b/changelog/28.feature.md new file mode 100644 index 00000000..ed938c86 --- /dev/null +++ b/changelog/28.feature.md @@ -0,0 +1,2 @@ +Deduplicate datasets if multiple timespans are specified. +The superset of the timespans is used to create the output files. diff --git a/data/CMIP6/CMIP/CSIRO/ACCESS-ESM1-5/1pctCO2/r1i1p1f1/Amon/tas/gn/v20191115/tas_Amon_ACCESS-ESM1-5_1pctCO2_r1i1p1f1_gn_010101-018012.nc b/data/CMIP6/CMIP/CSIRO/ACCESS-ESM1-5/1pctCO2/r1i1p1f1/Amon/tas/gn/v20191115/tas_Amon_ACCESS-ESM1-5_1pctCO2_r1i1p1f1_gn_010101-018012.nc deleted file mode 100644 index 1733b00b..00000000 Binary files a/data/CMIP6/CMIP/CSIRO/ACCESS-ESM1-5/1pctCO2/r1i1p1f1/Amon/tas/gn/v20191115/tas_Amon_ACCESS-ESM1-5_1pctCO2_r1i1p1f1_gn_010101-018012.nc and /dev/null differ diff --git a/data/CMIP6/CMIP/CSIRO/ACCESS-ESM1-5/1pctCO2/r1i1p1f1/Amon/tas/gn/v20191115/tas_Amon_ACCESS-ESM1-5_1pctCO2_r1i1p1f1_gn_015801-025012.nc b/data/CMIP6/CMIP/CSIRO/ACCESS-ESM1-5/1pctCO2/r1i1p1f1/Amon/tas/gn/v20191115/tas_Amon_ACCESS-ESM1-5_1pctCO2_r1i1p1f1_gn_010101-025012.nc similarity index 59% rename from data/CMIP6/CMIP/CSIRO/ACCESS-ESM1-5/1pctCO2/r1i1p1f1/Amon/tas/gn/v20191115/tas_Amon_ACCESS-ESM1-5_1pctCO2_r1i1p1f1_gn_015801-025012.nc rename to data/CMIP6/CMIP/CSIRO/ACCESS-ESM1-5/1pctCO2/r1i1p1f1/Amon/tas/gn/v20191115/tas_Amon_ACCESS-ESM1-5_1pctCO2_r1i1p1f1_gn_010101-025012.nc index 4bc72ebc..7dac3f92 100644 Binary files a/data/CMIP6/CMIP/CSIRO/ACCESS-ESM1-5/1pctCO2/r1i1p1f1/Amon/tas/gn/v20191115/tas_Amon_ACCESS-ESM1-5_1pctCO2_r1i1p1f1_gn_015801-025012.nc and b/data/CMIP6/CMIP/CSIRO/ACCESS-ESM1-5/1pctCO2/r1i1p1f1/Amon/tas/gn/v20191115/tas_Amon_ACCESS-ESM1-5_1pctCO2_r1i1p1f1_gn_010101-025012.nc differ diff --git a/data/CMIP6/CMIP/CSIRO/ACCESS-ESM1-5/historical/r1i1p1f1/Amon/pr/gn/v20191115/pr_Amon_ACCESS-ESM1-5_historical_r1i1p1f1_gn_200001-201412.nc 
b/data/CMIP6/CMIP/CSIRO/ACCESS-ESM1-5/historical/r1i1p1f1/Amon/pr/gn/v20191115/pr_Amon_ACCESS-ESM1-5_historical_r1i1p1f1_gn_200001-201412.nc deleted file mode 100644 index 9f4129ef..00000000 Binary files a/data/CMIP6/CMIP/CSIRO/ACCESS-ESM1-5/historical/r1i1p1f1/Amon/pr/gn/v20191115/pr_Amon_ACCESS-ESM1-5_historical_r1i1p1f1_gn_200001-201412.nc and /dev/null differ diff --git a/data/CMIP6/CMIP/CSIRO/ACCESS-ESM1-5/historical/r1i1p1f1/Amon/tas/gn/v20191115/tas_Amon_ACCESS-ESM1-5_historical_r1i1p1f1_gn_200001-201412.nc b/data/CMIP6/CMIP/CSIRO/ACCESS-ESM1-5/historical/r1i1p1f1/Amon/tas/gn/v20191115/tas_Amon_ACCESS-ESM1-5_historical_r1i1p1f1_gn_200001-201412.nc deleted file mode 100644 index eff3c05b..00000000 Binary files a/data/CMIP6/CMIP/CSIRO/ACCESS-ESM1-5/historical/r1i1p1f1/Amon/tas/gn/v20191115/tas_Amon_ACCESS-ESM1-5_historical_r1i1p1f1_gn_200001-201412.nc and /dev/null differ diff --git a/data/CMIP6/CMIP/CSIRO/ACCESS-ESM1-5/piControl/r1i1p1f1/Amon/tas/gn/v20210316/tas_Amon_ACCESS-ESM1-5_piControl_r1i1p1f1_gn_010101-012512.nc b/data/CMIP6/CMIP/CSIRO/ACCESS-ESM1-5/piControl/r1i1p1f1/Amon/tas/gn/v20210316/tas_Amon_ACCESS-ESM1-5_piControl_r1i1p1f1_gn_010101-012512.nc deleted file mode 100644 index bf88264a..00000000 Binary files a/data/CMIP6/CMIP/CSIRO/ACCESS-ESM1-5/piControl/r1i1p1f1/Amon/tas/gn/v20210316/tas_Amon_ACCESS-ESM1-5_piControl_r1i1p1f1_gn_010101-012512.nc and /dev/null differ diff --git a/data/CMIP6/ScenarioMIP/CSIRO/ACCESS-ESM1-5/ssp126/r1i1p1f1/Amon/tas/gn/v20210318/tas_Amon_ACCESS-ESM1-5_ssp126_r1i1p1f1_gn_201501-202512.nc b/data/CMIP6/ScenarioMIP/CSIRO/ACCESS-ESM1-5/ssp126/r1i1p1f1/Amon/tas/gn/v20210318/tas_Amon_ACCESS-ESM1-5_ssp126_r1i1p1f1_gn_201501-202512.nc deleted file mode 100644 index bdb198a7..00000000 Binary files a/data/CMIP6/ScenarioMIP/CSIRO/ACCESS-ESM1-5/ssp126/r1i1p1f1/Amon/tas/gn/v20210318/tas_Amon_ACCESS-ESM1-5_ssp126_r1i1p1f1_gn_201501-202512.nc and /dev/null differ diff --git a/registry.txt b/registry.txt index 0775c281..5b0d60fb 
100644 --- a/registry.txt +++ b/registry.txt @@ -8,8 +8,7 @@ CMIP6/C4MIP/MPI-M/MPI-ESM1-2-LR/esm-1pctCO2/r1i1p1f1/Amon/tas/gn/v20190815/tas_A CMIP6/C4MIP/MPI-M/MPI-ESM1-2-LR/esm-1pctCO2/r1i1p1f1/Amon/tas/gn/v20190815/tas_Amon_MPI-ESM1-2-LR_esm-1pctCO2_r1i1p1f1_gn_187001-188912.nc d7086d4bd6d7934f8f194da01ad668840e8be1e5c1fa5056e81be410d3571856 CMIP6/C4MIP/MPI-M/MPI-ESM1-2-LR/esm-1pctCO2/r1i1p1f1/Amon/tas/gn/v20190815/tas_Amon_MPI-ESM1-2-LR_esm-1pctCO2_r1i1p1f1_gn_189001-190912.nc b0556ce505d39d62d01d4a62933476dbc001715e2a331edcba3a6170db075d89 CMIP6/C4MIP/MPI-M/MPI-ESM1-2-LR/esm-1pctCO2/r1i1p1f1/Amon/tas/gn/v20190815/tas_Amon_MPI-ESM1-2-LR_esm-1pctCO2_r1i1p1f1_gn_191001-191412.nc 46e8f4e5b5f2ff9cc018d6713a0e8d4666925e803a0e17886ff7137006791c04 -CMIP6/CMIP/CSIRO/ACCESS-ESM1-5/1pctCO2/r1i1p1f1/Amon/tas/gn/v20191115/tas_Amon_ACCESS-ESM1-5_1pctCO2_r1i1p1f1_gn_010101-018012.nc d1e216bbe42192813609adbb2ee11920a1e321b81b06c4555ec90ed0e410bc42 -CMIP6/CMIP/CSIRO/ACCESS-ESM1-5/1pctCO2/r1i1p1f1/Amon/tas/gn/v20191115/tas_Amon_ACCESS-ESM1-5_1pctCO2_r1i1p1f1_gn_015801-025012.nc 4006bcca6390aaab329fee31446d009cc05e92e17c40c89c974eb15504a27693 +CMIP6/CMIP/CSIRO/ACCESS-ESM1-5/1pctCO2/r1i1p1f1/Amon/tas/gn/v20191115/tas_Amon_ACCESS-ESM1-5_1pctCO2_r1i1p1f1_gn_010101-025012.nc d29c1e0651d6c179ad5dc8ac8961ade5cd9456f506d9743255664e44360bac62 CMIP6/CMIP/CSIRO/ACCESS-ESM1-5/1pctCO2/r1i1p1f1/fx/areacella/gn/v20191115/areacella_fx_ACCESS-ESM1-5_1pctCO2_r1i1p1f1_gn.nc 07ae2f59188889030a7c453bca5f8c6a19f22f1b544b3987ba50a7a4f306c82d CMIP6/CMIP/CSIRO/ACCESS-ESM1-5/abrupt-4xCO2/r1i1p1f1/Amon/rlut/gn/v20191115/rlut_Amon_ACCESS-ESM1-5_abrupt-4xCO2_r1i1p1f1_gn_010101-012512.nc 0da7114197033589e61a7ed6f53412e0727b540e5da9d1b7ed6a51ee2a4629c6 CMIP6/CMIP/CSIRO/ACCESS-ESM1-5/abrupt-4xCO2/r1i1p1f1/Amon/rsdt/gn/v20191115/rsdt_Amon_ACCESS-ESM1-5_abrupt-4xCO2_r1i1p1f1_gn_010101-012512.nc 3b00f242368a30fabe0db6a8789cf06cacaa0a3ff3726ade731f2ee488a751c5 @@ -17,13 +16,11 @@ 
CMIP6/CMIP/CSIRO/ACCESS-ESM1-5/abrupt-4xCO2/r1i1p1f1/Amon/rsut/gn/v20191115/rsut CMIP6/CMIP/CSIRO/ACCESS-ESM1-5/abrupt-4xCO2/r1i1p1f1/Amon/tas/gn/v20191115/tas_Amon_ACCESS-ESM1-5_abrupt-4xCO2_r1i1p1f1_gn_010101-012512.nc 0c71cbeb2667a00a452cd7acb2380a162b90abbc5413d31c103941b3bde1882a CMIP6/CMIP/CSIRO/ACCESS-ESM1-5/abrupt-4xCO2/r1i1p1f1/fx/areacella/gn/v20191115/areacella_fx_ACCESS-ESM1-5_abrupt-4xCO2_r1i1p1f1_gn.nc 3a1846b06105c44c93d4612518fc7f068e67a115f69b21b6cd81225fe82e4f60 CMIP6/CMIP/CSIRO/ACCESS-ESM1-5/historical/r1i1p1f1/Amon/pr/gn/v20191115/pr_Amon_ACCESS-ESM1-5_historical_r1i1p1f1_gn_185001-201412.nc acc821dc400f53166379d2e23095bc2690d7ca7db6c7a6f88ae29a8771b3c65a -CMIP6/CMIP/CSIRO/ACCESS-ESM1-5/historical/r1i1p1f1/Amon/pr/gn/v20191115/pr_Amon_ACCESS-ESM1-5_historical_r1i1p1f1_gn_200001-201412.nc e78a8899fec1c9ce6c640ac7923a7fa7aa05e23aa245a22632138e63fcae2d6a CMIP6/CMIP/CSIRO/ACCESS-ESM1-5/historical/r1i1p1f1/Amon/psl/gn/v20191115/psl_Amon_ACCESS-ESM1-5_historical_r1i1p1f1_gn_200001-201412.nc b63a3d4051cf17568df808836b189826da580ca8e1db949b1e93a71c80756c8d CMIP6/CMIP/CSIRO/ACCESS-ESM1-5/historical/r1i1p1f1/Amon/rlut/gn/v20191115/rlut_Amon_ACCESS-ESM1-5_historical_r1i1p1f1_gn_200001-201412.nc 44a3c90a41744101afb00344f50947fe46444fe5d6bd3623c0c19aa02a378c86 CMIP6/CMIP/CSIRO/ACCESS-ESM1-5/historical/r1i1p1f1/Amon/rsdt/gn/v20191115/rsdt_Amon_ACCESS-ESM1-5_historical_r1i1p1f1_gn_200001-201412.nc a4e1fc3a4a4d00c2fa18ec616338426eb3d91165db3bc57e565ffdc8d6bd9d34 CMIP6/CMIP/CSIRO/ACCESS-ESM1-5/historical/r1i1p1f1/Amon/rsut/gn/v20191115/rsut_Amon_ACCESS-ESM1-5_historical_r1i1p1f1_gn_200001-201412.nc 8d492ef1f2bb654220fe64977d9942a33af0962ee9afa4017dcc75b6f0103015 CMIP6/CMIP/CSIRO/ACCESS-ESM1-5/historical/r1i1p1f1/Amon/tas/gn/v20191115/tas_Amon_ACCESS-ESM1-5_historical_r1i1p1f1_gn_185001-201412.nc 38e055e57aea5a9ae76ed3fc5325be6783b5694a9edc28aafd24dd462b32e5ce 
-CMIP6/CMIP/CSIRO/ACCESS-ESM1-5/historical/r1i1p1f1/Amon/tas/gn/v20191115/tas_Amon_ACCESS-ESM1-5_historical_r1i1p1f1_gn_200001-201412.nc e944208c4aeb9d8212089564c110f80c6aad28834fa326f79071d4fa2c73cc11 CMIP6/CMIP/CSIRO/ACCESS-ESM1-5/historical/r1i1p1f1/Amon/ts/gn/v20191115/ts_Amon_ACCESS-ESM1-5_historical_r1i1p1f1_gn_200001-201412.nc e02530449c92e0ffc72e9edeba57f5d38ab8652a28486c1c2b9ddada1f38fbd9 CMIP6/CMIP/CSIRO/ACCESS-ESM1-5/historical/r1i1p1f1/Lmon/gpp/gn/v20191115/gpp_Lmon_ACCESS-ESM1-5_historical_r1i1p1f1_gn_200001-201412.nc da36ed1653f7aafe40a4fc9b99004a46cb45231697ce6b3413dfc171980c37df CMIP6/CMIP/CSIRO/ACCESS-ESM1-5/historical/r1i1p1f1/Ofx/areacello/gn/v20191115/areacello_Ofx_ACCESS-ESM1-5_historical_r1i1p1f1_gn.nc 6808b64c7328bd118537bfb7cfd35748b4e84cae3f6a5586403aa9d8040e4d0b @@ -37,7 +34,6 @@ CMIP6/CMIP/CSIRO/ACCESS-ESM1-5/historical/r2i1p1f1/fx/areacella/gn/v20191128/are CMIP6/CMIP/CSIRO/ACCESS-ESM1-5/piControl/r1i1p1f1/Amon/rlut/gn/v20210316/rlut_Amon_ACCESS-ESM1-5_piControl_r1i1p1f1_gn_010101-012512.nc 375990b89a38ab390826d3c3efeef4e9295299164eba119e4545165079b86942 CMIP6/CMIP/CSIRO/ACCESS-ESM1-5/piControl/r1i1p1f1/Amon/rsdt/gn/v20210316/rsdt_Amon_ACCESS-ESM1-5_piControl_r1i1p1f1_gn_010101-012512.nc e647cd4f0cb0ff9e2727f1a5f8a636ddad6c62bded06c415d28f6d1c0632c471 CMIP6/CMIP/CSIRO/ACCESS-ESM1-5/piControl/r1i1p1f1/Amon/rsut/gn/v20210316/rsut_Amon_ACCESS-ESM1-5_piControl_r1i1p1f1_gn_010101-012512.nc 63cc0aa1927ded178e79f836ac9f2a058ca96b4cf901339754440bf9a0c55d04 -CMIP6/CMIP/CSIRO/ACCESS-ESM1-5/piControl/r1i1p1f1/Amon/tas/gn/v20210316/tas_Amon_ACCESS-ESM1-5_piControl_r1i1p1f1_gn_010101-012512.nc fc7d161d8aadf6b679d76515868bec049891039f5e5455ac6711bdea6cd1f6d4 CMIP6/CMIP/CSIRO/ACCESS-ESM1-5/piControl/r1i1p1f1/Amon/tas/gn/v20210316/tas_Amon_ACCESS-ESM1-5_piControl_r1i1p1f1_gn_010101-018012.nc 28267d35d304d3f3d4bb222eb2a0631a951ed3aaa626e3d0364f83e9ad6e0554 
CMIP6/CMIP/CSIRO/ACCESS-ESM1-5/piControl/r1i1p1f1/fx/areacella/gn/v20210316/areacella_fx_ACCESS-ESM1-5_piControl_r1i1p1f1_gn.nc 0eeabbcf35b548cb943e3f45befadf8c4c605e1ad097996cd04cf95ea073b706 CMIP6/CMIP/MPI-M/MPI-ESM1-2-LR/esm-piControl/r1i1p1f1/Amon/tas/gn/v20190815/tas_Amon_MPI-ESM1-2-LR_esm-piControl_r1i1p1f1_gn_185001-186912.nc 98dc5c8453e98e008b63b73a3004d984644d45ceaad9776534693f209e96deed @@ -54,7 +50,6 @@ CMIP6/DAMIP/CSIRO/ACCESS-ESM1-5/hist-GHG/r2i1p1f1/fx/areacella/gn/v20200615/area CMIP6/ScenarioMIP/CSIRO/ACCESS-ESM1-5/ssp126/r1i1p1f1/Amon/pr/gn/v20210318/pr_Amon_ACCESS-ESM1-5_ssp126_r1i1p1f1_gn_201501-210012.nc 0fe1c4b7c49ce1d7e7213c5bb5ea7b2597f68aef50f1795deefe07a5bafbc67c CMIP6/ScenarioMIP/CSIRO/ACCESS-ESM1-5/ssp126/r1i1p1f1/Amon/rsdt/gn/v20210318/rsdt_Amon_ACCESS-ESM1-5_ssp126_r1i1p1f1_gn_201501-202512.nc 3afba9008a6b334d2bc44b4038b012ae1eca95ab1c886936a7d07bbb2070a9c8 CMIP6/ScenarioMIP/CSIRO/ACCESS-ESM1-5/ssp126/r1i1p1f1/Amon/rsut/gn/v20210318/rsut_Amon_ACCESS-ESM1-5_ssp126_r1i1p1f1_gn_201501-202512.nc b6f624150e1bfe987d10ef750b9ae72e2486927496285defc2a686ffaa5387bc -CMIP6/ScenarioMIP/CSIRO/ACCESS-ESM1-5/ssp126/r1i1p1f1/Amon/tas/gn/v20210318/tas_Amon_ACCESS-ESM1-5_ssp126_r1i1p1f1_gn_201501-202512.nc fb5a034a92de6855258c790f3815b9ee5909dd9c1fad210b9de16cc981a5fe1c CMIP6/ScenarioMIP/CSIRO/ACCESS-ESM1-5/ssp126/r1i1p1f1/Amon/tas/gn/v20210318/tas_Amon_ACCESS-ESM1-5_ssp126_r1i1p1f1_gn_201501-210012.nc 640678c83d60c562651fa409f09df8bb7ce560576938fdfd7c932ea10e585db6 CMIP6/ScenarioMIP/CSIRO/ACCESS-ESM1-5/ssp126/r1i1p1f1/Omon/tos/gn/v20210318/tos_Omon_ACCESS-ESM1-5_ssp126_r1i1p1f1_gn_201501-202512.nc 31a85fade7f921d2650fbcd43f3886f7111d64e65d9c9b32d61e184efdd042bc CMIP6/ScenarioMIP/CSIRO/ACCESS-ESM1-5/ssp126/r1i1p1f1/fx/areacella/gn/v20210318/areacella_fx_ACCESS-ESM1-5_ssp126_r1i1p1f1_gn.nc b5ed05309c4a3000b551b1548d88cf1b910ad23347bc39f0094a935e26d3afe6 diff --git a/scripts/fetch_test_data.py b/scripts/fetch_test_data.py index ac032922..18c8ecf5 100755 
--- a/scripts/fetch_test_data.py +++ b/scripts/fetch_test_data.py @@ -1,10 +1,11 @@ import pathlib from pathlib import Path -from typing import Annotated +import pandas as pd import pooch import typer import xarray as xr +from loguru import logger from ref_sample_data import CMIP6Request, DataRequest, Obs4MIPsRequest, Obs4REFRequest @@ -12,14 +13,45 @@ app = typer.Typer() +def _get_match(dataset: pd.DataFrame, source_type: str, key: str) -> pd.Series | None: + """ + Get the matching dataset from the processed datasets + + Parameters + ---------- + dataset + The dataset to match against + source_type + The source type to match against + key + The key to match against + + Returns + ------- + The matching dataset + """ + matches = dataset.loc[(dataset.source_type == source_type) & (dataset.key == key)] + if len(matches) > 1: + raise ValueError(f"Found multiple datasets with the same key: {key}") + + if len(matches) == 0: + return None + return matches.iloc[0] + + def process_sample_data_request( - request: DataRequest, decimate: bool, output_directory: Path, quiet: bool -) -> None: + processed_datasets: pd.DataFrame, + request: DataRequest, + decimate: bool, + output_directory: Path, +) -> pd.DataFrame: """ Fetch and create sample datasets Parameters ---------- + processed_datasets + The datasets that have already been processed request The request to execute @@ -28,12 +60,45 @@ def process_sample_data_request( Whether to decimate the datasets output_directory The directory to write the output to - quiet - Whether to suppress progress messages + + Returns + ------- + The processed datasets from this request """ datasets = request.fetch_datasets() + items = [] for _, dataset in datasets.iterrows(): + match = _get_match(processed_datasets, request.source_type, dataset.key) + + # Check if the dataset has already been processed and can be skipped + if match is not None and request.time_span is not None: + # Dataset has already been processed and a time span was specified 
+ # Check if the dataset already covers the requested time span + if int(match.time_start) <= int(dataset["time_start"]) and int(match.time_end) >= int( + dataset["time_end"] + ): + # Already have a dataset that covers the requested time span + logger.info( + f"Skipping regenerating {dataset.key} as it already covers the requested time span" + ) + continue + + # Update the request to match the superset of the time spans + time_start = ( + dataset["time_start"] if dataset["time_start"] < match.time_start else match.time_start + ) + time_end = dataset["time_end"] if dataset["time_end"] > match.time_end else match.time_end + request.time_span = (str(time_start), str(time_end)) + + logger.info(f"Regenerating dataset with new time span: {dataset.key} {request.time_span}") + for file in match.files: + file_path = pathlib.Path(file) + if file_path.exists(): + logger.info(f"Removing existing file: {file}") + file_path.unlink() + + output_filenames = [] for ds_filename in dataset["files"]: ds_orig = xr.open_dataset(ds_filename) @@ -47,9 +112,22 @@ def process_sample_data_request( output_filename = output_directory / request.generate_filename(dataset, ds_decimated, ds_filename) output_filename.parent.mkdir(parents=True, exist_ok=True) ds_decimated.to_netcdf(output_filename) + output_filenames.append(output_filename) + + item = { + "source_type": request.source_type, + "key": dataset.key, + "files": output_filenames, + } + if request.time_span is not None: + item["time_start"] = request.time_span[0] + item["time_end"] = request.time_span[1] + + items.append(item) # Regenerate the registry.txt file pooch.make_registry(str(OUTPUT_PATH), "registry.txt") + return pd.DataFrame(items) DATASETS_TO_FETCH = [ @@ -64,7 +142,7 @@ def process_sample_data_request( remove_ensembles=True, time_span=("2000", "2025"), ), - # Climate at global warmings levels data + # ESMValTool Climate at global warmings levels data CMIP6Request( facets=dict( source_id="ACCESS-ESM1-5", @@ -184,12 +262,23 @@ 
def process_sample_data_request( def create_sample_data( decimate: bool = True, output: Path = OUTPUT_PATH, - quiet: Annotated[bool, typer.Argument(envvar="QUIET")] = False, ) -> None: """Fetch and create sample datasets""" + processed_datasets = pd.DataFrame(columns=["source_type", "key", "files", "time_start", "time_end"]) + for dataset_requested in DATASETS_TO_FETCH: - process_sample_data_request( - dataset_requested, decimate=decimate, output_directory=pathlib.Path(output), quiet=quiet + # Process the request + new_datasets = process_sample_data_request( + processed_datasets, + dataset_requested, + decimate=decimate, + output_directory=pathlib.Path(output), + ) + # Remove duplicate source_type and key values, but keep the latest one + processed_datasets = ( + pd.concat([processed_datasets, new_datasets], ignore_index=True) + .drop_duplicates(subset=["source_type", "key"], keep="last") + .reset_index(drop=True) ) diff --git a/src/ref_sample_data/data_request/base.py b/src/ref_sample_data/data_request/base.py index f55df2ea..17e49b26 100644 --- a/src/ref_sample_data/data_request/base.py +++ b/src/ref_sample_data/data_request/base.py @@ -15,11 +15,17 @@ class DataRequest(Protocol): differently to generate the sample data. """ + source_type: str + time_span: tuple[str, str] | None = None + def fetch_datasets(self) -> pd.DataFrame: """ Fetch the datasets from the source Returns a dataframe of the metadata and paths to the fetched datasets. + This dataframe must contain at minimum the following columns: + * key: A unique identifier for the dataset + * files: A list of files for the dataset """ ... 
@@ -69,7 +75,6 @@ class IntakeESGFDataRequest(DataRequest): facets: dict[str, str | tuple[str, ...]] remove_ensembles: bool - time_span: tuple[str, str] def fetch_datasets(self) -> pd.DataFrame: """Fetch the datasets from the ESGF.""" diff --git a/src/ref_sample_data/data_request/cmip6.py b/src/ref_sample_data/data_request/cmip6.py index 2dff4640..445458ac 100644 --- a/src/ref_sample_data/data_request/cmip6.py +++ b/src/ref_sample_data/data_request/cmip6.py @@ -41,8 +41,11 @@ class CMIP6Request(IntakeESGFDataRequest): """ Represents a CMIP6 dataset request + These data are fetched from ESGF and decimated according to their grid type """ + source_type = "CMIP6" + cmip6_path_items = ( "mip_era", "activity_drs", diff --git a/src/ref_sample_data/data_request/obs4mips.py b/src/ref_sample_data/data_request/obs4mips.py index bda45bdb..371f9157 100644 --- a/src/ref_sample_data/data_request/obs4mips.py +++ b/src/ref_sample_data/data_request/obs4mips.py @@ -16,6 +16,8 @@ class Obs4MIPsRequest(IntakeESGFDataRequest): Represents a Obs4MIPs dataset request """ + source_type = "obs4MIPs" + obs4mips_path_items = ( "activity_id", "institution_id", diff --git a/src/ref_sample_data/data_request/obs4ref.py b/src/ref_sample_data/data_request/obs4ref.py index 763e5023..30f2e162 100644 --- a/src/ref_sample_data/data_request/obs4ref.py +++ b/src/ref_sample_data/data_request/obs4ref.py @@ -19,6 +19,8 @@ class Obs4REFRequest(DataRequest): ``` """ + source_type = "obs4REF" + def fetch_datasets(self) -> pd.DataFrame: """ Fetch the datasets from the source