In [None]:
import time
import cdsapi
from pathlib import Path
from tqdm import tqdm
import requests

# ===================== User Config =====================
DATASET = "derived-era5-land-daily-statistics"
OUT_DIR = Path("/mnt/cephfs-mount/chenchen/ERA5_Climate_Data")
OUT_DIR.mkdir(parents=True, exist_ok=True)

# This run covers Sep–Dec 2019 only: main() iterates over YEARS, so the
# 2020..2024 entries in MONTHS_BY_YEAR stay unused until YEARS is widened.
YEARS = list(range(2019, 2020))  # [2019]
MONTHS_BY_YEAR = {
    2019: [f"{m:02d}" for m in range(9, 13)],   # "09".."12" (Sep–Dec 2019)
    **{y: [f"{m:02d}" for m in range(1, 13)] for y in range(2020, 2025)},  # all 12 months for 2020..2024
}

DAYS = [f"{d:02d}" for d in range(1, 32)]  # CDS ignores invalid days automatically
AREA = [40, -180, -40, 180]                # [North, West, South, East] 40N..40S

# ===== Daily MEAN variables supported by the derived daily-stats product =====
# (Do NOT include accumulated vars like surface_runoff / evaporation / snowfall / tp)
VARS_DAILY_MEAN = [
    "2m_temperature",
    "2m_dewpoint_temperature",
    "skin_temperature",
    "10m_u_component_of_wind",   # correct naming for this dataset
    "10m_v_component_of_wind",   # correct naming for this dataset
    "volumetric_soil_water_layer_1",
    "volumetric_soil_water_layer_2",
    "volumetric_soil_water_layer_3",
    "volumetric_soil_water_layer_4",
    "snow_depth",
    "leaf_area_index_high_vegetation",
    "leaf_area_index_low_vegetation",
]

# Request options copied verbatim into every payload by build_task().
TIMEZONE = "utc+00:00"
FREQUENCY = "1_hourly"   # derive daily stats from hourly data
FORMAT = "zip"           # 'netcdf' may also be available; 'zip' is widely supported

# ===================== Helpers =====================
def download_with_progress(client, dataset, request, out_file):
    """Retrieve a CDSAPI job and download it with a tqdm progress bar.

    The payload is streamed into a sibling ``<out_file name>.part`` file and
    renamed to ``out_file`` only after the transfer has finished. An
    interrupted download therefore never leaves a truncated file at
    ``out_file`` that a later "skip existing" check would mistake for a
    complete one. A stale ``.part`` file may remain after a failure; it is
    overwritten by the next attempt.

    Args:
        client: Object exposing ``retrieve(dataset, request)`` and a
            ``session`` with a requests-style ``get`` (a ``cdsapi.Client``
            when called from ``main``).
        dataset: Dataset name forwarded to ``client.retrieve``.
        request: Request payload forwarded to ``client.retrieve``.
        out_file: ``pathlib.Path`` of the final output file.

    Raises:
        requests.HTTPError: If the download URL answers 403 or any other
            HTTP error status.
        OSError: If fewer bytes were received than the ``Content-Length``
            header announced.
    """
    result = client.retrieve(dataset, request)
    url = result.location
    response = client.session.get(url, stream=True)
    try:
        if response.status_code == 403:
            raise requests.HTTPError("403 Forbidden (cost limits exceeded). Reduce chunk size.", response=response)
        response.raise_for_status()

        # 0 means the server did not announce a size; tqdm then shows a plain counter.
        total_size = int(response.headers.get("Content-Length", 0))
        block_size = 1024 * 1024  # 1 MB

        tmp_file = out_file.with_name(out_file.name + ".part")
        written = 0
        with open(tmp_file, "wb") as f, tqdm(
            total=total_size,
            unit="B",
            unit_scale=True,
            desc=f"Downloading {out_file.name}",
            ncols=80,
        ) as progress:
            for data in response.iter_content(block_size):
                f.write(data)
                written += len(data)
                progress.update(len(data))
    finally:
        # Release the connection on success and on every failure path.
        response.close()

    # A connection dropped mid-stream can end the iteration early without an error.
    if total_size and written < total_size:
        raise OSError(
            f"Incomplete download for {out_file.name}: got {written} of {total_size} bytes"
        )

    # Publish the file under its final name only once it is complete.
    tmp_file.replace(out_file)

def build_task(year, month, variables, daily_stat):
    """Assemble the request payload for one month of daily statistics.

    ``year`` and ``month`` are stringified and wrapped in single-element
    lists. The day list, time zone, sampling frequency, bounding box and
    output format are taken from the module-level configuration constants.
    """
    payload = {"product_type": "reanalysis", "variable": variables}
    # Temporal selection: one year, one month, every candidate day.
    payload.update(year=[str(year)], month=[str(month)], day=DAYS)
    # Which statistic to compute and how it is derived.
    payload.update(daily_statistic=daily_stat, time_zone=TIMEZONE, frequency=FREQUENCY)
    # Spatial subset and delivery format.
    payload.update(area=AREA, format=FORMAT)
    return payload

# ===================== Main =====================
def main():
    """Download every configured month of daily-mean data, one request per month.

    A task is built for each (year, month) pair taken from YEARS and
    MONTHS_BY_YEAR. Months whose output file already exists in OUT_DIR are
    skipped. Each remaining month is requested up to four times with a
    growing pause between attempts. When an HTTP error mentions both "403"
    and "cost", the variable list is split into two halves that are
    downloaded as ``*_part1.zip`` / ``*_part2.zip`` instead.

    Failures are printed and the loop moves on to the next month; no
    download error is allowed to abort the whole batch.
    """
    client = cdsapi.Client()
    max_attempts = 4  # one initial try plus three retries

    # One daily_mean task per configured (year, month)
    tasks = []
    for y in YEARS:
        for m in MONTHS_BY_YEAR[y]:
            tasks.append((
                y, m, VARS_DAILY_MEAN, "daily_mean",
                OUT_DIR / f"ERA5Land_{y}_{m}_daily_mean.zip"
            ))

    with tqdm(total=len(tasks), desc="Overall progress") as overall:
        for (year, month, var_list, stat, out_path) in tasks:
            if out_path.exists():  # skip existing
                overall.update(1)
                continue

            req = build_task(year, month, var_list, stat)

            # Retry loop (handles transient server/network errors)
            for attempt in range(1, max_attempts + 1):
                try:
                    print(f"\n=== Requesting {year}-{month} [{stat}] ({len(var_list)} vars) ===")
                    download_with_progress(client, DATASET, req, out_path)
                    break  # success
                except requests.HTTPError as e:
                    msg = str(e)
                    if "403" in msg and "cost" in msg.lower():
                        print(f"⚠️  403 cost limit for {year}-{month} [{stat}] — splitting variables into halves.")
                        # split variables into smaller groups and try again
                        if len(var_list) > 1:
                            mid = len(var_list) // 2
                            smaller_groups = [var_list[:mid], var_list[mid:]]
                        else:
                            smaller_groups = [var_list]

                        for i, sg in enumerate(smaller_groups, 1):
                            sub_out = out_path.with_name(out_path.stem + f"_part{i}.zip")
                            if sub_out.exists():
                                continue
                            sub_req = build_task(year, month, sg, stat)
                            print(f"→ Retrying with smaller var group ({i}/{len(smaller_groups)}): {sg}")
                            try:
                                download_with_progress(client, DATASET, sub_req, sub_out)
                            except Exception as sub_err:
                                # We are inside an except handler here: an unguarded
                                # error would escape main() and abort every remaining month.
                                print(
                                    f"❌ Failed part {i}/{len(smaller_groups)}: "
                                    f"{year}-{month} [{stat}] — {sub_err}"
                                )
                        break  # after splitting vars, move on
                    elif attempt < max_attempts:
                        wait = 15 * attempt
                        print(f"⚠️  HTTP error ({e}). Retry {attempt}/{max_attempts - 1} after {wait}s...")
                        time.sleep(wait)
                    else:
                        print(f"❌ Failed after retries: {year}-{month} [{stat}] — {e}")
                except Exception as e:
                    if attempt < max_attempts:
                        wait = 10 * attempt
                        print(f"⚠️  Error ({e}). Retry {attempt}/{max_attempts - 1} after {wait}s...")
                        time.sleep(wait)
                    else:
                        print(f"❌ Failed after retries: {year}-{month} [{stat}] — {e}")
            overall.update(1)

# Start the batch download when this code is executed directly
# (not when it is imported as a module).
if __name__ == "__main__":
    main()

Overall progress:   0%|                                                                                | 0/4 [00:00<?, ?it/s]


=== Requesting 2019-09 [daily_mean] (12 vars) ===


2025-10-13 07:05:23,269 INFO Request ID is 2b4a746a-0635-46aa-ab61-7d6f8ee0713d
2025-10-13 07:05:23,416 INFO status has been updated to accepted
Recovering from connection error [HTTPSConnectionPool(host='cds.climate.copernicus.eu', port=443): Read timed out. (read timeout=60)], attempt 1 of 500
Retrying in 120 seconds
Recovering from connection error [HTTPSConnectionPool(host='cds.climate.copernicus.eu', port=443): Read timed out. (read timeout=60)], attempt 1 of 500
Retrying in 120 seconds
Recovering from connection error [HTTPSConnectionPool(host='cds.climate.copernicus.eu', port=443): Read timed out. (read timeout=60)], attempt 1 of 500
Retrying in 120 seconds
2025-10-13 13:11:03,484 INFO status has been updated to running
2025-10-13 14:23:30,493 INFO status has been updated to successful

Downloading ERA5Land_2019_09_daily_mean.zip:   0%|   | 0.00/715M [00:00<?, ?B/s][A
Downloading ERA5Land_2019_09_daily_mean.zip:   0%| | 1.05M/715M [00:00<07:09, 1.[A
Downloading ERA5Land_2019_0


=== Requesting 2019-10 [daily_mean] (12 vars) ===


Recovering from connection error [HTTPSConnectionPool(host='cds.climate.copernicus.eu', port=443): Read timed out. (read timeout=60)], attempt 1 of 500
Retrying in 120 seconds


In [3]:
import time
import cdsapi
from pathlib import Path
from tqdm import tqdm
import requests

# ===================== User Config =====================
DATASET = "derived-era5-land-daily-statistics"
OUT_DIR = Path("/mnt/cephfs-mount/chenchen/ERA5_Climate_Data")
OUT_DIR.mkdir(parents=True, exist_ok=True)

# This run covers Jan–Oct 2024 only: main() iterates over YEARS, so the
# 2025 entry in MONTHS_BY_YEAR stays unused until YEARS is widened.
YEARS = list(range(2024, 2025))  # [2024]
MONTHS_BY_YEAR = {
    2024: [f"{m:02d}" for m in range(1, 11)],   # "01".."10" (Jan–Oct 2024)
    **{y: [f"{m:02d}" for m in range(1, 13)] for y in range(2025, 2026)},  # all 12 months for 2025
}

DAYS = [f"{d:02d}" for d in range(1, 32)]  # CDS ignores invalid days automatically
AREA = [40, -180, -40, 180]                # [North, West, South, East] 40N..40S

# ===== Daily MEAN variables supported by the derived daily-stats product =====
# (Do NOT include accumulated vars like surface_runoff / evaporation / snowfall / tp)
VARS_DAILY_MEAN = [
    "2m_temperature",
    "2m_dewpoint_temperature",
    "skin_temperature",
    "10m_u_component_of_wind",   # correct naming for this dataset
    "10m_v_component_of_wind",   # correct naming for this dataset
    "volumetric_soil_water_layer_1",
    "volumetric_soil_water_layer_2",
    "volumetric_soil_water_layer_3",
    "volumetric_soil_water_layer_4",
    "snow_depth",
    "leaf_area_index_high_vegetation",
    "leaf_area_index_low_vegetation",
]

# Request options copied verbatim into every payload by build_task().
TIMEZONE = "utc+00:00"
FREQUENCY = "1_hourly"   # derive daily stats from hourly data
FORMAT = "zip"           # 'netcdf' may also be available; 'zip' is widely supported

# ===================== Helpers =====================
def download_with_progress(client, dataset, request, out_file):
    """Retrieve a CDSAPI job and download it with a tqdm progress bar.

    The payload is streamed into a sibling ``<out_file name>.part`` file and
    renamed to ``out_file`` only after the transfer has finished. An
    interrupted download therefore never leaves a truncated file at
    ``out_file`` that a later "skip existing" check would mistake for a
    complete one. A stale ``.part`` file may remain after a failure; it is
    overwritten by the next attempt.

    Args:
        client: Object exposing ``retrieve(dataset, request)`` and a
            ``session`` with a requests-style ``get`` (a ``cdsapi.Client``
            when called from ``main``).
        dataset: Dataset name forwarded to ``client.retrieve``.
        request: Request payload forwarded to ``client.retrieve``.
        out_file: ``pathlib.Path`` of the final output file.

    Raises:
        requests.HTTPError: If the download URL answers 403 or any other
            HTTP error status.
        OSError: If fewer bytes were received than the ``Content-Length``
            header announced.
    """
    result = client.retrieve(dataset, request)
    url = result.location
    response = client.session.get(url, stream=True)
    try:
        if response.status_code == 403:
            raise requests.HTTPError("403 Forbidden (cost limits exceeded). Reduce chunk size.", response=response)
        response.raise_for_status()

        # 0 means the server did not announce a size; tqdm then shows a plain counter.
        total_size = int(response.headers.get("Content-Length", 0))
        block_size = 1024 * 1024  # 1 MB

        tmp_file = out_file.with_name(out_file.name + ".part")
        written = 0
        with open(tmp_file, "wb") as f, tqdm(
            total=total_size,
            unit="B",
            unit_scale=True,
            desc=f"Downloading {out_file.name}",
            ncols=80,
        ) as progress:
            for data in response.iter_content(block_size):
                f.write(data)
                written += len(data)
                progress.update(len(data))
    finally:
        # Release the connection on success and on every failure path.
        response.close()

    # A connection dropped mid-stream can end the iteration early without an error.
    if total_size and written < total_size:
        raise OSError(
            f"Incomplete download for {out_file.name}: got {written} of {total_size} bytes"
        )

    # Publish the file under its final name only once it is complete.
    tmp_file.replace(out_file)

def build_task(year, month, variables, daily_stat):
    """Assemble the request payload for one month of daily statistics.

    ``year`` and ``month`` are stringified and wrapped in single-element
    lists. The day list, time zone, sampling frequency, bounding box and
    output format are taken from the module-level configuration constants.
    """
    payload = {"product_type": "reanalysis", "variable": variables}
    # Temporal selection: one year, one month, every candidate day.
    payload.update(year=[str(year)], month=[str(month)], day=DAYS)
    # Which statistic to compute and how it is derived.
    payload.update(daily_statistic=daily_stat, time_zone=TIMEZONE, frequency=FREQUENCY)
    # Spatial subset and delivery format.
    payload.update(area=AREA, format=FORMAT)
    return payload

# ===================== Main =====================
def main():
    """Download every configured month of daily-mean data, one request per month.

    A task is built for each (year, month) pair taken from YEARS and
    MONTHS_BY_YEAR. Months whose output file already exists in OUT_DIR are
    skipped. Each remaining month is requested up to four times with a
    growing pause between attempts. When an HTTP error mentions both "403"
    and "cost", the variable list is split into two halves that are
    downloaded as ``*_part1.zip`` / ``*_part2.zip`` instead.

    Failures are printed and the loop moves on to the next month; no
    download error is allowed to abort the whole batch.
    """
    client = cdsapi.Client()
    max_attempts = 4  # one initial try plus three retries

    # One daily_mean task per configured (year, month)
    tasks = []
    for y in YEARS:
        for m in MONTHS_BY_YEAR[y]:
            tasks.append((
                y, m, VARS_DAILY_MEAN, "daily_mean",
                OUT_DIR / f"ERA5Land_{y}_{m}_daily_mean.zip"
            ))

    with tqdm(total=len(tasks), desc="Overall progress") as overall:
        for (year, month, var_list, stat, out_path) in tasks:
            if out_path.exists():  # skip existing
                overall.update(1)
                continue

            req = build_task(year, month, var_list, stat)

            # Retry loop (handles transient server/network errors)
            for attempt in range(1, max_attempts + 1):
                try:
                    print(f"\n=== Requesting {year}-{month} [{stat}] ({len(var_list)} vars) ===")
                    download_with_progress(client, DATASET, req, out_path)
                    break  # success
                except requests.HTTPError as e:
                    msg = str(e)
                    if "403" in msg and "cost" in msg.lower():
                        print(f"⚠️  403 cost limit for {year}-{month} [{stat}] — splitting variables into halves.")
                        # split variables into smaller groups and try again
                        if len(var_list) > 1:
                            mid = len(var_list) // 2
                            smaller_groups = [var_list[:mid], var_list[mid:]]
                        else:
                            smaller_groups = [var_list]

                        for i, sg in enumerate(smaller_groups, 1):
                            sub_out = out_path.with_name(out_path.stem + f"_part{i}.zip")
                            if sub_out.exists():
                                continue
                            sub_req = build_task(year, month, sg, stat)
                            print(f"→ Retrying with smaller var group ({i}/{len(smaller_groups)}): {sg}")
                            try:
                                download_with_progress(client, DATASET, sub_req, sub_out)
                            except Exception as sub_err:
                                # We are inside an except handler here: an unguarded
                                # error would escape main() and abort every remaining month.
                                print(
                                    f"❌ Failed part {i}/{len(smaller_groups)}: "
                                    f"{year}-{month} [{stat}] — {sub_err}"
                                )
                        break  # after splitting vars, move on
                    elif attempt < max_attempts:
                        wait = 15 * attempt
                        print(f"⚠️  HTTP error ({e}). Retry {attempt}/{max_attempts - 1} after {wait}s...")
                        time.sleep(wait)
                    else:
                        print(f"❌ Failed after retries: {year}-{month} [{stat}] — {e}")
                except Exception as e:
                    if attempt < max_attempts:
                        wait = 10 * attempt
                        print(f"⚠️  Error ({e}). Retry {attempt}/{max_attempts - 1} after {wait}s...")
                        time.sleep(wait)
                    else:
                        print(f"❌ Failed after retries: {year}-{month} [{stat}] — {e}")
            overall.update(1)

# Start the batch download when this code is executed directly
# (not when it is imported as a module).
if __name__ == "__main__":
    main()

Overall progress:   0%|                                                                               | 0/10 [00:00<?, ?it/s]


=== Requesting 2024-01 [daily_mean] (12 vars) ===


2025-10-16 04:01:19,235 INFO Request ID is 8d4747ff-f7b7-49ae-b8d2-0308f3257431
2025-10-16 04:01:19,389 INFO status has been updated to accepted
2025-10-16 07:00:46,520 INFO status has been updated to running
2025-10-16 07:34:59,027 INFO status has been updated to successful

Downloading ERA5Land_2024_01_daily_mean.zip:   0%|   | 0.00/728M [00:00<?, ?B/s][A
Downloading ERA5Land_2024_01_daily_mean.zip:   0%| | 1.05M/728M [00:00<06:26, 1.[A
Downloading ERA5Land_2024_01_daily_mean.zip:   0%| | 2.10M/728M [00:00<03:25, 3.[A
Downloading ERA5Land_2024_01_daily_mean.zip:   1%| | 4.19M/728M [00:00<01:41, 7.[A
Downloading ERA5Land_2024_01_daily_mean.zip:   1%| | 6.29M/728M [00:00<01:21, 8.[A
Downloading ERA5Land_2024_01_daily_mean.zip:   1%| | 9.44M/728M [00:01<00:59, 12[A
Downloading ERA5Land_2024_01_daily_mean.zip:   2%| | 11.5M/728M [00:01<01:05, 10[A
Downloading ERA5Land_2024_01_daily_mean.zip:   2%| | 13.6M/728M [00:01<00:57, 12[A
Downloading ERA5Land_2024_01_daily_mean.zip:   2%| 


=== Requesting 2024-02 [daily_mean] (12 vars) ===


2025-10-16 07:36:36,882 INFO Request ID is 9848347f-42e1-42e0-b8e1-e5e0735acf5e
2025-10-16 07:36:37,034 INFO status has been updated to accepted
Recovering from connection error [HTTPSConnectionPool(host='cds.climate.copernicus.eu', port=443): Read timed out. (read timeout=60)], attempt 1 of 500
Retrying in 120 seconds
Recovering from connection error [HTTPSConnectionPool(host='cds.climate.copernicus.eu', port=443): Read timed out. (read timeout=60)], attempt 1 of 500
Retrying in 120 seconds
2025-10-16 11:04:14,596 INFO status has been updated to running
Recovering from connection error [HTTPSConnectionPool(host='cds.climate.copernicus.eu', port=443): Read timed out. (read timeout=60)], attempt 1 of 500
Retrying in 120 seconds
2025-10-16 11:37:26,474 INFO status has been updated to successful

Downloading ERA5Land_2024_02_daily_mean.zip:   0%|   | 0.00/694M [00:00<?, ?B/s][A
Downloading ERA5Land_2024_02_daily_mean.zip:   0%| | 1.05M/694M [00:02<24:03, 48[A
Downloading ERA5Land_2024_0


=== Requesting 2024-03 [daily_mean] (12 vars) ===


2025-10-16 11:39:57,271 INFO Request ID is 371ee6d3-5b8a-447a-a276-b0ffadd1c2e2
2025-10-16 11:39:57,426 INFO status has been updated to accepted
2025-10-16 15:39:43,437 INFO status has been updated to running
Recovering from connection error [HTTPSConnectionPool(host='cds.climate.copernicus.eu', port=443): Read timed out. (read timeout=60)], attempt 1 of 500
Retrying in 120 seconds
2025-10-16 16:17:20,954 INFO status has been updated to successful

Downloading ERA5Land_2024_03_daily_mean.zip:   0%|   | 0.00/731M [00:00<?, ?B/s][A
Downloading ERA5Land_2024_03_daily_mean.zip:   0%| | 1.05M/731M [00:00<07:35, 1.[A
Downloading ERA5Land_2024_03_daily_mean.zip:   0%| | 2.10M/731M [00:00<03:52, 3.[A
Downloading ERA5Land_2024_03_daily_mean.zip:   1%| | 4.19M/731M [00:00<01:50, 6.[A
Downloading ERA5Land_2024_03_daily_mean.zip:   1%| | 6.29M/731M [00:01<01:36, 7.[A
Downloading ERA5Land_2024_03_daily_mean.zip:   1%| | 9.44M/731M [00:01<01:26, 8.[A
Downloading ERA5Land_2024_03_daily_mean.zip


=== Requesting 2024-04 [daily_mean] (12 vars) ===


2025-10-16 16:18:09,515 INFO Request ID is b4a587ca-7336-4b59-bf57-40fbaf8d26db
2025-10-16 16:18:09,646 INFO status has been updated to accepted
Recovering from connection error [HTTPSConnectionPool(host='cds.climate.copernicus.eu', port=443): Read timed out. (read timeout=60)], attempt 1 of 500
Retrying in 120 seconds
Recovering from connection error [HTTPSConnectionPool(host='cds.climate.copernicus.eu', port=443): Read timed out. (read timeout=60)], attempt 1 of 500
Retrying in 120 seconds
Recovering from connection error [HTTPSConnectionPool(host='cds.climate.copernicus.eu', port=443): Max retries exceeded with url: /api/retrieve/v1/jobs/b4a587ca-7336-4b59-bf57-40fbaf8d26db?log=True&request=True (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x7fb6a55130a0>: Failed to resolve 'cds.climate.copernicus.eu' ([Errno -3] Temporary failure in name resolution)"))], attempt 1 of 500
Retrying in 120 seconds
Recovering from connection error [HTTPSConnectionPool(h


=== Requesting 2024-05 [daily_mean] (12 vars) ===


2025-10-16 20:48:00,456 INFO Request ID is 9b4ca8f7-c87a-4188-ba4d-d5618ca3d3d9
2025-10-16 20:48:00,605 INFO status has been updated to accepted
Recovering from connection error [HTTPSConnectionPool(host='cds.climate.copernicus.eu', port=443): Read timed out. (read timeout=60)], attempt 1 of 500
Retrying in 120 seconds
Recovering from connection error [HTTPSConnectionPool(host='cds.climate.copernicus.eu', port=443): Read timed out. (read timeout=60)], attempt 1 of 500
Retrying in 120 seconds
Recovering from connection error [HTTPSConnectionPool(host='cds.climate.copernicus.eu', port=443): Read timed out. (read timeout=60)], attempt 1 of 500
Retrying in 120 seconds
2025-10-17 00:24:37,171 INFO status has been updated to running
2025-10-17 00:58:49,057 INFO status has been updated to successful

Downloading ERA5Land_2024_05_daily_mean.zip:   0%|   | 0.00/716M [00:00<?, ?B/s][A
Downloading ERA5Land_2024_05_daily_mean.zip:   0%| | 1.05M/716M [00:00<06:42, 1.[A
Downloading ERA5Land_2024_0


=== Requesting 2024-06 [daily_mean] (12 vars) ===


2025-10-17 00:59:36,063 INFO Request ID is b32876ad-789e-44bc-b270-a1a7620783c3
2025-10-17 00:59:36,214 INFO status has been updated to accepted
Recovering from connection error [HTTPSConnectionPool(host='cds.climate.copernicus.eu', port=443): Read timed out. (read timeout=60)], attempt 1 of 500
Retrying in 120 seconds
Recovering from connection error [HTTPSConnectionPool(host='cds.climate.copernicus.eu', port=443): Read timed out. (read timeout=60)], attempt 1 of 500
Retrying in 120 seconds
Recovering from connection error [HTTPSConnectionPool(host='cds.climate.copernicus.eu', port=443): Read timed out. (read timeout=60)], attempt 1 of 500
Retrying in 120 seconds
Recovering from connection error [HTTPSConnectionPool(host='cds.climate.copernicus.eu', port=443): Read timed out. (read timeout=60)], attempt 1 of 500
Retrying in 120 seconds
2025-10-17 04:27:07,931 INFO status has been updated to running
Recovering from connection error [HTTPSConnectionPool(host='cds.climate.copernicus.eu',


=== Requesting 2024-07 [daily_mean] (12 vars) ===


2025-10-17 05:50:33,434 INFO Request ID is bcb49355-7cda-4f1a-94a6-9dde47f4db17
2025-10-17 05:50:33,617 INFO status has been updated to accepted
Recovering from connection error [HTTPSConnectionPool(host='cds.climate.copernicus.eu', port=443): Read timed out. (read timeout=60)], attempt 1 of 500
Retrying in 120 seconds
Recovering from connection error [HTTPSConnectionPool(host='cds.climate.copernicus.eu', port=443): Read timed out. (read timeout=60)], attempt 1 of 500
Retrying in 120 seconds
2025-10-17 09:30:17,396 INFO status has been updated to running
Recovering from connection error [HTTPSConnectionPool(host='cds.climate.copernicus.eu', port=443): Read timed out. (read timeout=60)], attempt 1 of 500
Retrying in 120 seconds
2025-10-17 10:09:31,136 INFO status has been updated to successful

Downloading ERA5Land_2024_07_daily_mean.zip:   0%|   | 0.00/719M [00:00<?, ?B/s][A
Downloading ERA5Land_2024_07_daily_mean.zip:   0%| | 1.05M/719M [00:00<06:32, 1.[A
Downloading ERA5Land_2024_0


=== Requesting 2024-10 [daily_mean] (12 vars) ===


2025-10-17 10:10:35,451 INFO Request ID is af1327c4-1146-449c-b30b-e83df6c69b83
2025-10-17 10:10:35,701 INFO status has been updated to accepted
Recovering from connection error [HTTPSConnectionPool(host='cds.climate.copernicus.eu', port=443): Read timed out. (read timeout=60)], attempt 1 of 500
Retrying in 120 seconds
Recovering from connection error [HTTPSConnectionPool(host='cds.climate.copernicus.eu', port=443): Read timed out. (read timeout=60)], attempt 1 of 500
Retrying in 120 seconds
Recovering from connection error [HTTPSConnectionPool(host='cds.climate.copernicus.eu', port=443): Read timed out. (read timeout=60)], attempt 1 of 500
Retrying in 120 seconds
2025-10-17 13:47:12,924 INFO status has been updated to successful

Downloading ERA5Land_2024_10_daily_mean.zip:   0%|   | 0.00/735M [00:00<?, ?B/s][A
Downloading ERA5Land_2024_10_daily_mean.zip:   0%| | 1.05M/735M [00:00<06:42, 1.[A
Downloading ERA5Land_2024_10_daily_mean.zip:   0%| | 2.10M/735M [00:00<03:33, 3.[A
Downloa