In [5]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""
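Download Global WaterPack daily files (main water mask + obs layer) for every
day from START_DATE to END_DATE, inclusive. The overall target period is
2018-09-01 to 2022-12-31; START_DATE / END_DATE in the CONFIG section select
the part of that period fetched by a given run.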

Output structure:
    /mnt/cephfs-mount/chenchen/GlobalWaterPack/YYYY/MM/DD/*.tif
"""

import os
import time
import datetime as dt
from pathlib import Path
import requests
from tqdm import tqdm

# ========= CONFIG =========
# Remote root; main() appends /YYYY/MM/DD/<filename> to it.
BASE_URL = "https://download.geoservice.dlr.de/GWP/files/daily"
# Local root; main() stores files below it as YYYY/MM/DD/<filename>.
OUT_ROOT = Path("/mnt/cephfs-mount/chenchen/GlobalWaterPack")

# Inclusive date range processed by main().
START_DATE = dt.date(year=2021, month=12, day=7)
END_DATE = dt.date(year=2022, month=12, day=31)

# File-name endings fetched for each day.
SUFFIXES = [
    ".tif",  # main daily water mask
    ".obs.tif",  # observation layer
]

# Retry settings used by download_file().
MAX_RETRIES = 3  # attempts per file
TIMEOUT = 60  # seconds, passed to requests.get(timeout=...)


def download_file(url: str, out_path: Path) -> bool:
    """
    Download a single file with retries.

    The body is streamed into ``<out_path>.part`` and only moved to
    ``out_path`` once the transfer has completed, so an interrupted run never
    leaves a truncated file under the final name.

    Args:
        url: Remote file to fetch.
        out_path: Final local path; its parent directory is created if needed.

    Returns:
        True if the file was downloaded, or already exists and is non-empty.
        False if the server answered 404 or every one of the MAX_RETRIES
        attempts failed. Only the latter case is reported (via ``tqdm.write``).
    """
    if out_path.exists() and out_path.stat().st_size > 0:
        return True  # already present: skip silently to keep the progress bar clean

    out_path.parent.mkdir(parents=True, exist_ok=True)
    tmp_path = out_path.with_suffix(out_path.suffix + ".part")
    last_error = None

    for attempt in range(1, MAX_RETRIES + 1):
        try:
            with requests.get(url, stream=True, timeout=TIMEOUT) as r:
                if r.status_code == 404:
                    # Not published for this day; retrying would not help.
                    return False
                r.raise_for_status()

                written = 0
                with open(tmp_path, "wb") as f:
                    for chunk in r.iter_content(chunk_size=1024 * 1024):
                        if chunk:
                            f.write(chunk)
                            written += len(chunk)

                # Guard against a connection that closed early: a short file
                # would otherwise be moved into place and then skipped as
                # "already present" on every later run. The check is skipped
                # when the body is content-encoded, because iter_content()
                # yields decoded bytes whose count differs from Content-Length.
                declared = r.headers.get("Content-Length", "")
                if (
                    declared.isdigit()
                    and not r.headers.get("Content-Encoding")
                    and written != int(declared)
                ):
                    raise OSError(
                        f"incomplete download: got {written} of {declared} bytes"
                    )

            # replace() overwrites an existing target on every platform.
            tmp_path.replace(out_path)
            return True
        except Exception as exc:
            # Deliberately broad: one bad file must not abort a multi-hour
            # batch run. The error is kept and reported once retries run out.
            last_error = exc
            try:
                tmp_path.unlink()  # do not leave a partial file behind
            except OSError:
                pass
            if attempt < MAX_RETRIES:
                time.sleep(3 * attempt)  # linear back-off before the next try

    tqdm.write(f"FAILED after {MAX_RETRIES} attempts: {url} ({last_error!r})")
    return False


def main():
    """
    Download every configured file for each day from START_DATE to END_DATE.

    For each day the remote URL is ``BASE_URL/YYYY/MM/DD/<stem><suffix>`` and
    the local target is ``OUT_ROOT/YYYY/MM/DD/<stem><suffix>``, with one file
    per entry in SUFFIXES. The result of every download_file() call is
    tallied and a summary is printed at the end, so files that could not be
    obtained are reported instead of being dropped silently.
    """
    OUT_ROOT.mkdir(parents=True, exist_ok=True)

    # Inclusive range: both START_DATE and END_DATE are processed.
    total_days = (END_DATE - START_DATE).days + 1
    date_list = [START_DATE + dt.timedelta(days=i) for i in range(total_days)]

    print(f"Total days to download: {total_days} (from {START_DATE} to {END_DATE})")
    print(f"Saving to: {OUT_ROOT}")

    ok_count = 0
    not_obtained = []  # URLs for which download_file() returned False

    # Main progressbar over days
    for current in tqdm(date_list, desc="Downloading daily GWP files", unit="day"):

        y = current.year
        m = current.month
        d = current.day
        ymd = f"{y:04d}{m:02d}{d:02d}"

        remote_dir = f"{BASE_URL}/{y:04d}/{m:02d}/{d:02d}"
        stem = f"GWP.OSWF.DAILY.{ymd}.v1"

        out_dir = OUT_ROOT / f"{y:04d}" / f"{m:02d}" / f"{d:02d}"

        for suffix in SUFFIXES:
            filename = stem + suffix
            url = f"{remote_dir}/{filename}"
            out_path = out_dir / filename
            if download_file(url, out_path):
                ok_count += 1
            else:
                not_obtained.append(url)

    print("\nAll downloads finished.")
    print(f"Files available locally (downloaded or already present): {ok_count}")
    if not_obtained:
        # download_file() returns False both for a 404 and for exhausted
        # retries, so the two cases cannot be told apart here.
        print(f"Files not obtained (missing on server or failed): {len(not_obtained)}")
        for url in not_obtained:
            print(f"  {url}")


# Script entry point: start the download only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()

Total days to download: 390 (from 2021-12-07 to 2022-12-31)
Saving to: /mnt/cephfs-mount/chenchen/GlobalWaterPack


Downloading daily GWP files: 100%|███████████████████████████████████████████████████████████████████████| 390/390 [3:34:10<00:00, 32.95s/day]


All downloads finished.



