In [None]:
import xarray as xr

In [6]:
# Set the path to your NetCDF file here. Use a relative path or update as needed.
DATA_PATH = "./era5_caribbean.nc"  # <-- Update this path as needed
original_nc = xr.open_dataset(DATA_PATH)
original_nc

In [None]:

"""
Split a single NetCDF file into timestamped per-time files that can be collected
by `MeteoDataset.collect(...)` as "analysis" data.

Output file naming follows:
    <dataset_name>.<YYYYMMDD_HHMM>.nc

Place all files directly inside the chosen output directory.
This matches the code path that collects "analysis" datasets.

Usage (example):
    python split_to_meteodataset.py \
        --in path/to/input.nc \
        --out /path/to/output_dir \
        --name mydataset \
        --map u10=wind_u v10=wind_v msl=barometric_pressure tp=precipitation
"""

from __future__ import annotations
import argparse
from pathlib import Path
from typing import Dict

import numpy as np


# ---- helpers ----

def _maybe_rename_dims(ds: xr.Dataset) -> xr.Dataset:
    """Rename common dimension/coord variants to lon/lat/time if needed."""
    rename_map = {}
    if "longitude" in ds:
        rename_map["longitude"] = "lon"
    if "latitude" in ds:
        rename_map["latitude"] = "lat"
    # Some models use "x"/"y" as coords with a geographic CRS. If they already are "lon/lat" names, we leave them.
    if "x" in ds and "lon" not in ds and np.ndim(ds["x"]) == 1:
        rename_map["x"] = "lon"
    if "y" in ds and "lat" not in ds and np.ndim(ds["y"]) == 1:
        rename_map["y"] = "lat"
    if "valid_time" in ds and "time" not in ds:
        rename_map["valid_time"] = "time"
    if "Time" in ds and "time" not in ds:
        rename_map["Time"] = "time"
    if rename_map:
        ds = ds.rename(rename_map)
    return ds


def _standardize_longitude(ds: xr.Dataset) -> xr.Dataset:
    """Ensure longitude is in [-180, 180] and sorted increasingly, matching what MeteoDataset expects.
    Also ensure latitude is sorted from low to high."""
    # Longitude standardization
    if "lon" in ds:
        lon = ds["lon"].to_numpy()
        if np.nanmin(lon) >= 0 and np.nanmax(lon) > 180:
            lon_wrapped = ((lon + 180) % 360) - 180
            ds = ds.assign_coords(lon=("lon", lon_wrapped))
        ds = ds.sortby("lon")
    # Latitude standardization
    if "lat" in ds:
        lat = ds["lat"].to_numpy()
        if lat[0] > lat[-1]:
            ds = ds.sortby("lat")
    return ds


def _apply_var_mapping(ds: xr.Dataset, var_map: Dict[str, str]) -> xr.Dataset:
    """Rename variables according to --map (e.g., u10=wind_u). Missing sources are ignored."""
    rename_dict = {src: dst for src, dst in var_map.items() if src in ds}
    if rename_dict:
        ds = ds.rename(rename_dict)
    return ds


def _coerce_types(ds: xr.Dataset) -> xr.Dataset:
    """Make data float32 to save space; leave coordinates as-is."""
    for v in list(ds.data_vars):
        if np.issubdtype(ds[v].dtype, np.number):
            ds[v] = ds[v].astype("float32")
    return ds


def _select_expected_vars(ds: xr.Dataset) -> xr.Dataset:
    """Keep only variables used by MeteoDataset by default."""
    wanted = {"wind_u", "wind_v", "barometric_pressure", "precipitation"}
    present = [v for v in ds.data_vars if v in wanted]
    if present:
        ds = ds[present]
    return ds


def split_to_meteodataset(
    in_path: Path,
    out_dir: Path,
    dataset_name: str,
    var_map: Dict[str, str] | None = None,
    drop_reftime: bool = True,
) -> None:
    out_dir.mkdir(parents=True, exist_ok=True)
    with xr.open_dataset(in_path) as ds:
        ds = _maybe_rename_dims(ds)
        ds = _standardize_longitude(ds)
        if var_map:
            ds = _apply_var_mapping(ds, var_map)
        ds = _select_expected_vars(ds)
        ds = _coerce_types(ds)

        if "time" not in ds:
            # If no time dimension, write a single timestamped file using "now"
            raise ValueError("Input dataset has no 'time' coordinate; please add one.")

        # Ensure time is datetime64[ns]
        # first check in it is already a datetimeindex
        if not ds["time"].dtype == "datetime64[ns]":
            ds["time"] = xr.decode_cf(ds).indexes["time"].to_datetimeindex()

        times = ds["time"].values
        if times.size == 0:
            raise ValueError("No timesteps found in 'time' coordinate.")

        for i in range(times.size):
            t = np.datetime64(times[i], "ns")
            tstr = np.datetime_as_string(t, unit="m").replace("-", "").replace("T", "_").replace(":", "")[:13]
            # tstr now like YYYYMMDD_HHMM
            # Slice and drop the time dimension/coord in the file (MeteoDataset.collect expects 2D fields)
            dsi = ds.isel(time=i)
            if drop_reftime and "reftime" in dsi.variables:
                dsi = dsi.drop_vars("reftime")
            # Keep lon/lat coords and data vars
            # Write file
            fname = f"{dataset_name}.{tstr}.nc"
            dsi.drop_vars("time").to_netcdf(out_dir / fname)

    print(f"Wrote per-time files to: {out_dir}")


def parse_var_map(items):
    """Parse --map entries like 'u10=wind_u v10=wind_v' into a dict."""
    mapping = {}
    if not items:
        return mapping
    for item in items:
        if "=" not in item:
            raise argparse.ArgumentError(None, f"Bad map entry '{item}', expected src=dst")
        src, dst = item.split("=", 1)
        mapping[src.strip()] = dst.strip()
    return mapping




Splitting netcdf such that it us compatible with meteodataset

In [16]:
from pathlib import Path

# Example variable mapping (adjust to your dataset’s variable names)
var_map = {
    "u10": "wind_u",
    "v10": "wind_v",
}

split_to_meteodataset(
    in_path=Path(r"c:\projects\2psips\Hurrywave\Manual\era5_caribbean.nc"),    # your big NetCDF file
    out_dir=Path(r"c:\projects\2psips\Hurrywave\Manual\era5_caribean"),  # where small files should go
    dataset_name="era5",            # prefix for file names
    var_map=var_map
)


Wrote per-time files to: c:\projects\2psips\Hurrywave\Manual\era5_caribean
