In [None]:
# Primary requirements: python3, numpy, pandas, xarray, dask, bottleneck, netCDF4, cftime, matplotlib
# Run this script to check that all dependencies are installed and working.
# It also maps np.NaN to np.nan for the marineHeatWaves module.

import sys, platform
import numpy as np
import pandas as pd, xarray as xr, dask, bottleneck, netCDF4, cftime, matplotlib
from pathlib import Path
from typing import List, Optional, Tuple, Dict, Union
import matplotlib.pyplot as plt
import os
import matplotlib.dates as mdates
from importlib.metadata import version, PackageNotFoundError
from glob import glob

# NumPy 2.0 removed the legacy ``np.NaN`` spelling; re-create it as an alias of
# ``np.nan`` so code that still refers to ``np.NaN`` keeps working.
try:
    np.NaN
except AttributeError:
    np.NaN = np.nan

# Report the interpreter path and the Python version of this session.
print(sys.executable, platform.python_version(), sep="\n")

In [None]:
# Create daily-mean files from 3-hourly ERA5 wind data
from pathlib import Path
from glob import glob
import re
import pandas as pd
import xarray as xr
import dask

# ---- file locations ----
# Glob pattern matching the 3-hourly input files.
SRC_GLOB = "/home/Desktop/ERA5_wind_data_30S_30N_20E_110E/era5_wind_*.nc"
# Directory that receives the daily-mean files; created here if it is missing.
OUT_DIR = Path("/home/Desktop/Jupyter files/outputs/wind/daily")
OUT_DIR.mkdir(parents=True, exist_ok=True)

# ---- names used inside the files ----
tname = "valid_time"  # the files carry 'valid_time' rather than 'time'
u_name, v_name = "u10", "v10"

# ---- netCDF write encoding, identical for both wind components ----
# Compressed float32 with explicit chunk sizes. The chunk tuple is presumably
# ordered (time, lat, lon) and sized for a daily series (~365 steps) on the full
# 30S–30N, 20–110E 0.25° grid (241x361) — adjust it if your grid differs.
ENC = {
    var: {"zlib": True, "complevel": 4, "dtype": "float32", "chunksizes": (90, 120, 120)}
    for var in (u_name, v_name)
}

# Ask dask to split oversized chunks produced by slicing.
dask.config.set(**{"array.slicing.split_large_chunks": True})

# ------------- helpers ------------------
def year_from_path(p: str):
    """Return the four-digit year (19xx/20xx) embedded in a file path.

    The file name is searched first so that year-like digits in a parent
    directory (e.g. ``/data/2020_download/era5_wind_1982.nc``) cannot shadow
    the year of the file itself; the full path is used only as a fallback.
    A ``YYYYMM`` stamp such as ``198201`` yields its leading ``YYYY``.

    Parameters
    ----------
    p : str
        Path of an input file.

    Returns
    -------
    int or None
        The first ``19xx``/``20xx`` match, or ``None`` when there is none.
    """
    year_pattern = r"(?:19|20)\d{2}"
    # Prefer the file name; fall back to the whole path for layouts that keep
    # the year only in a directory component.
    for text in (Path(p).name, p):
        hit = re.search(year_pattern, text)
        if hit:
            return int(hit.group(0))
    return None

def open_year(paths):
    """Open a group of input files as one dataset holding only the wind components.

    ``paths`` is passed straight to ``xr.open_mfdataset``; the combined
    dataset is then restricted to the ``u_name`` and ``v_name`` variables.
    """
    reader_options = {
        "combine": "by_coords",
        "engine": "netcdf4",
        # -1 => do not split the time axis of a file into several chunks
        "chunks": {tname: -1},
        "parallel": False,
        # Chosen to avoid over-locking.
        # NOTE(review): confirm unlocked reads are safe with the scheduler in use.
        "lock": False,
        "decode_times": True,
        "drop_variables": None,
    }
    combined = xr.open_mfdataset(paths, **reader_options)
    wind_vars = [u_name, v_name]
    return combined[wind_vars]

# ------------- collect by year ----------
# Group the input files by the year parsed from each path; paths without a
# recognisable year are ignored.
paths = sorted(glob(SRC_GLOB))
by_year = {}
for p in paths:
    y = year_from_path(p)
    if y is not None:
        by_year.setdefault(y, []).append(p)

# Keep only years inside 1982–2024.
years = [y for y in sorted(by_year) if 1982 <= y <= 2024]
if not years:
    # Fail with an actionable message instead of an IndexError on years[0].
    raise FileNotFoundError(
        f"No input files for 1982–2024 matched {SRC_GLOB!r} "
        f"({len(paths)} path(s) matched the pattern)."
    )
print(f"Found years: {years[0]}–{years[-1]} ({len(years)} years)")

# ------------- convert per year ----------
# For every selected year: open its 3-hourly files, average to daily means and
# write one NetCDF file. Years whose output file already exists are skipped.
for y in years:
    out_nc = OUT_DIR / f"era5_wind_daily_{y}.nc"
    if out_nc.exists():
        print(f"[SKIP] {out_nc.name} already exists.")
        continue

    print(f"[{y}] opening {len(by_year[y])} file(s)…")
    ds3h = open_year(by_year[y])

    # daily mean for u10 and v10
    daily = (
        ds3h
        .resample({tname: "1D"})
        .mean()
        .astype("float32")        # compact on disk
        .sortby(tname)
    )

    # ensure exactly that year range (handles any file overlap)
    daily = daily.sel({tname: slice(f"{y}-01-01", f"{y}-12-31")})

    # Nothing to write: avoid creating an empty file that later runs would
    # treat as finished (and an IndexError in the sanity print below).
    if daily.sizes[tname] == 0:
        print(f"[{y}] no time steps fall inside {y}; nothing written.")
        continue

    # Write to a temporary name and rename only after a complete write, so an
    # interrupted run never leaves a partial file that the exists() check
    # above would skip on the next run.
    tmp_nc = out_nc.with_name(out_nc.name + ".tmp")
    print(f"[{y}] writing {out_nc.name} …")
    try:
        daily.to_netcdf(tmp_nc, format="NETCDF4", engine="netcdf4", encoding=ENC)
        tmp_nc.replace(out_nc)
    finally:
        # No-op after a successful rename; removes the partial file otherwise.
        tmp_nc.unlink(missing_ok=True)

    # small sanity print
    tt = pd.to_datetime(daily[tname].values)
    print(f"[{y}] saved {tt.size} daily steps; {tt[0].date()} → {tt[-1].date()}")

print("✓ Done. Daily files are in:", OUT_DIR)

"""


In [None]:
import numpy as np, pandas as pd, xarray as xr
from glob import glob

# Glob pattern for the daily wind files.
FILES_GLOB = "/home/Desktop/ERA5 wind data/ERA5_daily_wind_data_30S_30N_20E_110E/era5_wind_daily_*.nc"

# Open every matching file as a single dataset.
# NOTE(review): the chunk key is hard-coded to "valid_time", while the name
# detection below also allows a "time" dimension — confirm they agree.
ds = xr.open_mfdataset(
    sorted(glob(FILES_GLOB)),
    combine="by_coords",
    engine="netcdf4",
    chunks={"valid_time": 1752},
)

# Resolve the time / latitude / longitude names actually present in the dataset.
tname = "valid_time" if "time" not in ds.dims else "time"
latn = "latitude" if "lat" not in ds.coords else "lat"
lonn = "longitude" if "lon" not in ds.coords else "lon"
u_name, v_name = "u10", "v10"
print(f"Detected wind variables: u='{u_name}', v='{v_name}' (10 m)")

# Heading for the coordinate preview that follows.
print("\n1st three cordinated ")
def _preview(arr):
    """Return a short text preview of ``arr.values``.

    Zero-dimensional values are shown via their repr followed by
    ``(scalar)``; anything else shows the first three and last three entries.
    """
    data = arr.values
    if data.ndim:
        head, tail = data[:3], data[-3:]
        return f"{head} ... {tail}"
    return f"{data!r} (scalar)"
# One preview line per core coordinate (time, latitude, longitude).
for c in (tname, latn, lonn):
    summary = _preview(ds[c])
    print(f"{c:>10s}: {summary}")
# 'expver' is previewed only when the dataset carries it as a coordinate.
if "expver" in ds.coords:
    expver_summary = _preview(ds["expver"])
    print(f"{'expver':>10s}: {expver_summary}")

# print dimensions
print("\nDimensions:")
for dim_name, dim_len in ds.sizes.items():
    print(f"{dim_name}: {dim_len}")

# show the variable attributes & chunking
def show_var(name):
    """Print the attributes of ``ds[name]`` and, when it has one, its chunk layout."""
    da = ds[name]
    print(f"\nVariable '{name}' attrs")
    for key, value in da.attrs.items():
        print(f"{key}: {value}")
    # Arrays without a (non-empty) ``chunks`` attribute print nothing here.
    chunk_layout = getattr(da.data, "chunks", None)
    if chunk_layout:
        print("chunks:", chunk_layout)

# Attribute / chunk summary for both wind components.
for wind_var in (u_name, v_name):
    show_var(wind_var)

# Time coverage: first and last time stamp plus the number of steps.
t = pd.to_datetime(ds[tname].values)
t0, t1 = t[0], t[-1]
print(f"\n=== Time Coverage ===\nStart: {t0}  End: {t1}  Length: {t.size} steps")

# Mean over time, latitude and longitude for the first and the last calendar
# day, as a quick plausibility check of the values.
u, v = ds[u_name], ds[v_name]
first_label, last_label = str(t0.date()), str(t1.date())
first_day = slice(first_label, first_label)
last_day = slice(last_label, last_label)


def _domain_mean(field, day):
    """Average ``field`` over time/lat/lon within ``day`` and return a Python scalar."""
    return field.sel({tname: day}).mean([tname, latn, lonn]).compute().item()


u_first = _domain_mean(u, first_day)
v_first = _domain_mean(v, first_day)
u_last = _domain_mean(u, last_day)
v_last = _domain_mean(v, last_day)
print(f"\nsample spatial velocities plotting u10 and v10 in m/s\n"
      f"{first_label}: u={u_first:.2f}, v={v_first:.2f}\n"
      f"{last_label}: u={u_last:.2f},  v={v_last:.2f}")
