In [14]:
import os, time, shutil
# 1) Disable HDF5 file locking for this process through the
#    HDF5_USE_FILE_LOCKING environment variable (safe on local disks).
# NOTE(review): presumably this has to be set before any HDF5-backed library
# (netCDF4 / h5netcdf) opens a file -- confirm this cell runs first on a fresh kernel.
os.environ["HDF5_USE_FILE_LOCKING"] = "FALSE"

# 2) (Optional) move outputs OFF OneDrive/Dropbox folders
# e.g., to something like: r"C:\Data\era5_clean"


In [2]:
import os

In [3]:
# Show the kernel's current working directory; relative paths such as './data'
# are resolved against it.
print(os.getcwd())

C:\Users\_s2218026


In [4]:
# Switch the working directory to the project folder so relative paths
# (e.g. './data') resolve there.
# NOTE(review): hard-coded, user-specific absolute path -- consider a single
# configurable base directory instead.
os.chdir('/Users/_s2218026/Documents/lab')

In [5]:
!pip install numpy pandas xarray netCDF4 cdsapi scikit-learn matplotlib seaborn



In [6]:
import cdsapi
import calendar
import time

In [7]:
# Smoke test: constructing the client must not raise.
# NOTE(review): presumably the client picks up credentials from the user's CDS
# configuration; this does not show that a request would succeed -- confirm.
c = cdsapi.Client()
print("cdsapi is working")

cdsapi is working


In [8]:
# Make sure the download folder exists before anything is fetched.
os.makedirs('./data', exist_ok=True)

# One shared CDS client, used by the download helpers.
c = cdsapi.Client()

# Request-size knobs. Each (variable group, pressure-level group) pair becomes
# its own request, so smaller groups mean smaller individual downloads.
VAR_GROUPS = [
    ['u_component_of_wind', 'v_component_of_wind'],   # group 1: horizontal wind
    ['temperature', 'relative_humidity'],             # group 2
    ['vorticity'],                                    # group 3
]
# One pressure level per request keeps every request tiny.
PLEV_GROUPS = [[level] for level in ('850', '600', '200')]
# Two synoptic times per day to reduce volume; use 4 times if you really need them.
TIMES = ['00:00', '12:00']

def days_in_month(y, m):
    """Return the zero-padded day labels ('01', '02', ...) of month ``m`` in year ``y``."""
    _, n_days = calendar.monthrange(y, m)
    labels = []
    for day in range(1, n_days + 1):
        labels.append(f"{day:02d}")
    return labels

def retrieve_small_month(y, m, vars_, plev_, outpath):
    """Download one month of ERA5 pressure-level data into ``outpath``.

    The request is sent through the module-level client ``c`` and is retried
    up to three times with a short, growing pause between attempts.

    The data is first written to ``outpath + ".part"`` and only moved to
    ``outpath`` once the download call has returned. A failed or interrupted
    download therefore never leaves a file at ``outpath``, which matters
    because callers treat an existing ``outpath`` as "already downloaded".

    Args:
        y: Year of the month to fetch.
        m: Month number (1-12).
        vars_: List of ERA5 variable names for the request.
        plev_: List of pressure levels (as strings) for the request.
        outpath: Destination path of the NetCDF file.

    Raises:
        RuntimeError: If every attempt failed; the last underlying error is
            attached as ``__cause__``.
    """
    max_attempts = 3
    part_path = outpath + ".part"
    last_error = None

    for attempt in range(max_attempts):
        try:
            print(f"→ {y}-{m:02d} vars={vars_} plev={plev_} -> {outpath}")
            c.retrieve(
                'reanalysis-era5-pressure-levels',
                {
                    'product_type': 'reanalysis',
                    'format': 'netcdf',
                    'variable': vars_,
                    'pressure_level': plev_,
                    'year': f"{y}",
                    'month': f"{m:02d}",
                    'day': days_in_month(y, m),
                    'time': TIMES,
                    'area': [30, 100, 0, 180],   # N, W, S, E
                    'grid': [0.25, 0.25],        # change to [0.5,0.5] if still too big
                },
                part_path
            )
            # Publish the file only after the download finished.
            os.replace(part_path, outpath)
            return
        except Exception as e:
            last_error = e
            print(f"   attempt {attempt+1} failed: {e}")
            # Best-effort removal of a partial download so it cannot be
            # mistaken for a finished file on a later run.
            try:
                if os.path.exists(part_path):
                    os.remove(part_path)
            except OSError:
                pass
            # Back off before the next attempt, but not after the last one.
            if attempt < max_attempts - 1:
                time.sleep(5*(attempt+1))
    raise RuntimeError(f"Failed after retries: {y}-{m:02d} {vars_} {plev_}") from last_error

def retrieve_range_small(start_year, end_year_inclusive):
    """Fetch every (year, month, variable group, level group) file in the range.

    Iterates years ``start_year``..``end_year_inclusive`` and months 1-12, and
    for each combination of ``VAR_GROUPS`` and ``PLEV_GROUPS`` downloads the
    file through ``retrieve_small_month`` unless the target already exists.
    """
    def _jobs():
        # Lazily enumerate the requests in year > month > variables > level order.
        for year in range(start_year, end_year_inclusive + 1):
            for month in range(1, 13):
                for var_group in VAR_GROUPS:
                    for lev_group in PLEV_GROUPS:
                        yield year, month, var_group, lev_group

    for year, month, var_group, lev_group in _jobs():
        target = f'./data/era_{year}_{month:02d}_{"-".join(var_group)}_{lev_group[0]}.nc'
        if not os.path.exists(target):
            retrieve_small_month(year, month, var_group, lev_group, target)
        else:
            print(f"skip existing {target}")

In [9]:
# Download 1994 through 2024 (end year inclusive); files already on disk are skipped.
retrieve_range_small(1994, 2024)

skip existing ./data/era_1994_01_u_component_of_wind-v_component_of_wind_850.nc
skip existing ./data/era_1994_01_u_component_of_wind-v_component_of_wind_600.nc
skip existing ./data/era_1994_01_u_component_of_wind-v_component_of_wind_200.nc
skip existing ./data/era_1994_01_temperature-relative_humidity_850.nc
skip existing ./data/era_1994_01_temperature-relative_humidity_600.nc
skip existing ./data/era_1994_01_temperature-relative_humidity_200.nc
skip existing ./data/era_1994_01_vorticity_850.nc
skip existing ./data/era_1994_01_vorticity_600.nc
skip existing ./data/era_1994_01_vorticity_200.nc
skip existing ./data/era_1994_02_u_component_of_wind-v_component_of_wind_850.nc
skip existing ./data/era_1994_02_u_component_of_wind-v_component_of_wind_600.nc
skip existing ./data/era_1994_02_u_component_of_wind-v_component_of_wind_200.nc
skip existing ./data/era_1994_02_temperature-relative_humidity_850.nc
skip existing ./data/era_1994_02_temperature-relative_humidity_600.nc
skip existing ./data

In [15]:
import os, re, gc, calendar, warnings
import numpy as np
import xarray as xr

import os

# Input folder holding the downloaded files and output folder for the cleaned copies.
# NOTE(review): hard-coded, user-specific absolute Windows paths -- adjust per machine.
DATA_DIR   = r"C:\Users\_s2218026\Documents\lab\data"   # <-- your actual folder
DATA_CLEAN = r"C:\Users\_s2218026\Documents\lab\data_clean"  # new output folder
os.makedirs(DATA_CLEAN, exist_ok=True)


# Bounding box used when clipping: latitude 0..30, longitude 100..180
# (the same numbers as the 'area' of the download request).
LAT_MAX, LAT_MIN = 30.0, 0.0
LON_MIN, LON_MAX = 100.0, 180.0

def _std_lon360(ds):
    if "longitude" in ds.coords:
        lon = ((ds.longitude + 360) % 360)
        ds  = ds.assign_coords(longitude=lon).sortby("longitude")
    return ds

def _std_lat_desc(ds):
    if "latitude" in ds.coords:
        lat = ds.latitude
        if lat[0] < lat[-1]:
            ds = ds.sortby("latitude", ascending=False)
    return ds

def _clip_bbox(ds):
    if "latitude" in ds.coords and "longitude" in ds.coords:
        return ds.sel(latitude=slice(LAT_MAX+0.5, LAT_MIN-0.5),
                      longitude=slice(LON_MIN-0.5, LON_MAX+0.5))
    return ds

def _normalize_vars(ds):
    rename_map = {
        'u_component_of_wind': 'u',
        'v_component_of_wind': 'v',
        'temperature': 't',
        'relative_humidity': 'r',
        'vorticity': 'vo',
    }
    to_rename = {k:v for k,v in rename_map.items() if k in ds.data_vars}
    return ds.rename(to_rename) if to_rename else ds

def _standardize_dims(ds):
    for cand in ["time","valid_time"]:
        if cand in ds.coords:
            if cand != "time": ds = ds.rename({cand:"time"})
    for cand in ["level","pressure_level","isobaricInhPa"]:
        if cand in ds.coords:
            if cand != "level": ds = ds.rename({cand:"level"})
    return ds

# Backend name passed as `engine=` when the cleaned files are written.
ENGINE = "netcdf4"  # more stable on Windows; switch back to 'h5netcdf' later if you want

def _save_nc(ds, path):
    """Write ``ds`` to ``path`` as uncompressed float32 NetCDF via a temp file.

    Every data variable is cast to float32 and re-chunked with ``-1`` along
    ``time``, ``latitude`` and ``longitude``. The data is written to
    ``path + ".tmp"`` and then moved onto ``path`` with ``os.replace``, so
    ``path`` only ever appears as a finished file. The write is tried up to
    four times with a growing pause; the error of the fourth failure is
    re-raised.
    """
    def _discard(tmp_path):
        # Best-effort removal of a leftover temp file; failures are ignored.
        try:
            if os.path.exists(tmp_path):
                os.remove(tmp_path)
        except Exception:
            pass

    # Per-variable encoding: float32, no compression, contiguous (fast IO).
    encoding = {}
    for name in ds.data_vars:
        encoding[name] = {'dtype': 'float32', 'zlib': False, 'contiguous': True}

    as_float32 = ds.map(lambda da: da.astype('float32'))
    payload = as_float32.chunk({'time': -1, 'latitude': -1, 'longitude': -1})

    scratch = path + ".tmp"
    _discard(scratch)  # remove stale temp if exists

    final_attempt = 3
    for attempt in range(final_attempt + 1):
        try:
            with xr.set_options(file_cache_maxsize=1):
                payload.to_netcdf(scratch, encoding=encoding, engine=ENGINE, mode="w")
            # replace in one step so indexers/AV see a complete file
            os.replace(scratch, path)
            return
        except Exception:
            _discard(scratch)
            if attempt == final_attempt:
                raise
            # brief backoff before the next try
            time.sleep(1.0 + 1.5*attempt)

In [16]:
def clean_month(y, m):
    """Standardise, sanity-check and re-save the downloaded files of one month.

    For each expected file key the source ``era_{y}_{mm}_{key}.nc`` in
    ``DATA_DIR`` is opened, put through the coordinate / variable
    normalisation helpers, validated, and written to ``DATA_CLEAN`` under the
    same file name. Missing sources and already-cleaned targets are skipped.

    Validation: ``u``/``v`` strictly within +-200, ``t`` strictly within
    150..360, ``vo`` strictly within +-1e-2; ``r`` is clipped to 0..100 rather
    than checked.

    The source file is opened in a ``with`` block so it is closed even when a
    check or the save step raises.

    Args:
        y: Year of the month to clean.
        m: Month number (1-12).

    Raises:
        AssertionError: If a structural or range check fails.
        Exception: Whatever opening or saving the dataset raises.
    """
    # NOTE(review): only vorticity_850 is listed although the download step also
    # requests vorticity at 600 and 200 -- confirm this is intentional.
    keys = [
        "u_component_of_wind-v_component_of_wind_200",
        "u_component_of_wind-v_component_of_wind_600",
        "u_component_of_wind-v_component_of_wind_850",
        "temperature-relative_humidity_200",
        "temperature-relative_humidity_600",
        "temperature-relative_humidity_850",
        "vorticity_850",
    ]
    for key in keys:
        src = os.path.join(DATA_DIR, f"era_{y}_{m:02d}_{key}.nc")
        dst = os.path.join(DATA_CLEAN, f"era_{y}_{m:02d}_{key}.nc")

        if not os.path.exists(src):
            print(f"[{y}-{m:02d}] missing {key}, skip.")
            continue
        if os.path.exists(dst):
            print(f"[{y}-{m:02d}] exists(clean): {key}")
            continue

        # Keep a handle on the dataset that was actually opened: the helpers may
        # return new objects, and closing such a derived object is not
        # guaranteed to release the source file.
        with xr.open_dataset(src) as raw:
            ds = _standardize_dims(_normalize_vars(_clip_bbox(_std_lat_desc(_std_lon360(raw)))))

            assert "time" in ds.coords, "Missing time coord"
            if "level" in ds.coords:
                assert ds.level.ndim == 1, "level coordinate must be 1-D"

            # Snapshot the names: 'r' is reassigned inside the loop.
            for v in list(ds.data_vars):
                if v in ("u","v"): assert ds[v].max() < 200 and ds[v].min() > -200, f"{v} out of range"
                if v == "t":       assert ds[v].max() < 360 and ds[v].min() > 150, "t out of range"
                if v == "r":       ds[v] = ds[v].clip(0,100)
                if v == "vo":      assert ds[v].max() < 1e-2 and ds[v].min() > -1e-2, "vo out of range"

            # Must happen while the source is still open (data may load lazily).
            _save_nc(ds, dst)

        del ds; gc.collect()
        print(f"[{y}-{m:02d}] cleaned -> {dst}")


In [12]:
pip install h5netcdf

Note: you may need to restart the kernel to use updated packages.


In [None]:
# Clean every month of 1994-2024. The end year is inclusive so the loop covers
# the same years as the download call `retrieve_range_small(1994, 2024)`;
# the previous `range(1994, 2024)` stopped at 2023.
# A failure in one month is reported as a warning and does not stop the run.
for y in range(1994, 2024 + 1):
    for m in range(1, 13):
        try:
            clean_month(y, m)
        except Exception as e:
            warnings.warn(f"[{y}-{m:02d}] cleaning failed: {e}")


[1994-01] exists(clean): u_component_of_wind-v_component_of_wind_200
[1994-01] exists(clean): u_component_of_wind-v_component_of_wind_600
[1994-01] exists(clean): u_component_of_wind-v_component_of_wind_850
[1994-01] exists(clean): temperature-relative_humidity_200
[1994-01] exists(clean): temperature-relative_humidity_600
[1994-01] exists(clean): temperature-relative_humidity_850
[1994-01] exists(clean): vorticity_850
[1994-02] exists(clean): u_component_of_wind-v_component_of_wind_200
[1994-02] exists(clean): u_component_of_wind-v_component_of_wind_600
[1994-02] exists(clean): u_component_of_wind-v_component_of_wind_850
[1994-02] exists(clean): temperature-relative_humidity_200
[1994-02] exists(clean): temperature-relative_humidity_600
[1994-02] exists(clean): temperature-relative_humidity_850
[1994-02] exists(clean): vorticity_850
[1994-03] exists(clean): u_component_of_wind-v_component_of_wind_200
[1994-03] exists(clean): u_component_of_wind-v_component_of_wind_600
[1994-03] exists