In [1]:
import os, glob, calendar, warnings, gc
import numpy as np
import pandas as pd
import xarray as xr

# --- Notebook configuration: absolute locations of inputs and outputs. ---
# NOTE(review): these are user-specific absolute Windows paths; anyone else
# running the notebook has to edit them first.
DATA_CLEAN   = r"C:\Users\_s2218026\Documents\lab\data_clean"     # input: cleaned ERA5 files (era_<year>_<month>_<key>.nc)
FEATURE_DIR  = r"C:\Users\_s2218026\Documents\lab\features_safe"  # output: per-month feature files (created below)
SAMPLES_DIR  = r"C:\Users\_s2218026\Documents\lab\ml_samples"     # output directory (created below); not used in the cells shown here
IBTRACS_CSV  = r"C:\Users\_s2218026\Documents\lab\ibtracs_genesis.csv"  # genesis file; not read in the cells shown here

# Create the output directories up front; exist_ok makes re-running this cell harmless.
os.makedirs(FEATURE_DIR, exist_ok=True)
os.makedirs(SAMPLES_DIR, exist_ok=True)


In [5]:
# File-name keys of the cleaned ERA5 files: each multi-level field group at
# 200/600/850, followed by the single 850 vorticity file.
KEYS = [
    f"{field_group}_{level}"
    for field_group in (
        "u_component_of_wind-v_component_of_wind",
        "temperature-relative_humidity",
    )
    for level in (200, 600, 850)
] + ["vorticity_850"]

def load_clean_month(y, m):
    """Open every available cleaned file for one month and merge them.

    For each entry of ``KEYS`` the file ``era_<y>_<mm>_<key>.nc`` under
    ``DATA_CLEAN`` is opened if it exists; absent files are reported on
    stdout and skipped.

    Parameters
    ----------
    y : int
        Year used in the file name.
    m : int
        Month used in the file name (formatted as two digits).

    Returns
    -------
    xarray.Dataset
        Result of ``xr.merge`` over the opened datasets with
        ``compat="override"`` and ``join="inner"``.

    Raises
    ------
    FileNotFoundError
        If none of the expected files exists.
    """
    opened = []
    for field_key in KEYS:
        nc_path = os.path.join(DATA_CLEAN, f"era_{y}_{m:02d}_{field_key}.nc")
        if not os.path.exists(nc_path):
            print(f"  missing cleaned file: {os.path.basename(nc_path)}")
            continue
        opened.append(xr.open_dataset(nc_path))

    if len(opened) == 0:
        raise FileNotFoundError(f"No cleaned ERA5 files for {y}-{m:02d}")

    return xr.merge(opened, compat="override", join="inner")

def open_clean_file(year, month, key):
    """Open a single cleaned ERA5 file.

    The file name is ``era_<year>_<mm>_<key>.nc`` inside ``DATA_CLEAN``.

    Returns
    -------
    xarray.Dataset
        The dataset returned by ``xr.open_dataset``.

    Raises
    ------
    FileNotFoundError
        If the file does not exist; the exception argument is the full path.
    """
    file_name = f"era_{year}_{month:02d}_{key}.nc"
    nc_path = os.path.join(DATA_CLEAN, file_name)
    if os.path.exists(nc_path):
        return xr.open_dataset(nc_path)
    raise FileNotFoundError(nc_path)
    
def compute_features_month(year, month):
    """Build the feature dataset for one month from the cleaned ERA5 files.

    Five cleaned files are opened (u/v at 200 and 850, t/r at 600 and 850,
    vorticity at 850), restricted to the timestamps they all share, and
    combined into four variables:

    * ``vo850``              - ``vo`` from the vorticity_850 file
    * ``shear_mag_850_200``  - sqrt((u200 - u850)**2 + (v200 - v850)**2)
    * ``r600``               - ``r`` from the temperature-relative_humidity_600 file
    * ``t850``               - ``t`` from the temperature-relative_humidity_850 file

    Parameters
    ----------
    year, month : int
        Month to process; forwarded to ``open_clean_file``.

    Returns
    -------
    xarray.Dataset
        Features on (time, latitude, longitude), loaded into memory so the
        result does not depend on the source files staying open.

    Raises
    ------
    FileNotFoundError
        Propagated from ``open_clean_file`` when a required file is absent.
    ValueError
        If the five files share no timestamps.
    """
    # Short name -> file key, in the order the files are opened.
    field_keys = {
        "uv200": "u_component_of_wind-v_component_of_wind_200",
        "uv850": "u_component_of_wind-v_component_of_wind_850",
        "trh600": "temperature-relative_humidity_600",
        "trh850": "temperature-relative_humidity_850",
        "vo850": "vorticity_850",
    }

    # Keep the handles returned by open_clean_file so they can be closed on
    # every exit path (success, missing later file, no common times, ...).
    opened = {}
    try:
        for short, key in field_keys.items():
            opened[short] = open_clean_file(year, month, key)

        # --- 1) Common time axis = intersection of all five time indexes
        time_idx = opened["uv200"].time.to_index()
        for short in ("uv850", "trh600", "trh850", "vo850"):
            time_idx = time_idx.intersection(opened[short].time.to_index())

        if len(time_idx) == 0:
            raise ValueError("No common times across fields")

        common_time = xr.DataArray(time_idx, dims=("time",), name="time")

        # --- 2) Restrict every dataset to the common timestamps.
        # These are new objects; the file handles stay owned by `opened`.
        aligned = {short: ds.reindex(time=common_time) for short, ds in opened.items()}

        # --- 3) Extract variables at their (single) level
        # assumes each cleaned file has a length-1 `level` dimension - TODO confirm
        u200 = aligned["uv200"]["u"].isel(level=0, drop=True)
        v200 = aligned["uv200"]["v"].isel(level=0, drop=True)
        u850 = aligned["uv850"]["u"].isel(level=0, drop=True)
        v850 = aligned["uv850"]["v"].isel(level=0, drop=True)

        r600 = aligned["trh600"]["r"].isel(level=0, drop=True)
        t850 = aligned["trh850"]["t"].isel(level=0, drop=True)
        vo850 = aligned["vo850"]["vo"].isel(level=0, drop=True)

        shear_u = u200 - u850
        shear_v = v200 - v850
        shear_mag_850_200 = np.sqrt(shear_u**2 + shear_v**2)

        feat = xr.Dataset(
            {
                "vo850": vo850,
                "shear_mag_850_200": shear_mag_850_200,
                "r600": r600,
                "t850": t850,
            },
            coords={
                "time": common_time,
                "latitude": aligned["uv200"].latitude,
                "longitude": aligned["uv200"].longitude,
            },
        )

        # Pull the values into memory *before* the sources are closed, so the
        # returned dataset never has to touch the input files again.
        return feat.load()
    finally:
        # --- 4) Clean up: close the handles that open_clean_file returned.
        # NOTE(review): closing the reindexed copies (as the previous version
        # did) does not appear to release the underlying files - confirm
        # against the xarray version in use.
        for ds in opened.values():
            ds.close()
        gc.collect()


# NOTE(review): FEATURE_DIR is already defined (same value) and created in the
# configuration cell at the top; this repeats it so the cell also works when
# that assignment was skipped. Keep the two values in sync if either changes.
FEATURE_DIR = r"C:\Users\_s2218026\Documents\lab\features_safe"
os.makedirs(FEATURE_DIR, exist_ok=True)

def save_features_month(year, month):
    """Compute and write the feature file for one month, unless it exists.

    The output is ``era_<year>_<mm>_features.nc`` in ``FEATURE_DIR``, stored
    as zlib-compressed float32. The file is first written to a temporary
    path and then moved into place, so an interrupted or failed write never
    leaves a truncated file that later runs would skip as "existing".

    Parameters
    ----------
    year, month : int
        Month to process.

    Returns
    -------
    None
        Progress is reported via ``print``; failures while building the
        features are reported via ``warnings.warn`` and the month is skipped.
        Errors raised while writing the file propagate to the caller.
    """
    outpath = os.path.join(FEATURE_DIR, f"era_{year}_{month:02d}_features.nc")
    if os.path.exists(outpath):
        print(f"[{year}-{month:02d}] features exist, skip.")
        return

    # Best effort by design: a month that cannot be built is reported and
    # skipped so the surrounding loop can continue with the next one.
    try:
        feat = compute_features_month(year, month)
    except FileNotFoundError as e:
        warnings.warn(f"[{year}-{month:02d}] missing file: {e}")
        return
    except Exception as e:
        warnings.warn(f"[{year}-{month:02d}] feature build failed: {e}")
        return

    enc = {v: {"dtype": "float32", "zlib": True, "complevel": 3} for v in feat.data_vars}
    tmp_path = outpath + ".tmp"
    try:
        feat.astype("float32").to_netcdf(tmp_path, encoding=enc)
        # Only a fully written file ever appears under the final name.
        os.replace(tmp_path, outpath)
    finally:
        feat.close()
        # After a successful replace the temp file is gone; after a failure,
        # remove the partial file. Cleanup must not mask the original error.
        if os.path.exists(tmp_path):
            try:
                os.remove(tmp_path)
            except OSError:
                pass
        gc.collect()
    print(f"[{year}-{month:02d}] features -> {outpath}")


In [12]:
# Build (or skip, if already on disk) the feature file of every month
# from January 1994 through December 2024.
for y, m in ((yy, mm) for yy in range(1994, 2025) for mm in range(1, 13)):
    save_features_month(y, m)


[1994-01] features exist, skip.
[1994-02] features exist, skip.
[1994-03] features exist, skip.
[1994-04] features exist, skip.
[1994-05] features exist, skip.
[1994-06] features exist, skip.
[1994-07] features exist, skip.
[1994-08] features exist, skip.
[1994-09] features exist, skip.
[1994-10] features exist, skip.
[1994-11] features exist, skip.
[1994-12] features exist, skip.
[1995-01] features exist, skip.
[1995-02] features exist, skip.
[1995-03] features exist, skip.
[1995-04] features exist, skip.
[1995-05] features exist, skip.
[1995-06] features exist, skip.
[1995-07] features exist, skip.
[1995-08] features exist, skip.
[1995-09] features exist, skip.
[1995-10] features exist, skip.
[1995-11] features exist, skip.
[1995-12] features exist, skip.
[1996-01] features exist, skip.
[1996-02] features exist, skip.
[1996-03] features exist, skip.
[1996-04] features exist, skip.
[1996-05] features exist, skip.
[1996-06] features exist, skip.
[1996-07] features exist, skip.
[1996-08



[2000-08] features exist, skip.
[2000-09] features exist, skip.
[2000-11] features exist, skip.
[2001-01] features exist, skip.
[2001-03] features exist, skip.
[2001-04] features exist, skip.
[2001-05] features exist, skip.
[2001-07] features exist, skip.
[2001-08] features exist, skip.




[2001-10] features exist, skip.
[2001-12] features exist, skip.
[2002-01] features exist, skip.
[2002-03] features exist, skip.
[2002-04] features exist, skip.
[2002-05] features exist, skip.
[2002-06] features exist, skip.
[2002-07] features exist, skip.
[2002-08] features exist, skip.
[2002-09] features exist, skip.
[2002-10] features exist, skip.
[2002-12] features exist, skip.
[2003-01] features exist, skip.
[2003-02] features exist, skip.
[2003-03] features exist, skip.
[2003-04] features exist, skip.
[2003-06] features exist, skip.
[2003-07] features exist, skip.
[2003-09] features exist, skip.
[2003-10] features exist, skip.


https://docs.xarray.dev/en/stable/getting-started-guide/installing.html
https://docs.xarray.dev/en/stable/user-guide/io.html


[2003-12] features exist, skip.
[2004-02] features exist, skip.
[2004-04] features exist, skip.
[2004-05] features exist, skip.
[2004-07] features exist, skip.
[2004-08] features exist, skip.




[2004-10] features exist, skip.
[2004-11] features exist, skip.
[2004-12] features exist, skip.
[2005-02] features exist, skip.
[2005-03] features exist, skip.
[2005-04] features exist, skip.
[2005-05] features exist, skip.
[2005-06] features exist, skip.
[2005-07] features exist, skip.
[2005-08] features exist, skip.
[2005-09] features exist, skip.
[2005-11] features exist, skip.
[2006-02] features exist, skip.




[2006-04] features exist, skip.
[2006-07] features exist, skip.
[2006-08] features exist, skip.
[2006-10] features exist, skip.
[2006-12] features exist, skip.
[2007-01] features exist, skip.
[2007-02] features exist, skip.




[2007-04] features exist, skip.
[2007-05] features exist, skip.
[2007-07] features exist, skip.
[2007-08] features exist, skip.
[2007-10] features exist, skip.
[2007-11] features exist, skip.
[2007-12] features exist, skip.
[2008-01] features exist, skip.
[2008-02] features exist, skip.
[2008-03] features exist, skip.
[2008-04] features exist, skip.
[2008-05] features exist, skip.
[2008-06] features exist, skip.
[2008-07] features exist, skip.
[2008-08] features exist, skip.
[2008-09] features exist, skip.
[2008-10] features exist, skip.
[2008-11] features exist, skip.
[2008-12] features exist, skip.
[2009-01] features exist, skip.
[2009-02] features exist, skip.
[2009-03] features exist, skip.
[2009-04] features exist, skip.
[2009-05] features exist, skip.
[2009-06] features exist, skip.
[2009-07] features exist, skip.
[2009-08] features exist, skip.
[2009-09] features exist, skip.
[2009-10] features exist, skip.
[2009-11] features exist, skip.
[2009-12] features exist, skip.
[2010-01

In [16]:
import os

# Audit which per-month feature files exist on disk.
# NOTE(review): this covers 1994-2023, while the build loop above runs
# through 2024 - confirm which end year is intended.
FEATURE_DIR = r"C:\Users\_s2218026\Documents\lab\features_safe"
present = []
missing = []

for y in range(1994, 2024):
    for m in range(1, 13):
        path = os.path.join(FEATURE_DIR, f"era_{y}_{m:02d}_features.nc")
        # File the (year, month) pair under whichever list matches its state.
        (present if os.path.exists(path) else missing).append((y, m))

print("Total present:", len(present))
print("Total missing:", len(missing))
# Only the first 32 missing months are listed.
print(" missing:", missing[:32])


Total present: 328
Total missing: 32
 missing: [(1999, 11), (2000, 1), (2000, 3), (2000, 5), (2000, 7), (2000, 10), (2000, 12), (2001, 2), (2001, 6), (2001, 9), (2001, 11), (2002, 2), (2002, 11), (2003, 5), (2003, 8), (2003, 11), (2004, 1), (2004, 3), (2004, 6), (2004, 9), (2005, 1), (2005, 10), (2005, 12), (2006, 1), (2006, 3), (2006, 5), (2006, 6), (2006, 9), (2006, 11), (2007, 3), (2007, 6), (2007, 9)]
