#### Imports

In [10]:
# Core scientific and data libraries
import numpy as np
import pandas as pd
import dask
import xarray as xr
import zarr
import shapely
import geopandas as gpd
from shapely.lib import unary_union
from pathlib import Path
import pyarrow as pa
import pyarrow.parquet as pq
from dask.distributed import Client
import time



#### Variables


In [11]:
# Location of the Zarr store (a gs:// URL) that the cells below open with xr.open_zarr.
dataset_url = 'gs://gcp-public-data-arco-era5/co/single-level-reanalysis.zarr'

#### Helper Functions

In [12]:
def lon_to_360(x):
    """Wrap a longitude given in degrees onto the [0, 360) interval.

    Accepts anything that supports ``%`` and ``+`` with integers (plain
    numbers, NumPy arrays, ...), and returns the same kind of object.

    The second modulo is deliberate: for floats, a tiny negative input can
    round up to exactly 360.0 after the first pass, and the second pass folds
    that back to 0.
    """
    wrapped = x % 360
    return (wrapped + 360) % 360

def get_region_shape(region, shapefile = '../data/raw/shapefiles_regions/NSIDC-0780_SeaIceRegions_NH_v1.0.shp'):
    """Return a single geometry covering every feature named ``region``.

    Parameters
    ----------
    region : str
        Value to match against the shapefile's ``Region`` column.
    shapefile : str or path-like
        Shapefile to read the region outlines from.

    Returns
    -------
    shapely geometry
        Union of all matching features, in EPSG:4326 (lon/lat degrees) when
        the shapefile declares a CRS.

    Raises
    ------
    ValueError
        If no feature in the shapefile has the requested region name.
    """
    gdf = gpd.read_file(shapefile)

    # The result is compared against lon/lat points, so reproject to
    # geographic coordinates, as get_pan_arctic_shape does. A file without a
    # declared CRS cannot be reprojected and is used as-is.
    if gdf.crs is not None:
        gdf = gdf.to_crs("EPSG:4326")

    region_of_interest = gdf[gdf['Region'] == region]
    if region_of_interest.empty:
        raise ValueError(f"Region {region!r} not found in {shapefile}")

    # shapely.union_all merges all rows into one geometry. The low-level
    # shapely.lib.unary_union ufunc works element-wise, so combining it with
    # `.iloc[0]` silently kept only the first matching feature.
    return shapely.union_all(np.asarray(region_of_interest.geometry))

def get_pan_arctic_shape():
    """Return one geometry covering all non-excluded sea-ice regions.

    Reads the NSIDC-0780 Northern Hemisphere regions shapefile, reprojects it
    to EPSG:4326, drops the regions listed in ``excl`` and merges everything
    that remains into a single geometry.

    Returns
    -------
    shapely geometry
        Union of every retained region, in lon/lat degrees.
    """
    gdf = gpd.read_file('../data/raw/shapefiles_regions/NSIDC-0780_SeaIceRegions_NH_v1.0.shp').to_crs("EPSG:4326")

    name_col = "Region"
    excl = {"Baltic","Japan","Bohai","Gulf_Alaska","St_Lawr","Okhotsk"}
    keep = ~gdf[name_col].isin(excl)

    # shapely.union_all merges all retained rows into one geometry. The
    # low-level shapely.lib.unary_union ufunc works element-wise, so combining
    # it with `.iloc[0]` returned only the first retained region.
    # NOTE(review): outlines that enclose the pole or cross the antimeridian
    # may not survive a vertex-wise reprojection to EPSG:4326 — confirm the
    # merged shape looks right before relying on the mask.
    return shapely.union_all(np.asarray(gdf.loc[keep].geometry))

def slice_dataset_to_region(dataset, region):
    """Keep only the grid points of ``dataset`` that lie inside ``region``.

    ``region`` is either the literal ``"pan_arctic"`` or a name understood by
    ``get_region_shape``. The dataset's ``longitude`` / ``latitude`` values
    are turned into points, tested against the region geometry, and the
    resulting boolean mask (along the ``values`` dimension) is applied with
    ``where(..., drop=True)``.
    """
    geom = get_pan_arctic_shape() if region == "pan_arctic" else get_region_shape(region)

    # Materialise the coordinates as NumPy arrays for the vectorised test.
    lon = dataset["longitude"].compute().values
    lat = dataset["latitude"].compute().values
    inside = shapely.contains(geom, shapely.points(lon, lat))

    return dataset.where(xr.DataArray(inside, dims=("values",)), drop=True)


In [13]:
# Open the Zarr store lazily; timedelta decoding is switched off explicitly.
reanalysis = xr.open_zarr(
    dataset_url,
    consolidated=True,
    decode_timedelta=False,
    chunks={"time": 48, "values": "auto"},
)

# Re-express longitudes on [-180, 180).
reanalysis = reanalysis.assign_coords(
    longitude=(reanalysis["longitude"] + 180) % 360 - 180
)

To continue decoding into a timedelta64 dtype, either set `decode_timedelta=True` when opening this dataset, or add the attribute `dtype='timedelta64[ns]'` to this variable on disk.
To opt-in to future behavior, set `decode_timedelta=False`.
  reanalysis = xr.open_zarr(
  reanalysis = xr.open_zarr(


In [14]:
def create_and_cache_mask(mask_path='../data/processed/pan_arctic_mask.npy'):
    """Compute the pan-Arctic point mask and save it as a ``.npy`` file.

    Opens the store at ``dataset_url`` just to read its ``longitude`` and
    ``latitude`` coordinates, tests every grid point against the geometry
    from ``get_pan_arctic_shape`` and writes the boolean result to disk.

    Parameters
    ----------
    mask_path : str or path-like, optional
        Where to save the mask. Defaults to the location that
        ``load_and_apply_mask`` reads from.

    Returns
    -------
    numpy.ndarray
        Boolean array, True where the grid point lies inside the geometry.
    """
    coords_only = xr.open_zarr(dataset_url, chunks={'values': -1},
                               decode_timedelta=False,)
    lon = coords_only["longitude"].compute().values
    lat = coords_only["latitude"].compute().values

    # This function re-opens the store, so the [-180, 180) wrap applied to
    # `reanalysis` elsewhere is not in effect here. The region geometry is in
    # EPSG:4326, so wrap before testing; the operation is a no-op for values
    # that are already in range.
    lon = ((lon + 180) % 360) - 180

    geom = get_pan_arctic_shape()
    pts = shapely.points(lon, lat)
    mask_vals = shapely.contains(geom, pts)

    # np.save does not create missing directories.
    Path(mask_path).parent.mkdir(parents=True, exist_ok=True)
    np.save(mask_path, mask_vals)
    return mask_vals

def load_and_apply_mask(dataset):
    """Subset ``dataset`` with the mask stored in ``pan_arctic_mask.npy``.

    The saved array is interpreted as a mask along the ``values`` dimension;
    entries where it is False are dropped from the returned dataset.
    """
    return dataset.where(
        xr.DataArray(
            np.load('../data/processed/pan_arctic_mask.npy'),
            dims=("values",),
        ),
        drop=True,
    )

In [16]:
# Rebuild the mask file on disk, then keep only the grid points it selects.
create_and_cache_mask()
ds_roi = load_and_apply_mask(dataset=reanalysis)

To continue decoding into a timedelta64 dtype, either set `decode_timedelta=True` when opening this dataset, or add the attribute `dtype='timedelta64[ns]'` to this variable on disk.
To opt-in to future behavior, set `decode_timedelta=False`.
  coords_only = xr.open_zarr(dataset_url, chunks={'values': -1})


In [17]:

# Rechunk: 1440 time steps per chunk, all remaining grid points in one chunk.
ds_roi = ds_roi.chunk(dict(time=1440, values=-1))

# cos(latitude) weights, used for the weighted mean over "values" further down.
# NOTE(review): if this store is on a reduced Gaussian grid (a single "values"
# dimension suggests it may be), points are already roughly equal-area and
# cos(lat) weighting would over-weight low latitudes — confirm.
lat = ds_roi.latitude
w = xr.DataArray(np.cos(np.deg2rad(lat)), dims=("values",))


In [18]:
def check_for_duplicates(year, base_dir="../data/processed"):
    """Report whether the yearly parquet file for ``year`` is already on disk.

    Parameters
    ----------
    year : int or str
        Year used to build the file name ``pan_arctic_<year>.parquet``.
    base_dir : str or path-like
        Directory the file is expected in.

    Returns
    -------
    tuple[bool, str]
        ``(exists, path)``: whether the file exists, and its path as a string.
        A "[skip ]" line is printed when it does.
    """
    outpath = Path(base_dir).joinpath(f"pan_arctic_{year}.parquet")
    found = outpath.exists()
    if found:
        print(f"[skip ] {year} → {outpath} (already exists)")
    return found, str(outpath)


# Local Dask cluster used for the computations below. It is not closed in
# this cell; call client.close() once processing is finished.
client = Client(n_workers=4, threads_per_worker=3, memory_limit="8GB")

# 1979 through 2021 inclusive; years whose output file already exists are skipped.
years = range(1979, 2022)
for y in years:
    exists, outpath = check_for_duplicates(y)
    if exists:
        continue

    print(f"[start] {y}")
    t0 = time.perf_counter()
    # Daily means of the three variables, then a weighted mean over "values".
    block = (ds_roi[["t2m", "u10", "v10"]]
             .sel(time=slice(f"{y}-01-01", f"{y}-12-31"))
             .resample(time="1D").mean()
             .weighted(w).mean("values"))

    # t2m_c is t2m minus 273.15 (presumably Kelvin → °C); the original column is dropped.
    df_y = (block.assign(t2m_c=lambda d: d.t2m - 273.15)
            .drop_vars("t2m")
            .to_dataframe()
            .reset_index()
            .rename(columns={"time": "date"}))
    df_y["region"] = "pan_arctic"
    df_y = df_y[["date", "region", "t2m_c", "u10", "v10"]]
    Path(outpath).parent.mkdir(parents=True, exist_ok=True)

    table = pa.Table.from_pandas(df_y, preserve_index=False)
    # Write under a temporary name and rename only once the file is complete.
    # An interrupted run then cannot leave a truncated parquet behind that the
    # skip-if-exists check above would treat as finished.
    tmp_path = Path(f"{outpath}.tmp")
    pq.write_table(table, str(tmp_path))
    tmp_path.replace(outpath)
    dt = time.perf_counter() - t0
    print(f"[done ] {y} → {outpath} ({dt:.1f}s)")


Perhaps you already have a cluster running?
Hosting the HTTP server on port 64961 instead


[skip ] 1979 → ..\data\processed\pan_arctic_1979.parquet (already exists)
[skip ] 1980 → ..\data\processed\pan_arctic_1980.parquet (already exists)
[skip ] 1981 → ..\data\processed\pan_arctic_1981.parquet (already exists)
[skip ] 1982 → ..\data\processed\pan_arctic_1982.parquet (already exists)
[skip ] 1983 → ..\data\processed\pan_arctic_1983.parquet (already exists)
[skip ] 1984 → ..\data\processed\pan_arctic_1984.parquet (already exists)
[skip ] 1985 → ..\data\processed\pan_arctic_1985.parquet (already exists)
[skip ] 1986 → ..\data\processed\pan_arctic_1986.parquet (already exists)
[skip ] 1987 → ..\data\processed\pan_arctic_1987.parquet (already exists)
[skip ] 1988 → ..\data\processed\pan_arctic_1988.parquet (already exists)
[skip ] 1989 → ..\data\processed\pan_arctic_1989.parquet (already exists)
[skip ] 1990 → ..\data\processed\pan_arctic_1990.parquet (already exists)
[skip ] 1991 → ..\data\processed\pan_arctic_1991.parquet (already exists)
[skip ] 1992 → ..\data\processed\pan_a

2025-08-21 17:31:51,523 - ERROR - Task exception was never retrieved
future: <Task finished name='Task-2255634' coro=<Client._gather.<locals>.wait() done, defined at C:\projects\private_projects\arctic-ice-extent\.venv\Lib\site-packages\distributed\client.py:2385> exception=AllExit()>
Traceback (most recent call last):
  File "C:\projects\private_projects\arctic-ice-extent\.venv\Lib\site-packages\distributed\client.py", line 2394, in wait
    raise AllExit()
distributed.client.AllExit


KeyboardInterrupt: 

In [None]:
# Close the Dask client created in the processing cell.
client.close()
