#### Imports

In [2]:
# Core scientific and data libraries
import numpy as np
from pathlib import Path
from src.coordinate_utils import *
import dask
from dask.distributed import Client

#### Variables


In [3]:
dataset_url = 'gs://gcp-public-data-arco-era5/co/single-level-reanalysis.zarr'
relevant_regions = ['Central_Arctic', 'Beaufort', 'Chukchi-NA', 'Chukchi-Asia', 'E_Siberian', 'Laptev', 'Kara', 'Barents', 'E_Greenland', 'Baffin', 'Hudson', 'Can_Arch', 'Bering-NA', 'Bering-Asia', 'pan_arctic']

In [4]:
dataset = xr.open_zarr(dataset_url, chunks={'time': 720 , 'values': 'auto'},
                       consolidated=True, decode_timedelta=False)
dataset = dataset.assign_coords(longitude=((dataset.longitude + 180) % 360) - 180)


  dataset = xr.open_zarr(dataset_url, chunks={'time': 720 , 'values': 'auto'},


In [5]:
selected_vars_dataset = dataset[["t2m", "u10", "v10", "msl"]]

#### Helper Functions

In [6]:
def create_and_cache_mask(region_name):
    """
    Creates and caches a mask for a specified geographical region.

    This function generates a mask by determining whether geographical points
    belong to a specified region shape, using longitude and latitude coordinates
    from a dataset. The mask is then cached as a .npy file for future use.
    It supports both "pan_arctic" and other custom region names.

    Parameters:
        region_name (str): Name of the geographical region for which the mask
            is to be created. If "pan_arctic", a predefined shape is used, otherwise
            a specified regional shape is retrieved.

    Raises:
        Any exceptions associated with file I/O or operations on the dataset.

    Returns:
        numpy.ndarray: A boolean array where each element indicates whether
            the corresponding geographical point is within the specified region.
    """
    coords_only = xr.open_zarr(dataset_url, chunks={'values': -1},
                               decode_timedelta=False, )
    lon = coords_only["longitude"].compute().values
    lon = lon_to_180(lon)
    lat = coords_only["latitude"].compute().values

    if region_name == "pan_arctic":
        geom = get_pan_arctic_shape()
    else:
        geom = get_region_shape(region_name)

    pts = shapely.points(lon, lat)
    mask_vals = shapely.contains(geom, pts)
    true_count = int(np.count_nonzero(mask_vals))

    mask_file = f'../data/processed/area_masks/{region_name}_mask.npy'
    Path(mask_file).parent.mkdir(parents=True, exist_ok=True)
    np.save(mask_file, mask_vals)
    return mask_vals


def load_and_apply_mask(dataset, region_name="pan_arctic"):
    """
    Loads a dataset, applies a mask, and returns the masked dataset. If the mask
    does not already exist in the specified path, it is created and cached.

    Parameters:
    dataset (xarray.Dataset): The dataset to which the mask will be applied.
    region_name (str): The name of the region defining the mask. Defaults to
        "pan_arctic".

    Returns:
    xarray.Dataset: The dataset after applying the mask.

    Raises:
    FileNotFoundError: If the mask file cannot be created or found in the specified
        path.
    """
    mask_file = f'../data/processed/area_masks/{region_name}_mask.npy'

    if not Path(mask_file).exists():
        mask_vals = create_and_cache_mask(region_name)
    else:
        mask_vals = np.load(mask_file)

    values_dim = dataset.sizes.get("values", None)
    time_dim = dataset.sizes.get("time", None)
    mask = xr.DataArray(mask_vals, dims=("values",))
    masked = dataset.where(mask, drop=True)
    values_dim_after = masked.sizes.get("values", None)
    return masked


def list_available_masks():
    """
    Lists all available masks from the specified directory.

    This function scans through a designated directory to identify files with
    a specific naming pattern. It extracts the base names of the mask files,
    removes unnecessary suffixes, and returns them in a sorted order. If the
    designated directory is not found, an empty list is returned.

    Returns
    -------
    list of str
        A sorted list of available mask names (excluding file suffixes and
        unnecessary suffixes), or an empty list if the directory does not exist.
    """
    mask_dir = Path('../data/processed/area_masks')
    if mask_dir.exists():
        masks = [f.stem.replace('_mask', '') for f in mask_dir.glob('*_mask.npy')]
        masks_sorted = sorted(masks)
        return masks_sorted
    return []


In [7]:
client = Client(n_workers=2, threads_per_worker=6, memory_limit='8GB')

In [8]:
masked_dataset = load_and_apply_mask(selected_vars_dataset, region_name='Beaufort')
masked_dataset = masked_dataset.persist()

In [9]:
masked_dataset.chunks

ERROR! Session/line number was not unique in database. History logging moved to new session 76


Frozen({'time': (720, 720, 720, 720, 720, 720, 720, 720, 720, 720, 720, 720, 720, 720, 720, 720, 720, 720, 720, 720, 720, 720, 720, 720, 720, 720, 720, 720, 720, 720, 720, 720, 720, 720, 720, 720, 720, 720, 720, 720, 720, 720, 720, 720, 720, 720, 720, 720, 720, 720, 720, 720, 720, 720, 720, 720, 720, 720, 720, 720, 720, 720, 720, 720, 720, 720, 720, 720, 720, 720, 720, 720, 720, 720, 720, 720, 720, 720, 720, 720, 720, 720, 720, 720, 720, 720, 720, 720, 720, 720, 720, 720, 720, 720, 720, 720, 720, 720, 720, 720, 720, 720, 720, 720, 720, 720, 720, 720, 720, 720, 720, 720, 720, 720, 720, 720, 720, 720, 720, 720, 720, 720, 720, 720, 720, 720, 720, 720, 720, 720, 720, 720, 720, 720, 720, 720, 720, 720, 720, 720, 720, 720, 720, 720, 720, 720, 720, 720, 720, 720, 720, 720, 720, 720, 720, 720, 720, 720, 720, 720, 720, 720, 720, 720, 720, 720, 720, 720, 720, 720, 720, 720, 720, 720, 720, 720, 720, 720, 720, 720, 720, 720, 720, 720, 720, 720, 720, 720, 720, 720, 720, 720, 720, 720, 720, 720, 720

In [8]:
daily_means = masked_dataset.resample(time='1D').mean()
daily_means = daily_means.compute()
df = daily_means.to_dataframe().reset_index()

This may cause some slowdown.
Consider loading the data with Dask directly
 or using futures or delayed objects to embed the data into the graph without repetition.
See also https://docs.dask.org/en/stable/best-practices.html#load-data-with-dask for more information.

KeyboardInterrupt


KeyboardInterrupt



In [18]:
client.close()