[Reference](https://medium.com/pineview-labs/cloud-native-pipelines-for-scientific-data-processing-with-prefect-and-dask-4ddd8ecfcd40)

# Run an xarray/Zarr example

In [1]:
# create a venv first
# NOTE(review): each `!` line appears to run in its own shell, so activating
# the venv on one line would not carry over to the `!pip install` below --
# confirm which environment the packages are installed into.
!python3 -m venv .venv
!source .venv/bin/activate

# install dependencies
!pip install xarray zarr dask s3fs

Error: Command '['/content/.venv/bin/python3', '-m', 'ensurepip', '--upgrade', '--default-pip']' returned non-zero exit status 1.
/bin/bash: line 1: .venv/bin/activate: No such file or directory
Collecting zarr
  Downloading zarr-3.1.3-py3-none-any.whl.metadata (10 kB)
Collecting s3fs
  Downloading s3fs-2025.9.0-py3-none-any.whl.metadata (1.4 kB)
Collecting donfig>=0.8 (from zarr)
  Downloading donfig-0.8.1.post1-py3-none-any.whl.metadata (5.0 kB)
Collecting numcodecs>=0.14 (from numcodecs[crc32c]>=0.14->zarr)
  Downloading numcodecs-0.16.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.3 kB)
Collecting aiobotocore<3.0.0,>=2.5.4 (from s3fs)
  Downloading aiobotocore-2.24.2-py3-none-any.whl.metadata (25 kB)
Collecting fsspec>=2021.09.0 (from dask)
  Downloading fsspec-2025.9.0-py3-none-any.whl.metadata (10 kB)
Collecting aioitertools<1.0.0,>=0.5.1 (from aiobotocore<3.0.0,>=2.5.4->s3fs)
  Downloading aioitertools-0.12.0-py3-none-any.whl.metadata (3.8 kB)
Collectin

In [2]:
import xarray as xr
import numpy as np
import zarr

# Create a small example dataset and write with a chunking strategy.
# The single variable, Sv, is float32 random data on a (time, range_bin) grid.
time = np.arange(0, 3600, 1)  # seconds
range_bin = np.arange(0, 1800, 1)  # samples

ds = xr.Dataset(
    data_vars=dict(
        Sv=(["time", "range_bin"], np.random.randn(time.size, range_bin.size).astype("float32")),
    ),
    coords=dict(time=("time", time), range_bin=("range_bin", range_bin)),
    attrs={"convention": "SONAR-netCDF4-like (illustrative)"},
)

# chunk sizes should reflect read patterns (time-scan, or depth stripes, etc.)
ds_chunked = ds.chunk({"time": 300, "range_bin": 256})

# Write to a local Zarr store (mode="w" overwrites an existing store)
out = "sonar_scan.zarr"
ds_chunked.to_zarr(out, mode="w")

# Lazy open without loading into memory
ds2 = xr.open_zarr(out)



# Save the Zarr store to Amazon S3

## Create a free AWS account, then configure the AWS CLI with its credentials

```
aws configure
```

## Create a new S3 bucket with anonymous access
```
aws s3 mb s3://xarray-zarr-demo

aws s3api put-public-access-block \
  --bucket xarray-zarr-demo \
  --public-access-block-configuration "BlockPublicAcls=false,IgnorePublicAcls=false,BlockPublicPolicy=false,RestrictPublicBuckets=false"
# Equivalent call using JSON syntax instead of shorthand (run either one, not both)
aws s3api put-public-access-block \
  --bucket xarray-zarr-demo \
  --public-access-block-configuration '{
    "BlockPublicAcls": false,
    "IgnorePublicAcls": false,
    "BlockPublicPolicy": false,
    "RestrictPublicBuckets": false
  }'
```

## Save the dataset to the S3 Zarr store

In [4]:
# Upload the chunked dataset to the S3 bucket as a consolidated,
# Zarr-format-2 store; mode="w" overwrites a store already at that location.
ds_chunked.to_zarr(
    "s3://xarray-zarr-demo/sonar_scan.zarr",
    mode="w",
    consolidated=True,
    zarr_format=2,
)

# Re-open the store straight from the bucket
ds_from_s3 = xr.open_zarr("s3://xarray-zarr-demo/sonar_scan.zarr")

In [5]:
# load directly from S3
ds_from_s3 = xr.open_zarr("s3://xarray-zarr-demo/sonar_scan.zarr")

# index-based window, taken from the S3-backed dataset opened above so that
# this cell actually exercises the remote store; .load() pulls the selected
# window into memory
subset = ds_from_s3["Sv"].isel(time=slice(600, 1200), range_bin=slice(0, 512)).load()

# Processing the raw data

```
# activate the venv if needed
source .venv/bin/activate

pip install echopype
```

In [6]:
from pathlib import Path
from typing import Optional, Dict

from prefect import flow, task
from dask.distributed import LocalCluster
from prefect_dask import DaskTaskRunner
from prefect.futures import as_completed


# Directory passed as the `input_dir` parameter when the flow is served from
# this script (a relative path).
DEFAULT_INPUT_DIR = Path("../raw_data")
# Bucket name passed as the `s3_bucket` parameter when the flow is served.
S3_BUCKET_NAME = "xarray-zarr-demo"


@task(
    retries=3,
    retry_delay_seconds=60,
    task_run_name="convert-to-zarr-{raw_path}",
)
def convert_single_raw_to_zarr(
    raw_path: str,
    s3_bucket: str,
    s3_prefix: str = "",
    sonar_model: str = "EK60",
    overwrite: bool = True,
    storage_options: Optional[Dict] = None
) -> str:
    """Open one RAW file with echopype and write it to a Zarr store on S3.

    The store is named after the input file's stem and placed at
    ``s3://<s3_bucket>/<s3_prefix>/<stem>.zarr``; leading and trailing slashes
    on ``s3_prefix`` are ignored, and an empty prefix puts the store at the
    bucket root.

    Args:
        raw_path: Path of the RAW file to convert.
        s3_bucket: Name of the destination bucket.
        s3_prefix: Optional key prefix inside the bucket.
        sonar_model: Passed to ``echopype.open_raw`` as ``sonar_model``.
        overwrite: Passed to ``to_zarr`` as ``overwrite``.
        storage_options: Passed to ``to_zarr`` as ``output_storage_options``.

    Returns:
        The ``s3://`` URI the store was written to.

    Raises:
        FileNotFoundError: If ``raw_path`` does not exist.
    """
    # Imported inside the task so echopype is only required where the task runs.
    import echopype as ep

    source = Path(raw_path)
    if not source.exists():
        raise FileNotFoundError(f"Input RAW file not found: {raw_path}")

    store_name = f"{source.stem}.zarr"
    prefix = s3_prefix.strip("/")
    object_key = f"{prefix}/{store_name}" if prefix else store_name
    destination = f"s3://{s3_bucket}/{object_key}"

    echodata = ep.open_raw(str(source), sonar_model=sonar_model)
    echodata.to_zarr(
        destination,
        overwrite=overwrite,
        output_storage_options=storage_options,
    )
    return destination


@flow(
    name="convert-raw-to-zarr",
    log_prints=True,
    task_runner=DaskTaskRunner(address="tcp://127.0.0.1:8786")
)
def convert_raw_to_zarr(
    input_dir: str,
    s3_bucket: str,
    s3_prefix: str = "",
    sonar_model: str = "EK60",
    overwrite: bool = True,
    glob_pattern: str = "*.raw",
    storage_options: Optional[Dict] = None
):
    """Convert every matching file in ``input_dir`` to a Zarr store on S3.

    Files matching ``glob_pattern`` are processed in sorted order, one
    ``convert_single_raw_to_zarr`` task per file. Submission is throttled:
    once ``batch_size`` futures are tracked as in flight, the flow waits for
    one of them to complete before submitting the next file.

    Args:
        input_dir: Directory to search for input files.
        s3_bucket: Destination bucket, forwarded to each task.
        s3_prefix: Key prefix inside the bucket, forwarded to each task.
        sonar_model: Sonar model name, forwarded to each task.
        overwrite: Overwrite flag, forwarded to each task.
        glob_pattern: Glob pattern applied to ``input_dir``.
        storage_options: Storage options, forwarded to each task.

    Returns:
        None. If no file matches, a message is printed and the flow returns
        without submitting anything.

    Raises:
        FileNotFoundError: If ``input_dir`` does not exist.
        Exception: Whatever ``.result()`` raises for a failed conversion.
    """
    input_path = Path(input_dir)
    if not input_path.exists():
        raise FileNotFoundError(f"Input directory not found: {input_dir}")

    raw_files = sorted(input_path.glob(glob_pattern))
    if not raw_files:
        print(f"No files matching '{glob_pattern}' found in {input_dir}.")
        return

    batch_size = 2
    in_flight = []   # futures currently counted against the throttle
    submitted = []   # every future, kept so each outcome is checked at the end

    for rp in raw_files:
        # Named `future` rather than `task` to avoid shadowing the imported
        # `task` decorator inside this function.
        future = convert_single_raw_to_zarr.submit(
            raw_path=str(rp),
            s3_bucket=s3_bucket,
            s3_prefix=s3_prefix,
            sonar_model=sonar_model,
            overwrite=overwrite,
            storage_options=storage_options
        )
        in_flight.append(future)
        submitted.append(future)

        if len(in_flight) >= batch_size:
            # Block until any one tracked future completes, then free its slot.
            finished = next(as_completed(in_flight))
            in_flight.remove(finished)

    # Resolve every future, including those already released by the throttle,
    # so that a failure in any conversion is raised here rather than dropped.
    for future in submitted:
        future.result()


if __name__ == "__main__":
    # Start a local Dask cluster whose scheduler listens on port 8786 (the
    # port in the address given to the flow's DaskTaskRunner), then serve the
    # flow. The context managers close the client and the cluster when
    # serving stops, including when it stops because of an exception.
    with LocalCluster(
        n_workers=2,
        scheduler_port=8786,
        threads_per_worker=1,
        memory_limit="8GB"
    ) as cluster, cluster.get_client():
        convert_raw_to_zarr.serve(
            name="convert-raw-to-zarr-serve",
            parameters={
                "input_dir": str(DEFAULT_INPUT_DIR),
                "s3_bucket": S3_BUCKET_NAME,
                "s3_prefix": "echodata",
                "sonar_model": "EK60",
                "overwrite": True,
                "glob_pattern": "*.raw",
                "storage_options": {}
            },
        )