# EDITO Data Lake Playground
This notebook walks through setup, catalog exploration, filtered searches, and asset visualization against the EDITO data API.

## 1. Notebook setup and dependencies
Install the required geospatial stack and configure a retry-friendly HTTP session.

In [1]:
%pip install -q requests pandas xarray matplotlib cartopy python-dotenv rasterio fsspec s3fs


Note: you may need to restart the kernel to use updated packages.


In [2]:
import os
from pathlib import Path
from typing import Any, Dict, List, Optional

import cartopy.crs as ccrs
import cartopy.feature as cfeature
import matplotlib.pyplot as plt
import pandas as pd
import rasterio
import requests
import xarray as xr
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

# Apply the "seaborn-v0_8" Matplotlib style sheet to every figure created in this notebook.
plt.style.use("seaborn-v0_8")

def build_session() -> requests.Session:
    """Return a `requests.Session` that retries transient HTTP failures.

    The session asks for JSON responses and retries GET and POST requests up to
    five times (with exponential backoff) when the server answers with
    429, 500, 502, 503 or 504. The same retrying adapter serves both
    `https://` and `http://` URLs.
    """
    retry_policy = Retry(
        total=5,
        backoff_factor=0.3,
        status_forcelist=[429, 500, 502, 503, 504],
        allowed_methods=("GET", "POST"),
    )
    retrying_adapter = HTTPAdapter(max_retries=retry_policy)

    http_session = requests.Session()
    http_session.headers["Accept"] = "application/json"
    for scheme in ("https://", "http://"):
        http_session.mount(scheme, retrying_adapter)
    return http_session

# Notebook-wide HTTP session; the API helpers and the asset downloader below send their requests through it.
session = build_session()


## 2. Configuring environment variables for API access
Load credentials from the environment (or a `.env` file) and prepare shared paths. Keep tokens out of the notebook itself so they are never committed or shared along with it.

In [3]:
from dotenv import load_dotenv

# Load variables from a local `.env` file (if one exists) into the process environment.
load_dotenv()

# Base URL of the data API, without a trailing slash so paths can be appended directly.
BASE_URL = os.getenv("EDITO_DATA_BASE_URL", "https://api.dive.edito.eu/data").rstrip("/")
# Environment variables checked, in order, for a bearer token.
TOKEN_ENV_NAMES = ("EDITO_API_TOKEN", "EDITO_ACCESS_TOKEN")
# SECURITY: never store a real token in the notebook. A personal access token was
# previously hardcoded here; it must be treated as compromised and revoked.
# Supply the token through one of TOKEN_ENV_NAMES (for example in an untracked
# `.env` file). This placeholder stays empty and only exists as a last-resort
# fallback for `resolve_token`.
MANUAL_TOKEN = ""


def resolve_token() -> tuple[Optional[str], str]:
    """Choose the API token and report which source supplied it.

    The environment variables listed in `TOKEN_ENV_NAMES` are checked in order
    and the first non-empty value wins. When none is set, `MANUAL_TOKEN` is
    returned together with the label "MANUAL_TOKEN".

    Returns:
        A `(token, source_name)` pair.
    """
    env_match = next(
        ((token, name) for name in TOKEN_ENV_NAMES if (token := os.getenv(name))),
        None,
    )
    if env_match is None:
        return MANUAL_TOKEN, "MANUAL_TOKEN"
    return env_match

# Directory that downloaded assets are written to; created on first run.
DOWNLOAD_DIR = Path(os.getenv("EDITO_DOWNLOAD_DIR", "../data")).resolve()
DOWNLOAD_DIR.mkdir(parents=True, exist_ok=True)

# Resolve the token once and remember where it came from for the status message.
API_TOKEN, TOKEN_NAME = resolve_token()

# Drop any header left over from an earlier run of this cell before deciding
# whether to attach a fresh one, so re-running the cell is idempotent.
session.headers.pop("Authorization", None)
if not API_TOKEN:
    print("Warning: EDITO_API_TOKEN/EDITO_ACCESS_TOKEN is not set and MANUAL_TOKEN is empty. Authenticated endpoints will fail.")
else:
    session.headers["Authorization"] = f"Bearer {API_TOKEN}"
    print(f"Using token from {TOKEN_NAME}")

BASE_URL, DOWNLOAD_DIR

Using token from EDITO_API_TOKEN


('https://api.dive.edito.eu/data',
 PosixPath('/Users/daniels/Mono/projects/work/sintef/Edito-Playground/data'))

## 3. Exploring available collections via `/catalogs` and `/collections`
Use the STAC catalog root and the `/collections` endpoint to summarize available variables.

In [6]:
def absolute_url(path: str) -> str:
    """Turn an API path into a full URL.

    Args:
        path: Either an absolute `http://` / `https://` URL (returned unchanged)
            or a path relative to `BASE_URL`, with or without a leading slash.

    Returns:
        The URL to request.
    """
    # Match the full scheme prefix so a relative path that merely starts with
    # the letters "http" (e.g. "httpbin/items") is not mistaken for a URL.
    if path.lower().startswith(("http://", "https://")):
        return path
    # BASE_URL has no trailing slash, so make sure exactly one separates it
    # from a non-empty relative path.
    if path and not path.startswith("/"):
        path = f"/{path}"
    return f"{BASE_URL}{path}"

def api_get(path: str, params: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
    """GET `path` through the shared session and return the decoded JSON body.

    Args:
        path: Absolute URL or path relative to the API base URL.
        params: Optional query-string parameters.

    Raises:
        requests.HTTPError: if the server answers with a 4xx/5xx status.
    """
    target = absolute_url(path)
    reply = session.get(target, params=params, timeout=60)
    reply.raise_for_status()
    decoded = reply.json()
    return decoded

def api_post(path: str, payload: Dict[str, Any]) -> Dict[str, Any]:
    """POST `payload` as JSON to `path` and return the decoded JSON body.

    Args:
        path: Absolute URL or path relative to the API base URL.
        payload: Request body, serialised as JSON.

    Raises:
        requests.HTTPError: if the server answers with a 4xx/5xx status.
    """
    target = absolute_url(path)
    reply = session.post(target, json=payload, timeout=90)
    reply.raise_for_status()
    decoded = reply.json()
    return decoded


In [7]:
# Show full cell contents (notably long hrefs) instead of truncating them.
pd.set_option("display.max_colwidth", None)

catalog_root = api_get("/catalogs")

# Keep only the links that point at child catalogs and project each one onto
# the three fields shown in the table.
child_links = (link for link in catalog_root.get("links", []) if link.get("rel") == "child")
root_children = [
    {field: link.get(field) for field in ("title", "rel", "href")}
    for link in child_links
]
pd.DataFrame(root_children)


HTTPError: 401 Client Error: Unauthorized for url: https://api.dive.edito.eu/data/catalogs

In [8]:
def _first_or(values, fallback):
    """Return `values[0]`, or `fallback` when `values` is missing or empty."""
    return values[0] if values else fallback


def _collection_row(col: Dict[str, Any]) -> Dict[str, Any]:
    """Flatten one collection document into a summary-table row.

    Tolerates collections whose `extent`, `temporal`, `spatial` or `links`
    entries are absent, null or empty instead of raising on them.
    """
    extent = col.get("extent") or {}
    interval = _first_or((extent.get("temporal") or {}).get("interval"), None) or [None, None]
    # Pad so a malformed single-element interval still unpacks into two values.
    time_start, time_end = (list(interval) + [None, None])[:2]
    return {
        "id": col.get("id"),
        "title": col.get("title"),
        "license": col.get("license"),
        "time_start": time_start,
        "time_end": time_end,
        "bbox": _first_or((extent.get("spatial") or {}).get("bbox"), [None] * 4),
        "keywords": col.get("keywords"),
        "links": len(col.get("links") or []),
    }


collections_payload = api_get("/collections")
collections = collections_payload.get("collections", [])
collections_df = pd.DataFrame([_collection_row(col) for col in collections])
collections_df.head(10)


HTTPError: 401 Client Error: Unauthorized for url: https://api.dive.edito.eu/data/collections

In [43]:
# Use the first listed collection as the running example; None when the listing is empty.
sample_collection_id = None if collections_df.empty else collections_df.iloc[0]["id"]
sample_collection_id

'emodnet-3d_habitat_suitability_maps_of_the_30_main_commercial_fish_species_from_the_atlantic_ocean'

In [16]:
def summarize_items(items: List[Dict[str, Any]]) -> pd.DataFrame:
    """Flatten item features into one summary row per item.

    The column set (id, collection, datetime, asset_count, first_asset, bbox)
    mirrors the summary table this notebook renders for search results.

    Args:
        items: Feature dictionaries as returned in a `features` list.

    Returns:
        A DataFrame with one row per item; empty (but with the columns above)
        when `items` is empty.
    """
    columns = ["id", "collection", "datetime", "asset_count", "first_asset", "bbox"]
    rows = []
    for item in items:
        assets = item.get("assets") or {}
        rows.append(
            {
                "id": item.get("id"),
                "collection": item.get("collection"),
                "datetime": (item.get("properties") or {}).get("datetime"),
                "asset_count": len(assets),
                # Key of the first asset in document order, or None when the item has none.
                "first_asset": next(iter(assets), None),
                "bbox": item.get("bbox"),
            }
        )
    return pd.DataFrame(rows, columns=columns)


# Fetch a handful of items from the sample collection (skipped when no collection was found).
sample_items_payload = api_get(f"/collections/{sample_collection_id}/items", params={"limit": 5}) if sample_collection_id else {}
sample_items = sample_items_payload.get("features", [])
sample_items_df = summarize_items(sample_items) if sample_items else pd.DataFrame()
sample_items_df

NameError: name 'sample_collection_id' is not defined

## 4. Query asset metadata with bounding boxes
Build a helper around `/search` (STAC/OGC API) to filter by collection, bbox, and time.

In [45]:
def stac_search(collection_id: str, bbox: Optional[List[float]] = None, datetime_range: Optional[str] = None, limit: int = 10, max_pages: int = 3) -> List[Dict[str, Any]]:
    """Run a `/search` query and collect the features from up to `max_pages` pages.

    Args:
        collection_id: Collection to search in.
        bbox: Optional bounding box filter; omitted from the request when falsy.
        datetime_range: Optional datetime or interval string; omitted when falsy.
        limit: Page size requested from the server.
        max_pages: Maximum number of pages to fetch (no request is made when <= 0).

    Returns:
        The concatenated `features` of every fetched page.

    Pagination follows the `rel="next"` link of each response. If that link
    declares `method: "POST"`, the next page is requested with POST using the
    link's `body` (merged into the previous request body when `merge` is true);
    otherwise the link's `href` is fetched with GET.
    """
    body: Dict[str, Any] = {
        "collections": [collection_id],
        "limit": limit,
    }
    if bbox:
        body["bbox"] = bbox
    if datetime_range:
        body["datetime"] = datetime_range

    items: List[Dict[str, Any]] = []
    url = "/search"
    method = "POST"
    request_body: Dict[str, Any] = body
    for _ in range(max_pages):
        if method == "POST":
            payload = api_post(url, request_body)
        else:
            payload = api_get(url)
        items.extend(payload.get("features", []))

        next_link = next((link for link in payload.get("links", []) if link.get("rel") == "next"), None)
        if not next_link or not next_link.get("href"):
            break
        url = next_link["href"]
        # Links without an explicit method are plain GET links.
        method = str(next_link.get("method") or "GET").upper()
        if method == "POST":
            link_body = next_link.get("body") or {}
            # With merge=true the link body only carries the paging token and
            # must be combined with the original filters; otherwise it is the
            # complete request body for the next page.
            request_body = {**request_body, **link_body} if next_link.get("merge") else dict(link_body)
    return items



In [46]:
# Fall back to a fixed collection id when the catalog listing produced no sample.
COLLECTION_ID = sample_collection_id if sample_collection_id else "climate_forecast-sea_water_potential_temperature"
# Reuse the bounding box of the first sample item as the spatial filter, if there is one.
BBOX = None if sample_items_df.empty else sample_items_df.iloc[0]["bbox"]
DATETIME = None  # e.g., "2024-09-01T00:00:00Z/2024-09-05T23:59:59Z"

search_results = stac_search(
    COLLECTION_ID,
    bbox=BBOX,
    datetime_range=DATETIME,
    limit=10,
    max_pages=2,
)
summary_df = summarize_items(search_results)
summary_df

Unnamed: 0,id,collection,datetime,asset_count,first_asset,bbox
0,0773e36b-cfd4-557a-8504-390d810f90bb,emodnet-3d_habitat_suitability_maps_of_the_30_...,,10,data,"[-180, -90, 180, 90]"
1,1c0070aa-621b-56c1-9a74-0707790eb2da,emodnet-3d_habitat_suitability_maps_of_the_30_...,,13,data,"[-180, -90, 180, 90]"
2,1fccceed-bb38-5448-a0f2-579c714427b3,emodnet-3d_habitat_suitability_maps_of_the_30_...,,4,data,"[-21, 23, 42.08000183105469, 66.08000183105469]"
3,0f257e93-753b-5be3-ae85-8816beba59c0,emodnet-3d_habitat_suitability_maps_of_the_30_...,,4,data,"[-21, 23, 42.08000183105469, 66.08000183105469]"
4,88b5c742-d094-5f73-95ee-a67bb59f1dd5,emodnet-3d_habitat_suitability_maps_of_the_30_...,,4,xml,"[-98, -83, 68.41, 89.93]"


## 5. Download raster tiles into Xarray
Select an asset, stream it to disk, and open it with `xarray` (delegating GeoTIFF support to Rasterio when needed).

In [47]:
# Pick the asset to download from the first search hit: prefer an asset whose
# roles include "data", otherwise fall back to the first asset listed.
if not search_results:
    raise RuntimeError("No search results to download. Adjust filters above.")
feature = search_results[0]
assets = feature.get("assets") or {}
if not assets:
    # Without this guard an asset-less item surfaces as a bare StopIteration/KeyError.
    raise RuntimeError(f"Item {feature.get('id')!r} has no assets to download.")
# `roles` may be absent or null on an asset, so normalise it to a list before testing.
asset_key = next((key for key, meta in assets.items() if "data" in (meta.get("roles") or [])), None)
asset_key = asset_key or next(iter(assets))
asset = assets[asset_key]
asset_url = asset.get("href")
if not asset_url:
    raise RuntimeError(f"Asset {asset_key!r} of item {feature.get('id')!r} has no href.")
asset_key, asset_url


('data',
 'https://minio.dive.edito.eu/project-oceandownscale/mohid-water-2025-10-23T16:15:31/hackaton_data/WaterProperties.nc/WaterProperties.nc')

In [48]:
def download_asset(url: str, chunk_size: int = 1024 * 1024) -> Path:
    """Stream `url` into `DOWNLOAD_DIR` and return the path of the local file.

    Args:
        url: Asset URL to download.
        chunk_size: Number of bytes requested per streamed chunk.

    Returns:
        Path of the downloaded file inside `DOWNLOAD_DIR`. The file name is the
        last segment of the URL path (query string and fragment excluded), or
        "asset.bin" when that segment is empty.

    Raises:
        requests.HTTPError: if the server answers with a 4xx/5xx status.

    The body is first written to a sibling `.part` file and only moved to its
    final name once the transfer completed, so a failed or interrupted download
    never leaves a truncated file under the final name.
    """
    from urllib.parse import urlsplit

    # Use only the path component so query parameters (e.g. signatures on
    # presigned URLs) do not end up in the file name.
    filename = urlsplit(url).path.rsplit("/", 1)[-1] or "asset.bin"
    destination = DOWNLOAD_DIR / filename
    partial = destination.with_name(destination.name + ".part")
    try:
        # NOTE(review): the shared session sends its default headers (including
        # any Authorization header set on it) to whatever host `url` points at —
        # confirm that is intended for asset hosts.
        with session.get(url, stream=True, timeout=300) as response:
            response.raise_for_status()
            with open(partial, "wb") as f:
                for chunk in response.iter_content(chunk_size=chunk_size):
                    if chunk:  # skip keep-alive chunks
                        f.write(chunk)
        partial.replace(destination)
    except BaseException:
        partial.unlink(missing_ok=True)
        raise
    return destination

# Download the selected asset into DOWNLOAD_DIR and show where it was saved.
local_asset = download_asset(asset_url)
local_asset


HTTPError: 404 Client Error: Not Found for url: https://minio.dive.edito.eu/project-oceandownscale/mohid-water-2025-10-23T16:15:31/hackaton_data/WaterProperties.nc/WaterProperties.nc

In [49]:
def open_spatial_asset(path: Path) -> xr.DataArray | xr.Dataset:
    """Open a local asset with xarray, choosing the reader from the file suffix.

    Supported suffixes (case-insensitive): `.tif`/`.tiff` (opened with the
    "rasterio" engine), `.nc`/`.cdf` (default `xr.open_dataset`) and `.zarr`
    (`xr.open_zarr`).

    Raises:
        ValueError: if the suffix is not one of the supported formats.
    """
    def _open_geotiff(target: Path):
        return xr.open_dataset(target, engine="rasterio")

    readers = {
        ".tif": _open_geotiff,
        ".tiff": _open_geotiff,
        ".nc": xr.open_dataset,
        ".cdf": xr.open_dataset,
        ".zarr": xr.open_zarr,
    }
    suffix = path.suffix.lower()
    reader = readers.get(suffix)
    if reader is None:
        raise ValueError(f"Unsupported asset format: {suffix}")
    return reader(path)

# Open the downloaded file with the reader matching its suffix and display its summary.
data_obj = open_spatial_asset(local_asset)
data_obj


NameError: name 'local_asset' is not defined

## 6. Visualize slices with Matplotlib and Cartopy
Plot a quicklook using `cartopy` for geographic context and overlay the query bounding box.

In [None]:
# Pick what to plot: the first data variable of a Dataset, or the DataArray itself.
if isinstance(data_obj, xr.Dataset):
    var_name = next(iter(data_obj.data_vars))
    data_array = data_obj[var_name]
else:
    data_array = data_obj
    var_name = data_array.name or "variable"

# Reduce to a single slice by taking index 0 along any time/depth/level/band dimension present.
slice_indexers = {dim: 0 for dim in data_array.dims if dim in ("time", "depth", "lev", "band")}
plot_da = data_array.isel(**slice_indexers).squeeze()

fig = plt.figure(figsize=(8, 4))
ax = fig.add_subplot(projection=ccrs.PlateCarree())
plot_da.plot(ax=ax, transform=ccrs.PlateCarree(), cmap="viridis", cbar_kwargs={"shrink": 0.6, "label": var_name})
ax.coastlines(resolution="110m")
ax.add_feature(cfeature.BORDERS, linewidth=0.5)
ax.set_title(f"{var_name} quicklook")

# BBOX is None when the search ran without a spatial filter, so only draw the
# query rectangle when a four-value [min_lon, min_lat, max_lon, max_lat] box exists.
if isinstance(BBOX, (list, tuple)) and len(BBOX) == 4:
    min_lon, min_lat, max_lon, max_lat = BBOX
    ax.plot([min_lon, max_lon, max_lon, min_lon, min_lon], [min_lat, min_lat, max_lat, max_lat, min_lat], color="red", linewidth=1, transform=ccrs.PlateCarree())
plt.show()
