# EDITO Data Lake Playground
This notebook walks through setup, catalog exploration, filtered searches, and asset visualization against the EDITO data API.

## 1. Notebook setup and dependencies
Install the required geospatial stack and configure a retry-friendly HTTP session.

In [None]:
%pip install -q requests pandas xarray matplotlib cartopy python-dotenv rasterio fsspec s3fs


In [None]:
import os
from pathlib import Path
from typing import Any, Dict, List, Optional
from urllib.parse import unquote, urlparse

import cartopy.crs as ccrs
import cartopy.feature as cfeature
import matplotlib.pyplot as plt
import pandas as pd
import rasterio
import requests
import xarray as xr
from dotenv import load_dotenv
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

plt.style.use("seaborn-v0_8")


def build_session() -> requests.Session:
    """Build the shared HTTP session used by every API call in this notebook.

    The session asks for JSON responses by default and retries GET/POST
    requests up to five times (with exponential backoff) when the server
    answers 429 or a 5xx status listed below.

    Returns:
        A configured ``requests.Session``.
    """
    retry_policy = Retry(
        total=5,
        backoff_factor=0.3,
        status_forcelist=[429, 500, 502, 503, 504],
        allowed_methods=("GET", "POST"),
    )
    retrying_adapter = HTTPAdapter(max_retries=retry_policy)

    http_session = requests.Session()
    http_session.headers.update({"Accept": "application/json"})
    # Same adapter for both schemes so plain-HTTP endpoints retry too.
    for scheme in ("https://", "http://"):
        http_session.mount(scheme, retrying_adapter)
    return http_session


session = build_session()


## 2. Configuring environment variables for API access
Load credentials from the environment (or a `.env` file) and prepare shared paths.

In [None]:
# Read EDITO_* settings from a local ``.env`` file, if one exists.
load_dotenv()

BASE_URL = os.getenv("EDITO_DATA_BASE_URL", "https://api.dive.edito.eu/data").rstrip("/")
# A blank or whitespace-only token is treated as "not set" so we never send
# a malformed ``Bearer`` header.
API_TOKEN = (os.getenv("EDITO_API_TOKEN") or "").strip() or None
DOWNLOAD_DIR = Path(os.getenv("EDITO_DOWNLOAD_DIR", "../data")).resolve()
DOWNLOAD_DIR.mkdir(parents=True, exist_ok=True)

# Drop any header left over from a previous run of this cell so re-running
# it after unsetting the token really removes the credentials.
session.headers.pop("Authorization", None)
if API_TOKEN:
    session.headers["Authorization"] = f"Bearer {API_TOKEN}"
else:
    print("Warning: EDITO_API_TOKEN is not set. Authenticated endpoints will fail.")

BASE_URL, DOWNLOAD_DIR


## 3. Exploring available collections via `/catalogs`
Use the STAC catalog root and the `/collections` endpoint to summarize available variables.

In [None]:
def absolute_url(path: str) -> str:
    """Resolve an API path against ``BASE_URL``.

    Args:
        path: Either a full ``http://`` / ``https://`` URL (returned as is,
            e.g. a pagination link) or a path relative to ``BASE_URL``,
            with or without a leading slash.

    Returns:
        The absolute URL to request.
    """
    # Match the full scheme prefix: a bare "http" check would also swallow
    # relative paths that merely begin with those letters.
    if path.lower().startswith(("http://", "https://")):
        return path
    # BASE_URL has no trailing slash, so make sure exactly one separates them.
    if path and not path.startswith("/"):
        path = f"/{path}"
    return f"{BASE_URL}{path}"

def api_get(path: str, params: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
    """Send a GET request through the shared session and decode the JSON reply.

    Args:
        path: Relative API path or absolute URL (resolved by ``absolute_url``).
        params: Optional query-string parameters.

    Returns:
        The parsed JSON body.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
    """
    target = absolute_url(path)
    reply = session.get(target, params=params, timeout=60)
    reply.raise_for_status()
    return reply.json()

def api_post(path: str, payload: Dict[str, Any]) -> Dict[str, Any]:
    """Send a JSON POST request through the shared session and decode the reply.

    Args:
        path: Relative API path or absolute URL (resolved by ``absolute_url``).
        payload: Object serialised as the JSON request body.

    Returns:
        The parsed JSON body.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
    """
    target = absolute_url(path)
    reply = session.post(target, json=payload, timeout=90)
    reply.raise_for_status()
    return reply.json()


In [None]:
catalog_root = api_get("/catalogs")

# Tabulate only the links whose relation is "child", keeping the three
# fields that are useful for browsing.
root_children = [
    {field: link.get(field) for field in ("title", "rel", "href")}
    for link in filter(
        lambda entry: entry.get("rel") == "child",
        catalog_root.get("links", []),
    )
]
pd.DataFrame(root_children)


In [None]:
def _collection_row(col: Dict[str, Any]) -> Dict[str, Any]:
    """Flatten one collection document into a single summary-table row."""
    extent = col.get("extent", {})
    # Only the first temporal interval and the first bbox are reported.
    first_interval = extent.get("temporal", {}).get("interval", [[None, None]])[0]
    first_bbox = extent.get("spatial", {}).get("bbox", [[None] * 4])[0]
    return {
        "id": col.get("id"),
        "title": col.get("title"),
        "license": col.get("license"),
        "time_start": first_interval[0],
        "time_end": first_interval[1],
        "bbox": first_bbox,
        "keywords": col.get("keywords"),
        "links": len(col.get("links", [])),
    }


collections_payload = api_get("/collections")
collections = collections_payload.get("collections", [])
collections_df = pd.DataFrame([_collection_row(col) for col in collections])
collections_df.head(10)


## 4. Query asset metadata with bounding boxes
Build a helper around `/search` (STAC/OGC API) to filter by collection, bbox, and time.

In [None]:
def stac_search(collection_id: str, bbox: List[float], datetime_range: str, limit: int = 10, max_pages: int = 3) -> List[Dict[str, Any]]:
    """Run a ``/search`` query and collect the items from up to ``max_pages`` pages.

    The first page is requested with POST. Later pages follow the response's
    ``rel="next"`` link using the HTTP method and body that the link itself
    declares, so POST-based pagination keeps the original filters and token.

    Args:
        collection_id: Identifier of the collection to search.
        bbox: Bounding box sent as the ``bbox`` filter.
        datetime_range: Value sent as the ``datetime`` filter.
        limit: Page size requested from the server.
        max_pages: Maximum number of pages to fetch.

    Returns:
        The concatenated ``features`` of every fetched page.

    Raises:
        requests.HTTPError: Propagated from ``api_post`` / ``api_get``.
    """
    body: Dict[str, Any] = {
        "collections": [collection_id],
        "bbox": bbox,
        "datetime": datetime_range,
        "limit": limit
    }
    items: List[Dict[str, Any]] = []
    url = "/search"
    method = "POST"
    request_body: Dict[str, Any] = body
    for _ in range(max_pages):
        if method == "POST":
            payload = api_post(url, request_body)
        else:
            payload = api_get(url)
        items.extend(payload.get("features") or [])

        next_link = next(
            (
                link
                for link in payload.get("links") or []
                if link.get("rel") == "next" and link.get("href")
            ),
            None,
        )
        if next_link is None:
            break

        url = next_link["href"]
        # A link without an explicit method is a plain GET link.
        method = str(next_link.get("method") or "GET").upper()
        if method == "POST":
            link_body = next_link.get("body") or {}
            # ``merge`` means the link body only carries the delta (typically
            # the page token). Also merge when the link has no body at all,
            # so a follow-up POST never degrades to an unfiltered search.
            if next_link.get("merge") or not link_body:
                request_body = {**body, **link_body}
            else:
                request_body = link_body
    return items

def summarize_items(features: List[Dict[str, Any]]) -> pd.DataFrame:
    """Tabulate search results, one row per item.

    Args:
        features: Item dictionaries as returned by ``stac_search``.

    Returns:
        A DataFrame with the item id, collection, datetime property, number
        of assets, the key of the first asset and the item bbox.
    """

    def _row(item: Dict[str, Any]) -> Dict[str, Any]:
        item_assets = item.get("assets", {})
        return {
            "id": item.get("id"),
            "collection": item.get("collection"),
            "datetime": item.get("properties", {}).get("datetime"),
            "asset_count": len(item_assets),
            # Iterating a dict yields its keys, in insertion order.
            "first_asset": next(iter(item_assets), None),
            "bbox": item.get("bbox"),
        }

    return pd.DataFrame([_row(item) for item in features])


In [None]:
# Query filters for the example search.
COLLECTION_ID = "climate_forecast-sea_water_potential_temperature"
BBOX = [-10.0, 40.0, 10.0, 60.0]  # [min_lon, min_lat, max_lon, max_lat]
DATETIME = "2024-10-01T00:00:00Z/2024-10-05T00:00:00Z"

search_results = stac_search(
    collection_id=COLLECTION_ID,
    bbox=BBOX,
    datetime_range=DATETIME,
    limit=10,
    max_pages=2,
)
summary_df = summarize_items(features=search_results)
summary_df


## 5. Download raster tiles into Xarray
Select an asset, stream it to disk, and open it with `xarray` (GeoTIFFs are read through the `rasterio` backend engine, which is registered by the `rioxarray` package).

In [None]:
if not search_results:
    raise RuntimeError("No search results to download. Adjust filters above.")
feature = search_results[0]

# Fail with a readable message, rather than a bare KeyError/StopIteration,
# when the first item carries no assets.
feature_assets = feature.get("assets") or {}
if not feature_assets:
    raise RuntimeError(f"Item {feature.get('id')!r} has no assets to download.")

# Prefer the first asset tagged with the "data" role; otherwise fall back to
# the first asset listed. ``roles`` may be absent or null.
asset_key = next(
    (k for k, v in feature_assets.items() if "data" in (v.get("roles") or [])),
    None,
)
if asset_key is None:
    asset_key = next(iter(feature_assets))
asset = feature_assets[asset_key]
asset_url = asset.get("href")
if not asset_url:
    raise RuntimeError(f"Asset {asset_key!r} of item {feature.get('id')!r} has no href.")
asset_key, asset_url


In [None]:
def download_asset(url: str, chunk_size: int = 1024 * 1024) -> Path:
    """Stream a remote asset into ``DOWNLOAD_DIR`` and return its local path.

    Args:
        url: ``http(s)`` URL of the asset.
        chunk_size: Number of bytes read from the response per iteration.

    Returns:
        Path of the fully downloaded file.

    Raises:
        ValueError: If ``url`` is empty or not an ``http(s)`` URL.
        requests.HTTPError: If the server answers with a 4xx/5xx status.
    """
    parsed = urlparse(url or "")
    if parsed.scheme not in ("http", "https"):
        raise ValueError(f"Cannot download {url!r}: only http(s) asset URLs are supported.")

    # Derive the file name from the URL *path* only, so query strings (for
    # example presigned-URL signatures) do not end up in the name or hide the
    # real suffix. ``Path(...).name`` strips any directory part that
    # percent-decoding might reveal.
    filename = Path(unquote(parsed.path.rsplit("/", 1)[-1])).name
    if filename in ("", ".", ".."):
        filename = "asset.bin"
    destination = DOWNLOAD_DIR / filename
    partial = destination.with_name(f"{destination.name}.part")

    # The session defaults to "Accept: application/json", which is wrong for
    # binary assets. The API bearer token must also stay on the API host: a
    # header set to None is dropped from this request only.
    headers: Dict[str, Optional[str]] = {"Accept": "*/*"}
    if parsed.netloc.lower() != urlparse(BASE_URL).netloc.lower():
        headers["Authorization"] = None

    try:
        with session.get(url, headers=headers, stream=True, timeout=300) as response:
            response.raise_for_status()
            with open(partial, "wb") as f:
                for chunk in response.iter_content(chunk_size=chunk_size):
                    if chunk:
                        f.write(chunk)
        # Rename only after the whole body arrived, so a failed download
        # never leaves a truncated file under the final name.
        partial.replace(destination)
    except BaseException:
        partial.unlink(missing_ok=True)
        raise
    return destination

local_asset = download_asset(asset_url)
local_asset


In [None]:
def open_spatial_asset(path: Path) -> xr.Dataset:
    """Open a downloaded asset with xarray, choosing the reader from the file suffix.

    Args:
        path: Location of the asset on disk (a ``Path`` or a path string).

    Returns:
        The opened ``xarray.Dataset``.

    Raises:
        ValueError: If the suffix is not one of the supported formats.
    """
    path = Path(path)
    suffix = path.suffix.lower()
    if suffix in (".tif", ".tiff"):
        # The "rasterio" engine is provided by the rioxarray package.
        return xr.open_dataset(path, engine="rasterio")
    if suffix in (".nc", ".nc4", ".cdf"):
        return xr.open_dataset(path)
    if suffix == ".zarr":
        return xr.open_zarr(path)
    raise ValueError(f"Unsupported asset format: {suffix or '<no suffix>'} ({path.name})")

data_obj = open_spatial_asset(local_asset)
data_obj


## 6. Visualize slices with Matplotlib and Cartopy
Plot a quicklook using `cartopy` for geographic context and overlay the query bounding box.

In [None]:
# Choose what to draw: the DataArray itself, or the first data variable of a Dataset.
if not isinstance(data_obj, xr.Dataset):
    data_array = data_obj
    var_name = data_array.name or "variable"
else:
    var_name = next(iter(data_obj.data_vars))
    data_array = data_obj[var_name]

# Take index 0 along each of these dimensions (when present) before plotting.
slice_indexers = dict.fromkeys(
    (dim for dim in data_array.dims if dim in ("time", "depth", "lev", "band")),
    0,
)
plot_da = data_array.isel(slice_indexers).squeeze()

fig = plt.figure(figsize=(8, 4))
ax = plt.axes(projection=ccrs.PlateCarree())
plot_da.plot(
    ax=ax,
    transform=ccrs.PlateCarree(),
    cmap="viridis",
    cbar_kwargs={"shrink": 0.6, "label": var_name},
)
ax.coastlines(resolution="110m")
ax.add_feature(cfeature.BORDERS, linewidth=0.5)
ax.set_title(f"{var_name} quicklook")

# Outline the query bounding box as a closed red ring.
min_lon, min_lat, max_lon, max_lat = BBOX
ax.plot(
    [min_lon, max_lon, max_lon, min_lon, min_lon],
    [min_lat, min_lat, max_lat, max_lat, min_lat],
    color="red",
    linewidth=1,
    transform=ccrs.PlateCarree(),
)
plt.show()
