# EMODnet & CMEMS Resource Scanner

Use this notebook to focus on the marine providers inside the EDITO data lake. It fetches catalogs/collections, highlights which ones originate from EMODnet or CMEMS, and confirms whether specific CMEMS dataset IDs are available.

## Prerequisites

- Activate the repo virtual environment (`source .venv/bin/activate.fish` on fish) and install the dependencies from `requirements.txt`.
- Export either `EDITO_API_TOKEN` or `EDITO_ACCESS_TOKEN`, or set `MANUAL_TOKEN` below if the kernel cannot see your environment.

In [1]:
# Optional helper if you run the notebook outside the prepared virtualenv.
# %pip install -r "../requirements.txt"

In [2]:
import os
import textwrap
from typing import Dict, List, Optional

import pandas as pd
import requests
from dotenv import load_dotenv

In [3]:
load_dotenv()

API_BASE = os.getenv("EDITO_DATA_BASE_URL", "https://api.dive.edito.eu/data")
API_TOKEN = (
    os.getenv("EDITO_API_TOKEN")
    or os.getenv("EDITO_ACCESS_TOKEN")
    or os.getenv("MANUAL_TOKEN")
)
MANUAL_TOKEN = "eyJhbGciOiJSUzUxMiIsInR5cCIgOiAiSldUIiwia2lkIiA6ICJuZVFHeThlN0pwMko0endhQnZPZTVGRFVWN3BrN2ZPYVFBaTRKeWQ1OHlzIn0.eyJleHAiOjE3NjM2NTI0ODksImlhdCI6MTc2MzU2NjA4OSwianRpIjoiNmI2M2U3YTQtMzJmNi00ZGE3LTg5YjgtZTU5MGJjMDgwMTVlIiwiaXNzIjoiaHR0cHM6Ly9hdXRoLmRpdmUuZWRpdG8uZXUvYXV0aC9yZWFsbXMvZGF0YWxhYiIsImF1ZCI6WyJtaW5pbyIsImFjY291bnQiXSwic3ViIjoiYTQ2NjhmOGItOTVhMC00ODlhLTkxMGItMTE0MThiYmI2YzU1IiwidHlwIjoiQmVhcmVyIiwiYXpwIjoiZWRpdG8iLCJzZXNzaW9uX3N0YXRlIjoiYzUxOTc4MzQtZTNlZS00NzU3LTkzZTEtZTIyNzIwZWU2Mzc0IiwiYWNyIjoiMSIsImFsbG93ZWQtb3JpZ2lucyI6WyIqIiwiaHR0cHM6Ly9hcGkuZGl2ZS5lZGl0by5ldSJdLCJyZWFsbV9hY2Nlc3MiOnsicm9sZXMiOlsiZGVmYXVsdC1yb2xlcy1kYXRhbGFiIiwib2ZmbGluZV9hY2Nlc3MiLCJ1bWFfYXV0aG9yaXphdGlvbiJdfSwicmVzb3VyY2VfYWNjZXNzIjp7Im1pbmlvIjp7InJvbGVzIjpbInN0c29ubHkiXX0sImFjY291bnQiOnsicm9sZXMiOlsibWFuYWdlLWFjY291bnQiLCJtYW5hZ2UtYWNjb3VudC1saW5rcyIsInZpZXctcHJvZmlsZSJdfX0sInNjb3BlIjoib3BlbmlkIGVtYWlsIHByb2ZpbGUiLCJzaWQiOiJjNTE5NzgzNC1lM2VlLTQ3NTctOTNlMS1lMjI3MjBlZTYzNzQiLCJlbWFpbF92ZXJpZmllZCI6dHJ1ZSwibmFtZSI6IkRhbmllbCBUaGlsbyBTY2hyb2VkZXIiLCJncm91cHMiOlsiRURJVE9fVVNFUiIsInNlYWRpdG8tZmljb3MiXSwicHJlZmVycmVkX3VzZXJuYW1lIjoiZGFuaWVscyIsImdpdmVuX25hbWUiOiJEYW5pZWwgVGhpbG8iLCJmYW1pbHlfbmFtZSI6IlNjaHJvZWRlciIsImVtYWlsIjoiZGFuaWVsLnQuc2Nocm9lZGVyQHNpbnRlZi5ubyJ9.AAr_o8fMAz-j7Vli1PiJqnaVs3ULm-vyNwgdBoJjgDU1YDCsG145at8xR8GRBod0k8sSXxCrjvmKamFpB0lr9edf4HfSMq7cpapk9neL5bdE88obfy14oPOo2l7CVihRup6yk2gWI2N4hdf5C7tD1ueTp6ZKkjOdou482AjLRDci98rSOGgv1mLs88cXqcNjlZd6b2cdD26shHDuo_lMK0gAXUTVeD1Le-gX7zfVvvRBMVlI68u0M_FwBw_ehCxyVUD17xhKR0DSJoERAsN96YDmVqa0Nk49hXUiqGxl4JE9yZmqtqM8LXe4kaSLVHqAphZ0ucKm4ABICXdV765HNA"

if MANUAL_TOKEN:
    API_TOKEN = MANUAL_TOKEN

if not API_TOKEN:
    raise RuntimeError(
        "No EDITO token found. Export EDITO_API_TOKEN or EDITO_ACCESS_TOKEN, "
        "or set MANUAL_TOKEN in this cell."
    )

session = requests.Session()
session.headers.update({"Authorization": f"Bearer {API_TOKEN}"})


In [39]:
def absolute_api_path(path: str) -> str:
    if path.startswith("http"):
        return path
    return f"{API_BASE.rstrip('/')}/{path.lstrip('/')}"


def api_get(path: str, params: Optional[Dict] = None) -> Dict:
    url = absolute_api_path(path)
    response = session.get(url, params=params, timeout=60)
    response.raise_for_status()
    return response.json()


def extract_provider_names(providers: Optional[List]) -> List[str]:
    names: List[str] = []
    if not providers:
        return names
    for provider in providers:
        if isinstance(provider, dict):
            name = provider.get("name") or provider.get("description") or ""
        else:
            name = str(provider)
        if name:
            names.append(name)
    return names


def summarize_collections(collections: List[Dict]) -> pd.DataFrame:
    rows = []
    for col in collections:
        providers = extract_provider_names(col.get("providers"))
        rows.append(
            {
                "id": col.get("id"),
                "title": col.get("title"),
                "keywords": ", ".join(col.get("keywords", [])),
                "providers": ", ".join(providers),
                "license": col.get("license"),
                "extent_start": col.get("extent", {})
                .get("temporal", {})
                .get("interval", [[None]])[0][0],
                "extent_end": col.get("extent", {})
                .get("temporal", {})
                .get("interval", [[None, None]])[0][1],
            }
        )
    return pd.DataFrame(rows)


def summarize_items(items: List[Dict]) -> pd.DataFrame:
    rows = []
    for item in items:
        rows.append(
            {
                "id": item.get("id"),
                "collection": item.get("collection"),
                "datetime": item.get("properties", {}).get("datetime"),
                "geometry_type": item.get("geometry", {}).get("type"),
                "asset_count": len(item.get("assets", {})),
            }
        )
    return pd.DataFrame(rows)


def stac_search(
    collection_id: str,
    limit: int = 20,
    bbox: Optional[List[float]] = None,
    datetime_range: Optional[str] = None,
) -> List[Dict]:
    payload: Dict = {"collections": [collection_id], "limit": limit}
    if bbox:
        payload["bbox"] = bbox
    if datetime_range:
        payload["datetime"] = datetime_range

    url = absolute_api_path("/search")
    response = session.post(url, json=payload, timeout=120)
    response.raise_for_status()
    return response.json().get("features", [])


In [40]:
pd.set_option("display.max_colwidth", None)

catalogs_payload = api_get("/catalogs")
catalog_entries = (
    catalogs_payload.get("catalogs")
    or catalogs_payload.get("children")
    or catalogs_payload.get("links")
    or []
)

catalogs_summary = pd.json_normalize(catalog_entries)
selected_columns = [
    col
    for col in ["id", "title", "description", "rel", "href"]
    if col in catalogs_summary.columns
]
if selected_columns:
    display(catalogs_summary[selected_columns])
else:
    display(catalogs_summary.head())


Unnamed: 0,id,title,description,rel,href
0,,,,self,https://api.dive.edito.eu/data/catalogs
1,,,,root,https://api.dive.edito.eu/data
2,,,,parent,https://api.dive.edito.eu/data
3,4dmed-sea,,"The 4DMED products tools are specifically designed to address key multi-disciplinary scientific questions and to reduce the knowledge gaps that concern the relation between the Mediterranean Sea dynamics and the most relevant biogeochemical/biological/ecological processes, targeting also the bio-physical interactions at the mesoscale and sub-mesoscale.",child,https://api.dive.edito.eu/data/catalogs/4dmed-sea
4,again-glyons-testing,again-glyons-testing,STAC Catalog for zarr.json,child,https://api.dive.edito.eu/data/catalogs/again-glyons-testing
5,copernicus-marine-products,Copernicus Marine Products,Catalog gathering all the Copernicus Marine Service products and datasets.,child,https://api.dive.edito.eu/data/catalogs/copernicus-marine-products
6,coral-bleach,coral-bleach,STAC Catalog for zarr.json,child,https://api.dive.edito.eu/data/catalogs/coral-bleach
7,coral-bleaching,coral-bleaching,STAC Catalog for zarr.json,child,https://api.dive.edito.eu/data/catalogs/coral-bleaching
8,coral-bleaching-sample,coral-bleaching-sample,STAC Catalog for zarr.json,child,https://api.dive.edito.eu/data/catalogs/coral-bleaching-sample
9,EDITO_Model_Lab,EDITO_Model_Lab,STAC Catalog for EDITO_Model_Lab,child,https://api.dive.edito.eu/data/catalogs/EDITO_Model_Lab


In [43]:
def count_links(payload: Dict, rel: str) -> int:
    return sum(1 for link in payload.get("links", []) if link.get("rel") == rel)


def count_collections_for_catalog(entry: Dict) -> Optional[int]:
    href = entry.get("href")
    if not href:
        return None
    try:
        payload = api_get(href)
    except Exception as exc:  # noqa: BLE001 - surface which catalog failed
        print(f"Warning: could not fetch catalog {href}: {exc}")
        return None

    collections = payload.get("collections")
    if isinstance(collections, list):
        return len(collections)

    collection_links = count_links(payload, "collection")
    if collection_links:
        return collection_links

    fallback_href = f"{href.rstrip('/')}/collections"
    try:
        fallback_payload = api_get(fallback_href)
        fallback_collections = fallback_payload.get("collections")
        if isinstance(fallback_collections, list):
            return len(fallback_collections)
    except Exception:
        return None

    return None


def count_child_catalogs(entry: Dict) -> Optional[int]:
    href = entry.get("href")
    if not href:
        return None
    try:
        payload = api_get(href)
    except Exception as exc:  # noqa: BLE001
        print(f"Warning: could not fetch catalog {href}: {exc}")
        return None
    return count_links(payload, "child")


catalog_counts = []
for entry in catalog_entries:
    catalog_counts.append(
        {
            "catalog_id": entry.get("id"),
            "title": entry.get("title"),
            "href": entry.get("href"),
            "collection_count": count_collections_for_catalog(entry),
            "child_catalog_count": count_child_catalogs(entry),
        }
    )

pd.DataFrame(catalog_counts)


Unnamed: 0,catalog_id,title,href,collection_count,child_catalog_count
0,,,https://api.dive.edito.eu/data/catalogs,,23
1,,,https://api.dive.edito.eu/data,442.0,2
2,,,https://api.dive.edito.eu/data,442.0,2
3,4dmed-sea,,https://api.dive.edito.eu/data/catalogs/4dmed-sea,,3
4,again-glyons-testing,again-glyons-testing,https://api.dive.edito.eu/data/catalogs/again-glyons-testing,,0
5,copernicus-marine-products,Copernicus Marine Products,https://api.dive.edito.eu/data/catalogs/copernicus-marine-products,,308
6,coral-bleach,coral-bleach,https://api.dive.edito.eu/data/catalogs/coral-bleach,,2
7,coral-bleaching,coral-bleaching,https://api.dive.edito.eu/data/catalogs/coral-bleaching,,1
8,coral-bleaching-sample,coral-bleaching-sample,https://api.dive.edito.eu/data/catalogs/coral-bleaching-sample,,1
9,EDITO_Model_Lab,EDITO_Model_Lab,https://api.dive.edito.eu/data/catalogs/EDITO_Model_Lab,,6


In [44]:
def list_child_catalogs(entry: Dict) -> List[Dict]:
    href = entry.get("href")
    if not href:
        return []
    try:
        payload = api_get(href)
    except Exception as exc:  # noqa: BLE001
        print(f"Warning: could not fetch catalog {href}: {exc}")
        return []
    return [link for link in payload.get("links", []) if link.get("rel") == "child"]


child_rows: List[Dict[str, Optional[str]]] = []
for entry in catalog_entries:
    child_links = list_child_catalogs(entry)
    for link in child_links:
        child_rows.append(
            {
                "parent_id": entry.get("id"),
                "parent_title": entry.get("title"),
                "child_title": link.get("title"),
                "child_href": link.get("href"),
            }
        )

if child_rows:
    child_df = pd.DataFrame(child_rows)
    display(child_df.sort_values(["parent_id", "child_title"], na_position="last").reset_index(drop=True))
else:
    print("No child catalogs exposed by the current entries.")


Unnamed: 0,parent_id,parent_title,child_title,child_href
0,4dmed-sea,,,https://api.dive.edito.eu/data/catalogs/4dmed-sea/4dmed-biophys
1,4dmed-sea,,,https://api.dive.edito.eu/data/catalogs/4dmed-sea/4dmed-pp
2,4dmed-sea,,,https://api.dive.edito.eu/data/catalogs/4dmed-sea/dataset-4dmed-phys-my-daily
3,EDITO_Model_Lab,EDITO_Model_Lab,DCSM,https://api.dive.edito.eu/data/catalogs/EDITO_Model_Lab/DCSM
4,EDITO_Model_Lab,EDITO_Model_Lab,DMI_HBM_temperature_salinity_currents,https://api.dive.edito.eu/data/catalogs/EDITO_Model_Lab/DMI_HBM_temperature_salinity_currents
...,...,...,...,...
416,,,my-catalog-name,https://api.dive.edito.eu/data/catalogs/my-catalog-name
417,,,pacific_green_corridor,https://api.dive.edito.eu/data/catalogs/pacific_green_corridor_
418,,,schism_test,https://api.dive.edito.eu/data/catalogs/schism_test_
419,,,,https://api.dive.edito.eu/data/catalogs/4dmed-sea


In [19]:
collections_payload = api_get("/collections")
collections_list = collections_payload.get("collections", [])
collections_df = summarize_collections(collections_list)
print(f"Total collections: {len(collections_df)}")
collections_df.head(10)

Total collections: 442


Unnamed: 0,id,title,keywords,providers,license,extent_start,extent_end
0,emodnet-3d_habitat_suitability_maps_of_the_30_main_commercial_fish_species_from_the_atlantic_ocean,3d habitat suitability maps of the 30 main commercial fish species from the atlantic ocean,,"AZTI, Marine Research",CC-BY-4.0+,1981-01-01T00:00:00.000000Z,2025-10-24T00:00:00.000000Z
1,emodnet-additional_information_coastal_vulnerability_index,Additional information coastal vulnerability index,,"Geological Survey of the Netherlands (TNO), EMODnet Geology",CC-BY-4.0+,2000-01-01T00:00:00.000000Z,2025-12-31T00:00:00.000000Z
2,emodnet-additional_information_coastal_vulnerability_index_of_closest_coastline,Additional information coastal vulnerability index of closest coastline,,"Geological Survey of the Netherlands (TNO), EMODnet Geology",CC-BY-4.0+,2024-11-07T00:00:00.000000Z,2025-12-31T00:00:00.000000Z
3,climate_forecast-age_of_sea_ice,Age of sea ice (Climate Forecast convention),,EDITO,proprietary,1993-01-01T00:00:00.000000Z,2023-09-01T03:00:00.000000Z
4,emodnet-aggregate_extraction,Aggregate extraction,,EMODnet Human Activities,CC-BY-4.0+,1932-01-01T00:00:00.000000Z,2051-12-31T00:00:00.000000Z
5,climate_forecast-aggregate_quality_flag,Aggregate quality flag (Climate Forecast convention),,EDITO,proprietary,2018-03-13T00:00:00.000000Z,2024-04-01T00:00:00.000000Z
6,climate_forecast-air_density,Air density (Climate Forecast convention),,EDITO,proprietary,1991-08-01T00:00:00.000000Z,2024-04-22T00:00:00.000000Z
7,climate_forecast-air_pressure,Air pressure (Climate Forecast convention),,EDITO,proprietary,1841-03-21T00:00:00.000000Z,2025-06-10T00:00:00.000000Z
8,climate_forecast-air_pressure_at_mean_sea_level,Air pressure at mean sea level (Climate Forecast convention),,EDITO,proprietary,1841-03-21T00:00:00.000000Z,2025-10-21T00:00:00.000000Z
9,climate_forecast-air_temperature,Air temperature (Climate Forecast convention),,EDITO,proprietary,1841-03-21T00:00:00.000000Z,2025-06-10T00:00:00.000000Z


In [20]:
def filter_collections_by_keywords(df: pd.DataFrame, keywords: List[str]) -> pd.DataFrame:
    if df.empty:
        return df
    lowered = [kw.lower() for kw in keywords]

    def match_cell(cell: Optional[str]) -> bool:
        if not cell:
            return False
        value = cell.lower()
        return any(keyword in value for keyword in lowered)

    mask = df.apply(
        lambda row: any(
            match_cell(str(row[col]))
            for col in ["id", "title", "keywords", "providers"]
            if col in row
        ),
        axis=1,
    )
    return df[mask].copy()


def highlight_provider(df: pd.DataFrame, label: str, keywords: List[str]) -> pd.DataFrame:
    filtered = filter_collections_by_keywords(df, keywords)
    if filtered.empty:
        print(f"No collections matched {label} keywords: {keywords}")
    else:
        print(f"Found {len(filtered)} collections for {label} keywords: {keywords}")
    return filtered

In [21]:
emodnet_keywords = ["emodnet"]
emodnet_collections = highlight_provider(collections_df, emodnet_keywords)
emodnet_collections

TypeError: highlight_provider() missing 1 required positional argument: 'keywords'

In [None]:
cmems_keywords = ["cmems", "copernicus", "nwshelf"]
cmems_collections = highlight_provider(collections_df, cmems_keywords)
cmems_collections

In [None]:
target_ids = [
    "NWSHELF_ANALYSISFORECAST_PHY_004_013",
    "NWSHELF_ANALYSISFORECAST_BGC_004_002",
]
target_matches = collections_df[collections_df["id"].isin(target_ids)]
if target_matches.empty:
    print("Target collections were not found. Double-check the IDs or inspect cmems_collections above.")
target_matches

In [None]:
for collection_id in target_ids:
    print("=" * 80)
    print(f"Collection: {collection_id}")
    items = stac_search(collection_id, limit=5)
    if not items:
        print("No items returned by /search. Try reducing filters or verifying the ID.")
        continue
    display(summarize_items(items))
    sample_assets = list(items[0].get("assets", {}).keys())
    if sample_assets:
        print("Sample asset keys:", ", ".join(sample_assets))
    geometry = items[0].get("geometry")
    if geometry:
        clipped = textwrap.shorten(str(geometry), width=140)
        print("Geometry preview:", clipped)
