In [0]:
# -----------------------------------------------------------------------------
# Notebook: Sentinel-2 STAC Item Ingestion into Delta (MPC Pro + Databricks)
# -----------------------------------------------------------------------------
# Purpose:
#   1. Authenticate to Microsoft Planetary Computer Pro (GeoCatalog) using the
#      cluster's User Assigned Managed Identity (UAMI).
#   2. Retrieve STAC Items for the Sentinel-2 Level-2A collection via the
#      preview STAC API endpoint.
#   3. Normalize the nested STAC JSON into a strongly typed Spark DataFrame
#      (including assets, bands, geometry, properties, and links).
#   4. Write the resulting DataFrame as a managed Delta table for downstream
#      analytics (spatial queries, filtering by metadata, joining to stats, etc.).
#
# What You Learn:
#   - Using azure.identity.ManagedIdentityCredential inside Databricks.
#   - Calling MPC Pro STAC & SAS style endpoints with a bearer token.
#   - Handling nested STAC structures with explicit Spark schemas to avoid
#     schema inference pitfalls (type drift, excessive nesting as strings).
#   - Preparing a reusable curated Delta representation of STAC Items.
#
# Prerequisites:
#   - Databricks cluster with a User Assigned Managed Identity attached and
#     granted appropriate access to the MPC Pro GeoCatalog resource.
#   - Libraries: azure-identity, requests (install via %pip if missing).
#   - Network egress allowed to the GeoCatalog endpoint.
#
# Notes:
#   - API version used here is a preview; field names or availability may change.
#   - Adjust geocatalog_url & collection_id for other regions / collections.
#   - For large collections, consider pagination or filtering (not shown here).
# -----------------------------------------------------------------------------

from azure.identity import ManagedIdentityCredential
from pyspark.sql.types import *
import json
import requests

# The resource ID (scope/audience) for the GeoCatalog service when requesting an AAD token.
MPCPRO_APP_ID = "https://geocatalog.spatio.azure.com"

# Helper: normalize numeric raster band metadata before handing items to Spark.
def fix_raster_bands(item):
    """Coerce raster band ``nodata``/``scale``/``offset`` values to ``float``.

    The item is walked as ``item["assets"][<key>]["raster:bands"][i]``. For
    every band, each of the three keys that is present with a non-null value
    is replaced by ``float(value)``, so ints and numeric strings (including
    "nan"/"inf") all end up as Python floats.

    Missing or null ``assets`` / ``raster:bands`` entries are skipped, and a
    null band value is left as ``None`` instead of being passed to ``float``
    (which would raise ``TypeError``).

    Args:
        item: A STAC Item as a plain ``dict``. It is modified in place.

    Returns:
        The same ``item`` object, to allow use inside comprehensions.

    Raises:
        ValueError, TypeError: If a non-null value cannot be converted by
            ``float`` (e.g. a non-numeric string or a nested container).
    """
    numeric_keys = ("nodata", "scale", "offset")
    # `or {}` / `or []` also cover keys that are present but JSON null.
    for asset in (item.get("assets") or {}).values():
        for band in asset.get("raster:bands") or []:
            for key in numeric_keys:
                if band.get(key) is not None:
                    band[key] = float(band[key])
    return item

# -----------------------------------------------------------------------------
# 1. Target configuration and managed-identity authentication
# -----------------------------------------------------------------------------
# STAC collection to read and the GeoCatalog base URL. The URL below is a
# placeholder and must be replaced with a real endpoint before running.
collection_id = "sentinel-2-l2a"
geocatalog_url = "{REPLACE-WITH-YOUR-GEOCATALOG-ENDPOINT}"

# Build the managed-identity credential (no explicit client id is passed) and
# request an access token for the GeoCatalog resource. The token's `.token`
# attribute is what later goes into the Authorization header.
credential = ManagedIdentityCredential()
token = credential.get_token(MPCPRO_APP_ID)

# -----------------------------------------------------------------------------
# 2. Call STAC Items endpoint
#    Only the first page of results is fetched here. For production-scale use,
#    add query parameters for temporal or spatial filtering and follow the
#    'next' links in the response to paginate.
# -----------------------------------------------------------------------------
headers = {
    "Authorization": f"Bearer {token.token}",
}
params = {
    "api-version": "2025-04-30-preview",
}
response = requests.get(
    # Strip a trailing slash so a configured "https://host/" does not yield
    # a "//stac/..." path.
    f"{geocatalog_url.rstrip('/')}/stac/collections/{collection_id}/items",
    headers=headers,
    params=params,
    # (connect, read) timeouts in seconds. Without a timeout, requests waits
    # indefinitely and a stalled endpoint would hang the notebook.
    timeout=(10, 60),
)
response.raise_for_status()  # Surface errors clearly if auth or endpoint fails
json_data = response.json()

# Normalize raster band numeric fields (nodata/scale/offset) to floats so they
# match the DoubleType fields declared in the raster band schema.
json_data["features"] = [fix_raster_bands(f) for f in json_data["features"]]

# -----------------------------------------------------------------------------
# 3. Define explicit Spark schemas for STAC Item components
#    Explicit schemas:
#      - Avoids Spark inferring maps/arrays as generic strings.
#      - Provides predictable column order & data types.
#      - Eases downstream SQL & data quality checks.
# -----------------------------------------------------------------------------

# One entry of an asset's "eo:bands" array: band naming plus two numeric
# wavelength fields. Every field is nullable.
band_schema = (
    StructType()
    .add("name", StringType(), True)
    .add("common_name", StringType(), True)
    .add("description", StringType(), True)
    .add("center_wavelength", DoubleType(), True)
    .add("full_width_half_max", DoubleType(), True)
)

# One entry of an asset's "raster:bands" array. scale/nodata/offset are typed
# as doubles; bits_per_sample and spatial_resolution are typed as integers.
raster_band_schema = (
    StructType()
    .add("scale", DoubleType(), True)
    .add("nodata", DoubleType(), True)
    .add("offset", DoubleType(), True)
    .add("data_type", StringType(), True)
    .add("bits_per_sample", IntegerType(), True)
    .add("spatial_resolution", IntegerType(), True)
    .add("unit", StringType(), True)
)

# A single asset of an Item: location/type/roles, projection extension arrays
# (proj:bbox, proj:shape, proj:transform), the nested eo:bands and raster:bands
# arrays, and a few view/_msft fields. Every field is nullable.
asset_schema = (
    StructType()
    .add("gsd", IntegerType(), True)
    .add("href", StringType(), True)
    .add("type", StringType(), True)
    .add("roles", ArrayType(StringType()), True)
    .add("title", StringType(), True)
    .add("proj:bbox", ArrayType(DoubleType()), True)
    .add("proj:shape", ArrayType(IntegerType()), True)
    .add("proj:transform", ArrayType(DoubleType()), True)
    .add("eo:bands", ArrayType(band_schema), True)
    .add("raster:bands", ArrayType(raster_band_schema), True)
    .add("_msft:cog_converted", BooleanType(), True)
    .add("view:azimuth", DoubleType(), True)
    .add("view:incidence_angle", DoubleType(), True)
)

# One entry of an Item's "links" array: relation, media type and target URL,
# all kept as nullable strings.
link_schema = (
    StructType()
    .add("rel", StringType(), True)
    .add("type", StringType(), True)
    .add("href", StringType(), True)
)

# One entry of properties["providers"]: provider URL, name and list of roles.
provider_schema = (
    StructType()
    .add("url", StringType(), True)
    .add("name", StringType(), True)
    .add("roles", ArrayType(StringType()), True)
)

# Shape of properties["proj:centroid"]: a lat/lon pair of nullable doubles.
centroid_schema = (
    StructType()
    .add("lat", DoubleType(), True)
    .add("lon", DoubleType(), True)
)

# Item "properties" object. Timestamp-like fields (created, datetime,
# s2:generation_time) are kept as strings here, not parsed into timestamps.
# Percentages, angles and the reflectance factor are doubles; EPSG code, UTM
# zone and relative orbit are integers. Every field is nullable.
properties_schema = (
    StructType()
    .add("created", StringType(), True)
    .add("datetime", StringType(), True)
    .add("platform", StringType(), True)
    .add("proj:epsg", IntegerType(), True)
    .add("providers", ArrayType(provider_schema), True)
    .add("instruments", ArrayType(StringType()), True)
    .add("s2:mgrs_tile", StringType(), True)
    .add("constellation", StringType(), True)
    .add("mgrs:utm_zone", IntegerType(), True)
    .add("proj:centroid", centroid_schema, True)
    .add("s2:granule_id", StringType(), True)
    .add("eo:cloud_cover", DoubleType(), True)
    .add("s2:datatake_id", StringType(), True)
    .add("s2:product_uri", StringType(), True)
    .add("s2:datastrip_id", StringType(), True)
    .add("s2:product_type", StringType(), True)
    .add("sat:orbit_state", StringType(), True)
    .add("mgrs:grid_square", StringType(), True)
    .add("s2:datatake_type", StringType(), True)
    .add("view:sun_azimuth", DoubleType(), True)
    .add("mgrs:latitude_band", StringType(), True)
    .add("s2:generation_time", StringType(), True)
    .add("sat:relative_orbit", IntegerType(), True)
    .add("view:sun_elevation", DoubleType(), True)
    .add("s2:water_percentage", DoubleType(), True)
    .add("s2:processing_baseline", StringType(), True)
    .add("s2:snow_ice_percentage", DoubleType(), True)
    .add("s2:vegetation_percentage", DoubleType(), True)
    .add("s2:thin_cirrus_percentage", DoubleType(), True)
    .add("s2:cloud_shadow_percentage", DoubleType(), True)
    .add("s2:nodata_pixel_percentage", DoubleType(), True)
    .add("s2:unclassified_percentage", DoubleType(), True)
    .add("s2:dark_features_percentage", DoubleType(), True)
    .add("s2:not_vegetated_percentage", DoubleType(), True)
    .add("s2:degraded_msi_data_percentage", DoubleType(), True)
    .add("s2:high_proba_clouds_percentage", DoubleType(), True)
    .add("s2:reflectance_conversion_factor", DoubleType(), True)
    .add("s2:medium_proba_clouds_percentage", DoubleType(), True)
    .add("s2:saturated_defective_pixel_percentage", DoubleType(), True)
)

# Item footprint geometry. "coordinates" is declared with three levels of
# array nesting (rings -> positions -> numbers), which is the GeoJSON Polygon
# layout; a MultiPolygon would need a fourth level and does not fit this type.
geometry_schema = (
    StructType()
    .add("type", StringType(), True)
    .add("coordinates", ArrayType(ArrayType(ArrayType(DoubleType()))), True)
)

# Top-level STAC Item schema, assembled from the component schemas. "assets"
# is a map from asset key to asset struct; "links" is an array of link structs.
item_schema = (
    StructType()
    .add("id", StringType(), True)
    .add("bbox", ArrayType(DoubleType()), True)
    .add("type", StringType(), True)
    .add("links", ArrayType(link_schema), True)
    .add("assets", MapType(StringType(), asset_schema), True)
    .add("_msft:ts", StringType(), True)
    .add("_msft:etag", StringType(), True)
    .add("collection", StringType(), True)
    .add("geometry", geometry_schema, True)
    .add("properties", properties_schema, True)
    .add("stac_extensions", ArrayType(StringType()), True)
    .add("stac_version", StringType(), True)
)

# -----------------------------------------------------------------------------
# 4. Build the DataFrame and save it as a Delta table
# -----------------------------------------------------------------------------
# Each feature dict becomes one row, shaped by the explicit item schema.
df = spark.createDataFrame(json_data["features"], schema=item_schema)

# mode("overwrite") replaces the table contents on every run, so re-running the
# notebook refreshes the snapshot rather than appending. For incremental loads,
# consider a MERGE into a partitioned Delta table keyed by acquisition date.
(
    df.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable("rt_demo.default.sentinel2")
)

# Optional: uncomment to inspect the schema or a few sample rows.
# df.printSchema()
# display(df.limit(5))

# Lightweight confirmation that the write finished.
print("Ingestion complete: Delta table rt_demo.default.sentinel2 refreshed.")