# Explore the typology cube


In [44]:
import os
import time
from typing import List, Tuple

import dask
import dask_geopandas
import fsspec
import geopandas as gpd
import holoviews as hv
import hvplot.pandas
import hvplot.xarray
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import panel as pn
import pystac
import xarray as xr
from coastmonitor.typology.preprocess import (
    geoslice_by_transect,
    make_region_of_interest_from_transect,
)
from coastmonitor.typology.specification import (
    RASTER_PREDICTORS,
    TRANSECT_ATTRIBUTES,
    VECTOR_PREDICTORS,
)
from coastmonitor.typology.utils import get_latest_records, read_records
from coastpy.io.utils import read_items_extent
from coastpy.utils.config import configure_instance
from dotenv import load_dotenv
from pyproj import CRS

# Load variables from a local .env file before reading any settings below.
load_dotenv(override=True)
instance_type = configure_instance()

# NOTE: access tokens to the data are available upon request from Floris Calkoen
sas_token = os.getenv("AZURE_STORAGE_SAS_TOKEN")
account_name = "coclico"
storage_options = {"account_name": account_name, "credential": sas_token}

# URL of the STAC catalog, which is used to filter the data following a
# standardized metadata specification.
COCLICO_STAC_URL = "https://coclico.blob.core.windows.net/stac/v1/catalog.json"
CONTAINER = "typology"
LABELS_PREFIX = "labels"

# Location of the typology cube. The remote release is kept under its own name
# so it is no longer silently shadowed by a second assignment to TYPOLOGY_STORE.
TYPOLOGY_STORE_REMOTE = "az://typology/train/release/2024-09-12/typology.zarr"
# The store that is actually opened. Set TYPOLOGY_STORE in the environment (or
# in .env) to point somewhere else; when unset, the previous behaviour is kept.
# NOTE(review): the fallback is a machine-specific absolute path and will not
# exist on other machines — consider switching the default to the remote store.
TYPOLOGY_STORE = (
    os.getenv("TYPOLOGY_STORE")
    or "/Users/calkoen/data/tmp/typology/release/2024-09-12/typology.zarr"
)

# Transect attributes plus vector predictors: the columns read from the GCTR tables.
gctr_variables = list(TRANSECT_ATTRIBUTES.keys()) + list(VECTOR_PREDICTORS.keys())
coclico_stac = pystac.Catalog.from_file(COCLICO_STAC_URL)

## Read the latest coastal typology labels from cloud storage

In [45]:
# Read every label record under the labels prefix, then reduce them with
# get_latest_records. LABELS_PREFIX is used instead of repeating the literal so
# the prefix is configured in one place only.
records = read_records(
    account_name, CONTAINER, LABELS_PREFIX, sas_token, storage_options
)
latest_records = get_latest_records(records)

In [46]:
# Records labelled by one user, most recently created first.
latest_records.loc[latest_records["user"].eq("floris-calkoen")].sort_values(
    by="datetime_created", ascending=False
)

Unnamed: 0,uuid,user,transect_id,lon,lat,geometry,datetime_created,datetime_updated,shore_type,coastal_type,landform_type,is_built_environment,has_defense,is_challenging,comment,link
358,1221a44b725b,floris-calkoen,cl30794s00tr00203347,-1.905028,49.720844,"LINESTRING (-1.91237 49.71321, -1.89769 49.72848)",2024-10-27 11:23:34.687898+00:00,2024-10-27 11:23:34.687898+00:00,rocky_shore_platform_or_large_boulders,bedrock_plain,,false,false,False,,"https://www.google.com/maps/@49.7211853,-1.910..."
213,214f2b858763,floris-calkoen,cl30793s01tr00002065,-1.946787,49.708817,"LINESTRING (-1.93294 49.70828, -1.96063 49.70936)",2024-10-27 11:22:46.560454+00:00,2024-10-27 11:22:46.560454+00:00,rocky_shore_platform_or_large_boulders,moderately_sloped,,true,false,False,,"https://www.google.com/maps/@49.3457683,-0.638..."
359,c3615d30dc0e,floris-calkoen,cl30794s00tr00205147,-1.927849,49.726971,"LINESTRING (-1.92911 49.71801, -1.92658 49.73593)",2024-10-27 11:21:22.735054+00:00,2024-10-27 11:21:22.735054+00:00,rocky_shore_platform_or_large_boulders,bedrock_plain,,false,false,False,,
354,47cf0704c5f8,floris-calkoen,cl30794s00tr00049547,-0.624532,49.341110,"LINESTRING (-0.62668 49.33223, -0.62239 49.34999)",2024-10-27 11:18:47.441814+00:00,2024-10-27 11:18:47.441814+00:00,sandy_gravel_or_small_boulder_sediments,moderately_sloped,,true,true,False,,"https://www.google.com/maps/@49.3457683,-0.638..."
355,a6c0e54e5604,floris-calkoen,cl30794s00tr00050647,-0.638834,49.344196,"LINESTRING (-0.64447 49.33599, -0.63319 49.3524)",2024-10-27 11:18:26.686786+00:00,2024-10-27 11:18:26.686786+00:00,rocky_shore_platform_or_large_boulders,cliffed_or_steep,,false,false,False,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
292,ff24c7d12ff3,floris-calkoen,cl30793s01tr02915565,-1.806911,43.388515,"LINESTRING (-1.80237 43.38014, -1.81145 43.39689)",2024-08-19 16:04:38.590688+00:00,2024-08-19 16:04:38.590688+00:00,no_sediment_or_shore_platform,cliffed_or_steep,,false,false,False,,
352,63b507c77960,floris-calkoen,cl30794s00tr00027347,-0.330563,49.310085,"LINESTRING (-0.33809 49.30256, -0.32303 49.31761)",2024-08-19 15:37:31.739551+00:00,2024-08-19 15:37:31.739551+00:00,sandy_gravel_or_small_boulder_sediments,cliffed_or_steep,,false,false,False,,"https://www.google.com/maps/@49.3120537,-0.332..."
280,5928653170eb,floris-calkoen,cl30793s01tr02681965,-1.256371,44.680389,"LINESTRING (-1.24383 44.67939, -1.26891 44.68138)",2024-08-19 11:20:01.528803+00:00,2024-08-19 11:20:01.528803+00:00,sandy_gravel_or_small_boulder_sediments,dune,,false,false,False,,
498,7492d1e7c09c,floris-calkoen,cl32408s01tr00181148,4.557393,52.451035,"LINESTRING (4.57192 52.45247, 4.54287 52.4496)",2024-08-19 10:19:02.775456+00:00,2024-08-19 10:19:02.775456+00:00,sandy_gravel_or_small_boulder_sediments,dune,,false,false,False,,


In [47]:
# Number of latest records contributed by each user.
latest_records["user"].value_counts()

user
floris-calkoen      833
susan-hanson         41
rosh                  9
arjen-luijendijk      6
freek-scheel          3
jon-french            2
prayla-barli          2
ruben-white           2
elias-de-korte        1
jaap-langemeijer      1
robert-nicholls       1
Name: count, dtype: int64

In [48]:
# Shape of the latest label records as a (rows, columns) tuple.
latest_records.shape

(901, 16)

In [50]:
import hvplot.pandas
import pandas as pd
import panel as pn

# Dropdown for choosing which label column to summarise; it starts on
# "coastal_type". The last option is passed through unchanged.
# NOTE(review): "non_existant" does not appear among the record columns shown
# above — presumably it is there to exercise the error message; confirm.
label_selector = pn.widgets.Select(
    value="coastal_type",
    options=[
        "shore_type",
        "coastal_type",
        "is_built_environment",
        "has_defense",
        "non_existant",
    ],
)


def plot_label_distribution(variable, records=None):
    """Plot how often each value of a label column occurs.

    Parameters
    ----------
    variable : str
        Name of the column to summarise.
    records : pandas.DataFrame, optional
        Frame to read the column from. When omitted, the notebook-level
        ``latest_records`` is used, looked up at call time.

    Returns
    -------
    A Panel Markdown pane with a message when the column is missing or has no
    non-null values; otherwise the bar plot of the value counts.
    """
    # Resolve the default lazily so a later reassignment of `latest_records`
    # is picked up without redefining this function.
    if records is None:
        records = latest_records

    # Check if the variable exists in the dataset
    if variable not in records.columns:
        return pn.pane.Markdown(
            f"**Error**: `{variable}` is not a valid column in the dataset",
        )

    counts = records[variable].dropna().value_counts()

    if counts.empty:
        return pn.pane.Markdown(
            f"**No data available** for `{variable}`",
        )

    # Create the plot
    return counts.hvplot.bar(
        title=f"Distribution of {variable}", xlabel=variable, ylabel="Count", rot=45
    )


# Create a Panel layout with the widget and the plot bound to its value.
layout = pn.Column(
    label_selector, pn.bind(plot_label_distribution, label_selector)
)

# Launch the layout in a separate step: `.show()` returns the object for the
# launched server, so chaining it onto the constructor left `layout` bound to
# that object instead of the Column. The handle is kept under its own name.
label_server = layout.show()

Launching server at http://localhost:62066


In [4]:
# The single most recently created record of one user, kept as a one-row frame.
(
    latest_records.loc[lambda df: df["user"] == "floris-calkoen"]
    .sort_values(by="datetime_created", ascending=False)
    .iloc[[0]]
)

Unnamed: 0,uuid,user,transect_id,lon,lat,geometry,datetime_created,datetime_updated,shore_type,coastal_type,landform_type,is_built_environment,has_defense,is_challenging,comment,link
242,0f46c40708a7,floris-calkoen,cl32263s00tr01107639,17.634491,40.302395,"LINESTRING (17.63339 40.31136, 17.6356 40.29343)",2024-10-12 18:35:11.076929+00:00,2024-10-12 18:35:11.076929+00:00,sandy_gravel_or_small_boulder_sediments,sediment_plain,,False,False,False,,"https://www.google.com/maps/@40.303557,17.6338..."


In [None]:
gctr_collection = coclico_stac.get_child("gctr")
# get_child returns None when the catalog has no child with this id; fail here
# with a clear message instead of an obscure error further down.
if gctr_collection is None:
    raise ValueError(
        f"No 'gctr' collection found in the STAC catalog at {COCLICO_STAC_URL}"
    )

# Tip: to list all available variables as a DataFrame, run
# pd.DataFrame.from_records(gctr_collection.extra_fields["item_assets"]["data"]["table:columns"])
gctr_extent = read_items_extent(gctr_collection)
gctr_hrefs = gctr_extent.href.to_list()

# Read only the columns in gctr_variables from every item href and collect the
# result into memory.
gctr = dask_geopandas.read_parquet(
    gctr_hrefs, storage_options=storage_options, columns=gctr_variables
).compute()

In [None]:
# Attach the GCTR attributes to the label records.
# Columns that the records already carry are dropped from the right-hand side
# first (on a fresh run these are the geometry/lon/lat columns the records
# bring along). This also makes the cell safe to re-run: on a second run every
# GCTR column is already present, so nothing is merged twice and no
# `_x`/`_y` suffixed duplicates appear.
gctr_columns_already_present = [
    column
    for column in gctr_variables
    if column != "transect_id" and column in latest_records.columns
]
latest_records = pd.merge(
    latest_records,
    gctr[gctr_variables].drop(columns=gctr_columns_already_present),
    on="transect_id",
    how="left",
)

## Read typology into memory 

In [None]:
# Open the remote typology store (without calling .compute()) to inspect it.
# NOTE(review): this literal points at release 2024-09-17, whereas the
# TYPOLOGY_STORE paths in the configuration cell use 2024-09-12 — confirm which
# release is intended.
xr.open_zarr(
    store="az://typology/train/release/2024-09-17/typology.zarr",
    storage_options=storage_options,
)

In [None]:
# Pass the storage credentials only when the store is addressed by a URL
# (e.g. "az://..."); a plain filesystem path is opened without them. Without
# this, pointing TYPOLOGY_STORE at the cloud release would open it with no
# credentials.
cube_storage_options = storage_options if "://" in str(TYPOLOGY_STORE) else None

# Open the store and load it fully into memory.
cube = xr.open_zarr(
    TYPOLOGY_STORE, consolidated=True, storage_options=cube_storage_options
).compute()

# Show only the predictor variables (raster + vector).
predictors = list(RASTER_PREDICTORS.keys()) + list(VECTOR_PREDICTORS.keys())
cube[predictors]

## Small webapp to explore the training data

In [None]:
# Offset passed to make_region_of_interest_from_transect when drawing the ROI.
# NOTE(review): units are not visible here — presumably metres; confirm.
offset_distance = 200

# Choices offered by the two dropdowns: every transect in the cube and every
# raster predictor.
transect_ids = list(cube["transect_id"].values)
data_variables = list(RASTER_PREDICTORS)

# Dropdown widgets that drive the plot.
transect_select = pn.widgets.Select(options=transect_ids, name="Transect ID")
data_variable_select = pn.widgets.Select(
    options=data_variables, name="Data Variable"
)


# Function to generate the plot
@pn.depends(transect_select.param.value, data_variable_select.param.value)
def generate_plot(transect_id, data_variable):
    """Build the view for one transect and one raster variable.

    Parameters
    ----------
    transect_id
        Identifier of the transect used to slice the cube and to look up the
        matching row in ``latest_records``.
    data_variable
        Name of the variable in the sliced dataset to plot.

    Returns
    -------
    A Panel Column holding the raster plot and, below it, the region of
    interest on a tile map. When ``latest_records`` has no row for the
    transect, the second item is a Markdown message instead.
    """
    # Load the data slice based on transect ID and plot the selected variable.
    sliced_ds = geoslice_by_transect(cube, transect_id)
    raster_view = sliced_ds[data_variable].hvplot(
        x="x",
        y="y",
        cmap="viridis",
        title=f"{data_variable} for Transect {transect_id}",
        width=1200,
    )

    # Fetch the transect record; the ROI can only be built when one exists.
    transect = latest_records.loc[latest_records["transect_id"] == transect_id]
    if transect.empty:
        return pn.Column(
            raster_view,
            pn.pane.Markdown(
                f"**No label record found** for transect `{transect_id}`; "
                "the region of interest cannot be drawn.",
            ),
        )

    roi = make_region_of_interest_from_transect(
        transect, offset_distance=offset_distance
    )
    # Outline only: the fill is fully transparent, the red line fully opaque.
    roi_view = roi.to_crs(4326)[["geometry"]].hvplot(
        geo=True,
        line_color="red",
        alpha=0,
        line_alpha=1,
        line_width=1,
        tiles="ESRI",
        width=1200,
    )

    return pn.Column(raster_view, roi_view)


# Lay out the app: the two selectors side by side in a row, with the output of
# generate_plot underneath.
interactive_view = pn.Column(
    pn.Row(transect_select, data_variable_select), generate_plot
)

# Launch the app with .show(). An earlier .show() call in this notebook printed
# "Launching server at http://localhost:...", so this presumably serves the app
# from a local server rather than rendering it inline — confirm.
interactive_view.show()