# Tutorial notebook for working with the Planetary Computer

In [1]:
import sys

# add the src directory to the path so that we can import generic functions
sys.path.insert(0, "src")

import logging
import os
import pathlib

# Sometimes you need libraries which are not included in the planetary computer image. That's
# generally not a problem because you can install them with pip. pip is invoked through
# `sys.executable -m pip` so that the package lands in the environment of the running
# kernel; a bare `pip` resolved from PATH is not guaranteed to belong to that environment.
import subprocess

subprocess.run(
    [sys.executable, "-m", "pip", "install", "-q", "python-dotenv"],
    check=False,  # best effort: a failed install shows up at the `dotenv` import below
)

import dask_geopandas
import geopandas as gpd
import hvplot.pandas
import pandas as pd
import pystac
import shapely
from dotenv import load_dotenv
from ipyleaflet import Map, basemaps

# Load environment variables (override=True is passed through to python-dotenv).
load_dotenv(override=True)

# Token used to access data in private containers; None when the variable is not set.
sas_token = os.environ.get("AZURE_STORAGE_SAS_TOKEN")
coclico_storage_options = dict(account_name="coclico", credential=sas_token)

# Only let warnings and more severe messages from the azure loggers through.
azure_logger = logging.getLogger("azure")
azure_logger.setLevel(logging.WARNING)

## Load from STAC catalog

Load the transects from our CoCliCo STAC catalog. 

In [None]:
# Read the root catalog of the CoCliCo STAC from its URL.
catalog_url = "https://coclico.blob.core.windows.net/stac/v1/catalog.json"
coclico_catalog = pystac.Catalog.from_file(catalog_url)

In [None]:
# Display the catalog object that was read in the previous cell.
coclico_catalog

In [None]:
# Gather all collections of the catalog and show them as a list.
all_collections = coclico_catalog.get_all_collections()
list(all_collections)

In [None]:
# Select the transect collection by its id. `get_child` returns None when the catalog has
# no child with that id, so fail right here with a clear message instead of later with an
# AttributeError on None.
gcts = coclico_catalog.get_child("gcts-2000m")
if gcts is None:
    raise KeyError("Child 'gcts-2000m' was not found in the CoCliCo STAC catalog")
gcts

### Use a dynamic map to extract data by region of interest

The ipyleaflet map below can be used to find the bbox coordinates of a region.
Zoom to the area where you want to extract data and then run the next cell. Keep in
mind that the map has to be rendered before its coordinates can be extracted, so wait
about one second before running the next cell.

In [None]:
# Interactive imagery map; its visible extent is read in a later cell.
m = Map(
    basemap=basemaps.Esri.WorldImagery,
    center=(43.406241, -2.976665),
    zoom=9,
    scroll_wheel_zoom=True,
)
m.layout.height = "800px"
m

## IMPORTANT NOTE: Wait for the map to render before you run the next cell

Rendering the map takes about a second, so pause briefly before running the next cell; otherwise the north/west/east/south bounds cannot be read.

## Import functions from project directory 

In [None]:
from bilbao.utils import geo_bbox

# The map bounds can only be read once the map has been rendered. A view whose opposite
# edges coincide has no area, so refuse to build a region of interest from it instead of
# silently continuing with an empty extent.
if m.west == m.east or m.south == m.north:
    raise RuntimeError(
        "The map bounds are not available yet: wait until the map has rendered, "
        "then run this cell again."
    )

# this makes a GeoPandas dataframe from the ipyleaflet map that is rendered above
roi = geo_bbox(m.west, m.south, m.east, m.north)
roi.explore()

In [None]:
# Collect every item (data partition) of the GCTS STAC collection into a list.
items = [item for item in gcts.get_all_items()]

## The dataset is partitioned into geospatial chunks

The dataset is divided into chunks that each span a different region of the world. In the next cell
we read the spatial extent of each chunk and combine them into a GeoDataFrame.

In [None]:
# Build one frame per item from its bounding box and stack them into a single frame.
# `item.bbox` reads the box directly, which avoids serializing the whole item with
# `to_dict()` only to look up one key. `ignore_index=True` renumbers the rows from 0, so
# the row labels can be used as positions in `items` (presumably geo_bbox returns one row
# per call; confirm against bilbao.utils).
bboxes = pd.concat([geo_bbox(*item.bbox) for item in items], ignore_index=True)
bboxes.explore()

## Now we can find the bboxes that cover our region of interest

In [None]:
# Spatially join the partition boxes with the region of interest and keep only the
# columns that the boxes had before the join.
roi_matches = gpd.sjoin(bboxes, roi)
bboxes_roi = roi_matches[bboxes.columns]

# The row labels of the matching boxes are used as positions in `items`.
items_roi = [items[position] for position in bboxes_roi.index]

In [None]:
# Display the items whose bounding box matched the region of interest.
items_roi

In [None]:
# Display the first matching item (raises IndexError when nothing matched).
items_roi[0]

## The STAC items contain references to where the data is stored

In [None]:
# Take the location of the "data" asset of each selected item.
hrefs = [item.assets["data"].href for item in items_roi]

## Cloud based data

The hrefs that you see below are URLs to a cloud bucket with the transects for the area of interest. The prefix "az://" is the protocol for Azure cloud storage. So if you don't have a STAC catalog, you can write out the hrefs yourself.

In [None]:
# Display the list of asset locations collected above.
hrefs

## Reading the transect partitions that span our region of interest 

We will read the data from cloud storage - but only the data that spans our region of interest (the DynamicMap above). 

## Dask dataframes are lazy

These dataframes are not in memory yet. We still have to trigger the compute (see cell below)

In [None]:
# Open the selected partitions with dask_geopandas and display the result; no
# `.compute()` is called here.
dask_geopandas.read_parquet(hrefs, storage_options=coclico_storage_options)

## Compute the transects that span our region of interest

The transects are not in memory yet. In the next cell we will trigger the retrieval from cloud storage to the local client with a `ddf.compute()` call. Note that we can also mix in regular Pandas operations, like sorting. Currently the transects are sorted by QuadKey to enable fast reads through filter pushdown. If we want them sorted along the coastline, we can do that by sorting by `tr_name`.

In [None]:
%%time
transects = dask_geopandas.read_parquet(hrefs, storage_options=coclico_storage_options)
transects = (
    transects.sjoin(
        dask_geopandas.from_geopandas(roi.to_crs(transects.crs), npartitions=1)
    )
    .drop(columns=["index_right"])
    .sort_values("tr_name")
    .compute()
)

## Sorting the transects

In [None]:
# Show at most 500 randomly chosen transects, coloured by bearing. `sample` raises a
# ValueError when asked for more rows than the frame holds, which can happen for a small
# region of interest, so the sample size is capped at the number of rows.
transects.sample(min(500, len(transects))).explore(column="bearing")

## Holoviews interactive visualization

In [None]:
import colorcet as cc
import hvplot.pandas

# `sample` raises a ValueError when asked for more rows than the frame holds, so cap the
# number of plotted transects at the number of rows that are available.
n_plot = min(500, len(transects))

# Interactive plot of a random subset of the transects, coloured by their bearing.
transects_plot = (
    transects[["geometry", "bearing"]]
    .sample(n_plot)
    .hvplot(
        geo=True,
        tiles="ESRI",
        color="bearing",
        cmap=cc.CET_C10,
        width=700,
        height=500,
        clabel="North bearing [deg]",
        xlabel="Longitude [deg]",
        ylabel="Latitude [deg]",
        title="Cross-shore transects (100m alongshore), Euskadi.",
        colorbar=True,
        tools=["wheel_zoom"],
    )
)
transects_plot

## Mapping ERA5 onto transects

In [None]:
# Display the transects frame that was computed for the region of interest.
transects