# Tutorial notebook for working with the Planetary Computer

In [1]:
import sys

sys.path.insert(0, "src")

import logging
import os
import pathlib

import dask_geopandas
import geopandas as gpd
import hvplot.pandas
import pandas as pd
import pystac
import shapely
from dotenv import load_dotenv
from ipyleaflet import Map, basemaps

from bilbao.utils import geo_bbox

# Read the local .env file into the process environment; override=True lets
# values from the file replace variables that are already set.
load_dotenv(override=True)

# Keep the Azure SDK quiet: only WARNING and above from the "azure" loggers.
logging.getLogger("azure").setLevel(logging.WARNING)

# Account name and SAS token used to access data in private containers.
account_name = os.getenv("AZURE_STORAGE_ACCOUNT_NAME")
sas_token = os.getenv("AZURE_STORAGE_SAS_TOKEN")
storage_options = dict(account_name=account_name, credential=sas_token)

## Load from STAC catalog

Load the transects from our CoCliCo STAC catalog. 

In [2]:
# Open the root catalog document of the CoCliCo STAC from its public URL.
coclico_catalog = pystac.Catalog.from_file(
    "https://coclico.blob.core.windows.net/stac/v1/catalog.json"
)

In [3]:
# Show the catalog's notebook representation.
coclico_catalog

In [4]:
# Materialise the iterable of collections so the notebook displays them as a list.
list(coclico_catalog.get_all_collections())

[<Collection id=ssl>,
 <Collection id=wef>,
 <Collection id=eesl>,
 <Collection id=floodmaps>,
 <Collection id=sc>,
 <Collection id=cbca>,
 <Collection id=cfr>,
 <Collection id=smd>,
 <Collection id=cisi>,
 <Collection id=slp5>,
 <Collection id=slp6>,
 <Collection id=slp6_pilot>,
 <Collection id=coastal-mask>,
 <Collection id=shorelinemonitor-shorelines>,
 <Collection id=gcts-2000m>]

In [5]:
# Select the child with id "gcts-2000m" (listed among the collections above)
# and display it.
gcts = coclico_catalog.get_child("gcts-2000m")
gcts

### Use a dynamic map to extract data by region of interest

The ipyleaflet map below can be used to find the bbox coordinates of a region.
Zoom to the area where you want to extract data, then run the next cell. Wait about
a second before doing so: the map has to finish rendering before its bounds can be
extracted.

In [6]:
# Satellite-imagery basemap with the initial view (centre and zoom level) set
# directly on the constructor.
m = Map(
    basemap=basemaps.Esri.WorldImagery,
    center=(41.735966575868716, -70.10032653808595),
    zoom=9,
    scroll_wheel_zoom=True,
)
m.layout.height = "800px"
m

Map(center=[41.735966575868716, -70.10032653808595], controls=(ZoomControl(options=['position', 'zoom_in_text'…

## IMPORTANT NOTE: Wait for the map to render before you run the next cell

Rendering the map takes a moment, so pause for about a second before running the next cell; otherwise the north/west/east/south bounds of the map cannot be read.

In [7]:
# Build the region of interest from the current view bounds of the map `m`
# rendered above, passed in (west, south, east, north) order. The result is used
# further down as a GeoPandas frame (spatial join, CRS reprojection).
roi = geo_bbox(m.west, m.south, m.east, m.north)

In [8]:
# Collect every item (data partition) found under the `gcts` collection into a
# list, so that items can later be selected by position.
items = list(gcts.get_all_items())

## The dataset is partitioned into geospatial chunks

The dataset is divided into chunks that each span a different region of the world. In the next cell
we read the spatial extent of each chunk and combine them into a GeoDataFrame.

In [9]:
# One GeoDataFrame row per item, built from the item's bounding box. The bbox is
# read from the `bbox` attribute directly, which avoids serialising the whole
# item with `to_dict()` just to pull one field out of it.
# `ignore_index=True` gives the rows a 0..n-1 index that lines up with positions
# in `items`; selecting items by that index later relies on this alignment.
bboxes = pd.concat([geo_bbox(*item.bbox) for item in items], ignore_index=True)
bboxes.explore()

## Now we can find the bboxes that cover our region of interest

In [10]:
# Keep only the partitions whose bounding box matches the region of interest in
# a spatial join; slicing back to the original bbox columns drops what the join adds.
bboxes_roi = bboxes.sjoin(roi)[bboxes.columns]
# The index of `bboxes` is positional, so it selects the matching STAC items.
items_roi = [items[position] for position in bboxes_roi.index]

In [11]:
# Show the items selected for the region of interest.
items_roi

[<Item id=minx_-90.02_miny_-0.01_part_0>]

In [12]:
# Inspect the first selected item; raises IndexError when `items_roi` is empty.
items_roi[0]

## The STAC items contain references to where the data is stored

In [13]:
# Collect the href of the "data" asset of every selected item.
hrefs = [item.assets["data"].href for item in items_roi]

## Cloud based data

The href that you see below is a URL to a cloud bucket with the transects for the area of interest. The prefix "az://" is the protocol for Azure cloud storage.

In [14]:
# Show the storage locations collected from the selected items.
hrefs

['az://transects/gcts-2000m.parquet/minx_-90.02_miny_-0.01_part_0.parquet']

## Reading the transect partitions that span our region of interest 

We will read the data from cloud storage - but only the data that spans our region of interest (the area shown in the interactive map above).

## Dask dataframes are lazy

These dataframes are not in memory yet. We still have to trigger the compute (see cell below)

In [15]:
# Open the partitions as a dask-geopandas frame. The displayed result lists only
# column names and dtypes, because nothing has been computed yet.
dask_geopandas.read_parquet(hrefs, storage_options=storage_options)

Unnamed: 0_level_0,tr_name,lon,lat,bearing,utm_crs,coastline_name,geometry,bbox,quadkey,isoCountryCodeAlpha2,admin_level_1_name,isoSubCountryCode,admin_level_2_name,bounding_quadkey
npartitions=1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
,object,float32,float32,float32,int32,int32,geometry,object,object,object,object,object,object,object
,...,...,...,...,...,...,...,...,...,...,...,...,...,...


## Compute the transects that span our region of interest

The transects are not in memory yet. In the next cell we will trigger the retrieval from cloud storage to local client by doing a `ddf.compute()` call. 

In [16]:
%%time
# Open the partitions lazily; nothing is downloaded by this line.
transects = dask_geopandas.read_parquet(hrefs, storage_options=storage_options)
transects_roi = (
    # Spatial join against the region of interest, reprojected to the CRS of the
    # transects and wrapped as a single-partition dask-geopandas frame.
    transects.sjoin(
        dask_geopandas.from_geopandas(roi.to_crs(transects.crs), npartitions=1)
    )
    # Remove the "index_right" bookkeeping column left over from the join.
    .drop(columns=["index_right"])
    # Trigger the actual read and join; the result is held in memory.
    .compute()
)

CPU times: user 8.56 s, sys: 2.59 s, total: 11.2 s
Wall time: 2min 1s


## Sorting the transects

Currently the transects are stored by QuadKey to enable fast reads through filter pushdown. If we want them sorted along the coastline instead, we can sort them by transect name as follows.

In [18]:
# Order the transects by name. `sort_values` returns a new frame, so re-running
# this cell gives the same result.
transects_roi = transects_roi.sort_values("tr_name")

## Compose the transect origins into coastlines

In [19]:
def transect_origins_to_coastline(df):
    """Connect the transect origins in ``df`` into a single line.

    ``df`` must expose ``lon`` and ``lat`` columns; the points are joined in row
    order. Returns a ``shapely.LineString``, or ``None`` when ``df`` has fewer
    than two rows.
    """
    # A coastline inside the region of interest may contain only one transect,
    # and a linestring needs at least two points.
    if len(df) < 2:
        return None

    origins = gpd.GeoSeries.from_xy(df.lon, df.lat)
    return shapely.LineString(origins.to_list())


# Build one line per coastline: group the transects by "coastline_name" and join
# the origins of each group, in their current row order, into a linestring.
coastline = (
    transects_roi.groupby("coastline_name")
    .apply(transect_origins_to_coastline)
    # Groups with fewer than two transects yield None; discard them.
    .dropna()
    .reset_index()
    # After reset_index the unnamed result column is labelled 0.
    .rename(columns={0: "geometry"})
)
# crs=4326 declares the lon/lat coordinates as WGS84.
coastline = gpd.GeoDataFrame(coastline, crs=4326)

In [20]:
# Interactive map of the coastlines, coloured by "coastline_name".
coastline.explore(column="coastline_name")

  elif pd.api.types.is_categorical_dtype(gdf[column]):


In [21]:
# Display the transects selected for the region of interest.
transects_roi

Unnamed: 0,tr_name,lon,lat,bearing,utm_crs,coastline_name,geometry,bbox,quadkey,isoCountryCodeAlpha2,admin_level_1_name,isoSubCountryCode,admin_level_2_name,bounding_quadkey
1613563,cl17245tr00086270,-71.452827,41.412983,104.746452,32619,17245,"LINESTRING (-71.46448 41.41503, -71.44118 41.4...","{'maxx': -71.44118140843295, 'maxy': 41.415027...",031132313233,US,United States,US-RI,Rhode Island,03023323201
1578919,cl17245tr00086370,-71.452576,41.413864,95.861092,32619,17245,"LINESTRING (-71.46450 41.41453, -71.44065 41.4...","{'maxx': -71.44064683359623, 'maxy': 41.414527...",031132313021,US,United States,US-RI,Rhode Island,03023323201
1596573,cl17245tr00086470,-71.452484,41.414761,95.861084,32619,17245,"LINESTRING (-71.46441 41.41543, -71.44056 41.4...","{'maxx': -71.44055828565972, 'maxy': 41.415425...",031132313123,US,United States,US-RI,Rhode Island,03023323201
1596571,cl17245tr00086570,-71.452583,41.415657,86.572044,32619,17245,"LINESTRING (-71.46450 41.41486, -71.44067 41.4...","{'maxx': -71.44067200460991, 'maxy': 41.416449...",031132313123,US,United States,US-RI,Rhode Island,03023323201
1596570,cl17245tr00086670,-71.452789,41.416546,80.638443,32619,17245,"LINESTRING (-71.46453 41.41483, -71.44105 41.4...","{'maxx': -71.44104533123958, 'maxy': 41.418259...",031132313123,US,United States,US-RI,Rhode Island,03023323201
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1615621,cl17592tr00005627,-70.810341,41.261826,46.085129,32619,17592,"LINESTRING (-70.81876 41.25545, -70.80192 41.2...","{'maxx': -70.80191607302854, 'maxy': 41.268205...",031132313301,US,United States,US-MA,Massachusetts,0302332330322
1615491,cl17592tr00005727,-70.811195,41.262451,44.753674,32619,17592,"LINESTRING (-70.81942 41.25593, -70.80297 41.2...","{'maxx': -70.80297184580373, 'maxy': 41.268977...",031132313301,US,United States,US-MA,Massachusetts,0302332330322
1615652,cl17592tr00005827,-70.812119,41.263020,22.419268,32619,17592,"LINESTRING (-70.81644 41.25462, -70.80780 41.2...","{'maxx': -70.80779933584851, 'maxy': 41.271413...",031132313301,US,United States,US-MA,Massachusetts,0302332330322
1615494,cl17592tr00005927,-70.813255,41.263069,334.029449,32619,17592,"LINESTRING (-70.80781 41.25506, -70.81871 41.2...","{'maxx': -70.8078103240905, 'maxy': 41.2710791...",031132313301,US,United States,US-MA,Massachusetts,0302332330322
