In [1]:
import pystac
import urlpath
import stac2dcache
import xml.etree.ElementTree as ET

In [2]:
# display the version of the stac2dcache package in use
stac2dcache.__version__

'0.1.0'

# Link Sentinel-2 L1C assets to GCS

The Sentinel-2 L1C dataset on AWS is only available as a requester-pays bucket. We thus replace the links to these assets with links to the corresponding dataset on Google Cloud Storage (GCS), which is publicly available.

We start by reading the catalog, which we have saved on the dCache storage, using macaroon-based authentication:

In [3]:
# set up access to the dCache storage, using the token stored in macaroon.dat
dcache = stac2dcache.configure(filesystem="dcache", token_filename="macaroon.dat")

In [4]:
# ID of the catalog, which is also the name of its directory on the storage
catalog_id = "red-glacier_sentinel-2"
# root URL of the catalog on the dCache storage
catalog_url = (
    f"https://webdav.grid.surfsara.nl:2880/pnfs/grid.sara.nl"
    f"/data/eratosthenes/disk/{catalog_id}"
)

In [5]:
# read the root document (catalog.json) of the catalog from the storage
# NOTE(review): presumably relies on the I/O configured through
# stac2dcache.configure to resolve the remote URL - confirm
catalog = pystac.Catalog.from_file(f"{catalog_url}/catalog.json")

The following function parses the `manifest.safe` file, where the paths to the assets are provided, and composes the full asset URLs:

In [6]:
def _get_band_urls(url):
    """
    Collect the URLs of the band images listed in a product manifest.

    The `manifest.safe` file located directly under `url` is retrieved and
    parsed. Every `dataObject` element whose `ID` attribute contains
    'IMG_DATA' contributes one entry. Elements lacking a
    `byteStream/fileLocation` child with an `href` attribute are skipped.

    :param url: base URL of the product, i.e. the location that contains
        the `manifest.safe` file
    :return: dictionary mapping the band label (last '_'-separated token of
        the file stem, e.g. 'B01') to the full URL of the corresponding file
    """
    url = urlpath.URL(url)
    manifest_url = url / 'manifest.safe'
    manifest = manifest_url.get_text()

    band_urls = {}

    root = ET.fromstring(manifest)
    for data_object in root.iter('dataObject'):
        object_id = data_object.get('ID')  # not `id`: keep the built-in usable
        if object_id is None or 'IMG_DATA' not in object_id:
            continue

        # `find` returns None when the element is absent; guard against it
        # instead of failing with an AttributeError on a malformed entry
        file_location = data_object.find('byteStream/fileLocation')
        href = None if file_location is None else file_location.get('href')
        if href is None:
            continue

        band_url = url / href
        band = band_url.stem.split('_')[-1]  # 'XX_XXXX_B01.jp2' -> 'B01'
        band_urls[band] = band_url.as_uri()

    return band_urls

We replace the URLs in all the items of the Sentinel-2 L1C collection:

In [7]:
# select the Sentinel-2 L1C collection among the children of the catalog
collection_id = "sentinel-s2-l1c"
collection = catalog.get_child(collection_id)
# get_child returns None for an unknown ID: fail here with a clear message
# rather than later with an AttributeError on None
assert collection is not None, f"collection '{collection_id}' not found in catalog"

In [8]:
BASE_URL = 'http://storage.googleapis.com/gcp-public-data-sentinel-2/tiles'

for item in collection.get_all_items():

    # product location, built from the item properties:
    # <BASE_URL>/<UTM zone>/<latitude band>/<grid square>/<product ID>.SAFE
    product_url = '{}/{:02d}/{}/{}/{}.SAFE'.format(
        BASE_URL,
        *(item.properties[key] for key in (
            'sentinel:utm_zone',
            'sentinel:latitude_band',
            'sentinel:grid_square',
            'sentinel:product_id',
        ))
    )

    band_urls = _get_band_urls(product_url)

    # re-link only the bands that are already present among the item assets
    for band, href in band_urls.items():
        if band not in item.assets:
            continue
        item.assets[band].href = href

    # the 'TCI' image listed in the manifest backs the 'overview' asset
    item.assets['overview'].href = band_urls['TCI']

We update the catalog on the dCache storage with the new links:

In [9]:
# save catalog to storage: write the catalog with the updated asset links
# under catalog_url, as a self-contained catalog
catalog.normalize_and_save(catalog_url, catalog_type='SELF_CONTAINED')