# Maridata S3 ODC ZARR Experiments

* Contact: SSH, EHJ
* Requirements: Python, venv
* Credentials required for accessing the S3 bucket are available via the project password safe
  * Credentials should be provided using a `.env` file with the following entries: `AWS_ACCESS_KEY_ID`, `AWS_SECRET_ACCESS_KEY`, and `AWS_BUCKET_REGION`.
    The first two are the standard environment variable names that boto3 picks up by itself.

In [None]:
%pip install zarr boto3 dotenv xarray ipykernel

## First Contact With an S3 Bucket

In [None]:
import boto3
import os
from dotenv import load_dotenv

# Load AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY and AWS_BUCKET_REGION from the .env file.
load_dotenv()
session = boto3.Session(aws_access_key_id=os.getenv('AWS_ACCESS_KEY_ID'),
                        aws_secret_access_key=os.getenv('AWS_SECRET_ACCESS_KEY'),
                        region_name=os.getenv('AWS_BUCKET_REGION'))
s3 = session.client('s3')

#
#   https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/s3/client/list_buckets.html
#
buckets = s3.list_buckets()
# Fail with a clear message instead of an IndexError when no bucket is visible.
if not buckets['Buckets']:
    raise RuntimeError("The configured credentials do not list any S3 bucket")
print(f"The first bucket: '{buckets['Buckets'][0]}'")
bucket = buckets['Buckets'][0]['Name']
#
#   https://boto3.amazonaws.com/v1/documentation/api/latest/guide/s3-uploading-files.html
#
obj_key = 'uploaded/s3/new/path/requirements.txt'
s3.upload_file(Filename='requirements.txt', Bucket=bucket, Key=obj_key)
#
#   https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/s3/client/list_objects_v2.html
#
list_prefix = 'zarr'
objs = s3.list_objects_v2(Bucket=bucket, Prefix=list_prefix, MaxKeys=10)
# Only the last expression of a cell is displayed, so intermediate results are printed.
print(objs)
# 'NextContinuationToken' is only part of the response when the listing was truncated.
if objs.get('IsTruncated'):
    # Continue the same listing: keep the prefix and pass the token of the first page.
    objs2 = s3.list_objects_v2(Bucket=bucket, Prefix=list_prefix, MaxKeys=16,
                               ContinuationToken=objs['NextContinuationToken'])
    print(objs2)
else:
    objs2 = None
    print(f"No further objects below prefix '{list_prefix}'")
#
#   https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/s3/client/delete_object.html
#   https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/s3/client/delete_objects.html
#
# Remove the test object uploaded above again.
delete_response = s3.delete_object(Bucket=bucket, Key=obj_key)
delete_response

## Convert NETCDF to ZARR Using xarray

* Specification: <https://zarr.readthedocs.io/en/stable/spec/v2.html>
* Python libraries:
  * zarr: [api docs](https://zarr.readthedocs.io/en/stable/api.html), [@pypi](https://pypi.org/project/zarr/)
  * xarray: [api docs](https://docs.xarray.dev/en/stable/generated/xarray.Dataset.to_zarr.html#xarray.Dataset.to_zarr), [@pypi](https://pypi.org/project/xarray/)


In [None]:
import xarray as xr
#
# Directory that receives the local Zarr store
#
zarr_store = './zarr/'
#
# Open the NetCDF input file with the netcdf4 backend
# https://docs.xarray.dev/en/stable/getting-started-guide/faq.html#id6
# https://docs.xarray.dev/en/stable/generated/xarray.open_dataset.html#xarray.open_dataset
#
ds = xr.open_dataset(filename_or_obj="./data/simu_alternateWaves_20230421_20230425.nc",
                     engine="netcdf4")
#
# Write the dataset to the store in write mode 'w' with consolidated metadata
# https://docs.xarray.dev/en/stable/user-guide/io.html#io-zarr
# https://docs.xarray.dev/en/stable/generated/xarray.Dataset.to_zarr.html#xarray.Dataset.to_zarr
#
ds.to_zarr(zarr_store, mode='w', consolidated=True)

## Upload Zarr Store to S3 Bucket via Boto3

* Documentation: <https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/s3/client/put_object.html>
* AWS storage classes for s3 objects: <https://docs.aws.amazon.com/AmazonS3/latest/userguide/storage-class-intro.html>

In [None]:
import boto3
import os
from dotenv import load_dotenv
import zarr
import xarray as xr

# Load the AWS credentials and region from the .env file.
load_dotenv()

ds = xr.open_dataset("./data/simu_alternateWaves_20230421_20230425.nc", engine="netcdf4")

session = boto3.Session(aws_access_key_id=os.getenv('AWS_ACCESS_KEY_ID'),
                        aws_secret_access_key=os.getenv('AWS_SECRET_ACCESS_KEY'),
                        region_name=os.getenv('AWS_BUCKET_REGION'))
s3 = session.client('s3')

zarr_store = './zarr/'
bucket = '52n-maridata'
zarr_prefix = 'opendatacube/zarr/forecast/waves/'

# Write the dataset to the local store first, then walk the store to upload it.
ds.to_zarr(store=zarr_store, mode='w', consolidated=True)
store = zarr.DirectoryStore(zarr_store)

# Derive the local path from zarr_store so that both stay in sync when the
# store location is changed.
local_root = zarr_store.rstrip('/')
for filename in store.keys():
    s3_key = zarr_prefix + filename
    local_filename = f"{local_root}/{filename}"
    print(f"Upload local file '{local_filename}' to s3 key 's3://{bucket}/{s3_key}'")
    s3.upload_file(Filename=local_filename, Bucket=bucket, Key=s3_key)

# TODO: s3.delete_objects could be used to remove the uploaded keys again.

## Access S3 Bucket via s3fs

* Documentation: <https://s3fs.readthedocs.io/en/latest/>
* Python library: [api docs](https://s3fs.readthedocs.io/en/latest/api.html), [@pypi](https://pypi.org/project/s3fs/)

In [None]:
import s3fs
import os
from dotenv import load_dotenv
import zarr
import xarray as xr
import fsspec

# Pull the AWS settings from the .env file into the process environment
load_dotenv()

# Target bucket and key prefix of the Zarr store on S3
bucket = '52n-maridata'
zarr_prefix = 'opendatacube/zarr/forecast/waves/'

#   https://s3fs.readthedocs.io/en/latest/api.html#s3fs.core.S3FileSystem
#   https://github.com/fsspec/s3fs/blob/main/s3fs/core.py

# S3 file system configured with the credentials and region from the environment
s3 = s3fs.S3FileSystem(
    key=os.getenv('AWS_ACCESS_KEY_ID'),
    secret=os.getenv('AWS_SECRET_ACCESS_KEY'),
    client_kwargs={'region_name': os.getenv('AWS_BUCKET_REGION')},
)
# s3.ls('52n-maridata')  # uncomment to check the bucket in the s3 file system

# Read the NetCDF4 input file with xarray
ds = xr.open_dataset("./data/simu_alternateWaves_20230421_20230425.nc", engine="netcdf4")

# Expose the S3 prefix as a key/value store and write the dataset into it as a Zarr store
store = s3fs.S3Map(root=f"s3://{bucket}/{zarr_prefix}", s3=s3)
ds.to_zarr(store=store, mode='w', consolidated=True)

#   List S3 bucket content
#
# zarr_prefix = 'opendatacube/zarr/forecast/waves/'
# s3_fs.ls(f"{bucket}/{zarr_prefix}")
#
# folder is created but not all the other files
#with s3_fs.open(f"{bucket}/{zarr_prefix}", 'wb') as s3_folder:
#    store = zarr.DirectoryStore(s3_folder.path)
#    ds.to_zarr(store, mode='w', consolidated=True)

## OpenDataCube and ZARR

* build `datacube-zarr` from source: <https://github.com/opendatacube/datacube-zarr>
* fix version of gdal to match the version installed on the system: `gdalinfo --version`

In [None]:
# Install the Open Data Cube core package; later cells import `datacube` from it.
%pip install datacube

In [None]:
!sudo apt install gdal-bin 

In [None]:
!git clone https://github.com/opendatacube/datacube-zarr
%pip install --upgrade gdal==3.4.1
%pip install --upgrade --editable "./.[test]"

## Saving Dataset metadata to the s3 bucket

In [None]:
import xarray as xr
# import numpy as np
# import matplotlib.pyplot as plt
import uuid
import yaml
import boto3
from dotenv import load_dotenv 
# Make the entries of the .env file available as environment variables.
load_dotenv()

# Dataset whose variables and grid are described by the metadata document below.
df = xr.open_dataset('./data/waves1.nc')

# Generate a random UUID. It is named dataset_id so the builtin id() is not shadowed.
# NOTE(review): the metadata document below uses a hard-coded id instead of this
# value - confirm which of the two is intended.
dataset_id = str(uuid.uuid4())


def generate_measurement_dict(dataset=None,
                              path='s3://52n-maridata/opendatacube/zarr/forecast/waves/'):
    """Build the ``measurements`` section of the dataset metadata document.

    Every data variable of the dataset except the last one gets an entry that
    points at the Zarr store and names the variable as the layer.
    NOTE(review): the last data variable is skipped exactly as before; it is
    presumably a non-measurement variable (e.g. a CRS placeholder) - confirm.

    Args:
        dataset: Object exposing ``data_vars`` (e.g. an ``xarray.Dataset``).
            Defaults to the notebook-level ``df``.
        path: Location of the Zarr store written into every entry.

    Returns:
        dict mapping the variable name to ``{'path': ..., 'layer': ...}``.
        The entries are in reverse order of the data variables, matching the
        merge order of the previous implementation.
    """
    if dataset is None:
        # Resolved at call time so the function follows the notebook's current dataset.
        dataset = df
    names = [str(name) for name in dataset.data_vars][:-1]
    return {name: {'path': path, 'layer': name} for name in reversed(names)}


# `os` is needed for the credential lookup further down; importing it here lets
# the cell run in a fresh kernel as well.
import os

# Define the metadata information (dataset document that is indexed by the datacube).
# NOTE(review): the id is hard-coded although a random UUID is generated earlier
# in this cell - confirm which one is intended.
# NOTE(review): the polygon lists (-90, -180) ... (90, 180); if the coordinates are
# meant as (x=lon, y=lat) pairs the two axes look swapped - confirm.
# NOTE(review): 'shape' is built as [shape[2], shape[1]] of VHM0; verify this
# matches the [rows, columns] order expected for the grid.
# NOTE(review): the `.rio` accessor is registered by importing rioxarray, which this
# cell does not import - confirm it is imported elsewhere before running.
metadata = {
    'id': '272302c9-1449-4a33-8166-4b6083a8a715',
    '$schema': 'https://schemas.opendatacube.org/dataset',
    'product': {
        'name': 'waves',
    },
    'crs': 'epsg:4326',
    'geometry': {
        'type': 'Polygon',
        'coordinates': [[[-90.0, -180.0], [-90.0, 180.0], [90.0, 180.0], [90.0, -180.0], [-90.0, -180.0]]],
    },
    'grids': {
        'default': {
            'shape': [df['VHM0'].shape[2], df['VHM0'].shape[1]],
            'transform': list(df['VHM0'].rio.transform()),
        },
    },
    'measurements': generate_measurement_dict(),
    'properties': {
        'eo:platform': 'na',
        'eo:instrument': 'na',
        # The first time step is used for both timestamps.
        'datetime': str(df.time.values[0]),
        'odc:processing_datetime': str(df.time.values[0]),
        'odc:file_format': 'Zarr',
    },
    'lineage': {},
}

# Print the metadata information
# print(metadata)
# Define S3 bucket and key; the key is derived from the product name.
bucket_name = '52n-maridata'
key = f'opendatacube/zarr/forecast/waves/eo3_{metadata["product"]["name"]}_dataset.yaml'

# Convert the dictionary to a YAML string
yaml_str = yaml.dump(metadata, default_flow_style=False,
                     line_break='\n', allow_unicode=True)

# Upload YAML file to S3 bucket.
# The region is read from AWS_BUCKET_REGION, the name documented for the .env file
# and used by the other cells; AWS_DEFAULT_REGION is kept as a fallback.
session = boto3.Session(
    aws_access_key_id=os.environ.get('AWS_ACCESS_KEY_ID'),
    aws_secret_access_key=os.environ.get('AWS_SECRET_ACCESS_KEY'),
    region_name=os.environ.get('AWS_BUCKET_REGION') or os.environ.get('AWS_DEFAULT_REGION'))

s3 = session.client('s3')
# Store the YAML document under the key; the response is shown as the cell output.
s3.put_object(Body=yaml_str, Bucket=bucket_name, Key=key)

## Index metadata from s3 to the opendatacube
```
datacube -v system init
datacube product add s3://52n-maridata/opendatacube/zarr/forecast/waves/eo3_waves_product.yaml
datacube dataset add s3://52n-maridata/opendatacube/zarr/forecast/waves/eo3_waves_dataset.yaml
``` 

#### Import and initialize opendatacube

In [None]:
import datacube 
# Data Cube handle used by all following cells. No arguments are passed, so
# presumably the default datacube configuration is used - TODO confirm.
dc = datacube.Datacube()

### List available products

In [None]:
# Ask the datacube for its products and show the result as the cell output.
product_list = dc.list_products()
product_list

### Check the measurements 

In [None]:
# Ask the datacube for its measurements and show the result as the cell output.
# NOTE(review): no product filter is passed - presumably this covers the
# measurements of every indexed product, not only 'waves'; confirm.
measurement_list = dc.list_measurements()
measurement_list

### List available datasets/measurements

In [None]:
# Convert the measurement table to a dict; the 'name' entry is itself a mapping
# whose values are the measurement names (presumably a pandas DataFrame - confirm).
a=measurement_list.to_dict()
# NOTE(review): `vars` shadows the builtin of the same name; the name is kept
# because the plotting cell iterates over it.
vars= list(a['name'].values())

### Load the dataset

In [None]:
# Load all listed measurements of the 'waves' product in EPSG:4326 with a
# resolution of (1, 1) and an alignment of (0.5, 0.5); show the result.
ds_datacube = dc.load(
    'waves',
    measurements=list(a['name'].values()),
    output_crs="epsg:4326",
    resolution=(1, 1),
    align=(0.5, 0.5),
    crs='EPSG:4326',
)
ds_datacube

### Visualization of datasets

In [None]:
import matplotlib.pyplot as plt

# Plot the data: one 8 x 6 inch figure is created per measurement name in `vars`.
for var in vars:
    # A new figure is opened before plotting so that each variable gets its own
    # (presumably the plot is drawn into the current figure - confirm).
    fig = plt.figure(figsize=(8, 6))
    ds_datacube[var].plot()
# Display all figures once, after the loop has finished.
plt.show()