In [None]:
# Initialize notebook environment.
import boto3
import botocore
import datetime
import os.path
import xarray as xr
from dotenv import load_dotenv

In [None]:
# Let python-dotenv populate the process environment, then read the output
# directory from it. DATA_DIR is None when the variable is not defined.
load_dotenv()
DATA_DIR = os.getenv("DATA_DIR")

## Data Exploration and Download
Most of the code in this section is adapted from the planet-os notebooks repository, which is referenced in the README and linked here:
https://github.com/planet-os/notebooks/blob/master/aws/era5-s3-via-boto.ipynb

In [None]:
era5_bucket = 'era5-pds'

# Alternative that requires AWS access / secret keys:
# s3 = boto3.resource('s3')
# bucket = s3.Bucket(era5_bucket)

# Unsigned requests, so no AWS keys are required.
unsigned_config = botocore.client.Config(signature_version=botocore.UNSIGNED)
client = boto3.client('s3', config=unsigned_config)

In [None]:
# List the top-level "folder" prefixes of the bucket. Delimiter='/' makes S3
# group keys by their first path component and report them as CommonPrefixes.
paginator = client.get_paginator('list_objects_v2')
result = paginator.paginate(Bucket=era5_bucket, Delimiter='/')
for prefix in result.search('CommonPrefixes'):
    # search() yields None for a page that has no CommonPrefixes entry;
    # skip it instead of failing on None.get(...).
    if prefix is not None:
        print(prefix.get('Prefix'))

In [None]:
# List the object keys stored under the year/month prefix of the chosen date.
keys = []
date = datetime.date(2022, 5, 1)  # update to desired date
prefix = date.strftime('%Y/%m/')
month_label = date.strftime('%B, %Y')

response = client.list_objects_v2(Bucket=era5_bucket, Prefix=prefix)
# Fall back to an empty dict so a missing metadata entry reports an error
# below instead of raising AttributeError.
response_meta = response.get('ResponseMetadata') or {}

if response_meta.get('HTTPStatusCode') == 200:
    contents = response.get('Contents')
    # Treat a missing or empty 'Contents' entry as "nothing under this prefix".
    if not contents:
        print("No objects are available for %s" % month_label)
    else:
        keys = [obj.get('Key') for obj in contents]
        print("There are %s objects available for %s\n--" % (len(keys), month_label))
        for k in keys:
            print(k)
        # A single list_objects_v2 call returns one page only; make it visible
        # when the listing is incomplete.
        if response.get('IsTruncated'):
            print("Warning: listing truncated after %s keys; more objects exist under %s" % (len(keys), prefix))
else:
    print("There was an error with your request.")

In [None]:
# select date and variable of interest
date = datetime.date(2022, 5, 1)
var = 'precipitation_amount_1hour_Accumulation'

# file path patterns for remote S3 objects and corresponding local file
s3_data_ptrn = '{year}/{month}/data/{var}.nc'
data_file_ptrn = '{year}{month}_{var}.nc'

year = date.strftime('%Y')
month = date.strftime('%m')
s3_data_key = s3_data_ptrn.format(year=year, month=month, var=var)
data_file = data_file_ptrn.format(year=year, month=month, var=var)

# Download only when no local copy exists, so re-running the cell is cheap.
if not os.path.isfile(data_file):
    print("Downloading %s from S3..." % s3_data_key)
    client.download_file(era5_bucket, s3_data_key, data_file)

ds = xr.open_dataset(data_file)
# info is a method: call it to print the summary rather than displaying the
# bound-method object.
ds.info()

In [None]:
# Show the units string recorded in the encoding of the time1 coordinate.
print(ds["time1"].encoding["units"])

In [None]:
# Inspect the keys exposed by the dataset mapping.
ds.keys()

In [None]:
# Inspect the coordinate objects attached to the dataset.
ds.coords.values()

## Extract and save as parquet
Extract a small subset of the data by date to test the workflow locally.

In [None]:
# Restrict the dataset to the 22:00-23:00 window of 2022-05-01 along time1.
subset_window = slice('2022-05-01T22:00:00', '2022-05-01T23:00:00')
ds_small = ds.sel(time1=subset_window)

In [None]:
# Convert the xarray subset to a pandas DataFrame so it can be written out
# with DataFrame.to_parquet.
df_f = ds_small.to_dataframe()

In [None]:
# Save the subset locally as a gzip-compressed parquet file inside DATA_DIR.
if not DATA_DIR:
    # Without this check the output path would silently start with "None".
    raise RuntimeError("DATA_DIR is not set; define it in .env or the environment before saving.")
# os.path.join picks the right separator for the platform; the previous
# hard-coded backslash was an invalid escape and only worked on Windows.
parquet_path = os.path.join(DATA_DIR, 'df.parquet.gzip')
df_f.to_parquet(parquet_path, compression='gzip')