In [None]:
# ! pip install adlfs

In [10]:
%matplotlib inline 
import xarray as xr
import numpy as np
import os 

from adlfs import AzureBlobFileSystem
from datetime import datetime

import dask.distributed as dd
import dask
import rhg_compute_tools.kubernetes as rhgk

In [2]:
import gcsfs
# Google Cloud Storage filesystem handle used below to open the source zarr stores.
# NOTE(review): this import lives outside the main import cell, and the token path
# is a hard-coded absolute path -- consider moving both into the imports/config cells.
fs = gcsfs.GCSFileSystem(token='/opt/gcsfuse_tokens/impactlab-data.json')

In [3]:
# Start a dask cluster and a client connected to it.
# presumably `extra_pip_packages` installs adlfs on the workers -- confirm against rhg_compute_tools.
client, cluster = rhgk.get_standard_cluster(extra_pip_packages="adlfs")
# Bare expression so the notebook renders the cluster widget as the cell output.
cluster

VBox(children=(HTML(value='<h2>GatewayCluster</h2>'), HBox(children=(HTML(value='\n<div>\n<style scoped>\n    …

Validation code for zarr stores 

In [4]:
def test_for_nans(ds, var):
    """
    test for presence of NaNs
    """
    assert ds[var].isnull().sum() == 0, "there are nans!"
    
def test_date_range(ds, var): 
    """
    test that first date and last date in zarrs are correct
    """
    start_date = datetime.strptime('01 01 1994', '%d %m %Y')
    end_date = datetime.strptime('31 12 2015', '%d %m %Y')
    ds_dates = ds.indexes['time'].to_datetimeindex()
    assert ds_dates[0] == start_date, "1994 is not the start date"
    assert ds_dates[-1] == end_date, "zarr store does not contain the full time series"
    
def test_lat_lon_length(ds, var):
    """
    tests that full lat/lon arrays were written to zarr store
    """
    assert len(ds.latitude) == 640, "the full latitude array did not get written"
    assert len(ds.longitude) == 1280, "the full longitude array did not get written"
    
def validate_zarr_store(ds, var):
    """
    Run every zarr-store check on `ds` in turn: no NaNs in `var`, the expected
    first/last dates, and the expected latitude/longitude lengths.

    Parameters
    ----------
    ds : dataset opened from the zarr store
    var : name of the data variable being validated

    Raises
    ------
    AssertionError from the first check that fails.
    """
    checks = (test_for_nans, test_date_range, test_lat_lon_length)
    for check in checks:
        check(ds, var)

Validate the zarr stores by checking for: a) NaNs, b) the expected date range (1994-2015, so that the additional +/- 15 days can be sliced), and c) the expected lat/lon lengths. Other validation was covered in previous validation steps. 

In [5]:
# Variable names whose zarr stores are processed by the loops below.
variables = ["tasmax", "tasmin", "dtr", "pr"]

In [11]:
# Template for the source stores on GCS: filled with (variable, version).
zarr_storepath = 'gs://impactlab-data/climate/source_data/ERA-5/downscaling/{}.1995-2014.F320.{}.zarr'

for var in variables:
    print("validating {}".format(var))
    # The precipitation store is at v3; every other variable is at v2.
    version = 'v3' if var == 'pr' else 'v2'
    store = fs.get_mapper(zarr_storepath.format(var, version), check=False)
    with xr.open_zarr(store, consolidated=False) as ds:
        validate_zarr_store(ds, var)
        print("finished validating zarr store for {}".format(var))

validating pr
finished validating zarr store for pr


  ds_dates = ds.indexes['time'].to_datetimeindex()


Write the zarr stores to Azure (the account key has been removed from this notebook for security reasons).

In [None]:
# Azure Blob Storage filesystem handle used to write the output zarr stores.
# All credentials come from the environment so that no secret is ever typed into
# (or saved with) the notebook. If AZURE_STORAGE_ACCOUNT_KEY is unset, the account
# key falls back to an empty string.
fs_az = AzureBlobFileSystem(
    account_name='dc6',
    account_key=os.environ.get("AZURE_STORAGE_ACCOUNT_KEY", ''),
    client_id=os.environ.get("AZURE_CLIENT_ID"),
    client_secret=os.environ.get("AZURE_CLIENT_SECRET"),
    tenant_id=os.environ.get("AZURE_TENANT_ID"),
)

In [None]:
# Copy each source zarr store from GCS to Azure Blob Storage.
# Source template is filled with (variable, version); destination with (variable).
zarr_storepath = 'gs://impactlab-data/climate/source_data/ERA-5/downscaling/{}.1995-2014.F320.{}.zarr'
zarr_path = "clean-dev/ERA-5/F320/{}.1995-2015.F320.v2.zarr"

for var in variables:
    # The precipitation store is at v3 on GCS; every other variable is at v2.
    version = 'v3' if var == 'pr' else 'v2'
    store = fs.get_mapper(zarr_storepath.format(var, version), check=False)

    with xr.open_zarr(store, consolidated=False) as ds:
        # NOTE(review): the destination name is always "v2" and "1995-2015", even
        # though the source is "1995-2014" and `pr` is read from v3 -- confirm this
        # relabelling is intentional.
        az_zarr_store = fs_az.get_mapper(zarr_path.format(var), check=False)

        # mode="w" replaces any store that already exists at the destination.
        ds.to_zarr(az_zarr_store, consolidated=True, mode="w")
        print("wrote zarr store to Azure for {}".format(var))

In [None]:
# Ad-hoc sanity check on whichever dataset an earlier cell left bound to `ds`:
# report which of these variable names it contains.
# NOTE(review): leftover debugging that depends on kernel state -- consider removing.
for name, message in (("dtr", "yes"), ("tmax", "why is tmax here")):
    if name in ds.variables:
        print(message)