**Step 3: Remove leap days via `convert_calendar` and add a `dayofyear` coordinate to yearly files. Convert netCDF files to zarr stores.**

In [2]:
! pip install xclim 

Collecting xclim
  Using cached xclim-0.24.0-py3-none-any.whl (272 kB)
Collecting boltons>=20.1
  Using cached boltons-20.2.1-py2.py3-none-any.whl (170 kB)
Collecting cftime>=1.4.1
  Using cached cftime-1.4.1-cp38-cp38-manylinux2014_x86_64.whl (322 kB)
Installing collected packages: cftime, boltons, xclim
  Attempting uninstall: cftime
    Found existing installation: cftime 1.3.0
    Uninstalling cftime-1.3.0:
      Successfully uninstalled cftime-1.3.0
Successfully installed boltons-20.2.1 cftime-1.4.1 xclim-0.24.0


In [1]:
%matplotlib inline
import xarray as xr
import numpy as np 
import os 
import pandas as pd 
import matplotlib.pyplot as plt 

import intake
import zarr
import gcsfs

import dask
import dask.array as da
import dask.distributed as dd
import rhg_compute_tools.kubernetes as rhgk

from xclim.core.calendar import convert_calendar

In [2]:
import warnings
# Blanket filter: every Python warning raised in this kernel after this point is
# suppressed, not just the ones this notebook is known to trigger.
warnings.filterwarnings("ignore")

For bias correction, we will be using 1995-2014 (the CMIP6 reference period). 

In [4]:
# Years 1995 through 2014 inclusive (np.arange excludes the stop value, 2015).
years = np.arange(1995, 2015)

Helper function for removing leap days and converting the calendar to `noleap`:

In [5]:
def convert_to_noleap_calendar(ds, target='noleap'):
    """Convert the calendar of ``ds`` with xclim's ``convert_calendar``.

    This is a thin wrapper whose only required argument is the dataset, so it
    can be handed to ``xr.open_mfdataset`` as the ``preprocess`` callback (as
    done in ``create_era5_zarr_store``).

    Parameters
    ----------
    ds : xarray object
        Data to convert; passed straight through to ``convert_calendar``.
    target : str, optional
        Name of the calendar to convert to, forwarded as ``target=``.
        Defaults to ``'noleap'``.

    Returns
    -------
    The object returned by ``convert_calendar(ds, target=target)``.
    """
    return convert_calendar(ds, target=target)

In [6]:
# Provide `dayofyear` coordinate to data. Needs to be in full years
def assign_dayofyear_coord(da):
    """Attach a ``dayofyear`` coordinate (1..365, repeated per year) along ``time``.

    The coordinate is built positionally rather than from the timestamps: it is
    the sequence 1..365 repeated once per calendar year spanned by ``da.time``.
    The input must therefore hold complete 365-day years with no gaps (i.e.
    leap days already removed).

    Note: the parameter name ``da`` shadows the notebook-level ``dask.array``
    alias inside this function; it is kept for backward compatibility.

    Parameters
    ----------
    da : xarray.DataArray or xarray.Dataset
        Object with a ``time`` dimension/coordinate supporting the ``.dt``
        accessor.

    Returns
    -------
    Same type as ``da``
        ``da`` with an extra ``dayofyear`` coordinate on ``time``, after
        calling ``.persist()`` on the result.

    Raises
    ------
    ValueError
        If the length of ``time`` is not exactly 365 times the number of years
        between the first and last year (inclusive).
    """
    days_per_year = 365  # no-leap calendar: every year has exactly 365 days

    first_year = int(da.time.dt.year.min())
    last_year = int(da.time.dt.year.max())
    n_years = last_year - first_year + 1

    # Fail early with an actionable message instead of a size-mismatch error
    # from xarray when the data is not made of whole no-leap years.
    expected_len = days_per_year * n_years
    actual_len = da.sizes['time']
    if actual_len != expected_len:
        raise ValueError(
            "assign_dayofyear_coord needs complete 365-day years: expected "
            "{} time steps for {}-{}, found {}".format(
                expected_len, first_year, last_year, actual_len))

    dayofyear = xr.DataArray(np.tile(np.arange(1, days_per_year + 1), n_years),
                             dims=('time'),
                             coords={'time': da.time})
    da_wcoords = da.assign_coords(dayofyear=dayofyear).persist()
    return da_wcoords

In [3]:
# Google Cloud Storage filesystem handle used by the cells below for `fs.exists`
# and `fs.get_mapper`.
# NOTE(review): the token is a hard-coded absolute path (presumably a
# service-account JSON file mounted on the hub) — confirm it exists where this runs.
fs = gcsfs.GCSFileSystem(token='/opt/gcsfuse_tokens/impactlab-data.json')

In [8]:
# Obtain a Dask client/cluster pair from the rhg_compute_tools Kubernetes helper,
# then ask the cluster to scale to 10 (presumably workers — TODO confirm units).
client, cluster = rhgk.get_standard_cluster()
cluster.scale(10)

In [11]:
# Display the client summary (scheduler address, dashboard link, workers/cores/memory).
client

0,1
Client  Scheduler: gateway://traefik-impactlab-hub-dask-gateway.impactlab-hub:80/impactlab-hub.38d36f64266b48c88e00c6917d99287e  Dashboard: /services/dask-gateway/clusters/impactlab-hub.38d36f64266b48c88e00c6917d99287e/status,Cluster  Workers: 8  Cores: 8  Memory: 96.64 GB


In [21]:
# NOTE(review): the recorded execution count (In [21]) shows this cell was run
# after the store-writing and read-back cells that follow it in the file. Run
# top-to-bottom, it would close the cluster before that work starts —
# TODO confirm whether this cell should move to the end of the notebook.
cluster.close()

In [12]:
def create_era5_zarr_store(variable, years):
    """Write the yearly ERA-5 daily netCDF files for ``variable`` to one zarr store.

    The yearly files are opened together with ``xr.open_mfdataset``; each file
    is passed through ``convert_to_noleap_calendar`` as it is loaded, so leap
    days are removed and the calendar updated before concatenation. The
    combined dataset gets a ``method`` attribute recording that step and is
    written to
    ``gs://impactlab-data/climate/source_data/ERA-5/downscaling/<variable>.<first>-<last>.0p25.zarr``,
    where ``<first>``/``<last>`` are the smallest and largest entries of
    ``years`` (``1995-2014`` for the notebook's default range).

    If the store already exists nothing is opened or written.

    Relies on the notebook-level ``fs`` (GCS filesystem) and
    ``convert_to_noleap_calendar``.

    Parameters
    ----------
    variable : str
        ERA-5 variable name as used in the source paths (e.g. ``'tmax'``).
    years : iterable of int
        Years to load; one ``<variable>_daily_<year>-<year>.nc`` file per year.

    Returns
    -------
    None

    Raises
    ------
    ValueError
        If ``years`` is empty.
    """
    years = list(years)
    if not years:
        raise ValueError("`years` must contain at least one year")
    first_year, last_year = min(years), max(years)

    # Name the store after the years actually requested, so a call with a
    # different range can neither be skipped because of, nor overwrite, the
    # 1995-2014 store.
    store_filename = ('gs://impactlab-data/climate/source_data/ERA-5/downscaling/'
                      '{}.{}-{}.0p25.zarr'.format(variable, first_year, last_year))

    # Check for an existing store before touching the source files: opening and
    # preprocessing every yearly file is wasted work if nothing will be written.
    if fs.exists(store_filename):
        return

    # create list of filenames to load
    source_dir = '/gcs/impactlab-data/climate/source_data/ERA-5/day/{}/v1.1'.format(variable)
    filenames = [os.path.join(source_dir, '{}_daily_{}-{}.nc'.format(variable, year, year))
                 for year in years]

    store = fs.get_mapper(store_filename, check=False)

    # `preprocess=convert_to_noleap_calendar` removes leap days and updates the
    # calendar on each yearly file of daily data as it is being concatenated.
    # The context manager closes the source files once the write has finished.
    with xr.open_mfdataset(filenames,
                           preprocess=convert_to_noleap_calendar) as ds_allyears:
        attrsdt = {'method': 'Changed calendar to no-leap and removed leap days'}
        ds_allyears.attrs.update(attrsdt)

        # save as a zarr store for rechunking
        ds_allyears.to_zarr(store, consolidated=True, mode="w")

In [13]:
# ERA-5 variable names; each is used to build both the source file paths and the
# output store name in `create_era5_zarr_store`.
era5_variables = ['tmax', 'tmin', 'precip_total']

Create zarr stores for `tmax`, `tmin` and `precip_total`:

In [14]:
# Build one zarr store per variable for the reference-period years; variables
# whose store already exists are left untouched by `create_era5_zarr_store`.
for variable in era5_variables: 
    create_era5_zarr_store(variable, years)

tmax


    >>> with dask.config.set(**{'array.slicing.split_large_chunks': False}):
    ...     array[indexer]

To avoid creating the large chunks, set the option
    >>> with dask.config.set(**{'array.slicing.split_large_chunks': True}):
    ...     array[indexer]
  return self.array[key]
    >>> with dask.config.set(**{'array.slicing.split_large_chunks': False}):
    ...     array[indexer]

To avoid creating the large chunks, set the option
    >>> with dask.config.set(**{'array.slicing.split_large_chunks': True}):
    ...     array[indexer]
  return self.array[key]
    >>> with dask.config.set(**{'array.slicing.split_large_chunks': False}):
    ...     array[indexer]

To avoid creating the large chunks, set the option
    >>> with dask.config.set(**{'array.slicing.split_large_chunks': True}):
    ...     array[indexer]
  return self.array[key]
    >>> with dask.config.set(**{'array.slicing.split_large_chunks': False}):
    ...     array[indexer]

To avoid creating the large chunks, set the

<xarray.Dataset>
Dimensions:    (latitude: 721, longitude: 1440, time: 7300)
Coordinates:
  * longitude  (longitude) float32 0.0 0.25 0.5 0.75 ... 359.0 359.2 359.5 359.8
  * latitude   (latitude) float32 90.0 89.75 89.5 89.25 ... -89.5 -89.75 -90.0
  * time       (time) object 1995-01-01 00:00:00 ... 2014-12-31 00:00:00
Data variables:
    tmax       (time, latitude, longitude) float32 dask.array<chunksize=(365, 721, 1440), meta=np.ndarray>
Attributes:
    author:   Meredith Fish
    contact:  meredith.fish@rutgers.edu
    project:  impactlab-rhg/climate/source-data/ERA-5
    source:   impactlab-rhg/climate/downscaled/ERA-5/hourly
    created:  2021-02-16
    method:   Changed calendar to no-leap and removed leap days
tmin


    >>> with dask.config.set(**{'array.slicing.split_large_chunks': False}):
    ...     array[indexer]

To avoid creating the large chunks, set the option
    >>> with dask.config.set(**{'array.slicing.split_large_chunks': True}):
    ...     array[indexer]
  return self.array[key]
    >>> with dask.config.set(**{'array.slicing.split_large_chunks': False}):
    ...     array[indexer]

To avoid creating the large chunks, set the option
    >>> with dask.config.set(**{'array.slicing.split_large_chunks': True}):
    ...     array[indexer]
  return self.array[key]
    >>> with dask.config.set(**{'array.slicing.split_large_chunks': False}):
    ...     array[indexer]

To avoid creating the large chunks, set the option
    >>> with dask.config.set(**{'array.slicing.split_large_chunks': True}):
    ...     array[indexer]
  return self.array[key]
    >>> with dask.config.set(**{'array.slicing.split_large_chunks': False}):
    ...     array[indexer]

To avoid creating the large chunks, set the

<xarray.Dataset>
Dimensions:    (latitude: 721, longitude: 1440, time: 7300)
Coordinates:
  * longitude  (longitude) float32 0.0 0.25 0.5 0.75 ... 359.0 359.2 359.5 359.8
  * latitude   (latitude) float32 90.0 89.75 89.5 89.25 ... -89.5 -89.75 -90.0
  * time       (time) object 1995-01-01 00:00:00 ... 2014-12-31 00:00:00
Data variables:
    tmin       (time, latitude, longitude) float32 dask.array<chunksize=(365, 721, 1440), meta=np.ndarray>
Attributes:
    author:   Meredith Fish
    contact:  meredith.fish@rutgers.edu
    project:  impactlab-rhg/climate/source-data/ERA-5
    source:   impactlab-rhg/climate/downscaled/ERA-5/hourly
    created:  2021-02-16
    method:   Changed calendar to no-leap and removed leap days
precip_total


    >>> with dask.config.set(**{'array.slicing.split_large_chunks': False}):
    ...     array[indexer]

To avoid creating the large chunks, set the option
    >>> with dask.config.set(**{'array.slicing.split_large_chunks': True}):
    ...     array[indexer]
  return self.array[key]
    >>> with dask.config.set(**{'array.slicing.split_large_chunks': False}):
    ...     array[indexer]

To avoid creating the large chunks, set the option
    >>> with dask.config.set(**{'array.slicing.split_large_chunks': True}):
    ...     array[indexer]
  return self.array[key]
    >>> with dask.config.set(**{'array.slicing.split_large_chunks': False}):
    ...     array[indexer]

To avoid creating the large chunks, set the option
    >>> with dask.config.set(**{'array.slicing.split_large_chunks': True}):
    ...     array[indexer]
  return self.array[key]
    >>> with dask.config.set(**{'array.slicing.split_large_chunks': False}):
    ...     array[indexer]

To avoid creating the large chunks, set the

<xarray.Dataset>
Dimensions:       (latitude: 721, longitude: 1440, time: 7300)
Coordinates:
  * time          (time) object 1995-01-01 00:00:00 ... 2014-12-31 00:00:00
  * latitude      (latitude) float32 90.0 89.75 89.5 ... -89.5 -89.75 -90.0
  * longitude     (longitude) float32 0.0 0.25 0.5 0.75 ... 359.2 359.5 359.8
Data variables:
    precip_total  (time, latitude, longitude) float32 dask.array<chunksize=(365, 721, 1440), meta=np.ndarray>
Attributes:
    version:  v1.1
    author:   Meredith Fish
    contact:  meredith.fish@rutgers.edu
    project:  historical-climate-data-diagnostics
    source:   yearly files of ERA-5 daily data created from /gcs/impactlab-da...
    created:  2021-03-02
    method:   Changed calendar to no-leap and removed leap days
    units:    m


Check that xarray can read back one of the zarr stores written above:

In [19]:
# Re-open the precip_total store lazily from GCS, using the consolidated metadata
# it was written with. The store path is spelled out here rather than built from
# a `variable` name.
store_filename = 'gs://impactlab-data/climate/source_data/ERA-5/downscaling/precip_total.1995-2014.0p25.zarr'
store = fs.get_mapper(store_filename, check=False)
ds_testread = xr.open_zarr(store, consolidated=True)

In [20]:
# Display the dataset repr to inspect shape and chunking of the store just opened.
ds_testread

Unnamed: 0,Array,Chunk
Bytes,30.32 GB,1.52 GB
Shape,"(7300, 721, 1440)","(365, 721, 1440)"
Count,21 Tasks,20 Chunks
Type,float32,numpy.ndarray
"Array Chunk Bytes 30.32 GB 1.52 GB Shape (7300, 721, 1440) (365, 721, 1440) Count 21 Tasks 20 Chunks Type float32 numpy.ndarray",1440  721  7300,

Unnamed: 0,Array,Chunk
Bytes,30.32 GB,1.52 GB
Shape,"(7300, 721, 1440)","(365, 721, 1440)"
Count,21 Tasks,20 Chunks
Type,float32,numpy.ndarray


In [None]:
# Store to inspect in the next cell. Other store names noted alongside this one:
# gcm-future-rechunk.zarr, gcm-training-rechunk.zarr, reference-rechunk.zarr
filename = 'biascorrected-rechunked.zarr'

In [13]:
# Open the selected store from the dc6-dev-75bpr downscaling directory.
# This rebinds `store_filename`, `store` and `ds_testread` from the ERA-5 check above.
# Unlike that check, `consolidated=True` is not passed to `open_zarr` here.
store_filename = 'gs://impactlab-data/climate/downscaling/dc6-dev-75bpr/{}'.format(filename)
store = fs.get_mapper(store_filename, check=False)
ds_testread = xr.open_zarr(store)

In [15]:
# Display the `tasmax` variable to inspect its shape, dtype and chunk layout.
ds_testread['tasmax']

Unnamed: 0,Array,Chunk
Bytes,22.71 GB,51.84 MB
Shape,"(360, 43800, 180)","(360, 100, 180)"
Count,439 Tasks,438 Chunks
Type,float64,numpy.ndarray
"Array Chunk Bytes 22.71 GB 51.84 MB Shape (360, 43800, 180) (360, 100, 180) Count 439 Tasks 438 Chunks Type float64 numpy.ndarray",180  43800  360,

Unnamed: 0,Array,Chunk
Bytes,22.71 GB,51.84 MB
Shape,"(360, 43800, 180)","(360, 100, 180)"
Count,439 Tasks,438 Chunks
Type,float64,numpy.ndarray
