In [1]:
import intake
import xarray as xr
import os 
import pandas as pd
import numpy as np
import zarr 
import rhg_compute_tools.kubernetes as rhgk

In [2]:
import warnings
warnings.filterwarnings("ignore")

In [3]:
write_direc = '/gcs/rhg-data/climate/downscaled/workdir'

In [4]:
client, cluster = rhgk.get_standard_cluster()
cluster

VBox(children=(HTML(value='<h2>KubeCluster</h2>'), HBox(children=(HTML(value='\n<div>\n  <style scoped>\n    .…

get some CMIP6 data from GCS

here we're going to get daily `tmax` from `IPSL` for historical and SSP370 runs. The ensemble member `r1i1p1f1` isn't available in GCS so we're using `r4i1p1f1` instead. 

Note that the `activity_id` for historical runs is `CMIP`, not `ScenarioMIP` as it is for the ssp-rcp scenarios. 

In [5]:
activity_id = 'ScenarioMIP'
experiment_id = 'ssp370'
table_id = 'day'
variable_id = 'tasmax'
source_id = 'IPSL-CM6A-LR'
institution_id = 'NCAR'
member_id = 'r4i1p1f1'

first we'll take a look at what our options are

In [6]:
df_cmip6 = pd.read_csv('https://cmip6.storage.googleapis.com/cmip6-zarr-consolidated-stores-noQC.csv', dtype={'version': 'unicode'})
len(df_cmip6)

354418

In [7]:
df_subset_future = df_cmip6.loc[(df_cmip6['activity_id'] == activity_id) & (df_cmip6['experiment_id'] == experiment_id) 
             & (df_cmip6['table_id'] == table_id) & (df_cmip6['variable_id'] == variable_id)
             & (df_cmip6['source_id'] == source_id) & (df_cmip6['member_id'] == member_id)]

In [8]:
df_subset_future 

Unnamed: 0,activity_id,institution_id,source_id,experiment_id,member_id,table_id,variable_id,grid_label,zstore,dcpp_init_year,version,status,severity,issue_url
322554,ScenarioMIP,IPSL,IPSL-CM6A-LR,ssp370,r4i1p1f1,day,tasmax,gr,gs://cmip6/ScenarioMIP/IPSL/IPSL-CM6A-LR/ssp37...,,20190614,good,none,none


In [9]:
df_subset_hist = df_cmip6.loc[(df_cmip6['experiment_id'] == 'historical') 
             & (df_cmip6['table_id'] == table_id) & (df_cmip6['variable_id'] == variable_id) 
             & (df_cmip6['source_id'] == source_id) & (df_cmip6['member_id'] == member_id)]

In [10]:
df_subset_hist

Unnamed: 0,activity_id,institution_id,source_id,experiment_id,member_id,table_id,variable_id,grid_label,zstore,dcpp_init_year,version,status,severity,issue_url
54568,CMIP,IPSL,IPSL-CM6A-LR,historical,r4i1p1f1,day,tasmax,gr,gs://cmip6/CMIP/IPSL/IPSL-CM6A-LR/historical/r...,,20190614,good,none,none


now let's actually pull the data 

In [11]:
# search the cmip6 catalog
col = intake.open_esm_datastore("https://storage.googleapis.com/cmip6/pangeo-cmip6.json")

cat = col.search(activity_id=['CMIP', activity_id], 
                 experiment_id=['historical', experiment_id], table_id=table_id, variable_id=variable_id,
                 source_id=source_id, member_id=member_id)

In [12]:
ds_model = {}
ds_model['historical'] = cat['CMIP.IPSL.IPSL-CM6A-LR.historical.day.gr'].to_dask().isel(member_id=0
                                                                                       ).squeeze(drop=True).drop(['member_id', 
                                                                                                                  'height',
                                                                                                                  'time_bounds'])

In [13]:
ds_model['ssp370'] = cat['ScenarioMIP.IPSL.IPSL-CM6A-LR.ssp370.day.gr'].to_dask().isel(member_id=0
                                                                                       ).squeeze(drop=True).drop(['member_id',
                                                                                                                  'height',
                                                                                                                  'time_bounds'])

In [14]:
ds_model['historical']

Unnamed: 0,Array,Chunk
Bytes,4.96 GB,83.19 MB
Shape,"(60265, 143, 144)","(1010, 143, 144)"
Count,181 Tasks,60 Chunks
Type,float32,numpy.ndarray
"Array Chunk Bytes 4.96 GB 83.19 MB Shape (60265, 143, 144) (1010, 143, 144) Count 181 Tasks 60 Chunks Type float32 numpy.ndarray",144  143  60265,

Unnamed: 0,Array,Chunk
Bytes,4.96 GB,83.19 MB
Shape,"(60265, 143, 144)","(1010, 143, 144)"
Count,181 Tasks,60 Chunks
Type,float32,numpy.ndarray


rechunk in space for global bias correction 

In [None]:
chunks = {'lat': 10, 'lon': 10, 'time': -1}

ds_model['historical'] = ds_model['historical'].chunk(chunks)
ds_model['historical'] = ds_model['historical'].persist()

ds_model['historical'] = ds_model['historical'].load()

ds_model['ssp370'] = ds_model['ssp370'].chunk(chunks)
ds_model['ssp370'] = ds_model['ssp370'].persist()

In [25]:
ds_model['historical'].to_zarr(os.path.join(write_direc, 'cmip6_test_model_historical'), 
                               consolidated=True, compute=False, mode='w')

Delayed('_finalize_store-3ec66b8c-1253-4cea-9917-433edd64009e')

In [28]:
ds_test = xr.open_zarr(os.path.join(write_direc, 'cmip6_test_model_historical.zarr'))

In [29]:
ds_test

In [27]:
ds_test.info 

0,1
Name,/
Type,zarr.hierarchy.Group
Read-only,False
Store type,zarr.storage.DirectoryStore
No. members,0
No. arrays,0
No. groups,0


In [None]:
ds_model['historical'].to_zarr(os.path.join(write_direc, 'cmip6_test_model_historical'), mode='w')

In [None]:
ds_model['ssp370'].to_netcdf(os.path.join(write_direc, 'cmip6_test_model_ssp370.nc'))

read in the zarr stores and see how hard it is to rechunk them in time instead of space for computing weights

In [None]:
ds_hist = zarr.open(os.path.join(write_direc, 'cmip6_test_model_historical.zarr'), mode='r')

In [None]:
ds_hist

In [None]:
ds_hist.info