In [1]:
import intake
import xarray as xr
import os 
import pandas as pd
import numpy as np
import zarr 
import rhg_compute_tools.kubernetes as rhgk

In [2]:
import warnings
warnings.filterwarnings("ignore")

In [3]:
# Directory under which the intermediate model outputs of this notebook are written.
write_direc = "/gcs/rhg-data/climate/downscaled/workdir"

In [4]:
# Cluster start-up is currently disabled: the call is wrapped in a string
# literal, so nothing is launched and the cell simply evaluates to that string.
'''client, cluster = rhgk.get_standard_cluster()
cluster'''

'client, cluster = rhgk.get_standard_cluster()\ncluster'

get some CMIP6 data from GCS

Here we're going to get daily maximum temperature (`tasmax`) from the IPSL model `IPSL-CM6A-LR` for the historical and SSP3-7.0 (`ssp370`) runs. The ensemble member `r1i1p1f1` isn't available in GCS, so we're using `r4i1p1f1` instead.

Note that the `activity_id` for historical runs is `CMIP`, not `ScenarioMIP` as it is for the ssp-rcp scenarios. 

In [None]:
# CMIP6 facets describing the future-scenario dataset to pull. The historical
# counterpart is requested alongside it in the catalog search further down.
activity_id = 'ScenarioMIP'
experiment_id = 'ssp370'
table_id = 'day'
variable_id = 'tasmax'
source_id = 'IPSL-CM6A-LR'
# IPSL-CM6A-LR is catalogued under the IPSL institution (the dataset keys used
# later in this notebook read 'CMIP.IPSL.IPSL-CM6A-LR...'), not NCAR.
institution_id = 'IPSL'
member_id = 'r4i1p1f1'

first we'll take a look at what our options are

In [7]:
# Read the CSV index of CMIP6 Zarr stores hosted on Google Cloud Storage.
# Earlier alternative source, kept for reference:
#   pd.read_csv('https://cmip6.storage.googleapis.com/cmip6-zarr-consolidated-stores-noQC.csv', dtype={'version': 'unicode'})
cmip6_catalog_url = 'https://storage.googleapis.com/cmip6/cmip6-zarr-consolidated-stores.csv'
df_cmip6 = pd.read_csv(cmip6_catalog_url)

# Number of rows in the catalog.
len(df_cmip6)

519160

In [201]:
# Catalog rows for daily `tasmin` from HadGEM3-GC31-MM under ScenarioMIP.
# (An earlier variant of this filter targeted institution NUIST / source NESM3.)
is_future_match = (
    (df_cmip6['activity_id'] == 'ScenarioMIP')
    & (df_cmip6['table_id'] == 'day')
    & (df_cmip6['variable_id'] == 'tasmin')
    & (df_cmip6['source_id'] == 'HadGEM3-GC31-MM')
)
df_subset_future = df_cmip6.loc[is_future_match]

In [202]:
# Display the matching future-scenario catalog rows.
df_subset_future 

Unnamed: 0,activity_id,institution_id,source_id,experiment_id,member_id,table_id,variable_id,grid_label,zstore,dcpp_init_year,version
439121,ScenarioMIP,MOHC,HadGEM3-GC31-MM,ssp585,r3i1p1f3,day,tasmin,gn,gs://cmip6/CMIP6/ScenarioMIP/MOHC/HadGEM3-GC31...,,20200507
439462,ScenarioMIP,MOHC,HadGEM3-GC31-MM,ssp585,r1i1p1f3,day,tasmin,gn,gs://cmip6/CMIP6/ScenarioMIP/MOHC/HadGEM3-GC31...,,20200515
439538,ScenarioMIP,MOHC,HadGEM3-GC31-MM,ssp585,r2i1p1f3,day,tasmin,gn,gs://cmip6/CMIP6/ScenarioMIP/MOHC/HadGEM3-GC31...,,20200515
439930,ScenarioMIP,MOHC,HadGEM3-GC31-MM,ssp126,r1i1p1f3,day,tasmin,gn,gs://cmip6/CMIP6/ScenarioMIP/MOHC/HadGEM3-GC31...,,20200515
496033,ScenarioMIP,MOHC,HadGEM3-GC31-MM,ssp585,r4i1p1f3,day,tasmin,gn,gs://cmip6/CMIP6/ScenarioMIP/MOHC/HadGEM3-GC31...,,20201026


In [195]:
# Catalog rows for daily `tasmax` from HadGEM3-GC31-MM under the CMIP
# (historical) activity.
# (An earlier variant of this filter targeted `pr` from NUIST / NESM3.)
# NOTE(review): the future-scenario filter in this notebook selects `tasmin`
# while this one selects `tasmax` -- confirm the difference is intentional.
is_hist_match = (
    (df_cmip6['activity_id'] == 'CMIP')
    & (df_cmip6['table_id'] == 'day')
    & (df_cmip6['variable_id'] == 'tasmax')
    & (df_cmip6['source_id'] == 'HadGEM3-GC31-MM')
)
df_subset_hist = df_cmip6.loc[is_hist_match]

In [196]:
# Display the matching historical catalog rows.
df_subset_hist

Unnamed: 0,activity_id,institution_id,source_id,experiment_id,member_id,table_id,variable_id,grid_label,zstore,dcpp_init_year,version


now let's actually pull the data 

In [None]:
# search the cmip6 catalog
col = intake.open_esm_datastore("https://storage.googleapis.com/cmip6/pangeo-cmip6.json")

cat = col.search(activity_id=['CMIP', activity_id], 
                 experiment_id=['historical', experiment_id], table_id=table_id, variable_id=variable_id,
                 source_id=source_id, member_id=member_id)

In [None]:
# Datasets keyed by experiment name.
ds_model = {}

# Open the historical entry via `to_dask()`, keep the first `member_id`,
# squeeze out length-1 dimensions, and remove auxiliary variables that are
# not needed downstream.
# `drop_vars` is the explicit API for removing variables/coordinates;
# `Dataset.drop` with a list of names is the deprecated spelling of the same
# operation.
ds_model['historical'] = (
    cat['CMIP.IPSL.IPSL-CM6A-LR.historical.day.gr']
    .to_dask()
    .isel(member_id=0)
    .squeeze(drop=True)
    .drop_vars(['member_id', 'height', 'time_bounds'])
)

In [None]:
# Open the ssp370 entry via `to_dask()`, keep the first `member_id`, squeeze
# out length-1 dimensions, and remove auxiliary variables that are not needed
# downstream.
# `drop_vars` is the explicit API for removing variables/coordinates;
# `Dataset.drop` with a list of names is the deprecated spelling of the same
# operation.
ds_model['ssp370'] = (
    cat['ScenarioMIP.IPSL.IPSL-CM6A-LR.ssp370.day.gr']
    .to_dask()
    .isel(member_id=0)
    .squeeze(drop=True)
    .drop_vars(['member_id', 'height', 'time_bounds'])
)

In [None]:
# Display the historical dataset.
ds_model['historical']

rechunk in space for global bias correction 

In [None]:
# Chunk layout: 10 x 10 tiles in lat/lon, with `time` given as -1.
chunks = {'lat': 10, 'lon': 10, 'time': -1}

# Historical run: rechunk, persist, then load.
# NOTE(review): only the historical dataset is followed by `.load()`; the
# ssp370 dataset below is just persisted. Loading presumably pulls the whole
# array into local memory and discards the dask chunking applied here --
# confirm this is intended.
ds_model['historical'] = ds_model['historical'].chunk(chunks).persist().load()

# Scenario run: rechunk and persist.
ds_model['ssp370'] = ds_model['ssp370'].chunk(chunks).persist()

In [None]:
# Set up the historical Zarr store (overwriting any existing one) with
# consolidated metadata. `compute=False` is passed, so the call's return value
# is what the cell displays.
# The store name carries the `.zarr` suffix so that it matches the path the
# later cells open ('cmip6_test_model_historical.zarr'); previously the store
# was written without the suffix and the reads pointed at a different path.
ds_model['historical'].to_zarr(
    os.path.join(write_direc, 'cmip6_test_model_historical.zarr'),
    consolidated=True,
    compute=False,
    mode='w',
)

In [None]:
# Re-open the historical store from the working directory with xarray.
ds_test = xr.open_zarr(
    os.path.join(write_direc, 'cmip6_test_model_historical.zarr')
)

In [None]:
# Display the dataset read back from the Zarr store.
ds_test

In [None]:
# `info` is a method on xarray datasets: call it so the summary is printed
# instead of the cell displaying the bound-method object.
ds_test.info()

In [None]:
# Write the historical dataset to its Zarr store, overwriting any existing one.
# The store name carries the `.zarr` suffix so that it matches the path the
# read cells open ('cmip6_test_model_historical.zarr'); previously the store
# was written without the suffix and the reads pointed at a different path.
ds_model['historical'].to_zarr(
    os.path.join(write_direc, 'cmip6_test_model_historical.zarr'),
    mode='w',
)

In [None]:
# Write the ssp370 dataset to a NetCDF file in the working directory.
# NOTE(review): the historical run is stored as Zarr while this one is stored
# as NetCDF -- confirm the format difference is intended.
ds_model['ssp370'].to_netcdf(
    os.path.join(write_direc, 'cmip6_test_model_ssp370.nc')
)

read in the zarr stores and see how hard it is to rechunk them in time instead of space for computing weights

In [None]:
# Open the historical store read-only with the zarr library directly
# (rather than through xarray).
ds_hist = zarr.open(
    os.path.join(write_direc, 'cmip6_test_model_historical.zarr'),
    mode='r',
)

In [None]:
# Display the object returned by `zarr.open`.
ds_hist

In [None]:
# Display the `info` attribute of the opened Zarr object.
ds_hist.info