In [None]:
# !pip install intake==0.6.2
# ! pip install intake-esm==2021.1.15

In [1]:
%matplotlib inline
import intake
import xarray as xr
import os 
import pandas as pd
import numpy as np
import zarr 
import gcsfs
from xarray.ufuncs import maximum, minimum
import rhg_compute_tools.kubernetes as rhgk

import matplotlib.pyplot as plt
import re
import yaml
import ast
import warnings 

### Note that `intake` and `intake-esm` versions must be `0.6.2` and `2021.1.15` respectively for `bnds`, `lat_bnds`, and `lon_bnds` to be preserved in the CMIP6 data. 

In [15]:
print(intake.__version__)

0.6.2


In [66]:
if intake.__version__ != '0.6.2':
    raise AssertionError("this workflow requires version 0.6.2 of intake")
warnings.warn("Double check that you have installed intake-esm (the 2021.1.15 version specified above), otherwise the bnds, lat_bnds, and lon_bdns will not be loaded \
              in any Pangeo CMIP6 datasets in the cat.to_dataset_dict step")



In [3]:
from science_validation_manual import read_gcs_zarr

In [4]:
fs = gcsfs.GCSFileSystem(token='/opt/gcsfuse_tokens/impactlab-data.json')

In [6]:
# col = intake.open_esm_datastore("https://storage.googleapis.com/cmip6/pangeo-cmip6.json")
col = intake.open_esm_datastore("https://storage.googleapis.com/cmip6/pangeo-cmip6-noQC.json")

In [7]:
def _paramfile_to_tuple(model, variable):
    """
    takes in a model and variable, returns tuple from parameter file. 
    """
    param_file = '/home/jovyan/downscaling/downscale/workflows/parameters/{}-{}.yaml'.format(model, variable)
    with open(param_file, 'r') as f:
        var_dict = yaml.full_load(f)
    # some string parsing 
    line = var_dict['jobs']
    line1 = re.sub(r"\n", "", line)
    line2 = re.sub(r"[\[\]]", "", line1)
    return ast.literal_eval(line2.strip())

def _get_cmip6_dataset(model, variable, tuple_id, period='ssp'):
    d_ssp = _paramfile_to_tuple(model, variable)[tuple_id][period]
    cat = col.search(
              activity_id=d_ssp['activity_id'],
              experiment_id=d_ssp['experiment_id'],
              table_id=d_ssp['table_id'],
              variable_id=d_ssp['variable_id'],
              source_id=d_ssp['source_id'],
              member_id=d_ssp['member_id'],
              grid_label=d_ssp['grid_label'],
              version=int(d_ssp['version']),
          )
    return cat.to_dataset_dict(progressbar=False)

def compute_dtr(model, tuple_id=1):
    """
    takes in tasmax and tasmin Datasets, computes DTR (returns it lazily)
    """
    tasmax = _get_cmip6_dataset(model, 'tasmax', tuple_id)
    k_tasmax = list(tasmax.keys())
    if len(k_tasmax) != 1:
        raise ValueError("there is likely an issue with {} tasmax".format(model))
    tasmin = _get_cmip6_dataset(model, 'tasmin', tuple_id)
    k_tasmin = list(tasmin.keys())
    if len(k_tasmin) != 1:
        raise ValueError("there is likely an issue with {} tasmin".format(model))
    return tasmax[k_tasmax[0]]['tasmax'] - tasmin[k_tasmin[0]]['tasmin'] 

def check_dtr(dtr, model):
    """
    """
    min_dtr = dtr.min('time')
    neg_count = min_dtr.where(min_dtr < 0).count().values
    zero_count = min_dtr.where(min_dtr == 0).count().values
    if neg_count > 0:
        warnings.warn("DTR has {} negative values for {}, {} needs tasmin/tasmax swapping".format(neg_count, model, model))
    if zero_count > 0:
        warnings.warn("DTR has {} zero values for {}".format(zero_count, model))

checking models 

DTR negative: 
- GFDL-ESM4
- GFDL-CM4

DTR positive: 
- CanESM5
- INM-CM4-8
- INM-CM5-0
- NorESM2-MM
- NorESM2-LM
- MIROC6
- EC-Earth3-Veg-LR
- EC-Earth3-Veg
- EC-Earth3
- KIOST-ESM
- MIROC-ES2L
- MPI-ESM1-2-LR
- MPI-ESM1-2-HR
- NESM3
- MRI-ESM2-0
- FGOALS-g3
- CMCC-ESM2
- BCC-CSM2-MR
- AWI-CM-1-1-MR
- ACCESS-CM2

Parameter files to add or fix (could not check DTR): 
- UKESM1-0-LL
- ACCESS-ESM1-5

Tasmin parameter files to add (could not check DTR): 
- CAMS-CSM1-0

In [52]:
model = 'GFDL-CM4'
'''dtr = compute_dtr(model, tuple_id=0)
check_dtr(dtr, model)'''
tasmax = _get_cmip6_dataset(model, 'tasmax', 1, period='ssp')
k_tasmax = list(tasmax.keys())

{'cell_measures': 'area: areacella',
 'cell_methods': 'area: mean time: maximum',
 'interp_method': 'conserve_order2',
 'long_name': 'Daily Maximum Near-Surface Air Temperature',
 'original_name': 'tasmax',
 'standard_name': 'air_temperature',
 'units': 'K'}

### For models with negative DTR, swap tasmax and tasmin ### 

GFDL-CM4: historical, ssp245, ssp585

GFDL-ESM4: historical, ssp126, ssp245, ssp370, ssp585

In [53]:
def _compute_max_or_min_temperature(model, tuple_id=1, variable='tasmax', ssp_or_historical='ssp'):
    """
    takes in a model source_id, pulls in the required parameter file info, 
    gets the tasmax and tasmin Datasets from the CMIP6 archive, computes tasmax or tasmin (returns it lazily)
    """
    tasmax = _get_cmip6_dataset(model, 'tasmax', tuple_id, period=ssp_or_historical)
    k_tasmax = list(tasmax.keys())
    if len(k_tasmax) != 1:
        raise ValueError("there is likely an issue with {} tasmax".format(model))
    tasmin = _get_cmip6_dataset(model, 'tasmin', tuple_id, period=ssp_or_historical)
    k_tasmin = list(tasmin.keys())
    if len(k_tasmin) != 1:
        raise ValueError("there is likely an issue with {} tasmin".format(model))
    ds_tmax = tasmax[k_tasmax[0]].copy()
    ds_tmin = tasmin[k_tasmin[0]].copy()
        
    # compute max or min 
    if variable == 'tasmax':
        tmax = maximum(ds_tmax['tasmax'], ds_tmin['tasmin'])
        tmax.attrs = tasmax[k_tasmax[0]]['tasmax'].attrs
        ds_tmax['tasmax'] = tmax
        return ds_tmax
    
    elif variable == 'tasmin':
        tmin = minimum(ds_tmax['tasmax'], ds_tmin['tasmin'])
        tmin.attrs = tasmin[k_tasmin[0]]['tasmin'].attrs
        ds_tmin['tasmin'].values = tmin 
        
        return ds_tmin

def swap_cmip6_tasmax_or_tasmin(model, tuple_id, variable, ssp='ssp245', target_run='ssp'):
    """
    for select GCMs with negative DTR, this swaps tasmax and tasmin so that tasmax > tasmin 
    """
    temp_var = _compute_max_or_min_temperature(model, tuple_id=tuple_id, 
                                                          variable=variable, ssp_or_historical=target_run)
    ds_temp = temp_var.persist()
    
    if target_run == 'historical':
        activity_id = 'CMIP'
    else:
        activity_id = 'ScenarioMIP'
    if model == 'GFDL-CM4':
        version = '20180701'
    elif model == 'GFDL-ESM4':
        if target_run == 'ssp':
            version = '20180701'
        else:
            version = '20190726'

    store_filename = ('gs://raw-305d04da/cmip6/{}/NOAA-GFDL/{}/{}/r1i1p1f1/day/{}/gr1/v{}.zarr'.format(activity_id, model, ssp, variable, version))
    store = fs.get_mapper(store_filename, check=False)
    
    ds_temp.chunk({'member_id': 1, 'time': 830, 'lat': len(ds_temp.lat), 'lon': len(ds_temp.lon)}).to_zarr(store, consolidated=True, mode="w")
    
    print("zarr store for {} {} saved to {}".format(model, ssp, store_filename))

In [54]:
swap_cmip6_tasmax_or_tasmin('GFDL-CM4', 1, 'tasmax', 'ssp245', target_run='ssp')

  tmax = maximum(ds_tmax['tasmax'], ds_tmin['tasmin'])
  f(self.variable, other_variable)
  f(self_data, other_data) if not reflexive else f(other_data, self_data)


zarr store for GFDL-CM4 ssp245 saved to gs://raw-305d04da/cmip6/ScenarioMIP/NOAA-GFDL/GFDL-CM4/ssp245/r1i1p1f1/day/tasmax/gr1/v20180701.zarr


In [58]:
# variables: tasmin and tasmax
# models: GFDL-ESM4 (all ssps included in downscaling) and GFDL-CM4 (ssps 245 and 585)
model = 'GFDL-ESM4'

gfdlcm4_scens = ['historical', 'ssp245', 'ssp585']
gfdlesm4_scens = ['historical', 'ssp370', 'ssp245', 'ssp126', 'ssp585']
for variable in ['tasmin', 'tasmax']:
    for i, tuple_id in enumerate([0, 1, 2, 3, 4]):
        if tuple_id != 0:
            target_run = 'ssp'
        else:
            target_run = 'historical'
        swap_cmip6_tasmax_or_tasmin(model, tuple_id, variable, gfdlesm4_scens[i], target_run=target_run)

  tmin = minimum(ds_tmax['tasmax'], ds_tmin['tasmin'])
  f(self.variable, other_variable)
  f(self_data, other_data) if not reflexive else f(other_data, self_data)


zarr store for GFDL-ESM4 historical saved to gs://raw-305d04da/cmip6/CMIP/NOAA-GFDL/GFDL-ESM4/historical/r1i1p1f1/day/tasmin/gr1/v20190726.zarr


  tmin = minimum(ds_tmax['tasmax'], ds_tmin['tasmin'])
  f(self.variable, other_variable)
  f(self_data, other_data) if not reflexive else f(other_data, self_data)


zarr store for GFDL-ESM4 ssp370 saved to gs://raw-305d04da/cmip6/ScenarioMIP/NOAA-GFDL/GFDL-ESM4/ssp370/r1i1p1f1/day/tasmin/gr1/v20180701.zarr


  tmin = minimum(ds_tmax['tasmax'], ds_tmin['tasmin'])
  f(self.variable, other_variable)
  f(self_data, other_data) if not reflexive else f(other_data, self_data)


zarr store for GFDL-ESM4 ssp245 saved to gs://raw-305d04da/cmip6/ScenarioMIP/NOAA-GFDL/GFDL-ESM4/ssp245/r1i1p1f1/day/tasmin/gr1/v20180701.zarr


  tmin = minimum(ds_tmax['tasmax'], ds_tmin['tasmin'])
  f(self.variable, other_variable)
  f(self_data, other_data) if not reflexive else f(other_data, self_data)


zarr store for GFDL-ESM4 ssp126 saved to gs://raw-305d04da/cmip6/ScenarioMIP/NOAA-GFDL/GFDL-ESM4/ssp126/r1i1p1f1/day/tasmin/gr1/v20180701.zarr


  tmin = minimum(ds_tmax['tasmax'], ds_tmin['tasmin'])
  f(self.variable, other_variable)
  f(self_data, other_data) if not reflexive else f(other_data, self_data)


zarr store for GFDL-ESM4 ssp585 saved to gs://raw-305d04da/cmip6/ScenarioMIP/NOAA-GFDL/GFDL-ESM4/ssp585/r1i1p1f1/day/tasmin/gr1/v20180701.zarr


  tmax = maximum(ds_tmax['tasmax'], ds_tmin['tasmin'])
  f(self.variable, other_variable)
  f(self_data, other_data) if not reflexive else f(other_data, self_data)


zarr store for GFDL-ESM4 historical saved to gs://raw-305d04da/cmip6/CMIP/NOAA-GFDL/GFDL-ESM4/historical/r1i1p1f1/day/tasmax/gr1/v20190726.zarr


  tmax = maximum(ds_tmax['tasmax'], ds_tmin['tasmin'])
  f(self.variable, other_variable)
  f(self_data, other_data) if not reflexive else f(other_data, self_data)


zarr store for GFDL-ESM4 ssp370 saved to gs://raw-305d04da/cmip6/ScenarioMIP/NOAA-GFDL/GFDL-ESM4/ssp370/r1i1p1f1/day/tasmax/gr1/v20180701.zarr


  tmax = maximum(ds_tmax['tasmax'], ds_tmin['tasmin'])
  f(self.variable, other_variable)
  f(self_data, other_data) if not reflexive else f(other_data, self_data)


zarr store for GFDL-ESM4 ssp245 saved to gs://raw-305d04da/cmip6/ScenarioMIP/NOAA-GFDL/GFDL-ESM4/ssp245/r1i1p1f1/day/tasmax/gr1/v20180701.zarr


  tmax = maximum(ds_tmax['tasmax'], ds_tmin['tasmin'])
  f(self.variable, other_variable)
  f(self_data, other_data) if not reflexive else f(other_data, self_data)


zarr store for GFDL-ESM4 ssp126 saved to gs://raw-305d04da/cmip6/ScenarioMIP/NOAA-GFDL/GFDL-ESM4/ssp126/r1i1p1f1/day/tasmax/gr1/v20180701.zarr


  tmax = maximum(ds_tmax['tasmax'], ds_tmin['tasmin'])
  f(self.variable, other_variable)
  f(self_data, other_data) if not reflexive else f(other_data, self_data)


zarr store for GFDL-ESM4 ssp585 saved to gs://raw-305d04da/cmip6/ScenarioMIP/NOAA-GFDL/GFDL-ESM4/ssp585/r1i1p1f1/day/tasmax/gr1/v20180701.zarr


In [60]:
# ds = read_gcs_zarr('gs://raw-305d04da/cmip6/ScenarioMIP/NOAA-GFDL/GFDL-CM4/ssp245/r1i1p1f1/day/tasmin/gr1/v20180701.zarr')
ds = read_gcs_zarr('gs://raw-305d04da/cmip6/ScenarioMIP/NOAA-GFDL/GFDL-ESM4/ssp585/r1i1p1f1/day/tasmax/gr1/v20180701.zarr')
ds['tasmax'].attrs

{'cell_measures': 'area: areacella',
 'cell_methods': 'area: mean time: maximum',
 'interp_method': 'conserve_order2',
 'long_name': 'Daily Maximum Near-Surface Air Temperature',
 'original_name': 'tasmax',
 'standard_name': 'air_temperature',
 'units': 'K'}

check original raw DTR for `GFDL-CM4` ssp245 and then "updated" DTR for the same model/ssp 

In [61]:
ds_tmax = read_gcs_zarr('gs://raw-305d04da/cmip6/ScenarioMIP/NOAA-GFDL/GFDL-CM4/ssp585/r1i1p1f1/day/tasmax/gr1/v20180701.zarr')
ds_tmin = read_gcs_zarr('gs://raw-305d04da/cmip6/ScenarioMIP/NOAA-GFDL/GFDL-CM4/ssp585/r1i1p1f1/day/tasmin/gr1/v20180701.zarr')
dtr_gfdlcm4_ssp585 = ds_tmax['tasmax'] - ds_tmin['tasmin']

In [36]:
print("original raw CMIP6 data")
model = 'GFDL-CM4'
# _get_cmip6_dataset(model, variable, tuple_id, period='ssp')
dtr = compute_dtr(model, tuple_id=2)
check_dtr(dtr, model)

original raw CMIP6 data




In [62]:
print("pre-processed CMIP6 data")
model = 'GFDL-CM4'

check_dtr(dtr_gfdlcm4_ssp585, model)

pre-processed CMIP6 data


