# Get Domain

**Author:** Andrew Loeppky (Lots of code stolen from Jamie Byer)

**Project:** Land-surface-atmosphere coupling - CMIP6 intercomparison 

This code grabs a climate model from the cloud, screens it for required variable fields, then selects a user specified domain and saves it to disk as a netcdf4 file.

## Part I: Get a CMIP 6 Dataset and Select Domain

In [1]:
import xarray as xr
import pooch
import pandas as pd
import fsspec
from pathlib import Path
import time
import numpy as np
import json
import cftime
import matplotlib.pyplot as plt
import netCDF4 as nc
from cftime import date2num


# Handy metpy tutorial working with xarray:
# https://unidata.github.io/MetPy/latest/tutorials/xarray_tutorial.html#sphx-glr-tutorials-xarray-tutorial-py
import metpy.calc as mpcalc
from metpy.cbook import get_test_data
from metpy.units import units
from metpy.plots import SkewT

In [17]:
# Attributes of the model we want to analyze (put in csv later)
#source_id = 'CESM2-SE' 
#source_id = 'GFDL-ESM4' # working fig 11
#source_id = "CanESM5" 
#source_id = 'HadGEM3-GC31-MM'
#source_id = 'E3SM-1-0'
#source_id = 'INM-CM5-0'
#source_id = 'NorESM2-LM'
#source_id = 'GFDL-ESM4'
#source_id = 'MPI-ESM1-2-HR'
#source_id = 'CanESM5'

experiment_id = 'piControl'
experiment_id = 'historical'
table_id = '3hr'

# Domain we wish to study

# test domain #
##################################################################
lats = (15, 20) # lat min, lat max
lons = (25, 29) # lon min, lon max
years = (100, 105) # start year, end year (note, no leap days)
##################################################################

# Thompson, MB
#lats = (54, 56) # lat min, lat max
#lons = (261, 263) # lon min, lon max
#years = (100, 300) # start year, end year (note, no leap days)

save_data = False # save as netcdf for further processing?

In [3]:
%run CMIP6_lib.ipynb

In [4]:
required_fields = ['tas', 'mrsos', 'huss']#, 'ps'] 

In [5]:
# get esm datastore
odie = pooch.create(
    path="./.cache",
    base_url="https://storage.googleapis.com/cmip6/",
    registry={
        "pangeo-cmip6.csv": None
    },
)
file_path = odie.fetch("pangeo-cmip6.csv")
df_in = pd.read_csv(file_path)

In [6]:
# this is how to get fields!
#df_in[df_in.source_id == source_id][df_in.table_id == table_id][df_in.experiment_id == experiment_id]

In [7]:
# check that our run has all required fields, list problem variables
#fields_of_interest = []
#missing_fields = []
#for rq in required_fields:
#    if rq not in available_fields:
#        missing_fields.append(rq)
#    else:
#        fields_of_interest.append(rq)


#print(f"Model: {source_id}\n"+"="*30)
#print("Contains required fields:")
#[print("   ", field) for field in required_fields if field in fields_of_interest]

#if fields_of_interest == required_fields:
#    model_passes = True
#    print("\nAll required fields present\n")
#else: 
#    model_passes = False
#    print("Missing required fields:")
#    [print("   ", field) for field in required_fields if field not in fields_of_interest]
        

In [12]:
print(f"""Fetching domain:
          {source_id = }
          {experiment_id = }
          {table_id = }
          {lats = }
          {lons = }
          {years = }
          dataset name: my_ds (xarray Dataset)""")

Fetching domain:
          source_id = 'CESM2-SE'
          experiment_id = 'piControl'
          table_id = '3hr'
          lats = (15, 20)
          lons = (25, 29)
          years = (100, 105)
          dataset name: my_ds (xarray Dataset)


In [15]:
df_in[df_in.variable_id == "huss"][df_in.table_id == "3hr"]

  df_in[df_in.variable_id == "huss"][df_in.table_id == "3hr"]


Unnamed: 0,activity_id,institution_id,source_id,experiment_id,member_id,table_id,variable_id,grid_label,zstore,dcpp_init_year,version
872,CMIP,NOAA-GFDL,GFDL-ESM4,1pctCO2,r1i1p1f1,3hr,huss,gr1,gs://cmip6/CMIP6/CMIP/NOAA-GFDL/GFDL-ESM4/1pct...,,20180701
901,CMIP,NOAA-GFDL,GFDL-ESM4,esm-piControl,r1i1p1f1,3hr,huss,gr1,gs://cmip6/CMIP6/CMIP/NOAA-GFDL/GFDL-ESM4/esm-...,,20180701
1748,CMIP,NOAA-GFDL,GFDL-ESM4,piControl,r1i1p1f1,3hr,huss,gr1,gs://cmip6/CMIP6/CMIP/NOAA-GFDL/GFDL-ESM4/piCo...,,20180701
2021,CMIP,NOAA-GFDL,GFDL-ESM4,abrupt-4xCO2,r1i1p1f1,3hr,huss,gr1,gs://cmip6/CMIP6/CMIP/NOAA-GFDL/GFDL-ESM4/abru...,,20180701
4016,CMIP,NOAA-GFDL,GFDL-CM4,abrupt-4xCO2,r1i1p1f1,3hr,huss,gr1,gs://cmip6/CMIP6/CMIP/NOAA-GFDL/GFDL-CM4/abrup...,,20180701
...,...,...,...,...,...,...,...,...,...,...,...
489704,ScenarioMIP,MOHC,HadGEM3-GC31-MM,ssp585,r1i1p1f3,3hr,huss,gn,gs://cmip6/CMIP6/ScenarioMIP/MOHC/HadGEM3-GC31...,,20201113
492697,CMIP,EC-Earth-Consortium,EC-Earth3-AerChem,historical,r4i1p1f1,3hr,huss,gr,gs://cmip6/CMIP6/CMIP/EC-Earth-Consortium/EC-E...,,20201214
501291,ScenarioMIP,EC-Earth-Consortium,EC-Earth3-Veg,ssp585,r10i1p1f1,3hr,huss,gr,gs://cmip6/CMIP6/ScenarioMIP/EC-Earth-Consorti...,,20210102
503298,CMIP,CMCC,CMCC-ESM2,historical,r1i1p1f1,3hr,huss,gn,gs://cmip6/CMIP6/CMIP/CMCC/CMCC-ESM2/historica...,,20210114


In [11]:
#fetch_var(source_id, experiment_id, table_id, 'tas')
#df_in[df_in.experiment_id == 'piControl'].table_id.unique()
get_field("huss", df_in)

IndexError: index 0 is out of bounds for axis 0 with size 0

In [10]:
# grab all fields of interest and combine
my_fields = [get_field(field, df_in) for field in required_fields]
small_fields = [trim_field(field, lats, lons, years) for field in my_fields]
my_ds = xr.combine_by_coords(small_fields, compat="broadcast_equals", combine_attrs="drop_conflicts")
print("successfully acquired domain")
success = True

IndexError: index 0 is out of bounds for axis 0 with size 0

In [None]:
my_ds

In [None]:
# save as netcdf as per these recommendations:
# https://xarray.pydata.org/en/stable/user-guide/dask.html#chunking-and-performance
# netcdf cant handle cftime, so convert to ordinal, then back once the file is reopened
my_ds["time"] = date2num(my_ds.time, "minutes since 0000-01-01 00:00:00", calendar="noleap", has_year_zero=True)

# get rid of time bounds variable, if it exists
try:
    my_ds = my_ds.drop("time_bnds")
except:
    pass

In [16]:
if save_data:
    print(f"saving {source_id} to disk as netcdf")
    my_ds.to_netcdf(f"./data/{source_id}-{experiment_id}.nc", engine="netcdf4")
    print("success\n\n")
else:
    print(f"successfully parsed {source_id}\n\n")

successfully parsed CESM2-SE




re-write this from scratch. 

1) select df_in where table_id == 3hr, and all required fields exist
2) slice and save each dataset as netcdf