# Prep data for downscaling paper Fig 10 and other diagnostics

- gs://clean-b1dbca25/
-  gs://downscaled-288ec5ac

- Emile's diagnostics output: gs://downscaled-288ec5ac/diagnostics/RELEASE-v{delivery_version}/{diagnostics_name}/{activity_id}/{institution_id}/{source_id}/{experiment_id}/{member_id}/{table_id}/{variable_id}/{delivery_version}.zarr
    - right now `delivery_version` is "RELEASE-v1.1"
    
    
#### Existing diagnostics:
```
gs://downscaled-288ec5ac/diagnostics/RELEASE-v1.1/file_paths.yaml
gs://downscaled-288ec5ac/diagnostics/RELEASE-v1.1/annual-precip-diagnostics/
gs://downscaled-288ec5ac/diagnostics/RELEASE-v1.1/annual-tasmax-diagnostics/
gs://downscaled-288ec5ac/diagnostics/RELEASE-v1.1/annual-tasmin-diagnostics/
gs://downscaled-288ec5ac/diagnostics/RELEASE-v1.1/biascorrected-annual-precip-diagnostics/
gs://downscaled-288ec5ac/diagnostics/RELEASE-v1.1/biascorrected-annual-tasmax-diagnostics/
gs://downscaled-288ec5ac/diagnostics/RELEASE-v1.1/biascorrected-annual-tasmin-diagnostics/
gs://downscaled-288ec5ac/diagnostics/RELEASE-v1.1/biascorrected-daily-precip-diagnostics/
gs://downscaled-288ec5ac/diagnostics/RELEASE-v1.1/biascorrected-daily-tasmax-diagnostics/
gs://downscaled-288ec5ac/diagnostics/RELEASE-v1.1/biascorrected-daily-tasmin-diagnostics/
gs://downscaled-288ec5ac/diagnostics/RELEASE-v1.1/clean-annual-precip-diagnostics/
gs://downscaled-288ec5ac/diagnostics/RELEASE-v1.1/clean-annual-tasmax-diagnostics/
gs://downscaled-288ec5ac/diagnostics/RELEASE-v1.1/clean-annual-tasmin-diagnostics/
gs://downscaled-288ec5ac/diagnostics/RELEASE-v1.1/clean-daily-precip-diagnostics/
gs://downscaled-288ec5ac/diagnostics/RELEASE-v1.1/clean-daily-tasmax-diagnostics/
gs://downscaled-288ec5ac/diagnostics/RELEASE-v1.1/clean-daily-tasmin-diagnostics/
gs://downscaled-288ec5ac/diagnostics/RELEASE-v1.1/daily-precip-diagnostics/
gs://downscaled-288ec5ac/diagnostics/RELEASE-v1.1/daily-tasmax-diagnostics/
gs://downscaled-288ec5ac/diagnostics/RELEASE-v1.1/daily-tasmin-diagnostics/
```


In [1]:
import xarray as xr
import pandas as pd
import numpy as np
import gcsfs

import matplotlib.pyplot as plt
import cartopy.crs as ccrs

%matplotlib inline

In [2]:
import sys

sys.path.append(".")

import dc6_functions as dc6

In [3]:
CLEANED_REF_0p25deg_FP = 'gs://support-c23ff1a3/qplad-fine-reference/pr/v20220201000555.zarr'

In [4]:
CLEANED_REF_0p25deg_RECHUNKED_FP = 'gs://support-c23ff1a3/qplad-fine-reference/pr/v20220201000555-rechunked.zarr'

In [5]:
ERA_PR_DIAGNOSTICS_FP = 'gs://downscaled-288ec5ac/diagnostics/ERA5-ref/qplad-fine-reference/pr/v20220201000555/{diagnostics_name}.zarr'
ERA_TAS_DIAGNOSTICS_FP = 'gs://downscaled-288ec5ac/diagnostics/RELEASE-{delivery_version}/{diagnostics_name}/reanalysis/ERA5/F320/{variable_id}/{delivery_version}.zarr'

## TODO find ERA-5 reference data

In [6]:
import yaml

with open('dcmip6_all_paths_all.yaml', 'r') as f:
    DC6_STAGE_PATHS = yaml.safe_load(f)

In [7]:
EOC = slice("2080", "2099")
IPCC = slice("1995", "2014")

In [8]:
def read_gcs_zarr(
    zarr_url, token="/opt/gcsfuse_tokens/impactlab-data.json", check=False
):
    """
    takes in a GCSFS zarr url, bucket token, and returns a dataset
    Note that you will need to have the proper bucket authentication.
    """
    fs = gcsfs.GCSFileSystem(token=token)

    store_path = fs.get_mapper(zarr_url, check=check)
    ds = xr.open_zarr(store_path)

    return ds

In [9]:
def convert_lons_split(ds, lon_name="longitude"):
    # convert lons from 0-360 to -180 to 180

    print("converting lons to -180 to 180")
    # ds[lon_name].values = xr.where(ds[lon_name] > 180, ds[lon_name] - 360, ds[lon_name])

    ds = ds.assign_coords(
        **{lon_name: xr.where(ds[lon_name] > 180, ds[lon_name] - 360, ds[lon_name])}
    )
    ds = ds.sel(**{lon_name: np.sort(ds[lon_name].values)})

    return ds

def global_mean_area_weighted(da, lat_name="lat", lon_name="lon"):

    lat_weights = np.cos(da[lat_name] * np.pi / 180.0)
    weights = xr.ones_like(da) * lat_weights
    masked_weights = weights.where(~da.isnull())

    return (da * masked_weights).sum(dim=(lat_name, lon_name)) / masked_weights.sum(
        dim=(lat_name, lon_name)
    )

def global_land_mean_area_weighted(da, lat_name="lat", lon_name="lon"):

    landp25 = (
        "gs://rhg-data/impactlab-rhg/spatial/rasters/source/sedac_ciesin_GPWv4/"
        "gpw-v4-land-water-area-rev11_landareakm_30_sec_tif/derived_datasets/"
        "coarsened/gpw-v4-land-water-area-rev11_landareakm_0.25degree.zarr/"
    )
    la = read_gcs_zarr(landp25, token="/opt/gcsfuse_tokens/rhg-data.json",).load()
    # print(la)
    lweights = la.land_area / la.land_area.sum()
    
    weights = xr.ones_like(da) * lweights
    masked_weights = weights.where(~da.isnull())

    return (da * masked_weights).sum(dim=(lat_name, lon_name)) / masked_weights.sum(
        dim=(lat_name, lon_name)
    )

def global_mean_pop_weighted(da, lat_name="lat", lon_name="lon"):
    
    popp25 = (
        "gs://rhg-data/impactlab-rhg/spatial/rasters/source/sedac_ciesin_GPWv4/"
        "gpw-v4-population-count-adjusted-to-2015-unwpp-country-totals-rev11/"
        "derived_datasets/coarsened/"
        "gpw-v4-population-count-adjusted-to-2015-unwpp-country-totals-rev11_2020_0.25degree.zarr"
    )


    griddedpop = read_gcs_zarr(popp25, token="/opt/gcsfuse_tokens/rhg-data.json",).load()
    weights = griddedpop.population / griddedpop.population.sum()
    masked_weights = weights.where(~da.isnull())

    return (da * masked_weights).sum(dim=(lat_name, lon_name)) / masked_weights.sum(
        dim=(lat_name, lon_name)
    )


## Diagnostics

In [10]:
# Sample code from Diana


models_dict = dc6.get_cmip6_models()
ensemble_members = dc6.get_cmip6_ensemble_members()
grids = dc6.get_cmip6_grids()
institutions = dc6.get_cmip6_institutions()
ds_institutions = dc6.get_cmip6_institutions()
ds_institutions['MPI-ESM1-2-HR'] = 'MPI-M'
DS_INSTITUTIONS = ds_institutions
ALL_MODELS = list(models_dict.keys())

with open('dcmip6_all_paths_all.yaml', 'r') as f:
    ALL_PATHS = yaml.safe_load(f)
    

In [11]:
# to get actual downscaled data
def get_filepath(varname, model, stage, scen): 
    filepath = all_paths[model + '-' + varname][scen][stage]
    return filepath 

In [44]:
def compile_model_diags(
    var,
    diagtype,
    datatype,
    scenario,
    activity,
    delivery_version="v1.1",
    timemeans=[
        EOC,
    ],
):

    # TODO: check whether model has the desired scenario available

    vardt = {}
    gmdt = {}
    pwdt = {}
    lwdt = {}
    eocdt = {}
    modstr = []

    for idx in range(len(ALL_MODELS)):

        mod = ALL_MODELS[idx]
        fn = dc6.get_diagnostics_filepath(
            diagtype, datatype, DS_INSTITUTIONS, ensemble_members, var, mod, scenario
        )
        print(fn)

        try:
            with read_gcs_zarr(fn) as ds:

                # print(fn)
                # print("***",mod, ds) # @@@@

                ds = convert_lons_split(ds, lon_name="lon")

                # print("-------lon converted",ds) #@@@@@
                gmdt[mod] = global_mean_area_weighted(ds).compute()
                # print("NULLS", gmdt[mod]["annual_average_tasmax"].isnull().sum().values) # @@@

                pwdt[mod] = global_mean_pop_weighted(ds).compute()
                lwdt[mod] = global_land_mean_area_weighted(ds).compute()

                # loop through time periods
                tm = []
                for timemean in timemeans:

                    if "time" in ds.coords:
                        tm.append(
                            ds.sel(year=timemean, time=timemean)
                            .mean(dim=["year", "time"])
                            .compute()
                        )
                    else:
                        tm.append(
                            ds.sel(year=timemean)
                            .mean(
                                dim=[
                                    "year",
                                ]
                            )
                            .compute()
                        )

                timemeands = xr.concat(
                    tm,
                    dim=pd.Index(
                        [f"{tmslice.start}-{tmslice.stop}" for tmslice in timemeans],
                        name="epoch",
                    ),
                )
                eocdt[mod] = timemeands
                modstr.append(mod)
        except FileNotFoundError as fnfe:
            # handling the cases where the files don't exist for particular
            # variable/scenario/GCM combinations
            print(fnfe, "keep going")

        except Exception as e:
            # handling the cases where there is some kind of error for a
            # variable/scenario/GCM combination
            print(e, "keep going")

    gmds = xr.concat(list(gmdt.values()), dim=pd.Index(modstr, name="model"))
    pwds = xr.concat(list(pwdt.values()), dim=pd.Index(modstr, name="model"))
    lwds = xr.concat(list(lwdt.values()), dim=pd.Index(modstr, name="model"))
    eocds = xr.concat(list(eocdt.values()), dim=pd.Index(modstr, name="model"))

    # Note: for "clean" models that all have diff grids, return the
    #       dictionary eocdt, which has each GCM's diagnostics in it
    return gmds, pwds, lwds, eocds, eocdt


def load_era_diags(
    var,
    diagname,
    delivery_version="v1.1",
    timemeans=[
        EOC,
    ],
):

    if var in ("tasmax",):
        era_diagnostics_pattern = ERA_TAS_DIAGNOSTICS_FP
    elif var in ("pr",):
        era_diagnostics_pattern = ERA_PR_DIAGNOSTICS_FP
    else:
        raise NotImplementedError("what variable are you processing? not tasmax or pr!")

    fn = era_diagnostics_pattern.format(
        delivery_version=delivery_version,
        diagnostics_name=diagname,
        table_id="day",
        variable_id=var,
    )
    print(fn)

    with read_gcs_zarr(fn) as ds:

        ds = convert_lons_split(ds, lon_name="lon")
        # print(ds)
        gm = global_mean_area_weighted(ds).compute()
        pw = global_mean_pop_weighted(ds).compute()
        lw = global_land_mean_area_weighted(ds).compute()

        # loop through time periods
        tm = []
        for timemean in timemeans:

            if "time" in ds.coords:
                tm.append(
                    ds.sel(year=timemean, time=timemean)
                    .mean(dim=["year", "time"])
                    .compute()
                )
            else:
                tm.append(
                    ds.sel(year=timemean)
                    .mean(
                        dim=[
                            "year",
                        ]
                    )
                    .compute()
                )

        timemeands = xr.concat(
            tm,
            dim=pd.Index(
                [f"{tmslice.start}-{tmslice.stop}" for tmslice in timemeans],
                name="epoch",
            ),
        )
        eoc = timemeands

    return gm, pw, lw, eoc

In [13]:
def load_one_model_diag(var, diagname, model, scenario, activity, delivery_version="v1.1",):
    
    if 1:
        modfmt = dict(
            institution_id=INSTITUTIONS[model],
            source_id=model,
            member_id=ENSEMBLE_MEMBERS[model],
        )

        # if var in ("tasmax",):
        #     downscaled_diagnostics_pattern = downscaled_diagnostics_pattern_temp
        # elif var in ("pr",):
        #     downscaled_diagnostics_pattern = downscaled_diagnostics_pattern_pr
        # else:
        #     raise NotImplementedError("what variable are you processing? not tasmax or pr!")
            
        fn = downscaled_diagnostics_pattern.format(
            delivery_version=delivery_version,
            diagnostics_name=diagname,
            activity_id=activity,
            experiment_id=scenario,
            table_id="day",
            variable_id=var,
            **modfmt,
        )
        print(fn)

        with read_gcs_zarr(fn) as ds:
            if "lon" in ds.dims:
                ds = convert_lons_split(ds, lon_name="lon")
    
    return ds

def load_one_clean_model_diag(var, diagname, model, scenario, activity, delivery_version="1.1",):
    
    return load_one_model_diag(var, f"clean-{diagname}", model, scenario, activity, delivery_version=delivery_version,)

def compile_model_tstats(var, diagname, diagvar, scenario, activity, delivery_version="1.1", timemeans=[EOC, ]):
    
    # TODO: check whether model has the desired scenario available
    
    vardt = {}
    eocdt = {}
    modstr = []

    for idx in range(len(DELIVERY_MODELS)):

        mod = DELIVERY_MODELS[idx]
        modfmt = dict(
            institution_id=INSTITUTIONS[DELIVERY_MODELS[idx]],
            source_id=mod,
            member_id=ENSEMBLE_MEMBERS[DELIVERY_MODELS[idx]],
        )

        # if var in ("tasmax",):
        #     downscaled_diagnostics_pattern = downscaled_diagnostics_pattern_temp
        # elif var in ("pr",):
        #     downscaled_diagnostics_pattern = downscaled_diagnostics_pattern_pr
        # else:
        #     raise NotImplementedError("what variable are you processing? not tasmax or pr!")
            
        fnfut = downscaled_diagnostics_pattern.format(
            delivery_version=delivery_version,
            diagnostics_name=diagname,
            activity_id=activity,
            experiment_id=scenario,
            table_id="day",
            variable_id=var,
            **modfmt,
        )
        print(fnfut)
        fnhist = downscaled_diagnostics_pattern.format(
            delivery_version=delivery_version,
            diagnostics_name=diagname,
            activity_id="CMIP",
            experiment_id="historical",
            table_id="day",
            variable_id=var,
            **modfmt,
        )
        print(fnhist)

        try:
            with read_gcs_zarr(fnfut) as dsf:
                with read_gcs_zarr(fnhist) as dsh:
                    
                    # vardt[mod] = read_gcs_zarr(fn)

                    # print("***",mod, ds)
                    dsf = convert_lons_split(dsf, lon_name="lon")
                    dsh = convert_lons_split(dsh, lon_name="lon")

                    # print(dsf)
                    # loop through time periods
                    tm = []
                    for timemean in timemeans:

                        tstats = xr_ttest(dsf[diagvar].sel(year=timemean), dsh[diagvar].sel(year=IPCC),).compute()
                        tm.append(tstats)

                    eocdt[mod] = xr.concat(tm, dim=pd.Index([f"{tmslice.start}-{tmslice.stop}" for tmslice in timemeans], name="epoch"))
                    modstr.append(mod)
        except FileNotFoundError as fnfe:

            print(fnfe, "keep going")

        except Exception as e:

            print(e, "keep going")
            # raise e

    eocds = xr.concat(list(eocdt.values()), dim=pd.Index(modstr, name="model"))

    return eocds, eocdt # return the dict for "clean" models that all have diff grids

def compile_model_seasonal_diags(var, diagname, diagvar, scenario, activity, delivery_version="1.1", timemeans=[EOC, ]):
    
    # TODO: check whether model has the desired scenario available
    
    eocdt = {}
    modstr = []

    for idx in range(len(DELIVERY_MODELS)):

        mod = DELIVERY_MODELS[idx]
        modfmt = dict(
            institution_id=INSTITUTIONS[DELIVERY_MODELS[idx]],
            source_id=mod,
            member_id=ENSEMBLE_MEMBERS[DELIVERY_MODELS[idx]],
        )

        # if var in ("tasmax",):
        #     downscaled_diagnostics_pattern = downscaled_diagnostics_pattern_temp
        # elif var in ("pr",):
        #     downscaled_diagnostics_pattern = downscaled_diagnostics_pattern_pr
        # else:
        #     raise NotImplementedError("what variable are you processing? not tasmax or pr!")
            
        fn = downscaled_diagnostics_pattern.format(
            delivery_version=delivery_version,
            diagnostics_name=diagname,
            activity_id=activity,
            experiment_id=scenario,
            table_id="day",
            variable_id=var,
            **modfmt,
        )
        print(fn)

        try:
            with read_gcs_zarr(fn) as ds:
                # vardt[mod] = read_gcs_zarr(fn)

                # print("***",mod, ds)
                ds = convert_lons_split(ds, lon_name="lon")

                # loop through time periods
                tm = []
                for timemean in timemeans:

                    seasds = ds.groupby("time.season").mean(dim="time").compute()
                    tm.append(seasds)
                    # tstats = xr_ttest(dsf[diagvar].sel(year=timemean), dsh[diagvar].sel(year=IPCC),).compute()                   

                timemeands = xr.concat(tm, dim=pd.Index([f"{tmslice.start}-{tmslice.stop}" for tmslice in timemeans], name="epoch"))
                eocdt[mod] = timemeands
                modstr.append(mod)
        except FileNotFoundError as fnfe:

            print(fnfe, "keep going")

        except Exception as e:

            print(e, "keep going")

    eocds = xr.concat(list(eocdt.values()), dim=pd.Index(modstr, name="model"))

    return eocds, eocdt # return the dict for "clean" models that all have diff grids


In [14]:
def xr_ttest(da1, da2, lat_name="lat", lon_name="lon"):
# inspiration: https://gist.github.com/cbur24/eb169254dae72a89acd66998a0fc4036
# also: https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.ttest_ind.html

    from scipy import stats

    arr_1 = da1.values
    arr_2 = da2.values
    #Get coordinates from the original xarray
    lat  = da1.coords[lat_name]  
    lon  = da1.coords[lon_name]  
    
    #run the t-test
    # print('starting T-test')
    t_stat, p_values = stats.ttest_ind(arr_1, arr_2, axis=0, equal_var = True, nan_policy = "propagate")

    #Write arrays into a x-array
    t_stat_xr = xr.DataArray(t_stat, coords = {lat_name: lat, lon_name: lon}, dims = [lat_name, lon_name], name='t_stat')
    p_val_xr = xr.DataArray(p_values, coords = {lat_name: lat, lon_name: lon}, dims = [lat_name, lon_name], name='p_value')
    
    # print('finished T-test')
    return xr.merge([t_stat_xr, p_val_xr])


# Gather & compute metrics:
- global area weighted average
- global land area weighted average
- population weighted average
- epoch averages

In [15]:
# TODO update this to a more useful location
SAVEDIR = "/gcs/impactlab-data/climate/downscaling/qc/kelly_diagnostics/"


In [50]:
! ls -ltrh /gcs/impactlab-data/climate/downscaling/qc/kelly_diagnostics/

total 23G
-rw-r--r-- 1 jovyan jovyan 2.5G Mar 24  2022 annual_precip_diagnostics_ssp370_epochaverages_allpapermodels.nc
-rw-r--r-- 1 jovyan jovyan 2.4G Mar 24  2022 annual_precip_diagnostics_historical_epochaverages_allpapermodels.nc
-rw-r--r-- 1 jovyan jovyan 950M Mar 24  2022 annual_tasmax_diagnostics_ssp370_epochaverages_allpapermodels.nc
-rw-r--r-- 1 jovyan jovyan 890M Mar 24  2022 annual_tasmax_diagnostics_historical_epochaverages_allpapermodels.nc
-rw-r--r-- 1 jovyan jovyan 103K Mar 24  2022 annual_tasmax_diagnostics_ssp245_global_mean_timeseries_allpapermodels.nc
-rw-r--r-- 1 jovyan jovyan 1.1G Mar 24  2022 annual_tasmax_diagnostics_ssp245_epochaverages_allpapermodels.nc
-rw-r--r-- 1 jovyan jovyan 339K Mar 24  2022 annual_precip_diagnostics_ssp245_global_mean_timeseries_allpapermodels.nc
-rw-r--r-- 1 jovyan jovyan 2.9G Mar 24  2022 annual_precip_diagnostics_ssp245_epochaverages_allpapermodels.nc
-rw-r--r-- 1 jovyan jovyan 297K Mar 24  2022 annual_precip_diagnostics_ssp370_global

### save to disk
Basically we read in diagnostics for each GCM, compute the metric, concatenate into one dataset per metric, and save to disk.

The variables are:
- "tasmax": annual_tasmax_diagnostics (annual average)
- "tasmin": annual_tasmin_diagnostics (annual average)
- "pr": annual_precip_diagnostics (annual total)

For each, we save these metrics:
- `[lat, lon]` for epoch averages: 
    - slice("2020", "2040"), slice("2040", "2060"), slice("2060", "2080"), slice("2080", "2099")
    - slice("1960", "1980"), slice("1980", "2000"), slice("1995", "2014"),
    
- `[year,]` global means:
    - area-weighted global mean
    - pop-weighted global mean
    - land-area-weighted global mean

In [18]:
PR_DONE = True
TASMAX_DONE = True
TASMIN_DONE = True

timemeans = {
    "ssp245": [
        slice("2020", "2040"),
        slice("2040", "2060"),
        slice("2060", "2080"),
        EOC,
    ],
    "ssp370": [
        slice("2020", "2040"),
        slice("2040", "2060"),
        slice("2060", "2080"),
        EOC,
    ],
    "historical": [
        slice("1960", "1980"),
        slice("1980", "2000"),
        IPCC,
    ],
    "era": [
        slice("1960", "1980"),
        slice("1980", "2000"),
        IPCC,
    ],
}
mips = {
    "ssp245": "ScenarioMIP",
    "ssp370": "ScenarioMIP",
    "historical": "CMIP",
}

In [24]:
if not PR_DONE:
    VAR = "precip"
    RUNIT = True
elif not TASMAX_DONE:
    VAR = "tasmax"
    RUNIT = True
elif not TASMIN_DONE:
    VAR = "tasmin"
    RUNIT = True

else:
    RUNIT = False

if RUNIT:
    plazy = {}
    pgm = {}
    ppwgm = {}
    plwgm = {}
    peoc = {}

    for scen in ["ssp245", "ssp370", "historical"]:
        pgm[scen], ppwgm[scen], plwgm[scen], peoc[scen], _ = compile_model_diags(
            var=VAR,
            diagtype="annual",
            datatype="downscaled",
            # diagname="annual-precip-diagnostics",
            scenario=scen,
            activity=mips[scen],
            delivery_version="v1.1",
            timemeans=timemeans[scen],
        )
        scengm = xr.concat(
            [pgm[scen], ppwgm[scen], plwgm[scen]],
            dim=pd.Index(
                ["area-weighted", "pop-weighted", "land-area-weighted"],
                name="weighting",
            ),
        )

        print(f"saving {scen} data")
        scengm.to_netcdf(
            f"{SAVEDIR}/annual_{VAR}_diagnostics_{scen}_global_mean_timeseries_GCMS.nc"
        )

        peoc[scen].to_netcdf(
            f"{SAVEDIR}/annual_{VAR}_diagnostics_{scen}_epochaverages_GCMs.nc"
        )

    scen = "era"
    pgm[scen], ppwgm[scen], plwgm[scen], peoc[scen] = load_era_diags(
        VAR if VAR in ["tasmax","tasmin"] else "pr",
        f"annual-{VAR}-diagnostics",
        timemeans=timemeans[scen],
    )
    scengm = xr.concat(
            [pgm[scen], ppwgm[scen], plwgm[scen]],
            dim=pd.Index(
                ["area-weighted", "pop-weighted", "land-area-weighted"],
                name="weighting",
            ),
        )
    print("saving era data")
    scengm.to_netcdf(
        f"{SAVEDIR}/annual_{VAR}_diagnostics_{scen}_global_mean_timeseries_ERA5.nc"
    )

    peoc[scen].to_netcdf(
        f"{SAVEDIR}/annual_{VAR}_diagnostics_{scen}_epochaverages_ERA5.nc"
    )

gs://downscaled-288ec5ac/diagnostics/RELEASE-v1.1/annual-tasmin-diagnostics/ScenarioMIP/BCC/BCC-CSM2-MR/ssp245/r1i1p1f1/day/tasmin/v1.1.zarr
converting lons to -180 to 180
gs://downscaled-288ec5ac/diagnostics/RELEASE-v1.1/annual-tasmin-diagnostics/ScenarioMIP/CAS/FGOALS-g3/ssp245/r1i1p1f1/day/tasmin/v1.1.zarr
converting lons to -180 to 180
gs://downscaled-288ec5ac/diagnostics/RELEASE-v1.1/annual-tasmin-diagnostics/ScenarioMIP/CSIRO/ACCESS-ESM1-5/ssp245/r1i1p1f1/day/tasmin/v1.1.zarr
converting lons to -180 to 180
gs://downscaled-288ec5ac/diagnostics/RELEASE-v1.1/annual-tasmin-diagnostics/ScenarioMIP/CSIRO-ARCCSS/ACCESS-CM2/ssp245/r1i1p1f1/day/tasmin/v1.1.zarr
converting lons to -180 to 180
gs://downscaled-288ec5ac/diagnostics/RELEASE-v1.1/annual-tasmin-diagnostics/ScenarioMIP/INM/INM-CM4-8/ssp245/r1i1p1f1/day/tasmin/v1.1.zarr
converting lons to -180 to 180
gs://downscaled-288ec5ac/diagnostics/RELEASE-v1.1/annual-tasmin-diagnostics/ScenarioMIP/INM/INM-CM5-0/ssp245/r1i1p1f1/day/tasmin/v1.

1. Consolidating metadata in this existing store with zarr.consolidate_metadata().
2. Explicitly setting consolidated=False, to avoid trying to read consolidate metadata, or
3. Explicitly setting consolidated=True, to raise an error in this case instead of falling back to try reading non-consolidated metadata.
  ds = xr.open_zarr(store_path)


converting lons to -180 to 180
gs://downscaled-288ec5ac/diagnostics/RELEASE-v1.1/annual-tasmin-diagnostics/ScenarioMIP/MOHC/UKESM1-0-LL/ssp245/r1i1p1f2/day/tasmin/v1.1.zarr
converting lons to -180 to 180
gs://downscaled-288ec5ac/diagnostics/RELEASE-v1.1/annual-tasmin-diagnostics/ScenarioMIP/MPI-M/MPI-ESM1-2-LR/ssp245/r1i1p1f1/day/tasmin/v1.1.zarr
converting lons to -180 to 180
gs://downscaled-288ec5ac/diagnostics/RELEASE-v1.1/annual-tasmin-diagnostics/ScenarioMIP/CMCC/CMCC-CM2-SR5/ssp245/r1i1p1f1/day/tasmin/v1.1.zarr
converting lons to -180 to 180
gs://downscaled-288ec5ac/diagnostics/RELEASE-v1.1/annual-tasmin-diagnostics/ScenarioMIP/CMCC/CMCC-ESM2/ssp245/r1i1p1f1/day/tasmin/v1.1.zarr
converting lons to -180 to 180
gs://downscaled-288ec5ac/diagnostics/RELEASE-v1.1/annual-tasmin-diagnostics/ScenarioMIP/CCCma/CanESM5/ssp245/r1i1p1f1/day/tasmin/v1.1.zarr
converting lons to -180 to 180
gs://downscaled-288ec5ac/diagnostics/RELEASE-v1.1/annual-tasmin-diagnostics/ScenarioMIP/EC-Earth-Consorti

1. Consolidating metadata in this existing store with zarr.consolidate_metadata().
2. Explicitly setting consolidated=False, to avoid trying to read consolidate metadata, or
3. Explicitly setting consolidated=True, to raise an error in this case instead of falling back to try reading non-consolidated metadata.
  ds = xr.open_zarr(store_path)


converting lons to -180 to 180
gs://downscaled-288ec5ac/diagnostics/RELEASE-v1.1/annual-tasmin-diagnostics/ScenarioMIP/EC-Earth-Consortium/EC-Earth3-Veg/ssp245/r1i1p1f1/day/tasmin/v1.1.zarr
converting lons to -180 to 180
gs://downscaled-288ec5ac/diagnostics/RELEASE-v1.1/annual-tasmin-diagnostics/ScenarioMIP/EC-Earth-Consortium/EC-Earth3-Veg-LR/ssp245/r1i1p1f1/day/tasmin/v1.1.zarr
converting lons to -180 to 180
saving ssp245 data
gs://downscaled-288ec5ac/diagnostics/RELEASE-v1.1/annual-tasmin-diagnostics/ScenarioMIP/BCC/BCC-CSM2-MR/ssp370/r1i1p1f1/day/tasmin/v1.1.zarr
converting lons to -180 to 180
gs://downscaled-288ec5ac/diagnostics/RELEASE-v1.1/annual-tasmin-diagnostics/ScenarioMIP/CAS/FGOALS-g3/ssp370/r1i1p1f1/day/tasmin/v1.1.zarr
converting lons to -180 to 180
gs://downscaled-288ec5ac/diagnostics/RELEASE-v1.1/annual-tasmin-diagnostics/ScenarioMIP/CSIRO/ACCESS-ESM1-5/ssp370/r1i1p1f1/day/tasmin/v1.1.zarr
converting lons to -180 to 180
gs://downscaled-288ec5ac/diagnostics/RELEASE-v1.1

1. Consolidating metadata in this existing store with zarr.consolidate_metadata().
2. Explicitly setting consolidated=False, to avoid trying to read consolidate metadata, or
3. Explicitly setting consolidated=True, to raise an error in this case instead of falling back to try reading non-consolidated metadata.
  ds = xr.open_zarr(store_path)


group not found at path '' keep going
gs://downscaled-288ec5ac/diagnostics/RELEASE-v1.1/annual-tasmin-diagnostics/ScenarioMIP/MPI-M/MPI-ESM1-2-HR/ssp370/r1i1p1f1/day/tasmin/v1.1.zarr
group not found at path '' keep going
gs://downscaled-288ec5ac/diagnostics/RELEASE-v1.1/annual-tasmin-diagnostics/ScenarioMIP/MOHC/HadGEM3-GC31-LL/ssp370/r1i1p1f3/day/tasmin/v1.1.zarr
group not found at path '' keep going
gs://downscaled-288ec5ac/diagnostics/RELEASE-v1.1/annual-tasmin-diagnostics/ScenarioMIP/MOHC/UKESM1-0-LL/ssp370/r1i1p1f2/day/tasmin/v1.1.zarr
converting lons to -180 to 180
gs://downscaled-288ec5ac/diagnostics/RELEASE-v1.1/annual-tasmin-diagnostics/ScenarioMIP/MPI-M/MPI-ESM1-2-LR/ssp370/r1i1p1f1/day/tasmin/v1.1.zarr
converting lons to -180 to 180
gs://downscaled-288ec5ac/diagnostics/RELEASE-v1.1/annual-tasmin-diagnostics/ScenarioMIP/CMCC/CMCC-CM2-SR5/ssp370/r1i1p1f1/day/tasmin/v1.1.zarr
converting lons to -180 to 180
gs://downscaled-288ec5ac/diagnostics/RELEASE-v1.1/annual-tasmin-diagnost

1. Consolidating metadata in this existing store with zarr.consolidate_metadata().
2. Explicitly setting consolidated=False, to avoid trying to read consolidate metadata, or
3. Explicitly setting consolidated=True, to raise an error in this case instead of falling back to try reading non-consolidated metadata.
  ds = xr.open_zarr(store_path)


converting lons to -180 to 180
gs://downscaled-288ec5ac/diagnostics/RELEASE-v1.1/annual-tasmin-diagnostics/ScenarioMIP/EC-Earth-Consortium/EC-Earth3-Veg-LR/ssp370/r1i1p1f1/day/tasmin/v1.1.zarr
converting lons to -180 to 180
saving ssp370 data
gs://downscaled-288ec5ac/diagnostics/RELEASE-v1.1/annual-tasmin-diagnostics/CMIP/BCC/BCC-CSM2-MR/historical/r1i1p1f1/day/tasmin/v1.1.zarr
converting lons to -180 to 180
gs://downscaled-288ec5ac/diagnostics/RELEASE-v1.1/annual-tasmin-diagnostics/CMIP/CAS/FGOALS-g3/historical/r1i1p1f1/day/tasmin/v1.1.zarr
converting lons to -180 to 180
gs://downscaled-288ec5ac/diagnostics/RELEASE-v1.1/annual-tasmin-diagnostics/CMIP/CSIRO/ACCESS-ESM1-5/historical/r1i1p1f1/day/tasmin/v1.1.zarr
converting lons to -180 to 180
gs://downscaled-288ec5ac/diagnostics/RELEASE-v1.1/annual-tasmin-diagnostics/CMIP/CSIRO-ARCCSS/ACCESS-CM2/historical/r1i1p1f1/day/tasmin/v1.1.zarr
converting lons to -180 to 180
gs://downscaled-288ec5ac/diagnostics/RELEASE-v1.1/annual-tasmin-diagnos

NotImplementedError: what variable are you processing? not tasmax or pr!

## TODO save these outputs (tstats and seasonal outputs)

In [None]:
scen = "ssp370"
ptstat = {}
ptstat[scen],_ = compile_model_tstats(
    var="pr",
    diagname="annual-precip-diagnostics",
    diagvar="total_annual_precip",
    scenario=scen,
    activity="ScenarioMIP",
    delivery_version="1.1",
    timemeans=[
        slice("2020", "2040"),
        slice("2040", "2060"),
        slice("2060", "2080"),
        EOC,
    ],
)


In [None]:
scen = "ssp370"
pseas = {}
pseas[scen],_ = compile_model_seasonal_diags(
    var="pr",
    diagname="annual-precip-diagnostics",
    diagvar="total_seasonal_precip",
    scenario=scen,
    activity="ScenarioMIP",
    delivery_version="1.1",
    timemeans=[
        slice("2020", "2040"),
        slice("2040", "2060"),
        slice("2060", "2080"),
        EOC,
    ],
)

scen = "historical"
pseas[scen],_ = compile_model_seasonal_diags(
    var="pr",
    diagname="annual-precip-diagnostics",
    diagvar="total_seasonal_precip",
    scenario=scen,
    activity="CMIP",
    delivery_version="1.1",
    timemeans=[
        slice("1960", "1980"),
        slice("1980", "2000"),
        IPCC,
    ],
)


gs://downscaled-288ec5ac/diagnostics/RELEASE-v1.1/annual-precip-diagnostics/ScenarioMIP/BCC/BCC-CSM2-MR/ssp370/r1i1p1f1/day/pr/1.1.zarr
converting lons to -180 to 180


## Clean models

Had some problems with NaNs being in the output. Right now I think land-area-weighted mean and pop-weighted means are giving back NaNs

In [45]:
# CLEAN

VAR = "tasmax"
RUNIT = True
if RUNIT:
    print(VAR)
    # TODO regrid the epoch averages onto same grid so can concat and save to one netcdf
    clpgm = {}
    clppwgm = {}
    clplwgm = {}
    clpeoc = {}
    for scen in ["ssp245", "ssp370", "historical"]:
        (
            clpgm[scen],
            clppwgm[scen],
            clplwgm[scen],
            _,
            clpeoc[scen]
        ) = compile_model_diags(
            var=VAR,# if VAR in ["tasmax","tasmin"] else "pr",
            # diagname=f"clean-annual-{VAR}-diagnostics",
            diagtype="annual",
            datatype="clean",
            scenario=scen,
            activity=mips[scen],
            delivery_version="v1.1",
            timemeans=timemeans[scen],
        )

        clscengm = xr.concat(
            [clpgm[scen], clppwgm[scen], clplwgm[scen]],
            dim=pd.Index(
                ["area-weighted", "pop-weighted", "land-area-weighted"],
                name="weighting",
            ),
        )
        print(f"saving cleaned {scen} data")
        clscengm.to_netcdf(
            f"{SAVEDIR}/annual_{VAR}_diagnostics_{scen}_global_mean_timeseries_cleanGCMS.nc"
        )

        # Actually can't save this as one netcdf file, b/c each GCM has a different 
        # resolution (they are the raw, cleaned GCMs)
        # clpeoc[scen].to_netcdf(
        #     f"{SAVEDIR}/annual_{VAR}_diagnostics_{scen}_epochaverages_cleanGCMs.nc"
        # )


tasmax
gs://downscaled-288ec5ac/diagnostics/RELEASE-v1.1/clean-annual-tasmax-diagnostics/ScenarioMIP/BCC/BCC-CSM2-MR/ssp245/r1i1p1f1/day/tasmax/v1.1.zarr
gs://downscaled-288ec5ac/diagnostics/RELEASE-v1.1/clean-annual-tasmax-diagnostics/ScenarioMIP/BCC/BCC-CSM2-MR/ssp245/r1i1p1f1/day/tasmax/v1.1.zarr
*** BCC-CSM2-MR <xarray.Dataset>
Dimensions:                 (year: 97, lat: 160, lon: 320)
Coordinates:
  * lat                     (lat) float64 -89.14 -88.03 -86.91 ... 88.03 89.14
  * lon                     (lon) float64 0.0 1.125 2.25 ... 356.6 357.8 358.9
  * year                    (year) int64 2004 2005 2006 2007 ... 2098 2099 2100
Data variables:
    annual_average_tasmax   (year, lat, lon) float32 dask.array<chunksize=(25, 40, 160), meta=np.ndarray>
    annual_count_above_95F  (year, lat, lon) int64 dask.array<chunksize=(25, 40, 80), meta=np.ndarray>
Attributes: (12/54)
    Conventions:             CF-1.7 CMIP-6.2
    activity_id:             ScenarioMIP
    branch_method:       

  return func(*(_execute_task(a, cache) for a in args))
  return func(*(_execute_task(a, cache) for a in args))


gs://downscaled-288ec5ac/diagnostics/RELEASE-v1.1/clean-annual-tasmax-diagnostics/ScenarioMIP/CAS/FGOALS-g3/ssp245/r1i1p1f1/day/tasmax/v1.1.zarr
gs://downscaled-288ec5ac/diagnostics/RELEASE-v1.1/clean-annual-tasmax-diagnostics/ScenarioMIP/CAS/FGOALS-g3/ssp245/r1i1p1f1/day/tasmax/v1.1.zarr
*** FGOALS-g3 <xarray.Dataset>
Dimensions:                 (year: 97, lat: 80, lon: 180)
Coordinates:
  * lat                     (lat) float64 -90.0 -84.82 -80.72 ... 84.82 90.0
  * lon                     (lon) float64 0.0 2.0 4.0 6.0 ... 354.0 356.0 358.0
  * year                    (year) int64 2004 2005 2006 2007 ... 2098 2099 2100
Data variables:
    annual_average_tasmax   (year, lat, lon) float32 dask.array<chunksize=(25, 40, 90), meta=np.ndarray>
    annual_count_above_95F  (year, lat, lon) int64 dask.array<chunksize=(25, 40, 90), meta=np.ndarray>
Attributes: (12/51)
    Conventions:             CF-1.7 CMIP-6.2
    activity_id:             ScenarioMIP
    branch_method:           standard
   

  return func(*(_execute_task(a, cache) for a in args))
  return func(*(_execute_task(a, cache) for a in args))


saving cleaned ssp245 data


In [61]:
clscengm.isnull().groupby("weighting").sum()