# Multi-model mean scaling

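This notebook takes the multi-model mean forced response (computed from the training dataset) and scales it to match a simulation or dataset of interest. The scaling factor is the slope of a linear fit between the smoothed global mean surface air temperature of the target dataset and the global mean surface air temperature of the multi-model mean.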

### Imports

In [1]:
import xcdat as xc
from fx import verify_monthly_times_match
import numpy as np
import xarray as xr
import os
import glob
from scipy.signal import savgol_filter
# suppress warnings (but allow errors)
import logging
logging.getLogger('xcdat').setLevel(logging.ERROR)

### Custom functions

In [2]:
def scale_multimodel_mean_timeseries(ds_raw, reference_data, vmap):
    """
    scale_multimodel_mean_timeseries(ds_raw, reference_data, vmap)

    Function solves for the best scaling between a multimodel mean, global
    mean surface air temperature time series (forced response) and a "raw"
    global mean time series that includes both internal variability and the
    forced response such that:

        y = m*x + b

    where y is the smoothed global mean "raw" surface temperature time
    series, x is the global mean reference (multimodel mean) surface
    temperature time series (it is not smoothed any further here), and m and
    b are the fitted slope and intercept, respectively. Both series are
    monthly departures. Only the slope is used (the intercept is discarded):
    it is applied to the reference time series of each field of interest
    (e.g., sea ice concentration, surface pressure, or atmospheric
    temperature) to estimate the forced time series in the raw dataset such
    that:

        F[t, x, y] = m*X[t, x, y]

    Parameters:
    -----------
    ds_raw (xr.Dataset)      : Dataset containing the raw (forced+internal)
                               surface temperature data (contains "tas").
                               The object passed in is not modified.
    reference_data (Dict)    : Dictionary containing reference datasets to
                               be scaled in form of reference_data[variable_name] = ds
                               (must include the key 'tas')
    vmap (Dict)              : Mapping for variable name, cmipTable, and netcdf id in form
                               vmap[variable_name] = [cmipTable, netcdf_id], e.g.,
                               vmap['zmta'] = ['Amon', 'ta']

    Returns:
    --------
    Dict : Dict containing datasets of the forced response for each field, e.g.,
           scaled_data['tas'] = ds

    Raises:
    -------
    ValueError : If the monthly time axes of the (time-subsetted) reference
                 tas data and the raw data do not match.

    Notes:
    ------
    Smoothing is performed using a third order Savitzky-Golay filter (1) with a window length
    of 120 months. The scaling coefficient is always fit on global mean surface air
    temperature ("tas"); the same slope is then applied to every field in reference_data.
    If a missing value exists in either the reference or raw tas data then a missing value
    is included in both datasets before the global means are computed. If the reference and
    raw time axes differ in length, the reference data (and the returned datasets) are
    subset to the first/last month of the raw data.

    [1] https://docs.scipy.org/doc/scipy/reference/generated/scipy.signal.savgol_filter.html
    """
    # work on a shallow copy so that the time axis / masking assignments
    # below do not alter the dataset object owned by the caller
    ds_raw = ds_raw.copy()
    # get reference tas data for scaling (copy so reference_data is untouched)
    ds_ref_tas = reference_data['tas'].copy()
    # subset reference time axis to the span of the raw data (if needed)
    subsetTime = len(ds_ref_tas.time) != len(ds_raw.time)
    timeSlice = None
    if subsetTime:
        rtime = ds_raw.time.values
        startString = str(rtime[0].year) + '-' + str(rtime[0].month).zfill(2)
        endString = str(rtime[-1].year) + '-' + str(rtime[-1].month).zfill(2)
        timeSlice = slice(startString, endString)
        ds_ref_tas = ds_ref_tas.sel(time=timeSlice)
    # time axes must match month-for-month; adopt the reference axis so the
    # element-wise operations below align exactly
    if not verify_monthly_times_match(ds_ref_tas.time, ds_raw.time):
        raise ValueError('Time axes do not match.')
    ds_raw['time'] = ds_ref_tas['time']
    # mutual masking: a missing value in either dataset becomes missing in both
    ds_ref_tas["tas"] = xr.where(~np.isnan(ds_raw["tas"]), ds_ref_tas["tas"], np.nan)
    ds_raw["tas"] = xr.where(~np.isnan(ds_ref_tas["tas"]), ds_raw["tas"], np.nan)
    # add missing bounds (if needed)
    ds_ref_tas = ds_ref_tas.bounds.add_missing_bounds(['X', 'Y', 'T'])
    ds_raw = ds_raw.bounds.add_missing_bounds(['X', 'Y', 'T'])
    # compute global mean average and monthly departures
    ds_ref_tas_gm = ds_ref_tas.spatial.average('tas')
    ds_raw_gm = ds_raw.spatial.average('tas')
    ts_ref_tas_gm = ds_ref_tas_gm.temporal.departures('tas', freq='month')['tas']
    ts_raw_gm = ds_raw_gm.temporal.departures('tas', freq='month')['tas']
    # smooth the "raw" data (120-month window, third order polynomial)
    ts_raw_gm_smooth = ts_raw_gm.copy()
    ts_raw_gm_smooth.values = savgol_filter(ts_raw_gm, 10*12, 3)
    # do fit; only the slope is needed for the scaling
    m, _ = np.polyfit(ts_ref_tas_gm, ts_raw_gm_smooth, 1)
    # loop over and scale all reference fields
    scaled_data = {}
    for vid in reference_data.keys():
        # get netcdf id to read data
        lvar = vmap[vid][1]
        ds_ref = reference_data[vid]
        if subsetTime:
            ds_ref = ds_ref.sel(time=timeSlice)
        # copy reference dataset to use for output data so that the
        # entries of reference_data are never modified
        ds_fitted = ds_ref.copy()
        # scale reference dataarray and insert into output dataset
        ds_fitted[lvar] = ds_ref[lvar] * m
        # add output dataset to scaled_data dictionary
        scaled_data[vid] = ds_fitted
    return scaled_data

### Parameters

In [3]:
# Parameters

# input / output locations
forcesmip_root = '/glade/campaign/cgd/cas/asphilli/ForceSMIP/'  # ForceSMIP archive root
dpath_em = '/glade/work/pochedls/forcesmip/ensemble_mean/'  # multimodel ensemble-mean files
dpath_out = '/glade/work/pochedls/forcesmip/'  # root for prediction output

# label for this method (embedded in output file names)
fmethod = 'scaleMultiModelMeanBasedOnGlobalTas'

# training models to process
models = [
    'CanESM5',
    'CESM2',
    'MIROC6',
    'MIROC-ES2L',
    'MPI-ESM1-2-LR',
]

# variable id -> [cmipTable, netcdf variable id]
vmap = {
    # monthly atmospheric fields
    'pr': ['Amon', 'pr'],
    'psl': ['Amon', 'psl'],
    'tas': ['Amon', 'tas'],
    'zmta': ['Amon', 'ta'],
    # fields listed under the 'Aday' table
    'monmaxpr': ['Aday', 'pr'],
    'monmaxtasmax': ['Aday', 'tasmax'],
    'monmintasmin': ['Aday', 'tasmin'],
    # sea ice / ocean fields
    'siconc': ['OImon', 'siconc'],
    'tos': ['Omon', 'tos'],
}

### Load Reference Multimodel Mean (forced) Time Series for Scaling

In [4]:
# load the multimodel-mean (forced) reference dataset for every variable
reference_data = {}
for vid in vmap.keys():
    # get appropriate netcdf variable id
    lvar = vmap[vid][1]
    # reference file for this variable
    fnr = dpath_em + vid + '_mon_MMM_historical_ssp370_ensmean.nc'
    # Open via a context manager: the file handle belongs to the dataset
    # returned by open_dataset, so it is released here rather than by calling
    # close() on the derived departures dataset.
    with xc.open_dataset(fnr, add_bounds=["T", "X", "Y"]) as ds_file:
        # convert to monthly departures and pull the result into memory
        # before the underlying file is closed
        reference_data[vid] = ds_file.temporal.departures(lvar, freq='month').load()

# ensure all time axes match (the tas axis is the one used for the fit)
for vid in vmap.keys():
    if not verify_monthly_times_match(reference_data['tas'].time, reference_data[vid].time):
        raise ValueError('Reference time axes do not match.')

### Estimate Forced Response in Training Data

In [5]:
# ensure output path exists (exist_ok avoids a separate existence check)
dpath_train_out = dpath_out + '/training_predictions/'
os.makedirs(dpath_train_out, exist_ok=True)

for model in models:
    # print progress
    print('   ' + model)
    # specify data path
    dpath = forcesmip_root + '/Training/Amon/tas/' + model
    # get all files for model
    mfiles = glob.glob(dpath + '/*nc')
    # loop over all files / members
    for fn in mfiles:
        # parse the member from the file name only, so that '.' or '_'
        # characters in the directory path cannot affect the result
        fname = os.path.basename(fn)
        if model == 'CESM2':
            # CESM2 member ids span two dot-separated fields
            member = '.'.join(fname.split('_')[-1].split('.')[0:2])
        else:
            member = fname.split('.')[0].split('_')[-1]
        # reuse the extension of the current input file for the output files
        fext = fname.split('.')[-1]
        # open dataset; the context manager closes the file even if the
        # scaling or the write below raises
        with xc.open_dataset(fn) as ds_raw:
            # do fit
            scaled_data = scale_multimodel_mean_timeseries(ds_raw, reference_data, vmap)
            # save out each scaled dataset/variable
            for vid, ds in scaled_data.items():
                # specify output path
                fnOut = dpath_train_out + vid + '_mon_' + model + '_' + fmethod + '_historical_ssp370_' + member + '.' + fext
                # save output
                ds.to_netcdf(fnOut)

   CanESM5
   CESM2
   MIROC6
   MIROC-ES2L
   MPI-ESM1-2-LR


### Estimate Forced Response in Evaluation Data

In [6]:
# ensure output path exists (exist_ok avoids a separate existence check)
dpath_eval_out = dpath_out + '/evaluation_predictions/'
os.makedirs(dpath_eval_out, exist_ok=True)

# get evaluation files
mfiles = glob.glob(forcesmip_root + '/Evaluation-Tier1/Amon/tas/*nc')
# loop over all evaluation models
for fn in mfiles:
    # evaluation member id: last '_'-separated field of the file name,
    # up to the first '.'
    member = os.path.basename(fn).split('_')[-1].split('.')[0]
    print('   ' + member)
    # open dataset; the context manager closes the file even if the
    # scaling or the write below raises
    with xc.open_dataset(fn) as ds_raw:
        # do fit
        scaled_data = scale_multimodel_mean_timeseries(ds_raw, reference_data, vmap)
        # save out each scaled dataset/variable
        for vid, ds in scaled_data.items():
            # specify output path
            fnOut = dpath_eval_out + vid + '_' + member + '_tier1_' + fmethod + '_benchmark.nc'
            # save output
            ds.to_netcdf(fnOut)

   1F
   1I
   1E
   1G
   1D
   1B
   1A
   1C
   1H
   1J
