# Aggregate TSV outputs of LocalizeSL into Zarr

In [1]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# re-imported before each cell executes and edits to local packages are picked
# up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import dask.config
import numpy as np
import pandas as pd
import xarray as xr
from dask_gateway import Gateway
from gcsfs import GCSFileSystem
from tqdm.notebook import tqdm

from rhg_compute_tools import xarray as rhgx
from sliiders.settings import (
    DIR_SLR_INT,
    LOCALIZESL_COREFILES,
    PATH_SLIIDERS_SLR,
    PATH_SLR_N_GCMS,
)
from sliiders.utils import upload_pkg

  from distributed.utils import LoopRunner, format_bytes


In [3]:
# Global attributes written to the output zarr store (see the attrs-update cell).
AUTHOR = "Ian Bolliger, Daniel Allen"
CONTACT = "ibolliger@rhg.com, dallen@berkeley.edu"
HISTORY = """version 1.0: initial version"""
METHOD = """LocalizeSL was used to estimate monte carlo draws of future GMSL and LMSL relative to a vertical datum of MSL2000. Data quality adjustments:
1. RCP6 ignored due to no post-2100 projections and fewer GCMS used for pre-2100 projections.
2. Sites with <3 GCMs for any pre-2100 years dropped.
3. Sites with any null values for post-2100 projections dropped."""
DESCRIPTION = "LocalizeSL-based relative sea level rise projections"

# Filesystem handle used to read from / write to GCS; authenticates with the
# service-account token file mounted at this path.
FS = GCSFileSystem(token="/opt/gcsfuse_tokens/rhg-data.json")

# NOTE: this rebinds the imported path constant to a GCS mapper (with the
# "/gcs" mount prefix stripped). Re-running this cell without re-running the
# import cell will fail, because a mapper has no ``relative_to`` method.
PATH_SLR_N_GCMS = FS.get_mapper(PATH_SLR_N_GCMS.relative_to("/gcs"))

In [4]:
# Start a Dask Gateway cluster and attach a client to it. Statement order
# matters: the client must exist before the package upload below.
gateway = Gateway()
cluster = gateway.new_cluster(
    idle_timeout=3600,
    profile="micro",
)
client = cluster.get_client()
# Request 140 workers.
cluster.scale(140)

# presumably ships the local `sliiders` package to the workers so the functions
# defined in this notebook can be run remotely -- TODO confirm against
# `sliiders.utils.upload_pkg`
upload_pkg(client, "../../sliiders")
# Bare last expression: displays the cluster widget as the cell output.
cluster

VBox(children=(HTML(value='<h2>GatewayCluster</h2>'), HBox(children=(HTML(value='\n<div>\n<style scoped>\n    …

## Get lists of outputs

In [5]:
# Per-corefile listing of the TSVs to process, plus a site-keyed lookup of
# baseline (no-climate-change) files collected across all corefiles.
corefile_paths = {}
lsl_baseline_paths = {}

for corefile in LOCALIZESL_COREFILES:
    dir_tsv = DIR_SLR_INT / "tmp" / corefile / "mc_tsv"

    lsl_files = list(dir_tsv.glob("LSL*.tsv"))
    gsl_files = list(dir_tsv.glob("GSL*.tsv"))

    # Baseline files are handled separately from the RCP scenario files.
    baseline_files = [p for p in lsl_files if "_baseline" in p.stem]

    corefile_paths[corefile] = {
        # RCP6 and baseline files are excluded from the scenario lists.
        "lsl_scenario_paths": [
            p
            for p in lsl_files
            if not ("_rcp60" in p.stem or "_baseline" in p.stem)
        ],
        "lsl_baseline_paths": baseline_files,
        "gsl_paths": [
            p
            for p in gsl_files
            if not ("_rcp60" in p.stem or "_baseline" in p.stem)
        ],
    }

    # One baseline file is kept per site id (second-to-last "_" token of the
    # stem); a later corefile overwrites an earlier one for the same site.
    lsl_baseline_paths.update({p.stem.split("_")[-2]: p for p in baseline_files})

    print(
        f"Corefile: {corefile} -- {len(corefile_paths[corefile]['lsl_scenario_paths'])} site scenario "
        f"files found, "
        f"{len(corefile_paths[corefile]['gsl_paths'])} global files found"
    )

# Downstream cells expect a flat list of baseline paths.
lsl_baseline_paths = list(lsl_baseline_paths.values())
print(f"From all corefiles, {len(lsl_baseline_paths)} baseline paths found")

Corefile: SLRProjections190726core_SEJ_full -- 4192 site scenario files found, 2 global files found
Corefile: SLRProjections170113GRIDDEDcore -- 6288 site scenario files found, 3 global files found
Corefile: SLRProjections200204GRIDDEDcore_D20 -- 6288 site scenario files found, 3 global files found
Corefile: SLRProjections210628GRIDDEDcore_SROCC -- 6288 site scenario files found, 3 global files found
From all corefiles, 2096 baseline paths found


## Define functions to load and process TSVs

In [6]:
def combine_site_tsvs_into_dataframe(paths, is_baseline=False):
    """Open TSVs that are outputs of `WriteTableMC`, a function in the LocalizeSL
    repository, and stack them into one wide table.

    Parameters
    ----------
    paths : sequence of pathlib.Path
        TSV files to combine. Each file's scenario label is built from the name
        of its grandparent directory (the corefile) and the last "_"-separated
        token of its stem (the RCP). All files are assumed to belong to the same
        site: coordinates and site id are taken from the first file only.
    is_baseline : bool, optional
        Unused; accepted for backward compatibility with existing callers.

    Returns
    -------
    out_df : pandas.DataFrame
        Rows sorted by ("scenario", "year"); columns are "scenario", "year" and
        the integer sample ids 0..9999. The column axis is named
        "mc_sample_id". Rows whose first sample (column 0) is null are dropped.
    site_id : str
        Second-to-last "_"-separated token of the first file's stem.
    lon, lat : float
        Parsed from the first file's header line; NaN when the header carries
        no coordinates (the global/GSL case).

    Raises
    ------
    ValueError
        If ``paths`` is empty.
    """
    if len(paths) == 0:
        raise ValueError("`paths` must contain at least one TSV file")

    # Column layout shared by every file: the year followed by one column per
    # Monte Carlo sample. Built once rather than once per file.
    column_names = ["year"] + list(range(10000))

    dfs = []
    for path in tqdm(paths):
        df = pd.read_csv(
            path,
            sep="\t",
            skiprows=2,
            header=None,
            names=column_names,
        )
        rcp = path.stem.split("_")[-1]
        corefile = path.parent.parent.name
        df.insert(0, "scenario", f"{corefile}_{rcp}")
        # Drop rows that have no value for the first sample.
        df = df.dropna(subset=[0])
        dfs.append(df)

    # Site metadata comes from the first file: its header line starts with a
    # token of the form "<name>_<lat>_<lon>" (or just "<name>" for GSL files).
    first_path = paths[0]
    header = pd.read_csv(first_path, nrows=0).columns[0]
    coords = header.split(" ")[0].split("_")[1:]
    if coords:
        # LSL
        lat = float(coords[0])
        lon = float(coords[1])
    else:
        # GSL
        lat = np.nan
        lon = np.nan
    site_id = first_path.stem.split("_")[-2]

    out_df = pd.concat(dfs, ignore_index=True)

    out_df = out_df.sort_values(["scenario", "year"])
    out_df.columns.name = "mc_sample_id"

    return out_df, site_id, lon, lat


def df_to_da(combined, site_id, lon, lat):
    """Reshape one site's wide sample table into an ``msl_msl00`` DataArray.

    ``combined`` must have "scenario" and "year" columns plus one column per
    sample, with the column axis named "mc_sample_id" (that name becomes the
    melted variable column and then a dimension). Values are divided by 100
    (centimeters to meters) and stored as float32. The result has dimensions
    (site_id, scenario, year, mc_sample_id) with a length-1 ``site_id`` and
    ``lon``/``lat`` coordinates attached along ``site_id``.
    """
    # Wide -> long: one row per (scenario, year, sample).
    long_df = combined.melt(id_vars=["scenario", "year"], value_name="msl_msl00")
    long_df["year"] = long_df["year"].astype(np.uint16)

    # cm -> m, then shrink to float32
    long_df["msl_msl00"] = (long_df["msl_msl00"] / 100).astype(np.float32)

    # The index levels become the dimensions of the xarray Dataset; a leading
    # length-1 site dimension is added so sites can later be concatenated.
    site_ds = (
        long_df.set_index(["scenario", "year", "mc_sample_id"])
        .to_xarray()
        .expand_dims({"site_id": [site_id]})
    )
    site_ds.coords["lon"] = ("site_id", [lon])
    site_ds.coords["lat"] = ("site_id", [lat])

    # Wrap longitudes above 180 into the -180..180 range.
    wrapped_lon = site_ds.lon.where(site_ds.lon <= 180, -360 + site_ds.lon)
    site_ds["lon"] = wrapped_lon

    # Compact integer dtypes for the index coordinates to minimize size.
    site_ds["year"] = site_ds.year.astype(np.uint16)
    site_ds["mc_sample_id"] = site_ds.mc_sample_id.astype(np.uint16)

    # Hand back only the data variable, as a DataArray.
    return site_ds.msl_msl00


def process_site(paths, is_baseline=False):
    """Load all TSVs for one site and return them as a DataArray.

    Parameters
    ----------
    paths : sequence of pathlib.Path
        TSV files belonging to a single site (or the global GSL files).
    is_baseline : bool, optional
        When True, every row is relabelled with the single scenario
        "baseline" and the resulting length-1 ``scenario`` dimension (and its
        coordinate) is removed from the output.

    Returns
    -------
    xarray.DataArray
        The ``msl_msl00`` array produced by ``df_to_da``; without a
        ``scenario`` dimension when ``is_baseline`` is True.
    """
    combined, site_id, lon, lat = combine_site_tsvs_into_dataframe(
        paths, is_baseline=is_baseline
    )
    if is_baseline:
        combined["scenario"] = "baseline"

    da_out = df_to_da(
        combined,
        site_id,
        lon,
        lat,
    )
    if is_baseline:
        # squeeze(..., drop=True) removes the dimension and its scalar
        # coordinate in one step, avoiding the deprecated `DataArray.drop`.
        return da_out.squeeze("scenario", drop=True)
    return da_out


def get_groups_from_paths(paths):
    """Group paths by site id and return the groups as a list of lists.

    The site id is the second-to-last "_"-separated token of each path's stem.
    Groups are ordered by sorted site id; within a group the input order of the
    paths is kept.
    """
    groups_by_site = {}
    for path in paths:
        site_id = path.stem.split("_")[-2]
        groups_by_site.setdefault(site_id, []).append(path)
    return [groups_by_site[site_id] for site_id in sorted(groups_by_site)]


def process_all_sites(corefile):
    """Submit the processing jobs for one corefile to the cluster.

    Reads the notebook-level ``corefile_paths`` and ``client``. Returns a
    tuple ``(gsl_future, lsl_futures)``: a single future for the global files
    and a list of futures, one per site group, for the local files.
    """
    paths_for_corefile = corefile_paths[corefile]

    # one job per site
    site_groups = get_groups_from_paths(paths_for_corefile["lsl_scenario_paths"])

    # the global files are processed together in a single job
    global_future = client.submit(process_site, paths_for_corefile["gsl_paths"])
    site_futures = client.map(process_site, site_groups)
    return global_future, site_futures

## Process all files

In [7]:
# One Dataset per corefile, each holding local (lsl_msl00) and global
# (gsl_msl00) sea level arrays; they are concatenated in the next cell.
sl_arrs = []
for ix, corefile in enumerate(LOCALIZESL_COREFILES.keys()):
    print(corefile)

    # submit the per-site and global jobs for this corefile to the cluster
    gsl_fut, lsl_fut = process_all_sites(corefile)

    # wrap the futures as xarray objects; the global array has its length-1
    # dimensions squeezed out and their coordinates dropped
    gsl_arr = rhgx.dataarrays_from_delayed([gsl_fut], client=client)[0].squeeze(
        drop=True
    )
    # per-site results are combined along `site_id`
    lsl_arr = rhgx.dataarray_from_delayed(lsl_fut, dim="site_id", client=client).astype(
        np.float32
    )

    # merge arrays into one Dataset and keep it persisted on the cluster
    sl_arr = xr.Dataset(
        {
            "lsl_msl00": lsl_arr,
            "gsl_msl00": gsl_arr,
        }
    ).persist()

    sl_arrs.append(sl_arr)

# Baseline (no-climate-change) files are processed once across all corefiles,
# with `is_baseline=True` so the output carries no scenario dimension.
baseline_groups = get_groups_from_paths(lsl_baseline_paths)
lsl_baseline_fut = client.map(process_site, baseline_groups, is_baseline=True)

lsl_baseline_arr = rhgx.dataarray_from_delayed(
    lsl_baseline_fut, dim="site_id", client=client
).astype(np.float32)

# Note: the name is rebound from a DataArray to a single-variable Dataset.
lsl_baseline_arr = xr.Dataset(
    {
        "lsl_ncc_msl00": lsl_baseline_arr,
    }
).persist()

SLRProjections190726core_SEJ_full
SLRProjections170113GRIDDEDcore
SLRProjections200204GRIDDEDcore_D20
SLRProjections210628GRIDDEDcore_SROCC


In [8]:
# Stack the per-corefile Datasets along `scenario`, then merge in the baseline
# variable to form the single output Dataset.
sl_arr = xr.merge((xr.concat(sl_arrs, "scenario"), lsl_baseline_arr))

In [9]:
# Metadata for each data variable, keyed by variable name.
variable_attrs = {
    "lsl_msl00": {
        "long_name": "Estimated Future LMSL, MSL00",
        "description": (
            "Monte Carlo estimates of local mean sea level for a given RCP scenario "
            "and year. Estimates are made on a sparse 2-degree coastal grid, and are "
            "relative to MSL00 vertical datum."
        ),
        "units": "m",
    },
    "gsl_msl00": {
        "long_name": "Estimated Future GMSL, MSL00",
        "description": (
            "Monte Carlo estimates of global mean sea level for a given RCP scenario "
            "and year. Estimates are relative to MSL00 vertical datum."
        ),
        "units": "m",
    },
    "lsl_ncc_msl00": {
        "long_name": "Counterfactual Future LMSL (no climate change), MSL00",
        "description": (
            "Monte Carlo estimates of local mean sea level in the no-climate change scenario "
            "for each year. Estimates are made on a sparse 2-degree coastal grid, and are "
            "relative to MSL00 vertical datum."
        ),
        "units": "m",
    },
}

# Attach the per-variable metadata in place.
for var_name, var_attrs in variable_attrs.items():
    sl_arr[var_name].attrs.update(var_attrs)

# Dataset-level metadata, including a timestamp of when this cell was run.
sl_arr.attrs.update(
    author=AUTHOR,
    contact=CONTACT,
    history=HISTORY,
    description=DESCRIPTION,
    method=METHOD,
    updated=pd.Timestamp.now(tz="US/Pacific").strftime("%c"),
)

## Clean the outputs

Clean using the following criteria:

- Ignore RCP6 when creating groups of trajectories binned by GMSL (this occurs later in the binning notebook). RCP6 is missing for all sites post-2100 (since no CMIP5 models were run for the extended timeline using RCP6) and has fewer GCMs than other scenarios for the pre-2100 years (since RCP6 was not a prioritized scenario).
- Drop any sites that have <3 GCMs for any pre-2100 years.
- Ignoring RCP6 (which has missing values for all sites after 2100), drop any sites that have null values for any post-2100 years (no sites have missing values outside of RCP6 for pre-2100 years)

In [10]:
# Read the `numGCMs` variable fully into memory. PATH_SLR_N_GCMS is the GCS
# mapper created in the configuration cell, not the original path constant.
n_gcms = xr.open_zarr(PATH_SLR_N_GCMS, chunks=None).numGCMs.load()

In [11]:
# filter to only sites w/ >=3 gcms: the minimum GCM count over all years up to
# and including 2090, and over all scenarios, must be at least 3
good_sites = (
    n_gcms.sel(year=slice(None, 2090)).min(dim=["year", "scenario"]) >= 3
).values

# filter to sites that have no missing values in any sample, scenario or year
# NOTE(review): both masks are plain NumPy arrays combined and applied by
# position, so this assumes `n_gcms` and `sl_arr` list sites in the same order
# and with the same length -- confirm against how PATH_SLR_N_GCMS was written.
good_sites = (
    good_sites
    & sl_arr.lsl_msl00.notnull().all(dim=["mc_sample_id", "scenario", "year"]).values
)

# execute filtering
# NOTE: `sl_arr` is overwritten here, so re-running this cell applies a mask
# built for the unfiltered site list to the already-filtered data.
# presumably the dask option stops indexing from splitting large chunks -- confirm
with dask.config.set(**{"array.slicing.split_large_chunks": False}):
    sl_arr = sl_arr.isel(site_id=good_sites).persist()

In [22]:
# re-chunk before writing: `scenario` and `year` each become a single chunk
# (-1), while `site_id` and `mc_sample_id` are split into blocks of 100
sl_arr = sl_arr.chunk(
    {"scenario": -1, "site_id": 100, "year": -1, "mc_sample_id": 100}
).persist()

## Save

In [23]:
# Write the final Dataset to the GCS location behind PATH_SLIIDERS_SLR (with the
# "/gcs" prefix stripped); mode="w" overwrites any existing store.
sl_arr.to_zarr(FS.get_mapper(PATH_SLIIDERS_SLR.relative_to("/gcs")), mode="w")

<xarray.backends.zarr.ZarrStore at 0x7f452daf9ac0>

In [25]:
# Release cluster resources: close the cluster, then the client. The tuple only
# exists so both calls fit in one expression; its value is the cell output.
cluster.close(), client.close()

(None, None)