# Combine data layers

1. Get elevation relative to sea level
2. Match to nearest country, impact region, protection zone (e.g. levees)
3. Uniformly distribute exposure over all surface area > 0 elevation within a 30" pixel
4. Aggregate both surface area and exposure up to region X segment X protection zone X wetland flag X .1-meter elevation bin
5. Save area by elevation for each segment x wetland flag (for <0 elevations, only care about wetland area).

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import geopandas as gpd
import numpy as np
import pandas as pd
import regionmask
import sliiders.settings as sset
import xarray as xr
from dask_gateway import GatewayCluster
from shapely.geometry import box
from sliiders import spatial
from sliiders.dask import start_cluster
from sliiders.io import open_rasterio, open_zarr, read_shapefile, save

In [3]:
# Number of elevation pixels along each side of a 1-degree tile. Used in
# `process_tile` to slice the global elevation array and to compute per-pixel area.
PIXELS_PER_TILE = 3601
# Upper bound on the number of workers the adaptive cluster may scale to
N_WORKERS = 400

In [4]:
# Start the cluster and let it scale adaptively between 7 and N_WORKERS workers
client, cluster = start_cluster()
cluster.adapt(minimum=7, maximum=N_WORKERS)
# Last expression of the cell: display the cluster widget
cluster

VBox(children=(HTML(value='<h2>GatewayCluster</h2>'), HBox(children=(HTML(value='\n<div>\n<style scoped>\n    …

In [6]:
def open_asset_value(llon, llat, ulon, ulat):
    """Load blended asset values for the grid cells inside a lon/lat window.

    The window corners are converted to integer grid indices with
    ``spatial.grid_val_to_ix`` (using ``sset.ASSET_VALUE_GRID_WIDTH``) and applied
    as parquet row filters. Lower bounds are inclusive, upper bounds exclusive.

    Parameters
    ----------
    llon, llat
        Lower longitude / latitude bounds of the window.
    ulon, ulat
        Upper longitude / latitude bounds of the window.

    Returns
    -------
    pandas.Series
        Series named ``"asset_value"`` indexed by ``("x_ix", "y_ix")``.
    """
    cell_width = sset.ASSET_VALUE_GRID_WIDTH
    x_lo, x_hi = spatial.grid_val_to_ix(np.array([llon, ulon]), cell_width)
    y_lo, y_hi = spatial.grid_val_to_ix(np.array([llat, ulat]), cell_width)

    # One group of predicates that must all hold for a row to be read
    in_window = [
        ("x_ix", ">=", x_lo),
        ("x_ix", "<", x_hi),
        ("y_ix", ">=", y_lo),
        ("y_ix", "<", y_hi),
    ]

    table = pd.read_parquet(
        sset.PATH_EXPOSURE_ASSET_VALUE_BLENDED,
        columns=["value", "x_ix", "y_ix"],
        filters=[in_window],
    )
    indexed = table.set_index(["x_ix", "y_ix"])
    return indexed["value"].rename("asset_value")


def open_pop(llon, llat, ulon, ulat):
    """Load population for the grid cells inside a lon/lat window.

    The window corners are converted to integer grid indices with
    ``spatial.grid_val_to_ix`` (using ``sset.POP_GRID_WIDTH``) and applied as
    parquet row filters. Lower bounds are inclusive, upper bounds exclusive.

    Parameters
    ----------
    llon, llat
        Lower longitude / latitude bounds of the window.
    ulon, ulat
        Upper longitude / latitude bounds of the window.

    Returns
    -------
    pandas.Series
        The ``population`` column renamed to ``"pop"``. No index is set here, so
        the index is whatever the parquet file stores.
        NOTE(review): callers join this on ("x_ix", "y_ix") - confirm the file is
        saved with that index.
    """
    cell_width = sset.POP_GRID_WIDTH
    x_lo, x_hi = spatial.grid_val_to_ix(np.array([llon, ulon]), cell_width)
    y_lo, y_hi = spatial.grid_val_to_ix(np.array([llat, ulat]), cell_width)

    # One group of predicates that must all hold for a row to be read
    in_window = [
        ("x_ix", ">=", x_lo),
        ("x_ix", "<", x_hi),
        ("y_ix", ">=", y_lo),
        ("y_ix", "<", y_hi),
    ]

    table = pd.read_parquet(sset.PATH_EXPOSURE_POP_INT, filters=[in_window])
    return table["population"].rename("pop")


def load_exposure(llon, llat):
    """Load asset value and population for a 1x1 window starting at (llon, llat).

    Parameters
    ----------
    llon, llat
        Lower longitude / latitude bounds of the window; the upper bounds are
        ``llon + 1`` and ``llat + 1``.

    Returns
    -------
    tuple of pandas.Series
        ``(asset_value, pop)``, both cast to float64.
    """
    ulon = llon + 1
    ulat = llat + 1
    asset_value = open_asset_value(llon, llat, ulon, ulat).astype("float64")
    population = open_pop(llon, llat, ulon, ulat).astype("float64")
    return asset_value, population


def convert_to_ser(mask, elev_tile):
    """Assign each pixel of ``elev_tile`` the name of the region it falls in.

    Parameters
    ----------
    mask
        Object exposing ``.regions`` (mapping of region number to an object with a
        ``.name``) and ``.mask(...)``, e.g. the result of
        ``regionmask.from_geopandas``.
    elev_tile
        Gridded array whose coordinates define the pixels to classify.

    Returns
    -------
    pandas.Series
        Region names for the pixels that fall inside a region; pixels outside
        every region are dropped.
    """
    number_to_name = {number: region.name for number, region in mask.regions.items()}

    gridded_numbers = mask.mask(elev_tile, wrap_lon=180)
    # Unmatched pixels come back as NaN; drop them before casting to integers
    matched_numbers = gridded_numbers.to_series().dropna().astype(int)
    return matched_numbers.replace(number_to_name)


def get_regions(bbox):
    """Build a region mask of the segment/region shapes around ``bbox``.

    Shapes are read from ``sset.PATH_SEG_REGION_VORONOI_INTERSECTIONS_SHP``, limited
    to the rectangular envelope of ``bbox`` buffered by 0.1 (in the units of
    ``bbox``).

    Returns
    -------
    The result of ``regionmask.from_geopandas``, with regions named by the
    ``seg_adm`` column.
    """
    search_window = box(*bbox.buffer(0.1).bounds)
    seg_adm = read_shapefile(
        sset.PATH_SEG_REGION_VORONOI_INTERSECTIONS_SHP,
        bbox=search_window,
        engine="pyogrio",
    )
    region_mask = regionmask.from_geopandas(seg_adm, names="seg_adm", name="seg_adm")
    return region_mask


def _buffer_bbox(bbox, buffer_width=0.1):
    buffered = box(*bbox.buffer(0.1).bounds)
    return buffered.bounds


def get_wetland_mangrove_areas(bbox, elev_tile):
    """Flag the pixels of ``elev_tile`` that are wetland or mangrove.

    Wetlands come from the GlobCover 2009 raster (class codes 160, 170 and 180 are
    the ones flagged here), resampled onto the ``elev_tile`` grid by
    nearest-neighbor lookup. Pixels inside any mangrove polygon found around
    ``bbox`` are flagged as well.

    Parameters
    ----------
    bbox
        Geometry of the tile; data are read for its buffered bounds.
    elev_tile
        Gridded array (``lat``/``lon`` coordinates) defining the output grid.

    Returns
    -------
    Boolean array on the grid of ``elev_tile``.
    """
    bounds = _buffer_bbox(bbox)
    west, south, east, north = bounds

    mangroves = read_shapefile(
        sset.PATH_GLOBAL_MANGROVES, bbox=bounds, engine="pyogrio"
    )

    landcover = open_rasterio(sset.PATH_GLOBCOVER_2009)
    # `y` is sliced from the northern to the southern bound
    landcover = landcover.sel(y=slice(north, south), x=slice(west, east))
    is_wetland = (
        landcover.squeeze(drop=True).isin([160, 170, 180]).rename(y="lat", x="lon")
    )
    # Interpolate the flag as integers, then turn it back into booleans
    wetlands = (
        is_wetland.astype(int).interp_like(elev_tile, method="nearest").astype(bool)
    )

    if not len(mangroves):
        return wetlands

    in_mangrove = regionmask.mask_geopandas(
        mangroves.dissolve("PXLVAL"),
        wetlands,
    ).notnull()
    return wetlands | in_mangrove


def get_lake_mask(bbox, elev_tile):
    """Flag the pixels of ``elev_tile`` that fall inside a lake.

    Lake geometries are read from ``sset.PATH_NATEARTH_LAKES_INT`` and clipped to
    the buffered bounds of ``bbox``.

    Returns
    -------
    ``False`` (a plain bool) when no lake geometry remains after clipping,
    otherwise a boolean array on the grid of ``elev_tile``.
    """
    bounds = _buffer_bbox(bbox)
    all_lakes = gpd.read_parquet(sset.PATH_NATEARTH_LAKES_INT)
    nearby_lakes = all_lakes.clip_by_rect(*bounds)

    # Nothing left after clipping: no pixel can be a lake
    if nearby_lakes.is_empty.all():
        return False

    return regionmask.mask_geopandas(nearby_lakes, elev_tile).notnull()


def save_to_parquet(df, fpath):
    """Flatten the index of ``df`` into columns, coerce dtypes and save to ``fpath``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns and index levels are all drawn from the names known to
        this function (see ``dtypes`` below).
    fpath
        Destination passed to ``save``.

    Raises
    ------
    AssertionError
        If the flattened frame contains a column without a declared dtype. The
        message lists the unexpected columns.
    """
    dtypes = {
        "seg_adm": "category",
        "protection_zone": "int64",
        "z_ix": "int16",
        "x_ix": "int16",
        "y_ix": "int16",
        "pop": "float64",
        "area_km": "float64",
        "asset_value": "float64",
        "wetland_flag": bool,
    }
    out = df.reset_index(drop=False)
    # Select the casts from the *flattened* frame. Selecting them from `df.columns`
    # skipped every name that arrives as an index level (seg_adm, z_ix, x_ix, ...),
    # so those columns were written with whatever dtype they happened to have.
    out = out.astype({k: v for k, v in dtypes.items() if k in out.columns})

    unexpected = [c for c in out.columns if c not in dtypes]
    assert not unexpected, unexpected
    save(out, fpath, index=False)


def match_elev_pixels_to_shapes(elev_tile, bbox):
    """Label every pixel of ``elev_tile`` with its region, protection zone and
    wetland flag.

    Parameters
    ----------
    elev_tile
        Gridded elevation array defining the pixels.
    bbox
        Geometry of the tile, used to select the shapes to match against.

    Returns
    -------
    pandas.DataFrame
        Columns ``seg_adm`` (categorical), ``protection_zone`` and
        ``wetland_flag``, indexed by pixel.
    """
    region_mask = get_regions(bbox)
    protected_mask = get_protected_areas(bbox)
    wetlands = get_wetland_mangrove_areas(bbox, elev_tile)

    seg_adm = convert_to_ser(region_mask, elev_tile).astype("category")
    protection_zone = convert_to_ser(protected_mask, elev_tile)
    wetland_flag = wetlands.to_series()

    labels = {
        "seg_adm": seg_adm,
        "protection_zone": protection_zone,
        "wetland_flag": wetland_flag,
    }
    return pd.DataFrame(labels)


def get_protected_areas(bbox):
    """Build a region mask of the protection zones that cover ``bbox``.

    Protected-area geometries are read from ``sset.PATH_COMBINED_PROTECTED_AREAS``.
    The part of ``bbox`` not covered by any of them becomes an extra region with
    ``protection_zone_id`` -1 ("no protection").

    Parameters
    ----------
    bbox
        Geometry of the tile.

    Returns
    -------
    The result of ``regionmask.from_geopandas``, with regions named by the
    ``protection_zone_id`` column.
    NOTE(review): for the protected rows that column comes from ``reset_index()``,
    so this presumably relies on the parquet index being named
    ``protection_zone_id`` - confirm.
    """
    protected = gpd.read_parquet(
        sset.PATH_COMBINED_PROTECTED_AREAS,
        columns=["geometry"],
    )
    # Use `intersects`, not `overlaps`: `overlaps` is False whenever one geometry
    # fully contains the other, which silently dropped protected areas lying wholly
    # inside the tile as well as those covering the whole tile.
    protected = protected[protected.geometry.intersects(bbox)].reset_index()

    # add in no-protection value
    out = gpd.GeoDataFrame({"protection_zone_id": [-1]}, geometry=[bbox])

    if len(protected):
        # The unprotected region is whatever the protected areas leave uncovered;
        # it is dropped entirely when nothing is left.
        out["geometry"] = out.difference(protected.unary_union)
        out = out[~out.is_empty]
        out = pd.concat(
            [
                protected,
                out,
            ],
            ignore_index=True,
        )

    return regionmask.from_geopandas(
        out, names="protection_zone_id", name="protection_zone_id"
    )


def _correct_vals(x, bins_per_int_per_side):
    return np.arange(x - bins_per_int_per_side, x + bins_per_int_per_side + 1)


def spread_int_res(df, high_elev_ix):
    """Spread integer-resolution area over the elevation bins around each value.

    Rows whose ``int_res`` index level is True have their area split evenly over
    the bins returned by ``_correct_vals`` for their ``z_ix``. Target bins above
    ``high_elev_ix`` are clipped to it. Negative target bins are discarded unless
    the row lies in a protection zone (``protection_zone != -1``). The spread area
    is then summed back together with the untouched rows.

    Parameters
    ----------
    df : pandas.Series
        Area (``area_km``) with a MultiIndex that includes the levels ``z_ix``,
        ``protection_zone`` and the boolean ``int_res``.
    high_elev_ix
        Index of the highest elevation bin.

    Returns
    -------
    pandas.Series
        Area indexed by the levels of ``df`` without ``int_res``.
    """
    int_res_area = df[df.index.get_level_values("int_res")]
    if len(int_res_area) == 0:
        return df.droplevel("int_res")

    spread_labels = int_res_area.index.copy()
    source_bins = int_res_area.index.get_level_values("z_ix").unique()
    out_levels = [lvl for lvl in int_res_area.index.names if lvl != "int_res"]

    # Number of bins covered on each side of an integer-resolution value
    half_window = round((1 / sset.EXPOSURE_BIN_WIDTH_V) / 2)

    # For every source bin, the list of bins its area is spread over
    target_bins = source_bins.map(
        lambda z: _correct_vals(z, half_window)
    ).to_series()
    target_bins.index = source_bins
    target_bins = target_bins.explode()
    n_targets = target_bins.groupby("z_ix").count().rename("counts")

    spread = int_res_area.to_frame()
    spread = spread.join(target_bins.rename("new_z_ix"), on="z_ix", how="left")
    spread = spread.join(n_targets, on="z_ix", how="left")
    spread = spread.reset_index()

    spread["z_ix"] = spread["new_z_ix"]
    spread["area_km"] = spread["area_km"] / spread["counts"]

    # clip at the high-elev threshold, then drop the below-MSL values
    spread["z_ix"] = spread["z_ix"].clip(upper=high_elev_ix)
    keep = (spread["z_ix"] >= 0) | (spread["protection_zone"] != -1)
    spread = spread[keep]

    spread = spread.groupby(out_levels, observed=True)["area_km"].sum()

    untouched = df.drop(spread_labels).droplevel("int_res")
    combined = pd.concat((untouched, spread))
    return combined.groupby(out_levels, observed=True).sum()


def process_tile(tile_name, calc_elev=True, calc_exp=True, check=True):
    """Bin one tile's surface area, and optionally its exposure, by region,
    protection zone and elevation bin, and save the results.

    Parameters
    ----------
    tile_name
        Tile identifier; ``spatial.get_ll`` turns it into the tile's lower-left
        longitude and latitude.
    calc_elev : bool
        If False, the tile's elevations are asserted to be all null and every pixel
        is assigned the single "high-elevation" fill value.
    calc_exp : bool
        If False, only the per-segment area file is written and its path returned;
        no asset-value/population data are loaded.
    check : bool
        If True, return immediately when the tile's outputs already exist.

    Returns
    -------
    Path of the binned-exposure file (``out_path``), or the path of the segment-area
    file when ``calc_exp`` is False.
    """
    out_path = sset.DIR_EXPOSURE_BINNED_TMP_TILES / f"{tile_name}.parquet"
    seg_area_out_path = (
        sset.DIR_EXPOSURE_BINNED_TMP_TILES_SEGMENT_AREA / f"{tile_name}.parquet"
    )
    noland_path = sset.DIR_EXPOSURE_BINNED_TMP_TILES_NOLAND / f"{tile_name}.parquet"

    # Skip tiles that were already processed.
    # NOTE(review): `out_path` is returned even when only the no-land or the
    # segment-area file exists - confirm callers do not rely on the returned path
    if check and (
        out_path.is_file()
        or noland_path.is_file()
        or (seg_area_out_path.is_file() and not calc_exp)
    ):
        return out_path

    # Positional slices of this tile within the global elevation array
    llon, llat = spatial.get_ll(tile_name)
    lat_slice = slice((90 + llat) * PIXELS_PER_TILE, (91 + llat) * PIXELS_PER_TILE)
    lon_slice = slice((180 + llon) * PIXELS_PER_TILE, (181 + llon) * PIXELS_PER_TILE)
    bbox = box(llon, llat, llon + 1, llat + 1)

    elev_tile = (
        open_zarr(sset.PATH_ELEV_MSS, chunks=None)
        .isel(lon=lon_slice, lat=lat_slice)
        .load()
    )

    # Pixels treated as hydrologically connected
    # NOTE(review): the meaning of the 201 threshold and of source codes 1 and 3 is
    # not visible here - confirm against the elevation dataset documentation
    connected = elev_tile.connected < 201
    underwater = ((elev_tile.z < 0) | elev_tile.source.isin([1, 3])) & connected
    int_res = (
        elev_tile.int_res
        & (elev_tile.z <= sset.HIGHEST_WITHELEV_EXPOSURE_METERS)
        & connected
    )
    # From here on `elev_tile` is just the elevation array
    elev_tile = elev_tile.z

    # Fill value (and its bin index) that stands for "higher than any tracked
    # elevation"
    high_elev_fill = (
        sset.HIGHEST_WITHELEV_EXPOSURE_METERS + sset.EXPOSURE_BIN_WIDTH_V / 2
    )
    high_elev_ix = spatial.grid_val_to_ix(
        np.array(high_elev_fill), sset.EXPOSURE_BIN_WIDTH_V
    )

    # With no connected pixels, treat the tile like a tile without elevation
    if not connected.any():
        elev_tile *= np.nan
        calc_elev = False

    if not calc_elev:
        # mark it all as "high-elev"
        assert elev_tile.isnull().all()
        elev_tile = elev_tile.fillna(high_elev_fill)
        underwater[:] = False
    assert elev_tile.notnull().any()

    # Bundle higher-than-coastal elevation values into one to simplify later data
    # processing. Set non-hydro-connected pixels to the high-elev-fill value as well
    # (treated same as high elevation)
    elev_tile = elev_tile.where(
        (elev_tile <= sset.HIGHEST_WITHELEV_EXPOSURE_METERS) & connected,
        high_elev_fill,
    )

    # match tile points with countries, regions, protection zones
    out = match_elev_pixels_to_shapes(elev_tile, bbox)

    # get points on land, assign impact regions and countries at exposure grid level
    elev_df = xr.Dataset(
        {
            "elev": elev_tile,
            "underwater": underwater,
            "int_res": int_res,
            "lake": get_lake_mask(bbox, elev_tile),
        }
    ).to_dataframe()

    # Elevation bin index of every pixel
    out["z_ix"] = pd.Series(
        spatial.grid_val_to_ix(elev_df.elev, sset.EXPOSURE_BIN_WIDTH_V),
        index=elev_df.index,
    ).astype("int16")

    out[["underwater", "int_res", "elev", "lake"]] = elev_df[
        ["underwater", "int_res", "elev", "lake"]
    ]

    # Per-pixel area: squared pixel width, scaled by cos(latitude)
    out["area_km"] = (
        np.cos(np.deg2rad(out.index.get_level_values("lat")))
        * (spatial.LAT_TO_M / 1000 / PIXELS_PER_TILE) ** 2
    )

    # Re-index pixels by the exposure-grid cell (x_ix, y_ix) that contains them
    # NOTE(review): `lon_mask=True` is also passed for the latitude values below,
    # whereas later calls in this function use `lon_mask=[True, False]` for
    # (lon, lat) pairs - confirm this is intended
    out = out.set_index(
        [
            pd.Index(
                spatial.grid_val_to_ix(
                    out.index.get_level_values("lon"),
                    sset.ASSET_VALUE_GRID_WIDTH,
                    lon_mask=True,
                ).astype("int16"),
                name="x_ix",
            ),
            pd.Index(
                spatial.grid_val_to_ix(
                    out.index.get_level_values("lat"),
                    sset.ASSET_VALUE_GRID_WIDTH,
                    lon_mask=True,
                ).astype("int16"),
                name="y_ix",
            ),
        ]
    )

    # A pixel is kept when it is not a lake and is either (a) at a non-negative
    # elevation bin and not underwater, or (b) inside a protection zone
    valid = ((out.z_ix.ge(0) & (~out.underwater)) | out.protection_zone.ne(-1)) & (
        ~out.lake
    )
    out = out.drop(columns=["underwater", "lake"])
    # Wetland pixels that fail the test above are still tracked for segment areas
    negelev_wetland_pts = out[(~valid) & out.wetland_flag]
    out = out[valid]

    # Total area per region x protection zone x wetland flag x elevation bin x
    # exposure-grid cell x integer-resolution flag
    out = out.groupby(
        [
            "seg_adm",
            "protection_zone",
            "wetland_flag",
            "z_ix",
            "x_ix",
            "y_ix",
            "int_res",
        ],
        observed=True,
    ).area_km.sum()

    # spread out integer-resolution observations over the nearest 1m of bins
    out = spread_int_res(out, high_elev_ix)

    # Area by segment/protection zone/wetland flag/elevation bin (grid cells summed)
    seg_areas = out.groupby(
        ["seg_adm", "protection_zone", "wetland_flag", "z_ix"],
        observed=True,
    ).sum()

    negelev_wetland_areas = negelev_wetland_pts.groupby(
        ["seg_adm", "protection_zone", "wetland_flag", "z_ix"],
        observed=True,
    ).area_km.sum()

    seg_areas = pd.concat([seg_areas, negelev_wetland_areas])
    seg_areas = seg_areas[seg_areas.index.get_level_values("z_ix") <= high_elev_ix]

    # Everything below assumes asset value and population share one grid
    if sset.ASSET_VALUE_GRID_WIDTH != sset.POP_GRID_WIDTH:
        raise NotImplementedError(
            "A simple regridding is necessary to map to a different grid for "
            "population, but has not yet been implemented"
        )

    save_to_parquet(seg_areas.to_frame(), seg_area_out_path)

    if not calc_exp:
        return seg_area_out_path

    assets, pop = load_exposure(llon, llat)
    this_exp = assets.to_frame().join(pop, how="outer").fillna(0)

    # No valid pixels in this tile: record the tile's exposure in the "no land"
    # directory so that it can be checked later
    if out.shape[0] == 0:
        # NOTE(review): `calc_exp` is always True here because the function has
        # already returned above when it is False
        if calc_exp:
            save_to_parquet(
                this_exp,
                noland_path,
            )
        return out_path

    # swap these if we want to assume no population/capital on wetlands
    # out = out[
    #     ~out.index.get_level_values("wetland_flag").values.astype(bool)
    # ].droplevel("wetland_flag")
    out = out.groupby(
        [c for c in out.index.names if c != "wetland_flag"], observed=True
    ).sum()

    # Coordinates of each exposure grid cell, needed for the nearest-neighbor search
    this_exp[["lon", "lat"]] = spatial.grid_ix_to_val(
        this_exp.reset_index(["x_ix", "y_ix"])[["x_ix", "y_ix"]].values,
        sset.ASSET_VALUE_GRID_WIDTH,
        lon_mask=[True, False],
    )

    # Exposure grid cells that contain at least one valid pixel
    valid_locs = this_exp.index.isin(
        out.droplevel(
            [c for c in out.index.names if c not in ["x_ix", "y_ix"]]
        ).index.unique()
    )

    if not valid_locs.all():
        to_merge = []
        # ideally, we move this exposure that has no valid x,y,z location within its
        # grid cell to the nearest grid cell with a valid location AND existing exposure
        if valid_locs.any():
            valid_exp = this_exp[valid_locs]
            to_merge.append(valid_exp)
        # if none exist in this tile, simply move to the nearest grid cell w/ valid
        # location, regardless of whether it contains exposure already.
        else:
            tmp = out.reset_index()[["x_ix", "y_ix"]]
            valid_exp = pd.DataFrame(
                spatial.grid_ix_to_val(
                    tmp.values, sset.ASSET_VALUE_GRID_WIDTH, lon_mask=[True, False]
                ),
                columns=["lon", "lat"],
                index=tmp.set_index(["x_ix", "y_ix"]).index,
            )

        # Re-label the misplaced exposure with the index of its nearest valid cell,
        # then sum everything that now shares a cell
        extra_exp = this_exp[~valid_locs]
        exp_ix_mappings = spatial.spherical_nearest_neighbor(extra_exp, valid_exp)
        to_merge.append(
            extra_exp.set_index(
                pd.MultiIndex.from_tuples(
                    exp_ix_mappings.values, names=extra_exp.index.names
                )
            )
        )
        this_exp = pd.concat(to_merge).groupby(["x_ix", "y_ix"]).sum()

    this_exp = this_exp.drop(columns=["lon", "lat"])

    full = out.to_frame().join(this_exp, how="outer")
    # Every exposure cell must now have matched at least one area row
    assert full.area_km.notnull().all()
    full[["asset_value", "pop"]] = full[["asset_value", "pop"]].fillna(0)

    # Distribute each grid cell's exposure over its rows in proportion to area
    frac_areas = full.area_km / full.area_km.groupby(["x_ix", "y_ix"]).sum()
    out = (
        full[["asset_value", "pop"]]
        .mul(frac_areas, axis="index")
        .join(full.area_km, how="outer")
    )
    assert out.notnull().all().all()

    # Sum over grid cells, leaving the remaining index levels
    out = out.groupby(
        [c for c in out.index.names if c not in ["x_ix", "y_ix"]], observed=True
    ).sum()

    # make sure no exposure was dropped or added from the original exposure within tile
    # (within some margin of float error)
    # include very low sums for 0 / 0 division (areas where there is no exposure, but we
    # calculate anyway for diva areas)
    assert (
        this_exp.sum().sum() < 0.00001
        or np.abs(this_exp.sum() / out[this_exp.columns].sum() - 1).sum() < 0.00001
    )

    # make sure range is acceptable: unprotected rows must lie between bin 0 and the
    # high-elevation bin
    z = out.index.get_level_values("z_ix")[
        out.index.get_level_values("protection_zone") == -1
    ]

    assert (len(z) == 0) or (z.min() >= 0) & (z.max() <= high_elev_ix)

    save_to_parquet(out, out_path)

    return out_path

#### Get list of tiles to process

In [7]:
# Table listing the tiles to process
tile_meta = pd.read_parquet(sset.PATH_EXPOSURE_TILE_LIST)

# Map each PROCESSING_SET label to the array of unique tile names in that set
tile_groups = (
    tile_meta.reset_index().groupby("PROCESSING_SET")["tile_name"].unique().to_dict()
)

## Without elevation

In [8]:
# Tiles in the WITHOUTELEV set: `calc_elev=False` makes `process_tile` assign every
# pixel to the single high-elevation bin. Tasks are submitted in batches of 1000.
withoutelev_futures = client.map(
    process_tile, tile_groups["WITHOUTELEV"], calc_elev=False, batch_size=1000
)

## With elevation

### With Exposure

In [17]:
# Tiles in the WITHELEV set: run `process_tile` with its defaults, i.e. with both
# elevation binning and exposure
withelev_futures = client.map(
    process_tile,
    tile_groups["WITHELEV"],
)

### No exposure (just area)

In [21]:
# Tiles in the CIAM set: `calc_exp=False` makes `process_tile` write only the
# per-segment area file and skip exposure
ciam_futures = client.map(
    process_tile,
    tile_groups["CIAM"],
    calc_exp=False,
)

## Check unassigned exposure

In [23]:
# Wait for every tile to finish; `gather` re-raises the first task error.
# `finished` stays False if the gather call raises.
finished = False
client.gather(withoutelev_futures + withelev_futures + ciam_futures)
finished = True

In [24]:
# File names of every "no land" tile written by `process_tile`
noland_dir = sset.DIR_EXPOSURE_BINNED_TMP_TILES_NOLAND
noland = [fpath.split("/")[-1] for fpath in sset.FS.ls(str(noland_dir))]

# One row per tile with the total asset value and population that could not be
# assigned to a location; rows are labeled with the tile name (file name stem)
tile_totals = []
for fname in noland:
    totals = pd.read_parquet(noland_dir / fname, columns=["asset_value", "pop"]).sum()
    tile_totals.append(totals.to_frame().T.set_index(pd.Index([fname.split(".")[0]])))
noland = pd.concat(tile_totals)

# Sanity check: the global totals of dropped exposure must stay negligible
assert (noland.sum() < 700).all()
print(noland.sum())
noland

asset_value    138.70167
pop            605.00000
dtype: float64


Unnamed: 0,asset_value,pop
N09E081,0.0,2.0
N74E115,73.054528,0.0
N74E116,63.280207,0.0
N80E078,2.366936,0.0
S09E162,0.0,603.0


Roughly 600 people and less than $140 in assets failed to be assigned a location because they were in a 1-degree tile with no valid x/y/z locations. These are small enough values on the global scale and well within the uncertainty even for a single grid cell, so we drop these values.

## Shutdown workers

In [25]:
# Disconnect the client first, then shut the cluster down
client.close()
cluster.close()