# Generate protected areas from various levee and hydrological data sources

### Gather basins from HydroBASINS to find endorheic basins to include in "protected areas" dataset

In [None]:
import warnings

import contextily as ctx
import geopandas as gpd
import pandas as pd
from shapely.geometry import Polygon, box
from tqdm.notebook import tqdm

from sliiders import settings as sset
from sliiders import spatial

spatial.filter_spatial_warnings()

In [None]:
# The European file is read on its own in the Caspian section, so it is taken
# out of the set of level-0 HydroBASINS shapefiles collected here.
# `set.remove` raises KeyError if the European file was not found by the glob.
eu_basin_shapefile = sset.DIR_HYDROBASINS_RAW / "hybas_eu_lev00_v1c.shp"

all_basin_shapefiles = set(sset.DIR_HYDROBASINS_RAW.glob("hybas_*_lev00_v1c.shp"))
all_basin_shapefiles.remove(eu_basin_shapefile)

In [None]:
# Read every non-European basin shapefile (with a progress bar) and stack the
# results into one table with a fresh 0..n-1 index.
non_eu_basin_frames = []
for basin_shapefile in tqdm(all_basin_shapefiles):
    non_eu_basin_frames.append(gpd.read_file(basin_shapefile))

all_other_basins = pd.concat(non_eu_basin_frames, ignore_index=True)

In [None]:
# Split the Natural Earth ocean layer into single-part geometries and rank them
# by area, largest first: the ocean is expected in the first row and the
# Caspian Sea in the second.
ocean_and_caspian = gpd.read_file(sset.PATH_NATURALEARTH_OCEAN).explode(
    index_parts=False
)
ocean_and_caspian["area"] = ocean_and_caspian.area
ocean_and_caspian = ocean_and_caspian.sort_values(by="area", ascending=False)

In [None]:
# The table is sorted by descending area, so the two largest parts come first.
two_largest_parts = ocean_and_caspian.geometry.values[:2]
ocean_shape = two_largest_parts[0]
caspian_shape = two_largest_parts[1]

In [None]:
# Render the selected ocean geometry as the cell output for a visual check.
ocean_shape

In [None]:
# Render the selected Caspian geometry as the cell output for a visual check.
caspian_shape

In [None]:
# Grow the ocean by the configured buffer distance, then simplify the outline
# (tolerance 0.1) to reduce the cost of the intersections that follow.
buffered_ocean = ocean_shape.buffer(sset.ENDORHEIC_BASIN_OCEAN_BUFFER)
ocean_buffer = buffered_ocean.simplify(tolerance=0.1)

### Handle Caspian Sea as a special case since it is considered "ocean" by HydroBASINS

In [None]:
eu_basins = gpd.read_file(eu_basin_shapefile)

# Seed set: basins intersecting the Caspian's bounding box grown by 0.5 (in the
# data's coordinate units).
caspian_search_area = box(*caspian_shape.bounds).buffer(0.5)
eu_basins["touches_caspian"] = eu_basins["geometry"].intersects(caspian_search_area)
eu_basins["feeds_into_caspian"] = eu_basins["touches_caspian"].copy()

# Grow the flagged set until it stops changing: a basin is flagged when its
# NEXT_DOWN, NEXT_SINK or MAIN_BAS value is the HYBAS_ID of a flagged basin.
link_columns = ("NEXT_DOWN", "NEXT_SINK", "MAIN_BAS")
flagged_count_before = 0
while True:
    caspian_ids = set(eu_basins.loc[eu_basins["feeds_into_caspian"], "HYBAS_ID"])

    feeds = eu_basins["feeds_into_caspian"]
    for link_column in link_columns:
        feeds = feeds | eu_basins[link_column].isin(caspian_ids)
    eu_basins["feeds_into_caspian"] = feeds

    flagged_count_now = eu_basins["feeds_into_caspian"].sum()
    if flagged_count_now == flagged_count_before:
        break
    flagged_count_before = flagged_count_now

In [None]:
# Visual check: map only the basins flagged as feeding into the Caspian.
eu_basins[eu_basins["feeds_into_caspian"]].plot()

In [None]:
# Give every Caspian-feeding basin ENDO = 1 so that the later `ENDO == 0` test
# does not classify it as non-endorheic.
feeds_caspian_mask = eu_basins["feeds_into_caspian"]
eu_basins.loc[feeds_caspian_mask, "ENDO"] = 1

In [None]:
# Re-attach the (now adjusted) European basins to the rest of the world.
basin_frames = [all_other_basins, eu_basins]
all_basins = pd.concat(basin_frames, ignore_index=True)

### Apply a narrow definition of "endorheic" by assuming that all "virtual" connections (e.g. groundwater) are real connections

In [None]:
# Start from the basins whose ENDO code is 0, then repeatedly flag any basin
# whose NEXT_DOWN, NEXT_SINK or MAIN_BAS points at an already-flagged HYBAS_ID.
# The loop ends once the set of flagged IDs has the same size on two
# consecutive passes.
all_basins["not_endorheic"] = all_basins["ENDO"] == 0

link_columns = ("NEXT_DOWN", "NEXT_SINK", "MAIN_BAS")
previous_id_count = -1
while True:
    flagged_ids = set(all_basins.loc[all_basins["not_endorheic"], "HYBAS_ID"])

    flagged = all_basins["not_endorheic"]
    for link_column in link_columns:
        flagged = flagged | all_basins[link_column].isin(flagged_ids)
    all_basins["not_endorheic"] = flagged

    if len(flagged_ids) == previous_id_count:
        break
    previous_id_count = len(flagged_ids)

In [None]:
# Keep an independent copy of the basins that were never flagged.
still_endorheic = ~all_basins["not_endorheic"]
all_endorheic_basins = all_basins[still_endorheic].copy()

In [None]:
# Map the candidate endorheic basins over a web basemap for a visual check.
# The basemap CRS is EPSG:4326, the same code used by the other basemap plots
# of this data in the notebook (the previous "EPSG:4327" was a typo).
ax = all_endorheic_basins.plot(figsize=(20, 20))
ctx.add_basemap(ax, crs="EPSG:4326")

### Divide ocean shape into 1-degree tiles

In [None]:
# Build a grid of 1x1-degree boxes, each identified by its lower-left corner
# (llat, llon). Rows are ordered latitude-major, longitude-minor.
llats = range(-90, 91)
llons = range(-180, 181)

lower_left_corners = [(lat, lon) for lat in llats for lon in llons]
llats_list = [lat for lat, _ in lower_left_corners]
llons_list = [lon for _, lon in lower_left_corners]
boxes = [box(lon, lat, lon + 1, lat + 1) for lat, lon in lower_left_corners]

ocean_boxes_gdf = gpd.GeoDataFrame(
    {"llat": llats_list, "llon": llons_list},
    geometry=boxes,
    crs="EPSG:4326",
)

In [None]:
# Clip each grid box to the buffered ocean; boxes that miss it become empty.
clipped_to_ocean = ocean_boxes_gdf.geometry.intersection(ocean_buffer)
ocean_boxes_gdf["ocean_box"] = clipped_to_ocean

In [None]:
# Replace the full box geometry with its clipped ocean part: drop the original
# "geometry" column and rename "ocean_box" to take its place.
# NOTE(review): this relies on geopandas treating the renamed column as the
# active geometry afterwards — confirm on the installed version.
ocean_boxes_gdf = ocean_boxes_gdf.drop(columns="geometry").rename(
    columns={"ocean_box": "geometry"}
)

In [None]:
# Discard tiles whose geometry is empty.
has_ocean_part = ~ocean_boxes_gdf["geometry"].is_empty
ocean_boxes_gdf = ocean_boxes_gdf.loc[has_ocean_part]

### Find all endorheic basins that intersect with the ocean buffer, label them "not_endorheic"

In [None]:
# Left-join each candidate endorheic basin to the ocean-buffer tiles it
# intersects; basins without any match keep a null "index_right".
# `predicate=` is the current name of the keyword formerly spelled `op=`,
# which geopandas deprecated and later removed.
intersections = gpd.sjoin(
    all_endorheic_basins, ocean_boxes_gdf, how="left", predicate="intersects"
)

In [None]:
# IDs of basins that matched no ocean-buffer tile in the left join.
unmatched_rows = intersections["index_right"].isnull()
no_ocean = set(intersections.loc[unmatched_rows, "HYBAS_ID"].unique())

In [None]:
# Re-seed the flag: every basin that is not in the "no ocean contact" set is
# treated as not endorheic.
away_from_ocean = all_endorheic_basins["HYBAS_ID"].isin(no_ocean)
all_endorheic_basins["not_endorheic"] = ~away_from_ocean

### Once basins are labelled "not_endorheic" close to the ocean, we want basins flowing into those to be "not_endorheic" as well

In [None]:
# Spread the flag to connected basins: a basin is flagged when its NEXT_DOWN,
# NEXT_SINK or MAIN_BAS value is the HYBAS_ID of a flagged basin. Stop once the
# set of flagged IDs has the same size on two consecutive passes.
link_columns = ("NEXT_DOWN", "NEXT_SINK", "MAIN_BAS")
previous_id_count = -1
while True:
    flagged_ids = set(
        all_endorheic_basins.loc[all_endorheic_basins["not_endorheic"], "HYBAS_ID"]
    )

    flagged = all_endorheic_basins["not_endorheic"]
    for link_column in link_columns:
        flagged = flagged | all_endorheic_basins[link_column].isin(flagged_ids)
    all_endorheic_basins["not_endorheic"] = flagged

    if len(flagged_ids) == previous_id_count:
        break
    previous_id_count = len(flagged_ids)

In [None]:
# Keep only the basins that are still unflagged after the propagation.
remains_endorheic = ~all_endorheic_basins["not_endorheic"]
all_endorheic_basins = all_endorheic_basins.loc[remains_endorheic]

In [None]:
# Visual check: map the remaining endorheic basins over a web basemap,
# passing EPSG:4326 as the CRS of the plotted data.
ax = all_endorheic_basins.plot(figsize=(20, 20))
ctx.add_basemap(ax, crs="EPSG:4326")

In [None]:
# Union all remaining endorheic basins and take the parts of the result.
# NOTE(review): `.geoms` presumably requires a multi-part union — confirm the
# behavior if the basins ever dissolve into a single polygon.
endorheic_basins_dissolved = all_endorheic_basins.unary_union.geoms

In [None]:
# One row per dissolved part, ranked by area (largest first), keeping only the
# parts larger than the configured minimum area.
combined_basins = gpd.GeoDataFrame(
    geometry=gpd.GeoSeries(list(endorheic_basins_dissolved))
)
combined_basins["area"] = combined_basins.geometry.area
combined_basins = combined_basins.sort_values("area", ascending=False)

exceeds_min_area = combined_basins["area"] > sset.MIN_BASIN_TILE_DEGREE_AREA
combined_basins = combined_basins.loc[exceeds_min_area].copy()

### Label basins manually (check each basin manually)

In [None]:
# Map the large dissolved basins over a web basemap so that each one can be
# identified before it is labelled.
ax = combined_basins.plot(figsize=(20, 20))
ctx.add_basemap(ax, crs="EPSG:4326")

In [None]:
# Labels are assigned by position, so they follow the current row order of
# `combined_basins` (sorted by descending area when it was built). The list
# must have exactly one entry per row.
basin_labels = [
    "eurasia_caspian",
    "sahara_sahel",
    "central_australia",
    "arabian_peninsula_dead_sea",
    "altiplano_and_argentina",
    "southern_africa",
    "great_lakes_and_horn_of_africa",
    "great_basin",
]
combined_basins["label"] = basin_labels

### Fill Eurasian-Caspian basin with the Caspian itself

In [None]:
# Rebuild the Eurasia/Caspian basin from its exterior ring only, which removes
# any interior holes from the polygon.
is_caspian_basin = combined_basins["label"] == "eurasia_caspian"
surrounding_caspian = combined_basins.loc[is_caspian_basin, "geometry"].values[0]
combined_basins.loc[is_caspian_basin, "geometry"] = Polygon(
    surrounding_caspian.exterior
)

In [None]:
# The helper "area" column is no longer needed; also renumber rows from 0.
combined_basins = combined_basins.drop(columns="area")
combined_basins = combined_basins.reset_index(drop=True)

In [None]:
# Visual check of the labelled basins after the Caspian basin was filled.
combined_basins.plot()

In [None]:
# Create the output directory (and any missing parents); no error if it exists.
sset.PATH_MANUAL_PROTECTED_AREAS.parent.mkdir(exist_ok=True, parents=True)

In [None]:
# Persist the labelled endorheic basins as a parquet file.
combined_basins.to_parquet(sset.PATH_MANUAL_PROTECTED_AREAS)

In [None]:
# Reload the basins from the parquet file written above.
combined_basins = gpd.read_parquet(sset.PATH_MANUAL_PROTECTED_AREAS)

## Combine protected areas for global processing
- US National Levee Database (NLDB) and manual areas
- Large global endorheic basins
- Manual boxes

In [None]:
# Load the US protected areas (levee database plus manual areas) from parquet.
nldb_and_manual_areas = gpd.read_parquet(sset.PATH_US_MANUAL_PROTECTED_AREAS)

Netherlands (assume all of the European Netherlands is protected)

In [None]:
# Take the geometry of the rows whose ISO code is "NLD" from the country
# Voronoi shapes. Columns are added to this frame further down, so it is
# selected in a single `.loc` step and copied: that makes it an independent
# table instead of a chained-indexing slice of `vor_shapes`.
vor_shapes = gpd.read_parquet(sset.PATH_GADM_ADM0_VORONOI)
protected_areas_nld = vor_shapes.loc[vor_shapes["ISO"] == "NLD", ["geometry"]].copy()

Manual boxes (in addition to those defined in `sset.PATH_US_MANUAL_PROTECTED_AREAS`)

In [None]:
# Hand-picked rectangular areas, one tuple per box:
# (label, minx, miny, maxx, maxy).
manual_box_bounds = [
    dict(zip(("label", "minx", "miny", "maxx", "maxy"), entry))
    for entry in [
        ("orinoco", -70.0, 5.0, -66.0, 8.0),
        ("heilongjiang", 130.0, 45.0, 136.0, 48.0),
        ("southern_africa", 28.0, -25.0, 33.0, -20.0),
        ("great_basin", -119.0, 35.0, -115.0, 40.0),
        ("inner_australia", 135.0, -32.0, 143.0, -25.0),
        ("yakutsk", 125.0, 62.0, 130.0, 67.0),
        ("lake_baikal", 102.0, 49.0, 113.0, 57.0),
        ("great_lakes", -95.0, 41.0, -75.0, 50.0),
    ]
]

# Countries covered by each box, keyed by the box label.
box_countries = {
    "orinoco": "Colombia, Venezuela",
    "heilongjiang": "China",
    "southern_africa": "Botswana, South Africa, Zimbabwe",
    "great_basin": "USA",
    "inner_australia": "Australia",
    "yakutsk": "Russia",
    "lake_baikal": "Russia",
    "great_lakes": "Canada, USA",
}

# Build the table directly with only the label and the rectangle geometry.
manual_boxes = gpd.GeoDataFrame(
    {"label": [bounds["label"] for bounds in manual_box_bounds]},
    geometry=[
        box(bounds["minx"], bounds["miny"], bounds["maxx"], bounds["maxy"])
        for bounds in manual_box_bounds
    ],
)

manual_boxes.plot()

In [None]:
# Give every source table the same descriptive columns before stacking them.
# Sources other than the US table have no levee segment, hence the -1 id.
nldb_and_manual_areas["country"] = "USA"

constant_columns = [
    (
        protected_areas_nld,
        {
            "levee_segment_id": -1,
            "protection_group": "Netherlands",
            "protection_type": "Netherlands",
            "country": "NLD",
        },
    ),
    (
        combined_basins,
        {
            "levee_segment_id": -1,
            "protection_group": "largest endorheic basin areas, with buffer from ocean",
            "protection_type": "endorheic basin",
            "country": "multiple",
        },
    ),
    (
        manual_boxes,
        {
            "levee_segment_id": -1,
            "protection_group": "manual boxes",
            "protection_type": "non-coastal",
        },
    ),
]
for frame, column_values in constant_columns:
    for column_name, constant in column_values.items():
        frame[column_name] = constant

# The manual boxes get their country from the label lookup; an unknown label
# raises KeyError.
manual_boxes["country"] = [box_countries[name] for name in manual_boxes["label"]]

In [None]:
# Stack all sources with a fresh 0..n-1 index, then expose that index as the
# "protection_zone_id" column.
protected_area_sources = [
    nldb_and_manual_areas,
    protected_areas_nld,
    combined_basins,
    manual_boxes,
]
protected_areas = pd.concat(protected_area_sources, ignore_index=True)
protected_areas = protected_areas.reset_index(drop=False)
protected_areas = protected_areas.rename(columns={"index": "protection_zone_id"})

In [None]:
# Show five randomly drawn rows as a spot check of the combined table.
protected_areas.sample(5)

In [None]:
# Visual check: plot every protected area in the combined table.
protected_areas.plot()

# Save combined areas

In [None]:
# Write the combined protected areas to the global parquet output path.
protected_areas.to_parquet(sset.PATH_GLOBAL_PROTECTED_AREAS)