# Update upstream and downstream polygons of split polygons
After splitting the polygons that contain nested gauges, I forgot to update two things:
- The `NextDownID` value of the polygon immediately upstream of the split one;
- The `upX` value of the polygon immediately downstream of the split one.
Code below based on Cyril's fix.

In [1]:
import warnings
import geopandas as gpd
import matplotlib.pyplot as plt
import math
import math
import numpy as np
import pandas as pd
from pathlib import Path
import rasterio
from rasterio.features import rasterize
#from shapely.geometry import LineString, MultiLineString

## Functions

In [2]:
def find_all_upstream(df, start_comid):
    """Collect the IDs of every reach upstream of ``start_comid``, inclusive.

    Walks the river network stored in ``df`` by repeatedly following the
    ``up1``..``up4`` columns, starting at ``start_comid``.

    Args:
        df: DataFrame with a ``COMID`` column and ``up1``..``up4`` columns
            that hold the IDs of the rows directly upstream of each row.
        start_comid: ID at which the traversal starts.

    Returns:
        set: ``start_comid`` plus every ID reachable through the ``up``
        columns. IDs that appear in an ``up`` column but not in
        ``df["COMID"]`` are included too (they simply cannot be expanded
        further). Placeholder values (0, "0", "", None, NaN/NA) are never
        included; if ``start_comid`` itself is a placeholder the set is empty.
    """
    # Columns to look at
    up_cols = ['up1', 'up2', 'up3', 'up4']

    def _is_placeholder(value):
        # pd.isna() covers None as well as NaN/NA, which a plain membership
        # test misses because NaN never compares equal to anything
        if pd.isna(value):
            return True
        if isinstance(value, str):
            return value in ("", "0")
        return bool(value == 0)

    # Build the COMID -> upstream IDs lookup once, rather than scanning the
    # whole frame for every visited reach. Rows sharing a COMID are merged,
    # matching what a boolean-mask selection followed by flatten() would give.
    upstream_of = {}
    for comid, ups in zip(df["COMID"].to_numpy(), df[up_cols].to_numpy()):
        upstream_of.setdefault(comid, []).extend(ups)

    visited = set()
    to_visit = [start_comid]

    while to_visit:
        current = to_visit.pop()
        if _is_placeholder(current) or current in visited:
            continue
        visited.add(current)

        # Add new valid, unvisited upstreams to the stack; IDs without a row
        # in df have no entry in the lookup and therefore end the walk here
        for uid in upstream_of.get(current, ()):
            if not _is_placeholder(uid) and uid not in visited:
                to_visit.append(uid)

    return visited

## Processing

In [3]:
# Data location
# Root folder of the source data: the per-basin river/basin shapefiles and the
# DEM rasters used for the slope calculation are all read from below this path.
cs_main_folder = Path("/scratch/gwf/gwf_cmt/wknoben/camels-spat-upload")

In [4]:
# Destination location
# Root folder the updated river shapefiles are written to. The same
# sub-folder layout as under the data location is recreated here.
cs_update_folder = Path("/scratch/gwf/gwf_cmt/wknoben/camels-spat-upload-updates")

In [5]:
# Specify the folder structure
# Basin folders are looked up under <root>/<part1>/<part2>/<part3>/, where
# <part2> takes each of the values in shape_path_parts2 in turn. The same
# <part2> value is also used to build the path to a basin's DEM file.
shape_path_part1 = "shapefiles"
shape_path_parts2 = ["headwater", "macro-scale", "meso-scale"]
shape_path_part3 = "shapes-distributed"

In [6]:
# Trial
# When True, only the basin whose path contains "CAN_05BB001" is processed and
# before/after tables of selected rows are printed.
debug = False

# Empties for check lists later
# check_df1_*: one entry per NextDownID that was replaced on an upstream segment
# check_df2_*: one entry per upX value that was replaced on a downstream segment
check_df1_p1 = []
check_df1_p2 = []
check_df1_p3 = []
check_df1_p4 = []
check_df2_p1 = []
check_df2_p2 = []
check_df2_p3 = []
check_df2_p4 = []

# Names of the columns holding the IDs of directly-upstream segments
up_fields = ['up1', 'up2', 'up3', 'up4']

# Loop
for shape_path_part2 in shape_path_parts2:

    # 1. Find the basin folders
    shape_middle = f"{shape_path_part1}/{shape_path_part2}/{shape_path_part3}"
    basin_paths = [f for f in (cs_main_folder / shape_middle).iterdir() if f.is_dir()]

    # 2. Loop over the identified folders
    for basin_path in basin_paths:

        # DEBUG - skip all basins apart from one
        if debug:
            if "CAN_05BB001" not in str(basin_path):
                continue # trial just bow at banff

        # 2.1. Extract the basin ID
        basin_id = basin_path.name # just the final part, e.g. USA_08164300

        # 2.2. Quickly exit if we already have something for this basin in the update location (useful for debugging)
        #      Checked before reading the shapefile so finished basins cost no file I/O.
        des_file = cs_update_folder / shape_middle / basin_id / f"{basin_id}_distributed_river.shp"
        if des_file.exists():
            continue # to next basin

        # 2.3. Load the river shapefile
        riv = gpd.read_file(basin_path / f"{basin_id}_distributed_river.shp")

        # 2.3.1. Quickly exit if this is an empty river dataframe (all None)
        #        Must be a boolean `and`: a bitwise `&` between the row count and a bool
        #        only tests the lowest bit of the count, and does not short-circuit, so
        #        `.iloc[0]` would raise on a frame with zero rows.
        if len(riv) > 0 and riv["COMID"].iloc[0] is None:
            continue # to next basin

        # DEBUG
        if debug:
            print("Before update code:")
            print(riv[['COMID','NextDownID','up1', 'up2', 'up3', 'up4', 'uparea', 'slope']].iloc[[39,37,51,50,36]])
            print("")

        # 2.4. Check if we have a split polygon and do things if so
        #      Split polygons are recognised by a non-integer COMID (e.g. 78011699.2)
        split_polys = riv[riv['COMID'] % 1 != 0]['COMID']
        if len(split_polys) > 0:
            print(f"\n- {basin_id} identified as having a split polygon.")

            # 2.4.1. Loop over the river segments and modify the network connections if needed
            for split_COMID in split_polys:
                print(f"  - Processing COMID {split_COMID}")

                # 2.4.1.1. Get the row containing split_COMID
                segment = riv[riv['COMID'] == split_COMID]
                assert len(segment) == 1, f"Segment not successfully extracted for COMID {split_COMID}"

                # 2.4.1.2. Fix the NextDownID of the upstream river segment(s)
                for up in up_fields:
                    up_COMID = segment[up].iloc[0]         # find the COMID of this upstream reach
                    up_mask = (riv['COMID'] == up_COMID)   # is this COMID in this shapefile?
                    if up_mask.any():                      # If this upstream COMID is in the shapefile ...

                        print(f"    - {up} ({up_COMID}) of {split_COMID} is present in this shapefile")

                        # Get the NextDownID of this allegedly upstream segment
                        current_down = riv[up_mask]['NextDownID'].iloc[0]
                        down_found = False
                        print(f"        - Segment {up_COMID} has NextDownID {current_down} (expected: {split_COMID})")

                        # Check if this already matches our split_COMID
                        #  (either already replaced or needs to be changed)
                        if current_down == split_COMID: # already replaced
                            print("          - Current NextDownID matches expectations. No action needed.")
                            down_found = True
                        elif current_down == math.floor(split_COMID): # needs to be replaced
                            print("          - Current NextDownID is COMID from before splitting. Replacing.")
                            riv.loc[up_mask,'NextDownID'] = split_COMID
                            down_found = True

                            # Log the replacement for the summary CSV
                            check_df1_p1.append(split_COMID)
                            check_df1_p2.append(up_COMID)
                            check_df1_p3.append(current_down)
                            check_df1_p4.append(split_COMID)

                        assert down_found, f"NextDownID {current_down} of COMID {riv[up_mask]['COMID'].iloc[0]} does not match expected {split_COMID}"

                # 2.4.1.3. Fix the 'up' field of the next downstream river segment
                down_COMID = segment['NextDownID'].iloc[0]
                down_mask = (riv['COMID'] == down_COMID)
                if down_mask.any():

                    print(f"    - NextDownID ({down_COMID}) of {split_COMID} is present in this shapefile")

                    # Loop over this segment's upstream IDs
                    up_found = False # initialize a flag to check we actually replaced something
                    for up in up_fields:

                        # Get the upstream ID
                        current_up = riv[down_mask][up].iloc[0]
                        print(f"        - {up} of {down_COMID} has value {current_up} (looking for: {split_COMID})")

                        # Check if this already matches our split_COMID
                        #  (either already replaced or needs to be changed)
                        if current_up == split_COMID:
                            print(f"          - Current {up} matches expectations. No action needed.")
                            up_found = True
                            break # out of up loop
                        elif current_up == math.floor(split_COMID):
                            print(f"          - Current {up} is COMID from before splitting. Replacing.")
                            riv.loc[down_mask,up] = split_COMID
                            up_found = True

                            # Log the replacement for the summary CSV
                            check_df2_p1.append(split_COMID)
                            check_df2_p2.append(down_COMID)
                            check_df2_p3.append(current_up)
                            check_df2_p4.append(split_COMID)

                            break # out of up loop

                    # Confirm this segment did indeed contain our split COMID as one of its upstream IDs
                    assert up_found, f"COMID {down_COMID}, identified as being downstream of split polygon {split_COMID}, does not contain {split_COMID} in any of its up IDs"

            # 2.4.2. Update the 'uparea' values
            # 2.4.2.1. Load the associated basin file
            bas = gpd.read_file(basin_path / f"{basin_id}_distributed_basin.shp")

            # 2.4.2.2. Loop over the segments and find upstream areas (including the basin itself, because this is what MERIT does)
            for split_COMID in split_polys:
                segment_mask = riv['COMID'] == split_COMID # Find the row
                val = riv.loc[segment_mask,'uparea'] # Get current value
                if not val.empty and pd.isna(val.iloc[0]): # Only fill in the value if it is currently missing (NaN or None)
                    upstream_ids = find_all_upstream(riv,split_COMID) # identify upstream reaches including current one
                    upstream_area = bas[bas["COMID"].isin(upstream_ids)]["unitarea"].sum() # then, sum areas of upstreams basins [km^2]
                    riv.loc[segment_mask,'uparea'] = upstream_area # write into dataframe

            # 2.4.3. Update the 'slope' field
            for split_COMID in split_polys:

                # Find the row
                segment_mask = riv['COMID'] == split_COMID

                # Only compute a slope if the value is currently missing (NaN or None)
                val = riv.loc[segment_mask,'slope'] # Get current value
                if not val.empty and pd.isna(val.iloc[0]):

                    # Get the geometry
                    geom = riv.loc[segment_mask]['geometry'].iloc[0]

                    # Open the DEM
                    dem_file = cs_main_folder / 'geospatial' / shape_path_part2 / 'merit' / basin_id / f"{basin_id}_merit_hydro_elv.tif"
                    with rasterio.open(dem_file) as dem:

                        # Rasterize the river geometry onto the DEM grid (1 under the river, 0 elsewhere)
                        rasterized = rasterize(
                            [(geom,1)],
                            out_shape=dem.shape,
                            transform=dem.transform,
                            fill=0,
                            all_touched=True,
                            dtype="uint8"
                        )

                        # Create a masked array of the DEM, keeping values under the river segment
                        # NOTE(review): DEM nodata cells are not excluded here; if the raster uses a
                        #   numeric nodata value under the river it would enter the min/max - confirm.
                        dem_data = dem.read(1)
                        masked_dem = np.where(rasterized == 1, dem_data, np.nan)

                        # Find high and low point
                        elev1 = np.nanmax(masked_dem)
                        elev2 = np.nanmin(masked_dem)

                    # Get the length and find the slope
                    # NOTE(review): a 'new_len_km' of 0 would make this a division by zero - confirm it cannot occur.
                    length = riv.loc[segment_mask, 'new_len_km'].iloc[0] * 1000 # [km] to [m]
                    slope = abs(elev2-elev1)/length # abs() because we're not sure which end is up; [m m-1]
                    riv.loc[segment_mask, 'slope'] = slope

        # DEBUG
        if debug:
            print("\nAfter update code:")
            print(riv[['COMID','NextDownID','up1', 'up2', 'up3', 'up4', 'uparea', 'slope']].iloc[[39,37,51,50,36]])
            print("")

        # 2.5. Perform the column sorting we want for all river shapefiles
        riv = riv[['COMID','NextDownID','up1','up2','up3','up4',
                   'new_len_km','slope','uparea','geometry',
                   'lengthdir','sinuosity','order','strmDrop_t','slope_taud','maxup']]

        # 2.6. Ensure the output directory exists, and save to file
        des_folder = cs_update_folder / shape_middle / basin_id
        des_folder.mkdir(exist_ok=True, parents=True)
        with warnings.catch_warnings():
            warnings.simplefilter("ignore", category=FutureWarning)
            riv.to_file(des_folder / f"{basin_id}_distributed_river.shp")

# Create two dataframes with replacements
check_df1 = pd.DataFrame(data={'for split_COMID': check_df1_p1,
                               'found up_COMID': check_df1_p2,
                               'and changed NextDownID': check_df1_p3,
                               'to': check_df1_p4})

check_df2 = pd.DataFrame(data={'for split_COMID': check_df2_p1,
                               'found down_COMID': check_df2_p2,
                               'and changed upX': check_df2_p3,
                               'to': check_df2_p4})

# Written to the current working directory
check_df1.to_csv("river_updates_nextdownid.csv", index=False)
check_df2.to_csv("river_updates_upids.csv", index=False)


- CAN_08ME023 identified as having a split polygon.
  - Processing COMID 78011699.2

- USA_01567500 identified as having a split polygon.
  - Processing COMID 73006263.2

- CAN_02HA020 identified as having a split polygon.
  - Processing COMID 72057015.2

- CAN_02GC031 identified as having a split polygon.
  - Processing COMID 72056632.2

- CAN_05LJ027 identified as having a split polygon.
  - Processing COMID 71038679.2

- CAN_05BM018 identified as having a split polygon.
  - Processing COMID 71037868.2

- USA_01142500 identified as having a split polygon.
  - Processing COMID 73001843.2

- CAN_05CC010 identified as having a split polygon.
  - Processing COMID 71026036.2

- USA_12143600 identified as having a split polygon.
  - Processing COMID 78016804.2

- CAN_07AA007 identified as having a split polygon.
  - Processing COMID 82043065.2

- USA_02297155 identified as having a split polygon.
  - Processing COMID 73022321.2

- CAN_08MF062 identified as having a split polygon.
  - Proc