# Subset shapefiles to basin shapefile outlines

In [1]:
import glob
import os
import pandas as pd
import sys
from pathlib import Path
sys.path.append(str(Path().absolute().parent))
import python_cs_functions as cs

### Config handling

In [2]:
# Relative path to the config file that the cells below read their settings from
config_file = "../0_config/config.txt"

In [3]:
# Read the settings this notebook needs from the config file
data_path, geospatial_temp_path = (
    cs.read_from_config(config_file, key)
    for key in ('data_path', 'geospatial_temp_path')
)

# CAMELS-spat metadata: folder, metadata file name, unusable-stations file name
# NOTE(review): the metadata folder is read from the same 'cs_basin_path' key as the
#   basin folder below - confirm this is intended.
cs_meta_path, cs_meta_name, cs_unusable_name = (
    cs.read_from_config(config_file, key)
    for key in ('cs_basin_path', 'cs_meta_name', 'cs_unusable_name')
)

# Basin folder, resolved against the data path
cs_basin_folder = cs.read_from_config(config_file, 'cs_basin_path')
basins_path = Path(data_path).joinpath(cs_basin_folder)

### Data loading

In [4]:
# Resolve the metadata folder against the data path, then load the CAMELS-spat metadata table
cs_meta_path = Path(data_path, cs_meta_path)
cs_meta = pd.read_csv(cs_meta_path.joinpath(cs_meta_name))

In [5]:
# Load the list of unusable stations.
# Station IDs are read with dtype object so they stay strings and keep their leading 0's.
cs_unusable = pd.read_csv(
    cs_meta_path.joinpath(cs_unusable_name),
    dtype={'Station_id': object},
)

### Find an overview of the data we have

In [6]:
# Folder that holds the temporary geospatial data
data_folder = Path(data_path, geospatial_temp_path)

# Everything directly inside that folder; e.g., geospatial_temp/lai
data_folders = glob.glob(str(data_folder.joinpath('*')))

In [7]:
# Walk the whole data folder and collect the path of every shapefile (.shp) found in it
data_files = []
for root, dirnames, filenames in os.walk(data_folder):
    for file in filenames:
        if not file.endswith('.shp'):
            continue
        data_files.append(Path(f'{root}/{file}'))

### Processing

In [104]:
# Banner printed at the start of processing, as a reminder to check the debugging switches
debug_message = (
    '\n!!! CHECK DEBUGGING STATUS: '
    '\n- Full run in progress'
)

In [None]:
print(debug_message)

# Loop over the files we want to subset
# NOTE(review): cs_unusable is loaded earlier in this notebook but is not consulted in this
#   loop, so every basin listed in cs_meta is processed - confirm this is intended.
for file in data_files:

    # Load the source data now, so we only need to do that once per file
    src = gpd.read_file(file)
    print(f'Processing {file}')

    # Get the relative path compared to the download folder;
    # In other words, find which folders we want to create for each basin.
    # This depends on the file only, so we determine it once here instead of once per basin.
    relative_path = file.relative_to(data_folder)

    for ix,row in cs_meta.iterrows():

        # DEBUGGING
        #if ix != 0: continue

        # Get the basin ID and the path to the basin shapefile we subset to
        basin_id, shp_lump_path, _, _, _ = cs.prepare_delineation_outputs(cs_meta, ix, basins_path)
        print(f' - Processing {basin_id}')

        # Mirror the source folder structure inside this basin's geospatial folder
        des_folder = basins_path / 'basin_data' / basin_id / 'geospatial' / relative_path.parent

        # Subset the file
        subset_shapefile_to_shapefile(src, file, shp_lump_path, des_folder)


!!! CHECK DEBUGGING STATUS: 
- Full run in progress
Processing /Users/wmk934/data/CAMELS_spat/geospatial_temp/glhymps/raw/glhymps.shp
 - Processing CAN_01AD002
 - Processing CAN_01AD003
 - Processing CAN_01AE001
 - Processing CAN_01AF007
 - Processing CAN_01AF009
 - Processing CAN_01AJ003
 - Processing CAN_01AJ004
 - Processing CAN_01AJ010
 - Processing CAN_01AK001
 - Processing CAN_01AK006
 - Processing CAN_01AK007
 - Processing CAN_01AL002
 - Processing CAN_01AL004
 - Processing CAN_01AM001
 - Processing CAN_01AN002
 - Processing CAN_01AP002
 - Processing CAN_01AP004
 - Processing CAN_01AP006
 - Processing CAN_01AQ001
 - Processing CAN_01BC001
 - Processing CAN_01BD008
 - Processing CAN_01BE001
 - Processing CAN_01BG005
 - Processing CAN_01BG008
 - Processing CAN_01BG009
 - Processing CAN_01BH005
 - Processing CAN_01BH010
 - Processing CAN_01BJ003
 - Processing CAN_01BJ007
 - Processing CAN_01BJ010
 - Processing CAN_01BJ012
 - Processing CAN_01BL002
 - Processing CAN_01BL003
 - Proc

### Functions

In [18]:
import geopandas as gpd

In [100]:
def subset_shapefile_to_shapefile(src,src_file,src_shape,des_folder):
    """Subset an already-loaded source shapefile to a basin outline and save the result.

    Parameters
    ----------
    src : geopandas.GeoDataFrame
        Contents of `src_file`, loaded by the caller so it can be reused for many basins.
        It is not modified by this function.
    src_file : str or pathlib.Path
        Path that `src` was read from. Its lower-cased text selects the subsetting method
        ('hydrolakes' or 'glhymps') and its file name is reused for the output file.
    src_shape : str or pathlib.Path
        Path to the basin shapefile to subset to.
    des_folder : pathlib.Path
        Folder the subset is written to; created (including parents) if it does not exist.

    Raises
    ------
    ValueError
        If `src_file` contains neither 'hydrolakes' nor 'glhymps', because no subsetting
        method is defined for other data sets.
    """

    # Input cleaning
    des_file  = str(des_folder / os.path.basename(src_file))
    src_file  = str(src_file)
    src_shape = str(src_shape)

    # Decide how to subset before touching the disk, so an unsupported file fails with a clear
    # message instead of creating folders and then hitting an undefined output variable
    dataset = src_file.lower()
    is_hydrolakes = 'hydrolakes' in dataset
    is_glhymps = 'glhymps' in dataset
    if not (is_hydrolakes or is_glhymps):
        raise ValueError(f'No subsetting method defined for {src_file}; '
                         f'expected "hydrolakes" or "glhymps" in the file path')

    des_folder.mkdir(parents=True, exist_ok=True)

    # Open the basin shapefile
    shp = gpd.read_file(src_shape)

    if is_hydrolakes:
        # Keep every lake polygon that intersects the basin, without clipping it.
        # Note: des = gpd.overlay(src,shp, how='intersection') may be faster, but it clips the
        #   lake polygons to the catchment extent. The lake area reported as part of the
        #   HydroLAKES subset may then be inaccurate for polygons clipped by the catchment
        #   outline. With the current approach we return the complete lake polygons.
        basin_geoms = shp.geometry
        # Merge all basin geometries into one outline, so that each lake gets a single
        # True/False answer. union_all() replaces the deprecated unary_union in newer GeoPandas.
        if hasattr(basin_geoms, 'union_all'):
            basin_outline = basin_geoms.union_all()
        else:
            basin_outline = basin_geoms.unary_union
        # Vectorized test; the mask lives in a local variable so the caller's src, which is
        # reused for every basin, does not gain an extra column
        mask = src.intersects(basin_outline)
        des = src[mask].copy().reset_index(drop=True)
    else:
        # GLHYMPS: clipping these polygons is fine because they don't contain an 'area' field
        des = gpd.overlay(src,shp,how='intersection')

    # To file
    des.to_file(des_file)

    return # nothing