# Subset GeoTIFFs to basin shapefile outlines

In [1]:
import glob
import os
import pandas as pd
import sys
from pathlib import Path
sys.path.append(str(Path().absolute().parent))
import python_cs_functions as cs

### Config handling

In [2]:
# Specify where the config file can be found
# NOTE: this is a relative path, so it resolves against the current working directory
config_file = '../0_config/config.txt'

In [3]:
# Get the required info from the config file
# Each value is looked up by its key through cs.read_from_config()
data_path            = cs.read_from_config(config_file,'data_path')
geospatial_temp_path = cs.read_from_config(config_file,'geospatial_temp_path')

# CAMELS-spat metadata
# NOTE(review): the metadata location is read from the 'cs_basin_path' key, i.e. the same
# key that is used for the basin folder below - confirm this is intended
cs_meta_path = cs.read_from_config(config_file,'cs_basin_path')
cs_meta_name = cs.read_from_config(config_file,'cs_meta_name')
cs_unusable_name = cs.read_from_config(config_file,'cs_unusable_name')

# Basin folder
# The folder name from the config file is joined onto the main data path
cs_basin_folder = cs.read_from_config(config_file, 'cs_basin_path')
basins_path = Path(data_path) / cs_basin_folder

### Data loading

In [4]:
# CAMELS-spat metadata file
# NOTE: 'cs_meta_path' is rebound here from the raw config value to a path below 'data_path';
# re-running this cell without re-running the config cell joins 'data_path' a second time
cs_meta_path = Path(data_path) / cs_meta_path
cs_meta = pd.read_csv(cs_meta_path / cs_meta_name)

In [5]:
# Open list of unusable stations; Enforce reading IDs as string to keep leading 0's
# NOTE(review): 'cs_unusable' is not referenced by the cells visible here - confirm whether
# unusable stations should be skipped during processing
cs_unusable = pd.read_csv(cs_meta_path / cs_unusable_name, dtype={'Station_id': object})

### Find an overview of the data we have

In [6]:
# Folder that holds the downloaded geospatial data
data_folder = Path(data_path) / geospatial_temp_path
# Direct children of the data folder, as strings
# NOTE(review): 'data_folders' is not referenced by the cells visible here
data_folders = glob.glob( str(data_folder/'*')) # E.g., geospatial_temp/lai

In [73]:
# Recursively collect every file below the data folder whose name ends in '.tif'.
# Paths are built with pathlib instead of string concatenation; the order of the
# list follows os.walk().
data_files = [
    Path(root) / file
    for root, _, filenames in os.walk(data_folder)
    for file in filenames
    if file.endswith('.tif')
]

### Processing

In [7]:
# Reminder that is printed before processing starts.
# Plain string: there are no placeholders, so no f-string is needed.
debug_message = '\n!!! CHECK DEBUGGING STATUS: \n- Testing 1 file \n- Testing 1 basin'

In [None]:
print(debug_message)
for ix,row in cs_meta.iterrows():

    # DEBUGGING: skip everything except the metadata row with index label 0
    if ix != 0:
        continue

    # The delineation outputs give us the basin ID and the lumped basin shapefile;
    # the remaining three return values are not needed here
    basin_id, shp_lump_path, _, _, _ = cs.prepare_delineation_outputs(cs_meta, ix, basins_path)
    print(f'Processing GeoTIFFs for {basin_id}')

    # Root of this basin's geospatial data; sub-folders are added per file below
    basin_geo_folder = basins_path / 'basin_data' / basin_id / 'geospatial'

    # Subset every GeoTIFF we found to the basin outline
    for file in data_files:

        # Mirror the folder structure of the download folder inside the basin folder
        relative_path = file.relative_to(data_folder)
        des_folder = basin_geo_folder / relative_path.parent

        # 'buffer' adds a small, data-set dependent, buffer around the shapefile to ensure full coverage
        cs.subset_geotiff_to_shapefile(file, shp_lump_path, des_folder, buffer=True)


!!! CHECK DEBUGGING STATUS: 
- Testing 1 file 
- Testing 1 basin
Processing GeoTIFFs for CAN_01AD002


### Functions

In [53]:
import geopandas as gpd
import warnings

In [75]:
def subset_geotiff_to_shapefile(src_file,src_shape,des_folder,
                                buffer=False,
                                out_no_data = None):
    '''Clip a GeoTIFF to the outline of a shapefile and save the result in des_folder.

    The clipped file keeps the name of the source file. Both source and destination are
    treated as EPSG:4326.

    Parameters
    ----------
    src_file : str or Path
        GeoTIFF to clip.
    src_shape : str or Path
        Vector file whose outline is used as the cutline.
    des_folder : Path
        Folder for the clipped GeoTIFF; created (including parents) if it does not exist.
    buffer : bool, optional
        If True, the outline is first grown by half the pixel diagonal of src_file, so
        that pixels that are only partly covered by the outline are still included.
        The buffered outline is written to a temporary '<name>_TEMP' file next to
        src_shape and removed again afterwards.
    out_no_data : optional
        Passed to gdal.Warp() as 'dstNodata'.

    Raises
    ------
    RuntimeError
        If buffering is requested and src_file cannot be opened.

    NOTE(review): 'gdal' must be available in the calling namespace
    (presumably 'from osgeo import gdal') - it is not imported in the visible cells.
    '''

    # Input cleaning
    des_folder.mkdir(parents=True, exist_ok=True)
    des_file  = str(des_folder / os.path.basename(src_file))
    src_file  = str(src_file)
    src_shape = str(src_shape)

    # Handle buffering of shapefile, if requested
    if buffer:
        # Insert '_TEMP' before the extension. Splitting off the extension (instead of
        # replacing '.shp' anywhere in the path) guarantees that the temporary file never
        # has the same name as the source file, whatever the extension is.
        shape_root, shape_ext = os.path.splitext(src_shape)
        tmp_shape = shape_root + '_TEMP' + shape_ext

        # Find buffer distance
        src_tiff = gdal.Open(src_file, gdal.GA_ReadOnly)
        if src_tiff is None:
            raise RuntimeError(f'Could not open {src_file}')
        geo_transform = src_tiff.GetGeoTransform()
        src_tiff = None # release the dataset
        pixel_x = geo_transform[1]
        pixel_y = geo_transform[5]

        # Half the pixel diagonal, i.e. the distance from the center of a pixel to its corner.
        # Stored under its own name so the 'buffer' flag keeps its meaning for the cleanup below.
        buffer_dist = 0.5*(pixel_x**2 + pixel_y**2)**(0.5)

        # Temporarily block warnings:
        # gpd will tell us that buffering in EPSG:4326 is not accurate - this is fine because we're
        # buffering in lat/lon units
        with warnings.catch_warnings():
            warnings.simplefilter('ignore')

            # Buffer the shapefile
            shp = gpd.read_file(src_shape)
            shp['geometry'] = shp.buffer(buffer_dist)
            shp.to_file(tmp_shape)
    else:
        # Not using buffered shape, but code below still needs 'tmp_shape' to have a value
        tmp_shape = src_shape

    try:
        # Clip
        gdal.Warp(destNameOrDestDS = des_file,
                  srcDSOrSrcDSTab  = src_file,
                  cutlineDSName    = tmp_shape, # vector file
                  cropToCutline    = True, # Select True
                  copyMetadata     = True, # optional
                  #dstAlpha         = True, # Dropping the alpha band saves half the file size
                  dstNodata        = out_no_data,
                  srcSRS           = 'EPSG:4326',
                  dstSRS           = 'EPSG:4326',
                  #resampleAlg      = "nearestneighbour"
                 )
    finally:
        # Remove buffered shapefile, also if the clipping failed.
        # A shapefile consists of several files with the same base name, so the
        # sidecar files need to go as well.
        if buffer:
            tmp_root = os.path.splitext(tmp_shape)[0]
            sidecars = [tmp_root + ext for ext in ('.shx', '.dbf', '.prj', '.cpg')]
            for tmp_file in [tmp_shape] + sidecars:
                if os.path.exists(tmp_file):
                    os.remove(tmp_file)