## Calculate attributes
Takes prepared geospatial data and computes various attributes.

In [1]:
import pandas as pd
import sys
from pathlib import Path
sys.path.append(str(Path().absolute().parent))
from python_cs_functions import config as cs, attributes as csa
from python_cs_functions.delineate import prepare_delineation_outputs

### Config handling

In [2]:
# Location of the project configuration file, relative to this notebook
config_file = "../0_config/config.txt"

In [6]:
# Base data folder; the basin and metadata folders below are resolved relative to it
data_path = cs.read_from_config(config_file, 'data_path')

# CAMELS-spat metadata: containing folder, metadata file name and unusable-stations file name.
# NOTE(review): the metadata folder is read from the same 'cs_basin_path' key as the
# basin folder below -- confirm this is intended.
cs_meta_path = cs.read_from_config(config_file, 'cs_basin_path')
cs_meta_name = cs.read_from_config(config_file, 'cs_meta_name')
cs_unusable_name = cs.read_from_config(config_file, 'cs_unusable_name')

# Folder that holds the per-basin data
cs_basin_folder = cs.read_from_config(config_file, 'cs_basin_path')
basins_path = Path(data_path) / cs_basin_folder

# Temporary data folder (created if it does not exist yet)
cs_temp_folder = cs.read_from_config(config_file, 'temp_path')
temp_path = Path(cs_temp_folder)
temp_path.mkdir(parents=True, exist_ok=True)

# Attribute output folder inside the basin folder (created if it does not exist yet)
att_folder = cs.read_from_config(config_file, 'att_path')
att_path = basins_path / att_folder
att_path.mkdir(parents=True, exist_ok=True)

### Data loading

In [7]:
# Load the CAMELS-spat metadata table.
# NOTE: cs_meta_path is rebound here from the config value to the full path under data_path,
# so re-running this cell without re-running the config cell joins data_path onto an
# already-joined path (harmless only when data_path is absolute).
cs_meta_path = Path(data_path, cs_meta_path)
cs_meta = pd.read_csv(cs_meta_path / cs_meta_name)

In [8]:
# Load the list of unusable stations.
# Station IDs are read as object (string) dtype so that leading zeros are not lost.
unusable_file = cs_meta_path / cs_unusable_name
cs_unusable = pd.read_csv(unusable_file, dtype={'Station_id': object})

### Processing

In [9]:
# Reminder printed before and after the processing loop while it runs in debugging mode
debug_message = (
    '\n!!! CHECK DEBUGGING STATUS: '
    '\n- Testing 1 file '
    '\n- Testing 1 basin'
)

In [10]:
# Datasets to process, in processing order.
# 'era5' must stay ahead of 'hydrology': the streamflow attributes use the
# precipitation dataset returned by the ERA5 step.
data_subfolders = [
    'era5',
    'worldclim',
    'hydrology',
    'lai',
    'forest_height',
    'glclu2019',
    'modis_land',
    'lgrip30',
    'merit',
    'hydrolakes',
    'pelletier',
    'soilgrids',
    'glhymps',
]

In [11]:
# Accumulator for the station IDs of processed basins; kept as a list so that
# a dataframe can be constructed efficiently afterwards
l_gauges = list()

In [12]:
print(debug_message)

# DEBUGGING: row label in cs_meta of the single basin processed during this test run
DEBUG_BASIN_INDEX = 46

# Loop over the basins in the metadata table and turn each basin's geospatial,
# forcing and streamflow data into a list of attribute values (l_values) with a
# matching list of attribute descriptions (l_index).
# NOTE: l_values and l_index are re-initialised for every basin, so after the loop
# they hold the attributes of the last processed basin only.
for ix, row in cs_meta.iterrows():

    # DEBUGGING: skip every basin except the one under test
    if ix != DEBUG_BASIN_INDEX:
        continue

    # Get the paths
    basin_id, shp_lump_path, shp_dist_path, _, _ = prepare_delineation_outputs(cs_meta, ix, basins_path)
    basin_folder = basins_path / 'basin_data' / basin_id
    geo_folder = basin_folder / 'geospatial'
    met_folder = basin_folder / 'forcing'
    hyd_folder = basin_folder / 'observations'

    # Data storage
    l_gauges.append(basin_id) # Update the Station list
    l_values = [] # Initialize an empty list where we'll store this basin's attributes
    l_index = [] # Initialize an empty list where we'll store the attribute descriptions

    # Define the shapefiles
    shp = str(shp_lump_path) # because zonalstats wants a file path, not a geodataframe
    riv = str(shp_dist_path).format('river') # For topographic attributes

    # The streamflow attributes need the precipitation returned by the ERA5 step for
    # THIS basin. Track whether that step has run so that a change in the order of
    # data_subfolders cannot silently reuse the precipitation of a previous basin.
    era5_processed = False

    # Data-specific processing
    print(f'Processing geospatial data into attributes for {basin_id}')
    for dataset in data_subfolders:
        print(f' - processing {dataset}')

        ## CLIMATE
        if dataset == 'era5':
            l_values, l_index, ds_precip, ds_era5 = csa.attributes_from_era5(met_folder, shp, 'era5', l_values, l_index)
            era5_processed = True
        elif dataset == 'worldclim':
            csa.oudin_pet_from_worldclim(geo_folder, dataset) # Get an extra PET estimate to sanity check ERA5 outcomes
            csa.aridity_and_fraction_snow_from_worldclim(geo_folder, dataset) # Get monthly aridity and fraction snow maps
            l_values, l_index = csa.attributes_from_worldclim(geo_folder, dataset, shp, l_values, l_index)

        ## LAND COVER
        elif dataset == 'forest_height':
            l_values, l_index = csa.attributes_from_forest_height(geo_folder, dataset, shp, l_values, l_index)
        elif dataset == 'lai':
            l_values, l_index = csa.attributes_from_lai(geo_folder, dataset, temp_path, shp, l_values, l_index)
        elif dataset == 'glclu2019':
            l_values, l_index = csa.attributes_from_glclu2019(geo_folder, dataset, shp, l_values, l_index)
        elif dataset == 'modis_land':
            l_values, l_index = csa.attributes_from_modis_land(geo_folder, dataset, shp, l_values, l_index)
        elif dataset == 'lgrip30':
            l_values, l_index = csa.attributes_from_lgrip30(geo_folder, dataset, shp, l_values, l_index)

        ## TOPOGRAPHY
        elif dataset == 'merit':
            l_values, l_index = csa.attributes_from_merit(geo_folder, dataset, shp, riv, row, l_values, l_index)

        ## OPENWATER
        elif dataset == 'hydrolakes':
            l_values, l_index = csa.attributes_from_hydrolakes(geo_folder, dataset, l_values, l_index)
        elif dataset == 'hydrology':
            if not era5_processed:
                raise RuntimeError(
                    f"Cannot process 'hydrology' for {basin_id} before 'era5': "
                    "the streamflow attributes need this basin's ERA5 precipitation. "
                    "List 'era5' ahead of 'hydrology' in data_subfolders."
                )
            l_values, l_index = csa.attributes_from_streamflow(hyd_folder, dataset, basin_id, ds_precip, row, l_values, l_index)

        ## SOIL
        elif dataset == 'pelletier':
            l_values, l_index = csa.attributes_from_pelletier(geo_folder, dataset, shp, l_values, l_index)
        elif dataset == 'soilgrids':
            l_values, l_index = csa.attributes_from_soilgrids(geo_folder, dataset, shp, l_values, l_index)

        ## GEOLOGY
        elif dataset == 'glhymps':
            l_values, l_index = csa.attributes_from_glhymps(geo_folder, dataset, l_values, l_index)

print(debug_message)


!!! CHECK DEBUGGING STATUS: 
- Testing 1 file 
- Testing 1 basin
Processing geospatial data into attributes for CAN_01AD002
 - processing era5
 - processing worldclim




 - processing hydrology


OMP: Info #276: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead.


 - processing lai
 - processing forest_height
 - processing glclu2019
 - processing modis_land
 - processing lgrip30
 - processing merit
 - processing hydrolakes
 - processing pelletier
 - processing soilgrids
 - processing glhymps

!!! CHECK DEBUGGING STATUS: 
- Testing 1 file 
- Testing 1 basin


In [13]:
# Sanity check: there must be exactly one description per attribute value
(len(l_values), len(l_index))

(1128, 1128)

#### Make the dataframe

In [14]:
# Row labels: one (Category, Attribute, Unit, Source) tuple per attribute value
multi_index = pd.MultiIndex.from_tuples(l_index, names=['Category', 'Attribute', 'Unit', 'Source'])

# Make the dataframe: a single column of attribute values, labelled with the station ID
# of the basin processed above (rather than a hard-coded ID, so the label always matches
# the data and the exported file name).
df = pd.DataFrame({basin_id: l_values}, index=multi_index)

df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,CAN_01AD002
Category,Attribute,Unit,Source,Unnamed: 4_level_1
Climate,num_years_era5,years,ERA5,70
Climate,mtpr_mean,mm,ERA5,1137.1210937712333
Climate,mtpr_std,mm,ERA5,136.74820016873835
Climate,mper_mean,mm,ERA5,209.9189803019856
Climate,mper_std,mm,ERA5,12.083002663320935
...,...,...,...,...
Geology,porosity_std,-,GLHYMPS,0.07108
Geology,log_permeability_min,m^2,GLHYMPS,-16.5
Geology,log_permeability_mean,m^2,GLHYMPS,-14.377311
Geology,log_permeability_max,m^2,GLHYMPS,-12.5


In [17]:
# Write this basin's attribute table to the attribute folder as attributes_<basin_id>.csv
att_file = 'attributes_{}.csv'.format(basin_id)
df.to_csv(att_path / att_file)