# Restructure data
This takes the work-in-progress files and moves them into the final data structure we want to upload.

Almost all of the data is catchment-based so we can put this into a parallel run. Things to remember:
- Do not redistribute the raw WorldClim data - this is not allowed.
- Remember to put the main attribute file into the resulting attributes folder.

In [1]:
import os
import geopandas as gpd
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import xarray as xr
import sys
import warnings
from datetime import datetime
from pathlib import Path
sys.path.append(str(Path().absolute().parent))
import python_cs_functions as cs

## Config handling

In [2]:
# Path of the config file that the settings are read from (relative to the working directory)
config_file = '../0_config/config.txt'

In [3]:
# Pull every setting this notebook needs out of the config file
def _cfg(key):
    """Return the value stored under `key` in the config file."""
    return cs.read_from_config(config_file, key)

# Root data folder
data_path = _cfg('data_path')

# CAMELS-spat metadata: folder and file names
# NOTE(review): the metadata folder is read from the same 'cs_basin_path' key
# as the basin folder below - confirm this is intended
cs_meta_path = _cfg('cs_basin_path')
cs_meta_name = _cfg('cs_meta_name')
cs_unusable_name = _cfg('cs_unusable_name')

# Basin folder, resolved against the data root
cs_basin_folder = _cfg('cs_basin_path')
basins_path = Path(data_path) / cs_basin_folder

# Main attribute file
# NOTE(review): cs_att_folder is read but not used to build att_path here - confirm
cs_att_folder = _cfg('att_path')
att_path = basins_path / 'camels_spat_attributes.csv'

# Folder the upload-ready structure is written to
final_fold = _cfg('final_path')

## Data loading

In [4]:
# CAMELS-spat metadata file
# Enforce reading IDs as string to keep leading 0's: the IDs are matched against
# the unusable-stations list (which is read the same way) and are concatenated
# with the country code as strings further down.
cs_meta_path = Path(data_path) / cs_meta_path
cs_meta = pd.read_csv(cs_meta_path / cs_meta_name, dtype={'Station_id': object})

In [5]:
# List of unusable stations; IDs are forced to be read as strings so leading 0's survive
cs_unusable = pd.read_csv(
    cs_meta_path / cs_unusable_name,
    dtype={'Station_id': object},
)

## General processing
Steps:
1. Set up the folder structure
2. Subset the meta-data file to only the 1426 basins we want to keep, and move it
3. Subset and move the main attributes file
4. Add a `readme`, `citations`, and `known issues` file
5. Run basic checks (same basins in meta-data and attributes; 1426 basins in total)

In [6]:
# Top-level folder that everything prepared for upload is written into
dest_root = Path(final_fold, 'camels-spat-upload')

In [7]:
# 1. Create the folder structure
# Basin size classes; used as a folder level under every main folder
standard_subfolders = [
    'headwater',
    'meso-scale',
    'macro-scale',
]

# Spatial aggregation levels; used to name the folders inside each forcing product
forcing_subfolders = [
    'gridded',
    'lumped',
    'distributed',
]

# Main folder -> its data-specific subfolders (an empty list means no subfolders)
specific_subfolders = {
    'attributes': [],
    'forcing': [
        'daymet',
        'em-earth',
        'era5',
        'rdrs',
    ],
    'geospatial': [
        'forest-height',
        'glclu2019',
        'glhymps',
        'hydrolakes',
        'lai',
        'lgrip30',
        'merit',
        'modis-land',
        'pelletier',
        'soilgrids',
        'worldclim-derived',
    ],
    'observations': [
        'obs-daily',
        'obs-hourly',
    ],
    'shapefiles': [
        'delineation-outcomes',
        'shapes-distributed',
        'shapes-forcing',
        'shapes-lumped',
        'shapes-reference',
    ],
}

In [493]:
# Build the upload tree: <dest_root>/<main folder>/<scale>/<subfolder>,
# with an extra <subfolder>-<aggregation> level for the forcing products.
# Every mkdir uses exist_ok=True, so this cell can safely be re-run.
upload_root = Path(dest_root)

# Geospatial data gets one dedicated metadata folder. It does not depend on the
# scale, so it is created once here instead of once per scale.
if 'geospatial' in specific_subfolders:
    (upload_root / 'geospatial' / '_metadata').mkdir(parents=True, exist_ok=True)

for main_folder, sub_folders in specific_subfolders.items():
    for scale in standard_subfolders:
        scale_dir = upload_root / main_folder / scale

        if not sub_folders:  # attributes: no subfolders below the scale level
            scale_dir.mkdir(parents=True, exist_ok=True)
            continue

        for sub_folder in sub_folders:
            if main_folder == 'forcing':
                # One folder per aggregation level, named <product>-<aggregation>
                for aggregation in forcing_subfolders:
                    (scale_dir / sub_folder / f"{sub_folder}-{aggregation}").mkdir(parents=True, exist_ok=True)
            else:  # not forcing
                (scale_dir / sub_folder).mkdir(parents=True, exist_ok=True)

# Create an ERA5-invariant folder
for scale in standard_subfolders:
    (upload_root / 'forcing' / scale / 'era5' / 'era5-invariants').mkdir(parents=True, exist_ok=True)

In [8]:
# 2. Meta-data
# Keep only the stations whose (Country, Station_id) pair is not in the unusable list
key_cols = ['Country', 'Station_id']
unusable_keys = cs_unusable.set_index(key_cols).index
is_unusable = cs_meta.set_index(key_cols).index.isin(unusable_keys)
cs_meta_upload = cs_meta[~is_unusable]

In [None]:
# Write the subsetted metadata to the top level of the upload folder, without the index column
meta_dest = dest_root / 'camels-spat-metadata.csv'
cs_meta_upload.to_csv(meta_dest, index=False)

In [9]:
# 3. Attributes
# The attribute file is filtered by column name: drop the columns named
# <Country>_<Station_id> for every unusable station.
cs_att = pd.read_csv(att_path, low_memory=False)
drop_these = (
    cs_unusable['Country']
    + "_"
    + cs_unusable['Station_id']
)
# errors='ignore': unusable stations without a column in the file are skipped silently
cs_att_upload = cs_att.drop(
    columns=drop_these,
    errors='ignore',
)

In [None]:
# Write the subsetted attributes into the attributes folder, without the index column
att_dest = dest_root / 'attributes' / 'attributes-lumped.csv'
cs_att_upload.to_csv(att_dest, index=False)

In [7]:
# 4. readme, citations, known issues
# We'll add these manually

In [10]:
# 5. Basic check(s)
# 5.1 Ensure we have the same basins in meta and attributes
meta_basins = (cs_meta_upload['Country'] + '_' + cs_meta_upload['Station_id']).values

# Everything in the attribute file that is not a descriptor column is a basin column.
# list.remove raises ValueError if a descriptor column is unexpectedly missing.
attr_basins = cs_att_upload.columns.to_list()
for descriptor_col in ('Category', 'Attribute', 'Unit', 'Source'):
    attr_basins.remove(descriptor_col)

# Compare as plain lists: an array-vs-list `==` gives no element-wise result when
# the lengths differ, so the assertion message below would never be shown in
# exactly the case it is meant to report.
assert list(meta_basins) == attr_basins, "Basins not in same order and/or mismatches"

# 5.2 Ensure we have the expected number of basins (1426)
assert len(meta_basins) == 1426, "Number of basins not 1426"