# Subset EM-Earth data to basins

In [1]:
import glob
import os
import sys
import pandas as pd
from datetime import datetime
from pathlib import Path
sys.path.append(str(Path().absolute().parent))
import python_cs_functions as cs

### Config handling

In [2]:
# Relative location of the project configuration file that the cells below read from
config_file = "../0_config/config.txt"

In [3]:
# Pull every setting this notebook needs out of the config file.

# Root data folder; the metadata and basin folders below are joined onto it
data_path = cs.read_from_config(config_file, 'data_path')

# CAMELS-spat metadata: folder plus the two file names used in this notebook.
# Note that the metadata folder is read from the same 'cs_basin_path' key as the basin folder.
cs_meta_path = cs.read_from_config(config_file, 'cs_basin_path')
cs_meta_name = cs.read_from_config(config_file, 'cs_meta_name')
cs_unusable_name = cs.read_from_config(config_file, 'cs_unusable_name')

# Folder that holds the per-basin data, resolved against the data root
cs_basin_folder = cs.read_from_config(config_file, 'cs_basin_path')
basins_path = Path(data_path).joinpath(cs_basin_folder)

# Temporary download folder (where the EM-Earth source files are looked up later)
temp_folder = Path(cs.read_from_config(config_file, 'temp_path'))

### Data loading

In [4]:
# Resolve the metadata folder against the data root, then load the CAMELS-spat metadata table
cs_meta_path = Path(data_path).joinpath(cs_meta_path)
cs_meta = pd.read_csv(cs_meta_path.joinpath(cs_meta_name))

In [5]:
# Load the table of unusable stations.
# Station IDs are read with dtype object so that leading zeros are not lost to numeric parsing.
unusable_file = cs_meta_path.joinpath(cs_unusable_name)
cs_unusable = pd.read_csv(unusable_file, dtype={'Station_id': object})

## Processing

In [6]:
# Locate the merged EM-Earth deterministic hourly netCDF files in the temporary folder
em_earth_fold = temp_folder.joinpath('EM_Earth_v1', 'deterministic_hourly', 'merged')
nc_pattern = str(em_earth_fold / '*.nc')
em_earth_files = glob.glob(nc_pattern)
em_earth_files.sort()  # plain list of path strings, in sorted order

In [7]:
# Reminder printed before and after the main loop while the debugging restrictions are active.
# No placeholders are interpolated, so a plain string literal is sufficient.
debug_message = '\n!!! CHECK DEBUGGING STATUS: \n- Testing 1 file \n- Testing 1 basin'

In [8]:
# Main loop: for every basin in the metadata table, cut the EM-Earth source files down to the
# basin's bounding box for the months covered by its flow observations, then draw a figure to
# check that the subset covers the basin.
print(debug_message)

# The basin root folder is the same for every basin, so build it once outside the loop
basin_root = Path(data_path) / cs_basin_folder

for ix, row in cs_meta.iterrows():

    # DEBUGGING
    if ix != 0: continue

    # Get shapefile path to determine download coordinates, and forcing destination path
    basin_id, shp_lump_path, shp_dist_path, _, _ = cs.prepare_delineation_outputs(cs_meta, ix, basin_root)
    raw_fold, _, _ = cs.prepare_forcing_outputs(cs_meta, ix, basin_root) # Returns folders only, not file names
    print(f'--- Now running basin {ix}. {basin_id}')

    # From shapefile, get bounding coordinates. Then determine download coordinates from those
    bounds = cs.find_shapefile_bounds(shp_lump_path)
    coords_eme, _, _ = cs.find_download_coords_from_bounds(bounds, target='EM-Earth')

    # Check if we need to run downloads for this station at all
    missing = cs.flow_obs_unavailable(cs_unusable, row.Country, row.Station_id)
    if 'iv' in missing and 'dv' in missing:
        continue # with next station, because we have no observations at all for this station

    # From meta-data, get download period
    times_flow = cs.find_flow_obs_times_from_metadata(row, missing)
    times_era5 = cs.round_flow_obs_to_days(times_flow)
    start_date = datetime.strptime(times_era5[0], '%Y-%m-%d')
    final_date = datetime.strptime(times_era5[1], '%Y-%m-%d')

    print(f'    Basin coordinates:            {bounds}')
    print(f'    EM-Earth subset coordinates: [{coords_eme}]')
    print(f'    Flow obs unavailable:         {missing}')
    print(f'    Download times:               {times_era5}')

    # Convert start and end dates into two lists of start and end dates, that we'll iterate over
    date_list,_ = cs.convert_start_and_end_dates_to_era5_download_lists(start_date,final_date) # not the cleanest but this lets us reuse old code
    subset_strings = [date_obj.strftime("%Y-%m") for date_obj in date_list] # convert datetime objects to yyyy-mm strings

    # Keep only the source files whose path contains one of the requested yyyy-mm strings
    infiles = [file for file in em_earth_files if any(subset_string in file for subset_string in subset_strings)]

    # Remember the last subset file that belongs to THIS basin. Resetting it per basin prevents
    # the coverage figure below from using an undefined name (no matching files at all) or a
    # stale file left over from a previous basin.
    last_outfile = None
    for infile in infiles:
        if not os.path.exists(infile):
            print(f'    ERROR: source file {infile} not found.')
            continue

        file_name = os.path.basename(infile).replace('deterministic_hourly_NorthAmerica_','') # Make the name more similar to ERA5_YYYY-MM.nc
        outfile = raw_fold/file_name
        if not os.path.exists(outfile):
            cs.extract_ERA5_subset(infile,outfile,coords_eme)
        last_outfile = outfile

    # Without any subset file there is nothing to plot for this basin
    if last_outfile is None:
        print(f'    WARNING: no EM-Earth subset files found for basin {basin_id}; skipping coverage figure.')
        continue

    # Create a figure to check if we actually cover the right domain with this
    fig_file = raw_fold.parent / f'{row.Country}_{row.Station_id}_em_earth_coverage.png'
    cs.compare_forcing_data_and_shape_extents(fig_file, last_outfile, shp_lump_path, nc_var='tmean', nc_time=0)

print(debug_message)


!!! CHECK DEBUGGING STATUS: 
- Testing 1 file 
- Testing 1 basin
--- Now running basin 0. CAN_01AD002
    Basin coordinates:            [-70.43208333  45.98541667 -68.07125     47.83791667]
    EM-Earth subset coordinates: [47.85/-70.45/45.95/-68.05]
    Flow obs unavailable:         ['iv', nan]
    Download times:               ['1950-01-01', '2020-12-31']

!!! CHECK DEBUGGING STATUS: 
- Testing 1 file 
- Testing 1 basin
