# Add time_bnds to forcing
Forcing data currently come without time-bounds information, which has to be gathered from the documentation (ERA5) or from personal communication (EM-Earth). Here we add a `time_bnds` variable to the forcing files so that the period over which each value is valid is unambiguous.

- ERA5: https://confluence.ecmwf.int/pages/viewpage.action?pageId=82870405#ERA5:datadocumentation-Table4 (access: 2024-01-03)
- EM-Earth: `For precipitation, it is accumulated value using the period-beginning format. For example, 0:00 value is the accumulated precipitation 0:00-1:00.` (G. Tang, personal communication, 2024)


In [1]:
import glob
import sys
import pandas as pd
from pathlib import Path
sys.path.append(str(Path().absolute().parent))
import python_cs_functions as cs

### Config handling

In [2]:
# Specify where the config file can be found.
# This is a relative path, so it is resolved against the current working directory.
config_file = '../0_config/config.txt'

In [3]:
# Get the required info from the config file
# Root folder that the other configured locations are joined onto
data_path     = cs.read_from_config(config_file,'data_path')

# CAMELS-spat metadata
# NOTE(review): 'cs_meta_path' is read from the 'cs_basin_path' key, i.e. the same key
#  that is used for 'cs_basin_folder' below - confirm that the metadata files are
#  really expected inside the basin folder and that this is not a copy-paste slip
cs_meta_path  = cs.read_from_config(config_file,'cs_basin_path')
cs_meta_name  = cs.read_from_config(config_file,'cs_meta_name')
cs_unusable_name = cs.read_from_config(config_file,'cs_unusable_name')

# Basin folder
cs_basin_folder = cs.read_from_config(config_file, 'cs_basin_path')
# Full path to the basin folder: <data_path>/<cs_basin_folder>
basins_path = Path(data_path) / cs_basin_folder

### Data loading

In [4]:
# CAMELS-spat metadata file
# Rebinds cs_meta_path from the raw config value to a full path under data_path.
# NOTE(review): because the name is reused, re-running this cell joins data_path onto
#  an already-joined path; pathlib only discards the left operand when the right one
#  is absolute, so this is presumably harmless only for an absolute data_path - confirm
cs_meta_path = Path(data_path) / cs_meta_path
cs_meta = pd.read_csv(cs_meta_path / cs_meta_name)
cs_unusable = pd.read_csv(cs_meta_path / cs_unusable_name,  dtype={'Station_id': object}) # Enforce reading IDs as string to keep leading 0's

### Processing
- Loop over all catchments
    - List all forcing files
    - if ERA5: period-ending time_bnds
    - if EM-Earth: period-starting time_bnds

In [5]:
# Banner printed before and after the processing loop as a reminder that the
# loop is currently restricted to a single basin
debug_message = (
    '\n!!! CHECK DEBUGGING STATUS: \n'
    '- Testing 1 basin\n'
)

In [6]:
print(debug_message)
for ix, row in cs_meta.iterrows():

    # DEBUGGING: only the first row of the metadata table is processed
    if ix != 0:
        continue

    # Get the basin ID and the forcing folders (folders only, not file names)
    basin_id, _, _, _, _ = cs.prepare_delineation_outputs(cs_meta, ix, basins_path)
    raw_fold, lump_fold, dist_fold = cs.prepare_forcing_outputs(cs_meta, ix, basins_path)
    print(f'--- Now running basin {ix}. {basin_id}')

    # Move on to the next station if there are no flow observations of either
    # kind ('iv' and 'dv') for this one
    missing = cs.flow_obs_unavailable(cs_unusable, row.Country, row.Station_id)
    if all(obs_type in missing for obs_type in ('iv', 'dv')):
        continue

    # Collect the netCDF files in each forcing folder, sorted per folder
    raw_files = sorted(glob.glob(str(raw_fold / '*.nc')))
    lump_files = sorted(glob.glob(str(lump_fold / '*.nc')))
    dist_files = sorted(glob.glob(str(dist_fold / '*.nc')))
    all_files = raw_files + lump_files + dist_files

    # Get LST for this station
    LST = row['dv_flow_obs_timezone']

    # Open files, add time_bnds, and close
    for file in all_files:

        # 'ERA5_2023-01-01_invariants.nc' doesn't have a time variable and
        #  thus doesn't need a time_bnds addition
        if 'invariant' in file:
            continue

        # Each data set treats timestamps differently, so work out which one
        # this file belongs to; files matching neither are left untouched
        file_lower = file.lower()
        if 'era5' in file_lower:
            source = 'era5'
        elif 'earth' in file_lower:
            source = 'em-earth'
        else:
            continue

        cs.add_time_bnds(file, source, LST)

print(debug_message)


!!! CHECK DEBUGGING STATUS: 
- Testing 1 basin

--- Now running basin 0. CAN_01AD002

!!! CHECK DEBUGGING STATUS: 
- Testing 1 basin



### Checks

In [7]:
import xarray as xr

In [8]:
# Visual check: for every forcing file, print the first and last timestamp
# together with the time_bnds stored for them in the file
for file in all_files:
    if 'invariant' in file:
        print(f'\nSkipping {file}')
        continue

    if 'era5' in file.lower():
        print('\nERA5 file')
    elif 'earth' in file.lower():
        print('\nEM-Earth file')

    # The context manager closes the file even if 'time' or 'time_bnds' is
    # missing and the lookup below raises
    with xr.open_dataset(file) as test:

        # Index 0 is the first time step, index -1 the last one. The leading
        # space in ' Last' keeps both labels aligned in the printed output.
        for label, t in (('First', 0), (' Last', -1)):
            ts = test['time'].values[t]
            tb = test['time_bnds'].values[:,t] # time_bnds is indexed as (bound, time)
            print(f'{label} timestamp:   {ts}')
            print(f'         bounds: {tb}')


EM-Earth file
First timestamp:   1950-01-01T00:00:00.000000000
         bounds: ['1950-01-01T00:00:00.000000000' '1950-01-01T01:00:00.000000000']
 Last timestamp:   1950-01-31T23:00:00.000000000
         bounds: ['1950-01-31T23:00:00.000000000' '1950-02-01T00:00:00.000000000']

ERA5 file
First timestamp:   1950-01-01T00:00:00.000000000
         bounds: ['1949-12-31T23:00:00.000000000' '1950-01-01T00:00:00.000000000']
 Last timestamp:   1950-01-31T23:00:00.000000000
         bounds: ['1950-01-31T22:00:00.000000000' '1950-01-31T23:00:00.000000000']

ERA5 file
First timestamp:   1950-02-01T00:00:00.000000000
         bounds: ['1950-01-31T23:00:00.000000000' '1950-02-01T00:00:00.000000000']
 Last timestamp:   1950-02-28T23:00:00.000000000
         bounds: ['1950-02-28T22:00:00.000000000' '1950-02-28T23:00:00.000000000']

ERA5 file
First timestamp:   1950-03-01T00:00:00.000000000
         bounds: ['1950-02-28T23:00:00.000000000' '1950-03-01T00:00:00.000000000']
 Last timestamp:   1950-03-3

### Functions

In [6]:
import netCDF4 as nc4
import numpy as np
import time

In [7]:
def add_time_bnds(file, dataset=[], timezone=[]):

    '''Adds a time_bnds variable to a netcdf.

    The file is opened in append mode and modified in place: an 'nbnds'
    dimension and variable, a 'time_bnds' variable with dimensions
    ('nbnds','time'), a 'bounds' attribute on 'time', and an extended global
    'History' attribute are written.

    Parameters
    ----------
    file : path to the netCDF file to update.
    dataset : 'era5' or 'em-earth' (case-insensitive). Selects how timestamps
        are interpreted:
        - 'era5': period-ending, t(n) is valid over t(n)-1 to t(n)
        - 'em-earth': period-starting, t(n) is valid over t(n) to t(n)+1
        The list default is only a placeholder kept for signature
        compatibility; it is never mutated and is rejected as invalid.
    timezone : value stored as-is in the 'time_zone' attribute of 'time_bnds'.

    Raises
    ------
    ValueError : if dataset is not one of the supported names, or if the
        units of the 'time' variable are not expressed in hours.
    '''

    # Validate the requested data set before the file is opened for writing.
    # Explicit raises are used instead of assert so that the checks are not
    # stripped when Python runs with -O.
    source = dataset.lower() if isinstance(dataset, str) else None
    if source not in ('era5', 'em-earth'):
        raise ValueError(f'add_time_bnds() contains no settings for dataset = {dataset}')

    with nc4.Dataset(file, 'a') as f: # (a)ppend

        time_var = f['time']

        # The +/- 1 offsets used below are only meaningful for time units in hours
        if 'hours' not in time_var.getncattr('units'):
            raise ValueError(f'ERROR: time units in {file} not in hours')

        # Connect variable 'time_bnds' to variable 'time' through time attribute 'bounds'
        time_var.setncattr('bounds','time_bnds')

        # Add nbnds dimension
        # NOTE(review): on a file that already went through this function the
        #  dimension already exists and this call presumably fails - confirm
        #  whether re-runs need to be supported
        f.createDimension('nbnds', 2)
        f.createVariable('nbnds', 'i', 'nbnds')
        f.variables['nbnds'][:] = [1,2]
        f['nbnds'].setncattr('standard_name','bounds for timestep intervals')

        # Add time_bnds variable
        # NOTE(review): the CF conventions put the bounds dimension last, i.e.
        #  ('time','nbnds'). The order is kept as-is here because existing
        #  readers may index this variable as [bound, time] - confirm before changing
        f.createVariable('time_bnds', time_var.datatype, ('nbnds','time'), fill_value = False, zlib=True, shuffle=True)
        f['time_bnds'].setncattr('long_name', 'start and end points of each time step')
        f['time_bnds'].setncattr('time_zone', timezone)
        f['time_bnds'].setncattr('calendar', time_var.getncattr('calendar'))
        f['time_bnds'].setncattr('units', time_var.getncattr('units'))

        # Add the actual data; the time values are read from file only once
        time_values = time_var[:]
        if source == 'era5':
            bounds = np.array([time_values-1, time_values]) # Period-ending timestamps: t(n) is valid over t(n-1) to t(n)
        else: # 'em-earth'
            bounds = np.array([time_values, time_values+1]) # Period-starting timestamps: t(n) is valid over t(n) to t(n+1)
        f['time_bnds'][:] = bounds

        # Update the file history. A file without a 'History' attribute gets a
        # fresh one instead of failing after the file has already been modified.
        new_history = f' On {time.ctime(time.time())}: add_time_bnds().'
        if 'History' in f.ncattrs():
            old_history = f.getncattr('History')
            hist = f'{old_history} {new_history}'
        else:
            hist = new_history.strip()
        f.setncattr('History',hist)