# Move updated flow files
Apparently the current camels-spat resource contains older netCDF files with flow data: these have timesteps in UTC and were created with older hourly-averaging code. The new files lived on my laptop, and we're now checking them against the old ones and moving them into the update folder.

In [39]:
import numpy as np
import pandas as pd
from pathlib import Path
import shutil
import sys
import xarray as xr

import netCDF4 as nc4
import time
from datetime import timedelta
from datetime import datetime

sys.path.append(str(Path().absolute().parent))
import python_cs_functions as cs

In [2]:
# Data location: current upload folder, holding the metadata CSV and the existing (old) flow netCDF files
cs_main_folder = Path("/scratch/gwf/gwf_cmt/wknoben/camels-spat-upload")

In [3]:
# Destination location: update folder that receives the checked new flow files
cs_update_folder = Path("/scratch/gwf/gwf_cmt/wknoben/camels-spat-upload-updates")

In [115]:
# Transfer folder: flat folder in which the new netCDF files are looked up
cs_transfer_folder = Path("/scratch/gwf/gwf_cmt/wknoben/TEMP_flow_transfers")

In [50]:
# Working folder: scratch space in which per-basin temporary folders (tmp_<basin>) are created
cs_working_folder = Path("/scratch/gwf/gwf_cmt/wknoben/camels-spat-wip")

In [5]:
# Specify the folder structure: <root>/<obs_path_part1>/<scale>/<resolution>/*.nc
obs_path_part1 = "observations"
obs_path_parts2 = ["headwater", "macro-scale", "meso-scale"]  # spatial scale sub-folders
obs_path_parts3 = ["obs-hourly","obs-daily"]  # temporal resolution sub-folders

### Functions

In [6]:
def confirm_netcdf_basics(ds, country, basin, tz_expected):
    """Check that a flow netCDF has the expected attributes, variables, dimensions and time zone.

    Parameters
    ----------
    ds : xarray.Dataset
        Opened flow observation file.
    country : str
        Expected value of the global `country` attribute.
    basin : str
        Expected value of the global `station` attribute.
    tz_expected : str
        Expected `time_zone` attribute of the `time_bnds` variable.

    Raises
    ------
    AssertionError
        On any mismatch. The one exception is a time zone mismatch when
        `tz_expected` is 'NST': that case is only printed, so that the
        remaining basins can still be processed.
    """
    assert ds.country == country, "attribute mismatch: country"
    assert ds.station == basin, "attribute mismatch: basin"

    # Extra variables (e.g. the '_is_' flags) are allowed; the key ones must be present
    expected_vars = {'q_obs_data_quality', 'q_obs', 'time_bnds'}
    assert expected_vars.issubset(ds.data_vars), "key variable missing"

    # Iterate over `ds.dims` rather than calling `.keys()` on it: iteration yields the
    # dimension names both for the current mapping and for a plain set of names
    expected_dims = {'time', 'nbnds'}
    assert set(ds.dims) == expected_dims, "dimensions mismatch"

    tz_actual = ds['time_bnds'].time_zone
    if tz_expected == 'NST':
        # Known special case: report but do not stop
        if tz_actual != 'NST':
            print(f"Basin {basin}: expected timezone {tz_expected} but found {tz_actual}")
    else:
        assert tz_actual == tz_expected, f"time zone expected as {tz_expected} but is {tz_actual}"

In [7]:
def compare_netcdf_sizes_and_vars(ds1, ds2):
    """Assert that two datasets hold the same data variables and are (almost) equally long.

    The lengths of `time` and `q_obs` may differ by at most one timestep between
    the two datasets; a larger gap raises an AssertionError.
    """
    names_first, names_second = set(ds1.data_vars), set(ds2.data_vars)
    assert names_first == names_second, "old/new mismatch: variables"

    checks = (('time', "length mismatch: time"),
              ('q_obs', "length mismatch: qobs"))
    for key, message in checks:
        gap = len(ds1[key]) - len(ds2[key])
        assert -1 <= gap <= 1, message  # tolerate a single timestep of difference

In [8]:
def compare_daily_netcdfs(ds1,ds2,basin):
    """Check that the daily flows of two datasets match and report weak time overlap.

    Parameters
    ----------
    ds1, ds2 : xarray.Dataset
        Old and new daily flow datasets; both need `q_obs` and `time`.
    basin : str
        Station ID, only used in the printed messages.

    Raises
    ------
    AssertionError
        If the `q_obs` values are not close to each other (NaNs in the same
        position count as equal).
    """
    assert np.allclose(ds1['q_obs'].values, ds2['q_obs'].values, equal_nan=True), \
        f"Basin {basin}: old and new daily flows are not allclose()"
    if not ds1['q_obs'].identical(ds2['q_obs']):
        print(f"Basin {basin}: flows allclose() but not identical()")

    # check that time periods are roughly the same:
    # fraction of ds1 timestamps that also occur in ds2 should be within 1% of 1
    common_times = np.intersect1d(ds1['time'].values, ds2['time'].values)
    if abs(len(common_times) / len(ds1['time']) - 1) > 0.01:
        print(f"Basin {basin}: old and new daily timesteps don't overlap for more than 1% of timesteps")

In [9]:
def compare_hourly_netcdfs(ds1,ds2,basin):
    """Print a warning for each hourly-flow statistic that deviates between two datasets.

    Hourly values are known to differ between the two files, so nothing is
    asserted here. For `q_obs` the relative difference in mean, the relative
    difference in standard deviation and the correlation shortfall (r - 1) are
    computed; a message is printed for each whose magnitude exceeds 0.01. A
    final message is printed when the fraction of ds1 timestamps that also
    occur in ds2 deviates from 1 by more than 0.01.
    """
    flow_a, flow_b = ds1['q_obs'], ds2['q_obs']

    # relative deviation of the mean
    rel_mean = (flow_a.mean() / flow_b.mean() - 1).values
    if abs(rel_mean) > 0.01:
        print(f"Basin {basin}: old and new hourly mean flow more than 1% different ({rel_mean:4f})")

    # relative deviation of the spread
    rel_std = (flow_a.std() / flow_b.std() - 1).values
    if abs(rel_std) > 0.01:
        print(f"Basin {basin}: old and new hourly standard deviation of flow more than 1% different ({rel_std:4f})")

    # distance of the correlation from a perfect 1
    series_a = pd.Series(flow_a)
    series_b = pd.Series(flow_b)
    corr_gap = series_a.corr(series_b) - 1
    if abs(corr_gap) > 0.01:
        print(f"Basin {basin}: old and new hourly correlation of flow more than 1% different ({corr_gap:4f})")

    # share of timestamps the two files have in common
    times_a = ds1['time']
    shared = np.intersect1d(times_a.values, ds2['time'].values)
    overlap_gap = len(shared) / len(times_a) - 1
    if abs(overlap_gap) > 0.01:
        print(f"Basin {basin}: old and new hourly timesteps don't overlap for more than 1% of timesteps")

### Processing

In [10]:
# Load the meta-data, so we know which basins we have
cs_meta = pd.read_csv(cs_main_folder / "camels-spat-metadata.csv")

In [11]:
# Find all new files (every netCDF file sitting directly in the transfer folder)
new_files = list(cs_transfer_folder.glob("*.nc"))

In [12]:
# Loop over the upload data to check which basins we have.
# Each tracking list below receives exactly one entry per successfully copied file,
# so the lists stay aligned and can be combined into a single table afterwards.
countries = []
basins_updated = []
scales = []       # spatial scale folder of each file (an entry of obs_path_parts2)
times = []        # temporal resolution folder of each file (an entry of obs_path_parts3)
files_moved = []  # destination path of each copied file

# Index the new files by their exact file name: one dictionary lookup per basin instead
# of scanning the full list, and no accidental substring matches on other path parts
new_file_by_name = {p.name: p for p in new_files}

for obs_path_part2 in obs_path_parts2:
    for obs_path_part3 in obs_path_parts3:

        # Find the current files (sorted so the processing order is reproducible)
        obs_middle = f"{obs_path_part1}/{obs_path_part2}/{obs_path_part3}"
        obs_files = sorted((cs_main_folder / obs_middle).glob("*.nc"))
        des_folder = cs_update_folder / obs_path_part1 / obs_path_part2 / obs_path_part3

        # Loop over files
        for obs_file in obs_files:

            # File names start with <country>_<station>_
            file_name = obs_file.name
            country, basin = file_name.split("_")[:2]

            # Confirm we have a new file
            assert file_name in new_file_by_name, f"No new file found for {basin} ({file_name})"
            new_file = new_file_by_name[file_name]

            # Find the expected timezone
            tz = cs_meta.loc[cs_meta['Station_id'] == basin, 'dv_flow_obs_timezone'].iloc[0]

            # Open both files; the context manager closes them again, also when a check fails
            with xr.open_dataset(obs_file) as old, xr.open_dataset(new_file) as new:

                # Compare the new file, depending on what case we're dealing with
                confirm_netcdf_basics(old, country, basin, 'UTC') # key attributes, time & nbnds dims
                confirm_netcdf_basics(new, country, basin, tz)
                compare_netcdf_sizes_and_vars(old,new) # time and qobs same length, same vars
                if 'hourly' in obs_path_part3:
                    compare_hourly_netcdfs(old,new,basin)
                elif 'daily' in obs_path_part3:
                    compare_daily_netcdfs(old,new,basin)

            # Copy the new file into the update folder
            des_folder.mkdir(exist_ok=True, parents=True)
            copied_to = shutil.copy(new_file, des_folder/file_name)

            # Tracking: only record files that passed all checks and were copied
            countries.append(country)
            basins_updated.append(basin)
            scales.append(obs_path_part2)
            times.append(obs_path_part3)
            files_moved.append(copied_to)

Basin 02038850: old and new hourly correlation of flow more than 1% different (-0.013663)
Basin 10259200: old and new hourly correlation of flow more than 1% different (-0.012054)
Basin 06879650: old and new hourly correlation of flow more than 1% different (-0.126134)
Basin 09508300: old and new hourly correlation of flow more than 1% different (-0.010310)
Basin 03368000: old and new hourly correlation of flow more than 1% different (-0.016509)
Basin 11180960: old and new hourly correlation of flow more than 1% different (-0.015299)
Basin 07083000: old and new hourly mean flow more than 1% different (-0.014278)
Basin 02055100: old and new hourly correlation of flow more than 1% different (-0.016752)
Basin 11180500: old and new hourly correlation of flow more than 1% different (-0.013549)
Basin 03291780: old and new hourly correlation of flow more than 1% different (-0.016070)
Basin 03237280: old and new hourly correlation of flow more than 1% different (-0.022489)
Basin 08103900: old 

## Checks
Updated daily values are 100% identical, just with different timesteps. Updated hourly averages are mostly within 1% for a couple of important statistics. There is one gauge (02ZH002) where, somehow, our new data isn't in the expected timezone (NST). We'll need to take this into account later.

In [17]:
# Check we moved what was expected.
# `basins_updated` holds one entry per processed file, so a basin appears once for every
# temporal resolution it has: count unique station IDs instead of halving the list length.
print(f"Updated {len(set(basins_updated))} out of expected {len(cs_meta)} basins.")

Updated 1426.0 out of expected 1426 basins.


In [26]:
# Overview table with one row per processed file.
# NOTE(review): this needs `scales`, `times` and `files_moved` to be filled alongside
# `countries` and `basins_updated` in the processing loop (one entry per file, same
# order) -- confirm that is the case before running this on a fresh kernel.
check_df = pd.DataFrame(data={'country': countries,
                              'station': basins_updated,
                              'scale': scales,
                              'resolution': times,
                              'file':files_moved})

In [34]:
# It's pretty much a given, but confirm that we have everything we need.
# The names used here deliberately differ from `new_files` (the transfer-folder listing
# used by the processing loop), so that re-running that loop after this cell still works.
for obs_path_part2 in obs_path_parts2:
    for obs_path_part3 in obs_path_parts3:

        # Collect the file names present in the original and in the update folder
        obs_middle = f"{obs_path_part1}/{obs_path_part2}/{obs_path_part3}"
        old_names = {p.name for p in (cs_main_folder / obs_middle).glob("*.nc")}
        updated_names = {p.name for p in (cs_update_folder / obs_middle).glob("*.nc")}

        # Make sure these match
        only_in_old = old_names - updated_names
        only_in_new = updated_names - old_names

        # report
        print(f"\nChecking: {obs_middle}")
        print("Missing in new:")
        print(only_in_old)
        print("Missing in old:")
        print(only_in_new)


Checking: observations/headwater/obs-hourly
Missing in new:
set()
Missing in old:
set()

Checking: observations/headwater/obs-daily
Missing in new:
set()
Missing in old:
set()

Checking: observations/macro-scale/obs-hourly
Missing in new:
set()
Missing in old:
set()

Checking: observations/macro-scale/obs-daily
Missing in new:
set()
Missing in old:
set()

Checking: observations/meso-scale/obs-hourly
Missing in new:
set()
Missing in old:
set()

Checking: observations/meso-scale/obs-daily
Missing in new:
set()
Missing in old:
set()


## Update 02ZH002
Here we'll update the new files for basin `02ZH002` so that everything is accounted for before the next step.

In [59]:
basin = '02ZH002'  # station ID of the gauge that needs a corrected file
site = f"CAN_{basin}"  # country-prefixed station name, the prefix of this basin's flow file names
row = cs_meta[cs_meta['Station_id'] == basin].iloc[0]  # metadata record of this basin

In [53]:
# Temporary folder for this basin inside the working folder (created if it does not exist yet)
cs_tmp = cs_working_folder / f"tmp_{basin}"
cs_tmp.mkdir(exist_ok=True)

In [37]:
# Manually move the daily and hourly CSV files into working folder

#### Daily

In [184]:
# 1. Get paths (built from `site`, so they always belong to the basin selected above)
csv_path = cs_tmp / f'{site}_daily_raw_flow_observations.csv'
nc_path = cs_tmp / f'{site}_daily_flow_observations.nc'

In [161]:
# 2. Load the csv through the project helper from python_cs_functions,
#    passing the country and time zone recorded in this basin's metadata row
csv = cs.prep_daily_country_csv_for_netcdf(csv_path, row.Country, row.dv_flow_obs_timezone)

In [162]:
# 3. Add the time bounds for daily averages in LST:
#    each interval starts at the row's own timestamp and ends 24 hours later
csv['time_bnds_l'] = csv.index
csv['time_bnds_r'] = csv.index + pd.Timedelta(hours=24)

In [158]:
def daily_flow_csv_to_netcdf(csv, nc_path, country, station):
    """Convert a prepared daily flow table into an annotated netCDF file.

    The table is turned into an xarray Dataset, global and per-variable
    attributes are attached, a `time_bnds` variable is built from the
    `time_bnds_l` / `time_bnds_r` columns, the helper columns are dropped and
    the result is written to `nc_path`.

    Parameters
    ----------
    csv : pandas.DataFrame
        Daily flow data. The code below reads the columns `q_obs`,
        `q_obs_data_quality`, `q_obs_tz_cd`, `time_bnds_l` and `time_bnds_r`;
        any column with '_is_' in its name is treated as a 0/1 status flag.
        NOTE(review): the index is presumably named 'time', because the
        resulting dataset is accessed through `ds.time` -- confirm.
    nc_path : path-like
        Output netCDF file.
    country : str
        Country code, looked up in ['USA', 'CAN', 'MEX'].
    station : str
        Station ID, stored as the global `station` attribute.

    Raises
    ------
    ValueError
        If `country` is not in the list of known country codes.
    AssertionError
        If `q_obs_tz_cd` contains more than one distinct value.

    NOTE(review): 'MEX' passes the country lookup, but the institution and
    reference lists only have two entries, so country='MEX' would fail with an
    IndexError further down.
    """

    # 1. Define standard values
    # -------------------------
    
    # Auxiliary
    global_att_countries = ['USA', 'CAN', 'MEX']
    global_att_i = global_att_countries.index(country)  # position used to pick institution/reference below
    global_att_now = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    
    # Global attributes
    global_att_ttl = 'CAMELS-spat streamflow data'
    global_att_con = 'CF-1.10'
    global_att_src = 'Streamflow derived from observed water levels'
    global_att_ins = ['United States Geological Survey',
                      'Water Survey of Canada']
    global_att_ref = [('U.S. Geological Survey, 2016, National Water Information System data available ' +
                       'on the World Wide Web (USGS Water Data for the Nation), accessed 2023-03-23, at '+
                       'URL [http://waterdata.usgs.gov/nwis/]'),
                      ('Original data extracted from the Environment and Climate Change Canada Real-time ' +
                       'Hydrometric Data web site (https://wateroffice.ec.gc.ca/mainmenu/real_time_data_index_e.html) ' + 
                       'on 2023-04-05')]
    global_att_his = (f'{global_att_now} | File prepared using CAMELS-spat scripts. See:' + 
                       'https://github.com/CH-Earth/camels-spat')
    global_att_com = (f'{global_att_ins[global_att_i]} calculates daily average flow values' +
                      ' in the station\'s standard time (i.e., not UTC). See: variable time_bnds.')

    # Data variables
    q_obs_unit = 'm3 s-1'
    q_obs_long = 'observed streamflow values'
    q_obs_anc = [column for column in csv.columns if '_is_' in column] # Get names of all ancillary variables in .csv 
    q_obs_anc.append('q_obs_data_quality') # add the 'q_obs_data_quality' variable that's not captured by the above
    q_obs_anc = ' '.join([f"'{anc}'" for anc in q_obs_anc]) # convert full list into single string

    # Time settings
    time_unit = 'minutes since 1950-01-01 00:00:00'
    time_cal = 'proleptic_gregorian'
    
    # Time settings - ensure the data only specifies a single time zone that was used for calculating averages
    #                 Communications with USGS and WSC state that this should be the case
    assert len(csv['q_obs_tz_cd'].unique()) == 1, "Multiple timezones specified in csv; there should be only one"
    time_original_tz = csv['q_obs_tz_cd'].unique()[0]

    # 2. Create a basic data set to build from
    ds = csv.to_xarray()

    # 3. Global attributes
    ds.attrs['title'] = global_att_ttl
    ds.attrs['conventions'] = global_att_con
    ds.attrs['source'] = global_att_src
    ds.attrs['country'] = country
    ds.attrs['station'] = station
    ds.attrs['institution'] = global_att_ins[global_att_i]
    ds.attrs['references'] = global_att_ref[global_att_i]
    ds.attrs['history'] = global_att_his
    ds.attrs['comment'] = global_att_com

    # 4a. Time attributes (coordinate already exists)
    # NOTE: attributes 'units' and 'calendar' are automatically specified when writing to netcdf
    #       This can be checked by saving to netcdf, and then loading as follows: xr.open_dataset(nc_path, decode_times=False)
    ds.time.attrs['standard_name'] = 'time'
    ds.time.attrs['bounds'] = 'time_bnds'
    ds.time.encoding['units'] = time_unit
    ds.time.encoding['calendar'] = time_cal
        
    # 4b. Time bounds variable
    #     Stored with dimensions (nbnds, time): first row holds the interval starts, second row the interval ends
    ds = ds.assign_coords(nbnds=[1,2])
    ds = ds.assign(time_bnds=(['nbnds','time'],
                              [csv['time_bnds_l'], csv['time_bnds_r']]))
    ds.nbnds.attrs['standard_name'] = 'bounds for timestep intervals'
    ds.time_bnds.attrs['long_name'] = 'start and end points of each time interval'
    #ds.time_bnds.attrs['time_zone'] = 'UTC'
    #ds.time_bnds.attrs['station_standard_time'] = time_original_tz
    # The time zone attribute is the single time zone code found in the csv (see the assert above)
    ds.time_bnds.attrs['time_zone'] = time_original_tz

    # 5. Observed streamflow
    ds.q_obs.attrs['units'] = q_obs_unit
    ds.q_obs.attrs['long_name'] = q_obs_long
    ds.q_obs.attrs['cell_methods'] = 'time:mean' # indicating that values are average values over the timestep
    ds.q_obs.attrs['ancillary_variables'] = q_obs_anc

    # 6. Data quality flags
    #    Flag meanings are looked up through the project helper for the flags that occur in this file
    flags = [str(s) for s in csv['q_obs_data_quality'].unique()]
    flags.sort()
    while ' ' in flags: flags.remove(' ')  # Sometimes we have empty spaces with no specific meaning in the data quality column: take those out
    meanings = cs.return_data_quality_flag_meaning(flags,country)
    ds.q_obs_data_quality.attrs['standard_name'] = 'quality_flag'
    ds.q_obs_data_quality.attrs['long_name'] = 'data quality flag'
    ds.q_obs_data_quality.attrs['flag_values'] = ' '.join([f"'{flag}'" for flag in flags])
    ds.q_obs_data_quality.attrs['flag_meanings'] = ' '.join([f"'{meaning}'" for meaning in meanings])

    # 7. Other status variables
    for variable in ds.variables:
        if '_is_' in variable:
            ds[variable].attrs['standard_name'] = 'quality_flag'
            ds[variable].attrs['long_name'] = 'flag indicating if main variable is affected by process in variable name'
            ds[variable].attrs['flag_values'] = "'0' '1'"
            ds[variable].attrs['flag_meanings'] = "'no' 'yes'"

    # 8. Remove the timezone variables we added to get the time_bnds
    ds = ds.drop_vars(['q_obs_tz_cd', 'time_bnds_l', 'time_bnds_r'])

    # 9. Save to file
    ds.to_netcdf(nc_path)
    ds.close()    

In [163]:
# 4. Convert to netcdf and save (writes to nc_path; the bare basin ID becomes the 'station' attribute)
daily_flow_csv_to_netcdf(csv, nc_path, row.Country, basin)

In [135]:
def _count_value_mismatches(a, b, rtol, atol):
    """Return the number of entries that differ between two equally-shaped arrays.

    Numeric arrays are compared within the given tolerances. Everything else
    (strings, objects, datetimes) is compared for exact equality. Entries that
    are missing in both arrays (NaN / NaT / None) count as equal.
    """
    if a.dtype.kind in "fiu" and b.dtype.kind in "fiu":  # numeric
        mismatch = ~np.isclose(a, b, rtol=rtol, atol=atol, equal_nan=True)
    else:
        try:
            equal = np.asarray(a == b)
        except (TypeError, ValueError):
            equal = np.asarray(False)
        if equal.shape != a.shape:
            # No element-wise comparison possible (incompatible dtypes): everything differs
            return int(np.size(a))
        both_missing = np.asarray(pd.isna(a)) & np.asarray(pd.isna(b))
        mismatch = ~(equal | both_missing)
    return int(np.count_nonzero(mismatch))


def _differing_attrs(attrs1, attrs2):
    """Return (key, value1, value2) for each attribute that differs or exists on one side only."""
    found = []
    for key in sorted(set(attrs1) | set(attrs2), key=str):
        v1 = attrs1.get(key)
        v2 = attrs2.get(key)
        try:
            differs = bool(v1 != v2)
        except ValueError:
            # Array-valued attributes: `!=` is element-wise and has no single truth value
            differs = not np.array_equal(v1, v2)
        if differs:
            found.append((key, v1, v2))
    return found


def compare_datasets(ds1, ds2, rtol=1e-5, atol=1e-8):
    """Print a report of all differences between two xarray datasets.

    Compared are: dimension sizes, coordinate names and values, data variable
    names, values and attributes, and the global attributes.

    Parameters
    ----------
    ds1, ds2 : xarray.Dataset
        Datasets to compare.
    rtol, atol : float
        Relative and absolute tolerance for numeric comparisons (as in
        numpy.isclose).

    Returns
    -------
    None
        The report is printed; nothing is returned.
    """
    differences = []

    # Dimensions (`sizes` is the name -> length mapping)
    sizes1 = dict(ds1.sizes)
    sizes2 = dict(ds2.sizes)
    if sizes1 != sizes2:
        differences.append("❌ Dimension mismatch:")
        for dim in sorted(set(sizes1) | set(sizes2), key=str):
            d1 = sizes1.get(dim)
            d2 = sizes2.get(dim)
            if d1 != d2:
                differences.append(f"  - Dimension '{dim}': {d1} vs {d2}")

    # Coordinates
    if set(ds1.coords) != set(ds2.coords):
        differences.append("❌ Coordinate name mismatch:")
        only1 = set(ds1.coords) - set(ds2.coords)
        only2 = set(ds2.coords) - set(ds1.coords)
        if only1:
            differences.append(f"  - Only in ds1: {only1}")
        if only2:
            differences.append(f"  - Only in ds2: {only2}")

    else:
        for coord in ds1.coords:
            a = ds1[coord].values
            b = ds2[coord].values

            # Check and warn about dtype mismatch
            if a.dtype != b.dtype:
                differences.append(
                    f"⚠️ Coordinate '{coord}' has differing dtypes: "
                    f"ds1={a.dtype}, ds2={b.dtype}"
                )

            # A shape mismatch is a difference in itself; checking it first also keeps the
            # element-wise comparison from failing on arrays of unequal length
            numeric = a.dtype.kind in "fiu" and b.dtype.kind in "fiu"
            kind = "numeric" if numeric else "non-numeric"
            if a.shape != b.shape or _count_value_mismatches(a, b, rtol, atol) > 0:
                differences.append(f"❌ Coordinate '{coord}' values differ ({kind})")

    # Data variables
    if set(ds1.data_vars) != set(ds2.data_vars):
        differences.append("❌ Variable name mismatch:")
        only1 = set(ds1.data_vars) - set(ds2.data_vars)
        only2 = set(ds2.data_vars) - set(ds1.data_vars)
        if only1:
            differences.append(f"  - Only in ds1: {only1}")
        if only2:
            differences.append(f"  - Only in ds2: {only2}")

    else:
        for var in ds1.data_vars:
            a = ds1[var].values
            b = ds2[var].values

            # Check and warn about dtype mismatch
            if a.dtype != b.dtype:
                differences.append(
                    f"⚠️ Variable '{var}' has differing dtypes: ds1={a.dtype}, ds2={b.dtype}"
                )

            # Values: numeric and non-numeric variables are both reported
            if a.shape != b.shape:
                differences.append(f"❌ Variable '{var}': shapes differ ({a.shape} vs {b.shape})")
            else:
                n_diff = _count_value_mismatches(a, b, rtol, atol)
                if n_diff:
                    differences.append(f"❌ Variable '{var}': {n_diff}/{np.size(a)} entries differ")

            # Check variable attributes
            attr_diffs = _differing_attrs(ds1[var].attrs, ds2[var].attrs)
            if attr_diffs:
                differences.append(f"⚠️ Variable '{var}' attributes differ:")
                for key, v1, v2 in attr_diffs:
                    differences.append(f"    - Attr '{key}': ds1={v1!r}, ds2={v2!r}")

    # Global attributes
    global_diffs = _differing_attrs(ds1.attrs, ds2.attrs)
    if global_diffs:
        differences.append("❌ Global attributes differ")
        for k, v1, v2 in global_diffs:
            differences.append(f"  - Attr '{k}': {v1} vs {v2}")

    # Summary
    if not differences:
        print("✅ Datasets are identical across dimensions, coordinates, variables, and attributes.")
    else:
        print("🔍 Differences found between datasets:")
        for diff in differences:
            print(diff)

In [164]:
# Compare to the existing one (same file name, but in the transfer folder).
# The context manager closes both files, also when the comparison raises.
old_file = cs_transfer_folder / nc_path.name
with xr.open_dataset(old_file) as old_ds, xr.open_dataset(nc_path) as new_ds:
    compare_datasets(old_ds,new_ds)

🔍 Differences found between datasets:
⚠️ Coordinate 'nbnds' has differing dtypes: ds1=int32, ds2=int64
⚠️ Variable 'q_obs_is_ice_affected' has differing dtypes: ds1=int32, ds2=int64
⚠️ Variable 'q_obs_is_partial_day' has differing dtypes: ds1=int32, ds2=int64
⚠️ Variable 'q_obs_is_dry_day' has differing dtypes: ds1=int32, ds2=int64
⚠️ Variable 'q_obs_is_estimated_value' has differing dtypes: ds1=int32, ds2=int64
⚠️ Variable 'time_bnds' attributes differ:
    - Attr 'time_zone': ds1='UTC', ds2='NST'
    - Attr 'station_standard_time': ds1='NST', ds2=None
❌ Global attributes differ
  - Attr 'history': 2023-08-02 11:54:37 | File prepared using CAMELS-spat scripts. See:https://github.com/CH-Earth/camels-spat vs 2025-05-31 10:29:15 | File prepared using CAMELS-spat scripts. See:https://github.com/CH-Earth/camels-spat


In [172]:
# We'll accept the dtype differences because they shouldn't really affect anything.
# The time_bnds difference is what we wanted to change.
# The history attribute difference is a logical consequence of our changes.
#
# Not sure why the actual timesteps are the same, but they appear correct in the new file.
new_ds['time']

In [185]:
# Copy the new file into the correct update folder; the scale sub-folder of this
# basin's daily file is looked up in check_df
is_daily_record = (check_df['station'] == basin) & (check_df['resolution'] == 'obs-daily')
scale = check_df.loc[is_daily_record, 'scale']
src = nc_path
des = cs_update_folder / obs_path_part1 / scale.iloc[0] / 'obs-daily' / src.name
print(f"Moving:\n{src} \nto:\n{des}")
shutil.copy(src,des)

Moving:
/scratch/gwf/gwf_cmt/wknoben/camels-spat-wip/tmp_02ZH002/CAN_02ZH002_daily_flow_observations.nc 
to:
/scratch/gwf/gwf_cmt/wknoben/camels-spat-upload-updates/observations/headwater/obs-daily/CAN_02ZH002_daily_flow_observations.nc


PosixPath('/scratch/gwf/gwf_cmt/wknoben/camels-spat-upload-updates/observations/headwater/obs-daily/CAN_02ZH002_daily_flow_observations.nc')

#### Hourly
We don't need to re-do these, because we'll do them again for the longer time period anyway in the next section. We just need to make sure we do process a new version of this basin's file in the next notebook.