# Convert WSC flow data into hourly values
We have observed flow values from the WSC sites at native time resolution. We want to resample these to hourly and track the number of missing values in the metadata.

In [1]:
import os
import sys
import shutil
import numpy as np
import pandas as pd
from pathlib import Path
sys.path.append(str(Path().absolute().parent))
import python_cs_functions as cs

### Config handling

In [2]:
# Location of the configuration file that every setting below is read from
config_file = '../0_config/config.txt'

In [3]:
# Pull the settings this notebook needs out of the config file
data_path = cs.read_from_config(config_file, 'data_path')

# Basin folder (combined with data_path to get the full basin location)
cs_basin_folder = cs.read_from_config(config_file, 'cs_basin_path')
basins_path = Path(data_path, cs_basin_folder)

# CAMELS-spat metadata; note that the metadata folder is read from the same
# 'cs_basin_path' entry as the basin folder
cs_meta_path = cs.read_from_config(config_file, 'cs_basin_path')
cs_meta_name = cs.read_from_config(config_file, 'cs_meta_name')
cs_unusable_name = cs.read_from_config(config_file, 'cs_unusable_name')

### Data loading

In [4]:
# Resolve the metadata folder against the data root, then load the CAMELS-spat metadata table
cs_meta_path = Path(data_path, cs_meta_path)
cs_meta_file = cs_meta_path / cs_meta_name
cs_meta = pd.read_csv(cs_meta_file)

In [5]:
# Load the list of unusable stations. Station IDs are read as plain objects (strings)
# so that leading 0's are not lost to numeric conversion.
cs_unusable_file = cs_meta_path / cs_unusable_name
cs_unusable = pd.read_csv(cs_unusable_file, dtype={'Station_id': object})

### Check if we have any <0 values

In [15]:
# Scan the raw instantaneous (iv) record of every usable Canadian station and
# collect the sites that contain negative flow values
unusable_ids = cs_unusable['Station_id'].values
iv_columns = ['Date','Value/Valeur','Qualifier/Qualificatif','Approval/Approbation']
has_negatives = []
for ix, row in cs_meta.iterrows():
    # Right now, the only Canadian values in cs_unusable are iv values, so we don't need to check beyond Station_id
    if row.Country != 'CAN' or row.Station_id in unusable_ids:
        continue

    site, _, _, csv_iv_path, _, _ = cs.prepare_flow_download_outputs(cs_meta, ix, basins_path, time='iv')
    raw = pd.read_csv(csv_iv_path, index_col=0, parse_dates=True, usecols=iv_columns, low_memory=False)

    # Count the negative observations; only report sites that have at least one
    num_neg = raw['Value/Valeur'].lt(0).sum()
    if num_neg > 0:
        print(f'{num_neg} negative values found for {site}')
        has_negatives.append(site)

print(f'Negative IV found for {len(has_negatives)} sites.')

  raw = pd.read_csv(csv_iv_path, index_col=0, parse_dates=True,


1321 negative values found for 02FE014
20 negative values found for 05NG020
261 negative values found for 05OB021
21 negative values found for 05OE007
1 negative values found for 05OE010
1 negative values found for 05OF018
1289 negative values found for 07BF002
27 negative values found for 07GE007
1 negative values found for 07OA001
Negative IV found for 9 sites.


## Processing

In [18]:
# Prepare the metadata file: look up the integer positions of the columns that get
# updated during processing, so values can be written with positional indexing (.iat)
meta_column_start = 'iv_flow_obs_availability_start'
meta_column_end   = 'iv_flow_obs_availability_end'
meta_column_miss  = 'flow_obs_missing_hourly'

# Index.get_loc raises a KeyError that names the column if it is absent from the metadata
c_start = cs_meta.columns.get_loc(meta_column_start)
c_end   = cs_meta.columns.get_loc(meta_column_end)
c_miss  = cs_meta.columns.get_loc(meta_column_miss)

In [19]:
# Choose what happens to each raw file once its hourly version has been saved
move_raw   = False # If True: moves raw file to new location specified by move_raw_here
remove_raw = False # If True: removes the raw file

# The two options are mutually exclusive; enforcing that here means we can use simple logic later
assert not remove_raw or not move_raw, 'remove_raw and move_raw cannot both be True'

In [20]:
# Only define the destination for raw files when moving them was requested
if move_raw:
    move_raw_here = 'D:/CAMELS_spat'
    # Mimic existing data structure
    move_raw_path = Path(move_raw_here, cs_basin_folder, 'basin_data')

In [9]:
# For each site, 
# - Load the raw data file
# - Check that we have numerical values only in the flow column ('Value/Valeur')
# - Resample data to hourly (probably best to use a dataframe with just time and flow)
# - Update the metadata with missing values (and start and end dates if not in there)
# - Move the raw file, save the new file as .csv (processing to netcdf can come later)

In [21]:
def extract_offset(value):
    """Return the trailing six characters of ``value``.

    Applied to timestamp strings, where those characters hold the UTC offset
    (e.g. '+00:00'). Inputs shorter than six characters are returned whole.
    """
    offset_width = 6
    return value[-offset_width:]

In [22]:
# Loop over the stations: for every usable Canadian station, resample the raw instantaneous (iv)
# record to hourly values, attach quality flags, update the metadata and save the result as .csv
unusable_ids = set(cs_unusable['Station_id']) # Build the lookup once instead of on every iteration
for ix,row in cs_meta.iterrows():
    # Right now, the only Canadian values in cs_unusable are iv values, so we don't need to check beyond Station_id
    if row.Country != 'CAN' or row.Station_id in unusable_ids:
        continue

    # Get paths, etc
    site, _, _, csv_iv_path, _, _ = cs.prepare_flow_download_outputs(cs_meta, ix, basins_path, time='iv')
    _,    _, _, _, _, csv_hr_path = cs.prepare_flow_download_outputs(cs_meta, ix, basins_path, time='hourly')
    csv_hr_path = Path(str(csv_hr_path).replace('.nc','.csv'))
    print(f'{ix: >3}. Now working on {site}')

    # Resume after interrupts
    if not os.path.isfile(csv_iv_path): # If csv file doesn't exist it must have been processed already
        continue                        #   Note: assumes remove_raw or move_raw are True

    # 1. Load the raw data, grabbing just the columns with time, flow obs and quality flags
    raw = pd.read_csv(csv_iv_path, index_col=0, parse_dates=True,
                      usecols=['Date','Value/Valeur','Qualifier/Qualificatif','Approval/Approbation'],
                      low_memory=False)

    # UTC to LST
    # 2a. Find what we're converting to
    lst = row['dv_flow_obs_timezone'] # e.g. 'AST'
    utc = cs.tz_abbreviation_to_utc(lst) # e.g. 'UTC-04'
    offset = cs.relative_utc_to_timedelta(utc) # e.g. '+4:00:00'
    # NOTE(review): the sign convention of `offset` is set by cs.relative_utc_to_timedelta, which is
    # not visible here. It is ADDED to the UTC timestamps below - confirm this yields local standard time.

    # 2b. Convert the UTC timestamps into Local Standard Time
    # Verify first that every timestamp really is expressed in UTC (offset +00:00). This is a data
    # check, so raise explicitly instead of using an assert that python -O would strip.
    offsets = pd.Series(raw.index.astype(str)).apply(extract_offset)
    if not (offsets == '+00:00').all():
        raise ValueError(f'Not all timezone offsets are +00:00 for {site}')
    raw.index = raw.index.tz_convert(None) + pd.Timedelta(offset) # drop the timezone info, then shift

    # 3a. Create a temporary dataframe for hourly averaging, without the QC columns
    tmp = raw.drop(columns=['Qualifier/Qualificatif','Approval/Approbation'])

    # 3b. Replace any negative streamflow values with nan
    tmp.loc[tmp['Value/Valeur'] < 0, 'Value/Valeur'] = np.nan

    # 4. Create hourly average flow rates
    _,raw_H = cs.resample_arbitrary_flux_observations_to_hourly(tmp, data='Value/Valeur', center_window=False)

    # 5. Assign quality flags to the dataframe
    raw['Qualifier/Qualificatif'] = raw['Qualifier/Qualificatif'].fillna(0).astype(int) # Replace any NaN with 0 (unknown)
    raw_H = cs.assign_hourly_quality_flag(raw, raw_H, 'CAN', center_window=False)

    # 6. Update the metadata file with start & end date (if missing), and missing values
    # Note: .iat is positional, so using ix as the row position relies on cs_meta having a default 0..n-1 index.
    # The time is written as explicit %H:%M:%S, because %X depends on the active locale.
    if pd.isna(row.iv_flow_obs_availability_start):
        cs_meta.iat[ix,c_start] = raw.index[0].strftime('%Y-%m-%d %H:%M:%S')
    if pd.isna(row.iv_flow_obs_availability_end):
        cs_meta.iat[ix,c_end]   = raw.index[-1].strftime('%Y-%m-%d %H:%M:%S')
    cs_meta.iat[ix,c_miss] = raw_H['Value/Valeur'].isna().sum()

    # 7. Save the hourly file as .csv and move the raw file
    raw_H.to_csv(csv_hr_path)
    if remove_raw:
        os.remove(csv_iv_path)
    if move_raw:
        move_path = move_raw_path / f'CAN_{site}' / 'observations'
        move_path.mkdir(parents=True, exist_ok=True)
        move_file = os.path.basename(csv_iv_path)
        shutil.move(csv_iv_path, move_path/move_file)
        

  1. Now working on 01AD003
  3. Now working on 01AF007
  4. Now working on 01AF009
  5. Now working on 01AJ003
  6. Now working on 01AJ004
  7. Now working on 01AJ010
  8. Now working on 01AK001
  9. Now working on 01AK006
 10. Now working on 01AK007
 11. Now working on 01AL002
 12. Now working on 01AL004
 13. Now working on 01AM001
 14. Now working on 01AN002
 15. Now working on 01AP002
 16. Now working on 01AP004
 17. Now working on 01AP006
 18. Now working on 01AQ001
 19. Now working on 01BC001
 21. Now working on 01BE001
 27. Now working on 01BJ003
 28. Now working on 01BJ007
 29. Now working on 01BJ010
 30. Now working on 01BJ012
 31. Now working on 01BL002
 32. Now working on 01BL003
 33. Now working on 01BO001
 34. Now working on 01BP001
 35. Now working on 01BP002
 36. Now working on 01BQ001
 37. Now working on 01BS001
 38. Now working on 01BU002
 39. Now working on 01BU009
 40. Now working on 01BV004
 41. Now working on 01BV006
 42. Now working on 01CA003
 43. Now working on 

In [23]:
# Write the updated metadata table back to the file it was loaded from
cs_meta_file = cs_meta_path / cs_meta_name
cs_meta.to_csv(cs_meta_file, index=False, encoding='utf-8')