# Convert USGS flow data into hourly values
We have observed flow values from the USGS sites at native time resolution. We want to resample these to hourly values and track the number of missing values in the metadata.

In [3]:
import os
import sys
import shutil
import numpy as np
import pandas as pd
from pathlib import Path
sys.path.append(str(Path().absolute().parent))
import python_cs_functions as cs

### Config handling

In [4]:
# Specify where the config file can be found.
# This is a relative path, so it is resolved against the current working directory.
config_file = '../0_config/config.txt'

In [5]:
# Get the required info from the config file.
# The keys are read in the order listed below. Note that 'cs_basin_path' is read twice:
# it supplies both the CAMELS-spat metadata folder (cs_meta_path) and the basin folder
# (cs_basin_folder).
data_path, cs_meta_path, cs_meta_name, cs_unusable_name, cs_basin_folder = (
    cs.read_from_config(config_file, key)
    for key in ('data_path', 'cs_basin_path', 'cs_meta_name', 'cs_unusable_name', 'cs_basin_path')
)

# Basin folder, located under the data root
basins_path = Path(data_path, cs_basin_folder)

### Data loading

In [6]:
# CAMELS-spat metadata file: place the metadata folder under the data root, then load the table
cs_meta_path = Path(data_path, cs_meta_path)
cs_meta = pd.read_csv(cs_meta_path.joinpath(cs_meta_name))

In [7]:
# Open list of unusable stations.
# Station IDs are forced to be read as objects (strings) so that leading 0's are kept.
cs_unusable = pd.read_csv(
    cs_meta_path.joinpath(cs_unusable_name),
    dtype={'Station_id': object},
)

## Processing

In [8]:
# Prepare the metadata file
# Names of the metadata columns this notebook reads and/or writes
meta_column_start = 'iv_flow_obs_availability_start'
meta_column_end   = 'iv_flow_obs_availability_end'
meta_column_miss  = 'flow_obs_missing_hourly'

# (Re)create the missing-values column with a placeholder of -1 for every row
cs_meta[meta_column_miss] = -1

# Positional index of each column, for use with positional (.iat) assignment
c_start, c_end, c_miss = (
    np.flatnonzero(cs_meta.columns == column_name)[0]
    for column_name in (meta_column_start, meta_column_end, meta_column_miss)
)

In [9]:
# Determine what to do with raw files
# remove_raw: if True, removes the raw file
# move_raw:   if True, moves raw file to new location specified by move_raw_here
remove_raw = False
move_raw   = False

# The two options are mutually exclusive; enforcing that here means we can use simple logic later
assert not remove_raw or not move_raw, 'remove_raw and move_raw cannot both be True'

In [10]:
if move_raw:
    # Root folder the raw files are moved to
    move_raw_here = 'D:/CAMELS_spat'
    # Mimic existing data structure below that root
    move_raw_path = Path(move_raw_here, cs_basin_folder, 'basin_data')

In [11]:
# For each site, 
# - Load the raw data file
# - Check that we have numerical values only in the obs_00060 column
# - Resample data to hourly (probably best to use a dataframe with just time and flow)
# - Update the metadata with missing values (and start and end dates if not in there)
# - Move the raw file, save the new file as .csv (processing to netcdf can come later)

In [12]:
# Status codes that can appear in place of a numerical flow value
# https://help.waterdata.usgs.gov/codes-and-parameters/instantaneous-and-daily-value-status-codes
streamflow_codes = [
    '***',  # Value unavailable
    'Bkw',  # Value is affected by backwater at the measurement site
    'Dis',  # Record has been discontinued at the measurement site
    'Eqp',  # Value affected by equipment malfunction
    'Ice',  # Value is affected by ice at the measurement site
    'Rat',  # Rating being developed
    'Ssn',  # Parameter monitored seasonally
]

In [13]:
# Loop over the stations: for every usable USA station, turn the raw instantaneous-value (iv)
# file into an hourly .csv and record start date, end date and missing-value count in cs_meta.
unusable_ids = set(cs_unusable['Station_id']) # Built once, so the per-station check is a cheap lookup
for ix,row in cs_meta.iterrows():

    # Only USA stations that are not on the unusable list are processed
    if row.Country != 'USA' or row.Station_id in unusable_ids:
        continue

    # Get paths, etc
    site, _, _, csv_iv_path, _, _ = cs.prepare_flow_download_outputs(cs_meta, ix, basins_path, time='iv')
    _,    _, _, _, _, csv_hr_path = cs.prepare_flow_download_outputs(cs_meta, ix, basins_path, time='hourly')
    csv_hr_path = Path(str(csv_hr_path).replace('.nc','.csv')) # The hourly file is saved as .csv here
    print(f'{ix: >3}. Now working on {site}')

    # Resume after interrupts
    if not os.path.isfile(csv_iv_path): # If csv file doesn't exist it must have been processed already
        continue                        #   Note: assumes remove_raw or move_raw are True

    # 1. Load the raw data, grabbing just the columns with time, flow obs and quality flags
    raw = pd.read_csv(csv_iv_path, index_col=0, parse_dates=True, usecols=['LST','obs_00060','obs_00060_cd'],
                      low_memory=False)

    # 2. Create a temporary dataframe for hourly averaging
    tmp = raw.copy() # We need the raw data later, so a copy here is required
    tmp = tmp.drop(columns=['obs_00060_cd']) # remove the QC column

    # 2a. Replace any codes with nan, so we can transform the column to float
    tmp['obs_00060'] = tmp['obs_00060'].replace(streamflow_codes, np.nan).astype('float')

    # 2b. Replace any negative streamflow values with nan
    tmp.loc[tmp['obs_00060'] < 0, 'obs_00060'] = np.nan

    # 3. Create hourly average flow rates (only the second return value is used)
    _,raw_H = cs.resample_arbitrary_flux_observations_to_hourly(tmp, data='obs_00060', center_window=False)

    # 4. Assign quality flags to the dataframe
    raw_H = cs.assign_hourly_quality_flag(raw, raw_H, 'USA', center_window=False)

    # 5. Update the metadata file with start & end date (if missing), and missing values
    #    The missing-check looks at the iv_* columns themselves, i.e. the same columns that get filled.
    #    NOTE(review): .iat is positional while ix is the index label from iterrows(); this relies on
    #    cs_meta having the default 0..n-1 index - confirm if cs_meta is ever filtered or re-indexed.
    if pd.isna(row[meta_column_start]):
        cs_meta.iat[ix,c_start] = raw.index[0].strftime('%Y-%m-%d %X')
    if pd.isna(row[meta_column_end]):
        cs_meta.iat[ix,c_end]   = raw.index[-1].strftime('%Y-%m-%d %X')
    cs_meta.iat[ix,c_miss] = raw_H['obs_00060'].isna().sum()

    # 6. Save the hourly file as .csv and remove or move the raw file
    raw_H.to_csv(csv_hr_path)
    if remove_raw:
        os.remove(csv_iv_path)
    if move_raw:
        move_path = move_raw_path / f'USA_{site}' / 'observations'
        move_path.mkdir(parents=True, exist_ok=True)
        move_file = os.path.basename(csv_iv_path)
        shutil.move(csv_iv_path, move_path/move_file)
        

1027. Now working on 01013500
1028. Now working on 01022500
1029. Now working on 01030500
1030. Now working on 01031500
1031. Now working on 01047000
1032. Now working on 01052500
1033. Now working on 01054200
1034. Now working on 01055000
1035. Now working on 01057000
1036. Now working on 01073000
1037. Now working on 01078000
1038. Now working on 01118300
1039. Now working on 01121000
1040. Now working on 01123000
1041. Now working on 01134500
1042. Now working on 01137500
1043. Now working on 01139000
1044. Now working on 01139800
1045. Now working on 01142500
1046. Now working on 01144000
1047. Now working on 01162500
1048. Now working on 01169000
1049. Now working on 01170100
1050. Now working on 01181000
1051. Now working on 01187300
1052. Now working on 01195100
1053. Now working on 01333000
1054. Now working on 01350000
1055. Now working on 01350080
1056. Now working on 01350140
1057. Now working on 01365000
1058. Now working on 01411300
1059. Now working on 01413500
1060. Now 

In [14]:
# Save the metadata file (written without the dataframe index, UTF-8 encoded)
cs_meta.to_csv(
    cs_meta_path.joinpath(cs_meta_name),
    index=False,
    encoding='utf-8',
)

### Run once only - find out which streamflow codes we are dealing with
Note: this is development code that does not need to be re-run. The critical outcome is the list of QC codes visible below.

In [41]:
# Accumulators, one entry appended per processed station:
#   q_zero   - number of negative flow values
#   q_codes  - unique non-numeric entries found in the flow column
#   qc_codes - unique entries found in the quality-flag column
q_zero, q_codes, qc_codes = [], [], []

In [42]:
# Development scan: for every usable USA station, record which quality flags and which
# non-numeric flow entries occur in the raw iv file, and how many flow values are negative.
for ix,row in cs_meta.iterrows():
    is_usable_usa = row.Country == 'USA' and row.Station_id not in cs_unusable['Station_id'].values
    if not is_usable_usa:
        continue

    # Get paths, etc
    site, _, _, csv_iv_path, _, _ = cs.prepare_flow_download_outputs(cs_meta, ix, basins_path, time='iv')
    print(f'{ix: >3}. Now working on {site}')

    # Resume after interrupts: a missing csv file is taken to mean the station was processed already
    #   Note: assumes remove_raw or move_raw are True
    if not os.path.isfile(csv_iv_path):
        continue

    # 1. Load the raw data, grabbing just the columns with flow obs and quality flags
    iv_data = pd.read_csv(csv_iv_path, usecols=['obs_00060','obs_00060_cd'], low_memory=False)

    # Numeric view of the flow column; anything that cannot be parsed becomes NaN
    flow_numeric = pd.to_numeric(iv_data['obs_00060'], errors='coerce')

    # 2. Get unique codes: quality flags, and flow entries that are not numbers
    qc_codes.append(iv_data['obs_00060_cd'].unique())
    q_codes.append(iv_data.loc[flow_numeric.isnull(), 'obs_00060'].unique())

    # 3. Count negative flow values
    q_zero.append((flow_numeric < 0).sum())

1027. Now working on 01013500
1028. Now working on 01022500
1029. Now working on 01030500
1030. Now working on 01031500
1031. Now working on 01047000
1032. Now working on 01052500
1033. Now working on 01054200
1034. Now working on 01055000
1035. Now working on 01057000
1036. Now working on 01073000
1037. Now working on 01078000
1038. Now working on 01118300
1039. Now working on 01121000
1040. Now working on 01123000
1041. Now working on 01134500
1042. Now working on 01137500
1043. Now working on 01139000
1044. Now working on 01139800
1045. Now working on 01142500
1046. Now working on 01144000
1047. Now working on 01162500
1048. Now working on 01169000
1049. Now working on 01170100
1050. Now working on 01181000
1051. Now working on 01187300
1052. Now working on 01195100
1053. Now working on 01333000
1054. Now working on 01350000
1055. Now working on 01350080
1056. Now working on 01350140
1057. Now working on 01365000
1058. Now working on 01411300
1059. Now working on 01413500
1060. Now 

1303. Now working on 04196800
1304. Now working on 04197100
1305. Now working on 04197170
1306. Now working on 04213000
1307. Now working on 04213075
1308. Now working on 04216418
1309. Now working on 04221000
1310. Now working on 04224775
1311. Now working on 04233000
1312. Now working on 04256000
1313. Now working on 04296000
1314. Now working on 05056000
1315. Now working on 05057000
1316. Now working on 05057200
1317. Now working on 05062500
1318. Now working on 05087500
1319. Now working on 05120500
1320. Now working on 05123400
1321. Now working on 05129115
1322. Now working on 05131500
1323. Now working on 05291000
1324. Now working on 05362000
1325. Now working on 05393500
1326. Now working on 05399500
1327. Now working on 05408000
1328. Now working on 05412500
1329. Now working on 05413500
1330. Now working on 05414000
1331. Now working on 05444000
1332. Now working on 05454000
1333. Now working on 05458000
1334. Now working on 05466500
1335. Now working on 05487980
1336. Now 

1577. Now working on 11148900
1578. Now working on 11151300
1579. Now working on 11162500
1580. Now working on 11176400
1581. Now working on 11180500
1582. Now working on 11180960
1583. Now working on 11224500
1586. Now working on 11253310
1587. Now working on 11264500
1588. Now working on 11266500
1589. Now working on 11274500
1590. Now working on 11274630
1591. Now working on 11284400
1592. Now working on 11299600
1593. Now working on 11381500
1594. Now working on 11383500
1595. Now working on 11451100
1596. Now working on 11468500
1597. Now working on 11473900
1598. Now working on 11475560
1599. Now working on 11476600
1600. Now working on 11478500
1601. Now working on 11480390
1602. Now working on 11481200
1603. Now working on 11482500
1604. Now working on 11522500
1605. Now working on 11523200
1606. Now working on 11528700
1607. Now working on 11532500
1608. Now working on 12010000
1609. Now working on 12013500
1610. Now working on 12020000
1611. Now working on 12025000
1612. Now 

In [47]:
# Report every entry (position in q_zero) that has at least one negative flow value
for i, val in enumerate(q_zero):
    if not val > 0:
        continue
    print(f'{val} value(s) < 0 in entry {i}')

394 value(s) < 0 in entry 148
3 value(s) < 0 in entry 337
3 value(s) < 0 in entry 346
425 value(s) < 0 in entry 353
1 value(s) < 0 in entry 376
8 value(s) < 0 in entry 417
27 value(s) < 0 in entry 423
18 value(s) < 0 in entry 428
1 value(s) < 0 in entry 483
5 value(s) < 0 in entry 518
6 value(s) < 0 in entry 609


In [43]:
# Convert the per-station arrays to plain lists, then flatten them into one list
qc_codes_list = [station_codes.tolist() for station_codes in qc_codes]
qc_codes_items = []
for station_list in qc_codes_list:
    qc_codes_items.extend(station_list)

# Show the distinct quality flags found across all stations
set(qc_codes_items)

{'A',
 'A:<',
 'A:>',
 'A:R',
 'A:[4]',
 'A:[90]',
 'A:[91]',
 'A:[92]',
 'A:[93]',
 'A:e',
 'P',
 'P:e',
 nan}

In [44]:
# Convert the per-station arrays to plain lists, then flatten them into one list
q_codes_list = [station_codes.tolist() for station_codes in q_codes]
q_codes_items = []
for station_list in q_codes_list:
    q_codes_items.extend(station_list)

# Show the distinct non-numeric flow entries found across all stations
set(q_codes_items)

{'***', 'Bkw', 'Dis', 'Eqp', 'Ice', 'Rat', nan, nan, nan, nan, nan}

In [45]:
# Find which station index we have for the flags we need to double-check
for i,sublist in enumerate(q_codes_list):
    for item in sublist:
        if item == '***': print(f'*** in {i}') # Value unavailable
        if item == 'Bkw': print(f'Bkw in {i}') # Value is affected by backwater at the measurement site
        if item == 'Dis': print(f'Dis in {i}') # Record has been discontinued at the measurement site
        if item == 'Eqp': print(f'Eqp in {i}') # Value affected by equipment malfunction
        if item == 'Ice': print(f'Ice in {i}') # Value is affected by ice at the measurement site
        if item == 'Rat': print(f'Rat in {i}') # Rating being developed
        if item == 'Ssn': print(f'Ssn in {i}') # Parameter monitored seasonally

Ice in 0
Ice in 5
Ice in 6
Ice in 7
Ice in 12
*** in 12
Ice in 13
Eqp in 13
Ice in 24
Ice in 36
Ice in 81
Ice in 213
*** in 223
Ice in 256
Ice in 259
Ice in 262
Ice in 264
Ice in 267
Ice in 271
Ice in 284
Eqp in 284
Ice in 285
Ice in 286
Ice in 288
Ice in 289
Ice in 294
Ice in 295
Ice in 296
Ice in 297
Ice in 298
Ice in 300
Ice in 301
Dis in 333
Dis in 334
Dis in 337
Ice in 338
Ice in 339
Ice in 340
Ice in 341
Ice in 342
Ice in 343
Ice in 344
Ice in 345
Ice in 346
Ice in 347
Ice in 348
Ice in 350
Ice in 352
Ice in 353
Ice in 354
Ice in 355
Ice in 356
Ice in 357
Ice in 359
Ice in 360
Ice in 361
Ice in 362
Ice in 363
Bkw in 364
Ice in 364
Ice in 365
Ice in 395
Ice in 407
Ice in 481
Ice in 483
Bkw in 487
Ice in 498
Ice in 521
Ice in 529
Rat in 533
Ice in 536
Eqp in 539
Dis in 605
Dis in 609
Dis in 610
Eqp in 619
Ice in 620
Ice in 621
Ice in 626
Ice in 627
Ice in 629
Ice in 630
Ice in 631
