# Convert USGS flow data into UTC
The downloaded USGS flow observations come in local timezones. We want to convert these to UTC for easier processing. Workflow:
1. Create dictionaries that map USGS timezone abbreviations onto UTC offsets, and map UTC offsets onto timeDelta strings
2. For each station:
    1. For each row in the flow observations csv:
        1. Convert timezone abbreviation into UTC-xx string (e.g. 'EDT' > 'UTC-04')
        2. Convert UTC-xx string into timeDelta string (e.g. 'UTC-04' > '+4:00:00')
        3. Use timeDelta string and datetime string to create a timezone-aware datetime index in UTC
    2. Extract start and end of observation dates and store in metadata file
    3. Extract time resolution of observations and store in metadata file
    
Note: the `EDT` > `UTC-04` > `+4:00:00` > `timezone-aware datetime` conversion can be streamlined, but doing it this way is (probably) clearer if we need to get back to it later.

In [1]:
import os
import sys
import shutil
import datetime
import numpy as np
import pandas as pd
from pathlib import Path
sys.path.append(str(Path().absolute().parent))
import python_cs_functions as cs

### Config handling

In [2]:
# Specify where the config file can be found.
# The path is relative, so it is resolved against the current working directory.
config_file = '../0_config/config.txt'

In [3]:
# Pull every setting this notebook needs out of the config file.
# Order of the targets matches the order of the keys:
#   data_path        <- 'data_path'
#   cs_meta_path     <- 'cs_basin_path'    (CAMELS-spat metadata location)
#   cs_meta_name     <- 'cs_meta_name'     (CAMELS-spat metadata file name)
#   cs_unusable_name <- 'cs_unusable_name' (file listing unusable stations)
data_path, cs_meta_path, cs_meta_name, cs_unusable_name = (
    cs.read_from_config(config_file, key)
    for key in ('data_path', 'cs_basin_path', 'cs_meta_name', 'cs_unusable_name')
)

# Basin folder: read from the same 'cs_basin_path' key, then anchored to the data path
cs_basin_folder = cs.read_from_config(config_file, 'cs_basin_path')
basins_path = Path(data_path, cs_basin_folder)

### Data loading

In [4]:
# Anchor the metadata folder to the data path, then load both metadata tables from it
cs_meta_path = Path(data_path, cs_meta_path)
cs_meta = pd.read_csv(cs_meta_path.joinpath(cs_meta_name))

# Station IDs are read as strings here so that leading zeros are not lost
cs_unusable = pd.read_csv(
    cs_meta_path.joinpath(cs_unusable_name),
    dtype={'Station_id': object},
)

## Processing of Instantaneous Values data

In [8]:
# Prepare the metadata file
meta_column_start = 'iv_flow_obs_availability_start'
meta_column_end   = 'iv_flow_obs_availability_end'

# Make sure both availability columns exist and use 'n/a' as the placeholder for "not known".
# Values that are already present are kept: the station loop skips stations whose raw file
# has already been processed, so resetting the columns on a re-run would replace the dates
# extracted earlier with 'n/a' once the metadata file is saved again.
for meta_column in (meta_column_start, meta_column_end):
    if meta_column in cs_meta.columns:
        # Cast to object so the column can hold strings, and restore the placeholder for
        # missing entries (by default pandas reads 'n/a' back from CSV as NaN)
        cs_meta[meta_column] = cs_meta[meta_column].astype(object).fillna('n/a')
    else:
        cs_meta[meta_column] = 'n/a'

# Positional column indices, used for .iat assignments in the station loop
c_start = np.where(cs_meta.columns == meta_column_start)[0][0]
c_end   = np.where(cs_meta.columns == meta_column_end)[0][0]

In [9]:
# Choose what happens to each raw download once it has been processed
remove_raw = False  # True: delete the raw file
move_raw = True  # True: relocate the raw file to the location given by move_raw_here

# At most one of the two options may be active; this lets the code further down use simple logic
assert (not remove_raw) or (not move_raw), 'remove_raw and move_raw cannot both be True'

In [10]:
if move_raw:
    # Destination root for processed raw files
    move_raw_here = 'D:/CAMELS_spat'
    # Mimic the existing data structure below the destination root
    move_raw_path = Path(move_raw_here, cs_basin_folder, 'basin_data')

In [18]:
# Loop over the stations.
# For every usable USA station this:
#   1. copies the leading comment lines of the raw download into a separate header file and
#      checks that the header mentions the expected station ID, variable and units;
#   2. loads the data rows, builds a timezone-aware datetime index, stores the first and last
#      timestamp in cs_meta, writes the table to CSV and then removes/moves the raw file.
for ix, row in cs_meta.iterrows():

    # Only USA stations that are not listed in the unusable-stations table are processed
    is_unusable = ((cs_unusable['Country'] == row.Country) & (cs_unusable['Station_id'] == row.Station_id)).any()
    if row.Country != 'USA' or is_unusable:
        continue

    # 0. Get paths, etc
    site, _, raw_path, csv_path, header_path, _ = cs.prepare_flow_download_outputs(cs_meta, ix, basins_path, time='iv')
    print(f'{ix: >3}. Now working on {site}')

    # Resume after interrupts
    if not os.path.isfile(raw_path): # If raw file doesn't exist it must have been processed already
        continue                     #   Note: assumes remove_raw or move_raw are True

    # 1a. Separate raw file into header info ...
    # Context managers make sure both files are closed, also when something goes wrong
    with open(raw_path, 'r') as text, open(header_path, 'w') as head:
        for line in text:
            if not line.startswith('#'):
                break # When we've passed the last of the comments, we know there's only data left. No need to read all that
            head.write(line) # i.e., line contains a comment

    # 1b. Check that we have what we were looking for:
    flag_id   = False # Correct station ID?
    flag_var  = False # Correct variable?
    flag_unit = False # Expected discharge units?
    # NOTE: station names are deliberately not checked, because they are not standardized
    #       w.r.t. the use of abbreviations

    # Metadata info
    station_id = row.Station_id
    data_var = 'discharge'
    data_unit = 'cubic feet per second'

    TS_ID = None # Site-specific time series ID; set when the line that specifies the units is found
    with open(header_path, 'r') as head:
        for line in head:
            line_lower = line.lower()
            if station_id in line:
                flag_id = True
            if data_var in line_lower:
                flag_var = True
            if data_unit in line_lower:
                flag_unit = True
                TS_ID = line.split()[1] # Assumes line looks like: #  117636  00060  Discharge, cubic feet per second

    # This will throw an error if anything is wrong
    assert flag_id,   f'ID mismatch for cs_meta ID {site}. '
    assert flag_var,  f'Variable mismatch for cs_meta ID {site}. USGS metadata does not specify {data_var}'
    assert flag_unit, f'Unit mismatch for cs_meta ID {site}. USGS metadata does not specify {data_unit}'

    # 2a. Create a dataframe with only data for further processing
    raw = pd.read_csv(raw_path, delimiter='\t', comment='#', dtype='str') # skip comments; treat everything as str for now
    raw = raw[raw.site_no == site] # Strips any lines that don't specify a site ID (i.e., non-data lines).
                                   # We remove these because they complicate automatic dtype setting later

    # Without data rows there is no time index to build; stop here with a clear message
    # instead of failing further down with a less obvious error
    assert not raw.empty, f'No data rows found for cs_meta ID {site}'

    # 2b. Do the time conversions
    raw['tz_utc_str']  = raw.apply(lambda row: cs.tz_abbreviation_to_utc(            row['tz_cd']                     ), axis=1)
    raw['tz_td_str']   = raw.apply(lambda row: cs.relative_utc_to_timedelta(         row['tz_utc_str']                ), axis=1)
    raw['tz_datetime'] = raw.apply(lambda row: cs.datetime_str_to_timeaware_datetime(row['datetime'], row['tz_td_str'],
                                                                                     localize_to_UTC=False),             axis=1)

    # 2c. Set UTC-datetime as the index
    raw = raw.set_index(raw['tz_datetime'], drop=True)
    raw = raw.sort_index() # Sometimes timesteps are in the wrong order when a daylight savings time shift happens

    # 2d. Quality of life change: replace the site-specific time series ID with something generic
    raw = raw.rename({TS_ID+'_00060': 'obs_00060', TS_ID+'_00060_cd': 'obs_00060_cd'}, axis=1)

    # 2e. Get the metadata info (first and last timestamp of the sorted index)
    cs_meta.iat[ix,c_start] = raw.index[0].strftime('%Y-%m-%d %X')
    cs_meta.iat[ix,c_end]   = raw.index[-1].strftime('%Y-%m-%d %X')

    # 2f. Save CSV to disk and (re)move the raw file
    raw.to_csv(csv_path)
    if remove_raw:
        os.remove(raw_path)
    if move_raw:
        move_path = move_raw_path / f'USA_{site}' / 'observations'
        move_path.mkdir(parents=True, exist_ok=True)
        move_file = os.path.basename(raw_path)
        shutil.move(raw_path, move_path/move_file)
        

1027. Now working on 01013500
1028. Now working on 01022500
1029. Now working on 01030500
1030. Now working on 01031500
1031. Now working on 01047000
1032. Now working on 01052500
1033. Now working on 01054200
1034. Now working on 01055000
1035. Now working on 01057000
1036. Now working on 01073000
1037. Now working on 01078000
1038. Now working on 01118300
1039. Now working on 01121000
1040. Now working on 01123000
1041. Now working on 01134500
1042. Now working on 01137500
1043. Now working on 01139000
1044. Now working on 01139800
1045. Now working on 01142500
1046. Now working on 01144000
1047. Now working on 01162500
1048. Now working on 01169000
1049. Now working on 01170100
1050. Now working on 01181000
1051. Now working on 01187300
1052. Now working on 01195100
1053. Now working on 01333000
1054. Now working on 01350000
1055. Now working on 01350080
1056. Now working on 01350140
1057. Now working on 01365000
1058. Now working on 01411300
1059. Now working on 01413500
1060. Now 

1303. Now working on 04196800
1304. Now working on 04197100
1305. Now working on 04197170
1306. Now working on 04213000
1307. Now working on 04213075
1308. Now working on 04216418
1309. Now working on 04221000
1310. Now working on 04224775
1311. Now working on 04233000
1312. Now working on 04256000
1313. Now working on 04296000
1314. Now working on 05056000
1315. Now working on 05057000
1316. Now working on 05057200
1317. Now working on 05062500
1318. Now working on 05087500
1319. Now working on 05120500
1320. Now working on 05123400
1321. Now working on 05129115
1322. Now working on 05131500
1323. Now working on 05291000
1324. Now working on 05362000
1325. Now working on 05393500
1326. Now working on 05399500
1327. Now working on 05408000
1328. Now working on 05412500
1329. Now working on 05413500
1330. Now working on 05414000
1331. Now working on 05444000
1332. Now working on 05454000
1333. Now working on 05458000
1334. Now working on 05466500
1335. Now working on 05487980
1336. Now 

1577. Now working on 11148900
1578. Now working on 11151300
1579. Now working on 11162500
1580. Now working on 11176400
1581. Now working on 11180500
1582. Now working on 11180960
1583. Now working on 11224500
1586. Now working on 11253310
1587. Now working on 11264500
1588. Now working on 11266500
1589. Now working on 11274500
1590. Now working on 11274630
1591. Now working on 11284400
1592. Now working on 11299600
1593. Now working on 11381500
1594. Now working on 11383500
1595. Now working on 11451100
1596. Now working on 11468500
1597. Now working on 11473900
1598. Now working on 11475560
1599. Now working on 11476600
1600. Now working on 11478500
1601. Now working on 11480390
1602. Now working on 11481200
1603. Now working on 11482500
1604. Now working on 11522500
1605. Now working on 11523200
1606. Now working on 11528700
1607. Now working on 11532500
1608. Now working on 12010000
1609. Now working on 12013500
1610. Now working on 12020000
1611. Now working on 12025000
1612. Now 

In [19]:
# Write the updated metadata table back to its original file (UTF-8, without the row index)
cs_meta.to_csv(cs_meta_path.joinpath(cs_meta_name), index=False, encoding='utf-8')

### Find which timezone abbreviations we have in the data set
Note: this is development code that does not need to be re-run. This was used to find which timezone abbreviations the USGS downloads used, and to define the "to-UTC" conversions used in the main loop (see `time_processing.py` in `python_cs_functions`).

In [20]:
# Collect every timezone abbreviation that occurs in the raw USGS downloads
timezone_list = []
for ix, row in cs_meta.iterrows():
    if row.Country != 'USA':
        continue # Only USA stations are inspected

    # Resolve the raw-file location for this station
    site, _, raw_path, _, _, _ = cs.prepare_flow_download_outputs(cs_meta, ix, basins_path, time='iv')

    # Load the raw data file, skipping comment lines and keeping everything as strings
    raw = pd.read_csv(raw_path, delimiter='\t', comment='#', dtype='str')

    # Files without a timezone column are reported and otherwise ignored
    if 'tz_cd' not in raw.columns:
        print(f'No timezone data found for site {site}')
        continue

    # '6s' just indicates that this column uses format '6s'; it is not a timezone
    timezone_list.extend(tz for tz in raw.tz_cd.unique() if tz != '6s')

print(f'Timezone abbreviations found are {list(set(timezone_list))}')

No timezone data found for site 02342933
No timezone data found for site 02464360
No timezone data found for site 11230500
No timezone data found for site 11237500
Timezone abbreviations found are ['PDT', 'EST', 'CDT', 'MST', 'CST', 'MDT', 'EDT', 'PST']


In [37]:
# Hard-coded list of timezone abbreviations; it holds the same eight abbreviations that the
# search over the raw files printed
timezone_list = ['MDT', 'EDT', 'PDT', 'CST', 'PST', 'EST', 'MST', 'CDT']