# Download USGS flow data
It is difficult to determine automatically how much data is available for a given USGS station, so we request a very long time period instead. The server returns only the data that is actually available.

Workflow:
- Save server response as a temporary file
- Separate response into a `.csv` file containing the data and ...
- ... a `.txt` file with header (meta) info.

We need the data in `.csv` for future processing but we cannot immediately store the whole thing as a `.csv` file because the line organization doesn't match. Might as well process it here. 

Download info source: https://waterservices.usgs.gov/rest/IV-Service.html#Specifying

In [6]:
import sys
import time
import numpy as np
import pandas as pd
from pathlib import Path
sys.path.append(str(Path().absolute().parent))
import python_cs_functions as cs

### Config handling

In [2]:
# Specify where the config file can be found
# (relative path; presumably resolved against the kernel's working directory -- confirm when running from elsewhere)
config_file = '../0_config/config.txt'

In [3]:
# Get the required info from the config file
# data_path is the root that the metadata and basin folders below are joined onto
data_path = cs.read_from_config(config_file,'data_path')

# CAMELS-spat metadata
# NOTE(review): cs_meta_path is read from the 'cs_basin_path' key, the same key that is used
# for cs_basin_folder below -- confirm this is intended and not meant to be a separate metadata-path key
cs_meta_path = cs.read_from_config(config_file,'cs_basin_path')
cs_meta_name = cs.read_from_config(config_file,'cs_meta_name')
cs_unusable_name = cs.read_from_config(config_file,'cs_unusable_name')

# Basin folder (joined onto data_path; passed to the flow-download helpers further down)
cs_basin_folder = cs.read_from_config(config_file, 'cs_basin_path')
basins_path = Path(data_path) / cs_basin_folder

# Data period: start and end of the time window passed to the USGS download requests
# NOTE(review): the end time is read from a key named 'usgs_start_e' -- verify this key name against the config file
time_s = cs.read_from_config(config_file, 'usgs_start_t')
time_e = cs.read_from_config(config_file, 'usgs_start_e')

### Data loading

In [4]:
# CAMELS-spat metadata file
# cs_meta_path is rebound from the config value to a path under data_path; later cells reuse the rebound value when saving.
# NOTE(review): if data_path is a relative path, re-running this cell prefixes data_path a second time -- confirm data_path is absolute
cs_meta_path = Path(data_path) / cs_meta_path
cs_meta = pd.read_csv(cs_meta_path / cs_meta_name)

### Loop over sites and download the flow record

In [5]:
# General settings
# The IV and DV requests are pretty much identical apart from the URL, but defining them separately seems cleaner
# See: https://waterservices.usgs.gov/rest/IV-Service.html
# See: https://waterservices.usgs.gov/rest/DV-Service.html
# The *_var values are the variable codes sent with each request; the *_url values are the service endpoints
iv_var = '00060' # streamflow; 00065 for gage height
iv_url = 'https://nwis.waterservices.usgs.gov/nwis/iv/' # (i)nstantaneous (v)alues
dv_var = '00060' # streamflow; 00065 for gage height
dv_url = 'https://nwis.waterservices.usgs.gov/nwis/dv/' # (d)aily (v)alues

In [15]:
# Download the instantaneous (IV) and daily (DV) flow records for the USA stations only
dnf_iv = [] # Incomplete stations for the IV service, retained for easier printout and checking later
dnf_dv = [] # Same, for the DV service
for ix,row in cs_meta.iterrows():

    # Stations outside the USA are not requested from the USGS services
    if row.Country != 'USA':
        continue

    # Site ID (first element) and raw output path (third element) for each service
    iv_outputs = cs.prepare_flow_download_outputs(cs_meta, ix, basins_path, time='iv')
    dv_outputs = cs.prepare_flow_download_outputs(cs_meta, ix, basins_path, time='daily')
    site = iv_outputs[0]
    raw_path_iv = iv_outputs[2]
    raw_path_dv = dv_outputs[2]

    # Resume after interrupts: a station is skipped only when BOTH raw files already exist
    both_files_present = raw_path_iv.is_file() and raw_path_dv.is_file()
    if both_files_present:
        continue

    # Downloads; each call hands back the (possibly extended) list of incomplete stations
    dnf_iv = cs.download_usgs_values(iv_url, site, time_s, time_e, iv_var, raw_path_iv, dnf_iv)
    time.sleep(0.5) # pause for half a second so we don't bombard the server with requests
    dnf_dv = cs.download_usgs_values(dv_url, site, time_s, time_e, dv_var, raw_path_dv, dnf_dv)
    time.sleep(0.5)

Completed https://nwis.waterservices.usgs.gov/nwis/iv/ for 07068000
Completed https://nwis.waterservices.usgs.gov/nwis/dv/ for 07068000
Completed https://nwis.waterservices.usgs.gov/nwis/iv/ for 08202700
Completed https://nwis.waterservices.usgs.gov/nwis/dv/ for 08202700
Completed https://nwis.waterservices.usgs.gov/nwis/iv/ for 08267500
Completed https://nwis.waterservices.usgs.gov/nwis/dv/ for 08267500
Completed https://nwis.waterservices.usgs.gov/nwis/iv/ for 08269000
Completed https://nwis.waterservices.usgs.gov/nwis/dv/ for 08269000
Completed https://nwis.waterservices.usgs.gov/nwis/iv/ for 08271000
Completed https://nwis.waterservices.usgs.gov/nwis/dv/ for 08271000
Completed https://nwis.waterservices.usgs.gov/nwis/iv/ for 08324000
Completed https://nwis.waterservices.usgs.gov/nwis/dv/ for 08324000
Completed https://nwis.waterservices.usgs.gov/nwis/iv/ for 08377900
Completed https://nwis.waterservices.usgs.gov/nwis/dv/ for 08377900
Completed https://nwis.waterservices.usgs.gov/nw

Completed https://nwis.waterservices.usgs.gov/nwis/dv/ for 10310500
Completed https://nwis.waterservices.usgs.gov/nwis/iv/ for 10316500
Completed https://nwis.waterservices.usgs.gov/nwis/dv/ for 10316500
Completed https://nwis.waterservices.usgs.gov/nwis/iv/ for 10329500
Completed https://nwis.waterservices.usgs.gov/nwis/dv/ for 10329500
Completed https://nwis.waterservices.usgs.gov/nwis/iv/ for 10336645
Completed https://nwis.waterservices.usgs.gov/nwis/dv/ for 10336645
Completed https://nwis.waterservices.usgs.gov/nwis/iv/ for 10336660
Completed https://nwis.waterservices.usgs.gov/nwis/dv/ for 10336660
Completed https://nwis.waterservices.usgs.gov/nwis/iv/ for 10336740
Completed https://nwis.waterservices.usgs.gov/nwis/dv/ for 10336740
Completed https://nwis.waterservices.usgs.gov/nwis/iv/ for 10343500
Completed https://nwis.waterservices.usgs.gov/nwis/dv/ for 10343500
Completed https://nwis.waterservices.usgs.gov/nwis/iv/ for 10348850
Completed https://nwis.waterservices.usgs.gov/nw

Completed https://nwis.waterservices.usgs.gov/nwis/dv/ for 12115000
Completed https://nwis.waterservices.usgs.gov/nwis/iv/ for 12115500
Completed https://nwis.waterservices.usgs.gov/nwis/dv/ for 12115500
Completed https://nwis.waterservices.usgs.gov/nwis/iv/ for 12117000
Completed https://nwis.waterservices.usgs.gov/nwis/dv/ for 12117000
Completed https://nwis.waterservices.usgs.gov/nwis/iv/ for 12141300
Completed https://nwis.waterservices.usgs.gov/nwis/dv/ for 12141300
Completed https://nwis.waterservices.usgs.gov/nwis/iv/ for 12143600
Completed https://nwis.waterservices.usgs.gov/nwis/dv/ for 12143600
Completed https://nwis.waterservices.usgs.gov/nwis/iv/ for 12144000
Completed https://nwis.waterservices.usgs.gov/nwis/dv/ for 12144000
Completed https://nwis.waterservices.usgs.gov/nwis/iv/ for 12145500
Completed https://nwis.waterservices.usgs.gov/nwis/dv/ for 12145500
Completed https://nwis.waterservices.usgs.gov/nwis/iv/ for 12147500
Completed https://nwis.waterservices.usgs.gov/nw

Completed https://nwis.waterservices.usgs.gov/nwis/iv/ for 14236200
Completed https://nwis.waterservices.usgs.gov/nwis/dv/ for 14236200
Completed https://nwis.waterservices.usgs.gov/nwis/iv/ for 14301000
Completed https://nwis.waterservices.usgs.gov/nwis/dv/ for 14301000
Completed https://nwis.waterservices.usgs.gov/nwis/iv/ for 14303200
Completed https://nwis.waterservices.usgs.gov/nwis/dv/ for 14303200
Completed https://nwis.waterservices.usgs.gov/nwis/iv/ for 14305500
Completed https://nwis.waterservices.usgs.gov/nwis/dv/ for 14305500
Completed https://nwis.waterservices.usgs.gov/nwis/iv/ for 14306340
Completed https://nwis.waterservices.usgs.gov/nwis/dv/ for 14306340
Completed https://nwis.waterservices.usgs.gov/nwis/iv/ for 14306500
Completed https://nwis.waterservices.usgs.gov/nwis/dv/ for 14306500
Completed https://nwis.waterservices.usgs.gov/nwis/iv/ for 14308990
Completed https://nwis.waterservices.usgs.gov/nwis/dv/ for 14308990
Completed https://nwis.waterservices.usgs.gov/nw

## Update metadata file with daily data availability

In [7]:
# Names of the metadata columns that describe the daily (DV) flow record
meta_column_start = 'dv_flow_obs_availability_start'
meta_column_end   = 'dv_flow_obs_availability_end'
meta_column_miss  = 'flow_obs_missing_daily'

# Initialise every row with a -1 placeholder; rows that are not filled in later keep it.
# The start/end columns later receive date strings, so their placeholder is the string '-1':
# writing a string into an integer column is deprecated in recent pandas releases.
# The saved CSV is unaffected, because both the integer and the string are written as -1.
cs_meta[meta_column_start] = '-1'
cs_meta[meta_column_end] = '-1'
cs_meta[meta_column_miss] = -1 # stays numeric: this column receives counts

# Integer positions of the new columns, for position-based access (.iat)
c_start = cs_meta.columns.get_loc(meta_column_start)
c_end   = cs_meta.columns.get_loc(meta_column_end)
c_miss  = cs_meta.columns.get_loc(meta_column_miss)

In [46]:
# Fill the availability columns from the raw daily-values file of each USA station
for ix,row in cs_meta.iterrows():
    if row.Country != 'USA':
        continue

    site, _, raw_path_dv, _, _,_ = cs.prepare_flow_download_outputs(cs_meta, ix, basins_path, time='daily') # paths

    # The raw file is tab-separated with '#' comment lines; everything is read as text
    df = pd.read_csv(raw_path_dv, delimiter='\t', comment='#', dtype='str')
    # Keep only rows that belong to this site
    # (presumably this also drops the format-descriptor row below the column header -- TODO confirm)
    df = df[df.site_no == site]
    df = df.set_index(pd.to_datetime(df['datetime']))

    # Discharge value column: its name contains the variable code but not the '_cd' suffix
    flow_columns = [column for column in df.columns if '00060' in column and '_cd' not in column]
    if df.empty or not flow_columns:
        # Nothing to summarise: leave the placeholders in place instead of failing with an IndexError
        print(f'{site}: no daily discharge records found in {raw_path_dv}, placeholders kept')
        continue
    flow = flow_columns[0]

    # ix is the row LABEL yielded by iterrows, so use label-based access (.at) rather than positional .iat.
    # First/last index entries are used as start/end, i.e. the file is assumed to be in chronological order.
    # '%H:%M:%S' is spelled out because '%X' depends on the active locale.
    cs_meta.at[ix, meta_column_start] = df.index[0].strftime('%Y-%m-%d %H:%M:%S')
    cs_meta.at[ix, meta_column_end]   = df.index[-1].strftime('%Y-%m-%d %H:%M:%S')
    # Counts empty values among the rows present in the file
    cs_meta.at[ix, meta_column_miss]  = df[flow].isna().sum()
    print(f'{site}')

01013500
01022500
01030500
01031500
01047000
01052500
01054200
01055000
01057000
01073000
01078000
01118300
01121000
01123000
01134500
01137500
01139000
01139800
01142500
01144000
01162500
01169000
01170100
01181000
01187300
01195100
01333000
01350000
01350080
01350140
01365000
01411300
01413500
01414500
01415000
01423000
01434025
01435000
01439500
01440000
01440400
01451800
01466500
01484100
01485500
01486000
01487000
01491000
01510000
01516500
01518862
01532000
01539000
01542810
01543000
01543500
01544500
01545600
01547700
01548500
01549500
01550000
01552000
01552500
01557500
01567500
01568000
01580000
01583500
01586610
01591400
01594950
01596500
01605500
01606500
01613050
01620500
01632000
01632900
01634500
01638480
01639500
01644000
01658500
01664000
01666500
01667500
01669000
01669520
02011400
02011460
02013000
02014000
02015700
02016000
02017500
02018000
02027000
02027500
02028500
02038850
02046000
02051000
02051500
02053200
02053800
02055100
02056900
02059500
02064000
02065500
0

In [49]:
# Save the metadata file
# The target is cs_meta_path / cs_meta_name, the same location the metadata was loaded from,
# so the existing file is replaced by the version that includes the availability columns.
# index=False keeps the dataframe index out of the file.
cs_meta.to_csv(cs_meta_path / cs_meta_name, encoding='utf-8', index=False)

## Check sites for which we could not download any data

In [25]:
# Print which basins we need to check: every gauge collected in dnf_iv during the IV downloads
for entry in dnf_iv:
    print(f'No IV data downloaded for gauge {entry}')
print('End of list') # printed even when the list is empty, so an empty result is visible

No IV data downloaded for gauge 02342933
No IV data downloaded for gauge 02464360
No IV data downloaded for gauge 11230500
No IV data downloaded for gauge 11237500
End of list


Manual checks indicate that no Instantaneous Value (IV) discharge data is available for these stations. Checked on 2023-02-27.
- 02342933: https://waterdata.usgs.gov/monitoring-location/02342933/#period=P1Y
- 02464360: https://waterdata.usgs.gov/monitoring-location/02464360/#period=P1Y
- 11230500: https://waterdata.usgs.gov/monitoring-location/11230500/#period=P1Y
- 11237500: https://waterdata.usgs.gov/monitoring-location/11237500/#period=P1Y

In [24]:
# Print which basins we need to check: every gauge collected in dnf_dv during the DV downloads
for entry in dnf_dv:
    print(f'No DV data downloaded for gauge {entry}')
print('End of list') # printed even when the list is empty, so an empty result is visible

End of list


### Update the metadata file

In [26]:
# Country label written to every row of the unusable-basins table
country = 'USA'

In [27]:
# Reason written to every row of the unusable-basins table
reason = 'No Instantaneous Values of discharge available'

In [28]:
# Build the table of basins we cannot use: one row per gauge in dnf_iv,
# with the same country and reason repeated on every row
unusable_records = {
    'Country': country,
    'Station_id': dnf_iv,
    'Reason': reason,
}
cs_unusable = pd.DataFrame(unusable_records)

In [29]:
# Display the table for a visual check before it is written to disk
cs_unusable

Unnamed: 0,Country,Station_id,Reason
0,USA,2342933,No Instantaneous Values of discharge available
1,USA,2464360,No Instantaneous Values of discharge available
2,USA,11230500,No Instantaneous Values of discharge available
3,USA,11237500,No Instantaneous Values of discharge available


In [30]:
# Save the list of unusable basins in the metadata folder, without the dataframe index
# NOTE(review): this replaces any existing file of that name -- confirm no other workflow step has already written entries to it
cs_unusable.to_csv(cs_meta_path / cs_unusable_name, encoding='utf-8', index=False)