# Data download
This notebook downloads the data needed to run workflow steps 1 to 5 for a test basin (in Canada or in the USA).

# Modules, paths, variables & functions
Paths & variables are the only elements you should need to modify.

In [5]:
# Import general required modules
import os, sys
import numpy as np
import xarray as xr
import pandas as pd
import geopandas as gpd
from datetime import datetime,date
from pprint import pprint

# Add scripts to the system path
sys.path.append('../scripts')

# Set up logging, configured for this workflow (see utilities.py)
from utilities import setup_logging, read_settings
setup_logging()
# Set up logging for this notebook
import logging
logger = logging.getLogger()

%load_ext autoreload
%autoreload 2

2023-08-15 15:57:31,338 - root - INFO - Logging setup complete. Log file: /Users/drc858/GitHub/data_driven_forecasting_workflow/logs/data_driven_forecasting_20230815_155731.log


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [6]:
# Load the workflow configuration for the North America domain from its YAML file;
# log_settings=True is passed so the settings are also written to the log
settings = read_settings(
    '../settings/config_North_America.yaml',
    log_settings=True,
)

# Pretty-print the loaded settings for a quick visual check
pprint(settings)

2023-08-15 15:57:33,904 - root - INFO - Settings logged from ../settings/config_North_America.yaml


{'SWE_obs_path': '/Users/lla068/Documents/data_driven_forecasting/data/snow_obs/North_America/NorAmSnow_1979_2022.nc',
 'basins_shp_path': '/Users/lla068/Documents/data_driven_forecasting/basins/North_America/NorAm_unregulated_basins.shp',
 'domain': 'North_America',
 'nival_glacial_basins_shp_path': '/Users/lla068/Documents/data_driven_forecasting/scripts/HPC_workflow/output_data/North_America/v1/NorAm_nival_glacial_basins_outlines.shp',
 'output_data_path': '/Users/lla068/Documents/data_driven_forecasting/scripts/HPC_workflow/output_data/North_America/v1/',
 'plots_path': '/Users/lla068/Documents/data_driven_forecasting/scripts/HPC_workflow/output_plots/North_America/v1/',
 'precip_obs_path': '/Users/lla068/Documents/data_driven_forecasting/data/met_obs/North_America/SCDNA_v1.1.nc4',
 'streamflow_obs_path': '/Users/lla068/Documents/data_driven_forecasting/data/streamflow_obs/North_America/NorAmQobs_1979_2021.nc',
 'volumes_obs_path': '/Users/lla068/Documents/data_driven_forecasting/s

In [7]:
# Set user-specified variables
# Dates are 'YYYY-MM-DD' strings; they are passed unchanged to the data requests
# below and are also embedded in the output file name.
start_date = '1979-01-01' # start date for data extraction
end_date = '2021-12-31' # end date for data extraction
# The basin code is kept as a string so that its leading zero is preserved.
test_basin_id = '03339000' # USA test basin code from the USGS, for which we want to download test data

# Record the chosen values in the log at DEBUG level
logger.debug(f'start_date: {start_date}')
logger.debug(f'end_date: {end_date}')
logger.debug(f'test_basin_id: {test_basin_id}')


You can explore the basins:
- in Canada: https://wateroffice.ec.gc.ca/map/index_e.html?type=historical
- in the USA: https://maps.waterdata.usgs.gov/mapper/index.html

# Streamflow data download for USA test basin 
This section downloads USGS streamflow data using this [Python package](https://github.com/USGS-python/dataretrieval) and saves it into a NetCDF. Only stations with limited regulation, from the USGS Hydro-Climatic Data Network 2009 (HCDN–2009) ([Lins, 2012](https://pubs.usgs.gov/fs/2012/3047/)), are kept. The HCDN-2009 data are available [here](https://water.usgs.gov/GIS/metadata/usgswrd/XML/gagesII_Sept2011.xml).

Decisions:
- We extract data for 1979-2021 as this is when we have SWE data. [Brunner et al. (2020)](https://hess.copernicus.org/articles/24/3951/2020/) download data from 1981-2018 as data for this period were available for most stations in the dataset.

In [None]:
# Import specific modules required for this sub-section
import dataretrieval.nwis as nwis

In [None]:
# Set required data paths
# NetCDF output path and file name, built from the test basin ID and the extraction period.
# The basin ID variable is `test_basin_id` (defined with the user-specified variables);
# the previous name `test_site_id` was never defined and raised a NameError.
Q_netcdf_output = f"/Users/lla068/Documents/data_driven_forecasting/data/streamflow_obs/USGS_{test_basin_id}_Qdata_{start_date}-{end_date}.nc"

In [None]:
# Request the daily-values ('dv') record of parameter 00060 (discharge, in ft3/s)
# for the test basin over the user-specified extraction period
df = nwis.get_record(
    sites=test_basin_id,
    service='dv',
    start=start_date,
    end=end_date,
    parameterCd='00060',
)

display(df)

In the column "00060_Mean_cd", A stands for Approved for publication and P stands for Provisional data subject to revision. 

Some sites may measure from different locations, such as the right bank and the left bank of a river. Data for the different locations of a site will appear in separate columns - e.g., '00060_loc 1_Mean', '00060_loc 2_Mean', etc.

In [None]:
# Save this as NetCDF with right format
# Download regulation dataset & shapefiles

# SWE data download for USA test basin
Vincent Vionnet's script on GitHub?

# Streamflow data download for Canada test basin
Kasra & Shervan's script to download sqlite HYDAT data as csv files on GitHub?

You can download the Canada HYDAT National Water Data Archive manually from [here](https://www.canada.ca/en/environment-climate-change/services/water-overview/quantity/monitoring/survey/data-products-services/national-archive-hydat.html). Click on "Download the HYDAT database". Once on that page, download:
- the basin shapefiles, by opening "HydrometricNetworkBasinPolygons/", downloading the zip files "01.zip" to "11.zip" and unzipping them. Each number corresponds to a different HYDAT region of Canada.
- the regulation data by opening "RHBN/" and by downloading "RHBN_Metadata.xlsx".

# Re-format

EASYMORE provides code to put CSV files or data frames into a NetCDF file.
At least one CSV file should be prepared that includes the station information, in this format:
https://github.com/ShervanGharari/EASYMORE/blob/main/data/station_data/station_data.csv

If you have flags, they can be provided in a separate file:
https://github.com/ShervanGharari/EASYMORE/blob/main/data/station_data/station_data_flag.csv

If you have additional station information, it can be provided in another separate file (which can hold as many station attributes as you like):
https://github.com/ShervanGharari/EASYMORE/blob/main/data/station_data/station_info.csv

An example is available here:
https://github.com/ShervanGharari/EASYMORE/blob/main/examples/Chapter1_E7.ipynb

# SWE data download for Canada test basin

You can download the [Canadian historical Snow Water Equivalent dataset](https://zenodo.org/record/6638382) manually from Zenodo. Make sure to select the latest available version. You can find more information about this dataset in [Vionnet et al. (2021)](https://essd.copernicus.org/articles/13/4603/2021/).

In [None]:
# Move below to HPC script

# All HCDN-2009 unregulated stations

In [None]:
# Set required data paths
# NOTE(review): both paths are absolute and user-specific — adjust them before running.
# Point shapefile of gauges; its 'HCDN_2009' column is used below to select the HCDN-2009 stations
hcdn_shp = '/Users/lla068/Documents/data_driven_forecasting/basins/USA/gagesII_9322_point_shapefile/gagesII_9322_sept30_2011.shp' # HCDN-2009 shapefile
# Destination of the NetCDF file written at the end of this section
netcdf_output = "/Users/lla068/Documents/data_driven_forecasting/data/USA/streamflow_obs/USGS_HCDN-2009_Qdata_1979-2021.nc" # NetCDF output path and file name

In [None]:
# Read the shapefile at hcdn_shp into a GeoPandas dataframe
# (presumably one row per gauge — TODO confirm against the shapefile)
hcdn_gdf = gpd.read_file(hcdn_shp)

In [None]:
# Keep only the rows whose 'HCDN_2009' attribute equals 'yes'
hcdn_2009_gdf = hcdn_gdf[hcdn_gdf['HCDN_2009'].eq('yes')]

display(hcdn_2009_gdf)

# Station IDs (STAID column) of the retained gauges, as a Python list
hcdn_2009_sites = list(hcdn_2009_gdf['STAID'].values)

In [None]:
# Extract USGS Q data

# One approved-flow Series per station is collected here and combined in a
# single concat after the loop. This avoids re-copying the growing table on
# every iteration and does not depend on the first station having data.
hcdn_2009_Q_series = []

# loop over sites
for s in hcdn_2009_sites:

    # get daily discharge data (in ft3/s); non-approved values are filtered out further down
    df = nwis.get_record(sites=s, service='dv', start=start_date, end=end_date, parameterCd='00060')

    # if no data go to next station
    if df.empty:
        print(s,'has no data')
        continue

    # some sites may measure from different locations, such as the right bank and the left bank of a river (defined by loc)
    if '00060_Mean_cd' in df.columns:
        column_kwarg = '00060_Mean_cd'
        column_data = '00060_Mean'
    elif '00060_loc 1_Mean_cd' in df.columns:
        column_kwarg = '00060_loc 1_Mean_cd'
        column_data = '00060_loc 1_Mean'
    else:
        # neither expected column layout is present: skip the station instead of
        # reusing the column names left over from a previous iteration
        print(s,'has no recognised discharge column')
        continue

    # in the column "00060_Mean_cd", A stands for Approved for publication and P stands for Provisional data subject to revision
    # we only keep the approved values
    df = df.loc[df[column_kwarg] == 'A']

    # keep the flow values only, labelled with the station ID
    hcdn_2009_Q_series.append(df[column_data].rename(s))

# Combine all stations into one table (dates as index, one column per station ID);
# fall back to an empty table when no station returned usable data
if hcdn_2009_Q_series:
    hcdn_2009_Q_df = pd.concat(hcdn_2009_Q_series, axis=1)
else:
    hcdn_2009_Q_df = pd.DataFrame()

In [None]:
# Inspect the combined table built by the extraction loop (one column per station ID)
display(hcdn_2009_Q_df)

In [None]:
# change date format to datetime64[ns]
# The index is replaced in place on hcdn_2009_Q_df; the values themselves are untouched.
# NOTE(review): if the downloaded index carries a timezone, pd.to_datetime keeps it — confirm this is intended.
hcdn_2009_Q_df.index = pd.to_datetime(hcdn_2009_Q_df.index)

display(hcdn_2009_Q_df)

In [None]:
# Wrap the flow table in an xarray DataArray named 'Flow', with the table's
# index as the 'time' coordinate and its column labels as 'Station_ID'
hcdn_2009_Q_da = xr.DataArray(
    data=hcdn_2009_Q_df,
    coords={
        'time': hcdn_2009_Q_df.index.values,
        'Station_ID': hcdn_2009_Q_df.columns.values,
    },
    dims=['time','Station_ID'],
    name='Flow',
    attrs={
        'long_name':'Daily flow',
        'units':'ft3/s',
        'info':'Data extracted by Louise Arnal (USask) from USGS using https://github.com/USGS-python/dataretrieval. Only stations from the USGS Hydro-Climatic Data Network 2009 (HCDN–2009) were kept (Lins, 2012; https://pubs.usgs.gov/fs/2012/3047/)',
    },
)

display(hcdn_2009_Q_da)

In [None]:
# Add stations lat/lon information
# The columns are selected with a list rather than a set: pandas deprecated set
# indexers in 1.5 and raises a TypeError for them from 2.0, and a set has no
# defined column order.
hcdn_2009_gdf_latlon = hcdn_2009_gdf[['STAID','LAT_GAGE','LNG_GAGE']].set_index('STAID')
# Look up the gauge coordinates in the same order as the stations of the DataArray
lats = hcdn_2009_gdf_latlon.loc[hcdn_2009_Q_da.Station_ID.values,'LAT_GAGE'].values
lons = hcdn_2009_gdf_latlon.loc[hcdn_2009_Q_da.Station_ID.values,'LNG_GAGE'].values
# Attach them as extra coordinates along the Station_ID dimension
hcdn_2009_Q_da = hcdn_2009_Q_da.assign_coords(lat=("Station_ID",lats),lon=("Station_ID",lons))
hcdn_2009_Q_da.lat.attrs['long_name'] = 'latitude'
hcdn_2009_Q_da.lat.attrs['units'] = 'degrees_north'
hcdn_2009_Q_da.lon.attrs['long_name'] = 'longitude'
hcdn_2009_Q_da.lon.attrs['units'] = 'degrees_east'

display(hcdn_2009_Q_da)

In [None]:
# Save data to NetCDF
# Writes the 'Flow' DataArray (with its lat/lon coordinates) to the file at netcdf_output, in NETCDF4 format
hcdn_2009_Q_da.to_netcdf(netcdf_output, format="NETCDF4")

# All HYDAT basins

In [None]:
# Folder holding the HYDAT streamflow files; the files in it are read with pd.read_csv below.
# NOTE(review): absolute, user-specific path — adjust before running. The trailing "/" is kept as in the original.
HYDAT_Q_path = "/Users/lla068/Documents/data_driven_forecasting/data/streamflow_obs/Canada/HYDAT_sqlite3_20220418/"

In [None]:
# List the station files found in HYDAT_Q_path, in sorted order.
# Hidden entries (e.g. macOS '.DS_Store') and sub-directories are ignored so
# that they are not parsed as station data.
stations_list = sorted(
    f for f in os.listdir(HYDAT_Q_path)
    if not f.startswith('.') and os.path.isfile(os.path.join(HYDAT_Q_path, f))
)

# Per-station datasets, merged once after the loop instead of re-merging the
# growing dataset at every iteration
station_datasets = []

for x in stations_list:

    # the first CSV column is used as the (date) index
    df = pd.read_csv(os.path.join(HYDAT_Q_path, x), index_col=0)

    # change date format to datetime64[ns]
    df.index = pd.to_datetime(df.index)

    # Pandas dataframe to xarray DataSet
    # NOTE(review): the rename assumes the index is unnamed (so that its dimension
    # is called 'index') and that the columns are 'FLOW' and 'FLAG' — confirm against the CSV files
    ds = df.to_xarray().rename({'index':'time','FLOW':'Flow','FLAG':'Flag'})
    # the station ID is the file name without its extension
    ds = ds.expand_dims(dim={'Station_ID':[os.path.splitext(x)[0]]}, axis=1)
    ds.Flow.attrs['long_name'] = 'Daily flow'
    ds.Flow.attrs['units'] = 'm3/s'
    ds.Flow.attrs['info'] = 'Data extracted by Shervan Gharari (USask), using scripts from Kasra Keshavarz (USask), and reformatted by Louise Arnal (USask).'
    ds.Flag.attrs['info'] = 'Flag attached to HYDAT data. A (Partial): calculation for daily data is made with incomplete daily record. B (Ice): ice cover observed at the time of measurement. D (dry): conditions of the river dry at the time of measurement. E (estimate): observation is an estimate only.'

    station_datasets.append(ds)

# merge datasets for all stations (gives an empty Dataset if no station file was found)
HYDAT_Q_ds = xr.merge(station_datasets)

display(HYDAT_Q_ds)

# TODO: save to netcdf

In [None]:
# select data for specific time range?
# add lat/lon info