# Subset Canada HYDAT database to RHBN 2020 stations
The full HYDAT database is 1.1 GB. Subsetting saves disk space.

In [5]:
import sys
#import sqlite3
import pandas as pd
from pathlib import Path
sys.path.append(str(Path().absolute().parent))
import python_cs_functions as cs

### Config handling

In [6]:
# Relative path to the configuration file that holds this notebook's settings
config_file = "../0_config/config.txt"

In [7]:
# Look up every setting this notebook needs in the config file, in one pass.
# The config keys are listed in the same order as the variables they fill.
data_path, shps_path, hydat_url, rhbn_meta = (
    cs.read_from_config(config_file, config_key)
    for config_key in (
        'data_path',
        'ref_shps_path',
        'can_hydat_db_url',
        'can_rhbn_meta_url',
    )
)

### Define data location

In [8]:
# Folder that holds the RHBN data: <data_path>/<ref_shps_path>/RHBN-CAN
data_folder = Path(data_path, shps_path, 'RHBN-CAN')

In [21]:
# Find the HYDAT database. Assumes only 1 sqlite3 database exists; if several
# are present, the alphabetically first one is used.
hydat_db_candidates = sorted(data_folder.glob('*.sqlite3'))
if not hydat_db_candidates:
    # Fail with a clear message instead of an IndexError on the empty list
    raise FileNotFoundError('No .sqlite3 database found in {}'.format(data_folder))
# glob() returns paths that already start with data_folder. Keep only the file
# name, because the name is joined with data_folder again when the database is
# opened; joining the full path would duplicate a relative data_folder.
hydat_db_name = hydat_db_candidates[0].name

In [23]:
# Build the csv file name for the subset: take the last part of the HYDAT url,
# strip surrounding whitespace, and swap '.zip' for the subset suffix
hydat_zip_name = hydat_url.rsplit('/', 1)[-1].strip()
hydat_csv_name = hydat_zip_name.replace('.zip', '_RHBN_2020_subset.csv')

In [18]:
# Build the RHBN metadata file name: take the last part of the url, strip
# surrounding whitespace, and replace 'xlsx' with 'csv'
rhbn_xlsx_name = rhbn_meta.rsplit('/', 1)[-1].strip()
meta_name = rhbn_xlsx_name.replace('xlsx', 'csv')

### Subset HYDAT database to RHBN 2020 basins

In [24]:
# Load the RHBN 2020 station list from its csv file
rhbn_list_file = data_folder / meta_name
df = pd.read_csv(str(rhbn_list_file))

In [25]:
# Open a connection to the HYDAT sqlite database
hydat_db_file = data_folder / hydat_db_name
db = cs.connect_to_sqlite_database(hydat_db_file)

In [26]:
# Search terms for the database query.
# The 'Database checks' section further down shows the tests used to find the
# table and field names, and the format the values must have.
table = 'STATIONS'
field = 'STATION_NUMBER'
station_numbers = df['STATION_NUMBER']
values = station_numbers.tolist()  # station IDs as a plain Python list

In [27]:
# Construct the SQL query we need.
# Use the search terms defined above rather than repeating the literals, so the
# table and field names are specified in one place only.
query = cs.construct_query_from_dataframe_column(table, field, values)

In [28]:
# Run the query against the HYDAT database; the result (the station metadata
# for the RHBN stations) comes back as a dataframe
rhbn = cs.sql_query_to_dataframe(db, query)

In [29]:
# Compare the number of stations returned by the query with the number of
# stations in the RHBN 2020 list
n_found, n_listed = len(rhbn), len(df)
print('Found {} entries matching one of {} Reference Hydrometric Basins'.format(n_found, n_listed))

Found 1027 entries matching one of 1027 Reference Hydrometric Basins


In [32]:
# Write the subset to csv in the data folder, without the dataframe index
subset_csv_file = data_folder / hydat_csv_name
rhbn.to_csv(subset_csv_file, index=False)

In [33]:
# Release the database connection; the subset has been saved to csv
db.close()

In [11]:
# We'll keep the larger database for the moment, because we want to get flow observations out of it later

### Database checks
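Code used during development of the HYDAT functions. Kept for traceability reasons. Note: these cells use the database connection `db`, which is closed earlier in this notebook; re-open the database before re-running them.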

In [20]:
# List every table in the database; the one we want is 'STATIONS'
tables = cs.find_all_table_names(db)

('STATIONS',)
('CONCENTRATION_SYMBOLS',)
('SED_SAMPLES_PSD',)
('ANNUAL_INSTANT_PEAKS',)
('STN_DATUM_UNRELATED',)
('DATA_SYMBOLS',)
('SED_VERTICAL_LOCATION',)
('STN_DATA_COLLECTION',)
('PEAK_CODES',)
('SED_DATA_TYPES',)
('MEASUREMENT_CODES',)
('SED_VERTICAL_SYMBOLS',)
('DATA_TYPES',)
('DLY_FLOWS',)
('STN_REMARKS',)
('STN_DATUM_CONVERSION',)
('AGENCY_LIST',)
('SED_DLY_SUSCON',)
('STN_OPERATION_SCHEDULE',)
('STN_DATA_RANGE',)
('PRECISION_CODES',)
('SED_DLY_LOADS',)
('DLY_LEVELS',)
('OPERATION_CODES',)
('STN_REGULATION',)
('DATUM_LIST',)
('ANNUAL_STATISTICS',)
('VERSION',)
('REGIONAL_OFFICE_LIST',)
('SAMPLE_REMARK_CODES',)
('STN_REMARK_CODES',)
('SED_SAMPLES',)
('STN_STATUS_CODES',)


In [21]:
# Find the column headers of the STATIONS table;
# 'STATION_NUMBER' is the column we need to subset on
table_name = 'STATIONS'
headers = cs.find_table_headers(db, table_name)

['STATION_NUMBER', 'STATION_NAME', 'PROV_TERR_STATE_LOC', 'REGIONAL_OFFICE_ID', 'HYD_STATUS', 'SED_STATUS', 'LATITUDE', 'LONGITUDE', 'DRAINAGE_AREA_GROSS', 'DRAINAGE_AREA_EFFECT', 'RHBN', 'REAL_TIME', 'CONTRIBUTOR_ID', 'OPERATOR_ID', 'DATUM_ID']


In [22]:
# Look at a single row to find the format in which STATION_NUMBER must be
# specified: station IDs are needed as '01AA002', INCLUDING the apostrophes
contents = cs.find_table_contents(db, 'STATIONS', to_screen=False)
contents[0]

('01AA002',
 'DAAQUAM (RIVIERE) EN AVAL DE LA RIVIERE SHIDGEL',
 'QC',
 '6',
 'D',
 None,
 46.557498931884766,
 -70.08110809326172,
 598.0,
 None,
 0,
 0,
 740,
 740,
 405)