## Ongoing Notes:
Key Problems:
1. Which dataset or datasets even contain the information we want?
2. How do we determine when measurements were taken in the WQP dataset?
3. How do we determine which param_codes to use via NWIS (there are ~10,000!)?
4. How do we match streamflow gauges to their closest water quality gauge, if one exists (lat/long data is available)?
5. How do we work with the extremely reduced number of sites that have these niche water quality metrics?<br>
    - It seems very unlikely that most sites, if any, will have all of the metrics listed in the proposal in one place.

...in other words, how do we spatially and temporally match streamflow data with water quality data?
<br><br>

TODO:
1. Determine relevant gauge counts for at least some of these metrics, per state
2. Determine type of data returned, time-series, or otherwise
<br><br>

Comments:<br>
How granular will the water quality data ultimately be? Is the focus on building a robust water quality profile for a few gauges in locations where EAR is viable, or on having a general idea of an entire watershed region's water quality?

## Initial Water Quality Data Exploration

In [1]:
#Python3.10
import os
import pandas as pd 
import numpy as np
import seaborn as sns
import geopandas as gpd
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
import contextily as cx
from importlib import reload
from typing import IO
from IPython.display import display
from collections import Counter
import warnings

from datetime import datetime, timedelta

# USGS Data retrieval tool
from dataretrieval import nwis, utils, codes

# Custom modules are imported in multiple locations to facilitate easy reloading when edits are made to their respective files
import Src.classes as cl
import Src.func as fn
# reload() re-executes each already-imported module so edits saved after the kernel started take effect without a restart
reload(cl)
reload(fn)

# Turn off pandas' chained-assignment check (the source of SettingWithCopyWarning) for the whole session.
# TODO: Look into the warning that this is disabling. It doesn't appear to be significant for the purposes of this code but should be understood
pd.set_option('mode.chained_assignment', None)

# To restore the pandas default behaviour, use instead:
#pd.set_option('mode.chained_assignment', 'warn')

In [5]:
# NOTE(review): '01578310' looks like a leftover alternate site number - confirm or remove
#'01578310'
test_aquifer = 'Central Valley aquifer system'

# Read the per-site metrics sheet, using the project's column dtypes.
df_sites = pd.read_excel(
    'Prelim_Data/_National_Metrics/National_Metrics_30_90.xlsx',
    sheet_name='site_metrics',
    dtype=fn.DATASET_DTYPES,
)
#df_sites = df_sites.dropna(subset=['within_aq'])
print(df_sites.shape[0])  # number of rows in the sheet

# Query NWIS qwdata for one example site over the 1990-10-01 .. 2020-09-30 window.
df, metadata = nwis.get_qwdata(
    sites='11447650',
    start='1990-10-01',
    end='2020-09-30',
)
print(metadata)

7914




NWIS_Metadata(url=https://nwis.waterdata.usgs.gov/nwis/qwdata?site_no=11447650&begin_date=1990-10-01&end_date=2020-09-30&qw_sample_wide=qw_sample_wide&agency_cd=USGS&format=rdb&pm_cd_compare=Greater+than&inventory_output=0&rdb_inventory_output=file&TZoutput=0&rdb_qw_attributes=expanded&date_format=YYYY-MM-DD&rdb_compression=value&submitted_form=brief_list)


  df = pd.read_csv(


In [7]:
# Show the NWIS_Metadata object currently bound to `metadata` (its repr includes the request URL)
print(metadata)

NWIS_Metadata(url=https://nwis.waterdata.usgs.gov/nwis/qwdata?site_no=11447650&begin_date=1990-10-01&end_date=2020-09-30&qw_sample_wide=qw_sample_wide&agency_cd=USGS&format=rdb&pm_cd_compare=Greater+than&inventory_output=0&rdb_inventory_output=file&TZoutput=0&rdb_qw_attributes=expanded&date_format=YYYY-MM-DD&rdb_compression=value&submitted_form=brief_list)


In [3]:
# Query NWIS qwdata for every site in df_sites and tally which columns come back,
# to see how widely each field / parameter code is reported.
site_list = df_sites['site_no'].to_list()
counted_sites = []   # sites whose query returned data
failed_sites = {}    # site -> error message for every query that raised
counter = Counter()  # column name -> number of site results it appeared in

for site in site_list:
    try:
        print(f'Trying site: {site}')

        # Suppress warnings for the request only, so warnings raised elsewhere stay visible.
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            # Loop-local names on purpose: rebinding `df` / `metadata` here would silently
            # replace the single-site results that other cells display.
            site_df, site_metadata = nwis.get_qwdata(sites=site, start='1990-10-01', end='2020-09-30')

        counter.update(site_df.columns)
        counted_sites.append(site)

    except Exception as e:
        # Best-effort scan: a query that raises (e.g. NWIS reports no data for the site)
        # is logged and remembered, and the loop moves on to the next site.
        failed_sites[site] = str(e)
        print(f'ERROR: {site} - {e}')

print(counter)
print(f'Total Sites: {len(counted_sites)}')

Trying site: 02339495
ERROR: 02339495 - No sites/data found using the selection criteria specified in url: https://nwis.waterdata.usgs.gov/nwis/qwdata?site_no=02339495&begin_date=1990-10-01&end_date=2020-09-30&qw_sample_wide=qw_sample_wide&agency_cd=USGS&format=rdb&pm_cd_compare=Greater+than&inventory_output=0&rdb_inventory_output=file&TZoutput=0&rdb_qw_attributes=expanded&date_format=YYYY-MM-DD&rdb_compression=value&submitted_form=brief_list
Trying site: 02342500
Trying site: 02361000
Trying site: 02361500
Trying site: 02363000
Trying site: 02364500
Trying site: 02369800
Trying site: 02371500
Trying site: 02372250
Trying site: 02372422
Trying site: 02372430
ERROR: 02372430 - No sites/data found using the selection criteria specified in url: https://nwis.waterdata.usgs.gov/nwis/qwdata?site_no=02372430&begin_date=1990-10-01&end_date=2020-09-30&qw_sample_wide=qw_sample_wide&agency_cd=USGS&format=rdb&pm_cd_compare=Greater+than&inventory_output=0&rdb_inventory_output=file&TZoutput=0&rdb_qw

In [4]:
# Raw column tallies, followed by the number of entries in the parameter-code lookup.
print(counter)
print(len(fn.WATER_QUALITY_PCODES))
seen = []

# Pass 1: columns that are known water quality parameter codes, in the order the
# counter first encountered them, with the tally recorded for each.
for column, tally in counter.items():
    if column not in fn.WATER_QUALITY_PCODES:
        continue
    print(f'{fn.WATER_QUALITY_PCODES[column]}: {tally} ({column})')
    seen.append(column)

# Pass 2: parameter codes from the lookup that never appeared in any result.
for pcode, label in fn.WATER_QUALITY_PCODES.items():
    if pcode in seen:
        continue
    print(f'{label}: 0 ({pcode})')
        

Counter({'agency_cd': 5236, 'site_no': 5236, 'sample_dt': 5236, 'sample_tm': 5236, 'sample_end_dt': 5236, 'sample_end_tm': 5236, 'sample_start_time_datum_cd': 5236, 'tm_datum_rlbty_cd': 5236, 'coll_ent_cd': 5236, 'medium_cd': 5236, 'project_cd': 5236, 'aqfr_cd': 5236, 'tu_id': 5236, 'body_part_id': 5236, 'hyd_cond_cd': 5236, 'samp_type_cd': 5236, 'hyd_event_cd': 5236, 'sample_lab_cm_txt': 5236, 'p00010': 4945, 'p00095': 4709, 'p00061': 4583, 'p30209': 4583, 'p00028': 4009, 'p00191': 3782, 'p00400': 3782, 'p82398': 3765, 'p00065': 3747, 'p30207': 3747, 'p00300': 3540, 'p71999': 3448, 'p84164': 3300, 'p00020': 3298, 'p00301': 3159, 'p00025': 3099, 'p90095': 3082, 'p00665': 3061, 'p00631': 2971, 'p00940': 2933, 'p00660': 2923, 'p00671': 2922, 'p71846': 2912, 'p00608': 2911, 'p00945': 2831, 'p00900': 2780, 'p00605': 2770, 'p50280': 2767, 'p00618': 2698, 'p00405': 2663, 'p00613': 2660, 'p71856': 2660, 'p00915': 2659, 'p00925': 2650, 'p71851': 2637, 'p80154': 2608, 'p00930': 2556, 'p00403': 