## Imports

In [1]:
# Imports -- Python 3.10
import numpy as np
import math
import pandas as pd
import matplotlib.pyplot as plt
import geopandas as gpd
import pyarrow as pa
import pyarrow.parquet as pq

from matplotlib import ticker
from importlib import reload
from dataretrieval import nwis
from datetime import timedelta

# Custom libs
import Src.func as fn
import Src.classes as cl
# Re-import the project modules so edits made to them on disk are picked up
# without having to restart the kernel.
reload(fn)
reload(cl)

# Disable pandas' chained-assignment (SettingWithCopy) warning for the whole notebook.
pd.options.mode.chained_assignment = None



## Sub_df 1: HMF Events Breakdown Dataset
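The following cells build a dataset with one row per HMF event for every input gauge (site number, per-site event number, start and end dates, summed HMF, and duration in days) and save it as a Parquet file.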

In [52]:
# Run configuration for the events dataset. `date_range` and `quantile` select
# which national-metrics workbook is read; `test_limit` is left at infinity.
date_range = 30
quantile = 90
test_limit = math.inf

# Load the site list from the national-metrics workbook for this configuration
metrics_path = f'Prelim_Data/_National_Metrics/National_Metrics_{date_range}_{quantile}.xlsx'
site_list_df = pd.read_excel(metrics_path, dtype=fn.DATASET_DTYPES)
#site_list_df = site_list_df[site_list_df['valid'] == True] # Will run on valid sites only once national metrics is updated with tidal data
site_list = site_list_df['site_no'].tolist()
print(f'# of sites: {len(site_list)}')

# of sites: 7858


In [53]:
def _extract_hmf_events(hmf_series, site):
    """Collapse a per-day HMF series into one record per event.

    An event is a run of consecutive rows whose '00060_Mean' value is > 0; it
    is closed by the next row whose value is exactly 0, or by the end of the
    series.

    Parameters
    ----------
    hmf_series : DataFrame with a 'datetime' column and a '00060_Mean' column.
    site : site identifier copied into every record.

    Returns
    -------
    list of dict with keys 'site_no', 'event' (1-based counter for this site),
    'start', 'end' (dates of the first and last positive row of the run),
    'hmf' (sum of the positive values) and 'duration' (number of rows).
    """
    events = []
    in_event = False
    start = last_day = None
    hmf = duration = 0

    def _record():
        return {'site_no': site, 'event': len(events) + 1, 'start': start,
                'end': last_day, 'hmf': hmf, 'duration': duration}

    # Iterating the two columns directly avoids building a Series per row,
    # which is what iterrows() does.
    for when, flow in zip(hmf_series['datetime'], hmf_series['00060_Mean']):
        try:
            if flow > 0:
                day = when.date()
                if in_event:
                    # Current event continuing
                    hmf += flow
                    duration += 1
                else:
                    # New event starting
                    start, hmf, duration, in_event = day, flow, 1, True
                # Remember the last day that actually belonged to the event so
                # the end date stays correct even if the record has a gap
                # before the row that closes the event.
                last_day = day
            elif flow == 0 and in_event:
                # Current event ending
                events.append(_record())
                in_event = False
            # NOTE(review): negative or NaN values match neither branch, so they
            # neither extend nor close an event -- confirm fn.convert_hmf never
            # produces them for this series.
        except Exception as e:
            # Best effort: report the bad row and keep scanning
            print(f'ERROR (inner): {e}')
            continue

    # Flush an event that is still open on the final row of the record
    if in_event:
        events.append(_record())

    return events


list_results = []

# This is a cut down version of single_site_analysis() focused only on event metrics -- ['11447650', '13335050']
for site_idx, site in enumerate(site_list):
    if site_idx == test_limit: break
    print(f'Working on site {site} ({site_idx+1}/{len(site_list)})')

    try:
        df = nwis.get_record(sites=site, service=fn.SERVICE, parameterCD=[fn.PARAM_CODE, fn.TIDAL_CODE], start=fn.DEFAULT_START, end=fn.DEFAULT_END)
        df = df.reset_index()

        # Nothing to analyse if the service returned no rows for this site
        if df.empty: continue

        # Some sites only report discharge under the radar-sensor column name
        if '00060_radar sensor_Mean' in df.columns and '00060_Mean' not in df.columns:
            df = df.rename(columns={'00060_radar sensor_Mean': '00060_Mean'})

        df = fn.merge_tidal(df)

        # Cropping to date range
        date_threshold = pd.to_datetime(fn.DEFAULT_END).date() - timedelta(days=365.25 * date_range)
        df = df[df['datetime'].dt.date >= date_threshold]

        threshold = fn.calc_threshold(df, (quantile / 100))
        _, hmf_series_cont = fn.filter_hmf(df, threshold)
        hmf_series_cont = fn.convert_hmf(hmf_series_cont, threshold)
        hmf_series_cont['00060_Mean'] = hmf_series_cont['00060_Mean'] * fn.CUBIC_FT_KM_FACTOR

        # An empty series simply yields no events
        site_events = _extract_hmf_events(hmf_series_cont, site)

    except Exception as e:
        print(f'ERROR (outer): {e}')
        continue

    list_results.extend(site_events)

df_results = pd.DataFrame(list_results)
pq.write_table(table=pa.Table.from_pandas(df_results), where=f'Prelim_Data/events_subdf_{date_range}_{quantile}.parquet', compression='snappy')

Working on site 01010000 (1/7858)
Working on site 01010070 (2/7858)
Working on site 01010500 (3/7858)
Working on site 01011000 (4/7858)
Working on site 01013500 (5/7858)
Working on site 01014000 (6/7858)
Working on site 01015800 (7/7858)
Working on site 01017000 (8/7858)
Working on site 01017290 (9/7858)
Working on site 01017550 (10/7858)
Working on site 01017960 (11/7858)
Working on site 01018009 (12/7858)
Working on site 01022500 (13/7858)
Working on site 01022840 (14/7858)
Working on site 01027240 (15/7858)
Working on site 01029200 (16/7858)
Working on site 01029500 (17/7858)
Working on site 01030350 (18/7858)
Working on site 01030500 (19/7858)
Working on site 01031300 (20/7858)
Working on site 01031500 (21/7858)
Working on site 01031510 (22/7858)
Working on site 01033000 (23/7858)
Working on site 01034000 (24/7858)
Working on site 01037000 (25/7858)
Working on site 01037380 (26/7858)
Working on site 01038000 (27/7858)
Working on site 01043500 (28/7858)
Working on site 01044550 (29/

In [None]:
# Example: read the saved events dataset back into a pandas DataFrame
events_path = f'Prelim_Data/events_subdf_{date_range}_{quantile}.parquet'
loaded_df = pq.read_table(source=events_path).to_pandas()
# Inspect with e.g. loaded_df[loaded_df['site_no'] == '13335050'],
# loaded_df.head(10) or loaded_df.tail(10)

## Sub_df 2: Annual Metrics Breakdown Dataset
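The following cells build a dataset with one row per gauge and water year holding the annual metrics (annual HMF, annual duration, event HMF, event duration, total events, and timing) and save it as a Parquet file.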

In [46]:
# Run configuration for the annual-metrics dataset. `date_range` and `quantile`
# select which national-metrics workbook is read; `test_limit` is left at infinity.
date_range = 30
quantile = 90
test_limit = math.inf

# Load the site list from the national-metrics workbook for this configuration
metrics_path = f'Prelim_Data/_National_Metrics/National_Metrics_{date_range}_{quantile}.xlsx'
site_list_df = pd.read_excel(metrics_path, dtype=fn.DATASET_DTYPES)

# Keep only the gauges whose 'valid' column is True
is_valid = site_list_df['valid'] == True
site_list_df = site_list_df[is_valid]
site_list = site_list_df['site_no'].tolist()
print(f'# of sites: {len(site_list)}')

# of sites: 4242


In [47]:
# Builds one frame of annual metrics per site, then concatenates them and
# writes the result to a Parquet file.
list_results = []

# ['11447650', '13335050']
for site_idx, site in enumerate(site_list):
    if site_idx == test_limit: break
    print(f'Working on site {site} ({site_idx+1}/{len(site_list)})')

    try:
        df = nwis.get_record(sites=site, service=fn.SERVICE, parameterCD=[fn.PARAM_CODE, fn.TIDAL_CODE], start=fn.DEFAULT_START, end=fn.DEFAULT_END)
        df = df.reset_index()

        # Nothing to analyse if the service returned no rows for this site
        if df.empty: continue

        # Some sites only report discharge under the radar-sensor column name
        if '00060_radar sensor_Mean' in df.columns and '00060_Mean' not in df.columns:
            df = df.rename(columns={'00060_radar sensor_Mean': '00060_Mean'})

        df = fn.merge_tidal(df)

        # Cropping to date range
        date_threshold = pd.to_datetime(fn.DEFAULT_END).date() - timedelta(days=365.25 * date_range)
        df = df[df['datetime'].dt.date >= date_threshold]

        threshold = fn.calc_threshold(df, (quantile / 100))
        hmf_series_defl, hmf_series_cont = fn.filter_hmf(df, threshold)
        hmf_years = fn.num_hmf_years(hmf_series_defl)

        # Annual HMF: work on a copy so hmf_series_cont itself is passed
        # untouched to the duration step below
        df_hmf = fn.convert_hmf(hmf_series_cont.copy(), threshold)
        flow = df_hmf['00060_Mean']
        # Values that are not >= 0 (negatives and NaN) count as zero flow
        df_hmf['00060_Mean'] = flow.where(flow >= 0, 0) * fn.CUBIC_FT_KM_FACTOR
        df_hmf = (
            df_hmf.set_index('datetime')
            .resample(fn.HYDRO_YEAR)
            .agg({'00060_Mean': ['sum']})
            .reset_index()
        )
        df_hmf.columns = ['water_year', 'annual_hmf']
        df_hmf['water_year'] = df_hmf['water_year'].dt.year

        # Duration and Event Metrics
        hmf_series_cont = fn.convert_hmf(hmf_series_cont, threshold)
        _, _, _, _, df_event_results = fn.calc_duration_intra_annual(hmf_series_cont, hmf_years)
        df_event_results = df_event_results.drop(columns=['00060_Mean']).rename(columns={'Year': 'water_year', 'total_days': 'annual_duration', 'duration': 'event_duration'})

        # Timing
        hmf_series_defl = fn.convert_hmf(hmf_series_defl, threshold)
        _, com_series = fn.calc_timing(hmf_series_defl)
        com_series = pd.DataFrame(com_series).reset_index().rename(columns={'year': 'water_year', 'day': 'timing'})

        # Aggregation: left-join so every water year of the annual HMF table is
        # kept. Missing values stay NaN ('NA' strings are not compatible with parquet).
        df_events_timing = pd.merge(df_event_results, com_series, on='water_year', how='left')
        df_temp = pd.merge(df_hmf, df_events_timing, on='water_year', how='left').reset_index(drop=True)
        df_temp['site_no'] = site
        df_temp = df_temp[['site_no', 'water_year', 'annual_hmf', 'annual_duration', 'event_hmf', 'event_duration', 'total_events', 'timing']]

        list_results.append(df_temp)

    except Exception as e:
        print(f'ERROR: {e}')
        continue

if list_results:
    # ignore_index gives the combined frame a clean 0..n-1 index instead of
    # repeated per-site row labels, so no stray index column is written to the
    # parquet file.
    df_results = pd.concat(list_results, ignore_index=True)
    pq.write_table(table=pa.Table.from_pandas(df_results), where=f'Prelim_Data/annual_metrics_subdf_{date_range}_{quantile}.parquet', compression='snappy')
else:
    # pd.concat raises on an empty list; leave any existing output file untouched
    df_results = pd.DataFrame()
    print('No annual metrics were produced; parquet file not written')

Working on site 02342500 (1/4242)
Working on site 02361000 (2/4242)
Working on site 02363000 (3/4242)
Working on site 02369800 (4/4242)
Working on site 02371500 (5/4242)
Working on site 02372250 (6/4242)
Working on site 02373000 (7/4242)
Working on site 02374500 (8/4242)
Working on site 02376500 (9/4242)
Working on site 02377570 (10/4242)
Working on site 02378500 (11/4242)
Working on site 02398300 (12/4242)
Working on site 02399200 (13/4242)
Working on site 02400100 (14/4242)
Working on site 02401000 (15/4242)
Working on site 02401390 (16/4242)
Working on site 02404400 (17/4242)
Working on site 02406500 (18/4242)
Working on site 02408540 (19/4242)
Working on site 02412000 (20/4242)
Working on site 02413300 (21/4242)
Working on site 02414500 (22/4242)
Working on site 02414715 (23/4242)
Working on site 02415000 (24/4242)
Working on site 02419000 (25/4242)
Working on site 02421000 (26/4242)
Working on site 02422500 (27/4242)
Working on site 02423130 (28/4242)
Working on site 02423380 (29/

In [None]:
# Example: read the saved annual-metrics dataset back into a pandas DataFrame
annual_path = f'Prelim_Data/annual_metrics_subdf_{date_range}_{quantile}.parquet'
loaded_df = pq.read_table(source=annual_path).to_pandas()

# Number of non-null entries in each column, per water year
years = loaded_df.groupby('water_year').count()
# Inspect with e.g. loaded_df[loaded_df['site_no'] == '13335050'],
# loaded_df.head(10), loaded_df.tail(10) or years