In [None]:
import datetime, random, time
import plotly

import pandas as pd
import plotly.graph_objects as go
import plotly.io as pio

In [None]:
# Notebook display setup.
# Use the public IPython.display module: reaching `display` through
# IPython.core.display has been deprecated since IPython 7.14. The alias name
# `ipnb_display` is kept so any other cell that uses it keeps working.
import IPython.display as ipnb_display
# Stretch the classic-notebook page container to the full browser width.
ipnb_display.display( ipnb_display.HTML("<style>.container { width:100% !important; }</style>") )
# matplotlib.rcParams['figure.figsize'] = [8, 5]
# Let pandas show large frames without truncating rows/columns or wrapping early.
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
# Render floats in fixed-point notation with thousands separators.
pd.options.display.float_format = '{:,f}'.format

# Register a plotly template that sets the figure height to 900 and make it the
# default template for every figure created after this cell runs.
pio.templates["ians_template"] = go.layout.Template(
  layout = go.Layout(
    height = 900
  )
)
pio.templates.default = "ians_template"

In [None]:
from ParFlowStreamVerification_Large import *
import climata.usgs as usgs

In [None]:
# Stations map: CSV attribute table handed to the project's mapping loader.
# NOTE(review): the path is relative, so the CSV must be in the kernel's working directory.
stations_map_path = "NWM_Gage_Adjustments_attribute_table_20200510.csv"
# load_station_mapping_as_DataFrame is not defined in this notebook (presumably it
# comes from the ParFlowStreamVerification_Large star import — confirm). The result
# is read later through its STNID column.
station_map = load_station_mapping_as_DataFrame( stations_map_path )

In [None]:
def get_multiple_data( station_ids, parameter_id=default_parameter_id, start_date=None, end_date=None, date_range=None ):
  """
  Download daily values for many stations and return them as one DataFrame.

  Stations are requested from climata's usgs.DailyValueIO in batches, and every
  (station, date, value) record is flattened into a single table.

  Parameters
    station_ids  : iterable of station identifiers; each one is normalised with
                   station_id_cast before use.
    parameter_id : parameter identifier, normalised with parameter_id_cast.
    start_date,
    end_date     : bounds of the period to request. Only used when date_range
                   is None, in which case both are required.
    date_range   : alternative to start_date/end_date; a list/tuple of dates or
                   a pandas DatetimeIndex. Only its earliest and latest entries
                   are used, as the bounds of the request. Takes precedence over
                   start_date/end_date when both are supplied.

  Returns
    pandas.DataFrame with columns "station", "date" and the cast parameter id,
    sorted by station then date. The frame is empty (not an error) when the
    service returns no records.

  Raises
    ValueError : neither a complete start/end pair nor date_range was given, or
                 the resulting date range is empty.
    TypeError  : date_range is not a list, tuple or DatetimeIndex.
  """
  # Check proper usage of date parameters
  if (start_date is None or end_date is None) and date_range is None:
    raise ValueError("get_multiple_data must use either both start and end dates, or date_range")

  # Properly cast station and parameter ids
  station_ids = [ station_id_cast( station_id ) for station_id in station_ids ]
  parameter_id = parameter_id_cast( parameter_id )

  # Create date-range
  # From provided time-period parameters
  if date_range is None:
    date_range_list = pd.date_range( start=start_date, end=end_date ).tolist()
  # From provided pandas DatetimeIndex (referenced through `pd`, the only pandas
  # name this notebook imports); sorted so the first/last entries are the bounds
  elif isinstance( date_range, pd.DatetimeIndex ):
    date_range_list = date_range.sort_values().tolist()
  # From provided list (or tuple) of dates
  elif isinstance( date_range, (list, tuple) ):
    date_range_list = sorted( date_range )
  else:
    raise TypeError(f"date_range must be a list, tuple or pandas DatetimeIndex, not {type(date_range).__name__}")

  # Without at least one date there are no bounds to request
  if not date_range_list:
    raise ValueError("get_multiple_data needs a non-empty date range")

  using_start_date = date_range_list[0]
  using_end_date = date_range_list[-1]

  # Generator to download data in batches of stations
  batch_size = 500
  batched_station_requests = (
    usgs.DailyValueIO(
      start_date = using_start_date,
      end_date   = using_end_date,
      station    = station_ids[batch_station_ids_index:batch_station_ids_index+batch_size],
      parameter  = parameter_id
    )
    for batch_station_ids_index in range(0,len(station_ids),batch_size)
  )

  # Original author's note: climata.usgs.DailyValueIO claims to be iterable, but in
  # practice there is nothing to iterate over...
  # Note the IO object cannot be exhausted
  # (i.e., it can be iterated over multiple times to get the different aspects of the request(s) from it)

  # Flatten data into list of dictionaries with the date and value extracted
  # Note this is a generator, which *CAN* be exhausted. Only use *ONCE*!
  flattened_data_generator = (
    {
      "station"                                : station_id_cast(request.site_code),
      "date"                                   : date_value_object.date,
      parameter_id_cast(request.variable_code) : date_value_object.value
    }
    for station_requests in batched_station_requests
    for request in station_requests
    for date_value_object in request.data
  )

  # Create DataFrame from the flattened generator.
  # An empty result is returned as an empty frame rather than raised as an error.
  station_data = pd.DataFrame(
    columns = ["station", "date", parameter_id],
    data    = flattened_data_generator,
  ).sort_values( ["station", "date"] )

  return station_data

In [None]:
# date-range: the `periods` consecutive days ending on end_date
# (pd.date_range uses daily frequency by default).
end_date = "2010-12-31"
periods = 365
date_range = pd.date_range(end=end_date, periods=periods).tolist()
# Note: start_date is taken from the generated range, while end_date stays the plain string above.
start_date = date_range[0]

# Station ids to query, read from the STNID column of the station mapping.
stations = station_map.STNID.values

In [None]:
# Row count if every station had exactly one value for every day in the range.
print( f"expect {len(date_range) * len(stations)} rows" )

In [None]:
# Probe loop kept for reference (get_multiple_data takes a single `parameter_id`):
# for pid in range(0,101):
#   get_multiple_data( station_ids=[station_map.STNID[0]], parameter_id=pid, date_range=date_range)

# Time the bulk download for every station over the whole date range.
start = time.perf_counter()
all_station_data = get_multiple_data( station_ids=stations, date_range=date_range)
end = time.perf_counter()

elapsed = end - start
print( f"elapsed {elapsed}s" )
# get_multiple_data may return an empty frame; avoid dividing by zero in that case.
if len(all_station_data) > 0:
  print( f"elapsed {elapsed/len(all_station_data)} s/station-day" )
else:
  print( "elapsed n/a s/station-day (no rows returned)" )

In [None]:
# Number of rows actually downloaded (compare with the "expect ... rows" figure printed earlier).
len( all_station_data )