In [39]:
from pathlib import Path
import pandas as pd
import folium

In [40]:
def extract_wc_by_flag(
    timeseries_dir,
    start_date,
    end_date,
    variable_code,
    flag_types=("A", "P", "R"),
):
    """
    Collect the records of one water-quality variable from every CSV in a directory.

    Each ``*.csv`` file directly inside ``timeseries_dir`` is read; the file
    stem is used as the ``STREAM_ID``. Files that have no column for the
    requested variable, or no matching ``Flag_`` column, are skipped.

    Parameters
    ----------
    timeseries_dir : str or path-like
        Directory that holds the time-series CSV files.
    start_date, end_date : anything accepted by ``pandas.to_datetime``
        Inclusive bounds applied to the ``DateTime`` column. A date-only value
        such as ``"2020-12-31"`` is parsed as midnight at the start of that
        day, so later timestamps on that day fall outside the range.
    variable_code : str
        One of the supported codes (Q, WTemp, SpC, DO, pH, Turb, NO3, fDOM,
        DOC, PO4, Chla, PC).
    flag_types : iterable of str
        Quality flags to keep. The value is passed to ``set()``, so a bare
        string is split into its characters.

    Returns
    -------
    pandas.DataFrame
        Columns: STREAM_ID, DateTime, variable_code, value, flag.
        An empty frame with these columns is returned when nothing matches.

    Raises
    ------
    ValueError
        If ``variable_code`` is not one of the supported codes.
    """
    output_columns = ["STREAM_ID", "DateTime", "variable_code", "value", "flag"]

    timeseries_dir = Path(timeseries_dir)
    start_date = pd.to_datetime(start_date)
    end_date = pd.to_datetime(end_date)
    flag_types = set(flag_types)

    allowed_codes = {"Q", "WTemp", "SpC", "DO", "pH", "Turb", "NO3", "fDOM", "DOC", "PO4", "Chla", "PC"}
    if variable_code not in allowed_codes:
        raise ValueError(f"Unsupported variable_code '{variable_code}'. Allowed: {sorted(allowed_codes)}")

    # Supported codes that merely extend the requested one ("DOC" for "DO").
    # Their columns share the prefix and must not be mistaken for the request.
    longer_codes = tuple(
        code for code in allowed_codes
        if code != variable_code and code.startswith(variable_code)
    )

    frames = []
    for csv_path in sorted(timeseries_dir.glob("*.csv")):
        # Read the header only, to locate the value/flag columns cheaply.
        cols = pd.read_csv(csv_path, nrows=0).columns.tolist()

        value_col = _find_value_column(cols, variable_code, longer_codes)
        if value_col is None:
            continue

        flag_col = _find_flag_column(cols, value_col, variable_code)
        if flag_col is None:
            continue

        df = pd.read_csv(csv_path, usecols=["DateTime", value_col, flag_col])
        # Unparseable timestamps become NaT and fail both comparisons below.
        df["DateTime"] = pd.to_datetime(df["DateTime"], errors="coerce")
        keep = (
            (df["DateTime"] >= start_date)
            & (df["DateTime"] <= end_date)
            & df[flag_col].isin(flag_types)
        )
        if not keep.any():
            continue

        records = (
            df.loc[keep]
            .rename(columns={value_col: "value", flag_col: "flag"})
            .assign(STREAM_ID=csv_path.stem, variable_code=variable_code)
        )
        frames.append(records[output_columns])

    if not frames:
        return pd.DataFrame(columns=output_columns)

    return pd.concat(frames, ignore_index=True)


def _find_value_column(cols, variable_code, longer_codes):
    """Return the first data column that belongs to ``variable_code``, or None."""
    for col in cols:
        if col == "DateTime" or col.startswith("Flag_"):
            continue
        if not col.startswith(variable_code):
            continue
        # e.g. skip "DOC_mgL" when the request is "DO"
        if longer_codes and col.startswith(longer_codes):
            continue
        return col
    return None


def _find_flag_column(cols, value_col, variable_code):
    """Return the flag column paired with ``value_col``, or None if there is none."""
    exact_flag = f"Flag_{value_col}"
    if exact_flag in cols:
        return exact_flag

    # Fallback: any Flag_ column sharing the value column's suffix (e.g. "_C").
    # An empty suffix would match every Flag_ column, so it is not used.
    suffix = value_col.replace(variable_code, "", 1)
    if not suffix:
        return None
    for col in cols:
        if col.startswith("Flag_") and col.endswith(suffix):
            return col
    return None



In [41]:
# Example: keep only approved WTemp records between 2018-01-01 and 2020-12-31.
# Flag types: {"A": "Approved", "P": "Provisional", "R": "Rejected"}

filtered_df = extract_wc_by_flag(
    "data/02_waterquality_timeseries",
    start_date="2018-01-01",
    end_date="2020-12-31",
    variable_code="WTemp",
    flag_types=("A",),  # trailing comma matters: ("A") is the plain string "A", not a tuple
)

In [42]:
# Preview the filtered records (pandas shows only the first and last rows of a long frame).
filtered_df

Unnamed: 0,STREAM_ID,DateTime,variable_code,value,flag
0,STREAM-gauge-1616,2018-10-01 04:00:00,WTemp,13.800,A
1,STREAM-gauge-1616,2018-10-01 05:00:00,WTemp,13.775,A
2,STREAM-gauge-1616,2018-10-01 06:00:00,WTemp,13.700,A
3,STREAM-gauge-1616,2018-10-01 07:00:00,WTemp,13.600,A
4,STREAM-gauge-1616,2018-10-01 08:00:00,WTemp,13.550,A
...,...,...,...,...,...
2407583,STREAM-gauge-1772,2020-12-30 20:00:00,WTemp,4.300,A
2407584,STREAM-gauge-1772,2020-12-30 21:00:00,WTemp,4.300,A
2407585,STREAM-gauge-1772,2020-12-30 22:00:00,WTemp,4.300,A
2407586,STREAM-gauge-1772,2020-12-30 23:00:00,WTemp,4.300,A


In [59]:
# Unique stream IDs present in the filtered records, sorted so the list
# order is the same on every run (a bare set has no stable order).
filtered_gages = sorted(set(filtered_df["STREAM_ID"]))

Load the gage metadata (coordinates and site names) and plot all gages on a map, highlighting the ones that passed the filter

In [50]:
# Load the gage metadata table.
meta = pd.read_csv("data/01_metadata.csv")

# NOTE(review): the merge below is commented out, so `filtered_merged_df` is
# not created by this cell. Any later reference to that name fails on a fresh
# kernel unless the merge is re-enabled.
# # keep all filtered rows, attach metadata by STREAM_ID
# filtered_merged_df = filtered_df.merge(meta, on="STREAM_ID", how="left")

In [51]:
# List the metadata columns (IDs, site name, coordinates, state, drainage area).
meta.columns

Index(['STREAM_ID', 'SourceID', 'site name', 'source', 'latitude_wgs84',
       'longitude_wgs84', 'State Code', 'State', 'drainagearea_sqkm'],
      dtype='str')

In [73]:
def plot_gages_map(df, metadata=None):
    """
    Draw every gage from the metadata table on a folium map and highlight
    the gages that occur in ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Filtered records; only its ``STREAM_ID`` column is read. Gages whose
        ID appears in it are drawn as nearly opaque green markers, all other
        gages as faint gray markers.
    metadata : pandas.DataFrame, optional
        Gage table with the columns ``STREAM_ID``, ``site name``,
        ``latitude_wgs84`` and ``longitude_wgs84``. When omitted, the
        notebook-level ``meta`` table is used.

    Returns
    -------
    folium.Map
        Map centered on the mean coordinates of the plotted gages.
    """
    lat_col = "latitude_wgs84"
    lon_col = "longitude_wgs84"
    name_col = "site name"
    true_color = "green"
    false_color = "gray"

    if metadata is None:
        metadata = meta  # notebook-level metadata table

    # IDs to highlight come from the argument, not from a notebook global,
    # so the result does not depend on which cells ran before this one.
    highlighted_ids = set(df["STREAM_ID"])

    # One point per gage; rows without coordinates cannot be placed on the map.
    gages = (
        metadata[["STREAM_ID", name_col, lat_col, lon_col]]
        .dropna(subset=[lat_col, lon_col])
        .drop_duplicates(subset=["STREAM_ID"])
    )

    # Center the map on the average gage position.
    gage_map = folium.Map(
        location=[gages[lat_col].mean(), gages[lon_col].mean()],
        zoom_start=7,
        tiles="OpenStreetMap",
    )

    # Add a marker for every gage; filtered gages stand out by color and opacity.
    for stream_id, site_name, lat, lon in gages.itertuples(index=False, name=None):
        is_filtered = stream_id in highlighted_ids
        folium.CircleMarker(
            location=[lat, lon],
            radius=4,
            color=true_color if is_filtered else false_color,
            fill=True,
            fill_opacity=0.9 if is_filtered else 0.1,
            popup=f"{stream_id}<br>{site_name}",
        ).add_to(gage_map)

    return gage_map

In [74]:
# Pass the filtered records directly: `filtered_merged_df` is never created
# (the merge that would define it is commented out), so referencing it raises
# a NameError on a fresh kernel.
m = plot_gages_map(filtered_df)
m

In [83]:
# Print the number of distinct gages overall and among the filtered records.
# The filtered count gets its own name so the `filtered_gages` ID list defined
# earlier is not replaced by an integer.
total_gages = len(set(meta['STREAM_ID']))
n_filtered_gages = len(set(filtered_df['STREAM_ID']))

print(f"Total gages: {total_gages}\nFiltered gages: {n_filtered_gages}")

Total gages: 539
Filtered gages: 119
