In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl
import os
import glob

# webscraping
import json
import urllib
from bs4 import BeautifulSoup
from urllib.request import urlopen
import os

# plotting / mapmaking
import geopandas as gpd
from geospatial_functions import get_background_map
import rasterio
from rasterio.plot import show as rioshow

In [3]:
# Project layout is resolved relative to the notebook's working directory:
# the Data and GIS folders live next to (i.e. in the parent of) that directory.
path = os.getcwd()
home_path = os.path.dirname(path)
# os.path.join uses the separator of the running OS, so the notebook is not
# tied to Windows-style "\\" paths.
data_folder = os.path.join(home_path, 'Data')
gis_folder = os.path.join(home_path, 'GIS')

#### From the hydrological website we can extract the hydrological units for the basin
[USGS Watershed Boundary Dataset (WBD) for 2-digit Hydrologic Unit - 02 (published 20230306) GeoPackage](https://prd-tnm.s3.amazonaws.com/StagedProducts/Hydrography/WBD/HU2/GPKG/WBD_02_HU2_GPKG.zip) using [USGS TNM Download (v2.0)](https://apps.nationalmap.gov/downloader/#/)
![map](Figures/wanted_hydrological_areas.png)

#### The ids shown on the map can be loaded in

In [6]:
# Read the WBDHU8-hudson GeoPackage from the GIS folder. The path is built with
# os.path.join so the file is found regardless of the OS path separator.
gdf_wbdhu = gpd.read_file(os.path.join(gis_folder, "WBDHU8-hudson.gpkg"), driver="GPKG", crs="EPSG:4326")
# The huc8 codes are joined into one comma-separated string, the form in which
# they are later inserted into the `huc=` parameter of the query URL.
huc8_ids = ",".join(gdf_wbdhu.huc8.values)
huc8_ids

'02020002,02020007,02020003,02020004,02020001,02020008,02030101,02020006,02020005'

After analysis, we decided to use only the northern Hudson:

In [17]:
# Keep only the huc8 codes at these row positions of gdf_wbdhu (the northern
# Hudson selection) and rebuild the comma-separated id string from them.
# NOTE(review): the selection is positional, so it relies on the row order of
# the GeoPackage staying the same.
huc8_ids = ",".join(gdf_wbdhu["huc8"].iloc[[0, 2, 3, 4, 8]].to_numpy())
huc8_ids

'02020002,02020003,02020004,02020001,02020005'

#### These ids are then used to query the USGS water services

In [18]:
# Build the REST query for the USGS daily-values endpoint, requesting the RDB
# (text) format for all sites inside the selected HUC8 units.
station_id_rest_query = f"https://waterservices.usgs.gov/nwis/dv/?format=rdb&huc={huc8_ids}&parameterCd=00060&siteType=ST&siteStatus=all"
# The context manager closes the HTTP connection as soon as the body is read,
# and the timeout keeps the cell from hanging forever on a stalled connection.
with urlopen(station_id_rest_query, timeout=120) as page:
    html_bytes = page.read()
# Despite the name, `html` holds the decoded RDB text of the response.
html = html_bytes.decode("utf-8")

#### this data can be processed

In [19]:
# The response header contains a line such as
# "# Data for the following 109 site(s) are contained in this file".
# It is located by its text rather than by a fixed line index, so a change in
# the header length raises a clear error instead of parsing the wrong line.
header_lines = html.split("\n")
skip_rows = next(
    (idx for idx, text in enumerate(header_lines) if "Data for the following" in text),
    None,
)
if skip_rows is None:
    raise ValueError("Could not find the site-count line in the USGS response header")
# skip_rows stays the index of the site-count line: the site listing is read
# from the lines that directly follow it.
line = header_lines[skip_rows]
print(line)
# The number of sites sits between "following" and "site(s)".
n = int(line.split("following", 1)[1].split("site", 1)[0].strip())
print(n)

# Data for the following 109 site(s) are contained in this file
109


#### and parsed

In [20]:
# The n lines directly after the site-count line each describe one site.
# From the "USGS" marker onwards a line is split on single spaces: the first
# two tokens are the provider and the site number, the rest is the site name.
site_lines = html.split("\n")[skip_rows + 1 : skip_rows + 1 + n]
site_records = []
for site_line in site_lines:
    tokens = site_line[site_line.find("USGS"):].split(" ")
    site_records.append([*tokens[:2], " ".join(tokens[2:])])
df_sites = pd.DataFrame(site_records, columns=["provider", "site_no", "name"])
# NOTE(review): casting to int drops any leading zeros of the site number —
# confirm that downstream users expect integer ids.
df_sites["site_no"] = df_sites["site_no"].astype(int)

#### yielding this df:

In [21]:
# Display the parsed site table (columns: provider, site_no, name).
df_sites

Unnamed: 0,provider,site_no,name
0,USGS,131199050,FISHING BROOK (COUNTY LINE FLOW OUTLET) NR NEW...
1,USGS,1311992,ARBUTUS POND OUTLET NEAR NEWCOMB NY
2,USGS,1312000,HUDSON RIVER NEAR NEWCOMB NY
3,USGS,1313500,CEDAR R BELOW CHAIN LAKES NR INDIAN LAKE NY
4,USGS,1314000,"HUDSON R AT GOOLEY, NEAR INDIAN LAKE NY"
...,...,...,...
104,USGS,1356190,LISHA KILL NORTHWEST OF NISKAYUNA NY
105,USGS,1357499,MOHAWK RIVER DIVERSION AT CRESCENT DAM NY
106,USGS,1357500,MOHAWK RIVER AT COHOES NY
107,USGS,1358000,HUDSON RIVER AT GREEN ISLAND NY


#####  The ids are then stored for further analysis

In [22]:
# Store the site numbers, one per line, in station_ids.txt inside the data folder.
# fmt="%d" writes them as plain integers; np.savetxt's default format would
# store each id as a float in exponent notation. os.path.join keeps the path
# independent of the OS path separator.
np.savetxt(os.path.join(data_folder, 'station_ids.txt'), df_sites.site_no.astype(int).values, fmt="%d")