In [1]:
import numpy as np
import pandas as pd
import geopandas as gpd
# webscraping
import json
import urllib
from bs4 import BeautifulSoup
from urllib.request import urlopen
import os

In [2]:
# Project layout: the notebook lives in a sub-folder of the project root,
# with the `GIS` and `Data` folders next to it.
path = os.getcwd()
home_path = os.path.dirname(path)
# os.path.join picks the separator of the running OS, so the folders
# resolve on Windows as well as on Linux/macOS.
gis_folder = os.path.join(home_path, 'GIS')
data_folder = os.path.join(home_path, 'Data')

In [3]:
# Request prefix for the site lookup; a site number is appended directly
# after `sites=`. The query asks for the tab-separated `rdb` format.
website_us = (
    "https://waterservices.usgs.gov/nwis/site/"
    "?format=rdb&sites="
)

In [4]:
# Query-string suffix appended after the site number: request the expanded
# site output and do not filter on site status.
end = (
    "&siteOutput=expanded"
    "&siteStatus=all"
)

In [5]:
# Station numbers to download, read as integers (any leading zero in the
# text file is therefore dropped and has to be re-added when building URLs).
# ndmin=1 keeps the result iterable even if the file lists a single station.
ids = np.loadtxt(os.path.join(data_folder, 'station_ids.txt'), dtype=int, ndmin=1)
ids

array([131199050,   1312000,   1315000,   1315170,   1315226,   1315227,
         1315500,   1318500,   1327500,   1327750,   1328770,   1329490,
         1329500,   1330000,   1330500,   1334500,   1335754,   1335755,
         1358000,   1359133,   1359135,   1359528,   1359750,   1360640,
         1361000,   1361200,   1362090,   1362192, 136219503,  13621955,
         1362200,   1362230,   1362297, 136230002,   1362322,   1362336,
         1362342,   1362357,   1362368,   1362370,   1362380,   1362465,
         1362487,   1362497,   1362500,   1363382,   1363556,   1364500,
         1372051,   1372500, 137449480,   1374505,   1374531,   1374559,
         1374581,   1374598, 137462010,   1374654,   1374701,   1374781,
         1374821,   1374890,   1374901,   1374918,   1374930,   1374941,
         1374976,   1374987, 137499350,   1375000])

In [6]:
# Build (or reload) the GeoDataFrame with the metadata of every station in `ids`.
# The result is cached on disk: when the cache file exists it is read back,
# otherwise one request per station is sent and the cache is written.
output_path_gdf = os.path.join(gis_folder, "discharge_stations.gpk")
if os.path.exists(output_path_gdf):
    gdf_station = gpd.read_file(output_path_gdf, driver="GPKG", crs="EPSG:4326")
else:
    df_lst = []
    for i in ids:
        # `ids` holds integers, so the leading zero of the site number is re-added.
        site = website_us + f'0{i}' + end
        # The context manager closes the HTTP response even if reading fails.
        with urlopen(site) as page:
            html = page.read().decode("utf-8")
        # Skip the '#' comment preamble by content rather than by a fixed
        # character offset, which breaks whenever the preamble changes length.
        table_lines = [line for line in html.split("\n") if not line.startswith("#")]
        if len(table_lines) < 3:
            raise ValueError(f"Unexpected response for site 0{i}: no data row found")
        header = table_lines[0].split('\t')
        # table_lines[1] is skipped: in USGS RDB output it holds the column
        # format definitions, not station data.
        data = table_lines[2].split('\t')
        df_lst.append(pd.DataFrame(data=[data], columns=header))
    # One row per station with a clean 0..n-1 index.
    df_stations = pd.concat(df_lst, ignore_index=True)

    # All fields were parsed as text; convert the decimal coordinates
    # explicitly before building the point geometries.
    gdf_station = gpd.GeoDataFrame(
        data=df_stations,
        geometry=gpd.points_from_xy(
            df_stations.dec_long_va.astype(float),
            df_stations.dec_lat_va.astype(float),
            crs="EPSG:4326",
        ),
    )

    gdf_station.to_file(output_path_gdf, driver="GPKG")

In [7]:
# Display the first station record.
gdf_station.head(1)

Unnamed: 0,agency_cd,site_no,station_nm,site_tp_cd,lat_va,long_va,dec_lat_va,dec_long_va,coord_meth_cd,coord_acy_cd,...,reliability_cd,gw_file_cd,nat_aqfr_cd,aqfr_cd,aqfr_type_cd,well_depth_va,hole_depth_va,depth_src_cd,project_no,geometry
0,USGS,131199050,FISHING BROOK (COUNTY LINE FLOW OUTLET) NR NEW...,ST,435838.6,741613.5,43.97738889,-74.2704167,N,1,...,,Y,,,,,,,GAZETTEER,POINT (-74.27042 43.97739)


# link to loading discharge data:

Import the IDs of the stations that have discharge data and keep only those stations.

In [8]:
# Station numbers for which discharge data is available, as 64-bit integers.
# ndmin=1 keeps the result a 1-D array even if the file has a single entry.
ids_with_Q = np.loadtxt(os.path.join(data_folder, 'station_ids_with_Q.txt'), ndmin=1).astype(np.int64)

In [11]:
# Keep only the stations whose site number appears in `ids_with_Q`.
# The site numbers are compared numerically: converting the whole value to
# an integer ignores a leading zero without blindly dropping the first
# character, so IDs stored without the zero are matched correctly as well.
gdf_station_with_Q_out = gdf_station[
    gdf_station.site_no.astype(np.int64).isin(ids_with_Q)
]

In [12]:
# Write the filtered stations to the GIS folder using the GeoPackage driver.
# os.path.join keeps the path valid regardless of the operating system.
gdf_station_with_Q_out.to_file(os.path.join(gis_folder, "discharge_stations_with_Q.gpk"), driver="GPKG")

In [13]:
# Display the stations that were kept by the discharge filter.
gdf_station_with_Q_out

Unnamed: 0,agency_cd,site_no,station_nm,site_tp_cd,lat_va,long_va,dec_lat_va,dec_long_va,coord_meth_cd,coord_acy_cd,...,reliability_cd,gw_file_cd,nat_aqfr_cd,aqfr_cd,aqfr_type_cd,well_depth_va,hole_depth_va,depth_src_cd,project_no,geometry
0,USGS,0131199050,FISHING BROOK (COUNTY LINE FLOW OUTLET) NR NEW...,ST,435838.6,0741613.5,43.97738889,-74.2704167,N,1,...,,Y,,,,,,,GAZETTEER,POINT (-74.27042 43.97739)
1,USGS,01312000,HUDSON RIVER NEAR NEWCOMB NY,ST,435757.7,0740753.0,43.96602778,-74.13138889,N,1,...,,NNNNNNNN,,,,,,,,POINT (-74.13139 43.96603)
2,USGS,01315000,INDIAN RIVER NEAR INDIAN LAKE NY,ST,434523.4,0741601.4,43.7565,-74.2670556,N,1,...,,NNNNNNNN,,,,,,,,POINT (-74.26706 43.75650)
3,USGS,01315170,DURGIN BROOK AT BOREAS RIVER NY,ST,435623.4,0735713.0,43.9398333,-73.9536111,G,1,...,,,,,,,,,WECASS,POINT (-73.95361 43.93983)
4,USGS,01315226,VANDERWHACKER BROOK TRIB 1 NEAR BOREAS RIVER NY,ST,435654.6,0735918.1,43.9485,-73.9883611,G,1,...,,,,,,,,,WECASS,POINT (-73.98836 43.94850)
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
64,USGS,01374930,MUSCOOT RIVER AT BALDWIN PLACE NY,ST,412017.3,0734607.3,41.33813889,-73.7686944,M,1,...,,NNNNNNNN,,,,,,,,POINT (-73.76869 41.33814)
65,USGS,01374941,MUSCOOT RIVER BELOW DAM AT AMAWALK NY,ST,411715.4,0734513.1,41.2876111,-73.75363889,M,1,...,,NNNNNNNN,,,,,,,,POINT (-73.75364 41.28761)
66,USGS,01374976,ANGLE FLY BROOK AT WHITEHALL CORNERS NY,ST,411657.1,0734331.6,41.28252778,-73.7254444,M,1,...,,NNNNNNNN,,,,,,,,POINT (-73.72544 41.28253)
67,USGS,01374987,KISCO RIVER BELOW MOUNT KISCO NY,ST,411343.5,0734437.0,41.22875,-73.7436111,M,1,...,,NNNNNNNN,,,,,,,,POINT (-73.74361 41.22875)
