In [None]:
import os
import urllib.request
from pathlib import Path

import geopandas as gpd
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy import signal

In [None]:
# Folder layout, all relative to the directory the notebook is started from.
project = Path.cwd()
output = project.joinpath("output")
output_data = output.joinpath("data")
# NOTE: `input` shadows the builtin of the same name for the rest of the notebook.
input = project.joinpath("input")
figures = project.joinpath("figures")

# functions

In [None]:
def apply_butterworth(discharge, buff=20, dts=25, N=5):
    """low-pass the discharge record with a forward-backward Butterworth filter

    The cutoff period is one lunar day (24.8412 h) plus ``buff`` hours, so
    variability at tidal periods is removed while slower changes are kept.

    input:
    discharge = discharge dataframe (one column; each column is filtered
                on its own, but the result is renamed to a single name)
    buff      = hours added to the lunar day to obtain the cutoff period
    dts       = sampling interval in minutes
    N         = filter order

    returns:
    filtered dataframe with the same index and the single column
    'discharge_cms_Butterworth_filtered'

    """
    # cutoff: lunar day plus buffer, hours -> seconds -> Hz
    cutoff_period_s = (24.8412 + buff)*60*60
    crit_freq = 1/cutoff_period_s

    # sampling: minutes -> seconds -> Hz
    sample_period_s = dts*60
    fs = 1/sample_period_s

    numer, denom = signal.butter(N, crit_freq, btype='lowpass', fs=fs)

    def _smooth(column):
        # filtfilt runs the filter in both directions
        return signal.filtfilt(numer, denom, column)

    filtered = discharge.apply(_smooth)
    filtered.columns = ['discharge_cms_Butterworth_filtered']

    return filtered

def apply_godin(discharge):
    """apply Godin filter to remove tidal influence from data

    The Godin filter is three successive running means over hourly data:
    24 h, 24 h and 25 h. The record is first averaged to hourly values and
    gaps are filled by time-weighted linear interpolation.

    input:
    discharge = discharge dataframe (one column, DatetimeIndex)

    returns:
    filtered hourly dataframe with the single column
    'discharge_cms_Godin_filtered'; the first and last 35 hours are NaN
    because the combined 71-hour window is incomplete there

    """
    # hourly means; '1h' is accepted by old and new pandas alike
    hourly = discharge.resample('1h').mean().interpolate(method='time')

    # Godin filter (USGS standard): 24 h -> 24 h -> 25 h running means.
    # A centered even-length window in pandas spans i-12 .. i+11, i.e. it
    # sits half a sample early. Shifting the second 24-h pass by one
    # sample makes it span i-11 .. i+12, so the two half-sample offsets
    # cancel and the combined weights are symmetric about the label.
    first_24h = hourly.rolling(window=24, center=True).mean()
    second_24h = first_24h.rolling(window=24, center=True).mean().shift(-1)
    godin = second_24h.rolling(window=25, center=True).mean()
    godin.columns = ['discharge_cms_Godin_filtered']

    return godin

def download_nwis_data(
    site_name, site_no, begin_date, end_date, data_code=60, skiprows=28
):
    """download data from https://nwis.waterdata.usgs.gov and outputs as dataframe

    The raw response is saved as
    ``output_data/<site_name>_<site_no>_<begin_date>_<end_date>.txt`` and the
    parsed series as the tab-separated
    ``output_data/<site_name>_<begin_date>.csv``.

    inputs:
    site_name = user specified name for site
    site_no = USGS site number code
    begin_date = first day in timeseries (YYYY-MM-dd)
    end_date = last day in timeseries (YYYY-MM-dd)
    data_code = USGS parameter code, zero-padded to five digits in the
                request (default=60, i.e. 00060)
    skiprows = number of header rows to skip (default=28)

    return = pandas DataFrame indexed by timezone-naive UTC timestamps, with
             one value column that is always named "discharge" whatever
             data_code was requested; None if the downloaded text could not
             be parsed or its timestamps could not be converted (the raw
             text file is deleted in that case)
    """

    # urlretrieve does not create missing folders
    output_data.mkdir(parents=True, exist_ok=True)

    # output file and request
    out_fn = output_data / f"{site_name}_{site_no}_{begin_date}_{end_date}.txt"
    request = f"https://nwis.waterdata.usgs.gov/usa/nwis/uv/?cb_{data_code:05d}=on&format=rdb&site_no={site_no}&period=&begin_date={begin_date}&end_date={end_date}"

    # get data (network errors propagate to the caller)
    txt, _ = urllib.request.urlretrieve(request, out_fn)

    # Pandas
    # Columns 2, 3 and 5 of the whitespace-separated table are read as
    # date, time and value; column 4 (presumably the time-zone code --
    # TODO confirm) is skipped.
    try:
        data = pd.read_csv(
            txt,
            sep=r"\s+",
            skiprows=skiprows,
            usecols=[2, 3, 5],
            parse_dates={"datetime_CST": [0, 1]},
            header=0,
            index_col=0,
            names=["date", "time", "discharge"],
        )
    except Exception:
        # best effort: report and skip the site, but let KeyboardInterrupt /
        # SystemExit through
        print("Problem with parsing text ")
        os.remove(txt)
        return None

    # Timestamps are treated as US Central local time and converted to naive
    # UTC. NOTE(review): ambiguous=True appears to resolve the repeated
    # autumn hour as daylight-saving time -- confirm against the skipped
    # time-zone column.
    try:
        data.index = (
            data.index.tz_localize("America/Chicago", ambiguous=True)
            .tz_convert("UTC")
            .tz_localize(None)
        )
    except AttributeError:
        # raised when the index was not parsed into datetimes
        print("Problem converting datetime to UTC. Check data")
        os.remove(txt)
        return None

    data.to_csv(
        output_data / f"{site_name}_{begin_date}.csv",
        sep="\t",
        header=["val"],
        index_label=["datetime_UTC"],
    )
    return data

In [None]:
# Load the table of candidate stations from the project's input folder.
site_list_fn = input.joinpath("site_list_MS_Sound_area.csv")
site_list = pd.read_csv(site_list_fn)

In [None]:
# Period to download, as YYYY-MM-dd strings (first day, last day).
begin_date, end_date = "2018-01-01", "2019-01-01"

In [None]:
# Download the requested parameter for every listed site and keep a table of
# the sites for which usable data came back.
data_code = 480  # USGS parameter code 00480 (salinity)

# Rows are collected in a list and turned into a DataFrame once at the end:
# DataFrame.append was removed in pandas 2.0 and copied the frame on each call.
station_records = []

for row in site_list.itertuples():
    # itertuples() gives positional names (_2, _5, ...) to columns whose
    # headers are not valid Python identifiers; _k is the k-th CSV column
    # (1-based, because the index occupies position 0).
    url = row._7
    site_no = url.split("=")[-1]  # site number is the last "=" value in the URL
    site_name = row._2
    lat = row._6
    lon = row._5
    data = download_nwis_data(
        site_name, site_no, begin_date, end_date, data_code=data_code
    )
    # download_nwis_data returns None when the site has no usable data
    if data is not None:
        station_records.append(
            {"site_no": site_no, "site_name": site_name, "lat": lat, "lon": lon}
        )

df = pd.DataFrame(station_records, columns=["site_no", "site_name", "lat", "lon"])

In [None]:
# Station table as EPSG:4326 point features (x = lon, y = lat), saved as a
# shapefile.
gdf = gpd.GeoDataFrame(
    df,
    crs="EPSG:4326",
    geometry=gpd.points_from_xy(df["lon"], df["lat"]),
)
gdf.to_file(output.joinpath("salinity_stations_2018.shp"))

# Same stations as a plain tab-separated lon / lat / site_no list, written
# without header or index.
df.loc[:, ["lon", "lat", "site_no"]].to_csv(
    output.joinpath("salinity_stations_2018.xyn"),
    sep="\t",
    index=False,
    header=False,
)