In [None]:
import os
import shutil
import fsspec
import ujson
from kerchunk.hdf import SingleHdf5ToZarr
from kerchunk.combine import MultiZarrToZarr
import xarray as xr
import dask
import hvplot.xarray
from datetime import datetime, timedelta

In [None]:
import sys
import subprocess

# Detect whether the notebook is running on Google Colab and make the
# data_access_examples package importable in either environment.
try:
    # A successful import of google.colab is used as the "running on Colab" signal.
    import google.colab

    ENV_IS_CL = True
    # Fetch the helper repository into the current working directory.
    subprocess.run(
        [
            "git",
            "clone",
            "https://github.com/AlabamaWaterInstitute/data_access_examples",
        ]
    )
    sys.path.append("/content/data_access_examples")
except ImportError:
    # Only a missing google.colab module means "not on Colab"; any other
    # failure (e.g. git not being installed, or a user interrupt) now
    # surfaces instead of being silently mistaken for a local environment.
    ENV_IS_CL = False
    # Candidate locations of a local checkout, relative to the working directory.
    sys.path.append(r"..")
    sys.path.append(r"../data_access_examples")
    sys.path.append(r"git")

print(sys.path[0])
import nwm_filenames.listofnwmfilenames as lnf
from nwm_network.NWM_2_1_outlets import outlets_sorted

In [None]:
def gen_json(files):
    """Build one combined kerchunk reference set for a collection of files.

    Each entry of ``files`` is opened through fsspec and translated to a
    single-file reference set with ``SingleHdf5ToZarr``. The per-file
    references are then merged with ``MultiZarrToZarr``, concatenating along
    the ``time`` and ``reference_time`` dimensions.

    Parameters
    ----------
    files
        Paths/URLs accepted by ``fsspec.open_files``.

    Returns
    -------
    The result of ``MultiZarrToZarr.translate()`` for the merged references.
    """
    single_refs = []
    for opened in fsspec.open_files(files):
        # Translate while the handle is open; the reference is recorded
        # against the file's own path.
        with opened as handle:
            refs = SingleHdf5ToZarr(handle, opened.path).translate()
        single_refs.append(refs)

    combined = MultiZarrToZarr(
        single_refs,
        remote_protocol="gcs",
        concat_dims=["time", "reference_time"],
    )
    return combined.translate()

In [None]:
import nwm_filenames.listofnwmfilenames as lnf

# Leading arguments for lnf.create_file_list, one tuple per forecast product.
# Labels follow the original notebook: short range, medium range ensemble
# members 1-7, and medium range no_da.
configs = (
    [(1, 1, 1, -1)]  # Short_range
    + [(2, 1, 1, member) for member in range(1, 8)]  # Medium range mem_1 .. mem_7
    + [(3, 1, 1, -1)]  # Medium range no_da
)

prefix = ""
start_date = "20221201"
end_date = "20221201"

# Complete each product tuple with the date window and the two trailing
# arguments ([0] and 5) that are passed on to create_file_list unchanged.
configuration_list = [
    (*product, start_date, end_date, [0], 5) for product in configs
]
print(configuration_list)

# One generated file list per configuration, in the same order as `configs`.
file_collections = [lnf.create_file_list(*args) for args in configuration_list]

In [None]:
# Display the file list generated for the first entry of configuration_list.
file_collections[0]

### Generate plot data for a single stream segment

In [None]:
from time import time

In [None]:
%%time
# End-to-end timing run: for each file collection, build the kerchunk
# references, open them as an xarray dataset, select the requested
# feature(s), and extract streamflow into a DataFrame.
#
# Feature selection: exactly one `id_list` assignment should be active.
# id_list = 101
id_list = [22811611]  # Mississippi River outlet
# id_list = [22811611, 20427622]  # Mississippi River outlet
# id_list = 20427622  # Random small interior outlet somewhere in Arizona; see https://github.com/AlabamaWaterInstitute/data_access_examples/blob/main/nwm_network/route_link_fsspec.ipynb
# id_list = outlets_sorted
# Per-collection results, appended in loop order:
ds_list = []  # feature-sliced datasets
df_list = []  # streamflow DataFrames
tot_list = []  # combined references returned by gen_json
# The [0:8] slice limits the run to at most the first eight collections.
for _i, files in enumerate(file_collections[0:8]):
    # `st` is set once per iteration, so every "elapsed" value printed below
    # is cumulative since the start of this iteration, not per step.
    st = time()
    print(f"generating jsons for {_i}", end="\t")
    tot_list.append(gen_json(files))
    print(f"{time()-st} elapsed")

    print(f"creating xarray dataset for {_i}", end="\t")
    # The references just generated are handed to fsspec's "reference://"
    # filesystem via "fo"; the referenced data is read from GCS anonymously.
    backend_args_1 = {
        "consolidated": False,
        "storage_options": {
            "fo": tot_list[_i],
            "remote_protocol": "gcs",
            "remote_options": {"anon": True},
        },
    }
    ds_1 = xr.open_dataset("reference://", engine="zarr", backend_kwargs=backend_args_1)
    print(f"{time()-st} elapsed")

    print(f"slicing dataset to feature for {_i}", end="\t")
    # Label-based selection on the feature_id coordinate.
    ds_select_1 = ds_1.sel(feature_id=id_list)
    ds_list.append(ds_select_1)
    print(f"{time()-st} elapsed")

    print(f"querying/retrieving data and creating dataframe for {_i}", end="\t")
    # NOTE(review): presumably this is where the remote data is actually
    # fetched (the dataset appears to be lazily loaded until now) - confirm.
    df_select_1 = ds_select_1["streamflow"].to_dataframe()
    df_list.append(df_select_1)
    print(f"{time()-st} elapsed")

    # Optional plotting step, currently disabled:
    # print(f"selecting feature for {_i}", end="\t")
    # ds_select_1.plot.scatter("time","streamflow")
    # print(f"{time()-st} elapsed")

    print(f"finishing {_i}", end="\t")
    print(f"{time()-st} total time elapsed")
    print(f"\n")

In [None]:
%%time
# id_list = 101
id_list = [22811611]  # Mississippi River outlet
# id_list = [22811611, 20427622]  # Mississippi River outlet
# id_list = 20427622  # Random small interior outlet somewhere in Arizona; see https://github.com/AlabamaWaterInstitute/data_access_examples/blob/main/nwm_network/route_link_fsspec.ipynb
ds_list = []
df_list = []
tot_list = []
for _i, files in enumerate(file_collections[0:8]):
    st = time()
    print(f"generating jsons for {_i}", end="\t")
    tot_list.append(gen_json(files))
    print(f"{time()-st} elapsed")

In [None]:
%%time
id_list = outlets_sorted
id_list = [22811611]  # Mississippi River outlet

for _i, files in enumerate(file_collections[0:8]):
    st = time()
    print(f"creating xarray dataset for {_i}", end="\t")
    backend_args_1 = {
        "consolidated": False,
        "storage_options": {
            "fo": tot_list[_i],
            "remote_protocol": "gcs",
            "remote_options": {"anon": True},
        },
    }
    ds_1 = xr.open_dataset("reference://", engine="zarr", backend_kwargs=backend_args_1)
    print(f"{time()-st} elapsed")

    print(f"slicing dataset to feature for {_i}", end="\t")
    ds_select_1 = ds_1.sel(feature_id=id_list)
    ds_list.append(ds_select_1)
    print(f"{time()-st} elapsed")

    print(f"querying/retrieving data and creating dataframe for {_i}", end="\t")
    df_select_1 = ds_select_1["streamflow"].to_dataframe()
    df_list.append(df_select_1)
    print(f"{time()-st} elapsed")

    # print(f"selecting feature for {_i}", end="\t")
    # ds_select_1.plot.scatter("time","streamflow")
    # print(f"{time()-st} elapsed")

    print(f"finishing {_i}", end="\t")
    print(f"{time()-st} total time elapsed")
    print(f"\n")

In [None]:
# Cross-section of the second result DataFrame for one feature id, keeping
# the selected index level in the output (drop_level=False).
# NOTE(review): level=2 is positional; presumably it is the feature_id level
# of the MultiIndex produced by to_dataframe() - confirm against the index names.
df_list[1].xs(22811611, axis=0, level=2, drop_level=False)
# for more help, see https://stackoverflow.com/questions/53927460/select-rows-in-pandas-multiindex-dataframe