In [None]:
import pandas as pd
import numpy as np
import h5py

# Notebook to split a hydro HDF5 (.h5) file by time window

In [None]:
# Hydro HDF5 file to read (relative path).
hfile='../tests/data/historical_v82.h5'

In [None]:
# Open the source file read-only; the cells below reuse this handle.
hf = h5py.File(hfile, mode='r')

## Start with the root

In [None]:
# Names of the objects stored directly under the file root.
list(hf)

In [None]:
## Show next level down

In [None]:
# Names of the objects stored directly under the 'hydro' group.
list(hf['hydro'])

In [None]:
def get_children(hf,path):
    '''
    Return the names found when iterating over ``hf[path]``, as a list.
    '''
    node = hf[path]
    return list(node)

## Children of data 
This contains the time indexed data tables.

In [None]:
# Names of the tables under /hydro/data; reused by later cells.
data_tables = get_children(hf, '/hydro/data')
print(data_tables)

## Children of geometry
These are the geometry tables for hydro. They can typically be represented as 2D tables; they hold hydro's internal information and should be used to interpret the tables under the /hydro/data path.

In [None]:
# Names of the tables under /hydro/geometry; reused by later cells.
geometry_tables = get_children(hf, '/hydro/geometry')
print(geometry_tables)

## Children of input
These are the hydro input tables. They show how hydro views the input it was provided.

In [None]:
# Names of the tables under /hydro/input; reused by later cells.
input_tables = get_children(hf, '/hydro/input')
print(input_tables)

# Slicing for a time window and saving to a new hydro file

* The tables under /hydro/data should be sliced for this time window. This is straightforward: compute the row indices corresponding to the time window, slice each table with them, and write the slice to the new file.
* The tables under /hydro/geometry are not affected by the time window slicing, so they are simply copied over to the new file.
* The tables under /hydro/input hold the model start time and end time. One could argue that these should be adjusted to the time window as well. However, QUAL and PTM (the programs that use this information) usually do not look at the start/end time information from the tidefile, so changing it is optional.


In [None]:
# Handle to the first table listed under /hydro/data.
t = hf['/hydro/data/%s' % data_tables[0]]

In [None]:
import pandas as pd

In [None]:
def _attr_text(value):
    '''
    Return a string attribute value as ``str``.

    Accepts the one-element array/list of byte strings layout (the first
    element is used) as well as a scalar ``bytes`` or ``str`` value.
    '''
    if isinstance(value,(np.ndarray,list,tuple)):
        value=np.asarray(value).ravel()[0]
    if isinstance(value,bytes):
        return value.decode('utf-8')
    return str(value)
def get_start_time(tbl):
    '''
    Return the ``start_time`` attribute of ``tbl`` as text.
    '''
    return _attr_text(tbl.attrs['start_time'])
def get_time_interval(tbl):
    '''
    Return the ``interval`` attribute of ``tbl`` as text.
    '''
    return _attr_text(tbl.attrs['interval'])
def get_slice_indices(tbl,stime,etime):
    '''
    Return ``(first, last)``: the row positions of the first and the last
    time step of ``tbl`` that fall inside ``stime``..``etime``.

    The time axis is rebuilt from the table's ``start_time`` and ``interval``
    attributes, with one step per row along the first dimension. Both
    positions are inclusive (pandas label slicing includes the end label).

    Raises IndexError when no time step falls inside the window.
    '''
    nsteps=tbl.shape[0]
    dindex=pd.date_range(start=get_start_time(tbl),freq=get_time_interval(tbl),periods=nsteps)
    window=pd.Series(np.arange(nsteps),index=dindex).loc[stime:etime]
    if window.empty:
        raise IndexError('no time steps between %s and %s (table covers %s to %s)'
                         %(stime,etime,dindex[0] if nsteps else None,dindex[-1] if nsteps else None))
    return int(window.iloc[0]),int(window.iloc[-1])
def slice_table(tbl,stime,etime):
    '''
    Return the rows of ``tbl`` whose time step lies inside ``stime``..``etime``.
    '''
    bi,ei=get_slice_indices(tbl,stime,etime)
    # ei is the position of the last row inside the window, so the exclusive
    # upper bound of the slice is ei+1; slice(bi,ei) would drop that row.
    return tbl[bi:ei+1]

In [None]:
# Output path: the text of hfile before '.h5', with '_sliced.h5' appended.
stem = hfile.split('.h5')[0]
outfile = f'{stem}_sliced.h5'
print(outfile)

In [None]:
def copy_attrs_table(ntbl,tbl):
    '''
    Copy every entry of ``tbl.attrs`` into ``ntbl.attrs``.

    Existing entries of ``ntbl.attrs`` with the same name are overwritten;
    other entries are left untouched.
    '''
    for key, value in tbl.attrs.items():
        ntbl.attrs[key] = value

In [None]:
def mins_since_origin(dstr,origin_date='1899-12-31'):
    '''
    Return the number of minutes elapsed from ``origin_date`` to ``dstr``.

    Both arguments are parsed with ``pd.to_datetime``. The default origin
    date is the HEC convention.
    '''
    target = pd.to_datetime(dstr)
    origin = pd.to_datetime(origin_date)
    elapsed_seconds = (target - origin).total_seconds()
    return elapsed_seconds/60.

# Example: minutes from the origin to 1990-01-02.
x = mins_since_origin('1990-01-02', '1899-12-31')
print(x)

In [None]:
# Time window to extract from the tables under /hydro/data.
stime='1990-01-10'
etime='1990-01-15'
with h5py.File(outfile, "w") as nhf:
    # Time-indexed tables: write only the rows selected by slice_table.
    for tname in data_tables:
        tpath='/hydro/data/%s'%tname
        tbl=hf[tpath]
        nhf[tpath]=slice_table(tbl,stime,etime)
        ntbl=nhf[tpath]
        copy_attrs_table(ntbl,tbl)
        # start_time must describe the first row actually kept, which differs
        # from stime when stime does not fall exactly on a time step.
        first_row,_=get_slice_indices(tbl,stime,etime)
        first_stamp=pd.date_range(start=get_start_time(tbl),
                                  freq=get_time_interval(tbl),
                                  periods=int(first_row)+1)[-1]
        # Store it as a one-element array of byte strings: get_start_time reads
        # attrs['start_time'][0].decode(...), which fails on a scalar bytes value.
        ntbl.attrs['start_time']=np.array([str(first_stamp).encode('utf-8')])
    # Geometry and input tables do not depend on the time window: copy the
    # full contents and the attributes unchanged.
    for group,tables in (('geometry',geometry_tables),('input',input_tables)):
        for tname in tables:
            tpath='/hydro/%s/%s'%(group,tname)
            tbl=hf[tpath]
            nhf[tpath]=tbl[:]
            copy_attrs_table(nhf[tpath],tbl)

In [None]:
# Inspect the 'Start time' attribute stored on the 'hydro' group.
hf['hydro'].attrs['Start time']

In [None]:
# Inspect the 'Start time string' attribute stored on the 'hydro' group.
hf['hydro'].attrs['Start time string']

In [None]:
# Render a date as upper-case 'DDMONYYYY HHMM' text.
# NOTE(review): presumably the layout of the 'Start time string' attribute - confirm.
pd.to_datetime('10JAN1990').strftime('%d%b%Y %H%M').upper()