## Create GIS Layers

In [3]:
import s3fs
import pandas
import geopandas
from pathlib import Path

import utils.S3hsclient as hsclient

In [4]:
# Load the metadata CSV into a pandas DataFrame.
df = pandas.read_csv("data/01_metadata.csv")

In [6]:
# Turn the metadata table into a GeoDataFrame: one point per row, built from
# the longitude (x) and latitude (y) columns and tagged as WGS 84.
gdf = geopandas.GeoDataFrame(
    df,
    geometry=geopandas.points_from_xy(
        x=df['longitude_wgs84'],
        y=df['latitude_wgs84'],
    ),
    crs='EPSG:4326',
)

# Persist the result as a parquet file, without the index.
gdf.to_parquet(path='data_parquet/metadata.parquet', index=False)

In [35]:
# Shorten the coordinate and drainage-area column names before the shapefile
# export (presumably to stay within the 10-character shapefile field-name
# limit -- TODO confirm).
gdf = gdf.rename(
    columns={
        "latitude_wgs84": "latitude",
        "longitude_wgs84": "longitude",
        "drainagearea_sqkm": "drain_sqkm",
    }
)

In [36]:
# Write the renamed GeoDataFrame out as a shapefile.
gdf.to_file("gis/gauges.shp")

## Convert Shapefile Points into Parquet


In [45]:
# Re-load the gauge points from the shapefile on disk.
gdf = geopandas.read_file('gis/gauges.shp')

In [46]:
# Store the shapefile contents as a parquet file.
gdf.to_parquet('data_parquet/gauges.parquet')

In [60]:
# read the data from HydroShare

## Create Parquet Files

In [49]:
def create_parquet(input_data_dir, output_filename, date_cols=None, sort_by=None):
    """Combine every CSV in a directory into a single Parquet file.

    Each ``*.csv`` directly inside ``input_data_dir`` is read with pandas and
    tagged with a ``gauge`` column holding the file name without its
    extension. The frames are then concatenated, optionally sorted, and
    written to ``output_filename``.

    Parameters
    ----------
    input_data_dir : str or path-like
        Directory searched (non-recursively) for ``*.csv`` files.
    output_filename : str or path-like
        Destination handed to ``DataFrame.to_parquet``; the index is not
        written.
    date_cols : list of str, optional
        Passed to ``pandas.read_csv(parse_dates=...)``. ``None`` (the default)
        means no columns are parsed as dates.
    sort_by : str or list of str, optional
        Column(s) to sort the combined frame by before writing. ``None``
        keeps the concatenation order.

    Raises
    ------
    ValueError
        If ``input_data_dir`` contains no ``*.csv`` files.
    """
    # Use a fresh list per call rather than a shared mutable default argument.
    parse_dates = [] if date_cols is None else date_cols

    # glob() yields files in a filesystem-dependent order; sorting makes the
    # row order of the output reproducible when no sort_by is given.
    csv_paths = sorted(Path(input_data_dir).glob('*.csv'))
    if not csv_paths:
        # Same exception type pandas.concat would raise on an empty list,
        # but with a message that names the actual problem.
        raise ValueError(f"No CSV files found in '{input_data_dir}'")

    frames = []
    for csv_path in csv_paths:
        frame = pandas.read_csv(csv_path, parse_dates=parse_dates)
        # Record which file (gauge) each row came from.
        frame['gauge'] = csv_path.stem
        frames.append(frame)

    combined = pandas.concat(frames)

    if sort_by is not None:
        combined = combined.sort_values(by=sort_by)

    combined.to_parquet(output_filename, index=False)
    

In [39]:
# Land use / land cover: parse the 'year' column as a date.
create_parquet(
    'data/07_lulc',
    'data_parquet/lulc.parquet',
    date_cols=['year'],
)

# Dynamic anthropogenic data: parse the 'year' column as a date.
create_parquet(
    'data/08_dynamic_anthropogenic',
    'data_parquet/dynamic_antropogenic.parquet',
    date_cols=['year'],
)

# Water-quality time series: parse 'DateTime' and sort the output by it.
create_parquet(
    'data/02_waterquality_timeseries',
    'data_parquet/water_quality.parquet',
    date_cols=['DateTime'],
    sort_by='DateTime',
)

  df_temp = pandas.read_csv(f,
  df_temp = pandas.read_csv(f,
  df_temp = pandas.read_csv(f,
  df_temp = pandas.read_csv(f,
  df_temp = pandas.read_csv(f,


In [43]:
# Historical meteorology: parse 'time' as a date and sort the output by it.
create_parquet(
    'data/05_dynamic_historical_meteorology',
    'data_parquet/dynamic_historical_meteorology.parquet',
    date_cols=['time'],
    sort_by='time',
)

In [50]:
# Streamflow discharge: parse 'DateTime' and sort the output by it.
create_parquet(
    'data/09_streamflow_discharge',
    'data_parquet/streamflow.parquet',
    date_cols=['DateTime'],
    sort_by='DateTime',
)

In [51]:
# Grab samples: parse 'DateTime' and sort the output by it.
create_parquet(
    'data/10_grab_samples',
    'data_parquet/grab_samples.parquet',
    date_cols=['DateTime'],
    sort_by='DateTime',
)

## Query Parquet Data

In [2]:
import pyarrow.parquet as pq
from datetime import datetime

In [10]:
# Parquet file to query.
# NOTE(review): the conversion cell writes 'data_parquet/water_quality.parquet',
# while this path has no directory component -- confirm where the file is
# expected to live relative to the working directory.
file_path = 'water_quality.parquet'


In [23]:
# Gauge whose rows should be extracted from the parquet file.
gauge = 'STREAM-gauge-1665'

# Time window of interest. NOTE: st/et are defined but not applied -- the
# DateTime predicates below are commented out, so the read is filtered by
# gauge only.
st = datetime(year=2018, month=10, day=1)
et = datetime(year=2020, month=3, day=1)

# Let pyarrow apply the filter while reading the file.
table = pq.read_table(
    file_path,
    filters=[
        ('gauge', '=', gauge),
        # ('DateTime', '>=', st),
        # ('DateTime', '<=', et),
    ],
)

In [25]:
# Convert the filtered pyarrow table to a pandas DataFrame ...
subset = table.to_pandas()
# ... and export it as CSV, without the index column.
subset.to_csv('water_quality_subset.csv', index=False)

## Misc

In [2]:
# Create the HydroShare S3 client; per the recorded output this prompts
# interactively for a username and password.
hs = hsclient.S3HydroShare()

Username:  TonyCastronova
Password for TonyCastronova:  ········


In [3]:
# Identifier of the HydroShare resource that holds the parquet files.
resource_id = '248ec0f13d6c4580b2faa66425cb58c3'
# S3 key of the gauges parquet file inside that resource's contents folder.
s3_path = f'tonycastronova/{resource_id}/data/contents/gauges.parquet'

In [4]:
# Get a handle on the resource through the HydroShare client.
resource = hs.resource(resource_id)

In [5]:
# List the S3 objects stored under the resource.
resource.s3_ls()

['tonycastronova/248ec0f13d6c4580b2faa66425cb58c3/data/contents/dynamic_antropogenic.parquet',
 'tonycastronova/248ec0f13d6c4580b2faa66425cb58c3/data/contents/gauges.parquet',
 'tonycastronova/248ec0f13d6c4580b2faa66425cb58c3/data/contents/gis',
 'tonycastronova/248ec0f13d6c4580b2faa66425cb58c3/data/contents/grab_samples.parquet',
 'tonycastronova/248ec0f13d6c4580b2faa66425cb58c3/data/contents/lulc.parquet',
 'tonycastronova/248ec0f13d6c4580b2faa66425cb58c3/data/contents/streamflow.parquet',
 'tonycastronova/248ec0f13d6c4580b2faa66425cb58c3/data/contents/water_quality.parquet']

In [30]:
# Read the gauges GeoParquet file straight from S3, using the filesystem
# object supplied by the HydroShare client.
df = geopandas.read_parquet(
    path=s3_path,
    filesystem=hs.get_s3_filesystem(),
)

In [31]:
# Display the GeoDataFrame that was read from S3.
df

Unnamed: 0,STREAM_ID,SourceID,site_name,source,latitude,longitude,state_code,State,drain_sqkm,geometry
0,STREAM-gauge-1616,1427195,"Equinunk Creek near Dillontown, PA",USGS,41.840278,-75.238333,PA,Pennsylvania,121.470531,POINT (-75.23833 41.84028)
1,STREAM-gauge-1617,1427207,DELAWARE RIVER AT LORDVILLE NY,USGS,41.867278,-75.21375,PA,Pennsylvania,4118.0841,POINT (-75.21375 41.86728)
2,STREAM-gauge-1618,1427510,DELAWARE RIVER AT CALLICOON NY,USGS,41.75675,-75.057417,PA,Pennsylvania,4713.7818,POINT (-75.05742 41.75675)
3,STREAM-gauge-1619,1428750,"West Branch Lackawaxen River near Aldenville, PA",USGS,41.67453,-75.376013,PA,Pennsylvania,105.153594,POINT (-75.37601 41.67453)
4,STREAM-gauge-1620,1429000,"West Branch Lackawaxen River at Prompton, PA",USGS,41.587218,-75.326829,PA,Pennsylvania,154.622403,POINT (-75.32683 41.58722)
5,STREAM-gauge-1621,1431500,"Lackawaxen River at Hawley, PA",USGS,41.4762,-75.172119,PA,Pennsylvania,751.0971,POINT (-75.17212 41.4762)
6,STREAM-gauge-1622,1432055,"Lackawaxen River near Baoba, PA",USGS,41.469417,-75.108944,PA,Pennsylvania,1403.77458,POINT (-75.10894 41.46942)
7,STREAM-gauge-1623,1432110,"Lackawaxen River at Rowland, PA",USGS,41.475923,-75.036281,PA,Pennsylvania,1525.50411,POINT (-75.03628 41.47592)
8,STREAM-gauge-1624,1432160,DELAWARE RIVER AT BARRYVILLE NY,USGS,41.474833,-74.91325,PA,Pennsylvania,6886.78341,POINT (-74.91325 41.47483)
9,STREAM-gauge-1625,1432495,"Shohola Creek near Walker Lake, PA",USGS,41.45,-74.921667,PA,Pennsylvania,192.954255,POINT (-74.92167 41.45)
