In [None]:
import pandas as pd
from datetime import datetime
import os.path
from dotenv import load_dotenv
import awswrangler as wr
import h3

In [None]:
# Load environment settings via python-dotenv, then read the local and S3 data
# locations from the environment (each is None when its variable is unset).
load_dotenv()
DATA_DIR = os.getenv("DATA_DIR")
S3_DATA_DIR = os.getenv("S3_DATA_DIR")

## Using Pandas to Read Parquet Files

In [None]:
# Build the file path with os.path.join so the separator matches the host OS.
# A hard-coded backslash is only a separator on Windows, and "\d" inside a
# normal string literal is an invalid escape sequence.
parquet_path = os.path.join(DATA_DIR, "df.parquet.gzip")
data_pd = pd.read_parquet(parquet_path)
print(data_pd.count())  # non-null count per column of the loaded DataFrame

## Reset Index to Datetime

In [None]:
# Half-open query window: start_time is included, end_time is excluded.
start_time = pd.Timestamp("2022-05-01 21:00:00")
end_time = pd.Timestamp("2022-05-01 23:00:00")

In [None]:
# Move the lon/lat/nv index levels into ordinary columns, leaving only the
# remaining (time) level as the index.
non_time_levels = ['lon', 'lat', 'nv']
data_pd_time_index = data_pd.reset_index(level=non_time_levels)

In [None]:
# Parse the index values into pandas datetimes and install the result as the
# frame's index.
datetime_index = pd.to_datetime(data_pd_time_index.index)
data_pd_time_index.index = datetime_index

In [None]:
# Show the first index entry as a quick check of the index values.
first_timestamp = data_pd_time_index.index[0]
first_timestamp

In [None]:
# Select the rows whose index falls in [start_time, end_time).
at_or_after_start = data_pd_time_index.index >= start_time
before_end = data_pd_time_index.index < end_time
data_pd_time_index.loc[at_or_after_start & before_end]

## Filtering by the Hierarchical Geospatial Index (H3)

In [None]:
# Tag every row with the H3 cell (resolution 2) that contains its lat/lon.
# NOTE(review): geo_to_h3 is the h3-py v3 function name; confirm the installed
# h3 version still provides it.
def _row_to_h3(row):
    """Return the resolution-2 H3 cell id for a row's 'lat'/'lon' values."""
    return h3.geo_to_h3(row['lat'], row['lon'], 2)


data_pd_time_index['h3_index'] = data_pd_time_index.apply(_row_to_h3, axis=1)

In [None]:
# Try a lookup by H3 cell: keep only the rows tagged with this cell id.
h3_index = '820327fffffffff'
in_cell = data_pd_time_index['h3_index'].eq(h3_index)
filtered_df = data_pd_time_index.loc[in_cell]

In [None]:
# Summary statistics for the rows selected by the H3 query.
filtered_summary = filtered_df.describe()
filtered_summary

## Exploring AWS Wrangler for Reading Parquet Files from S3

In [None]:
# Read the S3 parquet dataset in chunks; with `chunked` set, the call hands
# back an iterable of DataFrames rather than a single frame.
precipitation_prefix = f"{S3_DATA_DIR}/precipitation_data/"
generator = wr.s3.read_parquet(
    path=precipitation_prefix,
    chunked=1_000_000,
)

In [None]:
# Inspect only the first chunk yielded by the generator: move lon/lat/nv out of
# the index, convert the remaining index to datetimes, and print the day values.
for df in generator:
    df_time_index = df.reset_index(level=['lon','lat','nv'])
    # pd.to_datetime returns a new index rather than converting in place, so
    # the result must be assigned back; otherwise the conversion is discarded
    # and `.day` below is read from the unconverted index.
    df_time_index.index = pd.to_datetime(df_time_index.index)
    print(df_time_index.index.day)
    break  # one chunk is enough for this check
    