In [None]:
import geopandas as gpd
import s3fs
import pandas as pd
import boto3
import dask_geopandas
import dask.dataframe as dd

In [None]:
# Object keys of the layers to preview; each is later appended to a bucket
# prefix to form the full S3 location.
pqt_list = [
    "climate_risk/flood/exposure/koordinates/climate_koordinates_floodplain.parquet.gzip",
    "climate_risk/wildfire/exposure/historical/iowa_state_environmental_mesonet/climate_iowa_mesonet_wildfire_warnings.parquet.gzip",
    "governance/community_preparedness/usda_forest_service/governance_usda_fuel_reduction.parquet.gzip",
    "governance/natural_resource_conservation/usda/forest_to_faucets/F2F2_Assessment/governance_usda_watershed_risk.parquet.gzip",
]

In [None]:
# S3 filesystem handle, plus the bucket and key prefix the layers sit under
fs = s3fs.S3FileSystem()
bucket = 'ca-climate-index'
path = '2b_reproject/'

# Read each listed layer from S3 and render it for a visual check
for pqt in pqt_list:
    print(pqt)
    # full key = prefix + layer key; full location = s3://<bucket>/<key>
    ppath = path + pqt
    bucket_uri = f's3://{bucket}/{ppath}'
    df = gpd.read_parquet(bucket_uri)
    display(df)

In [None]:
# TODO: add the metric calculation(s) for the layers loaded above

# The biggest dataset: ISU Mesonet's flood warning database

In [None]:
# make a list of paths, since this is actually 10 files
def build_isu_mesonet_file_list(
    path='2b_reproject/climate_risk/flood/exposure/isu_environmental_mesonet',
    bucket_name='ca-climate-index',
):
    """List the S3 object keys stored under a key prefix.

    Parameters
    ----------
    path : str
        Key prefix to list; every object whose key starts with it is returned.
    bucket_name : str
        Name of the S3 bucket to list.

    Returns
    -------
    list of str
        Object keys (relative to the bucket, not full ``s3://`` URIs).
        Keys ending in ``/`` are skipped: those are the placeholder objects
        S3 tooling creates to represent "folders", not data files.
    """
    # default boto3 session -> resource API -> handle on the bucket
    s3 = boto3.Session().resource('s3')
    my_bucket = s3.Bucket(bucket_name)
    return [
        obj.key
        for obj in my_bucket.objects.filter(Prefix=path)
        if not obj.key.endswith('/')
    ]

In [None]:
# Keys of every file under the ISU Mesonet flood prefix
pqt_list = build_isu_mesonet_file_list()
path = '2b_reproject/climate_risk/flood/exposure/isu_environmental_mesonet'

# Columns that are not needed for the per-tract tally
to_drop = [
    'hilbert_distance', 'WFO', 'EXPIRED', 'INIT_ISS', 'INIT_EXP',
    'PHENOM', 'GTYPE', 'SIG', 'ETN', 'STATUS', 'NWS_UGC',
    'UPDATED', 'HV_NWSLI', 'HV_SEV', 'HV_CAUSE', 'HV_REC',
    'POLY_BEG', 'POLY_END', 'WINDTAG', 'HAILTAG', 'TORNTAG',
    'DAMAGTAG', 'index_right', 'USCB_NAME', 'AREA_KM2', 'EMERGENC',
    'geometry',
]

# One small pandas frame of per-tract counts is collected per input file
df_list = []
for f in pqt_list:
    bucket_uri = f's3://{bucket}/{f}'
    # lazily open the file with dask-geopandas and shed the unneeded columns
    df = dask_geopandas.read_parquet(bucket_uri).drop(columns=to_drop)
    # count() tallies non-null values per remaining column for each tract;
    # compute() materialises the (much smaller) result as a pandas frame
    df_out = df.groupby(['USCB_GEOID']).count().compute()
    # tract id back to a column; the ISSUED tally becomes the warning count
    df_out = df_out.reset_index().rename(columns={'ISSUED': 'number_warnings'})
    df_list.append(df_out)
    # drops this name's reference only; the frame itself lives on in df_list
    df_out = None

In [None]:
# Combine the per-file tallies into one total per census tract.
# Stacking the frames and summing once keeps the counts as integers and
# avoids the NaN-filled `_x` / `_y` columns that chained outer merges create.
if not df_list:
    raise ValueError("df_list is empty: no per-file warning counts to combine")

merged_df = (
    pd.concat(df_list, ignore_index=True)
    # as_index=False keeps USCB_GEOID as a regular column for the later join
    .groupby('USCB_GEOID', as_index=False)['number_warnings']
    .sum()
)
display(merged_df)

In [None]:
# Load the California census-tract boundaries (2021 TIGER files) from S3
census_shp_dir = (
    "s3://ca-climate-index/0_map_data/2021_tiger_census_tract/2021_ca_tract/"
)
ca_boundaries = gpd.read_file(census_shp_dir)

In [None]:
# Rename the tract-id column to match the boundary file's GEOID, then attach
# the boundary attributes (inner join on GEOID, the pandas default)
merged_df = merged_df.rename(columns={'USCB_GEOID': 'GEOID'})
flood_warning_df = merged_df.merge(ca_boundaries, on="GEOID")

In [None]:
# Wrap the joined table as a GeoDataFrame, using its `geometry` column as the
# active geometry so it can be mapped
tract_geometry = flood_warning_df["geometry"]
flood_gdf = gpd.GeoDataFrame(flood_warning_df, geometry=tract_geometry)

In [None]:
# Choropleth of warning counts per tract, binned into quantile classes.
# Keeping the returned Axes lets the map carry a title; the trailing `;`
# suppresses the text repr that would otherwise be echoed under the figure.
ax = flood_gdf.plot(column="number_warnings", legend=True, scheme="quantiles")
ax.set_title("Flood warnings per census tract");