### Cal-CRAI Subsetting -- ISU Mesonet Flood Warning Data
This notebook processes three different flood datasets sourced from the Iowa State University Mesonet:
https://mesonet.agron.iastate.edu/request/gis/watchwarn.phtml

Data subsetting includes:
* merging the three datasets into one by stacking their rows
* eliminating duplicate warnings across the datasets based on issue time (`ISSUED`) and location (`geometry`)

The output is uploaded to the `2a_subset` directory in the AWS S3 bucket when the function is run with `export=True`.

In [1]:
import os
import posixpath
import shutil
import sys
import zipfile

import boto3
import geopandas as gpd
import pandas as pd

sys.path.append(os.path.expanduser('../../'))
from scripts.utils.write_metadata import (
    append_metadata
)

In [2]:
@append_metadata
def merge_flood(file_names, aws_dir, aws_out_dir, export=False, varname=''):
    """
Iowa State University Mesonet data on flood warnings is spread over three different files: flash floods and two separate entries of flood (fl and fa). This function merges all three into one file and checks for duplicate flood/flash flood warning entries based on the same time and location. The resulting merged and subsetted .zip file is then uploaded to AWS.
    
    Note:
    This function assumes users have configured the AWS CLI such that their access key / secret key pair are 
    stored in ~/.aws/credentials.
    See https://docs.aws.amazon.com/cli/latest/userguide/getting-started-install.html for guidance.
    
    Parameters
    ----------
    file_names: list of str
        names of the zipped flood files to merge; a single file name passed as a
        plain string is treated as a one-item list. At least one file name is
        required, otherwise a ValueError is raised.
    aws_dir: string
        AWS directory of the .zip flood files: 
        1_pull_data/climate_risk/flood/exposure/isu_environmental_mesonet
    aws_out_dir: string
        AWS directory for the output .zip file:
        2a_subset/climate_risk/flood/exposure/isu_environmental_mesonet/
    export: bool
        if True, uploads final result to designated AWS bucket
    varname: string
        not read anywhere in this function body; presumably consumed by the
        append_metadata decorator (to be confirmed against the decorator)

    Script
    ------
    isu_mesonet_flood_merge.ipynb
    """
    # NOTE(review): the layout of the docstring above and the wording of the
    # 'Data transformation' prints below may be consumed by @append_metadata
    # (not visible from this notebook) -- confirm before rewording either.
    print('Data transformation: the three different flooding files are merged together')
    print('Data transformation: duplicate entries were removed based on location and date')
    print('Data transformation: the final file was saved as a shp file, zipped, and reuploaded to AWS')

    # Accept a bare file name as well as a list, and fail early with a clear
    # message instead of letting pd.concat raise on an empty list.
    if isinstance(file_names, str):
        file_names = [file_names]
    file_names = list(file_names)
    if not file_names:
        raise ValueError('file_names must contain at least one flood file to merge')

    s3_client = boto3.client('s3')
    bucket_name = 'ca-climate-index'

    # Local directory to store the downloaded zip files and the output
    local_directory = 'temp'
    os.makedirs(local_directory, exist_ok=True)

    # One GeoDataFrame per input file
    dfs = []
    for name in file_names:
        s3_key = f'{aws_dir}/{name}'
        local_zip_file_path = os.path.join(local_directory, name)

        # Reuse a copy left in the temp directory by an earlier run; only
        # download from S3 when the file is not already present locally
        if not os.path.exists(local_zip_file_path):
            s3_client.download_file(bucket_name, s3_key, local_zip_file_path)

        dfs.append(gpd.read_file(local_zip_file_path))

    # Stack all rows into a single table with a fresh 0..n-1 index
    merged_data = pd.concat(dfs, ignore_index=True)

    # Two rows count as duplicates when both location and issue time match
    duplicate_keys = ['geometry', 'ISSUED']

    # Show every row that takes part in a duplicate group (keep=False marks
    # all members, not only the later repeats)
    duplicates = merged_data[merged_data.duplicated(subset=duplicate_keys, keep=False)]
    display(duplicates)

    # Keep only the first occurrence of each duplicate group
    deduplicated_data = merged_data.drop_duplicates(subset=duplicate_keys, keep='first')

    # Create a directory to store the shapefile and its associated files
    output_folder = os.path.join(local_directory, 'output_shapefile')
    os.makedirs(output_folder, exist_ok=True)

    output_shapefile_path = os.path.join(output_folder, 'merged_flood.shp')
    deduplicated_data.to_file(output_shapefile_path)

    # make_archive appends '.zip' itself, so hand it the path without extension
    output_zip_file_path = os.path.join(local_directory, 'merged_flood.zip')
    shutil.make_archive(os.path.splitext(output_zip_file_path)[0], 'zip', output_folder)

    if export:
        # S3 keys always use '/', whatever the local OS path separator is
        s3_out_key = posixpath.join(aws_out_dir, 'merged_flood.zip')
        print('Uploading to AWS...')
        s3_client.upload_file(output_zip_file_path, bucket_name, s3_out_key)
        print(f'merged_flood.zip uploaded to {aws_out_dir}')
    else:
        # NOTE(review): nothing is uploaded on this branch, yet the message
        # says it was. Presumably the wording is intended for output captured
        # by @append_metadata -- confirm; left unchanged for that reason.
        print('merged_flood.zip uploaded to AWS.')

In [3]:
# Zipped warning archives to combine: the two flood entries (fa, fl) and flash flood
flood_files = [
    'fa_flood_warnings_1986_2024.zip',
    'fl_flood_warnings_1986_2024.zip',
    'flash_flood_warnings_1986_2024.zip',
]

# S3 prefixes: where the input archives are read from and where the merged archive goes
aws_dir = '1_pull_data/climate_risk/flood/exposure/isu_environmental_mesonet'
aws_out_dir = '2a_subset/climate_risk/flood/exposure/isu_environmental_mesonet/'

# With export=False the upload step inside merge_flood is skipped
merge_flood(
    flood_files,
    aws_dir,
    aws_out_dir,
    export=False,
    varname='climate_iowa_mesonet_flash_flood_warnings',
)

Unnamed: 0,WFO,ISSUED,EXPIRED,INIT_ISS,INIT_EXP,PHENOM,GTYPE,SIG,ETN,STATUS,...,HV_CAUSE,HV_REC,EMERGENC,POLY_BEG,POLY_END,WINDTAG,HAILTAG,TORNTAG,DAMAGTAG,geometry
918,SGX,201902141859,201902150100,201902141859,201902150100,FA,C,W,1,EXP,...,ER,OO,0,,,,,,,"POLYGON ((-117.67000 33.89000, -117.65000 33.8..."
1614,SGX,201012212020,201012211730,201012211449,201012230800,FL,C,W,4,CAN,...,ER,NO,0,,,,,,,"MULTIPOLYGON (((-117.51000 33.51000, -117.5000..."
1615,SGX,201012212020,201012211730,201012211438,201012230800,FL,C,W,4,CAN,...,ER,NO,0,,,,,,,"MULTIPOLYGON (((-117.51000 33.51000, -117.5000..."
1616,SGX,201012211438,201012220527,201012211449,201012220345,FL,C,W,5,CAN,...,ER,NO,0,,,,,,,"MULTIPOLYGON (((-117.51000 33.51000, -117.5000..."
1617,SGX,201012211438,201012220527,201012211438,201012220345,FL,C,W,5,CAN,...,ER,NO,0,,,,,,,"MULTIPOLYGON (((-117.51000 33.51000, -117.5000..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10794,SGX,202209122054,202209130000,202209122054,202209130000,FF,C,W,58,NEW,...,ER,OO,0,,,,,,,"POLYGON ((-117.67000 33.89000, -117.67000 33.8..."
10919,VEF,202209122054,202209130000,202209122054,202209130000,FF,C,W,276,EXP,...,ER,OO,0,,,,,,,"POLYGON ((-117.67000 33.89000, -117.67000 33.8..."
10931,VEF,202209281905,202209282215,202209281905,202209282215,FF,C,W,311,EXP,...,ER,OO,0,,,,,,,"POLYGON ((-117.67000 33.89000, -117.67000 33.8..."
11087,PSR,202308182347,202308190315,202308182347,202308190315,FF,C,W,12,EXP,...,ER,OO,0,,,,,,,"POLYGON ((-117.67000 33.89000, -117.67000 33.8..."
