In [None]:
import geopandas as gpd
import os
import boto3
import zipfile
import sys
import pandas as pd
import shutil

sys.path.append(os.path.expanduser('../../'))
from scripts.utils.write_metadata import (
    append_metadata
)

In [None]:
# @append_metadata
def merge_flood(file_names, aws_dir, aws_out_dir, 
                varname='climate_iowa_mesonet_flash_flood_warnings'):
    """
    Merge Iowa State University Mesonet flood-warning archives into one shapefile.

    The flood-warning data is spread over several zipped files. Each file is
    downloaded from the 'ca-climate-index' S3 bucket (unless a copy already
    exists in the local 'temp' directory), read with GeoPandas, concatenated,
    de-duplicated on the ('geometry', 'ISSUED') pair, written out as
    'merged_flood.shp', zipped, and uploaded back to S3.

    Parameters
    ----------
    file_names : iterable of str
        Names of the zipped files located under `aws_dir` in the bucket.
        Must contain at least one entry.
    aws_dir : str
        S3 key prefix the input files are downloaded from.
    aws_out_dir : str
        S3 key prefix that 'merged_flood.zip' is uploaded to.
    varname : str, optional
        Not used inside this function body; presumably consumed by the
        `append_metadata` decorator when it is enabled -- TODO confirm.

    Returns
    -------
    None

    Raises
    ------
    ValueError
        If `file_names` is empty.
    """
    # Function-scope import: S3 keys always use '/', so the upload key must not
    # be built with os.path.join (which uses '\\' on Windows).
    import posixpath

    # Fail fast, before touching AWS or the filesystem, instead of letting
    # pd.concat raise an opaque "No objects to concatenate" later on.
    file_names = list(file_names)
    if not file_names:
        raise ValueError('file_names must contain at least one file to merge')

    s3_client = boto3.client('s3')  
    bucket_name = 'ca-climate-index'  

    # Local directory to store the downloaded zip files and extracted contents.
    # It is left in place afterwards so repeated runs can reuse the downloads.
    local_directory = 'temp'
    os.makedirs(local_directory, exist_ok=True)

    # List to store GeoPandas DataFrames
    dfs = []

    for name in file_names:
        print(f'Pulling data from filename: {name}')
        # Specify the S3 key (path + file) to download
        s3_key = f'{aws_dir}/{name}'
        print('S3 Key = ', s3_key)

        local_zip_file_path = os.path.join(local_directory, name)

        if not os.path.exists(local_zip_file_path):
            # File not found locally, download from S3
            print('This can take a good bit, depending on file size.')
            s3_client.download_file(bucket_name, s3_key, local_zip_file_path)
            print(f'Download complete: {name}')
        
        # Read the zipped file directly using GeoPandas
        data = gpd.read_file(local_zip_file_path)
        # Print number of rows for each individual dataset
        print(f'Number of rows in {name}: {len(data)}')        
        print('')
        dfs.append(data)
        
    # Merge all DataFrames together
    merged_data = pd.concat(dfs, ignore_index=True)
    print('Number of merged rows before looking at duplicates:', len(merged_data))
    
    # Identify and print all rows of duplicates (keep=False flags every member
    # of a duplicate group, not just the repeats)
    dedup_columns = ['geometry', 'ISSUED']
    duplicates = merged_data[merged_data.duplicated(subset=dedup_columns, keep=False)]

    print(f'Number of duplicated rows: {len(duplicates)}') 
    print("Rows of duplicates:")
    # `display` is expected to be available in the namespace, as it is in a
    # Jupyter/IPython kernel.
    display(duplicates)
    print("")

    # Drop duplicate rows, keeping only the first occurrence. A new frame is
    # produced rather than mutating `merged_data` in place.
    deduplicated = merged_data.drop_duplicates(subset=dedup_columns, keep='first')
    print('Number of merged rows after removing duplicates:', len(deduplicated))

    # Create a directory to store the shapefile and its associated files
    output_folder = os.path.join(local_directory, 'output_shapefile')
    os.makedirs(output_folder, exist_ok=True)
    
    # Save the de-duplicated shapefile inside the output folder
    output_shapefile_path = os.path.join(output_folder, 'merged_flood.shp')
    deduplicated.to_file(output_shapefile_path)
    
    # Zip the output folder; make_archive appends '.zip' to the base name and
    # returns the path of the archive it created.
    print('Zipping file...')
    archive_base = os.path.join(local_directory, 'merged_flood')
    output_zip_file_path = shutil.make_archive(archive_base, 'zip', output_folder)

    print('Uploading to AWS...')
    upload_key = posixpath.join(aws_out_dir, 'merged_flood.zip')
    s3_client.upload_file(output_zip_file_path, bucket_name, upload_key)
    print(f'merged_flood.zip uploaded to {aws_out_dir}')

In [None]:
# Zipped flood-warning files to be combined into a single shapefile
flood_files = [
    'fa_flood_warnings_1986_2024.zip',
    'fl_flood_warnings_1986_2024.zip',
    'flash_flood_warnings_1986_2024.zip',
]

# S3 prefix holding the input files, and the prefix that receives the merged zip
aws_dir = "1_pull_data/climate_risk/flood/exposure/isu_environmental_mesonet"
aws_out_dir = "2a_subset/climate_risk/flood/exposure/isu_environmental_mesonet/"

# Download, merge, de-duplicate, and upload the combined dataset
merge_flood(
    flood_files,
    aws_dir,
    aws_out_dir,
    varname='climate_iowa_mesonet_flash_flood_warnings',
)