In [1]:
import os
import csv
import boto3
import sys
import pandas as pd
import zipfile
import io

sys.path.append(os.path.expanduser('../../'))
from scripts.utils.file_helpers import upload_csv_aws
from scripts.utils.write_metadata import append_metadata

In [2]:
import boto3
import os
import io
import zipfile
import pandas as pd

def pull_csv_or_xls_from_directory(bucket_name, directory, main_folder, search_zipped=True):
    """
    Download CSV/XLS files found under an S3 prefix into a local folder.

    Every object whose key starts with ``directory`` is considered. The object's
    path relative to ``directory`` is reproduced underneath ``main_folder``.

    The two modes are mutually exclusive:
    - ``search_zipped=True``: only keys ending in ``.zip`` are opened, and the
      ``.csv`` / ``.xls`` members inside them are extracted. Loose CSV/XLS
      objects are ignored.
    - ``search_zipped=False``: only keys ending in ``.csv`` / ``.xls`` are
      downloaded. Zip archives are ignored.

    Extension matching is case-sensitive. A line is printed for each saved file,
    and a notice is printed if the prefix contains no objects at all.

    Parameters:
    - bucket_name (str): The name of the S3 bucket.
    - directory (str): The key prefix within the bucket to search.
    - main_folder (str): The local folder where files will be saved (created if missing).
    - search_zipped (bool): If True, look inside zip files; if False, download CSV/XLS objects directly.
    """
    wanted_suffixes = ('.csv', '.xls')

    # Create an S3 client
    s3 = boto3.client('s3')

    # Ensure main folder exists
    os.makedirs(main_folder, exist_ok=True)

    # A single list_objects_v2 call returns at most 1,000 keys; the paginator
    # follows the continuation tokens so that larger prefixes are fully listed.
    paginator = s3.get_paginator('list_objects_v2')

    found_any = False
    for page in paginator.paginate(Bucket=bucket_name, Prefix=directory):
        # Pages for an empty prefix carry no 'Contents' entry
        for obj in page.get('Contents', []):
            found_any = True
            key = obj['Key']

            # Mirror the key's path (relative to the prefix) under main_folder
            relative_path = os.path.relpath(key, directory)
            local_file_path = os.path.join(main_folder, relative_path)
            local_dir_path = os.path.dirname(local_file_path)

            if search_zipped and key.endswith('.zip'):
                # Only create the target directory when something may be saved there
                os.makedirs(local_dir_path, exist_ok=True)

                # Download the zip file into memory
                zip_object = s3.get_object(Bucket=bucket_name, Key=key)
                zip_data = io.BytesIO(zip_object['Body'].read())

                with zipfile.ZipFile(zip_data, 'r') as zip_ref:
                    for file_name in zip_ref.namelist():
                        if file_name.endswith(wanted_suffixes):
                            # Extract and save the member next to where the zip would live
                            zip_ref.extract(file_name, local_dir_path)
                            print(f"Saved '{file_name}' to '{local_dir_path}'")
            elif not search_zipped and key.endswith(wanted_suffixes):
                os.makedirs(local_dir_path, exist_ok=True)

                # Directly download the CSV or XLS file and write it to disk
                file_object = s3.get_object(Bucket=bucket_name, Key=key)
                with open(local_file_path, 'wb') as f:
                    f.write(file_object['Body'].read())
                print(f"Saved '{key}' to '{local_file_path}'")

    if not found_any:
        print("No objects found in the specified directory.")

## Set-up for AWS
# These module-level names are also read by merge_and_filter when it uploads its result.
s3_client = boto3.client('s3')
bucket_name = 'ca-climate-index'
directory = '1_pull_data/climate_risk/flood/loss/noaa/downloaded_files'

# Download the loose CSV/XLS objects under the prefix (zip archives are not searched)
pull_csv_or_xls_from_directory(
    bucket_name=bucket_name,
    directory=directory,
    main_folder='noaa_storm_event_files',
    search_zipped=False,
)


Saved '1_pull_data/climate_risk/flood/loss/noaa/downloaded_files/StormEvents_details-ftp_v1.0_d1997_c20220425.csv/StormEvents_details-ftp_v1.0_d1997_c20220425.csv' to 'noaa_storm_event_files\StormEvents_details-ftp_v1.0_d1997_c20220425.csv\StormEvents_details-ftp_v1.0_d1997_c20220425.csv'
Saved '1_pull_data/climate_risk/flood/loss/noaa/downloaded_files/StormEvents_details-ftp_v1.0_d1998_c20220425.csv/StormEvents_details-ftp_v1.0_d1998_c20220425.csv' to 'noaa_storm_event_files\StormEvents_details-ftp_v1.0_d1998_c20220425.csv\StormEvents_details-ftp_v1.0_d1998_c20220425.csv'
Saved '1_pull_data/climate_risk/flood/loss/noaa/downloaded_files/StormEvents_details-ftp_v1.0_d1999_c20220425.csv/StormEvents_details-ftp_v1.0_d1999_c20220425.csv' to 'noaa_storm_event_files\StormEvents_details-ftp_v1.0_d1999_c20220425.csv\StormEvents_details-ftp_v1.0_d1999_c20220425.csv'
Saved '1_pull_data/climate_risk/flood/loss/noaa/downloaded_files/StormEvents_details-ftp_v1.0_d2000_c20220425.csv/StormEvents_deta

In [3]:
# Local folder holding the downloaded NOAA storm event CSV files
# (same folder name the download cell saves into; it is walked recursively by merge_and_filter)
main_folder = 'noaa_storm_event_files'

# Name of the CSV file that receives the merged, California-only rows
output_file = 'all_noaa_storm_events_ca.csv'

In [4]:
@append_metadata
def merge_and_filter(data_folder, output_file, export=False, varname=''):
    '''
    Merges NOAA storm event CSV files into a single .csv file containing only California rows.

    Walks ``data_folder`` recursively (directories and files in sorted order, so the
    output is reproducible), reads every file whose name ends in ``.csv`` and keeps the
    rows whose ``STATE`` column equals ``'CALIFORNIA'``. The header of the first usable
    file is written once at the top of ``output_file``; rows from later files are appended
    as read, so all input files are expected to share the same column layout.
    Empty files and files without a ``STATE`` column are skipped, as are blank or
    truncated rows that have no ``STATE`` field.

    Note:
    When ``export`` is truthy the upload uses the notebook-level globals ``s3_client``,
    ``bucket_name`` and ``directory``, which must already be defined.
    This function assumes users have configured the AWS CLI such that their access key / secret key pair are stored in ~/.aws/credentials.
    See https://docs.aws.amazon.com/cli/latest/userguide/getting-started-install.html for guidance.

    Parameters
    ----------
    data_folder: string
        The folder containing all NOAA storm event CSV files from NOAA's bulk storm event download page: https://www.ncei.noaa.gov/pub/data/swdi/stormevents/csvfiles/
    output_file: string
        Final output as a .csv file.
    export: bool
        If truthy, the written file is also uploaded to ``{directory}/{output_file}`` in the S3 bucket.
    varname: string
        Not used in the function body; presumably consumed by the ``append_metadata`` decorator -- TODO confirm.
    '''

    # Header of the first usable file; stays None until one has been read
    headers = None

    # Create a list to hold the header followed by all matching rows
    rows = []

    # Iterate through the directory structure
    for root, dirs, files in os.walk(data_folder):
        # Sorting in place makes os.walk descend in a deterministic order
        dirs.sort()
        for file in sorted(files):
            # Only CSV files are considered
            if not file.endswith('.csv'):
                continue

            file_path = os.path.join(root, file)
            with open(file_path, 'r', newline='') as infile:
                reader = csv.reader(infile)

                # Every file starts with its own header row; consume it here
                file_headers = next(reader, None)
                if file_headers is None:
                    # Empty file: nothing to read
                    continue
                if 'STATE' not in file_headers:
                    print(f"Skipping '{file_path}': no 'STATE' column found.")
                    continue

                # Resolve the column position per file rather than trusting the first file
                state_index = file_headers.index('STATE')

                # The output header is taken from the first usable file only
                if headers is None:
                    headers = file_headers
                    rows.append(headers)

                # Keep California rows; the length check skips blank/truncated rows
                for row in reader:
                    if len(row) > state_index and row[state_index] == 'CALIFORNIA':
                        rows.append(row)

    # Write the data to a CSV file
    with open(output_file, 'w', newline='') as outfile:
        writer = csv.writer(outfile)
        writer.writerows(rows)

    if export:
        # Save the file to AWS S3 using the client
        with open(output_file, 'rb') as data:
            s3_client.upload_fileobj(data, bucket_name, f"{directory}/{output_file}")
        print(f"Merged and sorted files written to {output_file}")

In [5]:
# Merge every CSV under main_folder into output_file, keeping California rows only.
# export=False keeps the result local (no S3 upload); varname is passed as a keyword
# and is not used inside merge_and_filter's own body.
merge_and_filter(main_folder, output_file, export=False, varname='climate_noaa_flood_fatalities')