In [1]:
import os
import csv
import boto3

In [2]:
# AWS configuration: the destination bucket, the key prefix inside that bucket,
# and the S3 client used for the upload.
bucket_name = "ca-climate-index"
directory = "1_pull_data/climate_risk/flood/loss/noaa/downloaded_files"
s3_client = boto3.client("s3")

In [3]:
# Local inputs and outputs for the merge step:
#   main_folder - top-level directory whose subfolders hold the NOAA CSV files
#   output_file - name of the filtered, merged CSV that gets written
main_folder, output_file = (
    "noaa_storm_event_files",
    "all_noaa_storm_events_ca.csv",
)

In [4]:
def merge_and_filter(data_folder, output_file):
    '''
    Iterates through a folder with NOAA's storm event data and filters to the state of California, compiles to a single .csv file, and uploads to AWS bucket.

    Files are visited in sorted order so the merged output is reproducible. The header of the first
    CSV that has a STATE column becomes the header of the merged file. The header row of every later
    file is consumed (never treated as data) and is used to locate that file's own STATE column; rows
    from a file whose columns are ordered differently are re-ordered to match the merged header, with
    blanks for columns the file does not have. Empty CSVs and CSVs without a STATE column cannot be
    filtered by state and are skipped, as are blank rows and rows too short to reach the STATE column.

    Note:
    This function assumes users have configured the AWS CLI such that their access key / secret key pair are stored in ~/.aws/credentials.
    See https://docs.aws.amazon.com/cli/latest/userguide/getting-started-install.html for guidance.
    It also relies on the module-level names s3_client, bucket_name and directory for the upload.

    Parameters
    ----------
    data_folder: string
        The folder containing all NOAA storm event CSV files from NOAA's bulk storm event download page: https://www.ncei.noaa.gov/pub/data/swdi/stormevents/csvfiles/
    output_file: string
        Final output as a .csv file. Written locally, then uploaded under the key "<directory>/<output_file>".
    '''

    merged_header = None  # Header of the merged file; set by the first usable CSV

    # Create an empty list to hold rows
    rows = []

    # Iterate through the directory structure
    for root, dirs, files in os.walk(data_folder):
        dirs.sort()  # Sorting in place makes os.walk descend in a deterministic order
        for file in sorted(files):
            # Only CSV files are of interest
            if not file.endswith('.csv'):
                continue

            file_path = os.path.join(root, file)
            with open(file_path, 'r', newline='') as infile:
                reader = csv.reader(infile)

                # Every file starts with its own header row; consume it so it is never
                # mistaken for data. An empty file yields None instead of raising.
                headers = next(reader, None)
                if headers is None or 'STATE' not in headers:
                    continue
                state_index = headers.index('STATE')

                # The first usable file defines the header of the merged output
                if merged_header is None:
                    merged_header = headers
                    rows.append(merged_header)

                # Rows are copied verbatim when the layout matches; otherwise map each
                # merged column to its position in this file (None when absent).
                if headers == merged_header:
                    positions = None
                else:
                    lookup = {name: pos for pos, name in enumerate(headers)}
                    positions = [lookup.get(name) for name in merged_header]

                # Append rows for California only
                for row in reader:
                    # Blank/short rows have no STATE value to test
                    if len(row) <= state_index or row[state_index] != 'CALIFORNIA':
                        continue
                    if positions is not None:
                        row = [row[pos] if pos is not None and pos < len(row) else ''
                               for pos in positions]
                    rows.append(row)

    # Write the data to a CSV file
    with open(output_file, 'w', newline='') as outfile:
        writer = csv.writer(outfile)
        writer.writerows(rows)

    # Save the file to AWS S3 using the client
    with open(output_file, 'rb') as data:
        s3_client.upload_fileobj(data, bucket_name, f"{directory}/{output_file}")
    print(f"Merged and sorted files written to {output_file}")

In [5]:
# Run the merge: read everything under main_folder and write/upload output_file
merge_and_filter(data_folder=main_folder, output_file=output_file)

Merged and sorted files written to all_noaa_storm_events_ca.csv
