In [1]:
import pandas as pd
import os
import boto3
from zipfile import ZipFile
import shutil
import numpy as np
import sys

In [2]:
# Make the directory two levels above the current working directory importable
# (presumably the repository root that contains the `scripts` package -- confirm).
# `os.path.expanduser` was a no-op here because the path contains no `~`, so the
# path is resolved with `abspath` instead. The membership check keeps re-running
# this cell from appending duplicate entries to `sys.path`.
repo_root = os.path.abspath(os.path.join('..', '..'))
if repo_root not in sys.path:
    sys.path.append(repo_root)
from scripts.utils.file_helpers import (
    min_max_standardize
)

In [3]:
def gov_built_society_natural_domain_calc(output_folder_name, domain_name):
    '''
    Sums the dummy indicator files of one domain per census tract (GEOID), min-max
    standardizes the totals, writes the result to a .csv file, and uploads it to AWS S3.

    Steps:
    1. Download '3_fair_data/dummy_data/dummy_dataset.zip' from the
       'ca-climate-index' bucket into the local 'dummy_dataset' folder and extract it.
    2. For every extracted .csv file whose name starts with `domain_name`, group by
       'GEOID' and sum the second column.
    3. Combine the per-file sums (tracts missing from a file contribute 0) and add a
       'sum_of_values' column holding the row total across all indicator columns.
    4. Min-max standardize 'sum_of_values' and verify the result lies in [0, 1].
    5. Save '<domain_name>_summed_indicators.csv' in `output_folder_name` and upload
       it to '3_fair_data/dummy_data/' in the same bucket.

    Note:
    This function assumes users have configured the AWS CLI such that their access key / secret key pair are stored in ~/.aws/credentials.
    See https://docs.aws.amazon.com/cli/latest/userguide/getting-started-install.html for guidance.

    Parameters
    ----------
    output_folder_name: string
        Name of output folder
    domain_name: string
        Starting string of the dummy data e.g. DUMMY_built

    Raises
    ------
    FileNotFoundError
        If the extracted dataset contains no .csv file starting with `domain_name`.
    ValueError
        If any standardized value falls outside [0, 1] (missing values are also flagged).
    '''

    # Initialize the S3 client
    s3_client = boto3.client('s3')

    # Bucket name and file paths
    bucket_name = 'ca-climate-index'
    directory = '3_fair_data/dummy_data/dummy_dataset.zip'

    # Local directory to store the downloaded zip file and extracted contents
    local_directory = 'dummy_dataset'
    os.makedirs(local_directory, exist_ok=True)

    # Download the zip file
    local_zip_file_path = os.path.join(local_directory, os.path.basename(directory))
    s3_client.download_file(bucket_name, directory, local_zip_file_path)

    # Extract the contents of the zip file
    with ZipFile(local_zip_file_path, 'r') as zip_ref:
        zip_ref.extractall(local_directory)

    # Keep only the files that start with `domain_name` and end with '.csv'.
    # Sorted because os.listdir returns entries in arbitrary order.
    csv_files = sorted(
        file for file in os.listdir(local_directory)
        if file.startswith(domain_name) and file.endswith('.csv')
    )

    # Fail early with a clear message instead of crashing later on a None DataFrame
    if not csv_files:
        raise FileNotFoundError(
            f"No .csv files starting with '{domain_name}' were found in '{local_directory}'"
        )

    # Accumulates the per-tract sums from all files
    aggregated_df = None

    # Loop through each CSV file
    for file in csv_files:
        # Read the CSV file
        df = pd.read_csv(os.path.join(local_directory, file))

        # Group by 'GEOID' and sum the values in the second column
        grouped_df = df.groupby('GEOID').agg({df.columns[1]: 'sum'})

        # Combine with what has been accumulated so far; fill_value=0 keeps tracts
        # (and indicator columns) that are present on only one side
        if aggregated_df is None:
            aggregated_df = grouped_df
        else:
            aggregated_df = aggregated_df.add(grouped_df, fill_value=0)

    # Turn the GEOID index back into the first column before saving
    aggregated_df = aggregated_df.reset_index()

    # Add a final column containing the sum of values across all columns except the first (GEOID)
    aggregated_df['sum_of_values'] = aggregated_df.iloc[:, 1:].sum(axis=1)

    # Use min max standardize function. It is expected to add a
    # 'min_max_standardized' column to aggregated_df in place (read just below).
    min_max_standardize(aggregated_df, 'sum_of_values')

    # check for values <0 or >1 (NaN also fails `between` and is therefore flagged):
    bad_values = aggregated_df[~aggregated_df['min_max_standardized'].between(0., 1., inclusive='both')]
    if not bad_values.empty:
        print(f"ERROR: Out of bounds data found in {domain_name} data at the following tracts:")
        print(bad_values.GEOID)
        raise ValueError(
            f"min-max standardized values outside [0, 1] for "
            f"{len(bad_values)} tract(s) in {domain_name} data"
        )

    # Create the output folder if it doesn't exist
    os.makedirs(output_folder_name, exist_ok=True)

    # Save the final grouped DataFrame to a new CSV file
    final_output_file_path = os.path.join(output_folder_name, f"{domain_name}_summed_indicators.csv")
    aggregated_df.to_csv(final_output_file_path, index=False)

    # Upload the sum file to AWS S3
    s3_client.upload_file(final_output_file_path, bucket_name, f"3_fair_data/dummy_data/{domain_name}_summed_indicators.csv")
    print('Summed indicators uploaded to aws')
    
# os.remove(metric_avgs_folder)
# os.remove(final_output_file_path)

In [4]:
# File-name prefixes of the domains to process. Named `domain_prefixes` rather
# than `list` so this cell does not rebind the built-in `list` type.
domain_prefixes = ['DUMMY_built',
                   'DUMMY_society',
                   'DUMMY_governance',
                   'DUMMY_natural'
                   ]

# The function has no return value (it writes and uploads one CSV per domain),
# so its result is not assigned to a variable.
for domain_prefix in domain_prefixes:
    gov_built_society_natural_domain_calc('sum_of_indicators', domain_prefix)

Summed indicators uploaded to aws
Summed indicators uploaded to aws
Summed indicators uploaded to aws
Summed indicators uploaded to aws


In [5]:
def climate_domain_calc(output_folder_name, file_identifier):
    '''
    Pulls climate risk dummy data from AWS, selects the indicator files whose name
    contains `file_identifier` (e.g. exposure or loss), and sums their values per
    census tract (GEOID).

    The dummy dataset archive '3_fair_data/dummy_data/dummy_dataset.zip' is downloaded
    from the 'ca-climate-index' bucket into the local 'dummy_dataset' folder and
    extracted. For each matching .csv file the second column is summed per 'GEOID';
    the per-file sums are combined (tracts missing from a file contribute 0) and a
    'sum_of_values' column with the row total is added. The result is written to
    '<file_identifier>_summed_indicators.csv' inside `output_folder_name`.
    This function only writes the local file; it does not standardize or upload.

    Note:
    This function assumes users have configured the AWS CLI such that their access key / secret key pair are stored in ~/.aws/credentials.
    See https://docs.aws.amazon.com/cli/latest/userguide/getting-started-install.html for guidance.

    Parameters
    ----------
    output_folder_name: string
        Name of output folder
    file_identifier: string
        Keyword to separate file names e.g. exposure

    Raises
    ------
    FileNotFoundError
        If the extracted dataset contains no .csv file whose name contains `file_identifier`.
    '''

    # Initialize the S3 client
    s3_client = boto3.client('s3')

    # Bucket name and file paths
    bucket_name = 'ca-climate-index'
    directory = '3_fair_data/dummy_data/dummy_dataset.zip'

    # Local directory to store the downloaded zip file and extracted contents
    local_directory = 'dummy_dataset'
    os.makedirs(local_directory, exist_ok=True)

    # Download the zip file
    local_zip_file_path = os.path.join(local_directory, os.path.basename(directory))
    s3_client.download_file(bucket_name, directory, local_zip_file_path)

    # Extract the contents of the zip file
    with ZipFile(local_zip_file_path, 'r') as zip_ref:
        zip_ref.extractall(local_directory)

    # Keep only the files that contain `file_identifier` and end with '.csv'.
    # Sorted because os.listdir returns entries in arbitrary order.
    csv_files = sorted(
        file for file in os.listdir(local_directory)
        if file_identifier in file and file.endswith('.csv')
    )

    # Fail early with a clear message instead of crashing later on a None DataFrame
    if not csv_files:
        raise FileNotFoundError(
            f"No .csv files containing '{file_identifier}' were found in '{local_directory}'"
        )

    # Accumulates the per-tract sums from all files
    aggregated_df = None

    # Loop through each CSV file
    for file in csv_files:
        # Read the CSV file
        df = pd.read_csv(os.path.join(local_directory, file))

        # Group by 'GEOID' and sum the values in the second column
        grouped_df = df.groupby('GEOID').agg({df.columns[1]: 'sum'})

        # Combine with what has been accumulated so far; fill_value=0 keeps tracts
        # (and indicator columns) that are present on only one side
        if aggregated_df is None:
            aggregated_df = grouped_df
        else:
            aggregated_df = aggregated_df.add(grouped_df, fill_value=0)

    # Turn the GEOID index back into the first column before saving
    aggregated_df = aggregated_df.reset_index()

    # Add a final column containing the sum of values across all columns except the first (GEOID)
    aggregated_df['sum_of_values'] = aggregated_df.iloc[:, 1:].sum(axis=1)

    # Create the output folder if it doesn't exist
    os.makedirs(output_folder_name, exist_ok=True)

    # Save the final grouped DataFrame to a new CSV file
    final_output_file_path = os.path.join(output_folder_name, f"{file_identifier}_summed_indicators.csv")
    aggregated_df.to_csv(final_output_file_path, index=False)
    
#os.remove(final_output_file_path)
           

In [6]:
# For climate risk domain
# Keywords identifying the two groups of climate indicator files. Named
# `climate_file_identifiers` rather than `list` so this cell does not rebind
# the built-in `list` type.
climate_file_identifiers = ['exposure',
                            'loss'
                            ]

# Writes climate_domain/<identifier>_summed_indicators.csv for each keyword
for file_identifier in climate_file_identifiers:
    climate_domain_calc('climate_domain', file_identifier)

# Output folder to hold output from multiplication & standardization steps
output_folder = 'climate_domain_product'

# Load both files created from climate_domain_calc
df1 = pd.read_csv(os.path.join('climate_domain', 'exposure_summed_indicators.csv'))
df2 = pd.read_csv(os.path.join('climate_domain', 'loss_summed_indicators.csv'))

# Merge based on GEOID (default inner join: only tracts present in both files are
# kept). 'sum_of_values' exists in both frames, so pandas suffixes it with
# _x (exposure) and _y (loss).
merged_df = pd.merge(df1, df2, on='GEOID')

# Multiply 'sum_of_values' columns together
merged_df['product_of_sum_of_values_exposure_loss'] = merged_df['sum_of_values_x'] * merged_df['sum_of_values_y']

# Use min max standardize function. It is expected to add 'min_sum_value',
# 'max_sum_value' and 'min_max_standardized' columns to merged_df in place
# (they are read immediately below).
min_max_standardize(merged_df, 'product_of_sum_of_values_exposure_loss')

# Copy the standardization columns under names that accurately describe the
# multiplication portion of the climate risk domain (the originals stay in merged_df)
merged_df['min_product_value'] = merged_df['min_sum_value']
merged_df['max_product_value'] = merged_df['max_sum_value']
merged_df['min_max_standardized_from_product'] = merged_df['min_max_standardized']

# Create the directory for the multiplied output file if it doesn't exist
multiplied_output_folder = output_folder
os.makedirs(multiplied_output_folder, exist_ok=True)

# Save the multiplied DataFrame to a new CSV file
multiplied_output_file_path = os.path.join(multiplied_output_folder, "climate_multiplied.csv")
merged_df[['GEOID',
           'sum_of_values_x',
           'sum_of_values_y',
           'product_of_sum_of_values_exposure_loss',
           'min_product_value',
           'max_product_value',
           'min_max_standardized_from_product']].to_csv(multiplied_output_file_path, index=False)

# Upload the product file to AWS S3
s3_client = boto3.client('s3')
bucket_name = 'ca-climate-index'
s3_client.upload_file(multiplied_output_file_path, bucket_name, "3_fair_data/dummy_data/DUMMY_climate_indicator_product.csv")
print('Climate risk domain calculation uploaded to aws')

#os.remove(multiplied_output_folder)

Climate risk domain calculation uploaded to aws
