## This notebook calculates the index values from the dummy domain data following the CRSI index equation outlined in our doc: https://docs.google.com/document/d/15207KL6Q0Y3jxKcAm0hijikDNbQ5HpQqrNCL_VXet0Y/edit

In [1]:
import os
import sys
import pandas as pd
import io
import numpy as np
import boto3
import zipfile
import shutil

sys.path.append(os.path.expanduser('../../'))
from scripts.utils.write_metadata import (
    append_metadata
)

# IMPORT WHEN PR #42 IS MERGED
'''
sys.path.append(os.path.expanduser('../../'))
from scripts.utils.file_helpers import (
    pull_zipped_csv, upload_csv_aws
)
'''

"\nsys.path.append(os.path.expanduser('../../'))\nfrom scripts.utils.file_helpers import (\n    pull_zipped_csv, upload_csv_aws\n)\n"

### The pull_csv_from_directory function currently lives in another PR. Once that PR is merged, remove the definition below and import and call the shared function instead

In [2]:
def pull_csv_from_directory(bucket_name, directory, search_zipped=True):
    """
    Pulls CSV files from a specified directory in an S3 bucket.

    Every matching CSV is parsed with pandas and written (without the index)
    to the current working directory under its own base file name.

    Parameters:
    - bucket_name (str): The name of the S3 bucket.
    - directory (str): The directory (key prefix) within the bucket to search for CSV files.
    - search_zipped (bool): If True, search for CSV files within zip files. If False, search for CSV files directly.

    Returns:
    - None. Prints one line per saved file, or a notice if the directory holds no objects.
    """
    # Create an S3 client
    s3 = boto3.client('s3')

    # A single list_objects_v2 call returns at most 1,000 keys, so page
    # through the listing to make sure no object is silently skipped
    paginator = s3.get_paginator('list_objects_v2')

    found_objects = False
    for page in paginator.paginate(Bucket=bucket_name, Prefix=directory):
        # Pages without matches carry no 'Contents' entry
        for obj in page.get('Contents', []):
            found_objects = True
            # Get the key (filename) of the object
            key = obj['Key']

            if search_zipped and key.endswith('.zip'):
                # Download the zip file into memory
                zip_object = s3.get_object(Bucket=bucket_name, Key=key)
                zip_data = io.BytesIO(zip_object['Body'].read())

                with zipfile.ZipFile(zip_data, 'r') as zip_ref:
                    for file_name in zip_ref.namelist():
                        if file_name.endswith('.csv'):
                            with zip_ref.open(file_name) as csv_file:
                                _save_csv_locally(csv_file, file_name)
            elif not search_zipped and key.endswith('.csv'):
                # Directly download the CSV file into memory
                csv_object = s3.get_object(Bucket=bucket_name, Key=key)
                csv_data = io.BytesIO(csv_object['Body'].read())
                _save_csv_locally(csv_data, key)

    if not found_objects:
        print("No objects found in the specified directory.")


def _save_csv_locally(csv_source, source_name):
    """
    Parse csv_source with pandas and save it to the working directory.

    Parameters:
    - csv_source (file-like): Readable object holding the CSV content.
    - source_name (str): S3 key or zip member name the CSV came from; only its
      final path component is used for the local file name.
    """
    df = pd.read_csv(csv_source)
    # Keep only the base name so nested keys / zip members do not point at
    # local sub-directories that may not exist, then drop the .csv extension
    df_name = source_name.split('/')[-1][:-4]
    df.to_csv(f"{df_name}.csv", index=False)
    print(f"Saved DataFrame as '{df_name}.csv'")

# S3 bucket and key prefix that hold the dummy domain CSVs
bucket_name = 'ca-climate-index'
aws_dir = '3_fair_data/dummy_data/'

# Look for plain .csv objects under the prefix (zip archives are ignored when search_zipped=False)
pull_csv_from_directory(bucket_name, aws_dir, search_zipped=False)

Saved DataFrame as 'DUMMY_built_summed_indicators.csv'
Saved DataFrame as 'DUMMY_climate_indicator_product.csv'
Saved DataFrame as 'DUMMY_governance_summed_indicators.csv'
Saved DataFrame as 'DUMMY_natural_summed_indicators.csv'
Saved DataFrame as 'DUMMY_society_summed_indicators.csv'


### Once csv files are pulled from aws, place them into a folder for easier processing

In [3]:
# Per-domain CSVs that the download step left in the working directory
source_files = [
    'DUMMY_built_summed_indicators.csv',
    'DUMMY_governance_summed_indicators.csv',
    'DUMMY_natural_summed_indicators.csv',
    'DUMMY_society_summed_indicators.csv'
]

# Folder that collects the domain files for merging
output_folder = 'output_folder'

# Make sure the folder is there before anything is copied into it
if not os.path.exists(output_folder):
    os.makedirs(output_folder)

# Relocate each file: copy it into the folder, then delete the original
for file in source_files:
    destination_path = os.path.join(output_folder, os.path.basename(file))
    shutil.copyfile(file, destination_path)
    os.remove(file)

# Create one csv within the folder that places all data from each domain and merges them based on census tract

In [5]:
def merge_csv_files(input_folder):
    """
    Merge the 'min_max_standardized' column of every CSV in a folder into one file.

    Each input CSV must provide 'GEOID' and 'min_max_standardized' columns.
    The standardized column is renamed to '<file name without extension>_min_max_standardized'
    and the files are outer-joined on 'GEOID'. The result is written to
    'merged_data.csv' inside the same folder.

    Parameters:
    - input_folder (str): Folder containing the CSV files to merge.

    Returns:
    - None. Prints the path of the merged file.
    """
    output_filename = 'merged_data.csv'
    output_filepath = os.path.join(input_folder, output_filename)

    # None (rather than an empty DataFrame) marks "nothing merged yet", so a
    # first file that happens to have no rows still contributes its columns
    master_df = None

    # Sorted so the merged column order is reproducible; os.listdir order is arbitrary
    for filename in sorted(os.listdir(input_folder)):
        # Skip non-CSV files and the output of a previous run, which lives in
        # the same folder and has no 'min_max_standardized' column
        if not filename.endswith('.csv') or filename == output_filename:
            continue

        df = pd.read_csv(os.path.join(input_folder, filename))
        # Prefix the standardized column with the file name so domains stay distinguishable
        renamed_col = f"{os.path.splitext(filename)[0]}_min_max_standardized"
        subset_df = df[['GEOID', 'min_max_standardized']].rename(
            columns={'min_max_standardized': renamed_col}
        )

        if master_df is None:
            master_df = subset_df
        else:
            master_df = pd.merge(master_df, subset_df, on='GEOID', how='outer')

    # No usable CSVs found: still write an (empty) output file
    if master_df is None:
        master_df = pd.DataFrame()

    # Save the master DataFrame to a new CSV file
    master_df.to_csv(output_filepath, index=False)
    print(f"Merged data saved to {output_filepath}")

# Folder containing the per-domain CSV files to merge
input_folder = 'output_folder'

# Call the function
merge_csv_files(input_folder)


Merged data saved to output_folder\merged_data.csv


# Read in newly made csv file for further analysis

In [6]:
# Path to the merged file written by merge_csv_files
all_data = os.path.join('output_folder', 'merged_data.csv')
# NOTE(review): GEOID shows up as a number in the displayed output, so a leading zero would not be kept — confirm this matches how downstream joins expect GEOID
grouped_data = pd.read_csv(all_data)

In [7]:
grouped_data

Unnamed: 0,GEOID,DUMMY_built_summed_indicators_min_max_standardized,DUMMY_governance_summed_indicators_min_max_standardized,DUMMY_natural_summed_indicators_min_max_standardized,DUMMY_society_summed_indicators_min_max_standardized
0,6001400100,0.399889,0.678999,0.744960,0.542488
1,6001400200,0.484907,0.314080,0.152722,0.330226
2,6001400300,0.544448,0.548310,0.057964,0.499104
3,6001400400,0.551648,0.579186,0.061492,0.414485
4,6001400500,0.592911,0.536066,0.732863,0.589100
...,...,...,...,...,...
9124,6115040902,0.660482,0.523024,0.463710,0.544640
9125,6115041001,0.519801,0.757519,0.864415,0.413051
9126,6115041002,0.547771,0.442640,0.322581,0.340982
9127,6115041101,0.572418,0.544850,0.287298,0.258874


## Calculate the median value of each domain column's min_max_standardized data and assign a new column for those values on each row for further calculation

In [8]:
# One median per domain, broadcast to every row so later row-level maths can use it
for domain in ('built', 'governance', 'natural', 'society'):
    standardized_col = f'DUMMY_{domain}_summed_indicators_min_max_standardized'
    grouped_data[f'DUMMY_{domain}_median'] = grouped_data[standardized_col].median()

grouped_data

Unnamed: 0,GEOID,DUMMY_built_summed_indicators_min_max_standardized,DUMMY_governance_summed_indicators_min_max_standardized,DUMMY_natural_summed_indicators_min_max_standardized,DUMMY_society_summed_indicators_min_max_standardized,DUMMY_built_median,DUMMY_governance_median,DUMMY_natural_median,DUMMY_society_median
0,6001400100,0.399889,0.678999,0.744960,0.542488,0.487123,0.501464,0.495968,0.503406
1,6001400200,0.484907,0.314080,0.152722,0.330226,0.487123,0.501464,0.495968,0.503406
2,6001400300,0.544448,0.548310,0.057964,0.499104,0.487123,0.501464,0.495968,0.503406
3,6001400400,0.551648,0.579186,0.061492,0.414485,0.487123,0.501464,0.495968,0.503406
4,6001400500,0.592911,0.536066,0.732863,0.589100,0.487123,0.501464,0.495968,0.503406
...,...,...,...,...,...,...,...,...,...
9124,6115040902,0.660482,0.523024,0.463710,0.544640,0.487123,0.501464,0.495968,0.503406
9125,6115041001,0.519801,0.757519,0.864415,0.413051,0.487123,0.501464,0.495968,0.503406
9126,6115041002,0.547771,0.442640,0.322581,0.340982,0.487123,0.501464,0.495968,0.503406
9127,6115041101,0.572418,0.544850,0.287298,0.258874,0.487123,0.501464,0.495968,0.503406


## Calculating tract adjusted values per domain with ((domain tract - domain median) / domain median) and adding the result as a new 'DUMMY_domain_tract_adjusted' column

In [9]:
def calculate_adjusted_value(row, category):
    """
    Return the median-adjusted value for one domain: (value - median) / median.

    Parameters:
    - row: A pandas Series (single row) or a whole DataFrame holding the
      'DUMMY_<category>_summed_indicators_min_max_standardized' and
      'DUMMY_<category>_median' columns. Only label indexing and arithmetic
      are used, so a DataFrame yields a Series of adjusted values.
    - category (str): Domain name used in the column names, e.g. 'built'.

    Returns:
    - The adjusted value (scalar for a row, Series for a DataFrame).
    """
    category_median_col = f'DUMMY_{category}_median'
    category_col = f'DUMMY_{category}_summed_indicators_min_max_standardized'
    return (row[category_col] - row[category_median_col]) / row[category_median_col]

# Compute each domain's adjusted column in one vectorized step; the helper only
# indexes columns and does arithmetic, so it accepts the whole DataFrame and
# avoids a slow per-row iterrows()/.at loop
# Governance gets no adjusted column here; only these three domains are adjusted
for category in ['natural', 'society', 'built']:
    grouped_data[f'DUMMY_{category}_tract_adjusted'] = calculate_adjusted_value(grouped_data, category)

# Display the DataFrame
grouped_data

Unnamed: 0,GEOID,DUMMY_built_summed_indicators_min_max_standardized,DUMMY_governance_summed_indicators_min_max_standardized,DUMMY_natural_summed_indicators_min_max_standardized,DUMMY_society_summed_indicators_min_max_standardized,DUMMY_built_median,DUMMY_governance_median,DUMMY_natural_median,DUMMY_society_median,DUMMY_natural_tract_adjusted,DUMMY_society_tract_adjusted,DUMMY_built_tract_adjusted
0,6001400100,0.399889,0.678999,0.744960,0.542488,0.487123,0.501464,0.495968,0.503406,0.502033,0.077635,-0.179079
1,6001400200,0.484907,0.314080,0.152722,0.330226,0.487123,0.501464,0.495968,0.503406,-0.692073,-0.344017,-0.004548
2,6001400300,0.544448,0.548310,0.057964,0.499104,0.487123,0.501464,0.495968,0.503406,-0.883130,-0.008547,0.117681
3,6001400400,0.551648,0.579186,0.061492,0.414485,0.487123,0.501464,0.495968,0.503406,-0.876016,-0.176638,0.132462
4,6001400500,0.592911,0.536066,0.732863,0.589100,0.487123,0.501464,0.495968,0.503406,0.477642,0.170228,0.217169
...,...,...,...,...,...,...,...,...,...,...,...,...
9124,6115040902,0.660482,0.523024,0.463710,0.544640,0.487123,0.501464,0.495968,0.503406,-0.065041,0.081909,0.355884
9125,6115041001,0.519801,0.757519,0.864415,0.413051,0.487123,0.501464,0.495968,0.503406,0.742886,-0.179487,0.067084
9126,6115041002,0.547771,0.442640,0.322581,0.340982,0.487123,0.501464,0.495968,0.503406,-0.349593,-0.322650,0.124503
9127,6115041101,0.572418,0.544850,0.287298,0.258874,0.487123,0.501464,0.495968,0.503406,-0.420732,-0.485755,0.175099


## Calculating the TOP half of the CRSI equation per census tract:
Making this as a function which will eventually go in as a utility function once we have a final version

        governance min max standard +
        ((society weight * society adjusted tract) * governance min max standard) +
        ((built weight * built adjusted tract) * governance min max standard) +
        ((natural weight * natural adjusted tract) * governance min max standard)

In [30]:
def crsi_calculation_top(df, society_weight, built_weight, natural_weight):
    """
    Add the top half of the CRSI equation to df as the 'crsi_top_equation' column.

    For every row:
        governance
        + (society_weight * society adjusted) * governance
        + (built_weight * built adjusted) * governance
        + (natural_weight * natural adjusted) * governance

    Parameters:
    - df (pd.DataFrame): Must contain 'DUMMY_governance_summed_indicators_min_max_standardized'
      and the 'DUMMY_<society|built|natural>_tract_adjusted' columns. Modified in place.
    - society_weight, built_weight, natural_weight (numeric): Weight applied to each adjusted domain value.

    Returns:
    - pd.DataFrame: The same df object, with 'crsi_top_equation' added.
    """
    governance = df['DUMMY_governance_summed_indicators_min_max_standardized']

    # Column-wise arithmetic replaces the per-row loop; the terms are kept in
    # the original order so the floating point results are unchanged
    df['crsi_top_equation'] = (
        governance +
        (society_weight * df['DUMMY_society_tract_adjusted'] * governance) +
        (built_weight * df['DUMMY_built_tract_adjusted'] * governance) +
        (natural_weight * df['DUMMY_natural_tract_adjusted'] * governance)
    )

    return df

# Weights of 1 for society, built and natural (in that order), i.e. all three weighted equally
crsi_calculation_top(grouped_data, 1, 1, 1)

In [34]:
grouped_data

Unnamed: 0,GEOID,DUMMY_built_summed_indicators_min_max_standardized,DUMMY_governance_summed_indicators_min_max_standardized,DUMMY_natural_summed_indicators_min_max_standardized,DUMMY_society_summed_indicators_min_max_standardized,DUMMY_built_median,DUMMY_governance_median,DUMMY_natural_median,DUMMY_society_median,DUMMY_natural_tract_adjusted,DUMMY_society_tract_adjusted,DUMMY_built_tract_adjusted,crsi_top_equation
0,6001400100,0.399889,0.678999,0.744960,0.542488,0.487123,0.501464,0.495968,0.503406,0.502033,0.077635,-0.179079,0.950999
1,6001400200,0.484907,0.314080,0.152722,0.330226,0.487123,0.501464,0.495968,0.503406,-0.692073,-0.344017,-0.004548,-0.012764
2,6001400300,0.544448,0.548310,0.057964,0.499104,0.487123,0.501464,0.495968,0.503406,-0.883130,-0.008547,0.117681,0.123920
3,6001400400,0.551648,0.579186,0.061492,0.414485,0.487123,0.501464,0.495968,0.503406,-0.876016,-0.176638,0.132462,0.046223
4,6001400500,0.592911,0.536066,0.732863,0.589100,0.487123,0.501464,0.495968,0.503406,0.477642,0.170228,0.217169,0.999784
...,...,...,...,...,...,...,...,...,...,...,...,...,...
9124,6115040902,0.660482,0.523024,0.463710,0.544640,0.487123,0.501464,0.495968,0.503406,-0.065041,0.081909,0.355884,0.717982
9125,6115041001,0.519801,0.757519,0.864415,0.413051,0.487123,0.501464,0.495968,0.503406,0.742886,-0.179487,0.067084,1.235122
9126,6115041002,0.547771,0.442640,0.322581,0.340982,0.487123,0.501464,0.495968,0.503406,-0.349593,-0.322650,0.124503,0.200188
9127,6115041101,0.572418,0.544850,0.287298,0.258874,0.487123,0.501464,0.495968,0.503406,-0.420732,-0.485755,0.175099,0.146354


## Reading in climate domain min/max standardized data to calculate rest of CRSI

In [35]:
# Read from the working directory: this file was saved by the S3 download step and is not among the files moved into 'output_folder'
climate_domain_min_max_data = pd.read_csv('DUMMY_climate_indicator_product.csv')

In [36]:
climate_domain_min_max_data

Unnamed: 0,GEOID,sum_of_values_x,sum_of_values_y,product_of_sum_of_values_exposure_loss,min_product_value,max_product_value,min_max_standardized_from_product
0,6001400100,2.167,1.448,3.137816,0.660548,18.51211,0.138770
1,6001400200,3.086,2.575,7.946450,0.660548,18.51211,0.408138
2,6001400300,2.241,2.757,6.178437,0.660548,18.51211,0.309098
3,6001400400,2.603,2.258,5.877574,0.660548,18.51211,0.292245
4,6001400500,2.126,3.501,7.443126,0.660548,18.51211,0.379943
...,...,...,...,...,...,...,...
9124,6115040902,3.204,3.262,10.451448,0.660548,18.51211,0.548462
9125,6115041001,1.703,3.588,6.110364,0.660548,18.51211,0.305285
9126,6115041002,1.541,1.201,1.850741,0.660548,18.51211,0.066672
9127,6115041101,3.671,3.607,13.241297,0.660548,18.51211,0.704742


## Moving the min/max standardized column from the csv we just read in to our df with all other domain calculations

In [37]:
# Look up each tract's climate value by GEOID instead of relying on both frames
# sharing the same row order (plain column assignment aligns on the row index only)
# NOTE: mapping through a Series expects every GEOID to appear once in the climate data
acute_risk_by_geoid = climate_domain_min_max_data.set_index('GEOID')['min_max_standardized_from_product']
grouped_data['acute_risk'] = grouped_data['GEOID'].map(acute_risk_by_geoid)
grouped_data

Unnamed: 0,GEOID,DUMMY_built_summed_indicators_min_max_standardized,DUMMY_governance_summed_indicators_min_max_standardized,DUMMY_natural_summed_indicators_min_max_standardized,DUMMY_society_summed_indicators_min_max_standardized,DUMMY_built_median,DUMMY_governance_median,DUMMY_natural_median,DUMMY_society_median,DUMMY_natural_tract_adjusted,DUMMY_society_tract_adjusted,DUMMY_built_tract_adjusted,crsi_top_equation,acute_risk
0,6001400100,0.399889,0.678999,0.744960,0.542488,0.487123,0.501464,0.495968,0.503406,0.502033,0.077635,-0.179079,0.950999,0.138770
1,6001400200,0.484907,0.314080,0.152722,0.330226,0.487123,0.501464,0.495968,0.503406,-0.692073,-0.344017,-0.004548,-0.012764,0.408138
2,6001400300,0.544448,0.548310,0.057964,0.499104,0.487123,0.501464,0.495968,0.503406,-0.883130,-0.008547,0.117681,0.123920,0.309098
3,6001400400,0.551648,0.579186,0.061492,0.414485,0.487123,0.501464,0.495968,0.503406,-0.876016,-0.176638,0.132462,0.046223,0.292245
4,6001400500,0.592911,0.536066,0.732863,0.589100,0.487123,0.501464,0.495968,0.503406,0.477642,0.170228,0.217169,0.999784,0.379943
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9124,6115040902,0.660482,0.523024,0.463710,0.544640,0.487123,0.501464,0.495968,0.503406,-0.065041,0.081909,0.355884,0.717982,0.548462
9125,6115041001,0.519801,0.757519,0.864415,0.413051,0.487123,0.501464,0.495968,0.503406,0.742886,-0.179487,0.067084,1.235122,0.305285
9126,6115041002,0.547771,0.442640,0.322581,0.340982,0.487123,0.501464,0.495968,0.503406,-0.349593,-0.322650,0.124503,0.200188,0.066672
9127,6115041101,0.572418,0.544850,0.287298,0.258874,0.487123,0.501464,0.495968,0.503406,-0.420732,-0.485755,0.175099,0.146354,0.704742


## Calculate the final CRSI value by dividing the CRSI top equation by the acute risk column and adding results to a new column

In [38]:
# Final CRSI per row = top equation / acute risk (element-wise column division)
# NOTE(review): acute_risk is taken from a min-max standardized column, so the lowest-risk tract
# presumably has a value of 0 and would yield inf/NaN here — confirm how that tract should be handled
grouped_data['crsi_results'] = grouped_data['crsi_top_equation'] / grouped_data['acute_risk']
grouped_data

Unnamed: 0,GEOID,DUMMY_built_summed_indicators_min_max_standardized,DUMMY_governance_summed_indicators_min_max_standardized,DUMMY_natural_summed_indicators_min_max_standardized,DUMMY_society_summed_indicators_min_max_standardized,DUMMY_built_median,DUMMY_governance_median,DUMMY_natural_median,DUMMY_society_median,DUMMY_natural_tract_adjusted,DUMMY_society_tract_adjusted,DUMMY_built_tract_adjusted,crsi_top_equation,acute_risk,crsi_results
0,6001400100,0.399889,0.678999,0.744960,0.542488,0.487123,0.501464,0.495968,0.503406,0.502033,0.077635,-0.179079,0.950999,0.138770,6.853038
1,6001400200,0.484907,0.314080,0.152722,0.330226,0.487123,0.501464,0.495968,0.503406,-0.692073,-0.344017,-0.004548,-0.012764,0.408138,-0.031273
2,6001400300,0.544448,0.548310,0.057964,0.499104,0.487123,0.501464,0.495968,0.503406,-0.883130,-0.008547,0.117681,0.123920,0.309098,0.400908
3,6001400400,0.551648,0.579186,0.061492,0.414485,0.487123,0.501464,0.495968,0.503406,-0.876016,-0.176638,0.132462,0.046223,0.292245,0.158166
4,6001400500,0.592911,0.536066,0.732863,0.589100,0.487123,0.501464,0.495968,0.503406,0.477642,0.170228,0.217169,0.999784,0.379943,2.631405
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9124,6115040902,0.660482,0.523024,0.463710,0.544640,0.487123,0.501464,0.495968,0.503406,-0.065041,0.081909,0.355884,0.717982,0.548462,1.309083
9125,6115041001,0.519801,0.757519,0.864415,0.413051,0.487123,0.501464,0.495968,0.503406,0.742886,-0.179487,0.067084,1.235122,0.305285,4.045798
9126,6115041002,0.547771,0.442640,0.322581,0.340982,0.487123,0.501464,0.495968,0.503406,-0.349593,-0.322650,0.124503,0.200188,0.066672,3.002601
9127,6115041101,0.572418,0.544850,0.287298,0.258874,0.487123,0.501464,0.495968,0.503406,-0.420732,-0.485755,0.175099,0.146354,0.704742,0.207670


## Done for now. All that is left is to implement the CSV upload to AWS function once it is merged into main from PR #42