# Cal-CRAI Index Calculation Testing

Note: Functions and figures within this notebook are in development

In [20]:
import pandas as pd
import os
import sys
import numpy as np
import shutil
import glob
import geopandas as gpd
import matplotlib.pyplot as plt

# suppress pandas purely educational warnings
from warnings import simplefilter
simplefilter(action="ignore", category=pd.errors.PerformanceWarning)

sys.path.append(os.path.expanduser('../../'))
from scripts.utils.file_helpers import pull_csv_from_directory, upload_csv_aws
from scripts.utils.write_metadata import append_metadata
from scripts.utils.index_plot import index_plot
from scripts.utils.calculate_index import handle_outliers, min_max_standardize

## Pulling all calculated metric files

In [21]:
# pull csv from aws
bucket_name = 'ca-climate-index'
aws_dir = '3_fair_data/index_data/'

pull_csv_from_directory(bucket_name, aws_dir, search_zipped=False)

Saved DataFrame as 'built_broadband_internet_metric.csv'
Saved DataFrame as 'built_cellular_towers_metric.csv'
Saved DataFrame as 'built_energy_transmission_lines_metric.csv'
Saved DataFrame as 'built_housing_median_age_metric.csv'
Saved DataFrame as 'built_housing_mobile_homes_metric.csv'
Saved DataFrame as 'built_housing_quality_metric.csv'
Saved DataFrame as 'built_metric_housing_vacancy_metric.csv'
Saved DataFrame as 'built_microwave_towers_metric.csv'
Saved DataFrame as 'built_mobile_towers_metric.csv'
Saved DataFrame as 'built_paging_towers_metric.csv'
Saved DataFrame as 'built_power_plant_metric.csv'
Saved DataFrame as 'built_power_shutoffs_metric.csv'
Saved DataFrame as 'built_radio_towers_metric.csv'
Saved DataFrame as 'built_transportation_airports_metric.csv'
Saved DataFrame as 'built_transportation_bottleneck_metric.csv'
Saved DataFrame as 'built_transportation_bridge_metric.csv'
Saved DataFrame as 'built_transportation_highway_metric.csv'
Saved DataFrame as 'built_transpor

In [22]:
meta_csv = r'C:/Users/jespi/eagle/carb-climate-index-9/metadata/Full Data Pipeline Notes - 4_ Calculate Metric.csv'
# read in first tab of the sheet
df = pd.read_csv(meta_csv)
df.columns

Index(['Data Source', 'Data', 'Domain', 'Indicator', 'Metric',
       'High value result (vulnerable or resilient)', 'Metric file name',
       'Metric Calculated (Y/N)', 'Metadata Finalized (Y/N)',
       'Metric Calculator', 'Metric goes into what indicator',
       'Includes Indigenous Tribes', 'Notes'],
      dtype='object')

## Selecting files specifically for climate environment domain

In [23]:
# Define the output folder path
output_folder = 'output_folder'

# Create the output folder if it doesn't exist
if not os.path.exists(output_folder):
    os.makedirs(output_folder)

# Load the metadata CSV
meta_csv = r'C:/Users/jespi/eagle/carb-climate-index-9/metadata/Full Data Pipeline Notes - 4_ Calculate Metric.csv'
df = pd.read_csv(meta_csv)

# Get the list of metric file names and corresponding 'High value result' entries
metric_files = df[['Metric file name', 'Indicator']]

# Replace spaces with underscores in 'Indicator' entries
metric_files['Indicator'] = metric_files['Indicator'].apply(lambda x: x.split()[-1].replace(' ', '_'))

# Find all CSV files starting with 'climate_' and matching the metric file names
source_files = [file for file in glob.glob('climate_*.csv') if os.path.basename(file) in metric_files['Metric file name'].values]

# Iterate through the source files and process them
for file in source_files:
    # Get the 'Indicator' entry for the current file
    column_result = metric_files.loc[metric_files['Metric file name'] == os.path.basename(file), 'Indicator'].values[0]
    
    # Load the CSV file
    csv_df = pd.read_csv(file)
    
    # Get the last column name
    last_column = csv_df.columns[-1]
    
    # Append the column result to the last column name
    csv_df.rename(columns={last_column: f"{last_column}_{column_result}"}, inplace=True)
    
    # Construct the destination file path
    destination_path = os.path.join(output_folder, os.path.basename(file))
    
    # Save the modified CSV to the output folder
    csv_df.to_csv(destination_path, index=False)
    
    # Remove the original file
    os.remove(file)

print(f"Processed and removed {len(source_files)} CSV files.")

# Delete all CSV files in the current directory that are not in the output folder
current_files = glob.glob('*.csv')
for file in current_files:
    if file not in [os.path.basename(f) for f in source_files]:
        os.remove(file)

print(f"Deleted {len(current_files) - len(source_files)} local CSV files.")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  metric_files['Indicator'] = metric_files['Indicator'].apply(lambda x: x.split()[-1].replace(' ', '_'))


Processed and removed 24 CSV files.
Deleted 80 local CSV files.


## Placing relevant files in a folder and removing others

In [24]:
# Define the output folder path
output_folder = 'output_folder'

# Get a list of all CSV files in the output folder
csv_files = glob.glob(os.path.join(output_folder, '*.csv'))

# Initialize an empty DataFrame for merging
merged_df = pd.DataFrame()
# Iterate through each CSV file and merge them on the 'census_tract' column
for file in csv_files:
    # Read the CSV file into a DataFrame
    df = pd.read_csv(file)
    
    # Rename 'GEO_ID' or 'tract' to 'census_tract' if they exist
    if 'USCB_GEOID' in df.columns:
        df.rename(columns={'USCB_GEOID': 'census_tract'}, inplace=True)
    elif 'GEO_ID' in df.columns:
        df.rename(columns={'GEO_ID': 'census_tract'}, inplace=True)
    elif 'GEOID' in df.columns:
        df.rename(columns={'GEOID': 'census_tract'}, inplace=True)
    elif 'tract' in df.columns:
        df.rename(columns={'tract': 'census_tract'}, inplace=True)
    elif 'TRACT' in df.columns:
        df.rename(columns={'TRACT': 'census_tract'}, inplace=True)
    elif 'Census_Tract' in df.columns:
        df.rename(columns={'Census_Tract': 'census_tract'}, inplace=True)

    # Convert 'census_tract' to string to ensure consistent data type
    df['census_tract'] = df['census_tract'].astype(str)

    # Keep only the 'census_tract' and the last column from each file
    last_column = df.columns[-1]
    df = df[['census_tract', last_column]]
    
    # Merge the DataFrame with the existing merged DataFrame
    if merged_df.empty:
        merged_df = df
    else:
        merged_df = pd.merge(merged_df, df, on='census_tract', how='outer')

# Save the merged DataFrame to a CSV file
merged_df.to_csv('concatenate_climate_metrics.csv', index=False)

print(f"Merged CSV saved as concatenate_climate_metrics.csv")

Merged CSV saved as concatenate_climate_metrics.csv


In [25]:
merged_df.columns

Index(['census_tract', 'sum_d1_d4_exposure', 'determined_acres_loss_x',
       'indemnity_amount_loss_x', 'percent_weeks_drought_exposure',
       'protected_areas_percentage_exposure',
       'avg_insurance_payout_per_claim_loss', 'estimated_crop_loss_cost_loss',
       'change_chill_hours_min_max_standardized_loss',
       'determined_acres_loss_y', 'indemnity_amount_loss_y',
       'avg_age_adjust_heat_hospitalizations_per_10000_loss',
       'building_content_cost_exposure', 'fire_stations_count_diff_exposure',
       'hospitals_count_diff_exposure', 'police_stations_count_diff_exposure',
       'schools_count_diff_exposure', 'building_count_exposure',
       'delta_percentage_change_exposure', 'wastewater_count_exposure',
       'rcp_4.5__50th_percent_change_loss',
       'average_damaged_destroyed_structures_loss', 'average_fatalities_loss'],
      dtype='object')

In [26]:
merged_df

Unnamed: 0,census_tract,sum_d1_d4_exposure,determined_acres_loss_x,indemnity_amount_loss_x,percent_weeks_drought_exposure,protected_areas_percentage_exposure,avg_insurance_payout_per_claim_loss,estimated_crop_loss_cost_loss,total_fatalities_loss,median_warning_days_exposure,change_chill_hours_min_max_standardized_loss,determined_acres_loss_y,indemnity_amount_loss_y,avg_age_adjust_heat_hospitalizations_per_10000_loss,building_content_cost_exposure,fire_stations_count_diff_exposure,hospitals_count_diff_exposure,police_stations_count_diff_exposure,schools_count_diff_exposure,building_count_exposure,delta_percentage_change_exposure,wastewater_count_exposure,rcp_4.5__50th_percent_change_loss,average_damaged_destroyed_structures_loss,average_fatalities_loss
0,6001400100,37.984370,197.583333,7394.25,0.570255,,5463.8925,0.0,1,3.0,,769.522222,14348.571429,74.250000,650084.0,3.0,0.0,1.0,14.0,1315.0,,4.0,-3.740648,4.0,0.0
1,6001400200,37.984370,197.583333,7394.25,0.570255,,,0.0,1,3.0,,769.522222,14348.571429,74.250000,434115.0,3.0,0.0,1.0,14.0,775.0,,4.0,-3.740648,4.0,0.0
2,6001400300,37.984370,197.583333,7394.25,0.570255,,2600.0000,0.0,1,3.0,,769.522222,14348.571429,74.250000,922941.0,3.0,0.0,1.0,14.0,1941.0,,4.0,-3.740648,4.0,0.0
3,6001400400,37.984370,197.583333,7394.25,0.570255,,,0.0,1,3.0,,769.522222,14348.571429,74.250000,594158.0,3.0,0.0,1.0,14.0,1328.0,,4.0,-3.740648,4.0,0.0
4,6001400500,37.984370,197.583333,7394.25,0.570255,,2625.3700,0.0,1,3.0,,769.522222,14348.571429,74.250000,398157.0,3.0,0.0,1.0,14.0,1057.0,,4.0,-3.740648,4.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9124,6115040902,38.971624,,,0.571168,0.02,,0.0,3,1.0,,31274.729606,394088.020213,273.066667,,0.0,0.0,0.0,0.0,1762.0,,,,22.0,0.0
9125,6115041001,38.971624,,,0.571168,45.42,,0.0,3,2.0,,31274.729606,394088.020213,273.066667,,0.0,0.0,0.0,0.0,,,,,22.0,0.0
9126,6115041002,38.971624,,,0.571168,5.96,,0.0,3,2.0,,31274.729606,394088.020213,273.066667,,0.0,0.0,0.0,0.0,,,,,22.0,0.0
9127,6115041101,38.971624,,,0.571168,0.33,,0.0,3,4.0,,31274.729606,394088.020213,273.066667,,0.0,0.0,0.0,0.0,,,,,22.0,0.0


In [27]:
climate_metrics = pd.read_csv('concatenate_climate_metrics.csv')

# Drop rows where 'census_tract' is NaN
climate_metrics = climate_metrics.dropna(subset=['census_tract'])

# Convert census tract to string and eliminate scientific notation default
climate_metrics['census_tract'] = climate_metrics['census_tract'].dropna().apply(lambda x: '{:.0f}'.format(x))

# Convert all values within the island tract (near San Francisco) to nan, as it is uninhabited 
island_tract = '6075980401'
climate_metrics.loc[climate_metrics['census_tract'] == island_tract, 
                            climate_metrics.columns != 'census_tract'] = np.nan

climate_metrics

Unnamed: 0,census_tract,sum_d1_d4_exposure,determined_acres_loss_x,indemnity_amount_loss_x,percent_weeks_drought_exposure,protected_areas_percentage_exposure,avg_insurance_payout_per_claim_loss,estimated_crop_loss_cost_loss,total_fatalities_loss,median_warning_days_exposure,change_chill_hours_min_max_standardized_loss,determined_acres_loss_y,indemnity_amount_loss_y,avg_age_adjust_heat_hospitalizations_per_10000_loss,building_content_cost_exposure,fire_stations_count_diff_exposure,hospitals_count_diff_exposure,police_stations_count_diff_exposure,schools_count_diff_exposure,building_count_exposure,delta_percentage_change_exposure,wastewater_count_exposure,rcp_4.5__50th_percent_change_loss,average_damaged_destroyed_structures_loss,average_fatalities_loss
0,6001400100,37.984370,197.583333,7394.25,0.570255,,5463.8925,0.0,1.0,3.0,,769.522222,14348.571429,74.250000,650084.0,3.0,0.0,1.0,14.0,1315.0,,4.0,-3.740648,4.0,0.0
1,6001400200,37.984370,197.583333,7394.25,0.570255,,,0.0,1.0,3.0,,769.522222,14348.571429,74.250000,434115.0,3.0,0.0,1.0,14.0,775.0,,4.0,-3.740648,4.0,0.0
2,6001400300,37.984370,197.583333,7394.25,0.570255,,2600.0000,0.0,1.0,3.0,,769.522222,14348.571429,74.250000,922941.0,3.0,0.0,1.0,14.0,1941.0,,4.0,-3.740648,4.0,0.0
3,6001400400,37.984370,197.583333,7394.25,0.570255,,,0.0,1.0,3.0,,769.522222,14348.571429,74.250000,594158.0,3.0,0.0,1.0,14.0,1328.0,,4.0,-3.740648,4.0,0.0
4,6001400500,37.984370,197.583333,7394.25,0.570255,,2625.3700,0.0,1.0,3.0,,769.522222,14348.571429,74.250000,398157.0,3.0,0.0,1.0,14.0,1057.0,,4.0,-3.740648,4.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9124,6115040902,38.971624,,,0.571168,0.02,,0.0,3.0,1.0,,31274.729606,394088.020213,273.066667,,0.0,0.0,0.0,0.0,1762.0,,,,22.0,0.0
9125,6115041001,38.971624,,,0.571168,45.42,,0.0,3.0,2.0,,31274.729606,394088.020213,273.066667,,0.0,0.0,0.0,0.0,,,,,22.0,0.0
9126,6115041002,38.971624,,,0.571168,5.96,,0.0,3.0,2.0,,31274.729606,394088.020213,273.066667,,0.0,0.0,0.0,0.0,,,,,22.0,0.0
9127,6115041101,38.971624,,,0.571168,0.33,,0.0,3.0,4.0,,31274.729606,394088.020213,273.066667,,0.0,0.0,0.0,0.0,,,,,22.0,0.0


In [28]:
# Make sure all columns within the island tract are NaN
checking_island_tract = climate_metrics[climate_metrics['census_tract']=='6075980401']
checking_island_tract

Unnamed: 0,census_tract,sum_d1_d4_exposure,determined_acres_loss_x,indemnity_amount_loss_x,percent_weeks_drought_exposure,protected_areas_percentage_exposure,avg_insurance_payout_per_claim_loss,estimated_crop_loss_cost_loss,total_fatalities_loss,median_warning_days_exposure,change_chill_hours_min_max_standardized_loss,determined_acres_loss_y,indemnity_amount_loss_y,avg_age_adjust_heat_hospitalizations_per_10000_loss,building_content_cost_exposure,fire_stations_count_diff_exposure,hospitals_count_diff_exposure,police_stations_count_diff_exposure,schools_count_diff_exposure,building_count_exposure,delta_percentage_change_exposure,wastewater_count_exposure,rcp_4.5__50th_percent_change_loss,average_damaged_destroyed_structures_loss,average_fatalities_loss
7295,6075980401,,,,,,,,,,,,,,,,,,,,,,,,


## Checking to see if there are infinite values within our columns
* if so, replacing infinite values with NaN

In [29]:
# Selecting only numeric columns
numeric_df = climate_metrics.select_dtypes(include=[np.number])

# Counting infinite values
num_infinite = np.isinf(numeric_df).sum().sum()

print(f"\nNumber of infinite entries in the DataFrame: {num_infinite}")


Number of infinite entries in the DataFrame: 0


In [30]:
# Replace infinite values with NaN
climate_metrics.replace([np.inf, -np.inf], np.nan, inplace=True)

# Selecting only numeric columns
numeric_df = climate_metrics.select_dtypes(include=[np.number])

# Counting infinite values
num_infinite = np.isinf(numeric_df).sum().sum()

print(f"\nNumber of infinite entries in the DataFrame: {num_infinite}")


Number of infinite entries in the DataFrame: 0


## Call outlier function to handle outliers:
Max = 3 x 75th percentile
Min = -3 x 25th percentile

Outliers beyond these values are set to metric value closest to the fence
Baum et al. 1970

In [31]:
# Handle outliers
handle_outlier_csv = 'no_outlier_climate_metrics.csv'
output = handle_outliers(climate_metrics, handle_outlier_csv)
print(f"Processed and saved {handle_outlier_csv} with outlier handling.")

For column sum_d1_d4_exposure:
  Q1 (25th percentile): 37.98437043795621
  Q3 (75th percentile): 46.30292883211679
  IQR: 8.318558394160583
  Max fence: 138.9087864963504
  Min fence: -113.95311131386862
For column determined_acres_loss_x:
  Q1 (25th percentile): 488.5
  Q3 (75th percentile): 4407.123529411765
  IQR: 3918.623529411765
  Max fence: 13221.370588235295
  Min fence: -1465.5
Outliers detected in column 'determined_acres_loss_x':
census_tract: 6019000100, value: 89505.42764000001
census_tract: 6019000200, value: 89505.42764000001
census_tract: 6019000300, value: 89505.42764000001
census_tract: 6019000400, value: 89505.42764000001
census_tract: 6019000501, value: 89505.42764000001
census_tract: 6019000502, value: 89505.42764000001
census_tract: 6019000601, value: 89505.42764000001
census_tract: 6019000602, value: 89505.42764000001
census_tract: 6019000701, value: 89505.42764000001
census_tract: 6019000702, value: 89505.42764000001
census_tract: 6019000901, value: 89505.427640

In [32]:
# viewing new 'fenced' outliers
no_outlier_climate_metrics = pd.read_csv('no_outlier_climate_metrics.csv')
no_outlier_climate_metrics

Unnamed: 0,census_tract,sum_d1_d4_exposure,determined_acres_loss_x,indemnity_amount_loss_x,percent_weeks_drought_exposure,protected_areas_percentage_exposure,avg_insurance_payout_per_claim_loss,estimated_crop_loss_cost_loss,total_fatalities_loss,median_warning_days_exposure,change_chill_hours_min_max_standardized_loss,determined_acres_loss_y,indemnity_amount_loss_y,avg_age_adjust_heat_hospitalizations_per_10000_loss,building_content_cost_exposure,fire_stations_count_diff_exposure,hospitals_count_diff_exposure,police_stations_count_diff_exposure,schools_count_diff_exposure,building_count_exposure,delta_percentage_change_exposure,wastewater_count_exposure,rcp_4.5__50th_percent_change_loss,average_damaged_destroyed_structures_loss,average_fatalities_loss
0,6001400100,37.984370,197.583333,7394.25,0.570255,,5463.8925,0.0,1.0,3.0,,769.522222,14348.571429,74.250000,650084.0,3.0,0.0,1.0,12.0,1315.0,,4.0,-3.740648,4.0,0.0
1,6001400200,37.984370,197.583333,7394.25,0.570255,,,0.0,1.0,3.0,,769.522222,14348.571429,74.250000,434115.0,3.0,0.0,1.0,12.0,775.0,,4.0,-3.740648,4.0,0.0
2,6001400300,37.984370,197.583333,7394.25,0.570255,,2600.0000,0.0,1.0,3.0,,769.522222,14348.571429,74.250000,922941.0,3.0,0.0,1.0,12.0,1941.0,,4.0,-3.740648,4.0,0.0
3,6001400400,37.984370,197.583333,7394.25,0.570255,,,0.0,1.0,3.0,,769.522222,14348.571429,74.250000,594158.0,3.0,0.0,1.0,12.0,1328.0,,4.0,-3.740648,4.0,0.0
4,6001400500,37.984370,197.583333,7394.25,0.570255,,2625.3700,0.0,1.0,3.0,,769.522222,14348.571429,74.250000,398157.0,3.0,0.0,1.0,12.0,1057.0,,4.0,-3.740648,4.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9124,6115040902,38.971624,,,0.571168,0.0200,,0.0,3.0,1.0,,23418.163206,394088.020213,273.066667,,0.0,0.0,0.0,0.0,1762.0,,,,22.0,0.0
9125,6115041001,38.971624,,,0.571168,40.2825,,0.0,3.0,2.0,,23418.163206,394088.020213,273.066667,,0.0,0.0,0.0,0.0,,,,,22.0,0.0
9126,6115041002,38.971624,,,0.571168,5.9600,,0.0,3.0,2.0,,23418.163206,394088.020213,273.066667,,0.0,0.0,0.0,0.0,,,,,22.0,0.0
9127,6115041101,38.971624,,,0.571168,0.3300,,0.0,3.0,4.0,,23418.163206,394088.020213,273.066667,,0.0,0.0,0.0,0.0,,,,,22.0,0.0


## Metrics are now min-max standardized on 0.01 to 0.99 scale

In [33]:
# standardizing our df
min_max_metrics = no_outlier_climate_metrics.copy()
columns_to_process = [col for col in min_max_metrics.columns if col != 'census_tract']
min_max_metrics = min_max_standardize(min_max_metrics, columns_to_process)

min_max_metrics.head()

Unnamed: 0,census_tract,sum_d1_d4_exposure_min,sum_d1_d4_exposure_max,sum_d1_d4_exposure_min_max_standardized,determined_acres_loss_x_min,determined_acres_loss_x_max,determined_acres_loss_x_min_max_standardized,indemnity_amount_loss_x_min,indemnity_amount_loss_x_max,indemnity_amount_loss_x_min_max_standardized,percent_weeks_drought_exposure_min,percent_weeks_drought_exposure_max,percent_weeks_drought_exposure_min_max_standardized,protected_areas_percentage_exposure_min,protected_areas_percentage_exposure_max,protected_areas_percentage_exposure_min_max_standardized,avg_insurance_payout_per_claim_loss_min,avg_insurance_payout_per_claim_loss_max,avg_insurance_payout_per_claim_loss_min_max_standardized,estimated_crop_loss_cost_loss_min,estimated_crop_loss_cost_loss_max,estimated_crop_loss_cost_loss_min_max_standardized,total_fatalities_loss_min,total_fatalities_loss_max,total_fatalities_loss_min_max_standardized,median_warning_days_exposure_min,median_warning_days_exposure_max,median_warning_days_exposure_min_max_standardized,change_chill_hours_min_max_standardized_loss_min,change_chill_hours_min_max_standardized_loss_max,change_chill_hours_min_max_standardized_loss_min_max_standardized,determined_acres_loss_y_min,determined_acres_loss_y_max,determined_acres_loss_y_min_max_standardized,indemnity_amount_loss_y_min,indemnity_amount_loss_y_max,indemnity_amount_loss_y_min_max_standardized,avg_age_adjust_heat_hospitalizations_per_10000_loss_min,avg_age_adjust_heat_hospitalizations_per_10000_loss_max,avg_age_adjust_heat_hospitalizations_per_10000_loss_min_max_standardized,building_content_cost_exposure_min,building_content_cost_exposure_max,building_content_cost_exposure_min_max_standardized,fire_stations_count_diff_exposure_min,fire_stations_count_diff_exposure_max,fire_stations_count_diff_exposure_min_max_standardized,hospitals_count_diff_exposure_min,hospitals_count_diff_exposure_max,hospitals_count_diff_exposure_min_max_standardized,police_stations_count_diff_exposure_min,police_stations_count_diff_exposure_max,police_stations_count_diff_exposure_min_max_standardized,schools_count_diff_exposure_min,schools_count_diff_exposure_max,schools_count_diff_exposure_min_max_standardized,building_count_exposure_min,building_count_exposure_max,building_count_exposure_min_max_standardized,delta_percentage_change_exposure_min,delta_percentage_change_exposure_max,delta_percentage_change_exposure_min_max_standardized,wastewater_count_exposure_min,wastewater_count_exposure_max,wastewater_count_exposure_min_max_standardized,rcp_4.5__50th_percent_change_loss_min,rcp_4.5__50th_percent_change_loss_max,rcp_4.5__50th_percent_change_loss_min_max_standardized,average_damaged_destroyed_structures_loss_min,average_damaged_destroyed_structures_loss_max,average_damaged_destroyed_structures_loss_min_max_standardized,average_fatalities_loss_min,average_fatalities_loss_max,average_fatalities_loss_min_max_standardized
0,6001400100,22.007144,55.920036,0.471125,15.0,13221.370588,0.013825,795.0,135068.0,0.049148,0.405109,0.832117,0.386752,0.0,40.2825,,0.0,19558.95,0.279355,0.0,22530000.0,0.0,0.0,20.0,0.05,1.0,12.0,0.181818,0.0,1.0,,14.646154,23418.163206,0.032255,10178.523077,663346.368,0.006384,45.625,543.0,0.057552,0.0,2119685.25,0.306689,0.0,3.0,1.0,0.0,3.0,0.0,0.0,3.0,0.333333,0.0,12.0,1.0,0.0,4967.25,0.264734,0.014976,73.41142,,1.0,6.0,0.6,-11.538462,0.0,0.67581,0.0,438.0,0.009132,0.0,20.0,0.0
1,6001400200,22.007144,55.920036,0.471125,15.0,13221.370588,0.013825,795.0,135068.0,0.049148,0.405109,0.832117,0.386752,0.0,40.2825,,0.0,19558.95,,0.0,22530000.0,0.0,0.0,20.0,0.05,1.0,12.0,0.181818,0.0,1.0,,14.646154,23418.163206,0.032255,10178.523077,663346.368,0.006384,45.625,543.0,0.057552,0.0,2119685.25,0.204802,0.0,3.0,1.0,0.0,3.0,0.0,0.0,3.0,0.333333,0.0,12.0,1.0,0.0,4967.25,0.156022,0.014976,73.41142,,1.0,6.0,0.6,-11.538462,0.0,0.67581,0.0,438.0,0.009132,0.0,20.0,0.0
2,6001400300,22.007144,55.920036,0.471125,15.0,13221.370588,0.013825,795.0,135068.0,0.049148,0.405109,0.832117,0.386752,0.0,40.2825,,0.0,19558.95,0.132931,0.0,22530000.0,0.0,0.0,20.0,0.05,1.0,12.0,0.181818,0.0,1.0,,14.646154,23418.163206,0.032255,10178.523077,663346.368,0.006384,45.625,543.0,0.057552,0.0,2119685.25,0.435414,0.0,3.0,1.0,0.0,3.0,0.0,0.0,3.0,0.333333,0.0,12.0,1.0,0.0,4967.25,0.390759,0.014976,73.41142,,1.0,6.0,0.6,-11.538462,0.0,0.67581,0.0,438.0,0.009132,0.0,20.0,0.0
3,6001400400,22.007144,55.920036,0.471125,15.0,13221.370588,0.013825,795.0,135068.0,0.049148,0.405109,0.832117,0.386752,0.0,40.2825,,0.0,19558.95,,0.0,22530000.0,0.0,0.0,20.0,0.05,1.0,12.0,0.181818,0.0,1.0,,14.646154,23418.163206,0.032255,10178.523077,663346.368,0.006384,45.625,543.0,0.057552,0.0,2119685.25,0.280305,0.0,3.0,1.0,0.0,3.0,0.0,0.0,3.0,0.333333,0.0,12.0,1.0,0.0,4967.25,0.267351,0.014976,73.41142,,1.0,6.0,0.6,-11.538462,0.0,0.67581,0.0,438.0,0.009132,0.0,20.0,0.0
4,6001400500,22.007144,55.920036,0.471125,15.0,13221.370588,0.013825,795.0,135068.0,0.049148,0.405109,0.832117,0.386752,0.0,40.2825,,0.0,19558.95,0.134229,0.0,22530000.0,0.0,0.0,20.0,0.05,1.0,12.0,0.181818,0.0,1.0,,14.646154,23418.163206,0.032255,10178.523077,663346.368,0.006384,45.625,543.0,0.057552,0.0,2119685.25,0.187838,0.0,3.0,1.0,0.0,3.0,0.0,0.0,3.0,0.333333,0.0,12.0,1.0,0.0,4967.25,0.212794,0.014976,73.41142,,1.0,6.0,0.6,-11.538462,0.0,0.67581,0.0,438.0,0.009132,0.0,20.0,0.0


## Now isolating for census tract and standardized columns exclusively

In [34]:
words = ['census_tract','standardized']
selected_columns = []
for word in words:
    selected_columns.extend(min_max_metrics.columns[min_max_metrics.columns.str.contains(word)].tolist())

min_max_standardized_climate_metrics_df = min_max_metrics[selected_columns]
pd.set_option('display.max_columns', None)

min_max_standardized_climate_metrics_df

Unnamed: 0,census_tract,sum_d1_d4_exposure_min_max_standardized,determined_acres_loss_x_min_max_standardized,indemnity_amount_loss_x_min_max_standardized,percent_weeks_drought_exposure_min_max_standardized,protected_areas_percentage_exposure_min_max_standardized,avg_insurance_payout_per_claim_loss_min_max_standardized,estimated_crop_loss_cost_loss_min_max_standardized,total_fatalities_loss_min_max_standardized,median_warning_days_exposure_min_max_standardized,change_chill_hours_min_max_standardized_loss_min,change_chill_hours_min_max_standardized_loss_max,change_chill_hours_min_max_standardized_loss_min_max_standardized,determined_acres_loss_y_min_max_standardized,indemnity_amount_loss_y_min_max_standardized,avg_age_adjust_heat_hospitalizations_per_10000_loss_min_max_standardized,building_content_cost_exposure_min_max_standardized,fire_stations_count_diff_exposure_min_max_standardized,hospitals_count_diff_exposure_min_max_standardized,police_stations_count_diff_exposure_min_max_standardized,schools_count_diff_exposure_min_max_standardized,building_count_exposure_min_max_standardized,delta_percentage_change_exposure_min_max_standardized,wastewater_count_exposure_min_max_standardized,rcp_4.5__50th_percent_change_loss_min_max_standardized,average_damaged_destroyed_structures_loss_min_max_standardized,average_fatalities_loss_min_max_standardized
0,6001400100,0.471125,0.013825,0.049148,0.386752,,0.279355,0.0,0.05,0.181818,0.0,1.0,,0.032255,0.006384,0.057552,0.306689,1.0,0.0,0.333333,1.0,0.264734,,0.6,0.67581,0.009132,0.0
1,6001400200,0.471125,0.013825,0.049148,0.386752,,,0.0,0.05,0.181818,0.0,1.0,,0.032255,0.006384,0.057552,0.204802,1.0,0.0,0.333333,1.0,0.156022,,0.6,0.67581,0.009132,0.0
2,6001400300,0.471125,0.013825,0.049148,0.386752,,0.132931,0.0,0.05,0.181818,0.0,1.0,,0.032255,0.006384,0.057552,0.435414,1.0,0.0,0.333333,1.0,0.390759,,0.6,0.67581,0.009132,0.0
3,6001400400,0.471125,0.013825,0.049148,0.386752,,,0.0,0.05,0.181818,0.0,1.0,,0.032255,0.006384,0.057552,0.280305,1.0,0.0,0.333333,1.0,0.267351,,0.6,0.67581,0.009132,0.0
4,6001400500,0.471125,0.013825,0.049148,0.386752,,0.134229,0.0,0.05,0.181818,0.0,1.0,,0.032255,0.006384,0.057552,0.187838,1.0,0.0,0.333333,1.0,0.212794,,0.6,0.67581,0.009132,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9124,6115040902,0.500237,,,0.388889,0.000496,,0.0,0.15,0.000000,0.0,1.0,,1.000000,0.587765,0.457284,,0.0,0.0,0.000000,0.0,0.354723,,,,0.050228,0.0
9125,6115041001,0.500237,,,0.388889,1.000000,,0.0,0.15,0.090909,0.0,1.0,,1.000000,0.587765,0.457284,,0.0,0.0,0.000000,0.0,,,,,0.050228,0.0
9126,6115041002,0.500237,,,0.388889,0.147955,,0.0,0.15,0.090909,0.0,1.0,,1.000000,0.587765,0.457284,,0.0,0.0,0.000000,0.0,,,,,0.050228,0.0
9127,6115041101,0.500237,,,0.388889,0.008192,,0.0,0.15,0.272727,0.0,1.0,,1.000000,0.587765,0.457284,,0.0,0.0,0.000000,0.0,,,,,0.050228,0.0


In [35]:
min_max_standardized_climate_metrics_df.columns

Index(['census_tract', 'sum_d1_d4_exposure_min_max_standardized',
       'determined_acres_loss_x_min_max_standardized',
       'indemnity_amount_loss_x_min_max_standardized',
       'percent_weeks_drought_exposure_min_max_standardized',
       'protected_areas_percentage_exposure_min_max_standardized',
       'avg_insurance_payout_per_claim_loss_min_max_standardized',
       'estimated_crop_loss_cost_loss_min_max_standardized',
       'total_fatalities_loss_min_max_standardized',
       'change_chill_hours_min_max_standardized_loss_min',
       'change_chill_hours_min_max_standardized_loss_max',
       'change_chill_hours_min_max_standardized_loss_min_max_standardized',
       'determined_acres_loss_y_min_max_standardized',
       'indemnity_amount_loss_y_min_max_standardized',
       'avg_age_adjust_heat_hospitalizations_per_10000_loss_min_max_standardized',
       'building_content_cost_exposure_min_max_standardized',
       'fire_stations_count_diff_exposure_min_max_standardized',


In [36]:
# Identify columns containing 'loss' and 'exposure'
loss_columns = [col for col in min_max_standardized_climate_metrics_df.columns if 'loss' in col]
exposure_columns = [col for col in min_max_standardized_climate_metrics_df.columns if 'exposure' in col]

# Safely sum columns, avoiding KeyErrors
min_max_standardized_climate_metrics_df['total_loss'] = min_max_standardized_climate_metrics_df[loss_columns].sum(axis=1, skipna=True)
min_max_standardized_climate_metrics_df['total_exposure'] = min_max_standardized_climate_metrics_df[exposure_columns].sum(axis=1, skipna=True)

# Select only the new columns to create the resulting DataFrame
summed_exposure_loss = min_max_standardized_climate_metrics_df[['census_tract','total_loss', 'total_exposure']]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  min_max_standardized_climate_metrics_df['total_loss'] = min_max_standardized_climate_metrics_df[loss_columns].sum(axis=1, skipna=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  min_max_standardized_climate_metrics_df['total_exposure'] = min_max_standardized_climate_metrics_df[exposure_columns].sum(axis=1, skipna=True)


In [37]:
summed_exposure_loss

Unnamed: 0,census_tract,total_loss,total_exposure
0,6001400100,2.173463,4.544452
1,6001400200,1.894108,4.333853
2,6001400300,2.027039,4.799203
3,6001400400,1.894108,4.520685
4,6001400500,2.028336,4.373661
...,...,...,...
9124,6115040902,3.245278,1.244346
9125,6115041001,3.245278,1.980035
9126,6115041002,3.245278,1.127990
9127,6115041101,3.245278,1.170045


In [42]:
product_exposure_loss = summed_exposure_loss
product_exposure_loss['loss_exposure_product'] = summed_exposure_loss['total_exposure'] * summed_exposure_loss['total_loss']

product_exposure_loss = product_exposure_loss[['census_tract', 'loss_exposure_product']]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  product_exposure_loss['loss_exposure_product'] = summed_exposure_loss['total_exposure'] * summed_exposure_loss['total_loss']


In [43]:
# standardizing our df
min_max_climate_product = product_exposure_loss.copy()
columns_to_process = [col for col in min_max_climate_product.columns if col != 'census_tract']
min_max_climate_product = min_max_standardize(min_max_climate_product, columns_to_process)

min_max_climate_product.head()

Unnamed: 0,census_tract,loss_exposure_product_min,loss_exposure_product_max,loss_exposure_product_min_max_standardized
0,6001400100,0.0,20.094262,0.491543
1,6001400200,0.0,20.094262,0.408514
2,6001400300,0.0,20.094262,0.484127
3,6001400400,0.0,20.094262,0.426125
4,6001400500,0.0,20.094262,0.441482


In [51]:
words = ['census_tract','standardized']
selected_columns = []
for word in words:
    selected_columns.extend(min_max_climate_product.columns[min_max_climate_product.columns.str.contains(word)].tolist())

min_max_standardized_climate_products = min_max_climate_product[selected_columns]
pd.set_option('display.max_columns', None)

min_max_standardized_climate_products = min_max_standardized_climate_products.rename(columns={'census_tract':'GEOID'})

## Calling census tract shape files so we can reproject and map our data

In [45]:
census_shp_dir = "s3://ca-climate-index/0_map_data/2021_tiger_census_tract/2021_ca_tract/"
ca_boundaries = gpd.read_file(census_shp_dir)
ca_boundaries['GEOID'] = ca_boundaries['GEOID'].astype(str)
ca_boundaries.head()

Unnamed: 0,STATEFP,COUNTYFP,TRACTCE,GEOID,NAME,NAMELSAD,MTFCC,FUNCSTAT,ALAND,AWATER,INTPTLAT,INTPTLON,geometry
0,6,85,504321,6085504321,5043.21,Census Tract 5043.21,G5020,S,1450237,0,37.3931319,-121.8651427,"POLYGON ((-121.87556 37.39924, -121.87535 37.3..."
1,6,85,504410,6085504410,5044.1,Census Tract 5044.10,G5020,S,1102136,0,37.4093719,-121.8788884,"POLYGON ((-121.88886 37.40758, -121.88576 37.4..."
2,6,85,507003,6085507003,5070.03,Census Tract 5070.03,G5020,S,9529865,0,37.2199936,-121.9979512,"POLYGON ((-122.02489 37.21683, -122.02459 37.2..."
3,6,85,507004,6085507004,5070.04,Census Tract 5070.04,G5020,S,2424447,0,37.2260144,-121.9763816,"POLYGON ((-121.99304 37.22562, -121.99249 37.2..."
4,6,85,502204,6085502204,5022.04,Census Tract 5022.04,G5020,S,331145,0,37.3013832,-121.9258424,"POLYGON ((-121.93167 37.29803, -121.92801 37.3..."


In [52]:
min_max_standardized_climate_products.to_csv('climate_products.csv', index=False)

In [53]:
bucket_name = 'ca-climate-index'
directory = '3_fair_data/domain_standardized_data'

upload_csv_aws(['climate_products.csv'], bucket_name, directory)

climate_products.csv uploaded to AWS
