# Cal-CRAI Index Calculation Testing

**Order of operations**:
- Metric handling
   - Retrieve data
   - Outlier handling
   - Min-max standardization
   - Set vulnerability orientation (pos for vulnerable, neg for resilience)
- Calculate indicators
   - Min-max standardization
- Calculate domain score
- Visualizations

Note: Functions and figures within this notebook are in development

In [1]:
import pandas as pd
import os
import sys
import numpy as np
import shutil
import glob
import geopandas as gpd
import matplotlib.pyplot as plt

# suppress pandas purely educational warnings
from warnings import simplefilter
simplefilter(action="ignore", category=pd.errors.PerformanceWarning)

sys.path.append(os.path.expanduser('../../'))
from scripts.utils.file_helpers import pull_csv_from_directory, upload_csv_aws
from scripts.utils.write_metadata import append_metadata
from scripts.utils.index_plot import index_plot
from scripts.utils.calculate_index import handle_outliers, min_max_standardize, process_domain_csv_files, compute_averaged_indicators, compute_summed_indicators

### Step 1: Retrieve metric files and process

In [2]:
## set-up
bucket_name = 'ca-climate-index'
aws_dir = '3_fair_data/index_data/'

# metric tracking (to do: add public facing version)
# Resolved relative to the repository root ('../../', the same root appended to
# sys.path for the `scripts` imports) instead of one user's absolute path, so the
# notebook runs from any checkout. Set the CRAI_META_CSV environment variable to
# point at a copy of the tracking sheet that lives somewhere else.
meta_csv = os.environ.get(
    'CRAI_META_CSV',
    os.path.join('..', '..', 'scripts', 'utils', 'Full Data Pipeline Notes - 4_ Calculate Metric.csv'),
)

# domain-specific
domain_prefix = 'society_'  # You can change this to any prefix you need
output_folder = domain_prefix + "folder"

In [3]:
# file name under which the merged, processed metrics are saved locally
merged_output_file = 'concatenate_society_economy_metrics.csv'

# retrieve data from AWS
pull_csv_from_directory(
    bucket_name,
    aws_dir,
    search_zipped=False,
    print_name=False,
)

# process the domain's metric files into the single merged CSV named above
process_domain_csv_files(
    domain_prefix,
    output_folder,
    meta_csv,
    merged_output_file,
)

Metric data retrieved from 3_fair_data/index_data/.
Processed and saved 33 CSV files.
Deleted 68 local non-relevant CSV files.

All entries within the island tract (6075980401) are NaN.

Number of infinite entries in the DataFrame: 67
Replacing infinite entries (if any) with NaN
Number of infinite entries in the DataFrame: 0

File processing complete, dataframe will now be saved as a .csv
Processed CSV saved as concatenate_society_economy_metrics.csv


In [4]:
# read-in and view processed data
# The file name is the same as `merged_output_file` saved by the retrieval cell.
# GEOID is parsed as a number here, so it carries no leading zero; the zero is
# added back further down before merging with the census tract boundaries.
processed_society_df = pd.read_csv('concatenate_society_economy_metrics.csv')
processed_society_df

Unnamed: 0,GEOID,health and personal care stores per 10000 people_resilient,poverty_percent_2019_vulnerable,hospitals per 10000 people_resilient,percent_population_without_health_insurance_vulnerable,linguistic_isolation_percent_2019_vulnerable,avg_hpsscore_primary_care_metric_vulnerable,real_percent_total_pop_over_65_vulnerable,percent_population_cognitive_disabilities_vulnerable,Percent of households without air conditioning_vulnerable,...,hachman_index_vulnerable,percent_1miurban_10mirural_vulnerable,Percent of population employed and aged > 16 working outdoors_vulnerable,est_gini_index_vulnerable,specialty trade contractors per 10000 people_resilient,personal and household goods repair and maintenance per 10000 people_resilient,"Number of Violent Crimes per 10,000 Population_vulnerable",avg_percent_population_homeless_vulnerable,blood and organ banks per 10000 people_resilient,percent_total_pop_american_indian_alaska_native_vulnerable
0,6085504321,2.201550,17.500000,0.083471,3.843311,21.100000,,16.555802,5.210552,35.430875,...,0.788695,27.547393,2.022881,0.4197,11.837246,0.511261,31.703849,0.691595,,0.3
1,6085504410,2.201550,23.100000,0.083471,5.237633,21.100000,,16.197866,3.507880,35.430875,...,0.788695,27.547393,1.144842,0.4309,11.837246,0.511261,31.703849,0.691595,,1.3
2,6085507003,2.201550,17.793413,0.083471,0.260247,11.064134,,26.545218,1.706572,35.430875,...,0.788695,27.547393,4.619098,0.5357,11.837246,0.511261,31.703849,0.691595,,0.0
3,6085507004,2.201550,17.793413,0.083471,0.305655,11.064134,,21.599592,2.310994,35.430875,...,0.788695,27.547393,4.619098,0.4793,11.837246,0.511261,31.703849,0.691595,,0.0
4,6085502204,2.201550,17.793413,0.083471,4.441703,11.064134,,10.579889,9.356358,35.430875,...,0.788695,27.547393,4.619098,0.4010,11.837246,0.511261,31.703849,0.691595,,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9124,6059001303,3.561950,28.300000,0.119676,7.751343,10.200000,,10.744436,4.544180,28.057946,...,0.934257,25.604689,4.455156,0.3590,14.890274,0.834586,25.871294,0.465096,0.050390,0.4
9125,6059001304,3.561950,46.900000,0.119676,19.607293,15.800000,,18.513324,4.226241,28.057946,...,0.934257,25.604689,8.676318,0.3582,14.890274,0.834586,25.871294,0.465096,0.050390,3.9
9126,6059001401,3.561950,37.600000,0.119676,11.480235,13.300000,,16.000841,6.016556,28.057946,...,0.934257,25.604689,6.810978,0.4594,14.890274,0.834586,25.871294,0.465096,0.050390,2.2
9127,6013367200,1.840626,37.100000,0.197824,6.440620,10.500000,,8.843074,3.351287,30.390365,...,0.919995,36.159475,9.026887,0.4130,13.125211,0.498861,42.765917,0.603696,,0.1


### Step 2: Outlier handling
Outliers are handled according to the process laid out by Baum et al. 1970 [link]. Values beyond the max/min fence values are reset to the fence values.
* Max fence = 3 x 75th percentile
* Min fence = -3 x 25th percentile

In [12]:
# handle outliers
# handle_outliers is a project helper (scripts.utils.calculate_index); it returns the
# outlier-handled frame and, per its printed message, also saves it as a CSV.
# NOTE(review): summary_stats=False presumably turns off a per-metric summary printout -- confirm.
no_outlier_society_economy_metrics = handle_outliers(processed_society_df, domain_prefix, summary_stats=False)
no_outlier_society_economy_metrics

Processed and saved no_outlier_society_metrics.csv with outlier handling.


Unnamed: 0,GEOID,health and personal care stores per 10000 people_resilient,poverty_percent_2019_vulnerable,hospitals per 10000 people_resilient,percent_population_without_health_insurance_vulnerable,linguistic_isolation_percent_2019_vulnerable,avg_hpsscore_primary_care_metric_vulnerable,real_percent_total_pop_over_65_vulnerable,percent_population_cognitive_disabilities_vulnerable,Percent of households without air conditioning_vulnerable,...,hachman_index_vulnerable,percent_1miurban_10mirural_vulnerable,Percent of population employed and aged > 16 working outdoors_vulnerable,est_gini_index_vulnerable,specialty trade contractors per 10000 people_resilient,personal and household goods repair and maintenance per 10000 people_resilient,"Number of Violent Crimes per 10,000 Population_vulnerable",avg_percent_population_homeless_vulnerable,blood and organ banks per 10000 people_resilient,percent_total_pop_american_indian_alaska_native_vulnerable
0,6085504321,2.201550,17.500000,0.083471,3.843311,21.100000,,16.555802,5.210552,35.430875,...,0.788695,27.547393,2.022881,0.4197,11.837246,0.511261,31.703849,0.691595,,0.3
1,6085504410,2.201550,23.100000,0.083471,5.237633,21.100000,,16.197866,3.507880,35.430875,...,0.788695,27.547393,1.144842,0.4309,11.837246,0.511261,31.703849,0.691595,,1.3
2,6085507003,2.201550,17.793413,0.083471,0.260247,11.064134,,26.545218,1.706572,35.430875,...,0.788695,27.547393,4.619098,0.5357,11.837246,0.511261,31.703849,0.691595,,0.0
3,6085507004,2.201550,17.793413,0.083471,0.305655,11.064134,,21.599592,2.310994,35.430875,...,0.788695,27.547393,4.619098,0.4793,11.837246,0.511261,31.703849,0.691595,,0.0
4,6085502204,2.201550,17.793413,0.083471,4.441703,11.064134,,10.579889,9.356358,35.430875,...,0.788695,27.547393,4.619098,0.4010,11.837246,0.511261,31.703849,0.691595,,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9124,6059001303,3.561950,28.300000,0.119676,7.751343,10.200000,,10.744436,4.544180,28.057946,...,0.934257,25.604689,4.455156,0.3590,14.890274,0.834586,25.871294,0.465096,0.050390,0.4
9125,6059001304,3.561950,46.900000,0.119676,19.607293,15.800000,,18.513324,4.226241,28.057946,...,0.934257,25.604689,8.676318,0.3582,14.890274,0.834586,25.871294,0.465096,0.050390,3.9
9126,6059001401,3.561950,37.600000,0.119676,11.480235,13.300000,,16.000841,6.016556,28.057946,...,0.934257,25.604689,6.810978,0.4594,14.890274,0.834586,25.871294,0.465096,0.050390,2.2
9127,6013367200,1.840626,37.100000,0.197824,6.440620,10.500000,,8.843074,3.351287,30.390365,...,0.919995,36.159475,9.026887,0.4130,13.125211,0.498861,42.765917,0.603696,,0.1


### Step 3: Min-max standardization
Metrics are min-max standardized on a 0 to 1 scale.

In [14]:
# standardizing our df: every column except the GEOID identifier is passed on
columns_to_process = [
    column_name
    for column_name in no_outlier_society_economy_metrics.columns
    if column_name != 'GEOID'
]

min_max_metrics = min_max_standardize(
    no_outlier_society_economy_metrics,
    columns_to_process,
)

# preview the first rows only
min_max_metrics.head()

All standardized columns are within the [0, 1] range (considering tolerance).


Unnamed: 0,GEOID,health and personal care stores per 10000 people_resilient_min,health and personal care stores per 10000 people_resilient_max,health and personal care stores per 10000 people_resilient_min_max_standardized,poverty_percent_2019_vulnerable_min,poverty_percent_2019_vulnerable_max,poverty_percent_2019_vulnerable_min_max_standardized,hospitals per 10000 people_resilient_min,hospitals per 10000 people_resilient_max,hospitals per 10000 people_resilient_min_max_standardized,...,"Number of Violent Crimes per 10,000 Population_vulnerable_min_max_standardized",avg_percent_population_homeless_vulnerable_min,avg_percent_population_homeless_vulnerable_max,avg_percent_population_homeless_vulnerable_min_max_standardized,blood and organ banks per 10000 people_resilient_min,blood and organ banks per 10000 people_resilient_max,blood and organ banks per 10000 people_resilient_min_max_standardized,percent_total_pop_american_indian_alaska_native_vulnerable_min,percent_total_pop_american_indian_alaska_native_vulnerable_max,percent_total_pop_american_indian_alaska_native_vulnerable_min_max_standardized
0,6085504321,0.90808,3.782434,0.450004,1.0,91.6,0.182119,0.083471,0.446305,0.0,...,0.160749,0.098746,2.572686,0.239638,0.02058,0.15117,,0.0,3.9,0.076923
1,6085504410,0.90808,3.782434,0.450004,1.0,91.6,0.243929,0.083471,0.446305,0.0,...,0.160749,0.098746,2.572686,0.239638,0.02058,0.15117,,0.0,3.9,0.333333
2,6085507003,0.90808,3.782434,0.450004,1.0,91.6,0.185358,0.083471,0.446305,0.0,...,0.160749,0.098746,2.572686,0.239638,0.02058,0.15117,,0.0,3.9,0.0
3,6085507004,0.90808,3.782434,0.450004,1.0,91.6,0.185358,0.083471,0.446305,0.0,...,0.160749,0.098746,2.572686,0.239638,0.02058,0.15117,,0.0,3.9,0.0
4,6085502204,0.90808,3.782434,0.450004,1.0,91.6,0.185358,0.083471,0.446305,0.0,...,0.160749,0.098746,2.572686,0.239638,0.02058,0.15117,,0.0,3.9,0.0


### Step 4: Now isolating for GEOID and standardized columns exclusively

In [None]:
# Keep the tract identifier plus every column whose name contains 'standardized'.
# Columns are collected keyword by keyword, so GEOID comes first.
words = ['GEOID','standardized']
selected_columns = [
    col
    for word in words
    for col in min_max_metrics.columns
    if word in col
]

# .copy() makes this an independent frame: the next cell assigns into it, and
# assigning into a bare column selection is what triggers pandas' SettingWithCopyWarning.
min_max_standardized_society_economy_metrics_df = min_max_metrics[selected_columns].copy()
pd.set_option('display.max_columns', None)

min_max_standardized_society_economy_metrics_df

## Now need to refactor metrics where high values indicate resilience
* subtract resilient columns values from 1

In [None]:
# Select columns with 'resilient' in their names
resilient_columns = [col for col in min_max_standardized_society_economy_metrics_df.columns if 'resilient' in col]

# Use .loc to ensure you're modifying the DataFrame correctly
min_max_standardized_society_economy_metrics_df.loc[:, resilient_columns] = 1 - min_max_standardized_society_economy_metrics_df.loc[:, resilient_columns]

In [None]:
# inspect the frame after the resilient columns were inverted
min_max_standardized_society_economy_metrics_df

In [None]:
# list the column names; the keywords in the metric-to-indicator dictionary are matched against these
min_max_standardized_society_economy_metrics_df.columns

## Dictionary to associate a keyword within the df column with its indicator

In [None]:
# Keywords are fragments of metric column names; each list below holds the
# keywords of the metrics that belong to one indicator.
vulnerable_populations_keywords = [
    'asthma',
    'cardiovascular_disease',
    'birth_weight',
    'education',
    'linguistic',
    'poverty',
    'unemployment',
    'housing_burden',
    'imp_water_bodies',
    'homeless',
    'health_insurance',
    'ambulatory_disabilities',
    'cognitive_disabilities',
    'air conditioning',
    'Violent Crimes',
    'working outdoors',
    '1miurban_10mirural',
    'american_indian',
    'over_65',
    'under_5',
    'household_financial_assistance',
]

social_services_keywords = [
    'blood',
    'hospitals',
    'care store',
    'engineering',
    'specialty trade',
    'repair',
    'mental_shortage',
    'primary_care',
    'narcotic',
]

economic_health_keywords = [
    'gini',
    'median_income',
    'hachman',
]

# indicator name -> keywords identifying its metric columns
metric_to_indicator_society_dict = {
    'vulnerable_populations': vulnerable_populations_keywords,
    'social_services': social_services_keywords,
    'economic_health': economic_health_keywords,
}

## Loop through the df columns and average the metrics that belong to each indicator, based on the metric-to-indicator dictionary

In [None]:
# Average the standardized metrics within each indicator.
# The frame and the keyword dictionary are passed directly rather than through
# `df` / `dict` aliases: binding the name `dict` at notebook level would shadow the
# Python builtin for every cell that runs afterwards.
averaged_indicators_society_economy = compute_averaged_indicators(
    min_max_standardized_society_economy_metrics_df,
    metric_to_indicator_society_dict
)

In [None]:
# view the frame returned by compute_averaged_indicators
averaged_indicators_society_economy

## Sum all the non-census tract columns together to calculate the domain score

In [None]:
# every indicator column (i.e. everything but the GEOID identifier) goes into the sum
columns_to_sum = [
    indicator_name
    for indicator_name in averaged_indicators_society_economy.columns
    if indicator_name != 'GEOID'
]

summed_indicators_society_economy = compute_summed_indicators(
    df=averaged_indicators_society_economy,
    columns_to_sum=columns_to_sum,
)


## Min-max standardize the summed columns

In [None]:
# columns to standardize: everything in the summed frame except the GEOID identifier
columns_to_process = [
    column_name
    for column_name in summed_indicators_society_economy.columns
    if column_name != 'GEOID'
]

# standardize a copy so the summed frame itself is handed over unmodified
min_max_domain = min_max_standardize(
    summed_indicators_society_economy.copy(),
    columns_to_process,
)

print(len(min_max_domain))
min_max_domain.head()

## Isolate to census tract and summed standardized columns, and rename tract to GEOID for merging
* add a zero at the beginning of the GEOID to match census tract that will be merged

In [None]:
def _format_geoid(value, width=11):
    """Return `value` as a zero-padded census tract GEOID string.

    A trailing decimal part (e.g. '6085504321.0' when GEOID was read as a float)
    is stripped first, then the text is left-padded with zeros to `width`
    characters. The default of 11 is the length of a census tract GEOID, so a
    10-digit value gains the single leading zero it lost when parsed as a number,
    while a value that is already 11 characters long is left as it is.
    """
    text = str(value)
    if '.' in text:
        text = text.rstrip('0').rstrip('.')
    return text.zfill(width)


# .copy() so the GEOID column below is assigned on an independent frame rather than
# on a column selection of min_max_domain (avoids pandas' SettingWithCopyWarning)
min_max_standardized_society_economy_domain = min_max_domain[['GEOID', 'summed_indicators_society_economy_domain_min_max_standardized']].copy()

min_max_standardized_society_economy_domain['GEOID'] = min_max_standardized_society_economy_domain['GEOID'].map(_format_geoid)

# Print the DataFrame to check the 'GEOID' column
print(len(min_max_standardized_society_economy_domain))
min_max_standardized_society_economy_domain

## Calling census tract shape files so we can reproject and map our data

In [None]:
# California census tract boundaries (2021 TIGER, per the directory name), read straight from S3
census_shp_dir = "s3://ca-climate-index/0_map_data/2021_tiger_census_tract/2021_ca_tract/"
ca_boundaries = gpd.read_file(census_shp_dir)
# GEOID is cast to string so it can be merged with the string GEOIDs of the domain scores
ca_boundaries['GEOID'] = ca_boundaries['GEOID'].astype(str)
ca_boundaries.head()

## Merge the df and census tracts and set the geometry's coordinate reference system to our uniformly used CRS (EPSG:4269)
* Map it!

In [None]:
# The merge below is an inner join, so tracts whose GEOID has no counterpart in the
# census boundaries would disappear silently; report how many there are first.
unmatched_geoids = ~min_max_standardized_society_economy_domain['GEOID'].isin(ca_boundaries['GEOID'])
print("Number of tracts without a matching census boundary:", int(unmatched_geoids.sum()))

# Merge the DataFrames
df2 = min_max_standardized_society_economy_domain.merge(ca_boundaries, on='GEOID')

# Convert to GeoDataFrame with the correct CRS
df2 = gpd.GeoDataFrame(df2, geometry='geometry', crs=4269)

# Check for invalid geometries (they are only counted here, not removed)
invalid_geometries = df2[~df2['geometry'].is_valid]
print("Number of invalid geometries:", len(invalid_geometries))

# An empty frame means the GEOID merge matched nothing, so there is nothing to draw
if len(df2) == 0:
    print("Merge on GEOID returned no rows. Cannot plot.")
else:
    # Set up the figure
    fig, ax = plt.subplots(1, 1, figsize=(7, 10), layout='compressed')

    # Plot the data
    df2.plot(column='summed_indicators_society_economy_domain_min_max_standardized',
             ax=ax,
             vmin=0, vmax=1,
             legend=True,
             cmap='RdYlBu_r',
             legend_kwds={'label': 'Vulnerability (larger values are more vulnerable)', 'orientation': 'horizontal', 'shrink': 1.0, 'pad': 0.03})

    # Set title
    ax.set_title('California Vulnerability - Society & Economy Domain', fontsize = 16.5)

    # Display the plot
    plt.show()


In [None]:
# save the merged domain scores and census tract columns (geometry included) to a local CSV, without the index
df2.to_csv('society_economy_index_data.csv', index=False)

In [None]:
# destination bucket / folder for the standardized domain data
bucket_name = 'ca-climate-index'
directory = '3_fair_data/domain_standardized_data'

# local file(s) to push to AWS
files_to_upload = ['society_economy_index_data.csv']

upload_csv_aws(files_to_upload, bucket_name, directory)

## Selecting counties to be mapped, by the COUNTYFP number
* manually enter the names of the counties linked with their number

In [None]:
# County FIPS codes (COUNTYFP) mapped to the label drawn on the map
county_labels = {
    '037': 'Los \n Angeles',
    '071': 'San Bernardino',
    '065': 'Riverside',
    '029': 'Kern',
    '111': 'Ventura'
}

# Counties to map -- taken from county_labels so codes and labels cannot drift apart
list_of_counties = list(county_labels)

# Merged under its own name so the statewide GeoDataFrame `df2` (the frame that gets
# exported to CSV) is not overwritten with a plain DataFrame when this cell runs
county_merged = min_max_standardized_society_economy_domain.merge(ca_boundaries, on='GEOID')

# Filtering rows where COUNTYFP is in the list_of_counties
df2_filtered = county_merged[county_merged['COUNTYFP'].isin(list_of_counties)]

# Convert to GeoDataFrame with the correct CRS
df2_filtered = gpd.GeoDataFrame(df2_filtered, geometry='geometry', crs=4269)

# Check for invalid geometries (they are only counted here, not removed)
invalid_geometries = df2_filtered[~df2_filtered['geometry'].is_valid]
print("Number of invalid geometries:", len(invalid_geometries))

# Dissolve merges all tract geometries sharing a COUNTYFP into a single county shape
county_boundaries = df2_filtered.dissolve(by='COUNTYFP')['geometry']

# Set up the figure
fig, ax = plt.subplots(1, 1, figsize=(6, 12), layout='compressed')

# Plot the data
df2_filtered.plot(column='summed_indicators_society_economy_domain_min_max_standardized',
         ax=ax,
         vmin=0, vmax=1,
         legend=True,
         cmap='RdYlBu_r',
         legend_kwds={'label': 'Vulnerability (larger values are more vulnerable)', 'orientation': 'horizontal', 'shrink': 0.9, 'pad': -0.3})

# Plot county boundaries after the filled tracts so the outlines are drawn on top of
# them instead of being covered by the fill
county_boundaries.boundary.plot(ax=ax, linewidth=0.7, edgecolor='black')

# Add county labels at each county's centroid
for county_code, label in county_labels.items():
    # a county with no tracts in the merged data has no shape to label
    if county_code not in county_boundaries.index:
        continue
    centroid = county_boundaries[county_code].centroid
    ax.text(centroid.x, centroid.y, label, weight='light', fontsize=9, ha='center', va='baseline')

ax.set_title('Society & Economy Domain - LA & Surrounding Areas', fontsize=16)

# Display the plot
plt.show()