# Cal-CRAI Index Calculation Testing

Note: Functions and figures within this notebook are in development

In [1]:
import pandas as pd
import os
import sys
import numpy as np
import shutil
import glob
import geopandas as gpd
import matplotlib.pyplot as plt

# suppress pandas purely educational warnings
from warnings import simplefilter
simplefilter(action="ignore", category=pd.errors.PerformanceWarning)

sys.path.append(os.path.expanduser('../../'))
from scripts.utils.file_helpers import pull_csv_from_directory, upload_csv_aws
from scripts.utils.write_metadata import append_metadata
from scripts.utils.index_plot import index_plot
from scripts.utils.calculate_index import handle_outliers, min_max_standardize

## Pulling all calculated metric files

In [2]:
# pull csv from aws
bucket_name = 'ca-climate-index'
aws_dir = '3_fair_data/index_data/'

pull_csv_from_directory(bucket_name, aws_dir, search_zipped=False)

Saved DataFrame as 'built_broadband_internet_metric.csv'
Saved DataFrame as 'built_cellular_towers_metric.csv'
Saved DataFrame as 'built_energy_transmission_lines_metric.csv'
Saved DataFrame as 'built_housing_before_1980_metric.csv'
Saved DataFrame as 'built_housing_mobile_homes_metric.csv'
Saved DataFrame as 'built_housing_quality_metric.csv'
Saved DataFrame as 'built_housing_vacancy_metric.csv'
Saved DataFrame as 'built_metric_housing_vacancy_metric.csv'
Saved DataFrame as 'built_microwave_towers_metric.csv'
Saved DataFrame as 'built_mobile_towers_metric.csv'
Saved DataFrame as 'built_paging_towers_metric.csv'
Saved DataFrame as 'built_power_plant_metric.csv'
Saved DataFrame as 'built_power_shutoffs_metric.csv'
Saved DataFrame as 'built_radio_towers_metric.csv'
Saved DataFrame as 'built_transportation_airports_metric.csv'
Saved DataFrame as 'built_transportation_bottleneck_metric.csv'
Saved DataFrame as 'built_transportation_bridge_metric.csv'
Saved DataFrame as 'built_transportation

In [3]:
# change to call in your local dir's calculate metric tab from the full data pipeline spreadsheet
meta_csv = r'C:/Users/jespi/eagle/carb-climate-index-9/metadata/Full Data Pipeline Notes - 4_ Calculate Metric.csv'
# read in first tab of the sheet
df = pd.read_csv(meta_csv)
df.columns

Index(['Data Source', 'Data', 'Domain', 'Indicator', 'Metric',
       'High value result (vulnerable or resilient)', 'Metric file name',
       'Metric Calculated (Y/N)', 'Metadata Finalized (Y/N)',
       'Metric Calculator', 'Metric goes into what indicator',
       'Includes Indigenous Tribes', 'Notes'],
      dtype='object')

Function call

In [4]:
def process_domain_csv_files(prefix, output_folder, meta_csv, merged_output_file):
    # Create the output folder if it doesn't exist
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)

    # Load the metadata CSV
    df = pd.read_csv(meta_csv)

    # Get the list of metric file names and corresponding 'High value result' entries
    metric_files = df[['Metric file name', 'High value result (vulnerable or resilient)']]

    # Find all CSV files starting with the provided prefix and matching the metric file names
    source_files = [file for file in glob.glob(f'{prefix}*.csv') if os.path.basename(file) in metric_files['Metric file name'].values]

    # Iterate through the source files and process them
    for file in source_files:
        # Get the 'High value result (vulnerable or resilient)' entry for the current file
        column_result = metric_files.loc[metric_files['Metric file name'] == os.path.basename(file), 'High value result (vulnerable or resilient)'].values[0]

        # Load the CSV file
        csv_df = pd.read_csv(file)

        # Get the last column name
        last_column = csv_df.columns[-1]

        # Append the column result to the last column name
        csv_df.rename(columns={last_column: f"{last_column}_{column_result}"}, inplace=True)

        # Construct the destination file path
        destination_path = os.path.join(output_folder, os.path.basename(file))

        # Save the modified CSV to the output folder
        csv_df.to_csv(destination_path, index=False)

        # Remove the original file
        os.remove(file)

    print(f"Processed and saved {len(source_files)} CSV files.")

    # Delete all CSV files in the current directory that are not in the output folder
    current_files = glob.glob('*.csv')
    for file in current_files:
        if file not in [os.path.basename(f) for f in source_files]:
            os.remove(file)

    print(f"Deleted {len(current_files) - len(source_files)} local non-relevant CSV files.")
    print('')

    # --- Additional Processing: Merging CSV Files ---

    # Get a list of all CSV files in the output folder
    csv_files = glob.glob(os.path.join(output_folder, '*.csv'))

    # Initialize an empty DataFrame for merging
    merged_df = pd.DataFrame()

    # Iterate through each CSV file and merge them on the 'census_tract' column
    for file in csv_files:
        # Read the CSV file into a DataFrame
        df = pd.read_csv(file)
        
        # Rename 'GEO_ID', 'tract', 'TRACT', 'Census_Tract', 'GEOID', 'USCB_GEOID' to 'census_tract' if they exist
        rename_cols = ['GEO_ID', 'tract', 'TRACT', 'Census_Tract', 'GEOID', 'USCB_GEOID']
        for col in rename_cols:
            if col in df.columns:
                df.rename(columns={col: 'census_tract'}, inplace=True)
                break
        
        # Keep only the 'census_tract' and the last column from each file
        last_column = df.columns[-1]
        df = df[['census_tract', last_column]]
        
        # Merge the DataFrame with the existing merged DataFrame
        if merged_df.empty:
            merged_df = df
        else:
            merged_df = pd.merge(merged_df, df, on='census_tract', how='outer')

    # Drop rows where 'census_tract' is NaN
    merged_df = merged_df.dropna(subset=['census_tract'])

    # Convert census tract to string and eliminate scientific notation default
    merged_df['census_tract'] = merged_df['census_tract'].dropna().apply(lambda x: '{:.0f}'.format(x))

    # Convert all values within the island tract (near San Francisco) to NaN, as it is uninhabited 
    island_tract = '6075980401'
    merged_df.loc[merged_df['census_tract'] == island_tract, merged_df.columns != 'census_tract'] = np.nan

    # Check if all entries within the island tract are NaN
    island_row = merged_df.loc[merged_df['census_tract'] == island_tract]
    if island_row.iloc[:, 1:].isnull().all().all():
        print(f"All entries within the island tract ({island_tract}) are NaN.")
    else:
        print(f"Some entries within the island tract ({island_tract}) are not NaN.")

    # Selecting only numeric columns
    numeric_df = merged_df.select_dtypes(include=[np.number])

    # Counting infinite values
    num_infinite = np.isinf(numeric_df).sum().sum()

    print(f"\nNumber of infinite entries in the DataFrame: {num_infinite}")
    print('Replacing infinite entries (if any) with NaN')

    # Replace infinite values with NaN
    merged_df.replace([np.inf, -np.inf], np.nan, inplace=True)

    # Selecting only numeric columns
    numeric_df = merged_df.select_dtypes(include=[np.number])

    # Counting infinite values
    num_infinite = np.isinf(numeric_df).sum().sum()
    print(f"Number of infinite entries in the DataFrame: {num_infinite}")

    print(f"\nFile processing complete, dataframe will now be saved as a .csv")
    # Save the merged DataFrame to a CSV file
    merged_df.to_csv(merged_output_file, index=False)

    print(f"Processed CSV saved as {merged_output_file}")


In [5]:
# Society & Economy usage:
prefix = 'society_'  # You can change this to any prefix you need
output_folder = 'society_folder'
meta_csv = r'C:/Users/jespi/eagle/carb-climate-index-9/metadata/Full Data Pipeline Notes - 4_ Calculate Metric.csv'
merged_output_file = 'concatenate_society_economy_metrics.csv'

process_domain_csv_files(prefix, output_folder, meta_csv, merged_output_file)

Processed and saved 33 CSV files.
Deleted 67 local non-relevant CSV files.
All entries within the island tract (6075980401) are NaN.

Number of infinite entries in the DataFrame: 67

Number of infinite entries in the DataFrame: 0
Merged CSV saved as concatenate_society_economy_metrics.csv


In [6]:
processed_society_df = pd.read_csv('concatenate_society_economy_metrics.csv')
processed_society_df

Unnamed: 0,census_tract,percent_population_ambulatory_disabilities_vulnerable,percent_total_pop_american_indian_alaska_native_vulnerable,blood and organ banks per 10000 people_resilient,percent_population_cognitive_disabilities_vulnerable,est_gini_index_vulnerable,hachman_index_vulnerable,est_median_income_dollars_resilient,percent_children_household_financial_assistance_vulnerable,percent_1miurban_10mirural_vulnerable,...,housing_burden_percent_2019_vulnerable,sum_imp_water_bodies_vulnerable,linguistic_isolation_percent_2019_vulnerable,low_birth_weight_percent_2019_vulnerable,Percent of households without air conditioning_vulnerable,Percent of population employed and aged > 16 working outdoors_vulnerable,poverty_percent_2019_vulnerable,unemployment_percent_2019_vulnerable,"Number of Violent Crimes per 10,000 Population_vulnerable",percent_population_without_health_insurance_vulnerable
0,6001400100,4.680330,0.0,0.042072,5.170005,0.4228,0.926769,234236,15.885023,58.256725,...,11.200000,2.0,1.200000,3.850000,64.306645,0.919842,10.400000,4.823653,72.674237,0.275314
1,6001400200,4.005589,0.4,0.042072,1.904253,0.4084,0.926769,225500,0.000000,30.832177,...,4.000000,0.0,0.000000,4.050000,64.306645,1.595745,10.600000,3.000000,72.674237,0.838379
2,6001400300,2.295782,0.5,0.042072,3.529687,0.4615,0.926769,164000,3.184713,30.832177,...,8.900000,0.0,8.000000,3.780000,64.306645,1.152702,10.300000,3.900000,72.674237,1.655099
3,6001400400,3.646564,0.5,0.042072,3.165358,0.5063,0.926769,158836,14.240170,30.832177,...,14.800000,0.0,0.900000,4.440000,64.306645,2.146272,21.100000,2.500000,72.674237,2.244039
4,6001400500,3.367941,0.1,0.042072,6.219631,0.4571,0.926769,95078,4.435484,30.832177,...,14.800000,0.0,1.700000,3.640000,64.306645,3.771252,21.900000,3.800000,72.674237,4.482147
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9124,6115040902,0.535332,0.2,,0.544662,0.4252,0.784133,54265,13.194444,0.000000,...,35.200000,0.0,5.916667,4.330000,1.725293,6.049022,28.300000,3.400000,49.063183,1.445396
9125,6115041001,8.387800,0.3,,3.745976,0.5024,0.784133,67321,0.000000,37.636696,...,19.722222,3.0,5.916667,4.863333,1.725293,11.883790,48.111111,9.388889,49.063183,2.777778
9126,6115041002,9.160082,2.6,,9.702797,0.4875,0.784133,102534,0.000000,37.636696,...,19.722222,3.0,5.916667,4.863333,1.725293,11.883790,48.111111,9.388889,49.063183,15.510682
9127,6115041101,16.826923,1.1,,9.405941,0.4072,0.784133,37018,44.331984,37.636696,...,19.722222,3.0,5.916667,4.863333,1.725293,11.883790,48.111111,9.388889,49.063183,7.430070


## Call outlier function to handle outliers:
Max = 3 x 75th percentile
Min = -3 x 25th percentile

Outliers beyond these values are set to metric value closest to the fence
Baum et al. 1970

In [7]:
# Handle outliers
handle_outlier_csv = 'no_outlier_society_economy_metrics.csv'
output = handle_outliers(processed_society_df, handle_outlier_csv)
print(f"Processed and saved {handle_outlier_csv} with outlier handling.")

For column percent_population_ambulatory_disabilities_vulnerable:
  Q1 (25th percentile): 3.4249678882757117
  Q3 (75th percentile): 7.035102433873741
  IQR: 3.610134545598029
  Max fence: 21.105307301621224
  Min fence: -10.274903664827136
Outliers detected in column 'percent_population_ambulatory_disabilities_vulnerable':
census_tract: 6013351105, value: 21.223814773980155
census_tract: 6025012400, value: 21.39219015280136
census_tract: 6033000601, value: 22.210690192008304
census_tract: 6037125322, value: 21.15093387178193
census_tract: 6037206301, value: 31.129864041256443
census_tract: 6037206302, value: 25.71554381329811
census_tract: 6037206303, value: 28.146536089952846
census_tract: 6037302201, value: 22.377807133421403
census_tract: 6037310701, value: 22.231614539306847
census_tract: 6037980014, value: 48.888888888888886
census_tract: 6037980021, value: 23.52941176470588
census_tract: 6039000111, value: 21.643109540636043
census_tract: 6059021813, value: 84.61538461538461
cen

In [8]:
# viewing new 'fenced' outliers
no_outlier_society_economy_metrics = pd.read_csv('no_outlier_society_economy_metrics.csv')
no_outlier_society_economy_metrics

Unnamed: 0,census_tract,percent_population_ambulatory_disabilities_vulnerable,percent_total_pop_american_indian_alaska_native_vulnerable,blood and organ banks per 10000 people_resilient,percent_population_cognitive_disabilities_vulnerable,est_gini_index_vulnerable,hachman_index_vulnerable,est_median_income_dollars_resilient,percent_children_household_financial_assistance_vulnerable,percent_1miurban_10mirural_vulnerable,...,housing_burden_percent_2019_vulnerable,sum_imp_water_bodies_vulnerable,linguistic_isolation_percent_2019_vulnerable,low_birth_weight_percent_2019_vulnerable,Percent of households without air conditioning_vulnerable,Percent of population employed and aged > 16 working outdoors_vulnerable,poverty_percent_2019_vulnerable,unemployment_percent_2019_vulnerable,"Number of Violent Crimes per 10,000 Population_vulnerable",percent_population_without_health_insurance_vulnerable
0,6001400100,4.680330,0.0,0.042072,5.170005,0.4228,0.926769,234236.0,15.885023,58.256725,...,11.200000,2.0,1.200000,3.850000,64.306645,0.919842,10.400000,4.823653,72.674237,0.275314
1,6001400200,4.005589,0.4,0.042072,1.904253,0.4084,0.926769,225500.0,0.000000,30.832177,...,4.000000,0.0,0.000000,4.050000,64.306645,1.595745,10.600000,3.000000,72.674237,0.838379
2,6001400300,2.295782,0.5,0.042072,3.529687,0.4615,0.926769,164000.0,3.184713,30.832177,...,8.900000,0.0,8.000000,3.780000,64.306645,1.152702,10.300000,3.900000,72.674237,1.655099
3,6001400400,3.646564,0.5,0.042072,3.165358,0.5063,0.926769,158836.0,14.240170,30.832177,...,14.800000,0.0,0.900000,4.440000,64.306645,2.146272,21.100000,2.500000,72.674237,2.244039
4,6001400500,3.367941,0.1,0.042072,6.219631,0.4571,0.926769,95078.0,4.435484,30.832177,...,14.800000,0.0,1.700000,3.640000,64.306645,3.771252,21.900000,3.800000,72.674237,4.482147
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9124,6115040902,0.535332,0.2,,0.544662,0.4252,0.784133,54265.0,13.194444,0.000000,...,35.200000,0.0,5.916667,4.330000,1.725293,6.049022,28.300000,3.400000,49.063183,1.445396
9125,6115041001,8.387800,0.3,,3.745976,0.5024,0.784133,67321.0,0.000000,37.636696,...,19.722222,3.0,5.916667,4.863333,1.725293,11.883790,48.111111,9.388889,49.063183,2.777778
9126,6115041002,9.160082,2.6,,9.702797,0.4875,0.784133,102534.0,0.000000,37.636696,...,19.722222,3.0,5.916667,4.863333,1.725293,11.883790,48.111111,9.388889,49.063183,15.510682
9127,6115041101,16.826923,1.1,,9.405941,0.4072,0.784133,37018.0,44.331984,37.636696,...,19.722222,3.0,5.916667,4.863333,1.725293,11.883790,48.111111,9.388889,49.063183,7.430070


## Metrics are now min-max standardized on 0.01 to 0.99 scale

In [39]:
def min_max_standardize2(df, cols_to_run_on, tolerance=1e-9):
    '''
    Calculates min and max values for specified columns, then calculates
    min-max standardized values with a tolerance for floating-point precision errors.

    Parameters
    ----------
    df: DataFrame
        Input dataframe   
    cols_to_run_on: list
        List of columns to calculate min, max, and standardize
    tolerance: float
        Tolerance value for checking if standardized values are within the [0, 1] range
    '''
    all_good = True  # Flag to track if all columns are within range

    for col in cols_to_run_on:
        max_value = df[col].max()
        min_value = df[col].min()

        # Print out the min and max for the current column
        print(f"Processing column '{col}': min={min_value}, max={max_value}")
        
        # Get min-max values, standardize, and add columns to df
        prefix = col  # Using the column name as the prefix for new columns
        df[f'{prefix}_min'] = min_value
        df[f'{prefix}_max'] = max_value
        df[f'{prefix}_min_max_standardized'] = ((df[col] - min_value) / (max_value - min_value))

        # Check if the new standardized column values are between 0 and 1, ignoring NaN values
        standardized_col = df[f'{prefix}_min_max_standardized']
        is_within_range = standardized_col.dropna().between(-tolerance, 1 + tolerance)

        if not is_within_range.all():
            all_good = False
            out_of_bounds = standardized_col[~is_within_range]
            print(f"Warning: Column '{prefix}_min_max_standardized' has values outside the [0, 1] range (considering tolerance).")
            print(out_of_bounds)

        # Drop the original column
        df.drop(columns=[col], inplace=True)
    
    # Print a summary at the end
    if all_good:
        print("All standardized columns are within the [0, 1] range (considering tolerance).")
    else:
        print("Some columns have values outside the [0, 1] range.")

    return df


In [40]:
# standardizing our df
min_max_metrics = no_outlier_society_economy_metrics.copy()
columns_to_process = [col for col in min_max_metrics.columns if col != 'census_tract']
min_max_metrics = min_max_standardize2(min_max_metrics, columns_to_process)

min_max_metrics.head()

Processing column 'percent_population_ambulatory_disabilities_vulnerable': min=0.0, max=21.105307301621224
Processing column 'percent_total_pop_american_indian_alaska_native_vulnerable': min=0.0, max=3.9
Processing column 'blood and organ banks per 10000 people_resilient': min=0.0205804764544943, max=0.151170294281322
Processing column 'percent_population_cognitive_disabilities_vulnerable': min=0.0, max=17.051712368848662
Processing column 'est_gini_index_vulnerable': min=0.0687, max=0.7716
Processing column 'hachman_index_vulnerable': min=0.1868562993558402, max=0.95573004786745
Processing column 'est_median_income_dollars_resilient': min=9417.0, max=249500.0
Processing column 'percent_children_household_financial_assistance_vulnerable': min=0.0, max=100.0
Processing column 'percent_1miurban_10mirural_vulnerable': min=0.0, max=92.65066239578704
Processing column 'health and personal care stores per 10000 people_resilient': min=0.9080796385843036, max=3.782434374763598
Processing colum

Unnamed: 0,census_tract,percent_population_ambulatory_disabilities_vulnerable_min,percent_population_ambulatory_disabilities_vulnerable_max,percent_population_ambulatory_disabilities_vulnerable_min_max_standardized,percent_total_pop_american_indian_alaska_native_vulnerable_min,percent_total_pop_american_indian_alaska_native_vulnerable_max,percent_total_pop_american_indian_alaska_native_vulnerable_min_max_standardized,blood and organ banks per 10000 people_resilient_min,blood and organ banks per 10000 people_resilient_max,blood and organ banks per 10000 people_resilient_min_max_standardized,...,poverty_percent_2019_vulnerable_min_max_standardized,unemployment_percent_2019_vulnerable_min,unemployment_percent_2019_vulnerable_max,unemployment_percent_2019_vulnerable_min_max_standardized,"Number of Violent Crimes per 10,000 Population_vulnerable_min","Number of Violent Crimes per 10,000 Population_vulnerable_max","Number of Violent Crimes per 10,000 Population_vulnerable_min_max_standardized",percent_population_without_health_insurance_vulnerable_min,percent_population_without_health_insurance_vulnerable_max,percent_population_without_health_insurance_vulnerable_min_max_standardized
0,6001400100,0.0,21.105307,0.221761,0.0,3.9,0.0,0.02058,0.15117,0.164571,...,0.103753,0.0,23.532482,0.204978,21.355248,85.732652,0.797158,0.0,29.677171,0.009277
1,6001400200,0.0,21.105307,0.189791,0.0,3.9,0.102564,0.02058,0.15117,0.164571,...,0.10596,0.0,23.532482,0.127483,21.355248,85.732652,0.797158,0.0,29.677171,0.02825
2,6001400300,0.0,21.105307,0.108777,0.0,3.9,0.128205,0.02058,0.15117,0.164571,...,0.102649,0.0,23.532482,0.165728,21.355248,85.732652,0.797158,0.0,29.677171,0.05577
3,6001400400,0.0,21.105307,0.172779,0.0,3.9,0.128205,0.02058,0.15117,0.164571,...,0.221854,0.0,23.532482,0.106236,21.355248,85.732652,0.797158,0.0,29.677171,0.075615
4,6001400500,0.0,21.105307,0.159578,0.0,3.9,0.025641,0.02058,0.15117,0.164571,...,0.230684,0.0,23.532482,0.161479,21.355248,85.732652,0.797158,0.0,29.677171,0.15103


In [17]:
over_1 = min_max_metrics[min_max_metrics['hospitals per 10000 people_resilient_min_max_standardized'] > 1]
over_1

Unnamed: 0,census_tract,percent_population_ambulatory_disabilities_vulnerable_min,percent_population_ambulatory_disabilities_vulnerable_max,percent_population_ambulatory_disabilities_vulnerable_min_max_standardized,percent_total_pop_american_indian_alaska_native_vulnerable_min,percent_total_pop_american_indian_alaska_native_vulnerable_max,percent_total_pop_american_indian_alaska_native_vulnerable_min_max_standardized,blood and organ banks per 10000 people_resilient_min,blood and organ banks per 10000 people_resilient_max,blood and organ banks per 10000 people_resilient_min_max_standardized,...,poverty_percent_2019_vulnerable_min_max_standardized,unemployment_percent_2019_vulnerable_min,unemployment_percent_2019_vulnerable_max,unemployment_percent_2019_vulnerable_min_max_standardized,"Number of Violent Crimes per 10,000 Population_vulnerable_min","Number of Violent Crimes per 10,000 Population_vulnerable_max","Number of Violent Crimes per 10,000 Population_vulnerable_min_max_standardized",percent_population_without_health_insurance_vulnerable_min,percent_population_without_health_insurance_vulnerable_max,percent_population_without_health_insurance_vulnerable_min_max_standardized


In [9]:
# standardizing our df
min_max_metrics = no_outlier_society_economy_metrics.copy()
columns_to_process = [col for col in min_max_metrics.columns if col != 'census_tract']
min_max_metrics = min_max_standardize(min_max_metrics, columns_to_process)

min_max_metrics.head()

Unnamed: 0,census_tract,percent_population_ambulatory_disabilities_vulnerable_min,percent_population_ambulatory_disabilities_vulnerable_max,percent_population_ambulatory_disabilities_vulnerable_min_max_standardized,percent_total_pop_american_indian_alaska_native_vulnerable_min,percent_total_pop_american_indian_alaska_native_vulnerable_max,percent_total_pop_american_indian_alaska_native_vulnerable_min_max_standardized,blood and organ banks per 10000 people_resilient_min,blood and organ banks per 10000 people_resilient_max,blood and organ banks per 10000 people_resilient_min_max_standardized,...,poverty_percent_2019_vulnerable_min_max_standardized,unemployment_percent_2019_vulnerable_min,unemployment_percent_2019_vulnerable_max,unemployment_percent_2019_vulnerable_min_max_standardized,"Number of Violent Crimes per 10,000 Population_vulnerable_min","Number of Violent Crimes per 10,000 Population_vulnerable_max","Number of Violent Crimes per 10,000 Population_vulnerable_min_max_standardized",percent_population_without_health_insurance_vulnerable_min,percent_population_without_health_insurance_vulnerable_max,percent_population_without_health_insurance_vulnerable_min_max_standardized
0,6001400100,0.0,21.105307,0.221761,0.0,3.9,0.0,0.02058,0.15117,0.164571,...,0.103753,0.0,23.532482,0.204978,21.355248,85.732652,0.797158,0.0,29.677171,0.009277
1,6001400200,0.0,21.105307,0.189791,0.0,3.9,0.102564,0.02058,0.15117,0.164571,...,0.10596,0.0,23.532482,0.127483,21.355248,85.732652,0.797158,0.0,29.677171,0.02825
2,6001400300,0.0,21.105307,0.108777,0.0,3.9,0.128205,0.02058,0.15117,0.164571,...,0.102649,0.0,23.532482,0.165728,21.355248,85.732652,0.797158,0.0,29.677171,0.05577
3,6001400400,0.0,21.105307,0.172779,0.0,3.9,0.128205,0.02058,0.15117,0.164571,...,0.221854,0.0,23.532482,0.106236,21.355248,85.732652,0.797158,0.0,29.677171,0.075615
4,6001400500,0.0,21.105307,0.159578,0.0,3.9,0.025641,0.02058,0.15117,0.164571,...,0.230684,0.0,23.532482,0.161479,21.355248,85.732652,0.797158,0.0,29.677171,0.15103


## Now isolating for census tract and standardized columns exclusively

In [None]:
words = ['census_tract','standardized']
selected_columns = []
for word in words:
    selected_columns.extend(min_max_metrics.columns[min_max_metrics.columns.str.contains(word)].tolist())

min_max_standardized_society_economy_metrics_df = min_max_metrics[selected_columns]
pd.set_option('display.max_columns', None)

min_max_standardized_society_economy_metrics_df

## Now need to refactor metrics where high values indicate resistance
* subtract resilient columns values from 1

In [None]:
# Select columns with 'resilient' in their names
resilient_columns = [col for col in min_max_standardized_society_economy_metrics_df.columns if 'resilient' in col]

# Use .loc to ensure you're modifying the DataFrame correctly
min_max_standardized_society_economy_metrics_df.loc[:, resilient_columns] = 1 - min_max_standardized_society_economy_metrics_df.loc[:, resilient_columns]

In [None]:
min_max_standardized_society_economy_metrics_df

In [None]:
# testing to see how to average all df columns together barring census tract to calculate indicator score
testing = pd.DataFrame()
testing['census_tract'] = min_max_standardized_society_economy_metrics_df['census_tract']
testing['average'] = min_max_standardized_society_economy_metrics_df.drop(columns=['census_tract']).mean(axis=1)

testing

In [None]:
min_max_standardized_society_economy_metrics_df.columns

## Dictionary to associate a keyword within the df column with its indicator

In [None]:
metric_to_indicator_dict = {
    'vulnerable_populations' : ['asthma', 
                                'cardiovascular_disease', 
                                'birth_weight',
                                'education',
                                'linguistic',
                                'poverty', 
                                'unemployment',
                                'housing_burden',
                                'imp_water_bodies',
                                'homeless',
                                'health_insurance',
                                'ambulatory_disabilities',
                                'cognitive_disabilities',
                                'air conditioning',
                                'Violent Crimes',
                                'working outdoors', 
                                '1miurban_10mirural',
                                'american_indian',
                                'over_65',
                                'under_5',
                                'household_financial_assistance'],

            'social_services' : ['blood',
                                 'hospitals',
                                 'care store',
                                 'engineering',
                                 'specialty trade',
                                 'repair',
                                 'mental_shortage',
                                 'primary_care',
                                 'narcotic'],

            'economic_health' : ['gini',
                                 'median_income',
                                 'hachman'] 
}

# Loop to go through df columns and average metrics that belong within an indicator based off of the metric to indicator dictionary

In [None]:
# Create an empty DataFrame to store the results
averaged_indicators_society_economy = pd.DataFrame()

# Iterate through the items of the dictionary
for indicator, keywords in metric_to_indicator_dict.items():
    # Filter columns based on the keyword values for the current indicator
    indicator_columns = [col for col in min_max_standardized_society_economy_metrics_df.columns if any(keyword in col for keyword in keywords)]
    
    # Compute the average of the selected columns
    averaged_values = min_max_standardized_society_economy_metrics_df[indicator_columns].mean(axis=1)
    print(indicator_columns)
    print(len(indicator_columns))
    # Store the averaged values in the result DataFrame with the indicator name as the column name
    averaged_indicators_society_economy[indicator] = averaged_values
    
# Include the 'census_tract' column from the original DataFrame
averaged_indicators_society_economy['census_tract'] = min_max_standardized_society_economy_metrics_df['census_tract']
# Reorder the columns to have 'census_tract' as the first column

averaged_indicators_society_economy = averaged_indicators_society_economy[['census_tract'] + [col for col in averaged_indicators_society_economy.columns if col != 'census_tract']]
# Show the resulting DataFrame
print(averaged_indicators_society_economy)


## Sum all the non-census tract columns together to calculate the domain score

In [None]:
# Create a new DataFrame to store the results
summed_indicators_society_economy = pd.DataFrame()

# Calculate the sum of all columns except 'census_tract'
columns_to_process = [col for col in averaged_indicators_society_economy.columns if col != 'census_tract']

summed_values = averaged_indicators_society_economy[columns_to_process].sum(axis=1)

# Store the summed values in the result DataFrame with the column name 'summed_indicators_society_economy_domain'
summed_indicators_society_economy['summed_indicators_society_economy_domain'] = summed_values

# Include the 'census_tract' column from the original DataFrame
summed_indicators_society_economy['census_tract'] = averaged_indicators_society_economy['census_tract']

# Reorder the columns to have 'census_tract' as the first column
summed_indicators_society_economy = summed_indicators_society_economy[['census_tract', 'summed_indicators_society_economy_domain']]

# Show the resulting DataFrame
print(summed_indicators_society_economy)
print('min value:', summed_indicators_society_economy.summed_indicators_society_economy_domain.min())
print('max value:', summed_indicators_society_economy.summed_indicators_society_economy_domain.max())

## Min-max standardize the summed columns

In [None]:
min_max_domain = summed_indicators_society_economy.copy()


columns_to_process = [col for col in min_max_domain.columns if col != 'census_tract']

min_max_domain = min_max_standardize(min_max_domain, columns_to_process)
print(len(min_max_domain))
min_max_domain.head()

## Isolate to census tract and summed standardized columns, and rename tract to GEOID for merging
* add a zero at the beginning of the GEOID to match census tract that will be merged

In [None]:
min_max_standardized_society_economy_domain = min_max_domain[['census_tract', 'summed_indicators_society_economy_domain_min_max_standardized']]
min_max_standardized_society_economy_domain = min_max_standardized_society_economy_domain.dropna(subset=['census_tract'])
min_max_standardized_society_economy_domain = min_max_standardized_society_economy_domain.rename(columns={'census_tract':'GEOID'})
min_max_standardized_society_economy_domain['GEOID'] = min_max_standardized_society_economy_domain['GEOID'].apply(lambda x: '0' + str(x))
min_max_standardized_society_economy_domain['GEOID'] = min_max_standardized_society_economy_domain['GEOID'].astype(str).apply(lambda x: x.rstrip('0').rstrip('.') if '.' in x else x)

# Print the DataFrame to check the 'GEOID' column
print(len(min_max_standardized_society_economy_domain))
min_max_standardized_society_economy_domain

## Calling census tract shape files so we can reproject and map our data

In [None]:
census_shp_dir = "s3://ca-climate-index/0_map_data/2021_tiger_census_tract/2021_ca_tract/"
ca_boundaries = gpd.read_file(census_shp_dir)
ca_boundaries['GEOID'] = ca_boundaries['GEOID'].astype(str)
ca_boundaries.head()

## Merge the df and census tracts and convert the geometry to our uniformly used coordinate reference system (4269)
* Map it!

In [None]:
# Merge the DataFrames
df2 = min_max_standardized_society_economy_domain.merge(ca_boundaries, on='GEOID')

# Convert to GeoDataFrame with the correct CRS
df2 = gpd.GeoDataFrame(df2, geometry='geometry', crs=4269)

# Check for invalid geometries
invalid_geometries = df2[~df2['geometry'].is_valid]
print("Number of invalid geometries:", len(invalid_geometries))

# Check if there are any geometries left
if len(df2) == 0:
    print("No valid geometries remaining. Cannot plot.")
else:
    # Set up the figure
    fig, ax = plt.subplots(1, 1, figsize=(7, 10), layout='compressed')

    # Plot the data
    plot = df2.plot(column='summed_indicators_society_economy_domain_min_max_standardized', 
             ax=ax, 
             vmin=0, vmax=1, 
             legend=True, 
             cmap='RdYlBu_r',
             legend_kwds={'label': 'Vulnerability (larger values are more vulnerable)', 'orientation': 'horizontal', 'shrink': 1.0, 'pad': 0.03})
    
     # Set title
    ax.set_title('California Vulnerability - Society & Economy Domain', fontsize = 16.5)

    # Display the plot
    plt.show()


In [None]:
df2.to_csv('society_economy_index_data.csv', index=False)

In [None]:
bucket_name = 'ca-climate-index'
directory = '3_fair_data/domain_standardized_data'

upload_csv_aws(['society_economy_index_data.csv'], bucket_name, directory)

## Selecting counties to be mapped, by the COUNTYFP number
* manually enter the names of the counties linked with their number

In [None]:
list_of_counties = [
    '037',
    '071',
    '065',
    '029',
    '111']

df2 = min_max_standardized_society_economy_domain.merge(ca_boundaries, on='GEOID')

# Filtering rows where COUNTYFP is in the list_of_counties
df2_filtered = df2[df2['COUNTYFP'].isin(list_of_counties)]

# Convert to GeoDataFrame with the correct CRS
df2_filtered = gpd.GeoDataFrame(df2_filtered, geometry='geometry', crs=4269)

# Check for invalid geometries
invalid_geometries = df2_filtered[~df2_filtered['geometry'].is_valid]
print("Number of invalid geometries:", len(invalid_geometries))

# Group by COUNTYFP and take the geometry of the first row in each group
county_boundaries = df2_filtered.dissolve(by='COUNTYFP')['geometry']

# Dictionary mapping county codes to labels
county_labels = {
    '037': 'Los \n Angeles',
    '071': 'San Bernardino',
    '065': 'Riverside',
    '029': 'Kern',
    '111': 'Ventura'
}

# Set up the figure
fig, ax = plt.subplots(1, 1, figsize=(6, 12), layout='compressed')

# Plot county boundaries
county_boundaries.boundary.plot(ax=ax, linewidth=0.7, edgecolor='black')

# Plot the data
df2_filtered.plot(column='summed_indicators_society_economy_domain_min_max_standardized', 
         ax=ax, 
         vmin=0, vmax=1, 
         legend=True, 
         cmap='RdYlBu_r', 
         legend_kwds={'label': 'Vulnerability (larger values are more vulnerable)', 'orientation': 'horizontal', 'shrink': 0.9, 'pad': -0.3})

# Add county labels
for county_code, label in county_labels.items():
    centroid = county_boundaries[county_code].centroid
    ax.text(centroid.x, centroid.y, label, weight='light', fontsize=9, ha='center', va='baseline')

ax.set_title('Society & Economy Domain - LA & Surrounding Areas', fontsize=16)

# Display the plot
plt.show()