## Cal-CRAI Metric Calculation
Domain: Society & Economy \
Indicator: Vulnerable populations

This notebook calculates nine metrics, sourced from CalEnviroScreen:
* Metric 1: Age-adjusted emergency department visits for asthma per 10,000 people
* Metric 2: Age-adjusted emergency department visits for myocardial infarction per 10,000 people
* Metric 3: Percentage of live, singleton births < 5.5 pounds (non-twin, including premature)
* Metric 4: Percentage of population 25 and older with less than a high school education
* Metric 5: Percentage of households where all members 14 and older have some difficulty speaking English
* Metric 6: Percentage of population living below 2x federal poverty level
* Metric 7: Percentage of population > 16 years old unemployed and eligible for the workforce
* Metric 8: Percentage of households which are low-income and housing-burdened
* Metric 9: Percentile of drinking water score

In [1]:
import pandas as pd
import os
import sys
import math
import numpy as np
import geopandas as gpd

# suppress pandas purely educational warnings
from warnings import simplefilter
simplefilter(action="ignore", category=pd.errors.PerformanceWarning)

sys.path.append(os.path.expanduser('../../'))
from scripts.utils.file_helpers import pull_csv_from_directory, upload_csv_aws, filter_counties
from scripts.utils.write_metadata import append_metadata

In [4]:
# Load the main CalEnviroScreen workbook directly from S3.
# The converter left-pads 'Census Tract' with zeros to 11 characters as it is read.
enviroscreen_excel = 's3://ca-climate-index/1_pull_data/society_economy/vulnerable_populations/ca_enviro_screen/calenviroscreen.xlsx'
enviroscreen_data = pd.read_excel(
    enviroscreen_excel,
    converters={'Census Tract': '{:0>11}'.format},
)

In [None]:
# Display the loaded CalEnviroScreen table for visual inspection
enviroscreen_data

## Now we pull and inspect a separate CalEnviroScreen datafile for drinking water percentiles:
https://oehha.ca.gov/calenviroscreen/indicator/drinking-water-contaminants
* it has the same number of census tracts (unsurprisingly), so we isolate the relevant columns and merge it with the
rest of the CalEnviroScreen data

In [None]:
# pull .xlsx from aws
# The S3 path and the loaded table get separate names so that
# `enviroscreen_water_data` is always a DataFrame, never a path string.
enviroscreen_water_excel = 's3://ca-climate-index/1_pull_data/society_economy/vulnerable_populations/ca_enviro_screen/ces4finaldrinkingwaterdatabytract.xlsx'
enviroscreen_water_data = pd.read_excel(enviroscreen_water_excel)
# Row count, for comparison against the main CalEnviroScreen table
print(len(enviroscreen_water_data))

In [None]:
# List the drinking-water table's column labels to find the ones needed for the merge
enviroscreen_water_data.columns

In [7]:
# Prepare the drinking-water table for merging with the base data:
#   1. rename the tract column to 'Census Tract',
#   2. convert tract ids to strings left-padded with zeros to 11 characters,
#   3. keep only the tract id and the percentile column.
enviroscreen_water_data = (
    enviroscreen_water_data
    .rename(columns={'CensusTract': 'Census Tract'})
    .assign(**{'Census Tract': lambda frame: frame['Census Tract'].astype(str).str.zfill(11)})
    [['Census Tract', 'Drinking Water Score Percentile']]
)

In [None]:
# Left-join the drinking-water percentile onto the base CalEnviroScreen rows,
# keeping every row of the base table.
merged_enviroscreen_data = pd.merge(
    left=enviroscreen_data,
    right=enviroscreen_water_data,
    how='left',
    on='Census Tract',
)
merged_enviroscreen_data

## The data is using older tract data, so we will join it with 2017 Tract data first

In [9]:
# Read the 2017 California TIGER tract layer, rename GEOID to 'Census Tract',
# and keep only the tract id and geometry.
old_census_path = "s3://ca-climate-index/0_map_data/tl_2017_06_tract/"
ca_old = (
    gpd.read_file(old_census_path)
    .rename(columns={"GEOID": "Census Tract"})
    [["Census Tract", "geometry"]]
)

In [10]:
# Attach the 2017 tract geometries to the CalEnviroScreen rows (inner join on tract id)
# and make sure the result is a GeoDataFrame using the 'geometry' column.
old_tract_calenviroscreen_data = gpd.GeoDataFrame(
    pd.merge(left=ca_old, right=merged_enviroscreen_data, on="Census Tract"),
    geometry="geometry",
)

## Now call in 2021 census data

In [None]:
# Read the 2021 California TIGER tract layer
census_shp_dir = "s3://ca-climate-index/0_map_data/2021_tiger_census_tract/2021_ca_tract/"
ca_boundaries = gpd.read_file(census_shp_dir)

# Column labels as loaded, captured before the rename below
column_names = ca_boundaries.columns

# Rename GEOID so the tract id column matches the CalEnviroScreen data,
# then keep only the geometry and the tract id
ca_boundaries = ca_boundaries.rename(columns={'GEOID': 'Census Tract'})[["geometry", "Census Tract"]]
ca_boundaries

In [None]:
# Reproject both layers to EPSG:3857 so they share one projected coordinate system
# for the nearest-tract join below.
# NOTE(review): EPSG:3857 (Web Mercator) is not an area- or distance-preserving
# projection, so distances measured in it are approximate.
old_tract_calenviroscreen_data = old_tract_calenviroscreen_data.to_crs(epsg=3857)
ca_boundaries = ca_boundaries.to_crs(epsg=3857)
# Number of distinct tract ids in the 2021 layer
print(ca_boundaries['Census Tract'].nunique(dropna=False))

In [None]:
# Flag the 2021 tracts whose id also appears in the CalEnviroScreen (2017 tract) layer;
# these are treated as unchanged. Ids are compared numerically.
# The mask is aligned to the rows of the new (2021) boundaries.
unchanged_tracts_ca = (
    ca_boundaries['Census Tract']
    .pipe(pd.to_numeric)
    .isin(old_tract_calenviroscreen_data['Census Tract'].pipe(pd.to_numeric))
)
ca_boundaries.loc[unchanged_tracts_ca]

In [None]:
# now find the indices which correspond to the original data
unchanged_tracts_old = pd.to_numeric(old_tract_calenviroscreen_data['Census Tract']).isin(pd.to_numeric(ca_boundaries['Census Tract']))

# .copy() gives an independent frame, so the column assignment below does not
# write into a filtered slice of old_tract_calenviroscreen_data
# (which triggers pandas' SettingWithCopyWarning).
original_df = old_tract_calenviroscreen_data[unchanged_tracts_old].copy()

# Keep tract ids as 11-character, zero-padded strings. The previous '{0:>13}'
# format right-aligned the id in a 13-character field, i.e. it prepended two
# spaces, which made these ids differ from the unpadded ids used elsewhere.
original_df["Census Tract"] = original_df["Census Tract"].astype(str).str.strip().str.zfill(11)
original_df

In [None]:
# now we only have to join the remaining tracts:
# each 2021 tract without an id match takes the data of the nearest old tract
# (within `max_distance`, in units of the layers' CRS).
mapped_df = gpd.sjoin_nearest(
    ca_boundaries[~unchanged_tracts_ca],
    old_tract_calenviroscreen_data[~unchanged_tracts_old],
    how="inner",
    distance_col="distances",
    max_distance=5000,
    # Both frames carry a 'Census Tract' column, so geopandas suffixes them.
    # The suffixes are spelled out here because the rename below depends on them.
    lsuffix="left",
    rsuffix="right",
)
# Keep the *new* (2021, left-hand) tract id as 'Census Tract'.
# The previous rename targeted 'Census Tract_1', a column sjoin_nearest never
# produces, so the tract id was dropped below and these rows lost their id.
mapped_df = mapped_df.rename(columns={'Census Tract_left': 'Census Tract'})
# remove unnecessary columns (old tract id, join index, distances, ...) so the
# frame has the same columns as original_df
mapped_df = mapped_df.drop(
    columns=[col for col in mapped_df.columns if col not in original_df.columns]
)
mapped_df

In [None]:
# Stack the id-matched (unchanged) tracts on top of the spatially re-mapped ones
joined_df = pd.concat(objs=[original_df, mapped_df])
joined_df

In [None]:
# Tract id plus the nine CalEnviroScreen columns used as metrics
metric_enviroscreen_data = merged_enviroscreen_data.loc[:, [
    'Census Tract',
    'Asthma',
    'Low Birth Weight',
    'Cardiovascular Disease',
    'Education',
    'Linguistic Isolation',
    'Poverty',
    'Unemployment',
    'Housing Burden',
    'Drinking Water Score Percentile',
]]

# Apply the same column selection to the tract-joined data
calenviroscreen_2019 = joined_df[metric_enviroscreen_data.columns]
calenviroscreen_2019

In [None]:
# Collapse to one row per census tract by averaging every metric column over
# all rows that share a tract id; the tract id stays a regular column.
calenviroscreen_2019 = calenviroscreen_2019.groupby('Census Tract', as_index=False).mean()
calenviroscreen_2019

## Adjust previously used dfs that contain county and population data and merge them back in to our data

In [None]:
# Tract-to-county lookup taken from the joined data, with the county column
# renamed to 'County'
tract_county = (
    joined_df[['Census Tract', 'California County']]
    .rename(columns={'California County': 'County'})
)
tract_county

In [None]:
# Tract-to-population lookup: tract id and 'Total Population' from the joined data
population = joined_df[['Census Tract', 'Total Population']]
population

In [21]:
# Merging data to get population and county data
#
# tract_county and population are row-for-row slices of joined_df, which can
# contain several rows for one tract (e.g. when a tract is matched to more
# than one old tract). Merging them as-is repeats each such tract once per
# combination of duplicates, so both lookups are reduced to one row per tract first.
# County: keep the first county listed for each tract.
county_by_tract = tract_county.drop_duplicates(subset='Census Tract')
# Population: average over the rows of a tract, mirroring how the metric
# columns are averaged per tract.
# NOTE(review): assumes 'Total Population' is numeric - confirm.
population_by_tract = population.groupby('Census Tract', as_index=False)['Total Population'].mean()

calenviroscreen_2019_final = pd.merge(county_by_tract, calenviroscreen_2019, on='Census Tract', how='right')
calenviroscreen_2019_final = pd.merge(calenviroscreen_2019_final, population_by_tract, on='Census Tract', how='left')
calenviroscreen_2019_final = calenviroscreen_2019_final.rename(columns={'Total Population': 'Total Population 2019'})

In [None]:
# Display the merged table (metrics plus county and 2019 population) for inspection
calenviroscreen_2019_final

In [23]:
# Write the intermediate table that the metric function reads back in.
# index=False keeps the row index from being written as an extra unnamed column.
calenviroscreen_2019_final.to_csv('society_calenviroscreen_metric.csv', index=False)

### Function Call
The function below creates a new DataFrame for each metric listed below. Some metrics are already expressed as percentages in the 2019 data, so those columns are renamed and retained as the Cal-CRAI metric. Each DataFrame is saved as a CSV named after its metric column:

Metrics that are already expressed as percentages in the 2019 data:
* % of live, singleton births < 5.5 pounds (non-twin, including premature)
* % of population 25 and older with less than a high school education
* % of households where all members 14 and older have some difficulty speaking English
* % of population living below 2x federal poverty level
* % of population > 16 years old unemployed and eligible for the workforce
* % of households which are low-income and housing-burdened

metric calculated as a percentile:
* Drinking Water Score Percentile 

The function can also calculate metric per 10,000 people for metrics that have a 'sum of' column rather than pre-baked in percentages:

metrics that have been calculated per 10,000 people:
* Age-adjusted emergency department visits for asthma per 10,000 people
* Age-adjusted emergency department visits for myocardial infarction per 10,000 people

Asthma and cardiovascular percentage can be calculated with 2019 and 2021 as the CalEnviroscreen values are 'Age-adjusted rate of emergency department visits for asthma/cardiovascular disease'

# Calling function for both metric calc types

In [24]:
@append_metadata
def calenviroscreen_metric_calc(input_csv, columns_to_process, calculate_per_10000=False, export=False, varname=""):
    '''
    Calculates the following metrics sourced from CalEnviroScreen:
    * % of live, singleton births < 5.5 pounds (non-twin, including premature)
    * % of population 25 and older with less than a high school education
    * % of households where all members 14 and older have some difficulty speaking English
    * % of population living below 2x federal poverty level
    * % of population > 16 years old unemployed and eligible for the workforce
    * % of households which are low-income and housing-burdened
    * Age-adjusted emergency department visits for asthma per 10,000 people
    * Age-adjusted emergency department visits for myocardial infarction per 10,000 people
    * Drinking Water Score Percentile

    Note
    --------
    Each of the above metrics is calculated separately; please see the corresponding 
    variable name (the same as the filename for this document) to know which one this 
    particular metadata document describes. 
  
    Methods
    --------
    Relevant data columns were isolated and renamed to align with Cal-CRAI metrics.
    Data was from older census tracts, so we merged it with 2017 California Tiger shape files first.
    The data was then set to Cal-CRAI standardized coordinate reference system.
    Data was then spatially joined to nearest 2021 census tract data.
    Extra tracts merged in were given the average value for each metric based on 
    the county they reside in.
    This averaging was also done for missing data in otherwise populated tracts.
    Metrics with % calculations were largely untouched as CalEnviroScreen data had
    those metrics calculated for 2019.
    Metrics with emergency department visits had their values adjusted to reflect
    number of visits per 10,000 people per tract with the 2019 population data.

    Parameters
    ------------
    input_csv: string
        path to the csv holding 'Census Tract', 'County', 'Total Population 2019'
        and the metric columns named in columns_to_process
    columns_to_process: list
        list of columns that contain desired metric data
    calculate_per_10000: boolean
        if true, adds columns with calculations for # of visits per 10,000 people
        if false, retains the column but renames to 2019
    varname: string
        Final metric name.
    export: bool
        If True, uploads file to S3.
        If False, just generates metadata file.

    Script
    ------
    cal_enviroscreen_metrics.ipynb

    Note
    ------
    This function assumes users have configured the AWS CLI such that their access key / 
    secret key pair are stored in ~/.aws/credentials. 
    See https://docs.aws.amazon.com/cli/latest/userguide/getting-started-install.html for guidance.
    '''
    merged_df = pd.read_csv(input_csv)

    # Names of every CSV written by this call, in processing order
    csv_file_names = []

    for column in columns_to_process:
        # Start each metric table from the shared identifier columns
        new_df = merged_df[['Census Tract', 'County', 'Total Population 2019']].copy()
        new_df = new_df.rename(columns={'Census Tract': 'census_tract'})

        base_name = column.replace(' ', '_')

        if calculate_per_10000:
            # Keep the source column and add a per-10,000-people column next to it.
            # The name is built inside this branch so it is always defined for the
            # column being processed (never missing or left over from a prior one).
            visits_name = base_name + '_related_ED_visits_2019'
            per_10000_name = visits_name.replace('_2019', '_per_10000_people_2019')
            new_df[column] = merged_df[column]
            new_df[per_10000_name] = (merged_df[column] / merged_df['Total Population 2019']) * 10000
            print('Data transformation: adding calculation columns for metrics with emergency department visits.')
        else:
            # The drinking-water percentile keeps its own name; every other
            # column is labelled as a 2019 percentage.
            if column == 'Drinking Water Score Percentile':
                new_column_name = base_name
            else:
                new_column_name = base_name + '_percent_2019'
            new_df[new_column_name.lower()] = merged_df[column]

        # The file name depends only on the source column name
        csv_filename = 'society_vulnerable_' + column.replace(' ', '_').replace('.','').lower() + '_metric.csv'

        # Save the DataFrame to CSV with all column labels lowercased
        new_df.columns = new_df.columns.str.lower()
        new_df.to_csv(csv_filename, index=False)

        print(f"Saved DataFrame to: {csv_filename}")
        csv_file_names.append(csv_filename)
        # Show the finished table in the notebook
        display(new_df)

    # Upload only when requested, and upload every file written above rather
    # than just the last one. An empty columns_to_process uploads nothing.
    # NOTE(review): assumes upload_csv_aws handles a list with several file
    # names (it is already called with a list) - confirm in file_helpers.
    if export and csv_file_names:
        bucket_name = 'ca-climate-index'
        directory = '3_fair_data/index_data'
        upload_csv_aws(csv_file_names, bucket_name, directory)
        # Report the upload only after it has actually been attempted
        for uploaded_name in csv_file_names:
            print(f'{uploaded_name} uploaded to AWS.')

In [None]:
input_csv = 'society_calenviroscreen_metric.csv'

# Source column -> metric variable name, for the metrics kept as-is
# (no per-10,000 calculation). Insertion order defines the processing order.
metrics_no_10000 = {
    'Low Birth Weight': 'society_calenviroscreen_birth_weight',
    'Education': 'society_calenviroscreen_education_below_HS',
    'Linguistic Isolation': 'society_calenviroscreen_nonenglish_speakers',
    'Poverty': 'society_calenviroscreen_below_poverty_level',
    'Unemployment': 'society_calenviroscreen_unemployment',
    'Housing Burden': 'society_calenviroscreen_housing_burdened',
    'Drinking Water Score Percentile': 'society_calenviroscreen_impaired_waterbodies',
}

# Parallel lists of columns and varnames
columns_to_process_no_10000 = list(metrics_no_10000.keys())
varnames_no_10000 = list(metrics_no_10000.values())

In [None]:
# Metrics that get an additional per-10,000-people column
varnames_10000 = [
    'society_calenviroscreen_emergency_dept_asthma_visits',
    'society_calenviroscreen_emergency_dept_myocardial_visits'
]
columns_to_process_per_10000 = [
    'Asthma',
    'Cardiovascular Disease'
]

# Run the metric function once per column: first the metrics kept as-is,
# then the emergency-department metrics with the per-10,000 calculation.
# Each call receives a single column together with its matching varname.
for columns, varnames, per_10000, wording in (
    (columns_to_process_no_10000, varnames_no_10000, False, 'without'),
    (columns_to_process_per_10000, varnames_10000, True, 'with'),
):
    for col, var in zip(columns, varnames):
        print(f"Processing {col} {wording} percentage calculation")
        calenviroscreen_metric_calc(input_csv, [col], calculate_per_10000=per_10000, export=False, varname=var)