## Cal-CRAI Metric Calculation
Domain: Society & Economy \
Indicator: Vulnerable Populations

This notebook calculates 7 metrics, all sourced from the American Community Survey:
* Metric 1: Ambulatory Difficulty: Percentage of population living with an ambulatory disability
* Metric 2: Cognitive Difficulty: Percentage of population living with a cognitive disability
* Metric 3: Financial Assistance: Percentage of population living in a household with Supplemental Security Income (SSI), cash public assistance income, or Food Stamps/SNAP in the last 12 months
* Metric 4: Health Insurance: Percentage of population without health insurance
* Metric 5: Percentage of population aged 65 years or older
* Metric 6: Percentage of population under 5 years old
* Metric 7: Percentage of population American Indian and Alaska Native

In [1]:
import os
import sys
import pandas as pd
import io
import numpy as np
import boto3
import zipfile
import shutil
import glob

# Silence pandas PerformanceWarning messages for the rest of the notebook
from warnings import simplefilter
simplefilter(action="ignore", category=pd.errors.PerformanceWarning)

# Make the repository root (two directories up) importable so the project helpers resolve.
# NOTE(review): expanduser only expands a leading '~', so this relative path is used as-is
# and is resolved against the directory the kernel was started in.
sys.path.append(os.path.expanduser('../../'))
from scripts.utils.write_metadata import append_metadata
from scripts.utils.file_helpers import pull_csv_from_directory, upload_csv_aws

# Show full cell contents; helpful for the long column descriptions within ACS data
pd.set_option('display.max_colwidth', None)

#### Pulling all zipped folders within the vulnerable populations folder from AWS

In [None]:
# S3 bucket and prefix that hold the raw ACS downloads for this indicator
bucket_name = 'ca-climate-index'
aws_dir = '1_pull_data/society_economy/vulnerable_populations/american_community_survey/'
# Local folder name; the cells below read their input CSVs from this folder
folder = 'vulnerable_populations_data'

# Project helper; presumably downloads (and, with search_zipped=True, unzips) the CSVs
# under aws_dir into `folder` -- see scripts/utils/file_helpers.py to confirm
pull_csv_from_directory(bucket_name, aws_dir, folder, search_zipped=True)

#### Metrics 5-7: % of population aged 65 years or older, under 5 years old, American Indian and Alaska Native

In [None]:
# Read in the ACS DP05 demographic data.
# skiprows=[1] drops the second line of the file, which only holds the human-readable
# description of each column code, so the first line (the codes) becomes the header.
demographic_data = pd.read_csv(r'vulnerable_populations_data/demographic_DP05.csv', skiprows=[1])
# Preview the first rows
demographic_data.head()

In [None]:
# Make a Census tract column by dropping the first 10 characters of GEO_ID.
# Reading a single column does not require copying the whole DataFrame first.
demographic_data['Census_Tract'] = demographic_data['GEO_ID'].str[10:]
demographic_data.head(3)

#### Re-naming demographic data columns from their code to our desired metrics
* dataset contains percent of population for each of the demographic metrics

In [6]:
# Translate the DP05 variable codes (and the derived tract column) into descriptive
# names with a single rename; codes missing from the frame are simply left untouched.
demographic_data = demographic_data.rename(
    columns={
        'DP05_0005PE': 'percent_total_pop_under_5',
        'DP05_0029PE': 'percent_total_pop_over_65',
        'DP05_0039PE': 'percent_total_pop_american_indian_alaska_native',
        'DP05_0001E': 'est_total_pop',
        'DP05_0024E': 'est_total_pop_over_65',
        'Census_Tract': 'census_tract',
        # estimate of population under age 18, used by another metric below
        'DP05_0019E': 'est_under_18',
    }
)

* we have to calculate the percent of the population over 65 ourselves, because the values in the ACS 'percent_total_pop_over_65' column (DP05_0029PE) are not percentages
* save df as a csv

In [None]:
# Isolating relevant columns to our data metrics
# Omitting 'percent_total_pop_over_65' as the column is incorrect
cri_demographic_data = demographic_data[
['GEO_ID', 'census_tract', 'percent_total_pop_under_5', 
 'percent_total_pop_american_indian_alaska_native', 
 'est_total_pop', 'est_total_pop_over_65', 'est_under_18']
]

# Create a new column for % of population over 65 years using estimated population values:
# 100 * (estimated population over 65 / estimated total population), computed per row
cri_demographic_data = cri_demographic_data.assign(
    real_percent_total_pop_over_65=
    lambda x: 100*(x.est_total_pop_over_65 / x.est_total_pop)
)

# Saving metric df to .csv file (the DataFrame index is written as the first, unnamed column)
cri_demographic_data.to_csv('society_age_race_metric.csv')
print('Saving demographic metric data to a .csv')

# Display the resulting DataFrame
cri_demographic_data

#### Separating the three metrics for individual csv creation

In [8]:
# One narrow frame per demographic metric, each keyed by census tract
cri_under_5_metric = cri_demographic_data.loc[
    :, ['census_tract', 'percent_total_pop_under_5']
]
cri_american_indian_alaska_native_metric = cri_demographic_data.loc[
    :, ['census_tract', 'percent_total_pop_american_indian_alaska_native']
]
# The over-65 frame keeps the raw estimate next to the recomputed percentage
cri_over_65_metric = cri_demographic_data.loc[
    :, ['census_tract', 'est_total_pop_over_65', 'real_percent_total_pop_over_65']
]

In [None]:
# Each metric frame is written to its own .csv in the working directory.
# The log line names the metric actually being saved, so the three messages can be told apart.

# Saving under 5 years old metric data to .csv file
cri_under_5_metric.to_csv('society_under_5yo_metric.csv')
print('Saving under 5 years old metric data to a .csv')

# Saving American Indian and Alaska Native population metrics to .csv file
cri_american_indian_alaska_native_metric.to_csv('society_american_indian_alaska_native_metric.csv')
print('Saving American Indian and Alaska Native metric data to a .csv')

# Saving over 65 years old metric data to .csv file
cri_over_65_metric.to_csv('society_over_65yo_metric.csv')
print('Saving over 65 years old metric data to a .csv')

#### The ACS demographic estimated population values are used for all other population percent calculations, so we create a separate csv file with just the population estimates per census tract

In [None]:
# Tract identifier plus estimated total population, taken in one selection.
# Selecting both columns at once (with an explicit copy) avoids assigning a new column
# onto a slice of cri_demographic_data, which triggers pandas' SettingWithCopyWarning.
cri_demographic_estimated_population = cri_demographic_data[
    ['census_tract', 'est_total_pop']
].copy()

# Saving the population estimates to .csv file
cri_demographic_estimated_population.to_csv('cri_acs_demographic_estimated_population.csv')
print('Saving estimated population data to a .csv')
cri_demographic_estimated_population

#### Metrics 1-2
* will be using total population from demographic data (originally column DP05_0001E) to calculate percentages
    - so csv files resulting from these metrics will be run through a final function at the end to calculate percent of population metric

#### Ambulatory Disability

In [None]:
# Read in ambulatory data.
# header=[0,1] keeps both header rows, so every column label is a pair made of the
# first-row value and the second-row value (a two-level column index).
ambulatory_data = pd.read_csv(r'vulnerable_populations_data/ambulatory_difficulty_B18105.csv', header=[0,1])
ambulatory_data.head(5)

In [None]:
# Make a Census tract column by dropping the first 10 characters of GEO_ID.
# ('GEO_ID', 'Geography') addresses the column through both header levels; no copy of the
# whole DataFrame is needed just to read it.
ambulatory_data['Census_Tract'] = ambulatory_data['GEO_ID', 'Geography'].str[10:]
# Start from the two identifier columns ...
filtered_ambulatory_disability = ambulatory_data[['GEO_ID', 'Census_Tract']]
# ... and append only the estimate columns for people living with an ambulatory disability
filtered_ambulatory_disability = pd.concat(
    [filtered_ambulatory_disability,
    ambulatory_data.filter(regex=r'Estimate').filter(
    regex=r'With an ambulatory difficulty')], axis=1)

# Display the resulting DataFrame
filtered_ambulatory_disability.head(5)

In [None]:
# Create a new column holding the row-wise total of the ambulatory-difficulty estimates.
# The columns are selected by label rather than by position so that the identifier columns
# and a total left over from an earlier run of this cell are never added into the sum
# (a positional slice would double the total every time the cell is re-run).
filtered_ambulatory_disability['sum_ambulatory_disabilities'] = (
    filtered_ambulatory_disability
    .filter(regex=r'With an ambulatory difficulty')
    .sum(axis=1)
    .astype(int)
)
# Display the DataFrame with the new column
filtered_ambulatory_disability.head(5)

#### Subset necessary columns and clean up header row

In [None]:
# Keep only the tract identifier and the summed count
ambulatory_disability_sum = filtered_ambulatory_disability.loc[
:,['Census_Tract', 'sum_ambulatory_disabilities']
]
# Drop the second header level so the frame (and the saved csv) has a single header row
ambulatory_disability_sum.columns = ambulatory_disability_sum.columns.droplevel(-1)
ambulatory_disability_sum = ambulatory_disability_sum.rename(columns={'Census_Tract': 'census_tract'})

# Saving the summed counts to .csv file; the percentage is calculated from this file later on
ambulatory_disability_sum.to_csv('ambulatory_disability_sum.csv')
print('Saving ambulatory disability sum data to a .csv')
ambulatory_disability_sum

#### Cognitive Disability

In [None]:
# Read in cognitive data.
# header=[0,1] keeps both header rows, so every column label is a pair made of the
# first-row value and the second-row value (a two-level column index).
cognitive_data = pd.read_csv(r'vulnerable_populations_data/cognitive_difficulty_B18104.csv', header=[0,1])
cognitive_data.head(5)

In [None]:
# Make a Census tract column by dropping the first 10 characters of GEO_ID.
# ('GEO_ID', 'Geography') addresses the column through both header levels; no copy of the
# whole DataFrame is needed just to read it.
cognitive_data['Census_Tract'] = cognitive_data['GEO_ID', 'Geography'].str[10:]
# Start from the tract identifier (the only non-data column kept in this frame) ...
filtered_cognitive_disability = cognitive_data[['Census_Tract']]
# ... and append only the estimate columns for people living with a cognitive disability
filtered_cognitive_disability = pd.concat(
    [filtered_cognitive_disability,
    cognitive_data.filter(regex=r'Estimate').filter(
    regex=r'With a cognitive difficulty')], axis=1)
# Display the resulting DataFrame
filtered_cognitive_disability.head(5)

In [None]:
# Create a new column holding the row-wise total of the cognitive-difficulty estimates.
# This frame has a single identifier column (Census_Tract), so a positional slice starting
# at column 2 would skip the first estimate column and undercount. Selecting the estimate
# columns by label includes all of them, and also keeps a total left over from an earlier
# run of this cell out of the sum.
filtered_cognitive_disability['sum_cognitive_disabilities'] = (
    filtered_cognitive_disability
    .filter(regex=r'With a cognitive difficulty')
    .sum(axis=1)
    .astype(int)
)
# Display the DataFrame with the new column
display(filtered_cognitive_disability.head(5))

In [None]:
# Keep only the tract identifier and the summed count
cognitive_disability_sum = filtered_cognitive_disability.loc[:,['Census_Tract', 'sum_cognitive_disabilities']]
# The source file is read with a two-row header. Drop the second level, as the ambulatory
# and health insurance cells do, so the saved csv has one header row; otherwise the extra
# header line is read back as a data row by the percentage calculation further down.
if cognitive_disability_sum.columns.nlevels > 1:
    cognitive_disability_sum.columns = cognitive_disability_sum.columns.droplevel(-1)
cognitive_disability_sum = cognitive_disability_sum.rename(columns={'Census_Tract': 'census_tract'})

# Saving the summed counts to .csv file; the percentage is calculated from this file later on
cognitive_disability_sum.to_csv('cognitive_disability_sum.csv')
print('Saving cognitive disability sum data to a .csv')
cognitive_disability_sum.head(5)

#### Metric 3: Financial Assistance
* ACS data is for children under 18 years in households
* number of children per tract in financial support data matches number\
of children in demographic data, so no conversion necessary

In [None]:
# Read in financial assistance data.
# header=[0,1] keeps both header rows, so every column label is a two-level pair.
financial_assistance_data = pd.read_csv(r'vulnerable_populations_data/financial_support_B09010.csv', header=[0,1])
# Make a Census tract column by dropping the first 10 characters of GEO_ID;
# ('GEO_ID', 'Geography') addresses that column through both header levels
financial_assistance_data['Census_Tract'] = financial_assistance_data['GEO_ID', 'Geography'].str[10:]
financial_assistance_data.head(5)

In [None]:
# Renaming columns from dictionary code to definition
financial_assistance_data = financial_assistance_data.rename(
    columns={'B09010_001E': 'total_children_under_18',
             'B09010_002E': 'estimated_total_children_household_ssi_cash_assistance_or_SNAP_12_months'}
)
# Drop the second header level, which only holds additional info about the columns.
# Guarded so the cell can be re-run: once the header is flat there is no level left to drop.
if financial_assistance_data.columns.nlevels > 1:
    financial_assistance_data.columns = financial_assistance_data.columns.droplevel(-1)
# Subset for desired columns; the explicit copy makes this an independent frame so adding
# a column below does not write onto a slice of financial_assistance_data
filtered_financial_assistance_data = financial_assistance_data.loc[
:,['GEO_ID', 'Census_Tract', 'total_children_under_18', 
   'estimated_total_children_household_ssi_cash_assistance_or_SNAP_12_months']
].copy()
# Create a new column for % of children living in household with financial assistance:
# (children in assisted households / all children under 18) * 100
filtered_financial_assistance_data['percent_children_household_financial_assistance'] = (
    pd.to_numeric(
        filtered_financial_assistance_data['estimated_total_children_household_ssi_cash_assistance_or_SNAP_12_months']
    ) / pd.to_numeric(filtered_financial_assistance_data['total_children_under_18'])
) * 100

filtered_financial_assistance_data.head(5)

In [None]:
# Standardise the tract column name and keep only the tract and the finished metric
filtered_financial_assistance_data = filtered_financial_assistance_data.rename(columns={'Census_Tract':'census_tract'})
filtered_financial_assistance_data = filtered_financial_assistance_data[['census_tract', 'percent_children_household_financial_assistance']]
# Saving metric df to .csv file; the log line names the metric actually being saved
filtered_financial_assistance_data.to_csv('society_financial_assistance_metric.csv')
print('Saving financial assistance metric data to a .csv')
filtered_financial_assistance_data

#### Metric 4: Health Insurance
* Although the estimated total code (_001E) is the same as in the cognitive and ambulatory disability datasets,
only SOME of its values match the demographic data values (for example, the first three match but the fourth does not). To be safe,
the resulting csv is run through the function below, which calculates the percentage from the demographic data total population.

In [None]:
# Read in health insurance data.
# header=[0,1] keeps both header rows, so every column label is a two-level pair.
health_insurance_data = pd.read_csv(r'vulnerable_populations_data/health_insurance_B27010.csv', header=[0,1])
# Make a Census tract column by dropping the first 10 characters of GEO_ID;
# ('GEO_ID', 'Geography') addresses that column through both header levels
health_insurance_data['Census_Tract'] = health_insurance_data['GEO_ID', 'Geography'].str[10:]
health_insurance_data.head(5)

In [None]:
# Start from the two identifier columns
filtered_health_insurance_data = health_insurance_data[['GEO_ID', 'Census_Tract']]
# filter data to only include estimated population with no health insurance coverage,
# and append those columns to the identifier columns
filtered_health_insurance_data = pd.concat(
    [filtered_health_insurance_data,
    health_insurance_data.filter(regex=r'Estimate').filter(
    regex=r'No health insurance coverage')], axis=1)
# Display the resulting DataFrame
filtered_health_insurance_data.head(5)

In [None]:
# Drop the second header level so the columns are addressed by a single label.
# Guarded so the cell can be re-run: once the header is flat there is no level left to drop.
if filtered_health_insurance_data.columns.nlevels > 1:
    filtered_health_insurance_data.columns = filtered_health_insurance_data.columns.droplevel(-1)
# Sum every uninsured-estimate column, i.e. everything except the identifier columns and a
# total left over from an earlier run (errors='ignore' skips names that are not present).
uninsured_estimates = filtered_health_insurance_data.drop(
    columns=['GEO_ID', 'Census_Tract', 'census_tract', 'sum_without_health_insurance'],
    errors='ignore'
)
filtered_health_insurance_data['sum_without_health_insurance'] = uninsured_estimates.sum(axis=1).astype(int)
# Display the DataFrame with the new column
filtered_health_insurance_data.head(5)

In [None]:
# Standardise the tract column name on the working frame
filtered_health_insurance_data = filtered_health_insurance_data.rename(
    columns={'Census_Tract': 'census_tract'}
)
# Keep only the tract identifier and the summed uninsured count
without_health_insurance_sum = filtered_health_insurance_data.loc[
    :, ['census_tract', 'sum_without_health_insurance']
]
# Write the summed counts to disk; the percentage is calculated from this file later on
without_health_insurance_sum.to_csv('without_health_insurance_sum.csv')
print('Saving health insurance metric data to a .csv')
without_health_insurance_sum

#### Calculate % of total population

* pull the ACS demographic (DP05) estimated population csv file from AWS
* iterate through each input csv file that needs to calculate metric percentage
* perform the calculation on the selected csv column (sum/demographic total population) *100
* save new metric as a csv file

In [29]:
@append_metadata
def calculate_acs_metric_percentage(
    input_csv, output_csv, calculate_percentage=True, varname="", export=False
):
    '''
    Calculates the following metrics sourced from the American Community Survey:
    - Ambulatory Difficulty: % of population living with an ambulatory disability
    - Cognitive Difficulty: % of population living with a cognitive disability
    - Financial Assistance: 
        % of population living in a household with Supplemental Security Income (SSI), 
        cash public assistance income, or Food Stamps/SNAP in the last 12 months
    - Health Insurance: % of population without health insurance

    Demography: 
    - % of population aged 65 years or older
    - % of population under 5 years old
    - % of population American Indian and Alaska Native
  
    Methods
    --------
    The estimated population column (DP05_0001E) from ACS dataset DP05 was used as the 
    universal population values when calculating metric percentage for the metrics above. 
    Columns were renamed and summed when a metric's values were separated by age group.

    Parameters
    ------------
    input_csv: string
        Path to a csv file. When calculate_percentage is True it must contain a column
        whose name includes 'sum_', holding the number of people meeting a metric condition.
    output_csv: string
        Output filename.
    calculate_percentage: boolean
        if true, calculates percentage of input_csv based on a 'sum' column
        if false, skips percentage calculation and uploads .csv to AWS
    varname: string
        Final metric name. Not used in the function body; presumably consumed by the
        append_metadata decorator.
    export: bool
        If True, uploads csv to S3.
        If False, just runs metadata portion

    Script
    ------
    society_vulnerable_populations.ipynb

    Note
    ------
    This function assumes users have configured the AWS CLI such that their access key / 
    secret key pair are stored in ~/.aws/credentials. 
    See https://docs.aws.amazon.com/cli/latest/userguide/getting-started-install.html for guidance.
    
    '''
    if not export:
        print('Data transformation: utilizing ACS dataset DP05 population data to calculate metric.')
        print('Data transformation: columns renamed and summed for total metric value.')
    else:
        bucket_name = 'ca-climate-index'
        directory = '3_fair_data/index_data'
        data = pd.read_csv(input_csv)

        if calculate_percentage:
            # The population file is only needed (and only fetched) for the percentage
            cri_est_pop = f"s3://{bucket_name}/0_map_data/cri_acs_demographic_estimated_population.csv"
            cri_tract_est_pop = pd.read_csv(cri_est_pop)
            # NOTE(review): the population is attached by row position, so both files must
            # list the census tracts in the same order -- confirm, or join on census_tract
            data['est_total_population'] = cri_tract_est_pop['est_total_pop']
            # get column with raw sum
            sum_column = data.filter(regex=r'sum_').columns[0]
            # name new column for 'percent of total' data
            percent_column_name = sum_column.replace('sum','percent_population')
            # calculate percent of total population from sum
            data[percent_column_name] = (data[sum_column] / data['est_total_population']) * 100

            # Keep the new percentage column as the last column
            columns = [column for column in data.columns if column != percent_column_name]
            data = data[columns + [percent_column_name]]

        # Write the (possibly unchanged) data under the output name and upload it. This now
        # also happens when calculate_percentage is False; previously that path deleted the
        # input file without ever uploading it.
        data.to_csv(output_csv, index=False)
        upload_csv_aws([output_csv], bucket_name, directory)

        # Remove the local input file. The existence check covers the case where input and
        # output share a name and the file has already been cleaned up.
        if os.path.exists(input_csv):
            os.remove(input_csv)

In [31]:
# for metrics that need the percent calculation
# The three lists are parallel: entry i of each list describes the same metric.
input_csvs = [
    'ambulatory_disability_sum.csv', 
    'cognitive_disability_sum.csv', 
    'without_health_insurance_sum.csv'
]
output_csvs = [
    'society_ambulatory_disability_metric.csv', 
    'society_cognitive_disability_metric.csv', 
    'society_without_health_insurance_metric.csv'
]
varnames = [
    'society_acs_ambulatory',
    'society_acs_cognitive',
    'society_acs_health_insurance'
]
# zip can be iterated directly; no intermediate list is needed.
# export=False only runs the metadata portion of the function; set it to True to upload.
for input_csv, output_csv, varname in zip(input_csvs, output_csvs, varnames):
    calculate_acs_metric_percentage(
        input_csv, output_csv, calculate_percentage=True, export=False,
        varname=varname
    )

In [32]:
# for metrics that don't need the percent calculation (their csv already holds a percentage)
# The three lists are parallel: entry i of each list describes the same metric.
input_csvs = [
    'society_under_5yo_metric.csv',
    'society_american_indian_alaska_native_metric.csv',
    'society_over_65yo_metric.csv',
    'society_financial_assistance_metric.csv'
]
# These files keep their name on output
output_csvs = [
    'society_under_5yo_metric.csv',
    'society_american_indian_alaska_native_metric.csv',
    'society_over_65yo_metric.csv',
    'society_financial_assistance_metric.csv'
]
varnames = [
    'society_acs_demographic_under_5',
    'society_acs_demographic_american_indian',
    'society_acs_demographic_over_65',
    'society_acs_financial_assistance'
]
# zip can be iterated directly; no intermediate list is needed.
# export=False only runs the metadata portion of the function; set it to True to upload.
for input_csv, output_csv, varname in zip(input_csvs, output_csvs, varnames):
    calculate_acs_metric_percentage(
        input_csv, output_csv, calculate_percentage=False, export=False,
        varname=varname
    )

In [33]:
# Delete every CSV file in the current working directory.
# The '*.csv' pattern is not recursive, so files inside sub-folders (for example the
# downloaded input folder) are left in place.
# NOTE(review): this also removes any unrelated .csv that happens to be in this directory.
current_files = glob.glob('*.csv')
for file in current_files:
    os.remove(file)