## Cal-CRAI Metric Calculation for: Vulnerable Communities
This notebook calculates 6 metrics, all sourced from the American Community Survey.

- Ambulatory Difficulty: % of population living with an ambulatory disability
- Cognitive Difficulty: % of population living with a cognitive disability
- Financial Assistance: % of population living in a household with Supplemental Security Income (SSI), cash public assistance income, or Food Stamps/SNAP in the last 12 months
- Health Insurance: % of population without health insurance

- Demography: 
    - % of population aged 65 years or older
    - % of population under 5 years old
    - % of population American Indian and Alaska Native

In [29]:
import os
import sys
import pandas as pd
import io
import numpy as np
import boto3
import zipfile
import shutil
import glob

# suppress pandas purely educational warnings
from warnings import simplefilter
simplefilter(action="ignore", category=pd.errors.PerformanceWarning)

sys.path.append(os.path.expanduser('../../'))
from scripts.utils.write_metadata import append_metadata
from scripts.utils.file_helpers import pull_csv_from_directory, upload_csv_aws

# Adjust display options, helpful for long descriptions within ACS data
pd.set_option('display.max_colwidth', None)

#### Pulling all zipped folders within the vulnerable populations folder from AWS

In [2]:
bucket_name = 'ca-climate-index'
aws_dir = '1_pull_data/society_economy/vulnerable_populations/american_community_survey/'

pull_csv_from_directory(bucket_name, aws_dir, search_zipped=True)

Saved DataFrame as 'ACSDT5Y2022.B18105-Column-Metadata.csv'
Saved DataFrame as 'ambulatory_difficulty_B18105.csv'


  df = pd.read_csv(csv_file)


Saved DataFrame as 'ACSDT5Y2022.B18104-Column-Metadata.csv'
Saved DataFrame as 'cognitive_difficulty_B18104.csv'


  df = pd.read_csv(csv_file)


Saved DataFrame as 'ACSDP5Y2022.DP05-Column-Metadata.csv'


  df = pd.read_csv(csv_file)


Saved DataFrame as 'demographic_DP05.csv'
Saved DataFrame as 'ACSDT5Y2022.B09010-Column-Metadata.csv'
Saved DataFrame as 'financial_support_B09010.csv'
Saved DataFrame as 'ACSDT5Y2022.B27010-Column-Metadata.csv'


  df = pd.read_csv(csv_file)


Saved DataFrame as 'health_insurance_B27010.csv'


#### Metrics 1-3: % of population aged 65 years or older, under 5 years old, American Indian and Alaska Native

In [3]:
# Read in demographic data
# we do not need the second row since it only describes the data
demographic_data = pd.read_csv('demographic_DP05.csv', skiprows=[1])
demographic_data.head()
os.remove()

Unnamed: 0,GEO_ID,NAME,DP05_0001E,DP05_0001M,DP05_0002E,DP05_0002M,DP05_0003E,DP05_0003M,DP05_0004E,DP05_0004M,...,DP05_0087PM,DP05_0088PE,DP05_0088PM,DP05_0089PE,DP05_0089PM,DP05_0090PE,DP05_0090PM,DP05_0091PE,DP05_0091PM,Unnamed: 366
0,1400000US06001400100,Census Tract 4001; Alameda County; California,3269,452,1621,339,1648,205,98.4,19.9,...,2.5,(X),(X),2351,(X),47.4,4.4,52.6,4.4,
1,1400000US06001400200,Census Tract 4002; Alameda County; California,2147,201,1075,138,1072,129,100.3,16.3,...,3.4,(X),(X),1679,(X),49.4,4.5,50.6,4.5,
2,1400000US06001400300,Census Tract 4003; Alameda County; California,5619,571,2801,504,2818,332,99.4,22.5,...,4.3,(X),(X),4414,(X),47.6,5.6,52.4,5.6,
3,1400000US06001400400,Census Tract 4004; Alameda County; California,4278,598,1926,327,2352,363,81.9,13.5,...,1.8,(X),(X),3180,(X),46.9,4.0,53.1,4.0,
4,1400000US06001400500,Census Tract 4005; Alameda County; California,3949,737,1870,291,2079,565,89.9,23.0,...,4.9,(X),(X),3169,(X),44.3,7.0,55.7,7.0,


In [4]:
# Making a Census tract column using the GEO_ID column
demographic_data['Census_Tract'] = demographic_data.copy()['GEO_ID'].str[10:]
demographic_data.head(3)

Unnamed: 0,GEO_ID,NAME,DP05_0001E,DP05_0001M,DP05_0002E,DP05_0002M,DP05_0003E,DP05_0003M,DP05_0004E,DP05_0004M,...,DP05_0088PE,DP05_0088PM,DP05_0089PE,DP05_0089PM,DP05_0090PE,DP05_0090PM,DP05_0091PE,DP05_0091PM,Unnamed: 366,Census_Tract
0,1400000US06001400100,Census Tract 4001; Alameda County; California,3269,452,1621,339,1648,205,98.4,19.9,...,(X),(X),2351,(X),47.4,4.4,52.6,4.4,,6001400100
1,1400000US06001400200,Census Tract 4002; Alameda County; California,2147,201,1075,138,1072,129,100.3,16.3,...,(X),(X),1679,(X),49.4,4.5,50.6,4.5,,6001400200
2,1400000US06001400300,Census Tract 4003; Alameda County; California,5619,571,2801,504,2818,332,99.4,22.5,...,(X),(X),4414,(X),47.6,5.6,52.4,5.6,,6001400300


#### Re-naming demographic data columns from their code to our desired metrics
* dataset contains percent of population for each of the demographic metrics

In [5]:
# Renaming columns from dictionary code to definition
demographic_data = demographic_data.rename(columns={'DP05_0005PE': 'percent_total_pop_under_5'})
demographic_data = demographic_data.rename(columns={'DP05_0029PE': 'percent_total_pop_over_65'})
demographic_data = demographic_data.rename(columns={'DP05_0039PE': 'percent_total_pop_american_indian_alaska_native'})
demographic_data = demographic_data.rename(columns={'DP05_0001E': 'est_total_pop'})
demographic_data = demographic_data.rename(columns={'DP05_0024E': 'est_total_pop_over_65'})
demographic_data = demographic_data.rename(columns={'Census_Tract': 'census_tract'})

# Adding in estimates under age 18 as it is used in another metric below
demographic_data = demographic_data.rename(columns={'DP05_0019E': 'est_under_18'})

* have to calculate percent of the population over 65, the values in their 'percent_pop_over_65' are not percentages
* save df as a csv

In [6]:
# Isolating relevant columns to our data metrics
# Ommitting 'percent_total_pop_over_65' as the column is incorrect
cri_demographic_data = demographic_data[
['GEO_ID', 'census_tract', 'percent_total_pop_under_5', 
 'percent_total_pop_american_indian_alaska_native', 
 'est_total_pop', 'est_total_pop_over_65', 'est_under_18']
]

# Create a new column for % of population over 65 years using estimated population values 
cri_demographic_data = cri_demographic_data.assign(
    real_percent_total_pop_over_65=
    lambda x: 100*(x.est_total_pop_over_65 / x.est_total_pop)
)

# Saving metric df to .csv file
cri_demographic_data.to_csv('society_age_race_metric.csv')
print('Saving demographic metric data to a .csv')

cri_demographic_data

Saving demographic metric data to a .csv


Unnamed: 0,GEO_ID,census_tract,percent_total_pop_under_5,percent_total_pop_american_indian_alaska_native,est_total_pop,est_total_pop_over_65,est_under_18,real_percent_total_pop_over_65
0,1400000US06001400100,6001400100,4.1,0.0,3269,884,661,27.041909
1,1400000US06001400200,6001400200,7.9,0.4,2147,553,350,25.756870
2,1400000US06001400300,6001400300,2.3,0.5,5619,916,942,16.301833
3,1400000US06001400400,6001400400,7.5,0.5,4278,550,941,12.856475
4,1400000US06001400500,6001400500,4.0,0.1,3949,649,496,16.434540
...,...,...,...,...,...,...,...,...
9124,1400000US06115040902,6115040902,12.3,0.2,1868,0,432,0.000000
9125,1400000US06115041001,6115041001,6.9,0.3,3672,1234,514,33.605664
9126,1400000US06115041002,6115041002,2.6,2.6,3417,1025,552,29.997073
9127,1400000US06115041101,6115041101,2.7,1.1,2288,478,494,20.891608


#### Separating the three metrics for individual csv creation

In [7]:
cri_under_5_metric = cri_demographic_data[
['census_tract', 'percent_total_pop_under_5']
]
cri_american_indian_alaska_native_metric = cri_demographic_data[
['census_tract', 'percent_total_pop_american_indian_alaska_native']
]
cri_over_65_metric = cri_demographic_data[
['census_tract', 'est_total_pop_over_65', 'real_percent_total_pop_over_65']
]

In [8]:
# Saving metric df to .csv file
cri_under_5_metric.to_csv('society_under_5yo_metric.csv')
print('Saving under 5 years old metric data to a .csv')

# Saving American Indian and Alaska Native population metrics to .csv file
cri_american_indian_alaska_native_metric.to_csv('society_american_indian_alaska_native_metric.csv')
print('Saving demographic metric data to a .csv')

# Saving over 65 years old metric data to .csv file
cri_over_65_metric.to_csv('society_over_65yo_metric.csv')
print('Saving demographic metric data to a .csv')

Saving under 5 years old metric data to a .csv
Saving demographic metric data to a .csv
Saving demographic metric data to a .csv


#### Using the ACS demographic data estimated population values for all other population percent calculations, we should create a separate csv file with just the population estimates per census tract

In [9]:
cri_demographic_estimated_population = cri_demographic_data[['census_tract']]
cri_demographic_estimated_population['est_total_pop'] = cri_demographic_data['est_total_pop']

# Saving metric df to .csv file
cri_demographic_estimated_population.to_csv('cri_acs_demographic_estimated_population.csv')
print('Saving demographic metric data to a .csv')
cri_demographic_estimated_population

Saving demographic metric data to a .csv


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cri_demographic_estimated_population['est_total_pop'] = cri_demographic_data['est_total_pop']


Unnamed: 0,census_tract,est_total_pop
0,6001400100,3269
1,6001400200,2147
2,6001400300,5619
3,6001400400,4278
4,6001400500,3949
...,...,...
9124,6115040902,1868
9125,6115041001,3672
9126,6115041002,3417
9127,6115041101,2288


#### Upload the newly made demographic estimated population data to AWS so we can call it for other metrics

In [10]:
bucket_name = 'ca-climate-index'
file_name = 'cri_acs_demographic_estimated_population.csv'
directory = '0_map_data'

upload_csv_aws([file_name], bucket_name, directory)
# Remove final csv files from local directory
os.remove(file_name)

cri_acs_demographic_estimated_population.csv uploaded to AWS


#### Metrics 4-5
* will be using total population from demographic data (originally column DP05_0001E) to calculate percentages
    - so csv files resulting from these metrics will be run through a final function at the end to calculate percent of population metric

#### Ambulatory Disability

In [11]:
# Read in ambulatory data
ambulatory_data = pd.read_csv('ambulatory_difficulty_B18105.csv', header=[0,1])
ambulatory_data.head(5)

Unnamed: 0_level_0,GEO_ID,NAME,B18105_001E,B18105_001M,B18105_002E,B18105_002M,B18105_003E,B18105_003M,B18105_004E,B18105_004M,...,B18105_029M,B18105_030E,B18105_030M,B18105_031E,B18105_031M,B18105_032E,B18105_032M,B18105_033E,B18105_033M,Unnamed: 68
Unnamed: 0_level_1,Geography,Geographic Area Name,Estimate!!Total:,Margin of Error!!Total:,Estimate!!Total:!!Male:,Margin of Error!!Total:!!Male:,Estimate!!Total:!!Male:!!5 to 17 years:,Margin of Error!!Total:!!Male:!!5 to 17 years:,Estimate!!Total:!!Male:!!5 to 17 years:!!With an ambulatory difficulty,Margin of Error!!Total:!!Male:!!5 to 17 years:!!With an ambulatory difficulty,...,Margin of Error!!Total:!!Female:!!65 to 74 years:!!With an ambulatory difficulty,Estimate!!Total:!!Female:!!65 to 74 years:!!No ambulatory difficulty,Margin of Error!!Total:!!Female:!!65 to 74 years:!!No ambulatory difficulty,Estimate!!Total:!!Female:!!75 years and over:,Margin of Error!!Total:!!Female:!!75 years and over:,Estimate!!Total:!!Female:!!75 years and over:!!With an ambulatory difficulty,Margin of Error!!Total:!!Female:!!75 years and over:!!With an ambulatory difficulty,Estimate!!Total:!!Female:!!75 years and over:!!No ambulatory difficulty,Margin of Error!!Total:!!Female:!!75 years and over:!!No ambulatory difficulty,Unnamed: 68_level_1
0,1400000US06001400100,Census Tract 4001; Alameda County; California,3136,448,1549,341,300,216,0,13,...,47,206,71,279,113,47,38,232,114,
1,1400000US06001400200,Census Tract 4002; Alameda County; California,1978,199,992,130,76,37,0,13,...,13,136,33,174,60,64,45,110,39,
2,1400000US06001400300,Census Tract 4003; Alameda County; California,5492,574,2767,502,516,199,0,19,...,25,307,144,279,87,0,19,279,87,
3,1400000US06001400400,Census Tract 4004; Alameda County; California,3937,475,1791,277,261,118,0,13,...,27,190,60,119,50,33,28,86,43,
4,1400000US06001400500,Census Tract 4005; Alameda County; California,3791,737,1748,287,224,72,0,13,...,21,137,68,325,280,18,29,307,277,


In [12]:
# Making a Census tract column using the GEO_ID column
ambulatory_data['Census_Tract'] = ambulatory_data.copy()['GEO_ID', 'Geography'].str[10:]
filtered_ambulatory_disability = ambulatory_data[['GEO_ID', 'Census_Tract']]
# filter data to only include estimated population living with an ambulatory disability
filtered_ambulatory_disability = pd.concat(
    [filtered_ambulatory_disability,
    ambulatory_data.filter(regex=r'Estimate').filter(
    regex=r'With an ambulatory difficulty')], axis=1)

# Display the resulting DataFrame
filtered_ambulatory_disability.head(5)

Unnamed: 0_level_0,GEO_ID,Census_Tract,B18105_004E,B18105_007E,B18105_010E,B18105_013E,B18105_016E,B18105_020E,B18105_023E,B18105_026E,B18105_029E,B18105_032E
Unnamed: 0_level_1,Geography,Unnamed: 2_level_1,Estimate!!Total:!!Male:!!5 to 17 years:!!With an ambulatory difficulty,Estimate!!Total:!!Male:!!18 to 34 years:!!With an ambulatory difficulty,Estimate!!Total:!!Male:!!35 to 64 years:!!With an ambulatory difficulty,Estimate!!Total:!!Male:!!65 to 74 years:!!With an ambulatory difficulty,Estimate!!Total:!!Male:!!75 years and over:!!With an ambulatory difficulty,Estimate!!Total:!!Female:!!5 to 17 years:!!With an ambulatory difficulty,Estimate!!Total:!!Female:!!18 to 34 years:!!With an ambulatory difficulty,Estimate!!Total:!!Female:!!35 to 64 years:!!With an ambulatory difficulty,Estimate!!Total:!!Female:!!65 to 74 years:!!With an ambulatory difficulty,Estimate!!Total:!!Female:!!75 years and over:!!With an ambulatory difficulty
0,1400000US06001400100,6001400100,0,0,19,29,22,0,0,0,36,47
1,1400000US06001400200,6001400200,0,0,0,10,4,0,0,8,0,64
2,1400000US06001400300,6001400300,0,0,2,67,43,0,0,0,17,0
3,1400000US06001400400,6001400400,0,0,48,7,22,0,0,11,35,33
4,1400000US06001400500,6001400500,0,6,71,21,0,0,0,0,17,18


In [13]:
# Create a new column by summing all columns with data (ie not tract info)
filtered_ambulatory_disability['sum_ambulatory_disabilities'] = filtered_ambulatory_disability.iloc[:, 2:].sum(axis=1).astype(int)
# Display the DataFrame with the new column
filtered_ambulatory_disability.head(5)

Unnamed: 0_level_0,GEO_ID,Census_Tract,B18105_004E,B18105_007E,B18105_010E,B18105_013E,B18105_016E,B18105_020E,B18105_023E,B18105_026E,B18105_029E,B18105_032E,sum_ambulatory_disabilities
Unnamed: 0_level_1,Geography,Unnamed: 2_level_1,Estimate!!Total:!!Male:!!5 to 17 years:!!With an ambulatory difficulty,Estimate!!Total:!!Male:!!18 to 34 years:!!With an ambulatory difficulty,Estimate!!Total:!!Male:!!35 to 64 years:!!With an ambulatory difficulty,Estimate!!Total:!!Male:!!65 to 74 years:!!With an ambulatory difficulty,Estimate!!Total:!!Male:!!75 years and over:!!With an ambulatory difficulty,Estimate!!Total:!!Female:!!5 to 17 years:!!With an ambulatory difficulty,Estimate!!Total:!!Female:!!18 to 34 years:!!With an ambulatory difficulty,Estimate!!Total:!!Female:!!35 to 64 years:!!With an ambulatory difficulty,Estimate!!Total:!!Female:!!65 to 74 years:!!With an ambulatory difficulty,Estimate!!Total:!!Female:!!75 years and over:!!With an ambulatory difficulty,Unnamed: 13_level_1
0,1400000US06001400100,6001400100,0,0,19,29,22,0,0,0,36,47,153
1,1400000US06001400200,6001400200,0,0,0,10,4,0,0,8,0,64,86
2,1400000US06001400300,6001400300,0,0,2,67,43,0,0,0,17,0,129
3,1400000US06001400400,6001400400,0,0,48,7,22,0,0,11,35,33,156
4,1400000US06001400500,6001400500,0,6,71,21,0,0,0,0,17,18,133


#### Subset necessary columns and clean up header row

In [14]:
ambulatory_disability_sum = filtered_ambulatory_disability.loc[
:,['Census_Tract', 'sum_ambulatory_disabilities']
]
ambulatory_disability_sum.columns = ambulatory_disability_sum.columns.droplevel(-1)
# Saving metric df to .csv file
ambulatory_disability_sum = ambulatory_disability_sum.rename(columns={'Census_Tract': 'census_tract'})

ambulatory_disability_sum.to_csv('ambulatory_disability_sum.csv')
print('Saving demographic metric data to a .csv')
ambulatory_disability_sum

Saving demographic metric data to a .csv


Unnamed: 0,census_tract,sum_ambulatory_disabilities
0,6001400100,153
1,6001400200,86
2,6001400300,129
3,6001400400,156
4,6001400500,133
...,...,...
9124,6115040902,10
9125,6115041001,308
9126,6115041002,313
9127,6115041101,385


#### Cognitive Disability

In [15]:
# Read in cognitive data
cognitive_data = pd.read_csv('cognitive_difficulty_B18104.csv', header=[0,1])
cognitive_data.head(5)

Unnamed: 0_level_0,GEO_ID,NAME,B18104_001E,B18104_001M,B18104_002E,B18104_002M,B18104_003E,B18104_003M,B18104_004E,B18104_004M,...,B18104_029M,B18104_030E,B18104_030M,B18104_031E,B18104_031M,B18104_032E,B18104_032M,B18104_033E,B18104_033M,Unnamed: 68
Unnamed: 0_level_1,Geography,Geographic Area Name,Estimate!!Total:,Margin of Error!!Total:,Estimate!!Total:!!Male:,Margin of Error!!Total:!!Male:,Estimate!!Total:!!Male:!!5 to 17 years:,Margin of Error!!Total:!!Male:!!5 to 17 years:,Estimate!!Total:!!Male:!!5 to 17 years:!!With a cognitive difficulty,Margin of Error!!Total:!!Male:!!5 to 17 years:!!With a cognitive difficulty,...,Margin of Error!!Total:!!Female:!!65 to 74 years:!!With a cognitive difficulty,Estimate!!Total:!!Female:!!65 to 74 years:!!No cognitive difficulty,Margin of Error!!Total:!!Female:!!65 to 74 years:!!No cognitive difficulty,Estimate!!Total:!!Female:!!75 years and over:,Margin of Error!!Total:!!Female:!!75 years and over:,Estimate!!Total:!!Female:!!75 years and over:!!With a cognitive difficulty,Margin of Error!!Total:!!Female:!!75 years and over:!!With a cognitive difficulty,Estimate!!Total:!!Female:!!75 years and over:!!No cognitive difficulty,Margin of Error!!Total:!!Female:!!75 years and over:!!No cognitive difficulty,Unnamed: 68_level_1
0,1400000US06001400100,Census Tract 4001; Alameda County; California,3136,448,1549,341,300,216,11,18,...,13,242,81,279,113,10,15,269,115,
1,1400000US06001400200,Census Tract 4002; Alameda County; California,1978,199,992,130,76,37,0,13,...,13,136,33,174,60,58,44,116,40,
2,1400000US06001400300,Census Tract 4003; Alameda County; California,5492,574,2767,502,516,199,22,37,...,19,324,144,279,87,12,20,267,87,
3,1400000US06001400400,Census Tract 4004; Alameda County; California,3937,475,1791,277,261,118,0,13,...,13,225,63,119,50,0,13,119,50,
4,1400000US06001400500,Census Tract 4005; Alameda County; California,3791,737,1748,287,224,72,0,13,...,17,144,67,325,280,0,13,325,280,


In [16]:
# Making a Census tract column using the GEO_ID column
cognitive_data['Census_Tract'] = cognitive_data.copy()['GEO_ID', 'Geography'].str[10:]
# Filter columns based on criteria
columns_to_keep = ['Census_Tract']  # Always keep these columns
filtered_cognitive_disability = cognitive_data[['Census_Tract']]
# filter data to only include estimated population living with a cognitive disability
filtered_cognitive_disability = pd.concat(
    [filtered_cognitive_disability,
    cognitive_data.filter(regex=r'Estimate').filter(
    regex=r'With a cognitive difficulty')], axis=1)
# Display the resulting DataFrame
filtered_cognitive_disability.head(5)

Unnamed: 0_level_0,Census_Tract,B18104_004E,B18104_007E,B18104_010E,B18104_013E,B18104_016E,B18104_020E,B18104_023E,B18104_026E,B18104_029E,B18104_032E
Unnamed: 0_level_1,Unnamed: 1_level_1,Estimate!!Total:!!Male:!!5 to 17 years:!!With a cognitive difficulty,Estimate!!Total:!!Male:!!18 to 34 years:!!With a cognitive difficulty,Estimate!!Total:!!Male:!!35 to 64 years:!!With a cognitive difficulty,Estimate!!Total:!!Male:!!65 to 74 years:!!With a cognitive difficulty,Estimate!!Total:!!Male:!!75 years and over:!!With a cognitive difficulty,Estimate!!Total:!!Female:!!5 to 17 years:!!With a cognitive difficulty,Estimate!!Total:!!Female:!!18 to 34 years:!!With a cognitive difficulty,Estimate!!Total:!!Female:!!35 to 64 years:!!With a cognitive difficulty,Estimate!!Total:!!Female:!!65 to 74 years:!!With a cognitive difficulty,Estimate!!Total:!!Female:!!75 years and over:!!With a cognitive difficulty
0,6001400100,11,13,19,10,0,20,11,28,0,10
1,6001400200,0,0,33,5,11,0,0,0,0,58
2,6001400300,22,20,42,0,18,15,19,25,0,12
3,6001400400,0,0,98,0,10,0,0,17,0,0
4,6001400500,0,40,7,29,0,0,36,6,10,0


In [17]:
# Create a new column by summing all data columns
filtered_cognitive_disability['sum_cognitive_disabilities'] = filtered_cognitive_disability.iloc[:, 2:].sum(axis=1).astype(int)
# Display the DataFrame with the new column
display(filtered_cognitive_disability)

Unnamed: 0_level_0,Census_Tract,B18104_004E,B18104_007E,B18104_010E,B18104_013E,B18104_016E,B18104_020E,B18104_023E,B18104_026E,B18104_029E,B18104_032E,sum_cognitive_disabilities
Unnamed: 0_level_1,Unnamed: 1_level_1,Estimate!!Total:!!Male:!!5 to 17 years:!!With a cognitive difficulty,Estimate!!Total:!!Male:!!18 to 34 years:!!With a cognitive difficulty,Estimate!!Total:!!Male:!!35 to 64 years:!!With a cognitive difficulty,Estimate!!Total:!!Male:!!65 to 74 years:!!With a cognitive difficulty,Estimate!!Total:!!Male:!!75 years and over:!!With a cognitive difficulty,Estimate!!Total:!!Female:!!5 to 17 years:!!With a cognitive difficulty,Estimate!!Total:!!Female:!!18 to 34 years:!!With a cognitive difficulty,Estimate!!Total:!!Female:!!35 to 64 years:!!With a cognitive difficulty,Estimate!!Total:!!Female:!!65 to 74 years:!!With a cognitive difficulty,Estimate!!Total:!!Female:!!75 years and over:!!With a cognitive difficulty,Unnamed: 12_level_1
0,6001400100,11,13,19,10,0,20,11,28,0,10,111
1,6001400200,0,0,33,5,11,0,0,0,0,58,107
2,6001400300,22,20,42,0,18,15,19,25,0,12,151
3,6001400400,0,0,98,0,10,0,0,17,0,0,125
4,6001400500,0,40,7,29,0,0,36,6,10,0,128
...,...,...,...,...,...,...,...,...,...,...,...,...
9124,6115040902,0,0,0,0,0,20,0,0,0,0,20
9125,6115041001,0,0,27,14,11,0,4,6,23,43,128
9126,6115041002,0,0,0,45,71,49,0,0,16,41,222
9127,6115041101,86,0,32,10,0,0,0,53,0,0,95


In [18]:
# clean up headers and keep the columns we want
cognitive_disability_sum = filtered_cognitive_disability.loc[:,['Census_Tract', 'sum_cognitive_disabilities']]
cognitive_disability_sum = cognitive_disability_sum.rename(columns={'Census_Tract': 'census_tract'})

# Saving metric df to .csv file
cognitive_disability_sum.to_csv('cognitive_disability_sum.csv')
print('Saving demographic metric data to a .csv')
cognitive_disability_sum.head(5)

Saving demographic metric data to a .csv


Unnamed: 0,census_tract,sum_cognitive_disabilities
,,
0.0,6001400100.0,111.0
1.0,6001400200.0,107.0
2.0,6001400300.0,151.0
3.0,6001400400.0,125.0
4.0,6001400500.0,128.0


#### Metric 6: Financial Assistance
* ACS data is for children under 18 years in households
* number of children per tract in financial support data matches number\
of children in demographic data, so no conversion necessary

In [19]:
# Read in financial assistance data
financial_assistance_data = pd.read_csv('financial_support_B09010.csv', header=[0,1])
# Making a Census tract column using the GEO_ID column
financial_assistance_data['Census_Tract'] = financial_assistance_data['GEO_ID', 'Geography'].str[10:]
financial_assistance_data.head(5)

Unnamed: 0_level_0,GEO_ID,NAME,B09010_001E,B09010_001M,B09010_002E,B09010_002M,B09010_003E,B09010_003M,B09010_004E,B09010_004M,...,B09010_010E,B09010_010M,B09010_011E,B09010_011M,B09010_012E,B09010_012M,B09010_013E,B09010_013M,Unnamed: 28,Census_Tract
Unnamed: 0_level_1,Geography,Geographic Area Name,Estimate!!Total:,Margin of Error!!Total:,"Estimate!!Total:!!Living in household with Supplemental Security Income (SSI), cash public assistance income, or Food Stamps/SNAP in the past 12 months:","Margin of Error!!Total:!!Living in household with Supplemental Security Income (SSI), cash public assistance income, or Food Stamps/SNAP in the past 12 months:","Estimate!!Total:!!Living in household with Supplemental Security Income (SSI), cash public assistance income, or Food Stamps/SNAP in the past 12 months:!!In family households:","Margin of Error!!Total:!!Living in household with Supplemental Security Income (SSI), cash public assistance income, or Food Stamps/SNAP in the past 12 months:!!In family households:","Estimate!!Total:!!Living in household with Supplemental Security Income (SSI), cash public assistance income, or Food Stamps/SNAP in the past 12 months:!!In family households:!!In married-couple family","Margin of Error!!Total:!!Living in household with Supplemental Security Income (SSI), cash public assistance income, or Food Stamps/SNAP in the past 12 months:!!In family households:!!In married-couple family",...,"Estimate!!Total:!!Living in household with no Supplemental Security Income (SSI), cash public assistance income, or Food Stamps/SNAP in the past 12 months:!!In family households:!!In married-couple family","Margin of Error!!Total:!!Living in household with no Supplemental Security Income (SSI), cash public assistance income, or Food Stamps/SNAP in the past 12 months:!!In family households:!!In married-couple family","Estimate!!Total:!!Living in household with no Supplemental Security Income (SSI), cash public assistance income, or Food Stamps/SNAP in the past 12 months:!!In family households:!!In male householder, no spouse present, family","Margin of Error!!Total:!!Living in household with no Supplemental Security Income (SSI), cash public assistance income, or Food Stamps/SNAP in the past 12 months:!!In family households:!!In male householder, no spouse present, family","Estimate!!Total:!!Living in household with no Supplemental Security Income (SSI), cash public assistance income, or Food Stamps/SNAP in the past 12 months:!!In family households:!!In female householder, no spouse present, family","Margin of Error!!Total:!!Living in household with no Supplemental Security Income (SSI), cash public assistance income, or Food Stamps/SNAP in the past 12 months:!!In family households:!!In female householder, no spouse present, family","Estimate!!Total:!!Living in household with no Supplemental Security Income (SSI), cash public assistance income, or Food Stamps/SNAP in the past 12 months:!!In nonfamily households","Margin of Error!!Total:!!Living in household with no Supplemental Security Income (SSI), cash public assistance income, or Food Stamps/SNAP in the past 12 months:!!In nonfamily households",Unnamed: 28_level_1,Unnamed: 21_level_1
0,1400000US06001400100,Census Tract 4001; Alameda County; California,661,237,105,68,85,63,85,63,...,382,222,96,100,78,53,0,13,,6001400100
1,1400000US06001400200,Census Tract 4002; Alameda County; California,350,44,0,13,0,13,0,13,...,309,47,9,16,32,31,0,13,,6001400200
2,1400000US06001400300,Census Tract 4003; Alameda County; California,942,258,30,49,30,49,30,49,...,609,158,132,173,171,90,0,19,,6001400300
3,1400000US06001400400,Census Tract 4004; Alameda County; California,941,308,134,163,134,163,8,13,...,666,298,31,30,72,65,38,60,,6001400400
4,1400000US06001400500,Census Tract 4005; Alameda County; California,496,102,22,22,14,19,14,19,...,398,109,29,40,47,38,0,13,,6001400500


In [20]:
# Renaming columns from dictionary code to definition
financial_assistance_data = financial_assistance_data.rename(
    columns={'B09010_001E': 'total_children_under_18',
             'B09010_002E': 'estimated_total_children_household_ssi_cash_assistance_or_SNAP_12_months'}
)
# Drop the first row that contains additional info about columns
financial_assistance_data.columns = financial_assistance_data.columns.droplevel(-1)
# Subset for desired columns
filtered_financial_assistance_data = financial_assistance_data.loc[
:,['GEO_ID', 'Census_Tract', 'total_children_under_18', 
   'estimated_total_children_household_ssi_cash_assistance_or_SNAP_12_months']
]
# Create a new column for % of children living in household with financial assistance 
filtered_financial_assistance_data.loc[:,'percent_children_household_financial_assistance'] = pd.to_numeric(
    filtered_financial_assistance_data['estimated_total_children_household_ssi_cash_assistance_or_SNAP_12_months']
) / pd.to_numeric(filtered_financial_assistance_data['total_children_under_18'])
# Convert to percentage
filtered_financial_assistance_data.loc[:,'percent_children_household_financial_assistance'] *= 100

filtered_financial_assistance_data.head(5)

Unnamed: 0,GEO_ID,Census_Tract,total_children_under_18,estimated_total_children_household_ssi_cash_assistance_or_SNAP_12_months,percent_children_household_financial_assistance
0,1400000US06001400100,6001400100,661,105,15.885023
1,1400000US06001400200,6001400200,350,0,0.0
2,1400000US06001400300,6001400300,942,30,3.184713
3,1400000US06001400400,6001400400,941,134,14.24017
4,1400000US06001400500,6001400500,496,22,4.435484


In [21]:
# Saving metric df to .csv file
filtered_financial_assistance_data = filtered_financial_assistance_data.rename(columns={'Census_Tract':'census_tract'})
filtered_financial_assistance_data = filtered_financial_assistance_data[['census_tract', 'percent_children_household_financial_assistance']]
filtered_financial_assistance_data.to_csv('society_financial_assistance_metric.csv')
print('Saving demographic metric data to a .csv')
filtered_financial_assistance_data

Saving demographic metric data to a .csv


Unnamed: 0,census_tract,percent_children_household_financial_assistance
0,6001400100,15.885023
1,6001400200,0.000000
2,6001400300,3.184713
3,6001400400,14.240170
4,6001400500,4.435484
...,...,...
9124,6115040902,13.194444
9125,6115041001,0.000000
9126,6115041002,0.000000
9127,6115041101,44.331984


#### Metric 7: Health Insurance
* though the estimated total code (_001E) is the same as cognitive and ambulatory disability datasets,\
SOME of the values are the same as the demographic data values. Strange, first three match, fourth one does not, so to be safe,
I will impliment the resulting csv in the function below to calculate percent based on the demographic data total population

In [22]:
# Read in cognitive data
health_insurance_data = pd.read_csv('health_insurance_B27010.csv', header=[0,1])
# Making a Census tract column using the GEO_ID column
health_insurance_data['Census_Tract'] = health_insurance_data['GEO_ID', 'Geography'].str[10:]
health_insurance_data.head(5)

Unnamed: 0_level_0,GEO_ID,NAME,B27010_001E,B27010_001M,B27010_002E,B27010_002M,B27010_003E,B27010_003M,B27010_004E,B27010_004M,...,B27010_063E,B27010_063M,B27010_064E,B27010_064M,B27010_065E,B27010_065M,B27010_066E,B27010_066M,Unnamed: 134,Census_Tract
Unnamed: 0_level_1,Geography,Geographic Area Name,Estimate!!Total:,Margin of Error!!Total:,Estimate!!Total:!!Under 19 years:,Margin of Error!!Total:!!Under 19 years:,Estimate!!Total:!!Under 19 years:!!With one type of health insurance coverage:,Margin of Error!!Total:!!Under 19 years:!!With one type of health insurance coverage:,Estimate!!Total:!!Under 19 years:!!With one type of health insurance coverage:!!With employer-based health insurance only,Margin of Error!!Total:!!Under 19 years:!!With one type of health insurance coverage:!!With employer-based health insurance only,...,Estimate!!Total:!!65 years and over:!!With two or more types of health insurance coverage:!!Other private only combinations,Margin of Error!!Total:!!65 years and over:!!With two or more types of health insurance coverage:!!Other private only combinations,Estimate!!Total:!!65 years and over:!!With two or more types of health insurance coverage:!!Other public only combinations,Margin of Error!!Total:!!65 years and over:!!With two or more types of health insurance coverage:!!Other public only combinations,Estimate!!Total:!!65 years and over:!!With two or more types of health insurance coverage:!!Other coverage combinations,Margin of Error!!Total:!!65 years and over:!!With two or more types of health insurance coverage:!!Other coverage combinations,Estimate!!Total:!!65 years and over:!!No health insurance coverage,Margin of Error!!Total:!!65 years and over:!!No health insurance coverage,Unnamed: 134_level_1,Unnamed: 21_level_1
0,1400000US06001400100,Census Tract 4001; Alameda County; California,3269,452,671,236,570,222,289,101,...,0,13,0,13,214,93,0,13,,6001400100
1,1400000US06001400200,Census Tract 4002; Alameda County; California,2147,201,354,44,331,52,308,53,...,0,13,0,13,122,43,0,13,,6001400200
2,1400000US06001400300,Census Tract 4003; Alameda County; California,5619,571,990,249,911,234,889,234,...,0,19,0,19,213,139,0,19,,6001400300
3,1400000US06001400400,Census Tract 4004; Alameda County; California,4259,597,951,305,899,298,694,296,...,0,13,0,13,82,44,0,13,,6001400400
4,1400000US06001400500,Census Tract 4005; Alameda County; California,3949,737,555,125,547,125,461,103,...,0,13,0,13,87,45,32,50,,6001400500


In [23]:
# Renaming columns from dictionary code to definition
filtered_health_insurance_data = health_insurance_data[['GEO_ID', 'Census_Tract']]
# filter data to only include estimated population living with a cognitive disability
filtered_health_insurance_data = pd.concat(
    [filtered_health_insurance_data,
    health_insurance_data.filter(regex=r'Estimate').filter(
    regex=r'No health insurance coverage')], axis=1)
# Display the resulting DataFrame
filtered_health_insurance_data.head(5)

Unnamed: 0_level_0,GEO_ID,Census_Tract,B27010_017E,B27010_033E,B27010_050E,B27010_066E
Unnamed: 0_level_1,Geography,Unnamed: 2_level_1,Estimate!!Total:!!Under 19 years:!!No health insurance coverage,Estimate!!Total:!!19 to 34 years:!!No health insurance coverage,Estimate!!Total:!!35 to 64 years:!!No health insurance coverage,Estimate!!Total:!!65 years and over:!!No health insurance coverage
0,1400000US06001400100,6001400100,0,0,9,0
1,1400000US06001400200,6001400200,0,18,0,0
2,1400000US06001400300,6001400300,0,33,60,0
3,1400000US06001400400,6001400400,52,23,21,0
4,1400000US06001400500,6001400500,8,54,83,32


In [24]:
filtered_health_insurance_data.columns = filtered_health_insurance_data.columns.droplevel(-1)
# Create a new column by summing all columns starting from the second column
filtered_health_insurance_data['sum_without_health_insurance'] = filtered_health_insurance_data.iloc[:, 2:].sum(axis=1).astype(int)
# Display the DataFrame with the new column
filtered_health_insurance_data.head(5)

Unnamed: 0,GEO_ID,Census_Tract,B27010_017E,B27010_033E,B27010_050E,B27010_066E,sum_without_health_insurance
0,1400000US06001400100,6001400100,0,0,9,0,9
1,1400000US06001400200,6001400200,0,18,0,0,18
2,1400000US06001400300,6001400300,0,33,60,0,93
3,1400000US06001400400,6001400400,52,23,21,0,96
4,1400000US06001400500,6001400500,8,54,83,32,177


In [25]:
filtered_health_insurance_data = filtered_health_insurance_data.rename(columns={'Census_Tract':'census_tract'})
without_health_insurance_sum = filtered_health_insurance_data.loc[:,['census_tract', 'sum_without_health_insurance']]
# Saving metric df to .csv file
without_health_insurance_sum.to_csv('without_health_insurance_sum.csv')
print('Saving health insurance metric data to a .csv')
without_health_insurance_sum

Saving health insurance metric data to a .csv


Unnamed: 0,census_tract,sum_without_health_insurance
0,6001400100,9
1,6001400200,18
2,6001400300,93
3,6001400400,96
4,6001400500,177
...,...,...
9124,6115040902,27
9125,6115041001,102
9126,6115041002,530
9127,6115041101,170


#### Calculate % of total population

* pull the acs demographic (dpo5) estimated population csv file from AWS
* iterate through each input csv file that needs to calculate metric percentage
* perform the calculation on the selected csv column (sum/demographic total population) *100
* save new metric as a csv file

In [26]:
@append_metadata
def calculate_acs_metric_percentage(
    input_csv, output_csv, calculate_percentage=True, varname=""
):
    '''
    Calculates the following metrics sourced from the American Community Survey:
    - Ambulatory Difficulty: % of population living with an ambulatory disability
    - Cognitive Difficulty: % of population living with a cognitive disability
    - Financial Assistance: 
        % of population living in a household with Supplemental Security Income (SSI), 
        cash public assistance income, or Food Stamps/SNAP in the last 12 months
    - Health Insurance: % of population without health insurance

    Demography: 
    - % of population aged 65 years or older
    - % of population under 5 years old
    - % of population American Indian and Alaska Native
  
    Methods
    --------
    The estimated population column (DP05_0001E) from ACS dataset DP05 was used as the 
    universal population values when calculating metric percentage for the metrics above. 
    Columns were renamed and summed when a metric's values were separated by age group.

    Parameters
    ------------
    input_csv: string
        Dataframe containing a column with sum of people meeting a metric condition.
    output_csv: string
        Output filename.
    calculate_percentage: boolean
        if true, calculates percentage of input_csv based on a 'sum' column
        if false, skipts percentage calculation and uploads .csv to AWS
    varname: string
        Final metric name. 

    Script
    ------
    society_vulnerable_populations.ipynb

    Note
    ------
    This function assumes users have configured the AWS CLI such that their access key / 
    secret key pair are stored in ~/.aws/credentials. 
    See https://docs.aws.amazon.com/cli/latest/userguide/getting-started-install.html for guidance.
    
    '''
    print('Data transformation: utilizing ACS dataset DP05 population data to calculate metric.')
    print('Data transformation: columns renamed and summed for total metric value.')

    bucket_name = 'ca-climate-index'
    directory = '3_fair_data/index_data'
    cri_est_pop = f"s3://{bucket_name}/0_map_data/cri_acs_demographic_estimated_population.csv"
    cri_tract_est_pop = pd.read_csv(cri_est_pop)
    data = pd.read_csv(input_csv)
    
    if calculate_percentage:
        data['est_total_population'] = cri_tract_est_pop['est_total_pop']
        # get column with raw sum
        sum_column = data.filter(regex=r'sum_').columns[0]
        # name new column for 'percent of total' data
        percent_column_name = sum_column.replace('sum','percent_population')
        # calculate percent of total population from sum
        data[percent_column_name] = (data[sum_column] / data['est_total_population']) * 100
    
        # Reorder columns with the new column at the end
        columns = list(data.columns)
        columns.remove(percent_column_name)  # Remove the column to be placed at the end
        columns.append(percent_column_name)  # Append it to the end
        data = data[columns]
        
    # Save the updated DataFrame to a new CSV file
    data.to_csv(output_csv, index=False)
    upload_csv_aws([output_csv], bucket_name, directory)
    os.remove(input_csv)

In [27]:
# for metrics that need the percent calculation
input_csvs = [
    'ambulatory_disability_sum.csv', 
    'cognitive_disability_sum.csv', 
    'without_health_insurance_sum.csv'
]
output_csvs = [
    'society_ambulatory_disability_metric.csv', 
    'society_cognitive_disability_metric.csv', 
    'society_without_health_insurance_metric.csv'
]
varnames = [
    'society_acs_ambulatory',
    'society_acs_cognitive',
    'society_acs_health_insurance'
]
aa = list(zip(input_csvs,output_csvs,varnames))
for input_csv,output_csv,varname in list(
    zip(input_csvs,output_csvs,varnames)
):
    calculate_acs_metric_percentage(
        input_csv, output_csv, calculate_percentage=True,
        varname=varname
    )

In [28]:
# for metrics that dont need the percent calculation
input_csvs = [
    'society_under_5yo_metric.csv',
    'society_american_indian_alaska_native_metric.csv',
    'society_over_65yo_metric.csv',
    'society_financial_assistance_metric.csv'
]
output_csvs = [
    'society_under_5yo_metric.csv',
    'society_american_indian_alaska_native_metric.csv',
    'society_over_65yo_metric.csv',
    'society_financial_assistance_metric.csv'
]
varnames = [
    'society_acs_demographic_under_5',
    'society_acs_demographic_american_indian',
    'society_acs_demographic_over_65',
    'society_acs_financial_assistance'
]
aa = list(zip(input_csvs,output_csvs,varnames))
for input_csv,output_csv,varname in list(
    zip(input_csvs,output_csvs,varnames)
):
    calculate_acs_metric_percentage(
        input_csv, output_csv, calculate_percentage=False,
        varname=varname
    )

In [30]:
# Delete all CSV files in the current directory that are not in the output folder
current_files = glob.glob('*.csv')
for file in current_files:
    os.remove(file)