# This notebook calculates the personal preparedness metrics sourced from FEMA
* % of survey respondents who can live in their homes for at least 3 days without running water
* % of survey respondents who can live in their homes for at least 3 days without power
* % of survey respondents who have engaged in at least one preparedness action in the last year

In [20]:
import pandas as pd
import os
import sys
import math
import numpy as np
import geopandas as gpd

# suppress pandas purely educational warnings
from warnings import simplefilter
simplefilter(action="ignore", category=pd.errors.PerformanceWarning)

sys.path.append(os.path.expanduser('../../'))
from scripts.utils.file_helpers import pull_csv_from_directory, upload_csv_aws, filter_counties
from scripts.utils.write_metadata import append_metadata

In [2]:
# Fetch the FEMA household survey subset from the project's S3 bucket.
# `search_zipped=False` is passed through to the helper unchanged.
aws_dir = '2a_subset/governance/personal_preparedness/fema/fema_national_household_survey/'
bucket_name = 'ca-climate-index'

pull_csv_from_directory(bucket_name, aws_dir, search_zipped=False)

Saved DataFrame as 'fema_prepareness_survey_subset.csv'


In [3]:
# Load the survey subset CSV from the working directory into a DataFrame
survey_csv = 'fema_prepareness_survey_subset.csv'
fema_household_data = pd.read_csv(survey_csv)

In [4]:
# Preview the loaded survey responses
fema_household_data

Unnamed: 0,Unique Respondent ID (IDs are consistent between NHS General Data and NHs Hazards Data. Respondents within the hazard dataset have the same IDs in the general dataset).,Oversample Hazard type,General Sample Weight,Phone/Online,Online/Cellphone/Landline Phone,What is the name of the state or territory you live in?,Census Division,Census Region,What is your ZIP Code?,What county in [state] do you live in?,...,"Generated column - rurality calculated using ZIP code, county, and state",Demographics imputed (Yes/No),Demographics imputed (Yes/No).1,Demographics imputed (Yes/No).2,Demographics imputed (Yes/No).3,Demographics imputed (Yes/No).4,Demographics imputed (Yes/No).5,Demographics imputed (Yes/No).6,Demographics imputed (Yes/No).7,Language in which survey was completed
0,6,Active Shooter,1.932171,Online,Online,California,Pacific,West,94550,Alameda,...,Urban,No,No,No,No,No,No,No,No,English
1,8,Active Shooter,1.066004,Online,Online,California,Pacific,West,95841,Sacramento,...,Urban,No,No,No,No,No,No,No,No,English
2,11,Active Shooter,3.601788,Online,Online,California,Pacific,West,92708,Orange,...,Urban,No,No,No,No,No,No,No,No,English
3,14,Active Shooter,0.464613,Online,Online,California,Pacific,West,90028,Los Angeles,...,Urban,No,No,No,No,No,No,No,No,English
4,15,Active Shooter,0.818878,Online,Online,California,Pacific,West,95835,Sacramento,...,Urban,No,No,No,No,No,No,No,No,English
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
711,7119,,1.841526,Phone,Landline phone,California,Pacific,West,95687,Solano,...,Urban,No,No,No,No,No,No,No,No,English
712,7130,,0.618372,Phone,Landline phone,California,Pacific,West,90405,Los Angeles,...,Urban,No,No,No,No,No,No,No,No,English
713,7132,,0.660724,Phone,Landline phone,California,Pacific,West,90403,Los Angeles,...,Urban,No,No,No,No,No,No,No,No,English
714,7133,,1.772459,Phone,Landline phone,California,Pacific,West,92346,San Bernardino,...,Urban,No,No,No,No,No,No,No,No,English


In [5]:
# Remove pandas' cap on the number of columns shown when frames are displayed,
# then list the survey's column labels.
pd.options.display.max_columns = None
print(fema_household_data.columns)

Index(['Unique Respondent ID (IDs are consistent between NHS General Data and NHs Hazards Data. Respondents within the hazard dataset have the same IDs in the general dataset).',
       'Oversample Hazard type', 'General Sample Weight', 'Phone/Online',
       'Online/Cellphone/Landline Phone',
       'What is the name of the state or territory you live in?',
       'Census Division', 'Census Region', 'What is your ZIP Code?',
       'What county in [state] do you live in? ',
       ...
       'Generated column - rurality calculated using ZIP code, county, and state',
       'Demographics imputed (Yes/No)', 'Demographics imputed (Yes/No).1',
       'Demographics imputed (Yes/No).2', 'Demographics imputed (Yes/No).3',
       'Demographics imputed (Yes/No).4', 'Demographics imputed (Yes/No).5',
       'Demographics imputed (Yes/No).6', 'Demographics imputed (Yes/No).7',
       'Language in which survey was completed'],
      dtype='object', length=368)


In [6]:
# Keep the county field plus the three survey questions behind our metrics.
# NOTE: several of these headers end with a trailing space; they must match the data verbatim.
selected_columns = [
    'What county in [state] do you live in? ',
    'How long could you live in your home without power?',
    'How long could you live in your home without running water? ',
    'Thinking about preparing yourself for a disaster, which of the following best represents your degree of preparedness? ',
]

selected_fema_household_data = fema_household_data.loc[:, selected_columns]
selected_fema_household_data

Unnamed: 0,What county in [state] do you live in?,How long could you live in your home without power?,How long could you live in your home without running water?,"Thinking about preparing yourself for a disaster, which of the following best represents your degree of preparedness?"
0,Alameda,3 days to 1 week,Don't know,"I am NOT prepared, but I intend to get prepare..."
1,Sacramento,1 to 3 days,1 to 3 days,"I am NOT prepared, and I do not intend to prep..."
2,Orange,Don't know,3 days to 1 week,I have been prepared for MORE than a year and ...
3,Los Angeles,3 days to 1 week,3 days to 1 week,"I am NOT prepared, but I intend to get prepare..."
4,Sacramento,3 days to 1 week,1 to 3 days,"I am NOT prepared, but I intend to get prepare..."
...,...,...,...,...
711,Solano,More than 1 month,More than 2 weeks,I have been prepared for MORE than a year and ...
712,Los Angeles,More than 1 week,More than 1 week,"I am NOT prepared, and I do not intend to prep..."
713,Los Angeles,More than 1 week,More than 1 week,I have been prepared for MORE than a year and ...
714,San Bernardino,1 to 3 days,3 days to 1 week,I have been prepared for MORE than a year and ...


In [31]:
# Pull out Alameda's responses so the county-level metrics computed below can be hand-checked
county_field = 'What county in [state] do you live in? '
in_alameda = selected_fema_household_data[county_field].eq('Alameda')
alameda = selected_fema_household_data.loc[in_alameda]
alameda

Unnamed: 0,What county in [state] do you live in?,How long could you live in your home without power?,How long could you live in your home without running water?,"Thinking about preparing yourself for a disaster, which of the following best represents your degree of preparedness?"
0,Alameda,3 days to 1 week,Don't know,"I am NOT prepared, but I intend to get prepare..."
34,Alameda,More than 1 week,3 days to 1 week,I have been prepared for MORE than a year and ...
72,Alameda,3 days to 1 week,3 days to 1 week,"I am NOT prepared, but I intend to start prepa..."
96,Alameda,More than 1 week,More than 1 week,I have been prepared for LESS than a year
122,Alameda,More than 2 weeks,3 days to 1 week,"I am NOT prepared, and I do not intend to prep..."
178,Alameda,1 to 3 days,3 days to 1 week,I have been prepared for MORE than a year and ...
184,Alameda,1 to 3 days,3 days to 1 week,"I am NOT prepared, and I do not intend to prep..."
269,Alameda,1 to 3 days,Less than 1 day,I have been prepared for LESS than a year
311,Alameda,1 to 3 days,1 to 3 days,I have been prepared for MORE than a year and ...
351,Alameda,3 days to 1 week,1 to 3 days,"I am NOT prepared, but I intend to get prepare..."


In [7]:
# List the distinct answers recorded for each selected column, to decide which responses to count
unique_values = {
    name: answers.unique()
    for name, answers in selected_fema_household_data.items()
}
unique_values

{'What county in [state] do you live in? ': array(['Alameda', 'Sacramento', 'Orange', 'Los Angeles', 'San Diego',
        'Tulare', 'Humboldt', 'Alpine', 'Sonoma', 'Mendocino',
        'San Francisco', 'Santa Clara', 'San Bernardino', 'Marin',
        'Ventura', 'Merced', 'Fresno', 'El Dorado', 'San Mateo', 'Solano',
        'Yuba', 'Santa Barbara', 'Nevada', 'Inyo', 'Monterey',
        'San Joaquin', 'Contra Costa', 'Calaveras', 'Kern', 'Riverside',
        'Kings', 'Shasta', 'Butte', 'Stanislaus', 'Yolo', 'Madera',
        'Imperial', 'Tuolumne', 'Plumas', 'San Benito', 'Trinity',
        'Santa Cruz', 'Placer', 'Lassen', 'San Luis Obispo', 'Lake',
        'Siskiyou', 'Amador', 'Sutter'], dtype=object),
 'How long could you live in your home without power?': array(['3 days to 1 week', '1 to 3 days', "Don't know", 'Less than 1 day',
        'More than 3 months', 'More than 1 week', 'More than 1 month',
        'More than 2 weeks'], dtype=object),
 'How long could you live in your home

In [32]:
# Answers that do NOT count as "prepared", keyed by survey question.
# NOTE(review): any response outside these lists -- including a missing value -- is scored 1 (prepared).
conditions = {
    'How long could you live in your home without power?': ['1 to 3 days', "Don't know", 'Less than 1 day'],
    'How long could you live in your home without running water? ': ["Don't know", 'Less than 1 day', '1 to 3 days'],
    'Thinking about preparing yourself for a disaster, which of the following best represents your degree of preparedness? ': [
        'I am NOT prepared, but I intend to get prepared in the next six months',
        'I am NOT prepared, and I do not intend to prepare in the next year',
        'I am NOT prepared, but I intend to start preparing in the next year',
        "Don't know"
    ]
}

county_field = 'What county in [state] do you live in? '

# Start from the county column, then add one 0/1 indicator column per question
boolean_df = selected_fema_household_data[[county_field]].copy()
for column, invalid_answers in conditions.items():
    not_prepared = selected_fema_household_data[column].isin(invalid_answers)
    boolean_df[column] = (~not_prepared).astype('int64')

boolean_df

Unnamed: 0,What county in [state] do you live in?,How long could you live in your home without power?,How long could you live in your home without running water?,"Thinking about preparing yourself for a disaster, which of the following best represents your degree of preparedness?"
0,Alameda,1,0,0
1,Sacramento,0,0,0
2,Orange,0,1,1
3,Los Angeles,1,1,0
4,Sacramento,1,0,0
...,...,...,...,...
711,Solano,1,1,1
712,Los Angeles,1,1,0
713,Los Angeles,1,1,1
714,San Bernardino,0,1,1


In [15]:
# Swap the verbose survey headers for short snake_case names used in the rest of the notebook
short_names = {
    'What county in [state] do you live in? ': 'county',
    'How long could you live in your home without power?': 'prepared_without_power',
    'How long could you live in your home without running water? ': 'prepared_without_running_water',
    'Thinking about preparing yourself for a disaster, which of the following best represents your degree of preparedness? ': 'general_preparedness',
}
boolean_df = boolean_df.rename(columns=short_names)

In [16]:
# Confirm the renamed columns
boolean_df

Unnamed: 0,county,prepared_without_power,prepared_without_running_water,general_preparedness
0,Alameda,1,0,0
1,Sacramento,0,0,0
2,Orange,0,1,1
3,Los Angeles,1,1,0
4,Sacramento,1,0,0
...,...,...,...,...
711,Solano,1,1,1
712,Los Angeles,1,1,0
713,Los Angeles,1,1,1
714,San Bernardino,0,1,1


In [17]:
# Tally, per county, how many respondents were scored as prepared for each metric
# and how many respondents the county has in total.
# (The original cell ran this aggregation twice and discarded the first result.)
metric_columns = [
    'prepared_without_power',
    'prepared_without_running_water',
    'general_preparedness',
]
county_groups = boolean_df.groupby('county')

# Sum of the 0/1 indicators = number of "prepared" respondents in each county
grouped_df = county_groups[metric_columns].sum()

# Number of rows per county; aligns with grouped_df because both are indexed by county
grouped_df['county_count'] = county_groups.size()

# Move 'county' back from the index into a regular column (no in-place mutation,
# so re-running the cell always starts from boolean_df)
grouped_df = grouped_df.reset_index()

grouped_df

Unnamed: 0,county,prepared_without_power,prepared_without_running_water,general_preparedness,county_count
0,Alameda,12,12,11,21
1,Alpine,0,0,1,1
2,Amador,2,2,1,2
3,Butte,3,3,3,5
4,Calaveras,5,7,6,7
5,Contra Costa,8,8,10,18
6,El Dorado,2,1,1,2
7,Fresno,11,8,8,21
8,Humboldt,3,2,2,3
9,Imperial,4,5,0,5


In [18]:
# Express each per-county tally as a percentage of that county's respondents
respondents = grouped_df['county_count']
fema_household_metric = pd.DataFrame({
    'county': grouped_df['county'],
    'county_count': respondents,
    'percent_prepared_without_power': grouped_df['prepared_without_power'].div(respondents).mul(100),
    'percent_prepared_without_water': grouped_df['prepared_without_running_water'].div(respondents).mul(100),
    'percent_prepared_for_general_disaster': grouped_df['general_preparedness'].div(respondents).mul(100),
})

fema_household_metric

Unnamed: 0,county,county_count,percent_prepared_without_power,percent_prepared_without_water,percent_prepared_for_general_disaster
0,Alameda,21,57.142857,57.142857,52.380952
1,Alpine,1,0.0,0.0,100.0
2,Amador,2,100.0,100.0,50.0
3,Butte,5,60.0,60.0,60.0
4,Calaveras,7,71.428571,100.0,85.714286
5,Contra Costa,18,44.444444,44.444444,55.555556
6,El Dorado,2,100.0,50.0,50.0
7,Fresno,21,52.380952,38.095238,38.095238
8,Humboldt,3,100.0,66.666667,66.666667
9,Imperial,5,80.0,100.0,0.0


In [25]:
# Run the project's county validation on the metric table and display the second
# returned frame (presumably the rows the helper rejected -- confirm against filter_counties)
county_check = filter_counties(fema_household_metric, 'county')
filtered_counties, omitted_rows = county_check
omitted_rows

Unnamed: 0,county,county_count,percent_prepared_without_power,percent_prepared_without_water,percent_prepared_for_general_disaster


In [23]:
# Read the California tract/county lookup from S3, drop the columns we do not need,
# and lower-case the two remaining column names for the merge below
tract_lookup_path = "s3://ca-climate-index/0_map_data/ca_tracts_county.csv"
ca_tract_county = (
    gpd.read_file(tract_lookup_path)
    .drop(columns=['field_1', 'geometry', 'COUNTYFP'])
    .rename(columns={'TRACT': 'tract', 'County': 'county'})
)

In [24]:
# Inspect the tract/county lookup after dropping and renaming columns
ca_tract_county

Unnamed: 0,tract,county
0,06085504321,Santa Clara
1,06085504410,Santa Clara
2,06085507003,Santa Clara
3,06085507004,Santa Clara
4,06085502204,Santa Clara
...,...,...
9124,06059001303,Orange
9125,06059001304,Orange
9126,06059001401,Orange
9127,06013367200,Contra Costa


In [26]:
# Left-join the county metrics onto the tract list so every tract carries its county's values;
# tracts whose county has no row in the metric table are kept with missing values
fema_household_merge = pd.merge(
    left=ca_tract_county,
    right=fema_household_metric,
    how='left',
    on='county',
)
fema_household_merge

Unnamed: 0,tract,county,county_count,percent_prepared_without_power,percent_prepared_without_water,percent_prepared_for_general_disaster
0,06085504321,Santa Clara,24.0,54.166667,50.000000,45.833333
1,06085504410,Santa Clara,24.0,54.166667,50.000000,45.833333
2,06085507003,Santa Clara,24.0,54.166667,50.000000,45.833333
3,06085507004,Santa Clara,24.0,54.166667,50.000000,45.833333
4,06085502204,Santa Clara,24.0,54.166667,50.000000,45.833333
...,...,...,...,...,...,...
9124,06059001303,Orange,33.0,54.545455,48.484848,54.545455
9125,06059001304,Orange,33.0,54.545455,48.484848,54.545455
9126,06059001401,Orange,33.0,54.545455,48.484848,54.545455
9127,06013367200,Contra Costa,18.0,44.444444,44.444444,55.555556


In [27]:
# One frame per metric, each carrying the shared tract / county / sample-size columns
id_columns = ['tract', 'county', 'county_count']
df_percent_power = fema_household_merge[id_columns + ['percent_prepared_without_power']]
df_percent_water = fema_household_merge[id_columns + ['percent_prepared_without_water']]
df_percent_general = fema_household_merge[id_columns + ['percent_prepared_for_general_disaster']]

In [29]:
# Write every metric frame to its own CSV in the working directory, without the row index
metric_frames = {
    'percent_prepared_without_power.csv': df_percent_power,
    'percent_prepared_without_water.csv': df_percent_water,
    'percent_prepared_for_general_disaster.csv': df_percent_general,
}
for csv_name, metric_frame in metric_frames.items():
    metric_frame.to_csv(csv_name, index=False)

## Function Call

In [33]:
@append_metadata
def fema_household_survey_upload(input_csv, export=False, varname=''):
    '''
    Uploads the disaster preparation metrics to S3 bucket. The metrics are:
    
    * % of survey respondents who can live in their homes for at least 3 days without running water
    * % of survey respondents who can live in their homes for at least 3 days without power
    * % of survey respondents who have engaged in at least one preparedness action in the last year

    Note: Sample size for this dataset is fairly small

    Data for this metric was sourced from the Federal Emergency Management Agency at:
    https://www.fema.gov/about/openfema/data-sets#emergency

    Methods
    -------
    Relevant data columns were isolated, some were renamed for later merging with California tract data.
    Specific answers to selected survey questions were isolated to represent participant preparedness.
    Answers were grouped by county and summed for total 'preparedness' and total answers.
    Percentage was calculated for the three survey questions and merged to California tract data.
    
    Parameters
    ----------
    input_csv: string
        csv disaster preparedness survey metric data 
    export: True/False boolean
        False = will not upload resulting df containing CAL CRAI disaster preparedness survey metric to AWS
        True = will upload resulting df containing CAL CRAI disaster preparedness survey metric to AWS
    varname: string
        not used in the function body; presumably consumed by the append_metadata decorator
        to label the metadata entry (confirm against scripts.utils.write_metadata)

    Script
    ------
    governance_fema_household_survey.ipynb

    Note:
    This function assumes users have configured the AWS CLI such that their access key / secret key pair are stored in ~/.aws/credentials.
    See https://docs.aws.amazon.com/cli/latest/userguide/getting-started-install.html for guidance.
    '''
    # NOTE(review): 'contaminants' below looks copied from another notebook; this dataset has none.
    # The text is left unchanged in case the metadata decorator records printed output.
    print('Data transformation: relevant columns and contaminants were isolated and renamed.')
    print('Data transformation: responses to specific columns were summed, including total responses.')
    print('Data transformation: percent preparedness was calculated for each metric.')

    if export:
        bucket_name = 'ca-climate-index'
        directory = '3_fair_data/index_data'
        # upload_csv_aws is called with a list of file names, so wrap the single CSV
        export_filename = [input_csv]
        upload_csv_aws(export_filename, bucket_name, directory)
    else:
        # Previously this branch reported the file as uploaded even though no upload happened
        print(f'{input_csv} was not uploaded to AWS (export=False).')

    # Local cleanup of the exported CSV is intentionally disabled:
    # if os.path.exists(input_csv):
    #     os.remove(input_csv)

In [35]:
# Metric CSVs written earlier in the notebook, paired by position with the
# variable name each one should be recorded under
input_csv = ['percent_prepared_without_water.csv', 
             'percent_prepared_without_power.csv',
             'percent_prepared_for_general_disaster.csv']

varnames = ['governance_fema_household_water_resilience',
            'governance_fema_household_power_resilience',
            'governance_fema_household_preparedness']

bucket_name = 'ca-climate-index'
directory = '3_fair_data/index_data'

# Pass each file's own variable name instead of the leftover 'test' placeholder,
# which labelled all three uploads identically and left `varnames` unused
for csv_name, var in zip(input_csv, varnames):
    fema_household_survey_upload(csv_name, export=True, varname=var)