# This notebook calculates the climate change health vulnerabilities data metrics
* % of households without air conditioning
* num of vilent crimes per 1,000 people
* % of population aged 16+ working outdoors

In [1]:
import pandas as pd
import os
import sys
import numpy as np

# suppress pandas purely educational warnings
from warnings import simplefilter
simplefilter(action="ignore", category=pd.errors.PerformanceWarning)

sys.path.append(os.path.expanduser('../../'))
from scripts.utils.file_helpers import pull_csv_from_directory, upload_csv_aws
from scripts.utils.write_metadata import append_metadata

In [2]:
# pull csv from aws
bucket_name = 'ca-climate-index'
aws_dir = '1_pull_data/society_economy/vulnerable_populations/climate_change_health_vulnerabilities/'

pull_csv_from_directory(bucket_name, aws_dir, search_zipped=False)

Saved DataFrame as 'selectedCHVIdata.csv'


In [3]:
# read in food access data (already subsetted for CA)
cchvi_data = pd.read_csv('selectedCHVIdata.csv')
print(len(cchvi_data))
#os.remove('selectedCHVIdata.csv')

1128364


In [4]:
cchvi_data

Unnamed: 0,County,FIPS,Region,Definition,Strata,Race,Year,Mean,LL95,UL95,Numerator,Denominator
0,Alameda,6001400100,Bay Area,Annual Mean Ambient Concentration of Fine Part...,none,White,2012-2014,8.697944,,,,2086.0
1,Alameda,6001400100,Bay Area,Annual Mean Ambient Concentration of Fine Part...,none,Total,2012-2014,8.697944,,,,2952.0
2,Alameda,6001400100,Bay Area,Annual Mean Ambient Concentration of Fine Part...,none,AIAN,2009-2011,7.797807,,,,2.0
3,Alameda,6001400100,Bay Area,Annual Mean Ambient Concentration of Fine Part...,none,Asian,2009-2011,7.797807,,,,456.0
4,Alameda,6001400100,Bay Area,Annual Mean Ambient Concentration of Fine Part...,none,AfricanAm,2009-2011,7.797807,,,,140.0
...,...,...,...,...,...,...,...,...,...,...,...,...
1128359,Yuba,6115041100,Northern Central Valley,Population living in sea level rise inundation...,none,Total,2009,0.000000,,,0.0,4941.0
1128360,Yuba,6115041100,Northern Central Valley,Poverty Rate (200% FPL),Overall,White,2011-2015,39.876265,30.344324,49.408207,1418.0,3556.0
1128361,Yuba,6115041100,Northern Central Valley,Poverty Rate (200% FPL),Overall,Total,2011-2015,43.177047,30.793378,55.560716,1851.0,4287.0
1128362,Yuba,6115041100,Northern Central Valley,Projected number of extreme heat days 2040-2060,2040-2060,Total,2040-2060,30.100000,,,36.2,


In [5]:
cchvi_data = cchvi_data[cchvi_data['Race']== 'Total']
unique_entries = cchvi_data['Definition'].unique()
unique_entries

array(['Annual Mean Ambient Concentration of Fine Particulate Matter (PM2.5)',
       'Average Daily Maximum Ozone Concentration',
       'Number of Violent Crimes per 1,000 Population',
       'Percent impervious surface cover',
       'Percent of adults with less than college education',
       'Percent of households with no one aged > 14 years speaking English',
       'Percent of households with no vehicle ownership',
       'Percent of households without air conditioning',
       'Percent of population age less than 5 years',
       'Percent of population aged 65 years or older',
       'Percent of population currently living in very high wildfire risk areas',
       'Percent of population employed and aged > 16 working outdoors',
       'Percent of population with a disability',
       'Percent of population without health insurance',
       'Percent without tree canopy coverage',
       'Population living in sea level rise inundation areas',
       'Poverty Rate (200% FPL)',
   

In [6]:
unique_entries = cchvi_data['Year'].unique()
unique_entries

array(['2012-2014', '2009-2011', '2000', '2001', '2002', '2003', '2004',
       '2005', '2006', '2007', '2008', '2009', '2010', '2011', '2012',
       '2013', '2016', '2006-2010', '2011-2015', '2008-2012', '2009-2013',
       '2040-2060', '2080-2099', nan], dtype=object)

In [7]:
selected_entries = ['2040-2060',
                    '2080-2099',
                    ]

cchvi_data_filtered = cchvi_data[~cchvi_data['Year'].isin(selected_entries)]
# Drop rows with NaN values in 'Year' column
cchvi_data_filtered = cchvi_data_filtered.dropna(subset=['Year'])
unique_entries = cchvi_data_filtered['Year'].unique()

unique_entries

array(['2012-2014', '2009-2011', '2000', '2001', '2002', '2003', '2004',
       '2005', '2006', '2007', '2008', '2009', '2010', '2011', '2012',
       '2013', '2016', '2006-2010', '2011-2015', '2008-2012', '2009-2013'],
      dtype=object)

In [17]:
selected_entries = ['Number of Violent Crimes per 1,000 Population', 
                    'Percent of households without air conditioning', 
                    'Percent of population employed and aged > 16 working outdoors']

cchvi_data_filtered = cchvi_data_filtered[cchvi_data_filtered['Definition'].isin(selected_entries)]
cchvi_data_filtered

Unnamed: 0,County,FIPS,Region,Definition,Strata,Race,Year,Mean,LL95,UL95,Numerator,Denominator
13,Alameda,6001400100,Bay Area,"Number of Violent Crimes per 1,000 Population",ViolentCrime,Total,2000,6.582206,6.449864,6.714547,9503.000000,1.443741e+06
14,Alameda,6001400100,Bay Area,"Number of Violent Crimes per 1,000 Population",ViolentCrime,Total,2001,6.407031,6.277658,6.536403,9422.000000,1.470572e+06
15,Alameda,6001400100,Bay Area,"Number of Violent Crimes per 1,000 Population",ViolentCrime,Total,2002,6.457854,6.329112,6.586596,9666.000000,1.496782e+06
16,Alameda,6001400100,Bay Area,"Number of Violent Crimes per 1,000 Population",ViolentCrime,Total,2003,6.791432,6.659006,6.923857,10104.000000,1.487757e+06
17,Alameda,6001400100,Bay Area,"Number of Violent Crimes per 1,000 Population",ViolentCrime,Total,2004,6.166925,6.040314,6.293536,9114.000000,1.477884e+06
...,...,...,...,...,...,...,...,...,...,...,...,...
1128272,Yuba,6115041100,Northern Central Valley,"Number of Violent Crimes per 1,000 Population",ViolentCrime,Total,2013,3.715238,3.273710,4.156766,272.000000,7.321200e+04
1128304,Yuba,6115041100,Northern Central Valley,Percent of households without air conditioning,none,Total,2009,1.725293,0.000000,3.798216,756.431724,4.384366e+04
1128305,Yuba,6115041100,Northern Central Valley,Percent of households without air conditioning,none,Total,2009,1.725293,0.000000,3.798216,756.431724,4.384366e+04
1128330,Yuba,6115041100,Northern Central Valley,Percent of population employed and aged > 16 w...,none,Total,2011-2015,8.034611,,,130.000000,1.618000e+03


In [18]:
grouping_cchvi = cchvi_data_filtered.groupby(['FIPS', 'Definition'])['Mean'].mean().reset_index()
grouping_cchvi.head(10)

Unnamed: 0,FIPS,Definition,Mean
0,6001400100,"Number of Violent Crimes per 1,000 Population",7.267424
1,6001400100,Percent of households without air conditioning,64.306645
2,6001400100,Percent of population employed and aged > 16 w...,0.919842
3,6001400200,"Number of Violent Crimes per 1,000 Population",7.267424
4,6001400200,Percent of households without air conditioning,64.306645
5,6001400200,Percent of population employed and aged > 16 w...,1.595745
6,6001400300,"Number of Violent Crimes per 1,000 Population",7.267424
7,6001400300,Percent of households without air conditioning,64.306645
8,6001400300,Percent of population employed and aged > 16 w...,1.152702
9,6001400400,"Number of Violent Crimes per 1,000 Population",7.267424


In [19]:
pivot_table = grouping_cchvi.pivot_table(index='FIPS', columns='Definition', values='Mean', aggfunc='mean')
pivot_table.reset_index(inplace=True)
random_rows = pivot_table.sample(n=10)
random_rows

Definition,FIPS,"Number of Violent Crimes per 1,000 Population",Percent of households without air conditioning,Percent of population employed and aged > 16 working outdoors
2773,6037534002,6.666117,33.890887,10.054143
2587,6037481402,6.666117,33.890887,6.351054
7675,6099003904,5.759227,16.226552,12.672505
4246,6059088101,2.587129,28.057946,1.080742
6331,6075016802,7.933275,90.793392,1.213297
7490,6097150202,3.887135,64.495138,4.239129
7197,6085510802,3.170385,35.430875,0.789793
4498,6065030200,4.369377,5.112505,6.002879
3614,6047000202,6.309828,2.869682,18.187081
3622,6047000504,6.309828,2.869682,23.234415


### Importing CA tracts - county data for further clarity on data and the similar entries
* Data has eight thousand tracts, need to translate to get the full nine-thousand
* Decided to merge then fill in the excess empty tracts with the county wide average values for the respective county for the metric columns

In [20]:
county_tract = "s3://ca-climate-index/0_map_data/ca_tracts_county.csv"
ca_county_tract = pd.read_csv(county_tract)
ca_county_tract = ca_county_tract.rename(columns={'TRACT': 'FIPS'})
ca_county_tract

Unnamed: 0.1,Unnamed: 0,FIPS,COUNTYFP,County
0,0,6085504321,85,Santa Clara
1,1,6085504410,85,Santa Clara
2,2,6085507003,85,Santa Clara
3,3,6085507004,85,Santa Clara
4,4,6085502204,85,Santa Clara
...,...,...,...,...
9124,9124,6059001303,59,Orange
9125,9125,6059001304,59,Orange
9126,9126,6059001401,59,Orange
9127,9127,6013367200,13,Contra Costa


In [21]:
# Merge the datasets
cchvi_ca_counties = pd.merge(ca_county_tract, pivot_table, on ='FIPS', how='left')

# Move the 'County' column to the second position
column_to_move = 'County'
col = cchvi_ca_counties.pop(column_to_move)
cchvi_ca_counties.insert(1, column_to_move, col)

# Columns to fill NaN values
columns_to_fill = ['Number of Violent Crimes per 1,000 Population',
                   'Percent of households without air conditioning',
                   'Percent of population employed and aged > 16 working outdoors']

# Add a new column indicating whether a value was originally NaN
original_na_flag_column = 'Original_NA_Flag'
cchvi_ca_counties[original_na_flag_column] = np.where(cchvi_ca_counties[columns_to_fill].isna().any(axis=1), 1, 0)

# Compute average values for each column grouped by 'County'
average_values_by_county = cchvi_ca_counties.groupby('County')[columns_to_fill].transform('mean')

# Fill NaN values in each column with the corresponding average value of that column for the respective 'County'
for column in columns_to_fill:
    na_mask = cchvi_ca_counties[column].isna()
    cchvi_ca_counties.loc[na_mask, column] = average_values_by_county.loc[na_mask, column]

print(len(cchvi_ca_counties))
cchvi_ca_counties.head(10)

9129


Unnamed: 0.1,Unnamed: 0,County,FIPS,COUNTYFP,"Number of Violent Crimes per 1,000 Population",Percent of households without air conditioning,Percent of population employed and aged > 16 working outdoors,Original_NA_Flag
0,0,Santa Clara,6085504321,85,3.170385,35.430875,2.022881,0
1,1,Santa Clara,6085504410,85,3.170385,35.430875,1.144842,0
2,2,Santa Clara,6085507003,85,3.170385,35.430875,4.619098,1
3,3,Santa Clara,6085507004,85,3.170385,35.430875,4.619098,1
4,4,Santa Clara,6085502204,85,3.170385,35.430875,4.619098,1
5,5,Santa Clara,6085502203,85,3.170385,35.430875,4.619098,1
6,6,Santa Clara,6085501902,85,3.170385,35.430875,4.619098,1
7,7,Santa Clara,6085502104,85,3.170385,35.430875,4.619098,1
8,8,Santa Clara,6085502103,85,3.170385,35.430875,4.619098,1
9,9,Santa Clara,6085504424,85,3.170385,35.430875,4.619098,1


In [22]:
cchvi_ca_counties.loc[cchvi_ca_counties['County'] == 'Santa Clara']

Unnamed: 0.1,Unnamed: 0,County,FIPS,COUNTYFP,"Number of Violent Crimes per 1,000 Population",Percent of households without air conditioning,Percent of population employed and aged > 16 working outdoors,Original_NA_Flag
0,0,Santa Clara,6085504321,85,3.170385,35.430875,2.022881,0
1,1,Santa Clara,6085504410,85,3.170385,35.430875,1.144842,0
2,2,Santa Clara,6085507003,85,3.170385,35.430875,4.619098,1
3,3,Santa Clara,6085507004,85,3.170385,35.430875,4.619098,1
4,4,Santa Clara,6085502204,85,3.170385,35.430875,4.619098,1
...,...,...,...,...,...,...,...,...
7911,7911,Santa Clara,6085512313,85,3.170385,35.430875,8.054497,0
7912,7912,Santa Clara,6085512314,85,3.170385,35.430875,12.720899,0
7913,7913,Santa Clara,6085512604,85,3.170385,35.430875,23.006469,0
7914,7914,Santa Clara,6085502603,85,3.170385,35.430875,4.649416,0


In [34]:
retained_columns = ['FIPS', 'County']

violent_crime = cchvi_ca_counties[retained_columns + ['Number of Violent Crimes per 1,000 Population']].copy()
percent_without_ac = cchvi_ca_counties[retained_columns + ['Percent of households without air conditioning']].copy()
percent_working_outdoors = cchvi_ca_counties[retained_columns + ['Percent of population employed and aged > 16 working outdoors']].copy()

In [None]:
violent_crime.to_csv('violent_crime_metric.csv')
percent_without_ac.to_csv('percent_without_ac_metric.csv')
percent_working_outdoors.to_csv('percent_working_outdoors.csv')

### Function Call

In [94]:
#@append_metadata
def cchvi_metric_calc(df, output_csvs, export=False): #, varname = ''):
    '''
    The function calculates the vulnerable population metrics sourced from the California Department of Public Health
    Climate Change and Health Vulnerability Indicators for California. The metrics include:
    
    * Number of Violent Crimes per 1,000 Population	
    * Percent of households without air conditioning	
    * Percent of population employed and aged > 16 working outdoors

    Parameters
    ----------
    df: DataFrame
        The DataFrame containing the CCHVI indicators.
    output_csvs: tuple of strings
        Tuple containing filenames for exporting the DataFrames.
    export: bool, optional
        True to upload resulting DataFrames containing the CCHVI indicator metrics to AWS, False otherwise.

    Methods
    --------
    Relevant columns for Cal-CRAI metrics were isolated from the original dataset.
    Specific entries were omitted to isolate for the CA population.
    Entries within rows were converted to columns for better metric entry/visualization.
    Cal-CRAI tracts were merged in with the data, missing data from the extra tracts 
    were given values for each metric column based on the average values from matching
    County entries.
    Metric columns were isolated to their own dataframe and uploaded to AWS.

    Script
    ------
    society_cchvi_vulnerable_populations.ipynb

    Note
    ------
    This function assumes users have configured the AWS CLI such that their access key / 
    secret key pair are stored in ~/.aws/credentials. 
    See https://docs.aws.amazon.com/cli/latest/userguide/getting-started-install.html for guidance.
    '''

    print('Data transformation: isolated for relevant columns and data enties.')
    df = df[df['Race']== 'Total']
    selected_entries = ['2040-2060', '2080-2099']
    cchvi_data_filtered = df[~df['Year'].isin(selected_entries)]
    cchvi_data_filtered = cchvi_data_filtered.dropna(subset=['Year'])

    selected_entries = ['Number of Violent Crimes per 1,000 Population', 
                        'Percent of households without air conditioning', 
                        'Percent of population employed and aged > 16 working outdoors']

    cchvi_data_filtered = cchvi_data_filtered[cchvi_data_filtered['Definition'].isin(selected_entries)]

    grouping_cchvi = cchvi_data_filtered.groupby(['FIPS', 'Definition'])['Mean'].mean().reset_index()
    
    print('Data transformation: adjust row entries from definition column to be their own columns.')
    pivot_table = grouping_cchvi.pivot_table(index='FIPS', columns='Definition', values='Mean', aggfunc='mean')
    pivot_table.reset_index(inplace=True)

    print('Data transformation: add Cal-CRAI census tract set and fill missing values with average county values.')
    county_tract = "s3://ca-climate-index/0_map_data/ca_tracts_county.csv"
    ca_county_tract = pd.read_csv(county_tract)
    ca_county_tract = ca_county_tract.rename(columns={'TRACT': 'FIPS'})

    # Merge the datasets
    cchvi_ca_counties = pd.merge(ca_county_tract, pivot_table, on ='FIPS', how='left')

    # Move the 'County' column to the second position
    column_to_move = 'County'
    col = cchvi_ca_counties.pop(column_to_move)
    cchvi_ca_counties.insert(1, column_to_move, col)

    # Columns to fill NaN values
    columns_to_fill = ['Number of Violent Crimes per 1,000 Population',
                    'Percent of households without air conditioning',
                    'Percent of population employed and aged > 16 working outdoors']

    # Compute average values for each column grouped by 'County'
    average_values_by_county = cchvi_ca_counties.groupby('County')[columns_to_fill].transform('mean')

    # Fill NaN values in each column with the corresponding average value of that column for the respective 'County'
    for column in columns_to_fill:
        na_mask = cchvi_ca_counties[column].isna()
        cchvi_ca_counties.loc[na_mask, column] = average_values_by_county.loc[na_mask, column]

    retained_columns = ['FIPS', 'County']

    violent_crime = cchvi_ca_counties[retained_columns + ['Number of Violent Crimes per 1,000 Population']].copy()
    percent_without_ac = cchvi_ca_counties[retained_columns + ['Percent of households without air conditioning']].copy()
    percent_working_outdoors = cchvi_ca_counties[retained_columns + ['Percent of population employed and aged > 16 working outdoors']].copy()

    list_of_dfs = [violent_crime, percent_without_ac, percent_working_outdoors]
    directory = '3_fair_data/index_data'
    
    if export:
        for df, output_csv in zip(list_of_dfs, output_csvs):
            # Save the updated DataFrame to a new CSV file
            df.to_csv(output_csv, index=False)
            upload_csv_aws([output_csv], bucket_name, directory)
            os.remove(output_csv)  # Remove local file after upload

    return list_of_dfs

In [95]:
# pull csv from aws
bucket_name = 'ca-climate-index'
aws_dir = '1_pull_data/society_economy/vulnerable_populations/climate_change_health_vulnerabilities/'
pull_csv_from_directory(bucket_name, aws_dir, search_zipped=False)

# read in food access data (already subsetted for CA)
cchvi_data = pd.read_csv('selectedCHVIdata.csv')

output_csvs = ('society_vulnerable_violent_crime_metric.csv','society_vulnerable_percent_without_ac_metric.csv','society_vulnerable_percent_working_outdoors.csv')
varname = ['society_cdph_violent_crimes', 'society_cdph_air_conditioning_access', 'society_cdph_working_outdoors']
cchvi_metric_calc(cchvi_data, output_csvs, export=True) #, varname = varname)

Saved DataFrame as 'selectedCHVIdata.csv'
society_vulnerable_violent_crime_metric.csv uploaded to AWS
society_vulnerable_percent_without_ac_metric.csv uploaded to AWS
society_vulnerable_percent_working_outdoors.csv uploaded to AWS


[            FIPS        County  Number of Violent Crimes per 1,000 Population
 0     6085504321   Santa Clara                                       3.170385
 1     6085504410   Santa Clara                                       3.170385
 2     6085507003   Santa Clara                                       3.170385
 3     6085507004   Santa Clara                                       3.170385
 4     6085502204   Santa Clara                                       3.170385
 ...          ...           ...                                            ...
 9124  6059001303        Orange                                       2.587129
 9125  6059001304        Orange                                       2.587129
 9126  6059001401        Orange                                       2.587129
 9127  6013367200  Contra Costa                                       4.276592
 9128  6037578100   Los Angeles                                       6.666117
 
 [9129 rows x 3 columns],
             FIPS       