# This notebook calculates the climate change health vulnerabilities data metrics
* % of households without air conditioning
* number of violent crimes per 10,000 people
* % of population aged 16+ working outdoors

In [1]:
import pandas as pd
import os
import sys
import numpy as np

# Silence pandas PerformanceWarning messages so they do not clutter the
# notebook output.
from warnings import simplefilter
simplefilter(action="ignore", category=pd.errors.PerformanceWarning)

# Put the directory two levels up on the import path so the project-local
# `scripts` package can be imported (presumably the repository root -- confirm).
# NOTE(review): os.path.expanduser only expands a leading '~', so it returns
# '../../' unchanged here.
sys.path.append(os.path.expanduser('../../'))
from scripts.utils.file_helpers import pull_csv_from_directory, upload_csv_aws
from scripts.utils.write_metadata import append_metadata

In [None]:
# Pull the raw CCHVI csv from the project's S3 bucket.
# `output` is presumably the local folder the file lands in (the data is later
# read from 'cchvi/') -- confirm against pull_csv_from_directory.
bucket_name = 'ca-climate-index'
aws_dir = '1_pull_data/society_economy/vulnerable_populations/climate_change_health_vulnerabilities/'
output = 'cchvi'

# search_zipped=False: presumably skips searching inside zip archives -- TODO confirm
pull_csv_from_directory(bucket_name, aws_dir, output, search_zipped=False)

In [None]:
# read in the CCHVI indicator data from the local 'cchvi' folder and report its row count

cchvi_data = pd.read_csv(r'cchvi/selectedCHVIdata.csv')
print(len(cchvi_data))
# NOTE(review): local cleanup is disabled; the path below also lacks the 'cchvi/' folder the file is read from
#os.remove('selectedCHVIdata.csv')

In [None]:
# preview the raw CCHVI table
cchvi_data

In [None]:
# keep only the rows whose 'Race' value is 'Total', then list the
# indicator definitions that remain
cchvi_data = cchvi_data.loc[cchvi_data['Race'].eq('Total')]
unique_entries = pd.unique(cchvi_data['Definition'])
unique_entries

In [None]:
# list the distinct values found in the 'Year' column
unique_entries = pd.unique(cchvi_data['Year'])
unique_entries

In [None]:
# 'Year' labels to exclude from the analysis
selected_entries = ['2040-2060', '2080-2099']

# remove the excluded periods, then discard rows that have no 'Year' value
cchvi_data_filtered = (
    cchvi_data
    .loc[~cchvi_data['Year'].isin(selected_entries)]
    .dropna(subset=['Year'])
)

# confirm which 'Year' values are left
unique_entries = cchvi_data_filtered['Year'].unique()
unique_entries

In [None]:
# indicator definitions that feed the three metrics computed in this notebook
selected_entries = [
    'Number of Violent Crimes per 1,000 Population',
    'Percent of households without air conditioning',
    'Percent of population employed and aged > 16 working outdoors',
]

# keep only the rows describing one of those indicators
cchvi_data_filtered = cchvi_data_filtered.loc[
    cchvi_data_filtered['Definition'].isin(selected_entries)
]
cchvi_data_filtered

In [None]:
# average the 'Mean' value for every (FIPS, Definition) pair and return the
# group keys to ordinary columns
grouping_cchvi = (
    cchvi_data_filtered
    .groupby(['FIPS', 'Definition'])['Mean']
    .agg('mean')
    .reset_index()
)
grouping_cchvi.head(10)

In [None]:
# Spot-check the grouped values for a single FIPS code
testing_fips = grouping_cchvi.loc[grouping_cchvi['FIPS'].eq(6085512037)]

# Show the matching rows
print(testing_fips)

In [None]:
# reshape to wide form: one row per FIPS, one column per indicator definition
pivot_table = pd.pivot_table(
    grouping_cchvi,
    index='FIPS',
    columns='Definition',
    values='Mean',
    aggfunc='mean',
).reset_index()

# look over a random sample of ten rows
random_rows = pivot_table.sample(n=10)
random_rows

### Importing the CA tract-to-county lookup to fill in tracts missing from the CCHVI data
* The CCHVI data has about eight thousand tracts and must be mapped onto the full set of about nine thousand California tracts
* Approach: merge onto the full tract list, then fill each metric column for the tracts with no data using the county-wide average for the respective county

In [None]:
# load the California tract/county lookup and rename 'TRACT' to 'FIPS' so it
# shares a key name with the CCHVI pivot table
county_tract = "s3://ca-climate-index/0_map_data/ca_tracts_county.csv"
ca_county_tract = pd.read_csv(county_tract).rename(columns={'TRACT': 'FIPS'})
ca_county_tract

In [None]:
# Left-join the CCHVI metrics onto the tract/county lookup so every row of the
# lookup is kept, whether or not it has CCHVI data
cchvi_ca_counties = ca_county_tract.merge(pivot_table, how='left', on='FIPS')

# Reposition 'County' as the second column
column_to_move = 'County'
cchvi_ca_counties.insert(1, column_to_move, cchvi_ca_counties.pop(column_to_move))

# Metric columns that may be empty after the join
columns_to_fill = [
    'Number of Violent Crimes per 1,000 Population',
    'Percent of households without air conditioning',
    'Percent of population employed and aged > 16 working outdoors',
]

# Record (1/0) which rows had at least one missing metric before any filling
original_na_flag_column = 'Original_NA_Flag'
had_missing_metric = cchvi_ca_counties[columns_to_fill].isna().any(axis=1)
cchvi_ca_counties[original_na_flag_column] = np.where(had_missing_metric, 1, 0)

# County mean of each metric, broadcast back to one value per row
average_values_by_county = (
    cchvi_ca_counties.groupby('County')[columns_to_fill].transform('mean')
)

# Replace each missing metric value with the mean of that metric for the row's county
cchvi_ca_counties[columns_to_fill] = (
    cchvi_ca_counties[columns_to_fill].fillna(average_values_by_county)
)

print(len(cchvi_ca_counties))
cchvi_ca_counties.head(10)

In [None]:
# inspect the rows whose 'County' is 'Santa Clara'
cchvi_ca_counties[cchvi_ca_counties['County'].eq('Santa Clara')]

In [15]:
# identifier columns carried into every metric table
retained_columns = ['FIPS', 'County']

# violent crime: rescale the per-1,000 rate to a per-10,000 rate
violent_crime = cchvi_ca_counties[retained_columns].copy()
violent_crime['Number of Violent Crimes per 10,000 Population'] = (
    cchvi_ca_counties['Number of Violent Crimes per 1,000 Population'] * 10
)

# the two percentage metrics are carried over unchanged
percent_without_ac = cchvi_ca_counties[
    [*retained_columns, 'Percent of households without air conditioning']
].copy()
percent_working_outdoors = cchvi_ca_counties[
    [*retained_columns, 'Percent of population employed and aged > 16 working outdoors']
].copy()

In [None]:
# write each metric table to its own local csv (default to_csv settings, so the
# row index is written as the first column)
for metric_frame, csv_name in [
    (violent_crime, 'society_vulnerable_violent_crime_metric.csv'),
    (percent_without_ac, 'society_vulnerable_percent_without_ac_metric.csv'),
    (percent_working_outdoors, 'society_vulnerable_percent_working_outdoors_metric.csv'),
]:
    metric_frame.to_csv(csv_name)

### Function Call

In [17]:
@append_metadata
def cchvi_metric_calc(input_csv, export=False, varname=''):
    '''
    The function calculates the vulnerable population metrics sourced from the California Department of Public Health
    Climate Change and Health Vulnerability Indicators for California. The metrics include:
    
    * Number of Violent Crimes per 10,000 Population    
    * Percent of households without air conditioning    
    * Percent of population employed and aged > 16 working outdoors

    Parameters
    ----------
    input_csv: string
        Filename of the local csv containing one CCHVI indicator metric.
    export: bool, optional
        True to upload the csv to AWS and then delete the local copy,
        False (default) to only print the data transformation steps.
    varname: string, optional
        Variable name associated with the metric. It is not read inside this function;
        presumably it is consumed by the append_metadata decorator (verify against
        scripts.utils.write_metadata).

    Methods
    --------
    Relevant columns for Cal-CRAI metrics were isolated from the original dataset.
    Specific entries were omitted to isolate for the CA population.
    Entries within rows were converted to columns for better metric entry/visualization.
    Cal-CRAI tracts were merged in with the data, missing data from the extra tracts 
    were given values for each metric column based on the average values from matching
    county entries.
    Metric columns were isolated to their own dataframe and uploaded to AWS.

    Script
    ------
    society_cchvi_vulnerable_populations.ipynb

    Note
    ------
    This function assumes users have configured the AWS CLI such that their access key / 
    secret key pair are stored in ~/.aws/credentials. 
    See https://docs.aws.amazon.com/cli/latest/userguide/getting-started-install.html for guidance.
    '''
    if not export:
        # Dry run: nothing is uploaded or deleted. The messages (including the
        # final "uploaded to AWS" line) are kept verbatim because they are
        # presumably captured by the append_metadata decorator -- confirm
        # before rewording them.
        print('Data transformation: isolated dataset for metric related columns.')
        print('Data transformation: adjust row entries from definition column to be their own columns.')
        print('Data transformation: add Cal-CRAI census tract set and fill missing values with average county values.')
        print('Data transformation: multiply the violent crimes per 1,000 people column by 10 to calculate Cal-CRAI metric per 10,000.')
        print(f'{input_csv} uploaded to AWS.')
        return None

    # Export run: upload the csv, then remove the local copy.
    bucket_name = 'ca-climate-index'
    upload_csv_aws([input_csv], bucket_name, '3_fair_data/index_data')
    os.remove(input_csv)  # Remove local file after upload

In [None]:
# local metric csv files passed to cchvi_metric_calc, one per metric
input_csvs = ['society_vulnerable_violent_crime_metric.csv',
               'society_vulnerable_percent_without_ac_metric.csv',
               'society_vulnerable_percent_working_outdoors_metric.csv']

# variable name paired with each csv (same order as input_csvs)
varnames = ['society_cdph_violent_crimes',
            'society_cdph_air_conditioning_access',
            'society_cdph_working_outdoors']

# Call the metric function once per csv/varname pair.
# NOTE: export=False is passed here, so the function body only prints its
# transformation steps and does not upload; pass export=True to upload to AWS.
for input_csv, varname in zip(input_csvs, varnames):
    print(f'Processing {input_csv} with varname {varname}')
    cchvi_metric_calc(input_csv, export=False, varname=varname)

print('Processing complete.')