## Cal-CRAI Metric Calculation
Domain: Society & Economy \
Indicator: Economic health

This notebook calculates one metric, sourced from the Bureau of Labor Statistics:
* Metric 1: Hachman Index

In [1]:
import pandas as pd
import os
import sys
import numpy as np
import geopandas as gpd

# suppress pandas purely educational warnings
from warnings import simplefilter
simplefilter(action="ignore", category=pd.errors.PerformanceWarning)

sys.path.append(os.path.expanduser('../../'))
from scripts.utils.file_helpers import pull_csv_from_directory, upload_csv_aws, filter_counties, data_stats_check
from scripts.utils.write_metadata import append_metadata

In [2]:
# Define a function to display data for a specific county
def display_county_data(df, county_col, county_name):
    """
    Show the rows of a dataframe that belong to a single county.

    Parameters
    ----------
    df: pandas DataFrame
        table to search
    county_col: string
        name of the column holding the county names
    county_name: string
        county to look up (exact, case-sensitive match)
    """
    matches = df.loc[df[county_col] == county_name]

    # Guard clause: report and stop when the county is absent
    if matches.empty:
        print(f"No data found for {county_name}")
        return

    print(f"Data for {county_name}:")
    display(matches)

In [None]:
# Download the input csv files from AWS: first the Hachman employment subset
# directory, then the economic health pull directory
bucket_name = 'ca-climate-index'
aws_dirs = (
    '2a_subset/society_economy/economic_health/bureau_labor_statistics/employment_hachman_index/',
    '1_pull_data/society_economy/economic_health/',
)

for aws_dir in aws_dirs:
    pull_csv_from_directory(bucket_name, aws_dir, search_zipped=False)

In [4]:
# Load the Hachman employment subset from the working directory
# (presumably written there by the AWS pull above -- confirm against pull_csv_from_directory)
hachman_data = pd.read_csv('employment_data_hachman_subset.csv')

In [5]:
# Tract/county population table: rename the tract id column to GEO_ID and
# drop the columns that are not needed
county_tract = "s3://ca-climate-index/0_map_data/ca_tract_county_population_2022.csv"
ca_county_tract = (
    pd.read_csv(county_tract)
    .rename(columns={'Census Tract': 'GEO_ID'})
    .drop(columns=['Unnamed: 0', 'COUNTYFP', 'County', 'Total Population 2021'])
)

### Hachman Index Metric
$$
HI = \frac{1}{\sum_{i} \left( \frac{E_{si}}{E_{ri}} \cdot E_{si} \right)}
$$

$E_{si}$ is the share of the area's economic indicator in industry $i$ -- county level <br>
$E_{ri}$ is the share of the reference region's economic indicator in industry $i$ -- state level

Several levels of data clean-up need to occur to calculate the Hachman Index. 
* Clean-up the County naming, dropping all statewide metrics
* Drop counts for "Service-providing" and "Goods-producing" -- these sum other columns! 

In [None]:
# Number of rows in the raw employment table
len(hachman_data)

In [None]:
# Preview the first five rows of the raw employment table
hachman_data.head(5)

Drop the following rows:
* "101 Goods-producing" (it's a summary of the 101X categories)
* "102 Service-providing" (it's a summary of the 102X categories)

In [None]:
# Rows to discard: the two aggregate industries, which summarize the
# 101X / 102X categories and would otherwise be double counted
summary_industries = ['101 Goods-producing', '102 Service-providing']
is_summary_row = hachman_data['Industry'].isin(summary_industries)

# Apply one combined mask to the original frame. Filtering twice with masks
# built on the unfiltered frame makes pandas reindex the second mask and
# emit a "Boolean Series key will be reindexed" warning.
hachman_data_cleaned = hachman_data[~is_summary_row]
hachman_data_cleaned

Taking a look at the entries within area
* separating California entries as we will need that data also
* getting rid of all 'County, California' portions of each entry

In [None]:
# Get the unique entries in the 'Area' column to see which naming patterns need cleaning
unique_entries = hachman_data_cleaned['Area'].unique()
print(unique_entries)

## Adjust the columns and entries within for future cleaning

In [10]:
# Keep only the columns needed for the Hachman calculation; copy so the edits
# below act on an independent frame rather than a slice of the previous one
# (avoids SettingWithCopyWarning)
hachman_data_cleaned = hachman_data_cleaned[['Area', 'Industry', 'Annual Average Employment']].copy()

# Strip the ' -- Statewide' and ' County, California' suffixes from the Area names
hachman_data_cleaned['Area'] = (
    hachman_data_cleaned['Area']
    .str.replace(' -- Statewide', '', case=False)
    .str.replace(' County, California', '', case=False)
)

unique_entries = hachman_data_cleaned['Area'].unique()
hachman_data_cleaned = hachman_data_cleaned.rename(columns={'Area':'County'})

In [None]:
# Inspect the table after trimming the columns and cleaning the area names
hachman_data_cleaned

## Eliminate any row with '10 Total, all industries' as their totals are inconsistent with observed values
* will calculate totals with the industry employment values

In [None]:
# Eliminate rows where the Industry column has the value '10 Total, all industries';
# copy so the conversion below acts on an independent frame rather than a slice
industry_clean_df = hachman_data_cleaned[hachman_data_cleaned['Industry'] != '10 Total, all industries'].copy()

# Remove the thousands separators and convert the employment counts to float.
# The column is assigned by name: `.loc[:, col] = ...` writes into the existing
# object-dtype column, so the values would not end up stored as float64.
industry_clean_df['Annual Average Employment'] = (
    industry_clean_df['Annual Average Employment']
    .str.replace(',', '')
    .astype(float)
)

industry_clean_df

In [None]:
# Spot-check the cleaned rows for a single county (Los Angeles)
display_county_data(industry_clean_df, 'County', 'Los Angeles')

## Make a new df with the total employment for each county (and California as a whole)

In [None]:
# Sum employment across industries for every entry of the County column
total_emp_county = (
    industry_clean_df
    .groupby('County')['Annual Average Employment']
    .sum()
    .reset_index()
    .rename(columns={'Annual Average Employment': 'Total County Employed'})
)
total_emp_county.head()

## Run the county filter to separate California county rows from all other entries (e.g. statewide rows)

In [None]:
# Split the rows into those kept by the county filter and those omitted;
# presumably filter_counties matches the 'County' column against California county names -- confirm against helper
filtered_hachman_data, omitted_data = filter_counties(industry_clean_df, 'County')

# NOTE: both values printed below are row counts, not numbers of distinct counties
print('Counties kept:', len(filtered_hachman_data))
print('Omitted data entries:', len(omitted_data))

## Keep California data and place within its own df as it is needed to calculate reference values

In [None]:
# Get rid of the Unknown entries and relabel the columns for the state-level merge.
# rename() returns a new frame here; calling it with inplace=True on the
# filtered slice triggers a SettingWithCopyWarning.
is_unknown = omitted_data['County'] == 'Unknown Or Undefined, California'
california_employ_data = omitted_data[~is_unknown].rename(
    columns={'Annual Average Employment': 'Industry Employed CA', 'County': 'State'}
)

# Now we have a df that holds all CA state employment per industry
california_employ_data

In [None]:
# Label the employment column as county-level before merging with the state data
filtered_hachman_data = filtered_hachman_data.rename(
    columns={'Annual Average Employment': 'Industry Employed County'}
)
filtered_hachman_data

In [None]:
# Spot-check the cleaned rows for a single county (Alpine)
display_county_data(industry_clean_df, 'County', 'Alpine')

## Merge the two datasets together based on 'Industry' so we have a single df with county and state employment per industry

In [None]:
# Attach the statewide employment of each industry to every county row
merge_data = filtered_hachman_data.merge(california_employ_data, how='left', on='Industry')
merge_data

## Merge once again with the new df and the total employment values per county
* now we have a df with employment per industry for the state and its counties
* also have total employees per county across all industries

In [None]:
# Column layout for the combined table
new_column_order = [
    'County',
    'Industry',
    'Industry Employed County',
    'Total County Employed',
    'State',
    'Industry Employed CA',
]

# Bring in each county's total employment, then rearrange the columns
hachman_denominator = merge_data.merge(total_emp_county, how='left', on='County')[new_column_order]
hachman_denominator

## Add another column with the total number of employed in California

In [None]:
# Total statewide employment, summed from the industry-level California rows
# (the same approach used for the county totals) instead of a hard-coded figure
# that goes stale whenever the source data changes.
# NOTE(review): the previous hard-coded value was 15438558.0 -- confirm the computed total agrees.
total_state_employment = float(california_employ_data['Industry Employed CA'].sum())
hachman_denominator['Total State Employment'] = total_state_employment
hachman_denominator

Calculate the Esi and Eri values
* Esi = (county employment in industry i / total county employment for all industries)
* Eri = (state employment in industry i / total state employment for all industries)

Then we divide Esi by Eri, and multiply by Esi to create our hachman denominator column

In [None]:
# Esi: fraction of a county's employment found in each industry
# Eri: fraction of statewide employment found in each industry
hachman_denominator = hachman_denominator.assign(
    county_industry_frac=lambda d: d['Industry Employed County'] / d['Total County Employed'],
    state_industry_frac=lambda d: d['Industry Employed CA'] / d['Total State Employment'],
)
hachman_denominator

## Calculate the Hachman denominator score per industry

In [None]:
# Per-industry term of the Hachman denominator: (Esi / Eri) * Esi
frac_ratio = hachman_denominator['county_industry_frac'] / hachman_denominator['state_industry_frac']
hachman_denominator['scores'] = frac_ratio * hachman_denominator['county_industry_frac']
hachman_denominator

In [None]:
# Spot-check the per-industry scores for a single county (Sierra)
hachman_denominator[hachman_denominator['County'] == 'Sierra']

## Now we sum the Hachman denominator industry values together per county

In [None]:
# Add up the per-industry scores within each county; one row per county
hachman_denominator_sum = hachman_denominator.groupby('County', as_index=False)['scores'].sum()
hachman_denominator_sum.head()

## Take the reciprocal of each county score to get the final Hachman index value

In [None]:
# Hachman Index = reciprocal of the summed per-industry scores
hachman_denominator_sum['hachman_index'] = 1 / hachman_denominator_sum['scores']

# Lower-case the column names and every string value (the county names).
# DataFrame.applymap is deprecated since pandas 2.1; mapping each column
# element-wise gives the same result on old and new pandas versions.
hachman_denominator_sum.columns = hachman_denominator_sum.columns.str.lower()
hachman_denominator_sum = hachman_denominator_sum.apply(
    lambda col: col.map(lambda s: s.lower() if type(s) == str else s)
)
hachman_denominator_sum

In [None]:
# Check the statistics of the final index column; the values should not exceed 1
data_stats_check(hachman_denominator_sum, 'hachman_index')

## Merge with California census tracts

In [None]:
# read in CA census tiger file
# The path gets its own name so that `ca_tract_county` only ever refers to the table.
# gpd.read_file is kept as the reader so the column parsing is unchanged.
ca_tract_county_path = "s3://ca-climate-index/0_map_data/ca_tracts_county.csv"
ca_tract_county = gpd.read_file(ca_tract_county_path)
ca_tract_county = ca_tract_county.drop(columns=['field_1', 'geometry', 'COUNTYFP'])

# Lower-case the column names and every string value.
# DataFrame.applymap is deprecated since pandas 2.1; mapping each column
# element-wise gives the same result on old and new pandas versions.
ca_tract_county.columns = ca_tract_county.columns.str.lower()
ca_tract_county = ca_tract_county.apply(
    lambda col: col.map(lambda s: s.lower() if type(s) == str else s)
)

ca_tract_county

In [None]:
# Give every census tract the Hachman index of the county it belongs to
hachman_metric = ca_tract_county.merge(hachman_denominator_sum, how='left', on='county')
hachman_metric

In [None]:
# Spot-check the tract-level rows for a single county (names are lower-cased at this point)
display_county_data(hachman_metric, 'county', 'yuba')

In [32]:
# Save the tract-level metric as a csv in the working directory; the upload
# function below takes this file name as its input
hachman_metric.to_csv('society_economy_hachman_metric.csv', index=False)

## Function Call

In [35]:
@append_metadata
def hachman_index_upload(input_csv, export=False, varname=''):
    '''
    Uploads the calculated Hachman Index metric to S3 bucket. The metric is:
    Hachman Index score.

    Data for this metric was sourced from the US Bureau of Labor Statistics at:
    https://www.bls.gov/cew/downloadable-data-files.htm

    Hachman Index methodology was followed from:
    https://d36oiwf74r1rap.cloudfront.net/wp-content/uploads/ERG-Hachman-RB-Mar2023.pdf

    Methods
    -------
    Data Collection: 
    Gathered employment data at the industry level for California and its counties.

    Hachman Score Calculation:
    Calculated new total employment values based on industry-specific employment data due to inconsistencies in source data.
    Computed each county's employment per industry as a proportion of its total employment.
    Computed California's employment per industry as a proportion of its total employment.
    Divided each county's industry employment proportion by California's corresponding proportion, then multiplied by the county's proportion.
    Summed these values for each county.
    Took the reciprocal of each county's sum to obtain the Hachman Index value.

    Integration with Census Data:
    Merged the Hachman Index values with 2021 California Census data to assign each census tract the Hachman value of its county.

    Parameters
    ----------
    input_csv: string
        csv Hachman calculated data 
    export: True/False boolean
        False = will not upload resulting df containing CAL CRAI Hachman metric to AWS
        True = will upload resulting df containing CAL CRAI Hachman metric to AWS
    varname: string
        Not used inside this function body; presumably consumed by the
        append_metadata decorator (confirm against scripts.utils.write_metadata).

    Script
    ------
    society_economic_metrics_hachman.ipynb

    Note:
    This function assumes users have configured the AWS CLI such that their access key / secret key pair are 
    stored in ~/.aws/credentials.
    See https://docs.aws.amazon.com/cli/latest/userguide/getting-started-install.html for guidance.

    The local input_csv file is deleted at the end of the call, whether or not it was uploaded.
    '''
    print('Data transformation: New total employment values calculated for California and its counties.')
    print('Data transformation: Dropped the following columns as they summarized counts from other industries: 101 Goods-producing and 102 Service-providing')
    print('Data transformation: Removed unknown or uncategorized entries within the county column.')
    print('Data transformation: Isolated relevant columns and created new ones resulting for Hachman calculations.')
    print('Data transformation: Resulting Hachman calculation per county was extrapolated to California census tracts.')

    if export:
        bucket_name = 'ca-climate-index'
        directory = '3_fair_data/index_data'
        export_filename = [input_csv]
        upload_csv_aws(export_filename, bucket_name, directory)
    else:
        # Nothing is uploaded on this branch; the message used to claim the
        # file had been uploaded.
        print(f'{input_csv} not uploaded to AWS (export=False).')

    # Clean up the local copy of the csv (runs for both export settings)
    if os.path.exists(input_csv):
        os.remove(input_csv)

In [36]:
# Inputs for the upload call: the csv written earlier in the notebook and the
# variable name for this metric
input_csv = 'society_economy_hachman_metric.csv'
varname = 'society_bls_hachman'

# Pass the real variable name; the call previously sent the placeholder 'test'
# and left `varname` unused.
# NOTE(review): confirm how append_metadata treats varname before running with export=True
hachman_index_upload(input_csv, export=True, varname=varname)